Exemple #1
0
def encode_document(doc_id, text):
    """Serialize one document to the JSON form read by MRNgramIDFUtility.

    `doc_id` is meant to identify a business/product/entity rather than a
    single review. `text` is written out unchanged.
    """
    # NOTE: text is deliberately not coerced with unicode(); some amazon
    # reviews won't encode.
    payload = {'doc_id': doc_id, 'text': text}
    return JSONValueProtocol.write(None, payload)
def root_to_json(root_dir, output_file):
    """Write every email found under `root_dir` to `output_file` as JSON lines.

    Args:
    root_dir -- directory handed to EmailWalker
    output_file -- path of the file to write; it is opened in "w" mode, so
        any existing content is replaced

    Each email yielded by the walker has its 'date' entry replaced in place
    by its str() form before being encoded with JSONValueProtocol, one
    record per line.
    """
    walker = EmailWalker(root_dir)

    # The with-block guarantees the file is closed even if the walker or the
    # encoder raises part-way through.
    with open(output_file, "w") as output:
        for email in walker:
            # Stringify the date before encoding (presumably it is not
            # JSON-serializable as-is).
            email['date'] = str(email['date'])
            output.write(JSONValueProtocol.write(None, email) + '\n')
def root_to_json(root_dir, output_file):
    """Dump the emails under `root_dir` into `output_file`, one JSON per line.

    Args:
    root_dir -- directory handed to EmailWalker
    output_file -- path of the file to write; it is opened in "w" mode, so
        any existing content is replaced

    For each email yielded by the walker, the 'date' entry is overwritten
    in place with its str() form, then the email is encoded with
    JSONValueProtocol and written followed by a newline.
    """
    walker = EmailWalker(root_dir)

    # Use a context manager so the output file is closed even when iteration
    # or encoding fails before the end of the loop.
    with open(output_file, "w") as output:
        for email in walker:
            # Stringify the date before encoding (presumably it is not
            # JSON-serializable as-is).
            email['date'] = str(email['date'])
            output.write(JSONValueProtocol.write(None, email) + '\n')
def encode_document(text, cats=None, id=None):
    """Encode a document as a JSON line so that MRTextClassifier can read it.

    Args:
    text -- the text of the document, either as a unicode or as a byte
        string. Byte strings are decoded with the default encoding, ignoring
        bytes that fail to decode; unicode values are used as-is.
    cats -- a dictionary mapping a category name (e.g. 'sports') to True if
        the document is in the category, and False if it's not. None indicates
        that we have no information about this document's categories
    id -- a unique ID for the document (any kind of JSON-able value should
        work). It is written out unchanged, so None is emitted when no ID is
        given; this function does not generate one itself.

    Returns the encoded record followed by a newline.
    """
    # unicode(x, errors=...) raises TypeError when x is already a unicode
    # object, so only decode values that are not unicode yet.
    if not isinstance(text, unicode):
        text = unicode(text, errors="ignore")

    # Normalize category names to unicode and membership flags to bool.
    cats = dict((unicode(cat), bool(is_in_cat))
                for cat, is_in_cat in (cats or {}).iteritems())

    return JSONValueProtocol.write(
        None,
        {"document": text, "cats": cats, "docid": id, "type": "document"}) + "\n"
Exemple #5
0
def encode_document(text, cats=None, id=None):
    """Build the JSON line that MRTextClassifier reads for one document.

    Args:
    text -- the document's text; it is passed through unicode()
    cats -- a dictionary from category name (e.g. 'sports') to a truthy
        value if the document belongs to that category and a falsy one if
        it doesn't. None means nothing is known about the document's
        categories and results in an empty mapping.
    id -- a unique ID for the document (any JSON-able value should work).
        Defaults to None, which is written out as-is.

    Returns the encoded record followed by a newline.
    """
    text = unicode(text)

    # Normalize category names to unicode and membership flags to bool.
    membership = {}
    for name, flag in (cats or {}).iteritems():
        key = unicode(name)
        membership[key] = bool(flag)

    record = {'text': text, 'cats': membership, 'id': id}
    return JSONValueProtocol.write(None, record) + '\n'
 def test_numerical_keys_become_strs(self):
     # Round-trip a dict with an int key: JSON object keys are strings, so
     # the key should come back as '3'.
     encoded = JSONValueProtocol.write(None, {3: 4})
     decoded = JSONValueProtocol.read(encoded)
     self.assertEqual((None, {'3': 4}), decoded)
 def test_tuples_become_lists(self):
     # Round-trip a tuple: JSON has no tuple type, so it should come back
     # as a list.
     encoded = JSONValueProtocol.write(None, (3, 4))
     decoded = JSONValueProtocol.read(encoded)
     self.assertEqual((None, [3, 4]), decoded)
    def test_uses_json_format(self):
        # The same value/encoding pair is checked in both directions.
        value = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}
        encoded = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'

        # Decoding yields a (key, value) pair whose key is None.
        decoded = JSONValueProtocol.read(encoded)
        self.assertEqual((None, value), decoded)

        # Encoding must reproduce the exact JSON text.
        written = JSONValueProtocol.write(None, value)
        self.assertEqual(encoded, written)
Exemple #9
0
 def test_numerical_keys_become_strs(self):
     # A numeric dict key is expected to be a string after a write/read
     # round trip, because JSON object keys are strings.
     original = {3: 4}
     round_tripped = JSONValueProtocol.read(
         JSONValueProtocol.write(None, original))
     expected = (None, {'3': 4})
     self.assertEqual(expected, round_tripped)
Exemple #10
0
 def test_tuples_become_lists(self):
     # A tuple is expected to be a list after a write/read round trip,
     # because JSON only has arrays.
     original = (3, 4)
     round_tripped = JSONValueProtocol.read(
         JSONValueProtocol.write(None, original))
     expected = (None, [3, 4])
     self.assertEqual(expected, round_tripped)
Exemple #11
0
    def test_uses_json_format(self):
        # One Python value and the JSON text it should correspond to.
        python_value = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}
        json_text = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'

        # read() should parse the text into (None, value) ...
        parsed = JSONValueProtocol.read(json_text)
        self.assertEqual((None, python_value), parsed)

        # ... and write() should produce exactly that text again.
        produced = JSONValueProtocol.write(None, python_value)
        self.assertEqual(json_text, produced)