Beispiel #1
0
def preprocess(doc):
    """Attach a ``guid`` entry to ``doc`` prior to storage.

    The identifier is whatever ``doc_guid(doc)`` returns. ``doc`` is
    modified in place and the very same object is handed back.

    >>> doc = preprocess({'url': 'http://foo'})
    >>> 'guid' in doc
    True
    >>> 'url' in doc
    True

    """
    guid = doc_guid(doc)
    doc['guid'] = guid
    return doc
Beispiel #2
0
def get_classified_items(filepath, db):
    """Yield ``(key, doc)`` pairs for documents listed in a classified file.

    Every non-blank line of ``filepath`` holds a document key and a
    Python expression for its category, separated by a single tab. When
    the expression evaluates to a list, only its first element is used.
    The document is fetched as ``db[key]`` and receives ``guid`` (from
    ``doc_guid``) and ``category`` entries. Documents lacking any of
    ``headline``, ``datetime``, ``body`` or ``url`` are not yielded.

    :param filepath: path of the tab-separated classification file.
    :param db: object indexed by document key that returns the document.
    :raises ValueError: if a non-blank line does not split into exactly
        two tab-separated fields.
    """
    required = ('headline', 'datetime', 'body', 'url')
    # Open the file named by the caller. Text mode keeps the str tab
    # separator below valid on Python 3 as well as Python 2.
    with open(filepath, 'r') as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            key, cat = line.split('\t')
            # SECURITY NOTE: eval() executes arbitrary code found in the
            # file. Only feed this function files from a trusted source;
            # consider ast.literal_eval if categories are plain literals.
            cat = eval(cat)
            if isinstance(cat, list):
                cat = cat[0]
            doc = db[key]
            doc['guid'] = doc_guid(doc)
            doc['category'] = cat
            if any(field not in doc for field in required):
                continue
            yield key, doc
Beispiel #3
0
 def do_label(self, line):
     """Store the current document under the given training label.

     ``line`` is stripped and used as a directory name below
     ``self.train_path``; the directory is created when it does not
     exist. ``self.current_doc`` is dumped as JSON into a file named
     ``doc_guid(self.current_doc) + '.json'`` inside that directory, two
     status messages are printed, and ``self.do_next('')`` is invoked.

     :param line: text holding the label.
     :returns: whatever ``self.do_next('')`` returns.
     """
     label = line.strip()
     path = os.path.join(self.train_path, label)
     if not os.path.exists(path):
         # makedirs also creates a missing train_path or nested label
         # directories, where a plain mkdir would raise OSError.
         os.makedirs(path)
     filepath = os.path.join(path, doc_guid(self.current_doc) + '.json')
     # json.dump writes text, so text mode works on Python 2 and 3 alike.
     with open(filepath, 'w') as fp:
         json.dump(self.current_doc, fp)
     # Single-argument parenthesised print behaves the same on 2 and 3.
     print("Document stored in train category {}".format(label))
     print("Moving to next document")
     return self.do_next('')
 def do_label(self, line):
     """Save the current document into the directory of a training label.

     The stripped ``line`` names a directory under ``self.train_path``,
     which is created if absent. ``self.current_doc`` is serialised as
     JSON to ``doc_guid(self.current_doc) + '.json'`` in that directory;
     afterwards two status messages are printed and the method continues
     with ``self.do_next('')``.

     :param line: text holding the label.
     :returns: the result of ``self.do_next('')``.
     """
     label = line.strip()
     path = os.path.join(self.train_path, label)
     if not os.path.exists(path):
         # Unlike mkdir, makedirs copes with a missing train_path and
         # with labels that contain nested directories.
         os.makedirs(path)
     filepath = os.path.join(path, doc_guid(self.current_doc) + '.json')
     # Text mode: json.dump emits str, which a binary file rejects on
     # Python 3; on Python 2 the written bytes are the same.
     with open(filepath, 'w') as fp:
         json.dump(self.current_doc, fp)
     # Parenthesised single-argument print is valid on Python 2 and 3.
     print("Document stored in train category {}".format(label))
     print("Moving to next document")
     return self.do_next('')