def preprocess(doc):
    """Attach derived fields to a document before it is stored.

    The document is modified in place: its ``'guid'`` entry is set to the
    value returned by ``doc_guid(doc)``. The same object is then returned
    so the call can be chained.

    >>> doc = preprocess({'url': 'http://foo'})
    >>> 'guid' in doc
    True
    >>> 'url' in doc
    True
    """
    guid = doc_guid(doc)
    doc['guid'] = guid
    return doc
def get_classified_items(filepath, db):
    """Yield ``(key, doc)`` pairs for the classified entries listed in a file.

    Every line of the file is split on a tab into a document key and a
    category expression. The category expression is evaluated; when the
    result is a list, its first element is used. The document is looked up
    as ``db[key]``, its ``'guid'`` (via ``doc_guid``) and ``'category'``
    entries are set, and the pair is yielded unless the document lacks any
    of ``'headline'``, ``'datetime'``, ``'body'`` or ``'url'``.

    :param filepath: path of the tab-separated classification file.
    :param db: mapping-like object indexed by document key
        (presumably a document database -- verify against the caller).
    :raises ValueError: if a line does not split into exactly two
        tab-separated parts.
    """
    required_fields = ('headline', 'datetime', 'body', 'url')
    # Open the path handed in by the caller; the previous code ignored the
    # parameter and read the module-level ``args.from_classified`` instead.
    with open(filepath, 'rb') as fp:
        for line in fp:
            key, cat = line.strip().split('\t')
            # NOTE(security): eval() executes arbitrary expressions read from
            # the file. Only use this with trusted input; consider switching
            # to ast.literal_eval if categories are always plain literals.
            cat = eval(cat)
            if isinstance(cat, list):
                cat = cat[0]
            doc = db[key]
            doc['guid'] = doc_guid(doc)
            doc['category'] = cat
            # Skip documents that are missing any of the required fields.
            if any(field not in doc for field in required_fields):
                continue
            yield key, doc
def do_label(self, line):
    """Save the current document under the given training label.

    The stripped ``line`` is used as the label, which names a directory
    below ``self.train_path``; the directory is created when it does not
    exist yet. ``self.current_doc`` is written there as JSON in a file named
    ``<guid>.json``, where the guid comes from ``doc_guid``. Afterwards the
    method moves on to the next document by delegating to ``self.do_next``
    and returns its result.
    """
    label = line.strip()

    # One directory per label inside the training path.
    label_dir = os.path.join(self.train_path, label)
    if not os.path.exists(label_dir):
        os.mkdir(label_dir)

    json_name = doc_guid(self.current_doc) + '.json'
    target = os.path.join(label_dir, json_name)
    with open(target, 'wb') as out:
        json.dump(self.current_doc, out)

    # Single-argument parenthesised form: same output as the print statement.
    print("Document stored in train category {}".format(label))
    print("Moving to next document")
    return self.do_next('')