def extract(self, ngramizer, filters, postagger, overwrite, maxcables=None):
    """
    Iterate over the stored cables, extract their n-grams and persist the edges.

    This is a generator. Each cable document is yielded right after
    ``ngramizer.extract`` has run on it; its ``edges`` field is written back
    to the ``cables`` collection only once the consumer resumes the
    generator, so whatever ``cable['edges']`` holds at that point is stored.

    :param ngramizer: object whose ``extract(cable, filters, postagger, stemmer)``
        is called on every cable
    :param filters: passed through to ``ngramizer.extract``
    :param postagger: passed through to ``ngramizer.extract``
    :param overwrite: when exactly ``True``, the ``ngrams`` and ``cooc``
        collections are dropped first and every cable goes through ``initEdges``
    :param maxcables: maximum number of cables to process; defaults to the
        size of the ``cables`` collection. A value <= 0 processes nothing.
    """
    if overwrite is True:
        # start from a clean slate: forget previously extracted data
        existing = self.mongodb.collection_names()
        for name in ("ngrams", "cooc"):
            if name in existing:
                self.mongodb.drop_collection(name)

    if maxcables is None:
        maxcables = self.mongodb.cables.count()
    if maxcables <= 0:
        # the limit is otherwise only checked after a cable has been
        # processed, which would let one cable through
        return

    count = 0
    cursor = self.mongodb.cables.find(timeout=False)
    try:
        for cable in cursor:
            if cable is None:
                # no identifier is available for a missing document
                logging.warning("cable not found in the database, skipping")
                continue
            if overwrite is True:
                cable = initEdges(cable)
            # extract and filter ngrams
            ngramizer.extract(cable, filters, postagger, PorterStemmer())
            yield cable
            self.mongodb.cables.update(
                {"_id": cable['_id']},
                {"$set": {"edges": cable['edges']}})
            count += 1
            logging.debug("extracted %d cables topics", count)
            if count >= maxcables:
                return
    finally:
        # the cursor was opened with timeout=False, so it must be closed
        # explicitly, including on early return or when the generator is
        # abandoned by its consumer
        cursor.close()
def process_cable(self, cb, overwrite):
    """
    Store the metadata and content of one cable in the ``cables`` collection.

    A cable that is already stored is left untouched unless ``overwrite`` is
    truthy. In every case the cable's reference id is appended to
    ``self.cable_list`` and the running total is logged.

    :param cb: cable object exposing ``reference_id``, ``subject``,
        ``created``, ``classification``, ``origin`` and ``content``
    :param overwrite: when truthy, refresh the metadata of an existing cable
    """
    cable_id = cb.reference_id
    stored = self.mongodb.cables.find_one({'_id': cable_id})

    if stored is not None and not overwrite:
        logging.info('CABLE ALREADY EXISTS : SKIPPING')
        self.cable_list.append(cable_id)
        logging.info("cables processed = %d, %s" % (len(self.cable_list), cb.reference_id))
        return

    # an unknown cable starts from whatever initEdges({}) returns; a known
    # one keeps its stored document, so only the keys below are replaced
    document = initEdges({}) if stored is None else stored

    metas = {
        # auto index
        '_id': "%s" % cable_id,
        'label': titlefy(cb.subject),
        'start': datetime.strptime(cb.created, "%Y-%m-%d %H:%M"),
        'classification': cb.classification,
        'embassy': cb.origin,
        'content': cb.content,
        'category': "Document",
    }
    document.update(metas)

    self.mongodb.cables.save(document)
    self.cable_list.append(cable_id)
    logging.info(u"cables processed = %d, %s" % (len(self.cable_list), cb.reference_id))
def process_cable(self, cb, overwrite):
    """
    Cable content extractor: save one cable's metadata and text to storage.

    Looks the cable up by ``cb.reference_id``. If it is already stored and
    ``overwrite`` is falsy, nothing is written. Otherwise the metadata keys
    are set on the stored document (or on the result of ``initEdges({})``
    when the cable is new) and the document is saved. The reference id is
    appended to ``self.cable_list`` on both paths.
    """
    cable_id = cb.reference_id
    cable = self.mongodb.cables.find_one({'_id': cable_id})
    already_stored = cable is not None

    if already_stored and not overwrite:
        logging.info('CABLE ALREADY EXISTS : SKIPPING')
        self.cable_list.append(cable_id)
        logging.info("cables processed = %d, %s" % (len(self.cable_list), cb.reference_id))
        return

    if not already_stored:
        # new cable: begin with the structure returned by initEdges
        cable = initEdges({})

    # only these keys are replaced; other keys of an existing document
    # are left as they were
    fresh_metas = {
        # auto index
        '_id': "%s" % cable_id,
        'label': titlefy(cb.subject),
        'start': datetime.strptime(cb.created, "%Y-%m-%d %H:%M"),
        'classification': cb.classification,
        'embassy': cb.origin,
        'content': cb.content,
        'category': "Document",
    }
    cable.update(fresh_metas)
    self.mongodb.cables.save(cable)

    self.cable_list.append(cable_id)
    processed = len(self.cable_list)
    logging.info(u"cables processed = %d, %s" % (processed, cb.reference_id))
def worker(config, cable, filters, postagger, overwrite):
    """
    Extract the n-grams of a single cable and store its edges.

    Opens its own handle on the "cablegate" database from
    ``config['general']['mongodb']``, runs a new ``NGramizer`` on the cable,
    calls ``update_cable_cooc`` and finally writes the cable's ``edges``
    back to the ``cables`` collection.

    :param config: configuration mapping, also handed to ``NGramizer``
    :param cable: cable document; must carry ``_id`` and, after extraction,
        ``edges``
    :param filters: passed through to ``NGramizer.extract``
    :param postagger: passed through to ``NGramizer.extract``
    :param overwrite: when exactly ``True``, the cable first goes through
        ``initEdges``
    """
    database = CablegateDatabase(config['general']['mongodb'])["cablegate"]

    if overwrite is True:
        cable = initEdges(cable)

    # extract and filter ngrams
    NGramizer(config).extract(cable, filters, postagger, PorterStemmer())

    update_cable_cooc(cable, database)

    selector = {"_id": cable['_id']}
    change = {"$set": {"edges": cable['edges']}}
    database.cables.update(selector, change)
def worker(config, cable, filters, postagger, overwrite):
    """
    Process one cable: n-gram extraction, co-occurrence update, edge storage.

    A separate connection to the "cablegate" database is created on every
    call from ``config['general']['mongodb']``. When ``overwrite`` is exactly
    ``True`` the cable is first passed through ``initEdges``. Only the
    ``edges`` field of the stored cable is updated at the end.
    """
    connection = CablegateDatabase(config['general']['mongodb'])
    cablegate = connection["cablegate"]

    if overwrite is True:
        cable = initEdges(cable)

    # extract and filter ngrams
    extractor = NGramizer(config)
    stemmer = PorterStemmer()
    extractor.extract(cable, filters, postagger, stemmer)

    update_cable_cooc(cable, cablegate)

    # persist nothing but the edges computed above
    cablegate.cables.update(
        {"_id": cable['_id']},
        {"$set": {"edges": cable['edges']}},
    )