def extract(self, ngramizer, filters, postagger, overwrite, maxcables=None):
        """
        Walk the stored cables, extract their n-grams and persist the resulting edges.

        This is a generator. Each cable is yielded right after n-gram
        extraction and *before* its ``edges`` are written back, so the value
        of ``cable['edges']`` at the moment the consumer asks for the next
        item is what gets saved. Nothing runs until the first item is
        requested, including the collection drops below.

        Args:
            ngramizer: object exposing ``extract(cable, filters, postagger, stemmer)``.
            filters: passed through unchanged to ``ngramizer.extract``.
            postagger: passed through unchanged to ``ngramizer.extract``.
            overwrite: when exactly ``True``, the "ngrams" and "cooc"
                collections are dropped first (if present) and every cable
                is passed through ``initEdges`` before extraction.
            maxcables: stop after this many cables have been processed;
                defaults to the current number of stored cables.

        Yields:
            Each cable document after ``ngramizer.extract`` has been applied to it.
        """
        if overwrite is True:
            # One round-trip is enough: dropping "ngrams" does not affect
            # whether "cooc" exists.
            existing = self.mongodb.collection_names()
            for name in ("ngrams", "cooc"):
                if name in existing:
                    self.mongodb.drop_collection(name)

        if maxcables is None:
            maxcables = self.mongodb.cables.count()

        count = 0
        # timeout=False disables the server-side idle timeout, so the cursor
        # must be closed explicitly or it stays open on the server.
        cursor = self.mongodb.cables.find(timeout=False)
        try:
            for cable in cursor:
                if cable is None:
                    # The previous message formatted an undefined ``cable_id``
                    # and raised NameError instead of skipping.
                    logging.warning("cable not found in the database, skipping")
                    continue
                if overwrite is True:
                    cable = initEdges(cable)
                # extract and filter ngrams
                ngramizer.extract(cable, filters, postagger, PorterStemmer())
                yield cable
                # Written after the yield on purpose: edges touched by the
                # consumer while it held the cable are persisted as well.
                self.mongodb.cables.update(
                    {"_id": cable['_id']},
                    {"$set": {"edges": cable['edges']}})
                count += 1
                logging.debug("extracted %d cables topics", count)
                if count >= maxcables:
                    return
        finally:
            # Also runs when the generator is closed or abandoned early.
            cursor.close()
 def process_cable(self, cb, overwrite):
     """
     Insert or refresh the metadata document of one cable in ``self.mongodb.cables``.

     When a document with the cable's reference id already exists and
     ``overwrite`` is falsy, nothing is written: the id is only recorded in
     ``self.cable_list``. Otherwise the metadata keys are set on the
     existing document (its other keys are left as they are) or on a fresh
     one built by ``initEdges({})``, and the document is saved.

     ``cb`` must expose ``reference_id``, ``subject``, ``created`` (a string
     formatted as "%Y-%m-%d %H:%M"), ``classification``, ``origin`` and
     ``content``.
     """
     ref = cb.reference_id
     stored = self.mongodb.cables.find_one({'_id': ref})

     # Already stored and no overwrite requested: just count it.
     if not overwrite and stored is not None:
         logging.info('CABLE ALREADY EXISTS : SKIPPING')
         self.cable_list.append(ref)
         processed = len(self.cable_list)
         logging.info("cables processed = %d, %s" % (processed, cb.reference_id))
         return

     # Start from the stored document so keys outside the metas survive.
     document = initEdges({}) if stored is None else stored
     document.update({
         # auto index
         '_id' : "%s" % ref,
         'label' : titlefy(cb.subject),
         'start' : datetime.strptime(cb.created, "%Y-%m-%d %H:%M"),
         'classification' : cb.classification,
         'embassy' : cb.origin,
         'content' : cb.content,
         'category': "Document"
     })
     self.mongodb.cables.save(document)

     self.cable_list.append(ref)
     processed = len(self.cable_list)
     logging.info(u"cables processed = %d, %s" % (processed, cb.reference_id))
Beispiel #3
0
 def process_cable(self, cb, overwrite):
     """
     Store the metadata of a single cable in the ``cables`` collection.

     The cable is looked up by ``cb.reference_id``. If it is already stored
     and ``overwrite`` is falsy the database is left untouched; in every
     case the reference id is appended to ``self.cable_list`` and a progress
     line is logged. When the document is written, only the metadata keys
     listed below are replaced; a new document is first initialised with
     ``initEdges({})``.
     """
     cable_id = cb.reference_id
     existing = self.mongodb.cables.find_one({'_id': cable_id})
     found = existing is not None

     if found and not overwrite:
         logging.info('CABLE ALREADY EXISTS : SKIPPING')
     else:
         # Reuse the stored document when there is one so that keys other
         # than the metas are not erased.
         cable = existing if found else initEdges({})
         metas = {
             # auto index
             '_id': "%s" % cable_id,
             'label': titlefy(cb.subject),
             'start': datetime.strptime(cb.created, "%Y-%m-%d %H:%M"),
             'classification': cb.classification,
             'embassy': cb.origin,
             'content': cb.content,
             'category': "Document"
         }
         cable.update(metas)
         self.mongodb.cables.save(cable)

     # Common bookkeeping for both the skipped and the saved case.
     self.cable_list.append(cable_id)
     logging.info(u"cables processed = %d, %s" %
                  (len(self.cable_list), cb.reference_id))
Beispiel #4
0
def worker(config, cable, filters, postagger, overwrite):
    """
    Extract the n-grams of one cable and persist its co-occurrences and edges.

    Opens its own handle on the "cablegate" database using
    ``config['general']['mongodb']``, optionally resets the cable with
    ``initEdges`` (only when ``overwrite`` is exactly ``True``), runs an
    ``NGramizer`` built from ``config`` over the cable, then calls
    ``update_cable_cooc`` and writes the cable's ``edges`` back to the
    ``cables`` collection.
    """
    database = CablegateDatabase(config['general']['mongodb'])["cablegate"]

    if overwrite is True:
        cable = initEdges(cable)

    # extract and filter ngrams
    NGramizer(config).extract(cable, filters, postagger, PorterStemmer())

    update_cable_cooc(cable, database)

    selector = {"_id": cable['_id']}
    new_edges = {"$set": {"edges": cable['edges']}}
    database.cables.update(selector, new_edges)
def worker(config, cable, filters, postagger, overwrite):
    """
    Process a single cable: n-gram extraction, co-occurrence update, edge storage.

    ``config`` supplies the MongoDB location (``config['general']['mongodb']``)
    and is also handed to ``NGramizer``. ``filters`` and ``postagger`` are
    passed straight to ``NGramizer.extract``. When ``overwrite`` is exactly
    ``True`` the cable is first passed through ``initEdges``.
    """
    db = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    if overwrite is True:
        cable = initEdges(cable)

    # extract and filter ngrams
    extractor = NGramizer(config)
    extractor.extract(cable, filters, postagger, PorterStemmer())

    update_cable_cooc(cable, db)

    # Only the "edges" field of the stored document is replaced.
    db.cables.update({"_id": cable['_id']}, {"$set": {"edges": cable['edges']}})