Exemple #1
0
 def __init__(self, config, graphtype, minoccs=1, maxcoocs=1,
              maxcables=None, year=None):
     """
     Open the storage backends and build the requested network.

     Connects to the "cablegate" MongoDB database and to the Neo4j graph
     database, both configured under config['general'], then:

     - graphtype None or "occurrences": calls update_occurrences_network
     - graphtype "cooccurrences": calls update_occurrences_network and
       hands the two caches it returns to update_cooccurrences_network
     - any other graphtype: nothing is built
     """
     general = config['general']
     self.mongodb = CablegateDatabase(general['mongodb'])["cablegate"]
     self.graphdb = GraphDatabase(general['neo4j'])
     self.config = config

     occurrences_only = graphtype is None or graphtype == "occurrences"
     with_cooccurrences = (not occurrences_only
                           and graphtype == "cooccurrences")
     if not (occurrences_only or with_cooccurrences):
         return

     # both supported graph types start with the same occurrences pass
     caches = self.update_occurrences_network(
         minoccs, maxcoocs, maxcables, year, documents=False)
     if with_cooccurrences:
         nodecache, ngramcache = caches
         self.update_cooccurrences_network(
             nodecache, ngramcache, minoccs, maxcoocs)
Exemple #2
0
def extract(config, overwrite=True, maxcables=None):
    """
    Submit stored cables to a process pool for ngram extraction.

    Every cable read from the "cables" collection whose edges.NGram
    mapping is still empty is handed to worker() through a
    multiprocessing pool of config['general']['processes'] processes.
    The call returns once all submitted tasks have finished.

    config    -- configuration mapping (reads the 'general' and
                 'extraction' sections)
    overwrite -- when True, the "ngrams" and "cooc" collections are
                 dropped first, and the flag is forwarded to worker()
    maxcables -- maximum number of cables to submit; defaults to the
                 size of the "cables" collection
    """
    mongodb = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    filters = get_extraction_filters(config)
    postagger = SequentialPosTagger(None, config['extraction']['tagger'])

    if overwrite is True:
        existing = mongodb.collection_names()
        for name in ("ngrams", "cooc"):
            if name in existing:
                mongodb.drop_collection(name)

    if maxcables is None:
        maxcables = mongodb.cables.count()

    count = 0
    extractionpool = pool.Pool(processes=config['general']['processes'])
    try:
        cursor = mongodb.cables.find(timeout=False)
        try:
            for cable in cursor:
                # HACK: skip cables that already carry extracted ngrams
                if cable['edges']['NGram']:
                    continue
                # NOTE: the AsyncResult is discarded, so exceptions raised
                # inside worker() are not reported here
                extractionpool.apply_async(
                    worker, (config, cable, filters, postagger, overwrite))
                count += 1
                # NOTE(review): the limit is tested after submitting, so one
                # cable is still submitted when maxcables is 0
                if count >= maxcables:
                    break
        finally:
            # a cursor opened with timeout=False is never reaped by the
            # server, so it has to be closed explicitly, even on early exit
            cursor.close()
    finally:
        # always release the worker processes, including on errors
        extractionpool.close()
        extractionpool.join()
Exemple #3
0
 def __init__(self, config, data_directory, overwrite=False, maxcables=None):
     """
     Prepare the import target and walk the cable archive.

     The archive is read from the "cable" sub-directory of data_directory.
     When overwrite is True and a "cables" collection already exists in
     the "cablegate" MongoDB database, that collection is dropped before
     walk_archive(overwrite, maxcables) is called.
     """
     self.data_directory = join(data_directory, "cable")
     connection = CablegateDatabase(config['general']['mongodb'])
     self.mongodb = connection["cablegate"]
     if overwrite is True:
         if "cables" in self.mongodb.collection_names():
             self.mongodb.drop_collection("cables")
     self.walk_archive(overwrite, maxcables)
Exemple #4
0
def worker(config, cable, filters, postagger, overwrite):
    """
    Process one cable: extract its ngrams and store the resulting edges.

    Opens its own connection to the "cablegate" MongoDB database, resets
    the cable's edges with initEdges() when overwrite is True, runs the
    NGramizer extraction, calls update_cable_cooc() and finally writes
    cable['edges'] back to the cable's document.
    """
    database = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    if overwrite is True:
        cable = initEdges(cable)
    # extract and filter ngrams
    NGramizer(config).extract(cable, filters, postagger, PorterStemmer())
    update_cable_cooc(cable, database)
    # persist only the edges field of the processed cable
    selector = {"_id": cable['_id']}
    changes = {"$set": {"edges": cable['edges']}}
    database.cables.update(selector, changes)
 def __init__(self, config):
     """
     Keep the configuration and open the "cablegate" MongoDB database
     named by config['general']['mongodb'].
     """
     connection = CablegateDatabase(config['general']['mongodb'])
     self.mongodb = connection["cablegate"]
     self.config = config