Beispiel #1
0
class CableImporter(object):
    """
    Reads and parses all available cables and updates the mongodb.

    Constructing an instance runs the whole import: it opens the
    "cablegate" database, optionally drops the existing "cables"
    collection and then walks the cable archive.
    """
    # NOTE: defined on the class, so this dict is shared by every instance
    # unless an instance rebinds ``self.counts``.
    counts = {'files_not_processed': 0}

    def __init__(self,
                 config,
                 data_directory,
                 overwrite=False,
                 maxcables=None):
        """
        Opens the database and immediately imports the archive.

        :param config: mapping; ``config['general']['mongodb']`` is handed
            to ``CablegateDatabase``
        :param data_directory: path that is joined with "cable" to locate
            the archive
        :param overwrite: when exactly ``True``, an existing "cables"
            collection is dropped first; also forwarded to
            ``process_cable``
        :param maxcables: stop walking once ``self.cable_list`` holds at
            least this many entries; ``None`` means no limit
        """
        self.data_directory = join(data_directory, "cable")
        self.mongodb = CablegateDatabase(
            config['general']['mongodb'])["cablegate"]
        if overwrite is True and "cables" in self.mongodb.collection_names():
            self.mongodb.drop_collection("cables")
        self.walk_archive(overwrite, maxcables)

    def walk_archive(self, overwrite, maxcables):
        """
        Walks the archive directory and processes every cable it yields.

        An ``OSError`` raised during the walk is logged and ends the walk;
        it is not re-raised.
        """
        self.cable_list = []
        try:
            for cable in cables_from_directory(self.data_directory):
                self.process_cable(cable, overwrite)
                # The limit is tested after processing; presumably
                # process_cable appends to self.cable_list -- TODO confirm.
                if maxcables is not None and len(self.cable_list) >= maxcables:
                    break
        except OSError as oserr:
            # "except X as name" works on Python 2.6+ and Python 3; the
            # older "except X, name" form is a SyntaxError on Python 3.
            logging.error("%s", oserr)
class CableImporter(object):

    """
    Reads and parses all available cables and updates the mongodb
    usage : importer = CableImporter(config, data_directory)

    Constructing an instance runs the whole import: it opens the
    "cablegate" database, optionally drops the existing "cables"
    collection and then walks the cable archive.
    """

    # NOTE: defined on the class, so this dict is shared by every instance
    # unless an instance rebinds ``self.counts``.
    counts = {
        'files_not_processed': 0,
    }

    def __init__(self, config, data_directory, overwrite=False, maxcables=None):
        """
        Opens the database and immediately imports the archive.

        :param config: mapping; ``config['general']['mongodb']`` is handed
            to ``CablegateDatabase``
        :param data_directory: path that is joined with "cable" to locate
            the archive
        :param overwrite: when exactly ``True``, an existing "cables"
            collection is dropped first; also forwarded to
            ``process_cable``
        :param maxcables: stop walking once ``self.cable_list`` holds at
            least this many entries; ``None`` means no limit
        """
        self.data_directory = join(data_directory, "cable")
        self.mongodb = CablegateDatabase(config['general']['mongodb'])["cablegate"]
        if overwrite is True and "cables" in self.mongodb.collection_names():
            self.mongodb.drop_collection("cables")
        self.walk_archive(overwrite, maxcables)

    def walk_archive(self, overwrite, maxcables):
        """
        Walks the archive directory and processes every cable it yields.

        An ``OSError`` raised during the walk is logged and ends the walk;
        it is not re-raised.
        """
        self.cable_list = []
        try:
            for cable in cables_from_directory(self.data_directory):
                self.process_cable(cable, overwrite)
                # The limit is tested after processing; presumably
                # process_cable appends to self.cable_list -- TODO confirm.
                if maxcables is not None and len(self.cable_list) >= maxcables:
                    break
        except OSError as oserr:
            # "except X as name" works on Python 2.6+ and Python 3; the
            # older "except X, name" form is a SyntaxError on Python 3.
            logging.error("%s", oserr)
Beispiel #3
0
 def __init__(self, config, data_directory, overwrite=False, maxcables=None):
     """
     Opens the "cablegate" database and immediately walks the archive.

     The archive location is ``data_directory`` joined with "cable".
     When ``overwrite`` is exactly ``True`` and a "cables" collection
     exists, that collection is dropped before the walk starts.
     ``overwrite`` and ``maxcables`` are forwarded to ``walk_archive``.
     """
     self.data_directory = join(data_directory, "cable")
     connection = CablegateDatabase(config['general']['mongodb'])
     self.mongodb = connection["cablegate"]
     if overwrite is True:
         # Only an explicit True triggers the drop (identity check).
         if "cables" in self.mongodb.collection_names():
             self.mongodb.drop_collection("cables")
     self.walk_archive(overwrite, maxcables)
Beispiel #4
0
def extract(config, overwrite=True, maxcables=None):
    """
    Gets all cables from storage, then extracts ngrams and produces network
    edges and weights.

    Every cable whose ``edges['NGram']`` mapping is still empty is handed to
    ``worker`` through a process pool; the call returns once the pool has
    been closed and joined.

    :param config: mapping providing ``['general']['mongodb']``,
        ``['general']['processes']`` and ``['extraction']['tagger']``
    :param overwrite: when exactly ``True``, the "ngrams" and "cooc"
        collections are dropped first; the value is also forwarded to
        ``worker``
    :param maxcables: number of submitted cables after which submission
        stops; ``None`` means the current size of the cables collection.
        The limit is tested after each submission.
    """
    mongodb = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    filters = get_extraction_filters(config)
    postagger = SequentialPosTagger(None, config['extraction']['tagger'])

    if overwrite is True:
        for collection in ("ngrams", "cooc"):
            if collection in mongodb.collection_names():
                mongodb.drop_collection(collection)

    count = 0
    if maxcables is None:
        maxcables = mongodb.cables.count()

    extractionpool = pool.Pool(processes=config['general']['processes'])
    try:
        for cable in mongodb.cables.find(timeout=False):
            # Marked "just a hack" by the original author: skip cables that
            # already carry NGram edges.
            if cable['edges']['NGram']:
                continue
            # NOTE(review): the AsyncResult is discarded, so an exception
            # raised inside ``worker`` is not reported here.
            extractionpool.apply_async(
                worker, (config, cable, filters, postagger, overwrite))
            count += 1
            if count >= maxcables:
                break
    finally:
        # Always release the worker processes, even when reading the
        # cursor or inspecting a cable raises; already-submitted tasks are
        # allowed to finish before the exception propagates.
        extractionpool.close()
        extractionpool.join()
Beispiel #5
0
 def __init__(self,
              config,
              graphtype,
              minoccs=1,
              maxcoocs=1,
              maxcables=None,
              year=None):
     """
     Opens both databases and updates the requested network.

     ``graphtype`` of ``None`` or "occurrences" updates only the
     occurrences network. "cooccurrences" updates the occurrences
     network and then feeds the two caches it returns into the
     cooccurrences update. Any other value only opens the databases.
     """
     mongo_location = config['general']['mongodb']
     self.mongodb = CablegateDatabase(mongo_location)["cablegate"]
     self.graphdb = GraphDatabase(config['general']['neo4j'])
     self.config = config

     # Decide up front whether the cooccurrences pass is wanted.
     if graphtype is None or graphtype == "occurrences":
         with_cooccurrences = False
     elif graphtype == "cooccurrences":
         with_cooccurrences = True
     else:
         return

     caches = self.update_occurrences_network(
         minoccs, maxcoocs, maxcables, year, documents=False)
     if with_cooccurrences:
         nodecache, ngramcache = caches
         self.update_cooccurrences_network(
             nodecache, ngramcache, minoccs, maxcoocs)
def extract(config, overwrite=True, maxcables=None):
    """
    Gets all cables from storage, then extracts ngrams and produces network
    edges and weights.

    Every cable whose ``edges['NGram']`` mapping is still empty is handed to
    ``worker`` through a process pool; the call returns once the pool has
    been closed and joined.

    :param config: mapping providing ``['general']['mongodb']``,
        ``['general']['processes']`` and ``['extraction']['tagger']``
    :param overwrite: when exactly ``True``, the "ngrams" and "cooc"
        collections are dropped first; the value is also forwarded to
        ``worker``
    :param maxcables: number of submitted cables after which submission
        stops; ``None`` means the current size of the cables collection.
        The limit is tested after each submission.
    """
    mongodb = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    filters = get_extraction_filters(config)
    postagger = SequentialPosTagger(None, config['extraction']['tagger'])

    if overwrite is True and "ngrams" in mongodb.collection_names():
        mongodb.drop_collection("ngrams")

    if overwrite is True and "cooc" in mongodb.collection_names():
        mongodb.drop_collection("cooc")

    count = 0
    if maxcables is None:
        maxcables = mongodb.cables.count()

    extractionpool = pool.Pool(processes=config['general']['processes'])
    try:
        for cable in mongodb.cables.find(timeout=False):
            # Marked "just a hack" by the original author: skip cables that
            # already carry NGram edges.
            if cable['edges']['NGram']:
                continue
            # NOTE(review): the AsyncResult is discarded, so an exception
            # raised inside ``worker`` is not reported here.
            extractionpool.apply_async(
                worker, (config, cable, filters, postagger, overwrite))
            count += 1
            if count >= maxcables:
                break
    finally:
        # Always release the worker processes, even when reading the
        # cursor or inspecting a cable raises; already-submitted tasks are
        # allowed to finish before the exception propagates.
        extractionpool.close()
        extractionpool.join()
Beispiel #7
0
def worker(config, cable, filters, postagger, overwrite):
    """
    Extracts the ngrams of one cable and stores its updated edges.

    Opens its own handle on the "cablegate" database, passes the cable
    through ``initEdges`` when ``overwrite`` is exactly ``True``, runs the
    ngram extraction, updates the co-occurrences and finally writes the
    cable's ``edges`` field back to the cables collection.
    """
    database = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    if overwrite is True:
        cable = initEdges(cable)

    # extract and filter ngrams
    extractor = NGramizer(config)
    stemmer = PorterStemmer()
    extractor.extract(cable, filters, postagger, stemmer)

    update_cable_cooc(cable, database)

    # Persist only the edges of this cable.
    selector = {"_id": cable['_id']}
    change = {"$set": {"edges": cable['edges']}}
    database.cables.update(selector, change)
 def __init__(self,
              config,
              data_directory,
              overwrite=False,
              maxcables=None):
     """
     Opens the "cablegate" database and immediately walks the archive.

     The archive location is ``data_directory`` joined with "cable".
     When ``overwrite`` is exactly ``True`` and a "cables" collection
     exists, that collection is dropped before the walk starts.
     ``overwrite`` and ``maxcables`` are forwarded to ``walk_archive``.
     """
     self.data_directory = join(data_directory, "cable")
     mongo_location = config['general']['mongodb']
     self.mongodb = CablegateDatabase(mongo_location)["cablegate"]
     drop_requested = overwrite is True
     if drop_requested and "cables" in self.mongodb.collection_names():
         self.mongodb.drop_collection("cables")
     self.walk_archive(overwrite, maxcables)
 def __init__(self, config):
     """
     Opens the "cablegate" database and keeps the configuration.

     ``config['general']['mongodb']`` is handed to ``CablegateDatabase``.
     """
     mongo_location = config['general']['mongodb']
     connection = CablegateDatabase(mongo_location)
     self.mongodb = connection["cablegate"]
     self.config = config