Esempio n. 1
0
class CableImporter(object):

    """
    Reads and parses all available cables and updates the mongodb.

    Constructing an instance performs the whole import: it opens the
    "cablegate" database, optionally drops the "cables" collection and then
    walks the archive directory.

    usage : importer = CableImporter(config, 'data/cablegate.wikileaks.org')
    """

    # NOTE: class-level dict, therefore shared by every instance.
    counts = {
      'files_not_processed':0
    }

    def __init__(self, config, data_directory, overwrite=False, maxcables=None):
        """
        Opens the database and immediately imports the archive.

        config         -- mapping; config['general']['mongodb'] is handed to
                          CablegateDatabase
        data_directory -- archive root; cables are read from its "cable"
                          sub-directory
        overwrite      -- when exactly True, an existing "cables" collection
                          is dropped first; the flag is also forwarded to
                          process_cable
        maxcables      -- optional upper bound checked against
                          len(self.cable_list); None means no limit
        """
        self.data_directory = join(data_directory, "cable")
        self.mongodb = CablegateDatabase(config['general']['mongodb'])["cablegate"]
        if overwrite is True and "cables" in self.mongodb.collection_names():
            self.mongodb.drop_collection("cables")
        self.walk_archive(overwrite, maxcables)

    def walk_archive(self, overwrite, maxcables):
        """
        Walks the archive directory and processes every cable found.

        Stops early once len(self.cable_list) reaches maxcables (when a
        limit is given). An OSError raised while walking or processing is
        logged and ends the walk; it is not re-raised.

        NOTE(review): process_cable and whatever appends to self.cable_list
        are not visible in this excerpt - confirm the limit counts what is
        intended.
        """
        self.cable_list=[]
        try:
            for cable in cables_from_directory(self.data_directory):
                self.process_cable(cable, overwrite)
                if maxcables is not None and len(self.cable_list) >= maxcables:
                    break
        except OSError as oserr:
            # "as" form is valid on Python 2.6+ and Python 3; let logging do
            # the formatting lazily instead of pre-building the message.
            logging.error("%s", oserr)
def extract(config, overwrite=True, maxcables=None):
    """
    Reads the stored cables and submits each one to `worker` on a process
    pool (the worker is expected to extract ngrams and produce network edges
    and weights).

    config    -- mapping; uses config['general']['mongodb'],
                 config['general']['processes'] and
                 config['extraction']['tagger']
    overwrite -- when exactly True, existing "ngrams" and "cooc" collections
                 are dropped first; the flag is also forwarded to the worker
    maxcables -- maximum number of cables to submit; None means as many as
                 the "cables" collection currently holds

    Blocks until every submitted task has finished.

    NOTE: results of apply_async are discarded, so an exception raised
    inside `worker` is not reported by this function.
    """
    mongodb = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    filters = get_extraction_filters(config)
    postagger = SequentialPosTagger(None, config['extraction']['tagger'])

    if overwrite is True:
        existing = mongodb.collection_names()
        for name in ("ngrams", "cooc"):
            if name in existing:
                mongodb.drop_collection(name)

    if maxcables is None:
        maxcables = mongodb.cables.count()

    extractionpool = pool.Pool(processes=config['general']['processes'])
    try:
        # timeout=False cursors never expire on the server, so they must be
        # closed explicitly - also when we stop early or fail mid-iteration.
        cursor = mongodb.cables.find(timeout=False)
        try:
            count = 0
            for cable in cursor:
                # limit is checked before submitting so maxcables=0 submits nothing
                if count >= maxcables:
                    break
                ## just a hack: skip cables that already carry NGram edges
                if cable['edges']['NGram']:
                    continue
                extractionpool.apply_async(worker, (config, cable, filters, postagger, overwrite))
                count += 1
        finally:
            cursor.close()
    finally:
        # always release the worker processes, waiting for submitted tasks
        extractionpool.close()
        extractionpool.join()
Esempio n. 3
0
def extract(config, overwrite=True, maxcables=None):
    """
    Feeds every stored cable that still lacks NGram edges to `worker`,
    running the workers on a multiprocessing pool.

    Parameters:
      config    -- mapping read for config['general']['mongodb'],
                   config['general']['processes'] and
                   config['extraction']['tagger']
      overwrite -- if exactly True, the "ngrams" and "cooc" collections are
                   dropped beforehand (when present); also passed on to the
                   worker
      maxcables -- cap on the number of submitted cables; defaults to the
                   current size of the "cables" collection

    Returns once the pool has been closed and joined.

    NOTE: the AsyncResult objects are not kept, hence failures inside
    `worker` are not surfaced here.
    """
    mongodb = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    filters = get_extraction_filters(config)
    postagger = SequentialPosTagger(None, config['extraction']['tagger'])

    if overwrite is True and "ngrams" in mongodb.collection_names():
        mongodb.drop_collection("ngrams")

    if overwrite is True and "cooc" in mongodb.collection_names():
        mongodb.drop_collection("cooc")

    if maxcables is None:
        maxcables = mongodb.cables.count()

    extractionpool = pool.Pool(processes=config['general']['processes'])
    try:
        # A cursor opened with timeout=False is never reaped by the server;
        # close it ourselves on every exit path (early break or exception).
        cables = mongodb.cables.find(timeout=False)
        try:
            submitted = 0
            for cable in cables:
                # test the cap first: a cap of 0 must not submit anything
                if submitted >= maxcables:
                    break
                ## just a hack: cables with existing NGram edges are skipped
                if cable['edges']['NGram']:
                    continue
                extractionpool.apply_async(
                    worker, (config, cable, filters, postagger, overwrite))
                submitted += 1
        finally:
            cables.close()
    finally:
        # shut the pool down even if iteration failed, then wait for tasks
        extractionpool.close()
        extractionpool.join()
Esempio n. 4
0
class CableImporter(object):
    """
    Reads and parses all available cables and updates the mongodb.

    The import runs from the constructor: the "cablegate" database is
    opened, the "cables" collection is optionally dropped, and the archive
    directory is walked.
    """
    # NOTE: defined on the class, so the dict is shared across instances.
    counts = {'files_not_processed': 0}

    def __init__(self,
                 config,
                 data_directory,
                 overwrite=False,
                 maxcables=None):
        """
        Connects to the database and starts the import right away.

        config         -- mapping; config['general']['mongodb'] is passed to
                          CablegateDatabase
        data_directory -- archive root; its "cable" sub-directory is walked
        overwrite      -- if exactly True, drops an existing "cables"
                          collection; also forwarded to process_cable
        maxcables      -- optional limit compared with len(self.cable_list);
                          None disables the limit
        """
        self.data_directory = join(data_directory, "cable")
        self.mongodb = CablegateDatabase(
            config['general']['mongodb'])["cablegate"]
        if overwrite is True and "cables" in self.mongodb.collection_names():
            self.mongodb.drop_collection("cables")
        self.walk_archive(overwrite, maxcables)

    def walk_archive(self, overwrite, maxcables):
        """
        Walks the archive directory, handing each cable to process_cable.

        The walk ends early when a limit is set and len(self.cable_list)
        has reached it. An OSError is logged and terminates the walk without
        being propagated.

        NOTE(review): process_cable and the code filling self.cable_list are
        outside this excerpt - verify the limit counts the intended items.
        """
        self.cable_list = []
        try:
            for cable in cables_from_directory(self.data_directory):
                self.process_cable(cable, overwrite)
                if maxcables is not None and len(self.cable_list) >= maxcables:
                    break
        except OSError as oserr:
            # "except ... as" parses on Python 2.6+ and 3; formatting is
            # deferred to the logging module.
            logging.error("%s", oserr)