class CableImporter(object):
    """
    Reads and parses all available cables and updates the mongodb.

    Constructing an instance immediately walks the "cable" sub-directory of
    the given data directory and passes every cable it yields to
    ``self.process_cable``.

    usage :
        importer = CableImporter(config, 'data/cablegate.wikileaks.org')
    """
    # Class-level dict: shared by every instance unless an instance rebinds it.
    counts = {
        'files_not_processed': 0,
    }

    def __init__(self, config, data_directory, overwrite=False,
                 maxcables=None):
        """
        Opens the "cablegate" database and runs the import.

        config -- mapping; ``config['general']['mongodb']`` is handed to
            ``CablegateDatabase``
        data_directory -- directory containing the "cable" sub-directory
        overwrite -- only when exactly ``True``, an existing "cables"
            collection is dropped first; the flag is also forwarded to
            ``process_cable``
        maxcables -- optional upper bound forwarded to ``walk_archive``
        """
        self.data_directory = join(data_directory, "cable")
        self.mongodb = CablegateDatabase(
            config['general']['mongodb'])["cablegate"]
        if overwrite is True and "cables" in self.mongodb.collection_names():
            self.mongodb.drop_collection("cables")
        self.walk_archive(overwrite, maxcables)

    def walk_archive(self, overwrite, maxcables):
        """
        Walks the archive directory and processes each cable found.

        Iteration stops once ``len(self.cable_list)`` reaches ``maxcables``
        (``None`` means no limit). ``self.cable_list`` is reset here and is
        presumably filled by ``process_cable`` -- TODO confirm.
        An ``OSError`` raised while walking is logged, not propagated.
        """
        self.cable_list = []
        # The limit is otherwise only tested after a cable has been
        # processed, so a limit of zero would still import one cable.
        if maxcables is not None and maxcables <= 0:
            return
        try:
            for cable in cables_from_directory(self.data_directory):
                self.process_cable(cable, overwrite)
                if (maxcables is not None
                        and len(self.cable_list) >= maxcables):
                    break
        except OSError as oserr:
            # "as" form works on Python 2.6+ and Python 3 alike.
            logging.error("%s", oserr)
def extract(config, overwrite=True, maxcables=None):
    """
    Gets all the cables from storage, then extracts ngrams and produces
    network edges and weights.

    config -- mapping; reads ``config['general']['mongodb']``,
        ``config['general']['processes']`` and
        ``config['extraction']['tagger']``
    overwrite -- only when exactly ``True``, existing "ngrams" and "cooc"
        collections are dropped first; the flag is also passed to ``worker``
    maxcables -- maximum number of cables submitted for extraction;
        ``None`` means the current size of the "cables" collection

    Each selected cable is submitted to ``worker`` through a process pool;
    the function returns once the pool has finished.
    """
    mongodb = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    filters = get_extraction_filters(config)
    postagger = SequentialPosTagger(None, config['extraction']['tagger'])

    if overwrite is True and "ngrams" in mongodb.collection_names():
        mongodb.drop_collection("ngrams")
    if overwrite is True and "cooc" in mongodb.collection_names():
        mongodb.drop_collection("cooc")

    if maxcables is None:
        maxcables = mongodb.cables.count()
    # The limit is only tested after a submission inside the loop, so a
    # limit of zero would still extract one cable. Bail out before spawning
    # worker processes when there is nothing to do.
    if maxcables <= 0:
        return

    extractionpool = pool.Pool(processes=config['general']['processes'])
    count = 0
    for cable in mongodb.cables.find(timeout=False):
        ## just a hack: skip cables that already carry NGram edges
        if cable['edges']['NGram']:
            continue
        extractionpool.apply_async(
            worker, (config, cable, filters, postagger, overwrite))
        count += 1
        if count >= maxcables:
            break
    extractionpool.close()
    extractionpool.join()
def extract(config, overwrite=True, maxcables=None):
    """
    Gets all the cables from storage, then extracts ngrams and produces
    network edges and weights.

    Parameters:
        config: mapping providing ``['general']['mongodb']``,
            ``['general']['processes']`` and ``['extraction']['tagger']``.
        overwrite: when exactly ``True``, the "ngrams" and "cooc"
            collections are dropped if they exist; also forwarded to
            ``worker``.
        maxcables: cap on the number of cables handed to the pool; when
            ``None`` the cap is the current count of the "cables"
            collection.

    Blocks until every submitted ``worker`` task has completed.
    """
    mongodb = CablegateDatabase(config['general']['mongodb'])["cablegate"]
    filters = get_extraction_filters(config)
    postagger = SequentialPosTagger(None, config['extraction']['tagger'])

    if overwrite is True and "ngrams" in mongodb.collection_names():
        mongodb.drop_collection("ngrams")
    if overwrite is True and "cooc" in mongodb.collection_names():
        mongodb.drop_collection("cooc")

    if maxcables is None:
        maxcables = mongodb.cables.count()
    if maxcables <= 0:
        # Nothing may be submitted. Without this guard the loop below
        # would hand one cable to the pool before noticing the limit.
        return

    extractionpool = pool.Pool(processes=config['general']['processes'])
    submitted = 0
    for cable in mongodb.cables.find(timeout=False):
        ## just a hack: cables that already have NGram edges are skipped
        if cable['edges']['NGram']:
            continue
        extractionpool.apply_async(
            worker, (config, cable, filters, postagger, overwrite))
        submitted += 1
        if submitted >= maxcables:
            break
    # No more tasks will be added; wait for the workers to drain the queue.
    extractionpool.close()
    extractionpool.join()
class CableImporter(object):
    """
    Reads and parses all available cables and updates the mongodb.

    The import runs from the constructor: the "cable" sub-directory of the
    data directory is walked and each cable is given to
    ``self.process_cable``.
    """
    # Defined on the class, so the same dict is seen by all instances.
    counts = {'files_not_processed': 0}

    def __init__(self, config, data_directory, overwrite=False,
                 maxcables=None):
        """
        Connects to the "cablegate" database and starts the import.

        config -- mapping; ``config['general']['mongodb']`` is passed to
            ``CablegateDatabase``
        data_directory -- parent directory of the "cable" directory to walk
        overwrite -- if exactly ``True`` and a "cables" collection exists,
            it is dropped before importing; also forwarded to
            ``process_cable``
        maxcables -- optional limit forwarded to ``walk_archive``
        """
        self.data_directory = join(data_directory, "cable")
        self.mongodb = CablegateDatabase(
            config['general']['mongodb'])["cablegate"]
        if overwrite is True and "cables" in self.mongodb.collection_names():
            self.mongodb.drop_collection("cables")
        self.walk_archive(overwrite, maxcables)

    def walk_archive(self, overwrite, maxcables):
        """
        Walks the archive directory.

        Every cable yielded by ``cables_from_directory`` is passed to
        ``process_cable``. The walk ends early when ``maxcables`` is not
        ``None`` and ``len(self.cable_list)`` has reached it. An
        ``OSError`` is logged through ``logging.error`` and swallowed.
        """
        self.cable_list = []
        if maxcables is not None and maxcables <= 0:
            # The in-loop check runs only after a cable was processed;
            # without this guard a zero limit would import one cable.
            return
        try:
            for cable in cables_from_directory(self.data_directory):
                self.process_cable(cable, overwrite)
                if (maxcables is not None
                        and len(self.cable_list) >= maxcables):
                    break
        except OSError as oserr:
            # "except X as e" is accepted by Python 2.6+ and Python 3.
            logging.error("%s", oserr)