def main(config):
    """Run the co-occurrences processor.

    Loads the configured whitelist file, caches its n-grams (mirroring each
    one to the 'exportwhitelistcsv' output), then pushes every notice from
    the input MongoDB collection through worker().

    :param config: configuration dict; only config['cooccurrences'] is used
    """
    whitelistpath = config['cooccurrences']["whitelist"]["path"]
    # NOTE(review): the file path doubles as the whitelist id here — confirm intended
    logging.debug("loading whitelist from %s (id = %s)"%(whitelistpath, whitelistpath))
    wlimport = Reader('whitelist://'+whitelistpath, dialect="excel", encoding="ascii")
    wlimport.whitelist = whitelist.Whitelist( whitelistpath, whitelistpath )
    newwl = wlimport.parse_file()
    newwl['content'] = []
    # cursor over the Whitelist NGrams db
    ngramgenerator = newwl.getNGram()
    outputs = output.getConfiguredOutputs(config['cooccurrences'])
    try:
        # drain the cursor, caching each n-gram and exporting it as CSV
        while 1:
            # builtin next() instead of the Python-2-only .next() method
            ngid, ng = next(ngramgenerator)
            newwl['content'].append(ng)
            outputs['exportwhitelistcsv'].save("%s,%s\n"%(ngid, ng['label']))
    except StopIteration:
        logging.debug('imported %d n-lemmes from the whitelist file %s'\
            %(len(newwl['content']), whitelistpath))
    # renamed from 'input' to avoid shadowing the builtin
    notices_db = mongodbhandler.MongoDB(config['cooccurrences']['input_db'])
    #occspool = pool.Pool(processes=config['processes'])
    for notice in notices_db.notices.find(timeout=False):
        # kept sequential; the pooled variant is preserved above for reference
        #occspool.apply_async(worker, (config, notice, newwl))
        worker(config, notice, newwl)
def apply(self, record):
    """Filter a notice (record) against the whitelist.

    Tests every configured extraction field of the record; returns 1 as soon
    as one string (or one element of a list field) matches the whitelist.

    :param record: notice dict to test
    :returns: 1 if any configured field matches the whitelist
    :raises NoticeRejected: when no field matches
    """
    # BUGFIX: the original read rules['fields'] without ever binding 'rules'
    # (guaranteed NameError). Bind the rules object once and take both the
    # whitelist path and the field list from it.
    # TODO(review): confirm getRules() returns a mapping with 'path' and 'fields'.
    rules = self.getRules()
    whitelistpath = rules['path']
    extraction_fields = rules['fields']
    wlimport = Reader('whitelist://'+whitelistpath, dialect="excel", encoding="ascii")
    wlimport.whitelist = whitelist.Whitelist( whitelistpath, whitelistpath )
    newwl = wlimport.parse_file()
    for tag in extraction_fields:
        if tag not in record:
            continue
        value = record[tag]
        # isinstance() instead of type() comparisons (also accepts subclasses)
        if isinstance(value, (str, unicode)):
            if newwl.test(value):
                return 1
        elif isinstance(value, list):
            for field in value:
                if newwl.test(field):
                    return 1
    # no field matched: reject (unreachable 'return 0' after the raise removed)
    raise NoticeRejected("notice did not match the whitelist")
def _importwhitelist(self):
    """Load and cache all n-grams of the configured whitelist.

    Populates self.newwl['content'] with every n-gram from the whitelist
    file and mirrors each one to the 'exportwhitelistcsv' output.

    :raises Exception: if fewer than two n-grams were loaded
        (co-occurrences need at least a pair)
    """
    whitelistpath = self.config["whitelist"]["path"]
    # NOTE(review): the file path doubles as the whitelist id here — confirm intended
    logging.debug("loading whitelist from %s (id = %s)"%(whitelistpath, whitelistpath))
    wlimport = Reader('whitelist://'+whitelistpath, dialect="excel", encoding="ascii")
    wlimport.whitelist = whitelist.Whitelist( whitelistpath, whitelistpath )
    self.newwl = wlimport.parse_file()
    # moved the setup lines out of the try block so only the draining loop
    # can terminate it via StopIteration
    self.newwl['content'] = []
    # cursor over the Whitelist NGrams db
    ngramgenerator = self.newwl.getNGram()
    try:
        while 1:
            # builtin next() instead of the Python-2-only .next() method
            ngid, ng = next(ngramgenerator)
            self.newwl['content'].append(ng)
            self.outputs['exportwhitelistcsv'].save("%s,%s\n"%(ngid, ng['label']))
    except StopIteration:
        logging.debug('imported %d n-lemmes from the whitelist file %s'\
            %(len(self.newwl['content']), whitelistpath))
    if len(self.newwl['content']) < 2:
        raise Exception("the whitelist contains only one element, aborting")
def _import_whitelist( self, whitelistpath, dataset = None, userstopwords = None, dialect="excel", encoding="utf_8" ):
    """Import a whitelist, either from a file path or from storage.

    Returns a whitelist object to be used as input of other methods.
    """
    reader_options = { 'dialect': dialect, 'encoding': encoding }
    whitelist_id = self._get_filepath_id(whitelistpath)

    def _read_from_file(file_id):
        # parse the whitelist file through the Reader facade
        importer = Reader('whitelist://'+whitelistpath, **reader_options)
        importer.whitelist = whitelist.Whitelist( file_id, file_id )
        return importer.parse_file()

    if whitelist_id is not None:
        ### whitelistpath EXISTS
        self.logger.debug("loading whitelist from %s (%s)"%(whitelistpath, whitelist_id))
        return _read_from_file(whitelist_id)

    if isinstance(dataset, corpora.Corpora):
        # NOT USED : TO CHECK
        # whitelistpath is a whitelist label into storage
        self._load_config()
        storage = self.get_storage(dataset, create=False, drop_tables=False)
        if storage == self.STATUS_ERROR:
            return self.STATUS_ERROR
        self.logger.debug("loading whitelist %s from storage"%whitelist_id)
        stored_wl = whitelist.Whitelist( whitelist_id, whitelist_id )
        stored_wl.loadFromStorage(storage, dataset)
        return stored_wl

    if exists(whitelistpath):
        ### whitelist path is a real path but not in a correct format
        whitelist_id = dataset
        self.logger.debug("loading whitelist from %s (%s)"%(whitelistpath, whitelist_id))
        return _read_from_file(whitelist_id)

    raise Exception("unable to find a whitelist at %s"%whitelistpath)
outpath = self._get_user_filepath( dataset, 'cooccurrences', "%s-matrix.csv"%(whitelist['label']) ) exporter = Writer("coocmatrix://"+outpath) whitelist_outpath = self._get_user_filepath( dataset, 'cooccurrences', "%s-terms.csv"%(whitelist['label']) ) whitelist_exporter = Writer("basecsv://"+whitelist_outpath) else: exporter = None archive = Reader( format + "://" + path, **self.config['datasets'] ) archive_walker = archive.walkArchive(periods) try: period_gen, period = archive_walker.next() sc = indexer.ArchiveCounter(self.config['datasets'], storage) walkCorpusGen = sc.walk_period(whitelist, period_gen, period) try: while 1: yield walkCorpusGen.next() except StopIteration: pass writeMatrixGen = sc.write_matrix(period, exporter, whitelist_exporter, minCooc) try: while 1: yield writeMatrixGen.next() except StopIteration, si: