Example #1
0
def main(config):
    """
    Entry point of the occurrences processing.

    Parses the whitelist named in the 'cooccurrences' configuration section,
    exports each of its n-grams as an "id,label" line, then calls worker()
    on every notice found in the input database, one after the other.
    """
    settings = config['cooccurrences']
    wl_path = settings["whitelist"]["path"]
    logging.debug("loading whitelist from %s (id = %s)"%(wl_path, wl_path))

    # the whitelist path is used both as id and as label of the Whitelist
    reader = Reader('whitelist://'+wl_path, dialect="excel", encoding="ascii")
    reader.whitelist = whitelist.Whitelist(wl_path, wl_path)
    newwl = reader.parse_file()

    # reset the cached content before walking the whitelist n-grams
    newwl['content'] = []
    ngram_cursor = newwl.getNGram()
    outputs = output.getConfiguredOutputs(settings)
    for ngid, ng in ngram_cursor:
        newwl['content'] += [ng]
        outputs['exportwhitelistcsv'].save("%s,%s\n"%(ngid,ng['label']))
    logging.debug(
        'imported %d n-lemmes from the whitelist file %s'
        %(len(newwl['content']), wl_path)
    )

    # notices are handled sequentially in this process
    database = mongodbhandler.MongoDB(settings['input_db'])
    for notice in database.notices.find(timeout=False):
        worker(config, notice, newwl)
Example #2
0
 def apply(self, record):
     """
     Accepts a notice when one of its configured fields matches the whitelist.

     Every field name listed in rules['fields'] that is present in `record`
     is checked: its value (a string, or a list of values) is passed to the
     whitelist's test() method.

     Returns 1 as soon as one value passes the test.
     Raises NoticeRejected when no checked value matches.
     """
     # NOTE(review): the return value of getRules() is used as the whitelist
     # path, while `rules` is never assigned in this method. Unless `rules`
     # is a module-level name this line raises NameError. Presumably both
     # values should be read from the mapping returned by getRules() --
     # confirm the expected keys before changing it.
     whitelistpath = self.getRules()
     extraction_fields = rules['fields']
     # NOTE(review): the whitelist file is parsed again on every call.
     wlimport = Reader('whitelist://'+whitelistpath, dialect="excel", encoding="ascii")
     wlimport.whitelist = whitelist.Whitelist( whitelistpath, whitelistpath )
     newwl = wlimport.parse_file()

     for tag in extraction_fields:
         if tag not in record:
             continue
         value = record[tag]
         # isinstance() also accepts subclasses of the expected types
         if isinstance(value, str) or isinstance(value, unicode):
             candidates = [value]
         elif isinstance(value, list):
             candidates = value
         else:
             # other value types are never tested against the whitelist
             continue
         for candidate in candidates:
             if newwl.test(candidate):
                 return 1
     # nothing matched: reject the notice
     raise NoticeRejected("notice did not match the whitelist")
Example #3
0
 def _importwhitelist(self):
     """
     Loads the whitelist file and caches all of its n-grams.

     The parsed whitelist is stored in self.newwl and its n-grams are
     collected into self.newwl['content']. Each n-gram is also written as an
     "id,label" line to the 'exportwhitelistcsv' output.

     Raises Exception when fewer than two n-grams were loaded.
     """
     whitelistpath = self.config["whitelist"]["path"]
     logging.debug("loading whitelist from %s (id = %s)"%(whitelistpath, whitelistpath))
     wlimport = Reader('whitelist://'+whitelistpath, dialect="excel", encoding="ascii")
     wlimport.whitelist = whitelist.Whitelist( whitelistpath, whitelistpath )
     self.newwl = wlimport.parse_file()

     self.newwl['content'] = []
     # getNGram() is walked as a cursor of (id, ngram) pairs; a plain for
     # loop ends on exhaustion without catching StopIteration by hand, so an
     # unrelated StopIteration raised by the export is no longer swallowed.
     for ngid, ng in self.newwl.getNGram():
         self.newwl['content'] += [ng]
         self.outputs['exportwhitelistcsv'].save("%s,%s\n"%(ngid,ng['label']))
     logging.debug('imported %d n-lemmes from the whitelist file %s'\
             %(len(self.newwl['content']), whitelistpath))
     # the check also fires on an empty whitelist, so the message says so
     if len(self.newwl['content'])<2:
         raise Exception("the whitelist contains fewer than two elements, aborting")
Example #4
0
 def _import_whitelist(
         self,
         whitelistpath,
         dataset = None,
         userstopwords = None,
         dialect="excel",
         encoding="utf_8"
     ):
     """
     Imports a whitelist and returns a whitelist object to be used as input
     of other methods.

     The source is selected in this order:
     1. _get_filepath_id(whitelistpath) returns an id: the file is parsed
        and the whitelist is identified by that id;
     2. `dataset` is a corpora.Corpora: the whitelist labelled
        `whitelistpath` is loaded from the dataset's storage;
     3. `whitelistpath` exists on disk: the file is parsed and the whitelist
        is identified by `dataset`.

     `dialect` and `encoding` are forwarded to the file Reader.
     `userstopwords` is accepted but not used by this method.

     Returns the whitelist object, or self.STATUS_ERROR when get_storage()
     returns that status.
     Raises Exception when none of the sources above applies.
     """
     def _parse_file(wl_id):
         # Parse the file at whitelistpath into a Whitelist identified by wl_id.
         self.logger.debug("loading whitelist from %s (%s)"%(whitelistpath, wl_id))
         wlimport = Reader('whitelist://'+whitelistpath, dialect=dialect, encoding=encoding)
         wlimport.whitelist = whitelist.Whitelist( wl_id, wl_id )
         return wlimport.parse_file()

     whitelist_id = self._get_filepath_id(whitelistpath)
     if whitelist_id is not None:
         ### whitelistpath EXISTS
         return _parse_file(whitelist_id)
     # NOT USED : TO CHECK
     if isinstance(dataset, corpora.Corpora):
         self._load_config()
         storage = self.get_storage(dataset, create=False, drop_tables=False)
         if storage == self.STATUS_ERROR:
             return self.STATUS_ERROR
         # whitelistpath is a whitelist label into storage; whitelist_id is
         # always None on this branch, so the label itself identifies the
         # whitelist to load.
         self.logger.debug("loading whitelist %s from storage"%whitelistpath)
         new_wl = whitelist.Whitelist( whitelistpath, whitelistpath )
         new_wl.loadFromStorage(storage, dataset)
         return new_wl
     if exists(whitelistpath):
         ### whitelist path is a real path but not in a correct format
         return _parse_file(dataset)
     raise Exception("unable to find a whitelist at %s"%whitelistpath)
Example #5
0
            outpath = self._get_user_filepath(
                dataset,
                'cooccurrences',
                "%s-matrix.csv"%(whitelist['label'])
            )
            exporter = Writer("coocmatrix://"+outpath)
            whitelist_outpath = self._get_user_filepath(
                dataset,
                'cooccurrences',
                "%s-terms.csv"%(whitelist['label'])
            )
            whitelist_exporter = Writer("basecsv://"+whitelist_outpath)
        else:
            exporter = None

        archive = Reader( format + "://" + path, **self.config['datasets'] )
        archive_walker = archive.walkArchive(periods)
        try:
            period_gen, period = archive_walker.next()
            sc = indexer.ArchiveCounter(self.config['datasets'], storage)
            walkCorpusGen = sc.walk_period(whitelist, period_gen, period)
            try:
                while 1:
                    yield walkCorpusGen.next()
            except StopIteration:
                pass
            writeMatrixGen = sc.write_matrix(period, exporter, whitelist_exporter, minCooc)
            try:
                while 1:
                    yield writeMatrixGen.next()
            except StopIteration, si: