Example #1
0
def _compile_labels_regex(labels):
    """
    Builds a case-insensitive regex matching any of the given labels
    as a whole word, or returns None when there is no label to match
    """
    escaped = [re.escape(label) for label in labels if label]
    if not escaped:
        return None
    # the alternation has to be grouped, otherwise the word boundaries only
    # apply to the first and to the last alternative ("\bfoo|bar\b")
    return re.compile(r"\b(?:%s)\b" % "|".join(escaped), re.I | re.M | re.U)


def search_subworker(config, content, year, doublet):
    """
    Responsible for matching the pair and incrementing cooccurrences count

    config: mapping whose 'cooccurrences' entry is handed to
        output.getConfiguredOutputs() (only read when a cooccurrence is found)
    content: text searched for the label forms of both n-grams
    year: string used as prefix of the cooccurrence '_id'
    doublet: pair of n-gram dicts, each providing 'id', 'label' and
        the label forms as the keys of ['edges']['label']

    Returns None. When both n-grams are found in content, the 'value' of the
    coocmatrix document "<year>_<id1>_<id2>" (or of the existing reversed
    "<year>_<id2>_<id1>" one) is incremented, creating it at 1 if needed.
    """
    first, second = doublet[0], doublet[1]
    logging.debug("looking for cooc of %s and %s", first['label'], second['label'])

    regex1 = _compile_labels_regex(first['edges']['label'])
    regex2 = _compile_labels_regex(second['edges']['label'])
    # an n-gram without any label form can not cooccur with anything
    if regex1 is None or regex2 is None:
        return
    if regex1.search(content) is None or regex2.search(content) is None:
        return

    logging.debug("found a cooc !")
    # the outputs are only needed once a cooccurrence has to be stored
    outputs = output.getConfiguredOutputs(config['cooccurrences'])
    coocmatrix = outputs['mongodb'].mongodb.coocmatrix

    # will look for both composed ID
    doublet_id12 = "_".join((year, first["id"], second["id"]))
    doublet_id21 = "_".join((year, second["id"], first["id"]))

    # 'id12' wins when it already exists, then an existing 'id21',
    # otherwise a new cooc line is created using 'id12' ID
    if coocmatrix.find_one({'_id': doublet_id12}) is None \
            and coocmatrix.find_one({'_id': doublet_id21}) is not None:
        target_id = doublet_id21
    else:
        target_id = doublet_id12

    # the update document holds the $inc operator only: MongoDB rejects
    # documents mixing update operators with plain fields such as '_id'.
    # With upsert, a missing document is created with 'value' set to 1.
    coocmatrix.update({'_id': target_id}, {'$inc': {'value': 1}}, upsert=True)
Example #2
0
def main(config):
    """
    main cooccurrences processor
    loads the configured whitelist, writes each of its n-grams to the
    'exportwhitelistcsv' output, then calls worker() on every notice
    of the input database, one notice after the other
    """
    settings = config['cooccurrences']
    whitelistpath = settings["whitelist"]["path"]
    logging.debug("loading whitelist from %s (id = %s)", whitelistpath, whitelistpath)

    # the whitelist file path is also used as the whitelist id and label
    wlimport = Reader('whitelist://' + whitelistpath, dialect="excel", encoding="ascii")
    wlimport.whitelist = whitelist.Whitelist(whitelistpath, whitelistpath)
    newwl = wlimport.parse_file()
    newwl['content'] = []

    # walks the whitelist n-grams until the cursor is exhausted
    ngramgenerator = newwl.getNGram()
    outputs = output.getConfiguredOutputs(settings)
    while True:
        try:
            ngid, ng = ngramgenerator.next()
        except StopIteration:
            break
        newwl['content'] += [ng]
        outputs['exportwhitelistcsv'].save("%s,%s\n" % (ngid, ng['label']))
    logging.debug('imported %d n-lemmes from the whitelist file %s',
        len(newwl['content']), whitelistpath)

    # notices are processed sequentially, in the current process
    notices_db = mongodbhandler.MongoDB(settings['input_db'])
    for notice in notices_db.notices.find(timeout=False):
        worker(config, notice, newwl)
Example #3
0
def exportcooc(config):
    """
    Writes every cooccurrence stored in the coocmatrix collection to the
    'coocmatrixcsv' output, one "ngram_i,ngram_j,count,year" line per pair
    """
    outputs = output.getConfiguredOutputs(config['cooccurrences'])
    stored_pairs = outputs['mongodb'].mongodb.coocmatrix.find()
    for record in stored_pairs:
        # the '_id' is composed as "<year>_<ngram i id>_<ngram j id>"
        period, first_id, second_id = record['_id'].split("_")
        count = record['value']
        outputs['coocmatrixcsv'].save(
            "%s,%s,%d,%s\n" % (first_id, second_id, count, period))
Example #4
0
def extract_worker(config, fieldname):
    """
    Saves into the output db "notices" every notice of the input db
    whose `fieldname` field matches the configured regexp
    """
    settings = config['extractor']
    source_db = mongodbhandler.MongoDB(settings['input_db'])
    outputs = output.getConfiguredOutputs(settings)

    pattern = re.compile(
        settings['filters']['regexp_content']['regexp'],
        re.I | re.U | re.M)
    query = {fieldname: {"$regex": pattern}}

    for notice in source_db.notices.find(query, timeout=False):
        outputs['mongodb'].save(notice, "notices")