Exemple #1
0
def foo():
    """Print unmatched company entities that have a confident index hit.

    Gathers the ``text`` of every entity whose ``type`` is ``'Company'``
    across the stories collection, removes the names already stored under
    ``contexts.entities.name``, and queries the ``'companies_index'``
    searcher for each remaining name.  When the first result scores above
    ``COSINE_THRESHOLD``, the entity text and the first field of that
    result are printed, followed by a blank line.

    Returns nothing; all output goes to stdout.
    """
    db = get_connection()
    searcher = GensimSearcher('companies_index')
    COSINE_THRESHOLD = .6

    # Restrict the query to stories that actually have an 'entities' field;
    # an unfiltered find() made story['entities'] raise KeyError on stories
    # without one.
    all_companies = set()
    for story in db.stories.find({'entities': {'$exists': True}}):
        # '$exists' also matches an explicit null, hence the `or ()`.
        for entity in story.get('entities') or ():
            if entity['type'] == 'Company':
                all_companies.add(entity['text'])

    matched_companies = set(db.stories.distinct('contexts.entities.name'))

    for company in all_companies - matched_companies:
        # Only the first result is inspected.
        # NOTE(review): presumably results are (name, score) pairs sorted
        # best-first -- confirm against GensimSearcher.search.
        search_results = searcher.search(company)[:1]
        if search_results and search_results[0][1] > COSINE_THRESHOLD:
            # Single-argument parenthesised prints emit the same text under
            # the Python 2 print statement and the Python 3 print function.
            print(company)
            print(search_results[0][0])
            print('')
Exemple #2
0
def debug():
    """Report how matched company names differ from the entity text.

    For every ``'Company'`` entity in stories that have an ``entities``
    field, the entity text is looked up in the ``'companies_index'``
    searcher.  If the first result scores above ``COSINE_THRESHOLD``, the
    company document whose ``name`` equals that result's first field is
    fetched.  When the two names differ (case-insensitively), both names
    and the set of lower-cased words present in only one of them are
    printed, and each such word is tallied.

    Returns:
        A ``collections.defaultdict(int)`` mapping each differing word to
        the number of times it was seen.
    """
    COSINE_THRESHOLD = .75
    searcher = GensimSearcher('companies_index')
    db = get_connection()
    company_name_difference = collections.defaultdict(int)

    for story in db.stories.find({'entities': {'$exists': True}}):
        for entity in story['entities']:
            if entity['type'] != 'Company':
                continue

            # NOTE(review): presumably results are (name, score) pairs
            # sorted best-first -- confirm against GensimSearcher.search.
            search_results = searcher.search(entity['text'])
            if not (search_results
                    and search_results[0][1] > COSINE_THRESHOLD):
                continue

            company = db.companies.find_one({'name': search_results[0][0]})
            if company is None:
                # The index returned a name with no matching company
                # document; skip it instead of failing on company['name'].
                continue

            entity_name = entity['text'].lower()
            company_name = company['name'].lower()
            if entity_name == company_name:
                continue

            # Words that appear in exactly one of the two names.
            differing_words = (
                set(entity_name.split()) ^ set(company_name.split())
            )

            # Single-argument parenthesised prints emit the same text under
            # the Python 2 print statement and the Python 3 print function.
            print(entity['text'])
            print(company['name'])
            print(differing_words)
            for diff in differing_words:
                company_name_difference[diff] += 1
            print('')

    return company_name_difference