def main():
    """Store extracted entities on every document matched by the '*' query.

    For each hit yielded by ``iterate_over_query``, the first element of
    ``hit['fields']['file']`` is wrapped as ``{'text': ...}`` and handed to
    ``call_mitie``. When that call returns a non-empty result, the document
    id and the result are printed and ``update_partial`` is invoked.
    Hits without a usable ``file`` field are skipped.
    """
    es = Elasticsearch()
    for hit in iterate_over_query(es, '*'):
        doc_id = hit['_id']
        try:
            payload = {'text': hit['fields']['file'][0]}
        except (KeyError, IndexError):
            # Missing 'fields'/'file' key or an empty 'file' list.
            continue

        found = call_mitie(payload)
        if len(found) == 0:
            continue

        # Single parenthesised argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (doc_id, found))
        # NOTE(review): the update is assumed to belong to the non-empty
        # branch; the flattened original does not show nesting -- confirm.
        update_partial(es, found, doc_id)
for elem in raw['entities']: entities.add(elem['text']) except KeyError: return list() return list(entities) def update_partial(entities, doc_id): # Partial update doc partial_doc = {'doc': {'entities': entities} } es.update(index='dossiers', doc_type='attachment', id=doc_id, body= partial_doc, refresh=True) es = Elasticsearch() it = iterate_over_query(es, '*') for doc in it: doc_id = doc['_id'] try: text = {'text': doc['fields']['file'][0]} except KeyError, IndexError: continue entities = call_mitie(text) if len(entities) > 0: print doc_id, entities update_partial(entities, doc_id)
reader = csv.reader(f) h = {x: i for i, x in enumerate(next(reader))} for row in reader: # Lowercase all entities[row[h['Entity 1 Name']]] = row[h['Entity 1 Type']] entities[row[h['Entity 2 Name']]] = row[h['Entity 2 Type']] del entities['Unknown'] out = open('entity_list.csv', 'wb') w = csv.writer(out) for e, t in entities.iteritems(): w.writerow([e, t]) out.close() it = iterate_over_query('*', fields='file') # Contains entities found in each group doc_entities = defaultdict(list) for i, row in enumerate(it): # Raw text doc_id = row['_id'] try: raw_text = row['fields']['file'][0].lower() except KeyError: # No file found continue # Look for entity matches inside text: for entity in entities:
from app.config import es_index
from elasticsearch import Elasticsearch
import sys
import os
# The parent of this file's directory must be on sys.path before the
# iterate_search import below, so the order of these lines is significant.
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
from iterate_search import iterate_over_query


def update_partial(doc_id, group):
    """Set the 'owner' field of one 'attachment' document to ``group``.

    Sends a partial update for the document ``doc_id`` in the index named
    by ``es_index``, with ``refresh=True``. Uses the module-level ``es``
    client, which is created after this definition.
    """
    es.update(
        index=es_index,
        doc_type='attachment',
        id=doc_id,
        body={'doc': {'owner': group}},
        refresh=True
    )


es = Elasticsearch()
for i, doc in enumerate(iterate_over_query(es, '*', fields='_id')):
    doc_id = doc['_id']
    update_partial(doc_id, 'Focus Africa')
    if i % 10:
        continue
    # Progress line on every tenth document. The single parenthesised
    # argument prints the same text on Python 2 and Python 3.
    print('%s %s' % (i, doc_id))