Example 1
def main():
    # Depends on module-level helpers: iterate_over_query, call_mitie,
    # update_partial (see the other examples in this listing).
    es = Elasticsearch()
    itr = iterate_over_query(es, '*')

    for doc in itr:
        doc_id = doc['_id']
        try:
            # Extract the stored text of the attached file
            text = {'text': doc['fields']['file'][0]}
        except (KeyError, IndexError):
            # No extracted file content for this document; skip it
            continue

        # Run named-entity extraction over the document text
        entities = call_mitie(text)

        if entities:
            print(doc_id, entities)
            update_partial(es, entities, doc_id)
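
Every example in this listing leans on a project-local helper, iterate_over_query, whose definition is never shown. A minimal sketch of what it plausibly looks like, assuming it wraps the scan/scroll helper from elasticsearch-py (the index name and the fields handling are assumptions):

from elasticsearch.helpers import scan

def iterate_over_query(es, query, fields=None):
    # Hypothetical reconstruction: lazily yield every hit matching a
    # query_string query via the scan/scroll helper.
    body = {'query': {'query_string': {'query': query}}}
    if fields is not None:
        # Older Elasticsearch releases return selected stored fields
        # under hit['fields'], which is what these examples read.
        body['fields'] = [fields] if isinstance(fields, str) else list(fields)
    return scan(es, query=body, index='dossiers', scroll='5m')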
Example 2
def call_mitie(text):
    # (The opening of this function is cut off in the listing; before
    # this point it obtains the raw MITIE response as `raw`.)
    entities = set()
    try:
        for elem in raw['entities']:
            entities.add(elem['text'])
    except KeyError:
        return list()

    return list(entities)

def update_partial(entities, doc_id):
    # Partial update: merge the extracted entities into the stored doc
    partial_doc = {'doc': {'entities': entities}}

    es.update(index='dossiers', doc_type='attachment', id=doc_id,
              body=partial_doc, refresh=True)

es = Elasticsearch()
it = iterate_over_query(es, '*')

for doc in it:
    doc_id = doc['_id']
    try:
        text = {'text': doc['fields']['file'][0]}
    except (KeyError, IndexError):
        continue

    entities = call_mitie(text)

    if entities:
        print(doc_id, entities)
        update_partial(entities, doc_id)
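
A note on the pattern above: es.update with a {'doc': ...} body does a partial merge on the server, so only the listed fields change and the rest of the document is preserved, while refresh=True makes the change searchable immediately at the cost of extra indexing load. A small illustration (index, id, and field values are made up):

# Hypothetical document for illustration
es.index(index='dossiers', doc_type='attachment', id='42',
         body={'title': 'report.pdf', 'owner': 'nobody'})
# Merges 'entities' into the doc; 'title' and 'owner' stay intact
es.update(index='dossiers', doc_type='attachment', id='42',
          body={'doc': {'entities': ['ACME Corp', 'John Doe']}})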

Example 3
import csv
from collections import defaultdict

# (The opening of this snippet is cut off in the listing; `entities` is
# a dict of entity name -> type filled from a CSV of entity pairs. The
# file name below is a placeholder.)
entities = {}
with open('entity_pairs.csv') as f:
    reader = csv.reader(f)
    # Map each CSV header name to its column index
    h = {x: i for i, x in enumerate(next(reader))}
    for row in reader:
        # Record both entities of the pair with their types
        entities[row[h['Entity 1 Name']]] = row[h['Entity 1 Type']]
        entities[row[h['Entity 2 Name']]] = row[h['Entity 2 Type']]

# Drop the placeholder entry
del entities['Unknown']

with open('entity_list.csv', 'w', newline='') as out:
    w = csv.writer(out)
    for e, t in entities.items():
        w.writerow([e, t])

it = iterate_over_query('*', fields='file')

# Entities found in each document
doc_entities = defaultdict(list)

for i, row in enumerate(it):
    doc_id = row['_id']
    try:
        # Raw text of the attached file, lowercased for matching
        raw_text = row['fields']['file'][0].lower()
    except KeyError:
        # No file found
        continue

    # Look for entity matches inside the text
    for entity in entities:
        # (listing is truncated here; presumably each entity whose name
        # appears in raw_text gets recorded)
        if entity.lower() in raw_text:
            doc_entities[doc_id].append(entity)
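
The matching loop above runs one substring test per entity per document. If the entity list is large, a single compiled regex alternation scans each document once instead; a sketch with made-up sample data:

import re

entities = {'acme corp': 'Organization', 'john doe': 'Person'}  # assumed data
raw_text = 'minutes: john doe met acme corp about the dossier.'

# Longest names first, so overlapping names do not shadow each other
names = sorted(entities, key=len, reverse=True)
pattern = re.compile('|'.join(re.escape(n) for n in names))

print(set(pattern.findall(raw_text)))  # {'john doe', 'acme corp'}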
Example 4
import os
import sys

from app.config import es_index
from elasticsearch import Elasticsearch

# Make the project root importable so the shared helper resolves
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
from iterate_search import iterate_over_query


def update_partial(doc_id, group):
    # Partial update: set the 'owner' field, leaving the rest of the doc intact
    partial_doc = {'doc': {'owner': group}}

    es.update(index=es_index,
              doc_type='attachment',
              id=doc_id,
              body=partial_doc,
              refresh=True)

es = Elasticsearch()
it = iterate_over_query(es, '*', fields='_id')

for i, doc in enumerate(it):
    doc_id = doc['_id']
    update_partial(doc_id, 'Focus Africa')

    # Print progress every 10 documents
    if i % 10 == 0:
        print(i, doc_id)
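
Updating one document per request with refresh=True forces an index refresh on every call, which is the slowest way to tag a whole index. If throughput matters, the bulk helper from elasticsearch-py can batch the same partial updates; a sketch ('dossiers' stands in for es_index here):

from elasticsearch.helpers import bulk

def set_owner_bulk(es, doc_ids, group, index='dossiers'):
    # One bulk request of partial updates instead of N round trips
    actions = ({'_op_type': 'update',
                '_index': index,
                '_type': 'attachment',
                '_id': doc_id,
                'doc': {'owner': group}} for doc_id in doc_ids)
    return bulk(es, actions)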