Beispiel #1
0
def index_document(document_id):
    """Index one document and its pages or records in Elasticsearch.

    The session is cleared, then the document is looked up by
    ``document_id``. A missing document is logged and nothing else happens.
    Otherwise the document dict is extended with its entities and with
    latinized copies of the title and summary, written to the index, and
    ``clear_children`` is called for the document. Text documents then have
    their pages bulk-indexed, tabular documents their records.

    Exceptions raised by the bulk step are logged and not re-raised.
    """
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return

    log.info("Index document: %r", document)
    body = document.to_dict()
    body['entities'] = generate_entities(document)
    # Keep a latinized variant next to each free-text field.
    for source_key, latin_key in (('title', 'title_latin'),
                                  ('summary', 'summary_latin')):
        body[latin_key] = latinize_text(body.get(source_key))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=body,
             id=document.id)
    clear_children(document)

    # Options shared by both bulk calls below.
    bulk_options = dict(stats_only=True, chunk_size=2000,
                        request_timeout=60.0)
    try:
        if document.type == Document.TYPE_TEXT:
            bulk(es, generate_pages(document), **bulk_options)

        if document.type == Document.TYPE_TABULAR:
            bulk(es, generate_records(document), **bulk_options)
    except Exception as ex:
        log.exception(ex)
Beispiel #2
0
def index_document(document_id):
    """Index one document plus its pages or records, reporting failures.

    A document that cannot be found is logged and skipped. For an existing
    document, everything from building the index body to the bulk upload
    runs inside a single ``try``: any exception is logged and also passed
    to ``process.exception`` together with the document's id, meta and
    source id. The exception is not re-raised.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return

    try:
        log.info("Index document: %r", document)
        payload = document.to_index_dict()
        payload['entities'] = generate_entities(document)
        payload['title_latin'] = latinize_text(payload.get('title'))
        payload['summary_latin'] = latinize_text(payload.get('summary'))
        get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                       body=payload, id=document.id)

        clear_children(document)

        # Both bulk uploads use the same settings.
        bulk_settings = {
            'stats_only': True,
            'chunk_size': 2000,
            'request_timeout': 60.0,
        }
        if document.type == Document.TYPE_TEXT:
            bulk(get_es(), generate_pages(document), **bulk_settings)

        if document.type == Document.TYPE_TABULAR:
            bulk(get_es(), generate_records(document), **bulk_settings)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INDEX, component=__name__,
                          document_id=document.id, meta=document.meta,
                          source_id=document.source_id, exception=ex)
Beispiel #3
0
def records_query(document_id, args, size=5):
    """Build the Elasticsearch request body for searching records.

    Search terms are the stripped ``q`` argument (when non-empty) plus the
    ``terms`` of every entity returned by ``Entity.by_id_set`` for the
    ``entity`` arguments. Every term contributes two ``should`` clauses: a
    boosted match on ``text`` and a match on ``text_latin`` using the
    latinized term. At least one clause has to match.

    :param document_id: when not ``None``, only records with this
        ``document_id`` are matched.
    :param args: request arguments; must provide ``get`` and ``getlist``.
    :param size: value of the ``size`` key of the request body.
    :returns: the request body as a dict, or ``None`` when there are no
        search terms at all.
    """
    terms = []
    text = args.get('q', '').strip()
    if text:
        terms.append(text)

    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        terms.extend(entity.terms)

    if not terms:
        return None

    shoulds = []
    for term in terms:
        shoulds.append({
            'match': {
                'text': {
                    'query': term,
                    'boost': 10,
                    'operator': 'and'
                }
            }
        })
        shoulds.append({
            'match': {
                'text_latin': {
                    'query': latinize_text(term),
                    'operator': 'and'
                }
            }
        })

    q = {
        'bool': {
            'minimum_should_match': 1,
            'should': shoulds
        }
    }
    if document_id is not None:
        q['bool']['must'] = {
            'term': {'document_id': document_id}
        }

    # Fall back to the default fragment size when ``snippet`` is not a
    # number. Only conversion errors are caught, so that unrelated
    # exceptions (e.g. KeyboardInterrupt) are not swallowed.
    try:
        snippet = int(args.get('snippet', 150))
    except (TypeError, ValueError):
        snippet = 150

    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {'fragment_size': snippet},
                'text_latin': {'fragment_size': snippet}
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
Beispiel #4
0
def index_document(document_id):
    """Write a document and its records to the Elasticsearch index.

    A missing document is logged and skipped. Otherwise the document's
    index dict is extended with its entities and with latinized copies of
    the title and summary, and indexed under the document id. Afterwards
    ``clear_records`` is called and everything produced by
    ``generate_records`` is sent through ``bulk``.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return

    log.info("Index document: %r", document)
    doc_body = document.to_index_dict()
    doc_body['entities'] = generate_entities(document)
    title = doc_body.get('title')
    doc_body['title_latin'] = latinize_text(title)
    summary = doc_body.get('summary')
    doc_body['summary_latin'] = latinize_text(summary)
    get_es().index(
        index=get_es_index(),
        doc_type=TYPE_DOCUMENT,
        body=doc_body,
        id=document.id,
    )

    clear_records(document)
    bulk(
        get_es(),
        generate_records(document),
        stats_only=True,
        chunk_size=2000,
        request_timeout=60.0,
    )
Beispiel #5
0
def generate_records(document):
    """Yield one bulk-index action per row of every table in ``document``.

    The action id is the SHA-1 hex digest of the document id, the table's
    sheet and the row's ``_id``. The ``_id`` key is popped from the row, so
    the mapping stored under ``raw`` no longer contains it. ``text`` holds
    the distinct non-``None`` row values.
    """
    for table in document.tables:
        for row in table:
            row_id = row.pop('_id')

            digest = sha1(str(document.id))
            for part in (table.schema.sheet, row_id):
                digest.update(str(part))
            record_id = digest.hexdigest()

            # Drop empty cells and collapse duplicate values.
            values = list(set(v for v in row.values() if v is not None))

            yield {
                '_id': record_id,
                '_type': TYPE_RECORD,
                '_index': es_index,
                '_parent': document.id,
                '_source': {
                    'id': record_id,
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'source_id': document.source_id,
                    'row_id': row_id,
                    'sheet': table.schema.sheet,
                    'text': values,
                    # NOTE(review): the whole list is handed to
                    # latinize_text here -- confirm it accepts lists.
                    'text_latin': latinize_text(values),
                    'raw': row
                }
            }
Beispiel #6
0
def generate_records(document):
    """Yield bulk-index actions for the pages or rows of ``document``.

    Text documents produce one ``page`` action per page, with an id derived
    from the SHA-1 of the document id and page id. Tabular documents
    produce one ``row`` action per record, using the record's own ``tid``.
    Documents of any other type produce nothing.
    """
    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            digest = sha1(str(document.id))
            digest.update(str(page.id))
            yield {
                '_id': digest.hexdigest(),
                '_type': TYPE_RECORD,
                '_index': get_es_index(),
                '_parent': document.id,
                '_source': {
                    'type': 'page',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'source_id': document.source_id,
                    'page': page.number,
                    'text': page.text,
                    'text_latin': latinize_text(page.text)
                }
            }
        return

    if document.type == Document.TYPE_TABULAR:
        for record in document.records:
            values = record.text
            # Keep only latinized forms that differ from every original
            # value of the record.
            latinized = []
            for value in values:
                candidate = latinize_text(value)
                if candidate not in values:
                    latinized.append(candidate)
            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': get_es_index(),
                '_parent': document.id,
                '_source': {
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'source_id': document.source_id,
                    'row_id': record.row_id,
                    'sheet': record.sheet,
                    'text': values,
                    'text_latin': latinized,
                    'raw': record.data
                }
            }
Beispiel #7
0
def generate_records(document):
    """Generate the Elasticsearch bulk actions for a document's children.

    For a text document each page becomes an action of type ``page``; its
    id is the SHA-1 hex digest of the document id followed by the page id.
    For a tabular document each record becomes an action of type ``row``
    whose id is ``record.tid``. Every action names the document id as its
    ``_parent``.
    """
    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            hasher = sha1(str(document.id))
            hasher.update(str(page.id))
            page_key = hasher.hexdigest()
            page_text = page.text
            yield {
                '_id': page_key,
                '_type': TYPE_RECORD,
                '_index': get_es_index(),
                '_parent': document.id,
                '_source': {
                    'type': 'page',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'source_id': document.source_id,
                    'page': page.number,
                    'text': page_text,
                    'text_latin': latinize_text(page.text)
                }
            }
    elif document.type == Document.TYPE_TABULAR:
        for record in document.records:
            originals = record.text
            converted = [latinize_text(item) for item in originals]
            # Latinized values that equal an original add nothing new.
            extra_latin = [item for item in converted
                           if item not in originals]
            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': get_es_index(),
                '_parent': document.id,
                '_source': {
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'source_id': document.source_id,
                    'row_id': record.row_id,
                    'sheet': record.sheet,
                    'text': originals,
                    'text_latin': extra_latin,
                    'raw': record.data
                }
            }
Beispiel #8
0
def index_document(document_id):
    """Index the document with the given id together with its records.

    When no document exists for ``document_id`` this is logged and the
    function returns. Otherwise the index dict of the document gains an
    ``entities`` entry and latinized title/summary entries before it is
    indexed. ``clear_records`` is then called and the output of
    ``generate_records`` is uploaded with ``bulk``.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return

    log.info("Index document: %r", document)
    fields = document.to_index_dict()
    fields['entities'] = generate_entities(document)
    # Latinized copies are stored under "<field>_latin".
    for plain, latin in (('title', 'title_latin'),
                         ('summary', 'summary_latin')):
        fields[latin] = latinize_text(fields.get(plain))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=fields, id=document.id)

    clear_records(document)
    bulk_params = {
        'stats_only': True,
        'chunk_size': 2000,
        'request_timeout': 60.0,
    }
    bulk(get_es(), generate_records(document), **bulk_params)
Beispiel #9
0
def text_query_string(text, literal=False):
    """Build a ``query_string`` query over the record text fields.

    :param text: the query text; ``None`` or whitespace-only text yields
        the result of ``match_all()`` instead.
    :param literal: when true, the latinized text wrapped in double quotes
        is used as the query.
    :returns: a query dict searching ``text`` and ``text_latin``.
    """
    if text is None or not text.strip():
        return match_all()

    query = ('"%s"' % latinize_text(text)) if literal else text
    return {
        'query_string': {
            'query': query,
            'fields': ['text^6', 'text_latin^2'],
            'default_operator': 'AND',
            'use_dis_max': True
        }
    }
Beispiel #10
0
def text_query_string(text, literal=False):
    """Return a ``query_string`` query for ``text`` on the text fields.

    Blank input (``None`` or only whitespace) is answered with
    ``match_all()``. With ``literal`` set, the query becomes the latinized
    text enclosed in double quotes; otherwise ``text`` is used unchanged.
    """
    is_blank = text is None or len(text.strip()) == 0
    if is_blank:
        return match_all()

    if literal:
        text = '"%s"' % latinize_text(text)

    options = {
        'query': text,
        'fields': ['text^6', 'text_latin^2'],
        'default_operator': 'AND',
        'use_dis_max': True
    }
    return {'query_string': options}
Beispiel #11
0
def meta_query_string(text, literal=False):
    """Build a ``query_string`` query over the document metadata fields.

    :param text: the query text; ``None`` or whitespace-only text yields
        the result of ``match_all()`` instead.
    :param literal: when true, the latinized text wrapped in double quotes
        is used as the query.
    :returns: a query dict searching title, file name and summary fields
        (including their ``_latin`` variants) with per-field boosts.
    """
    if text is None or not text.strip():
        return match_all()

    query = ('"%s"' % latinize_text(text)) if literal else text
    boosted_fields = [
        'title^15',
        'file_name',
        'summary^10',
        'title_latin^12',
        'summary_latin^8',
    ]
    return {
        "query_string": {
            "query": query,
            "fields": boosted_fields,
            "default_operator": "AND",
            "use_dis_max": True
        }
    }
Beispiel #12
0
def normalize(text):
    """Return a normalized form of ``text``, or ``None`` for non-strings.

    Byte strings are decoded as UTF-8 on Python 2. The text is lower-cased,
    latinized and NFKD-normalized. Each character is then replaced by the
    entry ``CATEGORIES`` holds for the first letter of its Unicode category
    (characters without an entry are kept). Finally matches of ``COLLAPSE``
    are replaced by ``WS`` and ``WS`` is stripped from both ends.
    """
    if not isinstance(text, six.string_types):
        return None

    if six.PY2 and not isinstance(text, six.text_type):
        text = text.decode('utf-8')

    decomposed = unicodedata.normalize('NFKD', latinize_text(text.lower()))
    replaced = u''.join([
        CATEGORIES.get(unicodedata.category(char)[0], char)
        for char in decomposed
    ])
    return COLLAPSE.sub(WS, replaced).strip(WS)
Beispiel #13
0
def meta_query_string(text, literal=False):
    """Return a ``query_string`` query for ``text`` on metadata fields.

    Blank input (``None`` or only whitespace) is answered with
    ``match_all()``. With ``literal`` set, the query becomes the latinized
    text enclosed in double quotes; otherwise ``text`` is used unchanged.
    The query covers the title, file name and summary fields as well as
    the latinized title and summary.
    """
    if text is None:
        return match_all()
    if not len(text.strip()):
        return match_all()

    if literal:
        text = '"%s"' % latinize_text(text)

    settings = {
        "query": text,
        "fields": ['title^15', 'file_name', 'summary^10',
                   'title_latin^12', 'summary_latin^8'],
        "default_operator": "AND",
        "use_dis_max": True,
    }
    return {"query_string": settings}
Beispiel #14
0
def generate_pages(document):
    """Yield one bulk-index action for every page of ``document``.

    The action id is the SHA-1 hex digest of the document id followed by
    the page id; the document id is used as ``_parent``. The source holds
    the page number, the page text and its latinized form.
    """
    for page in document.pages:
        hasher = sha1(str(document.id))
        hasher.update(str(page.id))
        page_key = hasher.hexdigest()
        yield {
            '_id': page_key,
            '_type': TYPE_RECORD,
            '_index': get_es_index(),
            '_parent': document.id,
            '_source': {
                'type': 'page',
                'content_hash': document.content_hash,
                'document_id': document.id,
                'source_id': document.source_id,
                'page': page.number,
                'text': page.text,
                'text_latin': latinize_text(page.text)
            }
        }
Beispiel #15
0
def generate_records(document):
    """Yield one bulk-index action for every record of ``document``.

    Each action uses ``record.tid`` as its id and the document id,
    converted with ``unicode``, as ``_parent``. ``text_latin`` contains
    only those latinized values that are not already among the record's
    original text values.
    """
    for record in document.records:
        values = record.text
        latinized = []
        for value in values:
            candidate = latinize_text(value)
            # Skip latinized forms identical to an existing value.
            if candidate not in values:
                latinized.append(candidate)
        yield {
            '_id': record.tid,
            '_type': TYPE_RECORD,
            '_index': get_es_index(),
            '_parent': unicode(document.id),
            '_source': {
                'type': 'row',
                'content_hash': document.content_hash,
                'document_id': document.id,
                'source_id': document.source_id,
                'row_id': record.row_id,
                'sheet': record.sheet,
                'text': values,
                'text_latin': latinized,
                'raw': record.data
            }
        }
Beispiel #16
0
def tabular_query(document_id, sheet, args):
    """Build the Elasticsearch request body for the rows of one sheet.

    Without a ``q`` argument every row matches. With ``q``, a row matches
    when either its ``text`` matches the query or its ``text_latin``
    matches the latinized query. Integer ``row`` arguments add a strongly
    boosted optional clause for those row ids. The result is always
    filtered to the given document and sheet via ``add_filter``.

    :param document_id: value for the ``document_id`` term filter.
    :param sheet: value for the ``sheet`` term filter.
    :param args: request arguments; must provide ``get`` and ``getlist``.
    :returns: the request body dict (first 100 hits, sorted by ``row_id``,
        preceded by ``_score`` when a text or row clause is present).
    """
    scored = False
    q = {
        'match_all': {}
    }

    text = args.get('q', '').strip()
    if text:
        scored = True
        text_latin = latinize_text(text)
        q = {
            "bool": {
                # Both alternatives live in one list. Repeating the
                # "should" key in a dict literal keeps only the last
                # entry, which would silently drop the "text" match.
                "minimum_should_match": 1,
                "should": [
                    {
                        "match": {
                            "text": {
                                "query": text,
                                "cutoff_frequency": 0.0007,
                                "operator": "and"
                            }
                        }
                    },
                    {
                        "match": {
                            "text_latin": {
                                "query": text_latin,
                                "cutoff_frequency": 0.0007,
                                "operator": "and"
                            }
                        }
                    }
                ]
            }
        }

    # Best effort: any unparsable ``row`` value disables row boosting.
    try:
        rows = [int(r) for r in args.getlist('row')]
    except Exception:
        rows = []

    if rows:
        scored = True
        q = {
            "bool": {
                "must": q,
                "should": {
                    "constant_score": {
                        "filter": {'terms': {'row_id': rows}},
                        "boost": 1000
                    }
                }
            }
        }

    q = add_filter(q, {'term': {'document_id': document_id}})
    q = add_filter(q, {'term': {'sheet': sheet}})

    sort = [{'row_id': 'asc'}]
    if scored:
        sort.insert(0, '_score')
    return {
        'from': 0,
        'size': 100,
        'query': q,
        'sort': sort,
        '_source': ['document_id', 'sheet', 'row_id', 'raw']
    }
Beispiel #17
0
def text_query(text):
    """Construct the part of a query which is responsible for finding a
    piece of text in the selected documents.

    Blank text (empty after stripping) yields a ``match_all`` query.
    Otherwise a document matches when at least one of these holds: its
    title/file name/summary match the text, its latinized title/summary
    match the latinized text, the text occurs as a phrase in those fields,
    or one of its child records matches on ``text`` or ``text_latin``.

    :param text: the search text; must be a string.
    :returns: the query dict.
    """
    text = text.strip()
    if not text:
        return {'match_all': {}}

    text_latin = latinize_text(text)

    # Both alternatives live in one list. Repeating the "should" key in a
    # dict literal keeps only the last entry, which would silently drop
    # the "text" match from the child query.
    child_query = {
        "bool": {
            "should": [
                {
                    "match": {
                        "text": {
                            "query": text,
                            "cutoff_frequency": 0.0007,
                            "operator": "and"
                        }
                    }
                },
                {
                    "match": {
                        "text_latin": {
                            "query": text_latin,
                            "cutoff_frequency": 0.0007,
                            "operator": "and"
                        }
                    }
                }
            ]
        }
    }

    return {
        "bool": {
            "minimum_should_match": 1,
            "should": [
                {
                    "multi_match": {
                        "query": text,
                        "fields": ['title^100', 'file_name^10', 'summary^2'],
                        "type": "most_fields",
                        "cutoff_frequency": 0.0007,
                        "operator": "and",
                    }
                },
                {
                    "multi_match": {
                        "query": text_latin,
                        "fields": ['title_latin^100', 'summary_latin^2'],
                        "type": "most_fields",
                        "cutoff_frequency": 0.0007,
                        "operator": "and",
                    }
                },
                {
                    "multi_match": {
                        "query": text,
                        "fields": ['title^100', 'file_name^10', 'summary^2'],
                        "type": "phrase"
                    }
                },
                {
                    "has_child": {
                        "type": TYPE_RECORD,
                        "score_mode": "avg",
                        "query": child_query
                    }
                }
            ]
        }
    }