def index_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data,
             id=document.id)
    clear_children(document)
    try:
        if document.type == Document.TYPE_TEXT:
            bulk(es, generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
        if document.type == Document.TYPE_TABULAR:
            bulk(es, generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)

def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    try:
        log.info("Index document: %r", document)
        data = document.to_index_dict()
        data['entities'] = generate_entities(document)
        data['title_latin'] = latinize_text(data.get('title'))
        data['summary_latin'] = latinize_text(data.get('summary'))
        get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                       body=data, id=document.id)
        clear_children(document)
        if document.type == Document.TYPE_TEXT:
            bulk(get_es(), generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
        if document.type == Document.TYPE_TABULAR:
            bulk(get_es(), generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INDEX, component=__name__,
                          document_id=document.id, meta=document.meta,
                          source_id=document.source_id, exception=ex)

def records_query(document_id, args, size=5):
    terms = []
    text = args.get('q', '').strip()
    if len(text):
        terms.append(text)
    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        terms.extend(entity.terms)
    if not len(terms):
        return None
    shoulds = []
    for term in terms:
        shoulds.append({
            'match': {
                'text': {
                    'query': term,
                    'boost': 10,
                    'operator': 'and'
                }
            }
        })
        shoulds.append({
            'match': {
                'text_latin': {
                    'query': latinize_text(term),
                    'operator': 'and'
                }
            }
        })
    q = {
        'bool': {
            'minimum_should_match': 1,
            'should': shoulds
        }
    }
    if document_id is not None:
        q['bool']['must'] = {
            'term': {'document_id': document_id}
        }
    try:
        snippet = int(args.get('snippet', 150))
    except (TypeError, ValueError):
        snippet = 150
    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {'fragment_size': snippet},
                'text_latin': {'fragment_size': snippet}
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }

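# Hedged usage sketch (not part of the original module): the body returned by
# records_query() is assumed to be executed with the elasticsearch-py client
# accessors used elsewhere in this file; `args` is a Werkzeug-style MultiDict
# such as Flask's request.args. All names introduced here are illustrative.
from werkzeug.datastructures import MultiDict


def search_records_sketch(document_id, text):
    args = MultiDict([('q', text)])
    query = records_query(document_id, args)
    if query is None:
        # No search terms were given, so there is nothing to query.
        return []
    result = get_es().search(index=get_es_index(), doc_type=TYPE_RECORD,
                             body=query)
    return result.get('hits', {}).get('hits', [])
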
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=data, id=document.id)
    clear_records(document)
    bulk(get_es(), generate_records(document), stats_only=True,
         chunk_size=2000, request_timeout=60.0)

def generate_records(document):
    for table in document.tables:
        for row in table:
            row_id = row.pop('_id')
            tid = sha1(str(document.id))
            tid.update(str(table.schema.sheet))
            tid.update(str(row_id))
            tid = tid.hexdigest()
            text = [t for t in row.values() if t is not None]
            text = list(set(text))
            yield {
                '_id': tid,
                '_type': TYPE_RECORD,
                '_index': es_index,
                '_parent': document.id,
                '_source': {
                    'id': tid,
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'source_id': document.source_id,
                    'row_id': row_id,
                    'sheet': table.schema.sheet,
                    'text': text,
                    'text_latin': latinize_text(text),
                    'raw': row
                }
            }

def generate_records(document):
    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            tid = sha1(str(document.id))
            tid.update(str(page.id))
            tid = tid.hexdigest()
            yield {
                '_id': tid,
                '_type': TYPE_RECORD,
                '_index': get_es_index(),
                '_parent': document.id,
                '_source': {
                    'type': 'page',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'source_id': document.source_id,
                    'page': page.number,
                    'text': page.text,
                    'text_latin': latinize_text(page.text)
                }
            }
    elif document.type == Document.TYPE_TABULAR:
        for record in document.records:
            text = record.text
            latin = [latinize_text(t) for t in text]
            latin = [t for t in latin if t not in text]
            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': get_es_index(),
                '_parent': document.id,
                '_source': {
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'source_id': document.source_id,
                    'row_id': record.row_id,
                    'sheet': record.sheet,
                    'text': text,
                    'text_latin': latin,
                    'raw': record.data
                }
            }

def text_query_string(text, literal=False):
    if text is None or not len(text.strip()):
        return match_all()
    if literal:
        text = '"%s"' % latinize_text(text)
    return {
        'query_string': {
            'query': text,
            'fields': ['text^6', 'text_latin^2'],
            'default_operator': 'AND',
            'use_dis_max': True
        }
    }

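# Hedged usage sketch (assumption, not in the original module): the clause
# returned by text_query_string() forms the 'query' part of a search body
# against the record text fields; highlighting mirrors the two fields the
# clause queries. The helper name and field choices below are illustrative.
def search_text_sketch(text, literal=False):
    body = {
        'size': 10,
        'query': text_query_string(text, literal=literal),
        'highlight': {
            'fields': {
                'text': {},
                'text_latin': {}
            }
        }
    }
    return get_es().search(index=get_es_index(), doc_type=TYPE_RECORD,
                           body=body)
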
def meta_query_string(text, literal=False):
    if text is None or not len(text.strip()):
        return match_all()
    if literal:
        text = '"%s"' % latinize_text(text)
    return {
        "query_string": {
            "query": text,
            "fields": ['title^15', 'file_name', 'summary^10',
                       'title_latin^12', 'summary_latin^8'],
            "default_operator": "AND",
            "use_dis_max": True
        }
    }

def normalize(text):
    if not isinstance(text, six.string_types):
        return
    if six.PY2 and not isinstance(text, six.text_type):
        text = text.decode('utf-8')
    text = latinize_text(text.lower())
    text = unicodedata.normalize('NFKD', text)
    characters = []
    for character in text:
        category = unicodedata.category(character)[0]
        character = CATEGORIES.get(category, character)
        characters.append(character)
    text = u''.join(characters)
    return COLLAPSE.sub(WS, text).strip(WS)

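# Hedged usage sketch (assumption, not in the original module): normalize() is
# meant to be applied to both sides of a comparison so that differences in
# case, diacritics and spacing do not matter. CATEGORIES, COLLAPSE and WS are
# module-level constants not shown in this excerpt.
def texts_match(a, b):
    # Non-string inputs normalize to None and therefore never match.
    na, nb = normalize(a), normalize(b)
    return na is not None and na == nb
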
def generate_pages(document):
    for page in document.pages:
        tid = sha1(str(document.id))
        tid.update(str(page.id))
        tid = tid.hexdigest()
        yield {
            '_id': tid,
            '_type': TYPE_RECORD,
            '_index': get_es_index(),
            '_parent': document.id,
            '_source': {
                'type': 'page',
                'content_hash': document.content_hash,
                'document_id': document.id,
                'source_id': document.source_id,
                'page': page.number,
                'text': page.text,
                'text_latin': latinize_text(page.text)
            }
        }

def generate_records(document):
    for record in document.records:
        text = record.text
        latin = [latinize_text(t) for t in text]
        latin = [t for t in latin if t not in text]
        yield {
            '_id': record.tid,
            '_type': TYPE_RECORD,
            '_index': get_es_index(),
            '_parent': unicode(document.id),
            '_source': {
                'type': 'row',
                'content_hash': document.content_hash,
                'document_id': document.id,
                'source_id': document.source_id,
                'row_id': record.row_id,
                'sheet': record.sheet,
                'text': text,
                'text_latin': latin,
                'raw': record.data
            }
        }

def tabular_query(document_id, sheet, args):
    scored = False
    q = {
        'match_all': {}
    }
    text = args.get('q', '').strip()
    if len(text):
        scored = True
        text_latin = latinize_text(text)
        # Match either the original or the latinized form of the query text.
        q = {
            "bool": {
                "minimum_should_match": 1,
                "should": [
                    {
                        "match": {
                            "text": {
                                "query": text,
                                "cutoff_frequency": 0.0007,
                                "operator": "and"
                            }
                        }
                    },
                    {
                        "match": {
                            "text_latin": {
                                "query": text_latin,
                                "cutoff_frequency": 0.0007,
                                "operator": "and"
                            }
                        }
                    }
                ]
            }
        }
    try:
        rows = [int(r) for r in args.getlist('row')]
    except Exception:
        rows = []
    if len(rows):
        scored = True
        # Boost explicitly requested rows without excluding the others.
        q = {
            "bool": {
                "must": q,
                "should": {
                    "constant_score": {
                        "filter": {'terms': {'row_id': rows}},
                        "boost": 1000
                    }
                }
            }
        }
    q = add_filter(q, {'term': {'document_id': document_id}})
    q = add_filter(q, {'term': {'sheet': sheet}})
    sort = [{'row_id': 'asc'}]
    if scored:
        sort.insert(0, '_score')
    return {
        'from': 0,
        'size': 100,
        'query': q,
        'sort': sort,
        '_source': ['document_id', 'sheet', 'row_id', 'raw']
    }

def text_query(text):
    """Construct the part of a query which is responsible for finding a
    piece of text in the selected documents."""
    text = text.strip()
    text_latin = latinize_text(text)
    if len(text):
        q = {
            "bool": {
                "minimum_should_match": 1,
                "should": [
                    {
                        "multi_match": {
                            "query": text,
                            "fields": ['title^100', 'file_name^10',
                                       'summary^2'],
                            "type": "most_fields",
                            "cutoff_frequency": 0.0007,
                            "operator": "and",
                        }
                    },
                    {
                        "multi_match": {
                            "query": text_latin,
                            "fields": ['title_latin^100', 'summary_latin^2'],
                            "type": "most_fields",
                            "cutoff_frequency": 0.0007,
                            "operator": "and",
                        }
                    },
                    {
                        "multi_match": {
                            "query": text,
                            "fields": ['title^100', 'file_name^10',
                                       'summary^2'],
                            "type": "phrase"
                        }
                    },
                    {
                        "has_child": {
                            "type": TYPE_RECORD,
                            "score_mode": "avg",
                            "query": {
                                "bool": {
                                    "minimum_should_match": 1,
                                    "should": [
                                        {
                                            "match": {
                                                "text": {
                                                    "query": text,
                                                    "cutoff_frequency": 0.0007,
                                                    "operator": "and"
                                                }
                                            }
                                        },
                                        {
                                            "match": {
                                                "text_latin": {
                                                    "query": text_latin,
                                                    "cutoff_frequency": 0.0007,
                                                    "operator": "and"
                                                }
                                            }
                                        }
                                    ]
                                }
                            }
                        }
                    }
                ]
            }
        }
    else:
        q = {'match_all': {}}
    return q

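# Hedged usage sketch (assumption, not in the original module): the bool clause
# built by text_query() is assumed to be wrapped into a full document search
# body; the size and _source fields below are illustrative choices.
def search_documents_sketch(text):
    body = {
        'size': 30,
        'query': text_query(text),
        '_source': ['title', 'file_name', 'summary', 'source_id']
    }
    return get_es().search(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                           body=body)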