def run(delete, load, search):
    """Drive an ElasticRetriever for the 'contracts' dataset.

    Args:
        delete: When truthy, delete the dataset with id 'contracts'.
        load: When truthy, build the index from 'contracts.parquet'.
        search: Query string; the search is skipped when it equals ''.

    The steps run in a fixed order: load, then delete, then search.
    The search result is not returned or printed.
    """
    retriever = ElasticRetriever()

    if load:
        retriever.build_index('contracts.parquet')

    if delete:
        retriever.delete(dataset_id='contracts')

    # Nothing to query for: stop here.
    if search == '':
        return

    retriever.search(search)
def run(delete, load, search, entity_search, cls, host):
    """Drive an ElasticRetriever bound to a single host for 'contracts'.

    Args:
        delete: When truthy, delete the dataset with id 'contracts'.
        load: When truthy, build the index from 'contracts.parquet'.
        search: Query string; the search is skipped when it equals ''.
        entity_search: Forwarded unchanged to ``ElasticRetriever.search``.
        cls: Forwarded unchanged to ``ElasticRetriever.search``.
        host: Host entry used as the only element of the ``hosts`` list.

    The steps run in a fixed order: load, then delete, then search.
    When a search is performed, its result is printed to stdout.
    """
    retriever = ElasticRetriever(hosts=[host])

    if load:
        retriever.build_index('contracts.parquet')

    if delete:
        retriever.delete(dataset_id='contracts')

    # Nothing to query for: stop here.
    if search == '':
        return

    # Only a single document is requested (ndocs=1).
    hits = retriever.search(
        search,
        entity_search=entity_search,
        cls=cls,
        ndocs=1,
    )
    print(hits)
def create_app():
    """Create and configure the Flask application.

    The app gets two Elasticsearch-backed retrievers (address taken from the
    ``ELASTIC_ADDRESS`` environment variable), an optional fastText model
    loaded from '/data/vecs.bin', and the retrieval and embeddings
    blueprints. The retrieval blueprint is mounted at several URL prefixes.

    Environment variables read:
        ELASTIC_ADDRESS: required; a missing value raises ``KeyError``.
        PREFIX: optional; prepended to the versioned URL prefix.
        API_VERSION: optional; defaults to 'v2_beta'.

    Returns:
        The configured Flask application.
    """
    app = Flask(__name__, instance_relative_config=True)
    app.config['JSON_SORT_KEYS'] = False
    # NOTE(review): development secret key and debug mode are hard-coded.
    app.config.from_mapping(SECRET_KEY='dev')
    app.debug = True

    # Best effort: any OSError while creating the instance folder is ignored.
    try:
        os.makedirs(app.instance_path)
    except OSError:
        pass

    elastic_address = os.environ['ELASTIC_ADDRESS']
    app.retriever = ElasticRetriever(elastic_address)
    app.page_retriever = ElasticPageRetriever(elastic_address)

    # The embeddings model is optional: on failure the error is logged and
    # ``app.word_embeddings_model`` is simply left unset.
    try:
        app.word_embeddings_model = fasttext.load_model('/data/vecs.bin')
    except Exception as err:
        logger.error(f'{err}')

    from . import retrieval
    app.register_blueprint(retrieval.bp)

    # hack to get url prefixes registered as required/desired IAR - 30.Oct.2020
    prefix = os.environ.get('PREFIX')
    if prefix is None:
        logging.info("No prefix stripped.")
        prefix = ''
    else:
        logging.info(f"Stripping {prefix}")

    api_version = os.environ.get('API_VERSION', 'v2_beta')

    # Versioned mount first, then the two legacy mounts kept for backward
    # compatibility; the registration order matches the listing order.
    retrieval_prefixes = (
        f"{prefix}/{api_version}",
        '/sets/xdd-covid-19/api',
        f'/sets/xdd-covid-19/api/{api_version}',
    )
    for url_prefix in retrieval_prefixes:
        app.register_blueprint(retrieval.bp, url_prefix=url_prefix)

    # The extraction blueprint is currently disabled:
    # from . import extraction
    # app.register_blueprint(extraction.bp)

    from . import embeddings
    app.register_blueprint(embeddings.bp)

    # NOTE(review): the URL map is logged at ERROR level — confirm intended.
    logger.error(app.url_map)
    CORS(app)
    return app
def run(dataset_id, aws_host, host):
    """Delete a dataset through an ElasticRetriever.

    Args:
        dataset_id: Identifier passed to ``ElasticRetriever.delete``.
        aws_host: Hostname of an AWS-hosted cluster. When it is not the
            empty string, the retriever connects to it on port 443 with
            AWS4Auth built from the ``AWS_ACCESS_KEY_ID``,
            ``AWS_SECRET_ACCESS_KEY``, ``AWS_DEFAULT_REGION`` and
            ``AWS_SESSION_TOKEN`` environment variables.
        host: Host entry used when ``aws_host`` is the empty string.

    Progress is reported on stdout.
    """
    if aws_host != '':
        auth = AWS4Auth(
            os.environ.get('AWS_ACCESS_KEY_ID'),
            os.environ.get('AWS_SECRET_ACCESS_KEY'),
            os.environ.get('AWS_DEFAULT_REGION'),
            'es',
            session_token=os.environ.get('AWS_SESSION_TOKEN'),
        )
        ret = ElasticRetriever(
            hosts=[{'host': aws_host, 'port': 443}],
            awsauth=auth,
        )
    else:
        ret = ElasticRetriever(hosts=[host])

    # The previous message claimed "building indices", but this function
    # only deletes; the text now matches what actually happens next.
    print('Connected to retriever, deleting index')
    ret.delete(dataset_id)
    print('Done deleting index')
def run(sections_parquet, documents_parquet, tables_parquet, figures_parquet,
        equations_parquet, aws_host, host):
    """Build the retriever index from the given parquet inputs.

    Args:
        sections_parquet: Forwarded to ``build_index`` (second argument).
        documents_parquet: Forwarded to ``build_index`` (first argument).
        tables_parquet: Forwarded to ``build_index`` (third argument).
        figures_parquet: Forwarded to ``build_index`` (fourth argument).
        equations_parquet: Forwarded to ``build_index`` (fifth argument).
        aws_host: Hostname of an AWS-hosted cluster. When it is not the
            empty string, the retriever connects to it on port 443 with
            AWS4Auth built from the ``AWS_*`` environment variables.
        host: Host entry used when ``aws_host`` is the empty string.

    Note that ``build_index`` receives the documents file before the
    sections file, which differs from this function's parameter order.
    Progress is reported on stdout.
    """
    if aws_host == '':
        retriever = ElasticRetriever(hosts=[host])
    else:
        env = os.environ
        aws_auth = AWS4Auth(
            env.get('AWS_ACCESS_KEY_ID'),
            env.get('AWS_SECRET_ACCESS_KEY'),
            env.get('AWS_DEFAULT_REGION'),
            'es',
            session_token=env.get('AWS_SESSION_TOKEN'),
        )
        aws_endpoint = {'host': aws_host, 'port': 443}
        retriever = ElasticRetriever(hosts=[aws_endpoint], awsauth=aws_auth)

    print('Connected to retriever, building indices')
    retriever.build_index(
        documents_parquet,
        sections_parquet,
        tables_parquet,
        figures_parquet,
        equations_parquet,
    )
    print('Done building index')
def __init__(self, client, hosts=None):
    """Set up the Elasticsearch retriever and the reranker.

    Args:
        client: Passed unchanged to ``BertRerankingRetriever``.
        hosts: Host list passed to ``ElasticRetriever``. When omitted (or
            ``None``), a one-element list holding the ``ELASTIC_ADDRESS``
            environment variable is used.

    Raises:
        KeyError: If ``hosts`` is not given and ``ELASTIC_ADDRESS`` is not
            set in the environment.
    """
    # The default is resolved at call time. A list literal in the signature
    # would read the environment when the function is defined (failing the
    # import if the variable is missing) and would be shared by all calls.
    if hosts is None:
        hosts = [os.environ["ELASTIC_ADDRESS"]]
    self.elastic_retriever = ElasticRetriever(hosts)
    self.reranker = BertRerankingRetriever(client)
class ElasticRerankingRetriever(Retriever):
    """Retriever that queries Elasticsearch and reranks the hits.

    Candidate contexts come from an ``ElasticRetriever``; ordering is
    delegated to a ``BertRerankingRetriever``.
    """

    def __init__(self, client, hosts=None):
        """Set up the Elasticsearch retriever and the reranker.

        Args:
            client: Passed unchanged to ``BertRerankingRetriever``.
            hosts: Host list passed to ``ElasticRetriever``. When omitted
                (or ``None``), a one-element list holding the
                ``ELASTIC_ADDRESS`` environment variable is used.

        Raises:
            KeyError: If ``hosts`` is not given and ``ELASTIC_ADDRESS`` is
                not set in the environment.
        """
        # The default is resolved at call time. A list literal in the
        # signature would read the environment when the class is defined
        # (failing the import if the variable is missing) and would be
        # shared by every instance.
        if hosts is None:
            hosts = [os.environ["ELASTIC_ADDRESS"]]
        self.elastic_retriever = ElasticRetriever(hosts)
        self.reranker = BertRerankingRetriever(client)

    def search(self, query, ndocs=10, page=0, cls=None, detect_min=None,
               postprocess_min=None, return_all=False, get_count=False):
        """Search Elasticsearch, rerank the hits, and format the results.

        Args:
            query: Query passed to both the retriever and the reranker.
            ndocs, page, cls, detect_min, postprocess_min: Forwarded
                unchanged to ``ElasticRetriever.search``.
            return_all: When truthy, return the reranker output as is.
            get_count: When truthy, skip reranking and return only the
                number of distinct ``'pdf_name'`` values among the hits.

        Returns:
            An ``int`` when ``get_count`` is truthy; the raw reranker
            output when ``return_all`` is truthy; otherwise a list with one
            dict per distinct ``'docname'`` (first occurrence in reranked
            order), each wrapping the object fetched for that result's id.
        """
        # NOTE(review): progress message logged at ERROR level, unlike the
        # INFO messages below — confirm this is intended.
        logger.error('Starting search.')
        contexts = self.elastic_retriever.search(
            query,
            ndocs=ndocs,
            page=page,
            cls=cls,
            detect_min=detect_min,
            postprocess_min=postprocess_min,
        )

        if get_count:
            return len({context['pdf_name'] for context in contexts})

        logger.info('Starting reranking')
        results = self.rerank(query, contexts)
        logger.info('Finished reranking')
        if return_all:
            return results

        # Keep only the first (best-ranked) result of every document.
        seen_docs = set()
        object_ids = []
        for result in results:
            docname = result['docname']
            if docname in seen_docs:
                continue
            seen_docs.add(docname)
            object_ids.append(result['id'])

        objects = [self.elastic_retriever.get_object(i) for i in object_ids]
        return [
            {
                'header': {},
                'pdf_name': obj.pdf_name,
                'children': [{
                    'id': obj.meta.id,
                    'bytes': obj.img_pth,
                    'cls': obj.cls,
                    'postprocessing_confidence': obj.postprocess_score,
                    'base_confidence': obj.detect_score,
                    'content': obj.content,
                    'header_content': obj.header_content,
                }],
                'context_keywords': '',
                'context_summary': '',
                'context_content': '',
                'context_id': obj.meta.id,
            }
            for obj in objects
        ]

    def rerank(self, query, contexts):
        """Return ``contexts`` reranked for ``query`` by the reranker."""
        return self.reranker.rerank(query, contexts)

    def build_index(self, document_parquet, entities_parquet, section_parquet,
                    tables_parquet, figures_parquet, equations_parquet):
        """Build the index by delegating to the Elasticsearch retriever.

        All six arguments are forwarded positionally, in the same order.
        """
        self.elastic_retriever.build_index(
            document_parquet,
            entities_parquet,
            section_parquet,
            tables_parquet,
            figures_parquet,
            equations_parquet,
        )

    def delete(self, dataset_id):
        """Delete ``dataset_id`` via the Elasticsearch retriever."""
        self.elastic_retriever.delete(dataset_id)