Example #1
def test_near_duplicates_no_results(mocked_boto3, mocked_auth, good_doc):
    # Stub out the AWS credential lookup so no real session is created
    mocked_boto3.Session.return_value.get_credentials.return_value = \
        mock.MagicMock()
    es = ElasticsearchPlus('dummy', aws_auth_region='blah')
    # The search returns no near-duplicate hits...
    es.search = mock.MagicMock(return_value={'hits': {'total': 0}})
    es.get = mock.MagicMock(return_value=good_doc)
    # ...so only the document itself should be yielded
    hits = list(es.near_duplicates(index=None, doc_id=None,
                                   doc_type=None, fields=None))
    assert hits == [good_doc]
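The fixtures in the test signature (mocked_boto3, mocked_auth, good_doc) are supplied by pytest and are not shown above. The sketch below is one plausible conftest.py; the patch targets and the document shape are illustrative assumptions, not the project's actual fixtures.

# A plausible conftest.py for the test above. The patch targets and the
# document shape are illustrative assumptions, not the project's code.
import pytest
from unittest import mock


@pytest.fixture
def mocked_boto3():
    # Patch boto3 in the module under test so no AWS session is created
    with mock.patch('nesta.core.luigihacks.elasticsearchplus.boto3') as m:
        yield m


@pytest.fixture
def mocked_auth():
    # Patch the request signer so no real credentials are needed
    with mock.patch('nesta.core.luigihacks.elasticsearchplus.AWS4Auth') as m:
        yield m


@pytest.fixture
def good_doc():
    # A stand-in for whatever ElasticsearchPlus.get would return
    return {'_id': '123', '_source': {'body': 'some text'}}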
Example #2
def run():
    s3_bucket = os.environ["BATCHPAR_bucket"]
    batch_file = os.environ["BATCHPAR_batch_file"]
    count = int(os.environ['BATCHPAR_count'])
    es_index = os.environ['BATCHPAR_index']
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]
    fields = literal_eval(os.environ["BATCHPAR_fields"])
    score_field = os.environ["BATCHPAR_score_field"]
    test = literal_eval(os.environ["BATCHPAR_test"])

    # Extract all document ids in this chunk
    s3 = boto3.resource('s3')
    ids_obj = s3.Object(s3_bucket, batch_file)
    logging.info('Getting document ids...')
    all_doc_ids = json.loads(ids_obj.get()['Body'].read())
    logging.info(f'Got {len(all_doc_ids)} document ids')

    # Set up Elasticsearch
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           do_sort=False)

    min_match = 0.05 if test else 0.3  # looser matching in test mode
    for doc_id in all_doc_ids:
        # Fetch the existing document from the index
        existing = es.get(es_index, doc_type=es_type, id=doc_id)['_source']
        # Get the score
        score = None
        if any(f in existing for f in fields):
            score = lolvelty(es,
                             es_index,
                             doc_id,
                             fields,
                             total=count,
                             minimum_should_match=min_match)
        # Merge existing info into new doc
        doc = {**existing}
        doc[score_field] = score
        es.index(index=es_index, doc_type=es_type, id=doc_id, body=doc)
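Example #2 is an AWS Batch entry point: every parameter arrives through a BATCHPAR_* environment variable, and fields/test are parsed with literal_eval. A minimal local harness might look like the sketch below; every value is a placeholder, and the AWSBATCHTEST variable matches the no_commit check in the code above.

# Hypothetical local harness for Example #2. Every value is a
# placeholder; nothing here points at real infrastructure.
import os

os.environ.update({
    'BATCHPAR_bucket': 'my-bucket',              # S3 bucket holding id chunks
    'BATCHPAR_batch_file': 'ids-chunk-0.json',   # JSON list of document ids
    'BATCHPAR_count': '1000',                    # total documents in the index
    'BATCHPAR_index': 'my_index',
    'BATCHPAR_outinfo': 'localhost',             # Elasticsearch host
    'BATCHPAR_out_port': '9200',
    'BATCHPAR_out_type': '_doc',
    'BATCHPAR_entity_type': 'project',
    'BATCHPAR_aws_auth_region': 'eu-west-2',
    'BATCHPAR_fields': "['textBody_abstract_project']",  # literal_eval'd list
    'BATCHPAR_score_field': 'metric_novelty_project',    # placeholder name
    'BATCHPAR_test': 'True',                             # literal_eval'd bool
})
# Setting this makes ElasticsearchPlus run with no_commit=True (see above)
os.environ['AWSBATCHTEST'] = '1'

run()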
Example #3
def run():
    bucket = os.environ["BATCHPAR_s3_bucket"]
    abstract_file = os.environ["BATCHPAR_s3_key"]
    dupe_file = os.environ["BATCHPAR_dupe_file"]
    es_config = literal_eval(os.environ["BATCHPAR_outinfo"])
    db = os.environ["BATCHPAR_db"]
    entity_type = os.environ["BATCHPAR_entity_type"]

    # mysql setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db)
    Session = sessionmaker(bind=engine)
    session = Session()

    # retrieve a batch of meshed terms
    mesh_terms = retrieve_mesh_terms(bucket, abstract_file)
    mesh_terms = format_mesh_terms(mesh_terms)
    logging.info(f'batch {abstract_file} contains '
                 f'{len(mesh_terms)} meshed abstracts')

    # retrieve duplicate map
    dupes = retrieve_duplicate_map(bucket, dupe_file)
    dupes = format_duplicate_map(dupes)

    # Set up elastic search connection
    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "health_scanner.json")
    es = ElasticsearchPlus(hosts=es_config['host'],
                           port=es_config['port'],
                           aws_auth_region=es_config['region'],
                           use_ssl=True,
                           entity_type=entity_type,
                           strans_kwargs=None,
                           field_null_mapping=field_null_mapping,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           country_detection=True,
                           listify_terms=True)
    all_es_ids = get_es_ids(es, es_config)

    docs = []
    for doc_id, terms in mesh_terms.items():
        if doc_id not in all_es_ids:
            continue
        try:
            _filter = Abstracts.application_id == doc_id
            abstract = session.query(Abstracts).filter(_filter).one()
        except NoResultFound:
            logging.warning(f'{doc_id} not found in database')
            raise NoResultFound(doc_id)
        clean_abstract_text = clean_abstract(abstract.abstract_text)
        docs.append({
            'doc_id': doc_id,
            'terms_mesh_abstract': terms,
            'textBody_abstract_project': clean_abstract_text
        })
        duped_docs = dupes.get(doc_id, [])
        if duped_docs:
            logging.info(f'Found {len(duped_docs)} duplicates')
        for duped_doc in duped_docs:
            docs.append({
                'doc_id': duped_doc,
                'terms_mesh_abstract': terms,
                'textBody_abstract_project': clean_abstract_text,
                'booleanFlag_duplicate_abstract': True
            })

    # output to elasticsearch
    logging.warning(f'Writing {len(docs)} documents to elasticsearch')
    for doc in docs:
        uid = doc.pop("doc_id")
        # Extract existing info
        existing = es.get(es_config['index'],
                          doc_type=es_config['type'],
                          id=uid)['_source']
        # Merge existing info into new doc
        doc = {**existing, **doc}
        es.index(index=es_config['index'],
                 doc_type=es_config['type'],
                 id=uid,
                 body=doc)
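Example #3 first narrows the MeSH batch to ids that actually exist in the index via get_es_ids, a project helper whose implementation is not shown here. For the same effect without it, the standard scroll helper from the elasticsearch client would do; the sketch below illustrates the idea and is an assumption, not the helper's actual code.

# A rough stand-in for get_es_ids, built on the standard scroll helper.
# This is an illustrative assumption, not the project's implementation.
from elasticsearch.helpers import scan


def get_all_es_ids(es, index):
    """Collect every document id in an index via the scroll API."""
    return {hit['_id']
            for hit in scan(es,
                            index=index,
                            query={'query': {'match_all': {}}},
                            _source=False)}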