Code example #1
from unittest import mock

# mocked_boto3, mocked_auth and good_doc are pytest fixtures supplied by the
# test suite's conftest; ElasticsearchPlus is imported from the nesta project.
def test_near_duplicates_no_results(mocked_boto3, mocked_auth, good_doc):
    mocked_boto3.Session.return_value.get_credentials.return_value = mock.MagicMock()
    es = ElasticsearchPlus('dummy', aws_auth_region='blah')
    # With zero search hits, near_duplicates falls back to es.get and should
    # yield only the original document
    es.search = mock.MagicMock(return_value={'hits': {'total': 0}})
    es.get = mock.MagicMock(return_value=good_doc)
    hits = list(
        es.near_duplicates(index=None, doc_id=None, doc_type=None,
                           fields=None))
    assert hits == [good_doc]
Code example #2
from unittest import mock

# bad_doc is an additional fixture, presumably scoring below the
# near-duplicate threshold; the other fixtures are as in the previous example.
def test_near_duplicates_some_results(mocked_boto3, mocked_auth, good_doc,
                                      bad_doc):
    mocked_boto3.Session.return_value.get_credentials.return_value = mock.MagicMock()
    es = ElasticsearchPlus('dummy', aws_auth_region='blah')

    # Six top-scoring hits mixed with many low-scoring ones
    hits = [good_doc] * 6 + [bad_doc] * 231
    results = {
        'hits': {
            'total': len(hits),
            'max_score': good_doc['_score'],
            'hits': hits
        }
    }
    es.search = mock.MagicMock(return_value=results)
    hits = list(
        es.near_duplicates(index=None, doc_id=None, doc_type=None,
                           fields=None))
    assert len(hits) == 6  # excludes bad_doc
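
Taken together, the two tests pin down the contract for near_duplicates: with
no hits it falls back to the document itself, and with hits it keeps only
those scoring at or near max_score. A minimal sketch of that behaviour,
reconstructed purely from the tests (the placeholder query body, the
min_score_ratio parameter and the exact threshold are assumptions, not the
nesta implementation):

def near_duplicates_sketch(es, index, doc_id, doc_type, fields,
                           min_score_ratio=0.95):
    # Placeholder query; the real implementation builds one from `fields`
    results = es.search(index=index, doc_type=doc_type, body={})
    hits = results['hits']
    if hits['total'] == 0:
        # No candidates at all: yield the original document (example #1)
        yield es.get(index=index, doc_type=doc_type, id=doc_id)
        return
    for hit in hits['hits']:
        # Keep only hits scoring close to the best match (example #2)
        if hit['_score'] >= min_score_ratio * hits['max_score']:
            yield hit

Under this reading, the six good_doc hits (whose _score equals max_score)
survive the filter while all 231 bad_doc hits are dropped, matching the
assertion above.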
Code example #3
File: run.py  Project: hmessafi/nesta
import boto3
import json
import logging
import os
from collections import Counter

# ElasticsearchPlus, load_json_from_pathstub, extract_yearly_funds and
# get_value are helpers imported from elsewhere in the nesta project.


def run():

    # Fetch the input parameters
    s3_bucket = os.environ["BATCHPAR_bucket"]
    batch_file = os.environ["BATCHPAR_batch_file"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_new_index = os.environ['BATCHPAR_out_index']
    es_old_index = os.environ['BATCHPAR_in_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # Extract the article ids in this chunk
    s3 = boto3.resource('s3')
    ids_obj = s3.Object(s3_bucket, batch_file)
    art_ids = json.loads(ids_obj.get()['Body'].read())
    logging.info(f'Processing {len(art_ids)} article ids')

    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "health_scanner.json")
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           field_null_mapping=field_null_mapping,
                           send_get_body_as='POST')

    # Iterate over article IDs
    processed_ids = set()
    for _id in art_ids:
        if _id in processed_ids:  # To avoid duplicated effort
            continue

        # Collect all duplicated data together
        dupe_ids = {}  # For identifying the most recent dupe
        yearly_funds = []  # The new deduped collection of annual funds
        hits = {}
        for hit in es.near_duplicates(index=es_old_index,
                                      doc_id=_id,
                                      doc_type=es_type,
                                      fields=[
                                          "textBody_descriptive_project",
                                          "title_of_project",
                                          "textBody_abstract_project"
                                      ]):
            # Extract key values
            src = hit['_source']
            hit_id = hit['_id']
            # Record this hit
            processed_ids.add(hit_id)
            hits[hit_id] = src
            # Extract year and funding info
            yearly_funds += extract_yearly_funds(src)
            year = get_value(src, 'year_fiscal_funding')
            if year is not None:
                dupe_ids[hit_id] = year

        # Get the most recent instance of the duplicates: Counter treats the
        # {hit_id: year} mapping as counts, so most_common()[0] is the
        # (id, year) pair with the largest fiscal year
        final_id = sorted(hits.keys())[-1]  # default if years are all null
        if len(dupe_ids) > 0:  # implies years are not all null
            final_id, year = Counter(dupe_ids).most_common()[0]
        body = hits[final_id]
        processed_ids = processed_ids.union(set(dupe_ids))

        # Sort and sum the funding
        yearly_funds = sorted(yearly_funds, key=lambda row: row['year'])
        sum_funding = sum(row['cost_ref'] for row in yearly_funds
                          if row['cost_ref'] is not None)

        # Add funding info and commit to the new index
        body['json_funding_project'] = yearly_funds
        body['cost_total_project'] = sum_funding
        # Just in case: take the start date from the earliest funding row,
        # guarding against records with no funding info at all
        if yearly_funds:
            body['date_start_project'] = yearly_funds[0]['start_date']
        es.index(index=es_new_index, doc_type=es_type, id=final_id, body=body)

    logging.info(f'Processed {len(processed_ids)} ids')
    logging.info("Batch job complete.")