def test_near_duplicates_no_results(mocked_boto3, mocked_auth, good_doc):
    mocked_boto3.Session.return_value.get_credentials.return_value = mock.MagicMock()
    es = ElasticsearchPlus('dummy', aws_auth_region='blah')
    # With no search hits, near_duplicates should fall back to fetching
    # the document itself and yield only that
    es.search = mock.MagicMock(return_value={'hits': {'total': 0}})
    es.get = mock.MagicMock(return_value=good_doc)
    hits = list(es.near_duplicates(index=None, doc_id=None,
                                   doc_type=None, fields=None))
    assert hits == [good_doc]
def test_near_duplicates_some_results(mocked_boto3, mocked_auth,
                                       good_doc, bad_doc):
    mocked_boto3.Session.return_value.get_credentials.return_value = mock.MagicMock()
    es = ElasticsearchPlus('dummy', aws_auth_region='blah')
    hits = [good_doc]*6 + [bad_doc]*231
    results = {'hits': {'total': len(hits),
                        'max_score': good_doc['_score'],
                        'hits': hits}}
    es.search = mock.MagicMock(return_value=results)
    hits = list(es.near_duplicates(index=None, doc_id=None,
                                   doc_type=None, fields=None))
    assert len(hits) == 6  # excludes bad_doc
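# Sketch only (an assumption inferred from the two tests above, not the
# library's implementation): near_duplicates appears to fall back to a plain
# `get` when the search returns no hits, and otherwise to keep only hits
# scoring close to max_score, which is why 6 of the 237 mocked hits survive.
# The helper name and threshold below are hypothetical illustrations.
def _keep_top_scoring_hits(results, threshold=0.95):
    """Yield hits whose _score is within `threshold` of the best hit."""
    max_score = results['hits']['max_score']
    for hit in results['hits']['hits']:
        if hit['_score'] >= threshold * max_score:
            yield hit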
def run():
    # Fetch the input parameters
    s3_bucket = os.environ["BATCHPAR_bucket"]
    batch_file = os.environ["BATCHPAR_batch_file"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_new_index = os.environ['BATCHPAR_out_index']
    es_old_index = os.environ['BATCHPAR_in_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # Extract the article ids in this chunk
    s3 = boto3.resource('s3')
    ids_obj = s3.Object(s3_bucket, batch_file)
    art_ids = json.loads(ids_obj.get()['Body'].read())
    logging.info(f'Processing {len(art_ids)} article ids')

    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "health_scanner.json")
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           field_null_mapping=field_null_mapping,
                           send_get_body_as='POST')

    # Iterate over article IDs
    processed_ids = set()
    for _id in art_ids:
        if _id in processed_ids:  # To avoid duplicated effort
            continue

        # Collect all duplicated data together
        dupe_ids = {}  # For identifying the most recent dupe
        yearly_funds = []  # The new deduped collection of annual funds
        hits = {}
        for hit in es.near_duplicates(index=es_old_index,
                                      doc_id=_id,
                                      doc_type=es_type,
                                      fields=["textBody_descriptive_project",
                                              "title_of_project",
                                              "textBody_abstract_project"]):
            # Extract key values
            src = hit['_source']
            hit_id = hit['_id']
            # Record this hit
            processed_ids.add(hit_id)
            hits[hit_id] = src
            # Extract year and funding info
            yearly_funds += extract_yearly_funds(src)
            year = get_value(src, 'year_fiscal_funding')
            if year is not None:
                dupe_ids[hit_id] = year

        # Get the most recent instance of the duplicates
        final_id = sorted(hits.keys())[-1]  # default if years are all null
        if len(dupe_ids) > 0:  # implies years are not all null
            # Counter treats the year values as counts, so most_common()[0]
            # picks the duplicate with the latest fiscal year
            final_id, year = Counter(dupe_ids).most_common()[0]
        body = hits[final_id]
        processed_ids = processed_ids.union(set(dupe_ids))

        # Sort and sum the funding
        yearly_funds = sorted(yearly_funds, key=lambda row: row['year'])
        sum_funding = sum(row['cost_ref'] for row in yearly_funds
                          if row['cost_ref'] is not None)

        # Add funding info and commit to the new index
        body['json_funding_project'] = yearly_funds
        body['cost_total_project'] = sum_funding
        if yearly_funds:  # just in case there is no funding info at all
            body['date_start_project'] = yearly_funds[0]['start_date']
        es.index(index=es_new_index, doc_type=es_type,
                 id=final_id, body=body)

    logging.info(f'Processed {len(processed_ids)} ids')
    logging.info("Batch job complete.")
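# Local smoke-test sketch: run() is driven entirely by BATCHPAR_* environment
# variables, so it can be exercised outside AWS Batch by exporting them first.
# The values below are placeholders, not real infrastructure; setting
# AWSBATCHTEST keeps ElasticsearchPlus in no_commit mode so nothing is indexed.
if __name__ == "__main__":
    os.environ.setdefault("BATCHPAR_bucket", "my-test-bucket")
    os.environ.setdefault("BATCHPAR_batch_file", "ids-chunk-0.json")
    os.environ.setdefault("BATCHPAR_outinfo", "localhost")
    os.environ.setdefault("BATCHPAR_out_port", "9200")
    os.environ.setdefault("BATCHPAR_out_index", "projects_new")
    os.environ.setdefault("BATCHPAR_in_index", "projects_old")
    os.environ.setdefault("BATCHPAR_out_type", "_doc")
    os.environ.setdefault("BATCHPAR_entity_type", "project")
    os.environ.setdefault("BATCHPAR_aws_auth_region", "eu-west-2")
    os.environ.setdefault("AWSBATCHTEST", "1")
    logging.basicConfig(level=logging.INFO)
    run()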