def push_keywords(docs):
    """Add keywords from datatables to the corresponding publication record.

    :param docs: list of document dicts; a datatable is identified by the
        presence of a 'related_publication' key, everything else is treated
        as a publication.
    :return: the publications (each augmented with a 'data_keywords'
        mapping of keyword name -> de-duplicated values) followed by the
        datatables.
    :raises ValueError: if docs contains neither publications nor datatables.
    """
    from collections import defaultdict

    from hepdata.utils.miscellanous import splitter

    datatables, publications = splitter(
        docs, lambda d: 'related_publication' in d)

    if not publications and not datatables:
        raise ValueError("Documents provided are not appropriate "
                         "for pushing keywords")

    # Check the related publication field and aggregate keywords per paper.
    for pub in publications:
        # Flatten the keywords of every table pointing at this publication.
        # (The original reduce(acc + d['keywords']) was O(n^2) and relied on
        # the Python 2 builtin `reduce`, which is a NameError on Python 3.)
        keywords = [
            kw
            for table in datatables
            if table['related_publication'] == pub['recid']
            for kw in table['keywords']
        ]

        agg_keywords = defaultdict(list)
        for kw in keywords:
            agg_keywords[kw['name']].append(kw['value'])

        # Remove duplicate values for each keyword name.
        for name, values in agg_keywords.items():
            agg_keywords[name] = list(set(values))

        pub['data_keywords'] = agg_keywords

    return publications + datatables
def test_utils():
    """splitter() partitions docs into datatables and publications."""
    docs = [{'id': 1, 'type': 'publication'}]
    docs += [{'related_publication': 1, 'id': n, 'type': 'data_table'}
             for n in range(2, 9)]

    datatables, publications = splitter(
        docs, lambda doc: 'related_publication' in doc)

    assert publications[0]['id'] == 1
    assert publications[0]['type'] == 'publication'
    assert datatables[0]['type'] == 'data_table'
def map_result(es_result):
    """Map a raw Elasticsearch result into the API response structure.

    :param es_result: dict with "hits", "total" and "aggregations" keys.
    :return: dict with mapped "results", parsed "facets" and the "total"
        hit count.
    """
    hits = es_result["hits"]
    total_hits = es_result["total"]
    aggregations = es_result["aggregations"]

    # Separate datatable hits from publication hits.
    tables, papers = splitter(hits, is_datatable)
    fetch_remaining_papers(tables, papers)
    aggregated = match_tables_to_papers(tables, papers)

    results = []
    for paper, datatables in aggregated:
        mapped_hit = get_basic_record_information(paper)
        # Materialise into a list: on Python 3 map() returns a lazy
        # iterator, so len(data) below would raise TypeError and the
        # serialised "data" field would be exhausted/empty.
        data = [get_basic_record_information(table) for table in datatables]
        mapped_hit.update({"data": data, "total_tables": len(data)})
        results.append(mapped_hit)

    facets = parse_aggregations(aggregations)
    return {"results": results, "facets": facets, "total": total_hits}
def test_utils():
    """Check that splitter separates data tables from publications."""
    publication_doc = {'id': 1, 'type': 'publication'}
    table_docs = []
    for n in range(2, 9):
        table_docs.append(
            {'related_publication': 1, 'id': n, 'type': 'data_table'})
    docs = [publication_doc] + table_docs

    datatables, publications = splitter(
        docs, lambda d: 'related_publication' in d)

    assert publications[0]['id'] == 1
    assert publications[0]['type'] == 'publication'
    assert datatables[0]['type'] == 'data_table'
def map_result(es_result):
    """Map a raw Elasticsearch result into the API response structure.

    :param es_result: dict with 'hits', 'total' and 'aggregations' keys.
    :return: dict with mapped 'results', parsed 'facets' and the 'total'
        hit count.
    """
    hits = es_result['hits']
    total_hits = es_result['total']
    aggregations = es_result['aggregations']

    # Separate datatable hits from publication hits.
    tables, papers = splitter(hits, is_datatable)
    fetch_remaining_papers(tables, papers)
    aggregated = match_tables_to_papers(tables, papers)

    results = []
    for paper, datatables in aggregated:
        mapped_hit = get_basic_record_information(paper)
        # Materialise into a list: on Python 3 map() returns a lazy
        # iterator, so len(data) below would raise TypeError and the
        # serialised 'data' field would be exhausted/empty.
        data = [get_basic_record_information(table) for table in datatables]
        mapped_hit.update({
            'data': data,
            'total_tables': len(data),
        })
        results.append(mapped_hit)

    facets = parse_aggregations(aggregations)
    return {'results': results, 'facets': facets, 'total': total_hits}