def hepcrawl_to_hep(crawler_record):
    """Convert a hepcrawl-formatted record to the HEP record format.

    Args:
        crawler_record (dict): dictionary representing the hepcrawl
            formatted record.

    Returns:
        dict: The hep formatted record.

    Raises:
        KeyError: if a required field (e.g. ``acquisition_source`` or a
            document's ``key``/``url``) is missing from the input.
    """
    def _filter_affiliation(affiliations):
        # Keep only non-empty affiliation values; drops entries whose
        # 'value' is missing, None or empty.
        return [
            affiliation.get('value')
            for affiliation in affiliations
            if affiliation.get('value')
        ]

    builder = LiteratureBuilder(
        source=crawler_record['acquisition_source']['source']
    )

    for author in crawler_record.get('authors', []):
        builder.add_author(builder.make_author(
            full_name=author['full_name'],
            raw_affiliations=_filter_affiliation(author['affiliations']),
        ))

    for title in crawler_record.get('titles', []):
        builder.add_title(
            title=title.get('title'),
            subtitle=title.get('subtitle'),
            source=title.get('source'),
        )

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(
            abstract=abstract.get('value'),
            source=abstract.get('source'),
        )

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories'),
        )

    for doi in crawler_record.get('dois', []):
        builder.add_doi(
            doi=doi.get('value'),
            material=doi.get('material'),
        )

    # NOTE(review): unlike the other loops, this passes the whole dict
    # rather than its 'value' — confirm against LiteratureBuilder's API.
    for private_note in crawler_record.get('private_notes', []):
        builder.add_private_note(
            private_notes=private_note
        )

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(
            public_note=public_note.get('value'),
            source=public_note.get('source'),
        )

    # 'license_' avoids shadowing the 'license' builtin.
    for license_ in crawler_record.get('license', []):
        builder.add_license(
            url=license_.get('url'),
            license=license_.get('license'),
            material=license_.get('material'),
        )

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(
            collaboration=collaboration.get('value')
        )

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(
            imprint_date=imprint.get('date')
        )

    # 'copyright_' avoids shadowing the 'copyright' builtin.
    for copyright_ in crawler_record.get('copyright', []):
        builder.add_copyright(
            holder=copyright_.get('holder'),
            material=copyright_.get('material'),
            statement=copyright_.get('statement'),
        )

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date')
    )

    # Required keys here: the record is expected to always carry a full
    # acquisition_source (its 'source' was already required above).
    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method=acquisition_source['method'],
        date=acquisition_source['datetime'],
        source=acquisition_source['source'],
        submission_number=acquisition_source['submission_number'],
    )

    # page_nr is a list of stringly-typed counts; silently skip when the
    # field is absent, empty, or not parseable as an int.
    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0])
        )
    except (TypeError, ValueError, IndexError):
        pass

    publication_types = [
        'introductory',
        'lectures',
        'review',
        'manual',
    ]

    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    # Map legacy 'collections' markers onto flags, publication types and
    # document types; default the document type to 'article' if no
    # collection supplied one.
    added_doc_type = False
    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()
        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    if not added_doc_type:
        builder.add_document_type('article')

    # Only the first publication_info entry is converted.
    _pub_info = crawler_record.get('publication_info', [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
        material=_pub_info.get('pubinfo_material'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('value'),
            source=report_number.get('source'),
        )

    for url in crawler_record.get('urls', []):
        builder.add_url(url=url.get('value'))

    for document in crawler_record.get('documents', []):
        builder.add_document(
            description=document.get('description'),
            fulltext=document.get('fulltext'),
            hidden=document.get('hidden'),
            key=document['key'],
            material=document.get('material'),
            original_url=document.get('original_url'),
            url=document['url'],
        )

    return builder.record
def crawler2hep(crawler_record):
    """Convert a hepcrawl record to HEP format (legacy converter).

    Older variant of the hepcrawl-to-HEP conversion: it hard-codes the
    acquisition method/source as ``'hepcrawl'``, supports special
    collections, and validates the record before returning it.

    Args:
        crawler_record (dict): dictionary representing the hepcrawl
            formatted record.

    Returns:
        dict: The hep formatted record.
    """
    def _filter_affiliation(affiliations):
        # Keep only non-empty affiliation values; drops entries whose
        # 'value' is missing, None or empty.
        return [
            affiliation.get('value')
            for affiliation in affiliations
            if affiliation.get('value')
        ]

    builder = LiteratureBuilder('hepcrawl')

    for author in crawler_record.get('authors', []):
        builder.add_author(builder.make_author(
            author['full_name'],
            affiliations=_filter_affiliation(author['affiliations']),
        ))

    for title in crawler_record.get('titles', []):
        builder.add_title(
            title=title.get('title'),
            source=title.get('source'),
        )

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(
            abstract=abstract.get('value'),
            source=abstract.get('source'),
        )

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories'),
        )

    for doi in crawler_record.get('dois', []):
        builder.add_doi(doi=doi.get('value'))

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(
            public_note=public_note.get('value'),
            source=public_note.get('source'),
        )

    # 'license_' avoids shadowing the 'license' builtin.
    for license_ in crawler_record.get('license', []):
        builder.add_license(
            url=license_.get('url'),
            license=license_.get('license'),
        )

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(collaboration=collaboration.get('value'))

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(imprint_date=imprint.get('date'))

    # 'copyright_' avoids shadowing the 'copyright' builtin.
    for copyright_ in crawler_record.get('copyright', []):
        builder.add_copyright(
            holder=copyright_.get('holder'),
            material=copyright_.get('material'),
            statement=copyright_.get('statement'),
        )

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date'))

    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method='hepcrawl',
        date=acquisition_source.get('date'),
        source=acquisition_source.get('source'),
        submission_number=acquisition_source.get('submission_number'))

    # page_nr is a list of stringly-typed counts; silently skip when the
    # field is absent, empty, or not parseable as an int.
    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0]))
    except (TypeError, ValueError, IndexError):
        pass

    publication_types = [
        'introductory',
        'lectures',
        'review',
    ]

    special_collections = [
        'cdf-internal-note',
        'cdf-note',
        'cds',
        'd0-internal-note',
        'd0-preliminary-note',
        'h1-internal-note',
        'h1-preliminary-note',
        'halhidden',
        'hephidden',
        'hermes-internal-note',
        'larsoft-internal-note',
        'larsoft-note',
        'zeus-internal-note',
        'zeus-preliminary-note',
    ]

    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    # Map legacy 'collections' markers onto flags, publication types,
    # special collections and document types; default the document type
    # to 'article' if no collection supplied one.
    added_doc_type = False
    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()
        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection in special_collections:
            builder.add_special_collection(collection.upper())
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    if not added_doc_type:
        builder.add_document_type('article')

    # Only the first publication_info entry is converted.
    _pub_info = crawler_record.get('publication_info', [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('value'),
            source=report_number.get('source'),
        )

    builder.validate_record()

    return builder.record