def to_hep(self, source):
    """Convert this item's record into a hep formatted record.

    The record is first stamped with a fresh ``acquisition_source`` entry
    (method ``'hepcrawl'``, the current timestamp in ISO format, and the
    value of the ``SCRAPY_JOB`` environment variable as submission number,
    or an empty string when it is unset). It is then converted according
    to ``self.record_format``.

    Args:
        source(str): string identifying the source for this item
            (ex. 'arXiv').

    Returns:
        The value produced by ``hep_to_hep`` (for ``'hep'`` records, after
        renaming legacy ``old_url`` document keys) or by
        ``hepcrawl_to_hep`` (for ``'hepcrawl'`` records).

    Raises:
        UnknownItemFormat: if ``self.record_format`` is neither ``'hep'``
            nor ``'hepcrawl'``.
    """
    # The builder is only used to produce a well-formed acquisition_source
    # entry, which is then copied onto the record being converted.
    acquisition_builder = LiteratureBuilder(source=source)
    acquisition_builder.add_acquisition_source(
        source=source,
        method='hepcrawl',
        date=datetime.datetime.now().isoformat(),
        submission_number=os.environ.get('SCRAPY_JOB', ''),
    )
    self.record['acquisition_source'] = (
        acquisition_builder.record['acquisition_source']
    )

    if self.record_format == 'hep':
        hep_record = hep_to_hep(
            hep_record=self.record,
            record_files=self.record_files,
        )
        for doc in hep_record.get('documents', []):
            # Move a legacy 'old_url' over to 'original_url', but never
            # overwrite an 'original_url' that is already present.
            has_only_old_url = (
                'old_url' in doc and 'original_url' not in doc
            )
            if has_only_old_url:
                doc['original_url'] = doc.pop('old_url')
        return hep_record

    if self.record_format == 'hepcrawl':
        normalized = _normalize_hepcrawl_record(
            item=self.record,
            source=source,
        )
        return hepcrawl_to_hep(dict(normalized))

    raise UnknownItemFormat(
        'Unknown ParsedItem::{}'.format(self.record_format)
    )
def test_no_document_type(input_no_document_type_record, expected_no_document_type_record):
    """Check that ``hepcrawl_to_hep`` output for the input fixture equals the expected fixture."""
    converted = hepcrawl_to_hep(input_no_document_type_record)

    assert converted == expected_no_document_type_record
def test_no_document_type(
    input_no_document_type_record,
    expected_no_document_type_record,
):
    """Check that ``hepcrawl_to_hep`` output for the input fixture equals the expected fixture.

    NOTE(review): a test with this same name appears to be defined just
    before this one; if both live in the same module, only the later
    definition is bound to the name. Confirm whether one should be removed.
    """
    assert (
        hepcrawl_to_hep(input_no_document_type_record)
        == expected_no_document_type_record
    )
def test_generic_crawler_record(input_generic_crawler_record, expected_generic_crawler_record):
    """Check that ``hepcrawl_to_hep`` output for the input fixture equals the expected fixture."""
    converted = hepcrawl_to_hep(input_generic_crawler_record)

    assert converted == expected_generic_crawler_record
def test_generic_crawler_record(
    input_generic_crawler_record,
    expected_generic_crawler_record,
):
    """Check that ``hepcrawl_to_hep`` output for the input fixture equals the expected fixture.

    NOTE(review): a test with this same name appears to be defined just
    before this one; if both live in the same module, only the later
    definition is bound to the name. Confirm whether one should be removed.
    """
    assert (
        hepcrawl_to_hep(input_generic_crawler_record)
        == expected_generic_crawler_record
    )