def _get_art_context(record):
    """Build the template context used to export an article record.

    Args:
        record (dict): a literature record.

    Returns:
        dict: the values needed by the article export template.
    """
    reader = LiteratureReader(record)
    summary = reader.abstract
    try:
        summary_language = detect(summary)
    except LangDetectException:
        # Language detection failed (e.g. empty abstract); leave it blank.
        summary_language = ""
    context = {
        "abstract": summary,
        "abstract_language": summary_language,
        "arxiv_id": reader.arxiv_id,
        "authors": get_authors(record),
        "collaborations": reader.collaborations,
        "divulgation": get_divulgation(record),
        "doi": reader.doi,
        "domains": get_domains(record),
        "inspire_id": reader.inspire_id,
        "journal_issue": reader.journal_issue,
        "journal_title": reader.journal_title,
        "journal_volume": reader.journal_volume,
        "keywords": reader.keywords,
        "language": reader.language,
        "page_artid": reader.get_page_artid(),
        "peer_reviewed": 1 if reader.peer_reviewed else 0,
        "publication_date": reader.publication_date,
        "subtitle": reader.subtitle,
        "title": reader.title,
    }
    return context
def _get_art_context(record):
    """Assemble the export context for an article record.

    Args:
        record (dict): a literature record.

    Returns:
        dict: template variables describing the article.
    """
    reader = LiteratureReader(record)

    abstract = reader.abstract
    # Best-effort language detection; fall back to the empty string.
    try:
        abstract_language = detect(abstract)
    except LangDetectException:
        abstract_language = ''

    return {
        'abstract': abstract,
        'abstract_language': abstract_language,
        'arxiv_id': reader.arxiv_id,
        'authors': get_authors(record),
        'collaborations': reader.collaborations,
        'divulgation': get_divulgation(record),
        'doi': reader.doi,
        'domains': get_domains(record),
        'inspire_id': reader.inspire_id,
        'journal_issue': reader.journal_issue,
        'journal_title': reader.journal_title,
        'journal_volume': reader.journal_volume,
        'keywords': reader.keywords,
        'language': reader.language,
        'page_artid': reader.get_page_artid(),
        'peer_reviewed': 1 if reader.peer_reviewed else 0,
        'publication_date': reader.publication_date,
        'subtitle': reader.subtitle,
        'title': reader.title,
    }
def start_merger(head_id, update_id, current_user_id=None):
    """Start a new ManualMerge workflow to merge two records manually.

    Args:
        head_id: the id of the first record to merge. This record is the
            one that will be updated with the new information.
        update_id: the id of the second record to merge. This record is
            the one that is going to be deleted and replaced by `head`.
        current_user_id: Id of the current user provided by the Flask app.

    Returns:
        (int): the current workflow object's id.
    """
    head = get_db_record('lit', head_id)
    update = get_db_record('lit', update_id)

    workflow_object = workflow_object_class.create(
        data=None,
        id_user=current_user_id,
        data_type='hep',
    )
    wf_id = workflow_object.id  # remember it so it can be returned after save

    workflow_object.extra_data.update({
        'pid_type': 'lit',  # TODO: support other pid types
        'recid_head': head_id,
        'recid_update': update_id,
    })

    # Default to 'arxiv' when the update record declares no source.
    source = LiteratureReader(update).source or 'arxiv'

    extra = workflow_object.extra_data
    extra['update_source'] = source.lower()
    extra['head_control_number'] = head_id
    extra['update_control_number'] = update_id
    extra['head_uuid'] = str(head.id)
    extra['update_uuid'] = str(update.id)
    extra['head'] = head
    extra['update'] = update

    workflow_object.save()

    start('manual_merge', object_id=wf_id)

    return wf_id
def merge_articles(obj, eng):
    """Merge two articles.

    The workflow payload is overwritten by the merged record, the conflicts
    are stored in ``extra_data.conflicts``. Also, it adds a ``callback_url``
    which contains the endpoint which resolves the merge conflicts.

    Note:
        When the feature flag ``FEATURE_FLAG_ENABLE_MERGER`` is ``False`` it
        will skip the merge.
    """
    if not current_app.config.get('FEATURE_FLAG_ENABLE_MERGER'):
        return None

    # The control number of the already-approved matching record.
    matched_control_number = obj.extra_data['matches']['approved']

    head_uuid = PersistentIdentifier.get(
        'lit', matched_control_number).object_uuid

    head_record = InspireRecord.get_record(head_uuid)
    update = obj.data
    update_source = LiteratureReader(obj.data).source
    # Previous root from the same source, if any; otherwise merge against {}.
    head_root = read_wf_record_source(record_uuid=head_record.id, source=update_source.lower())
    head_root = head_root.json if head_root else {}

    obj.extra_data['head_uuid'] = str(head_uuid)
    obj.extra_data['head_version_id'] = head_record.model.version_id
    obj.extra_data['merger_head_revision'] = head_record.revision_id
    # Deep copy so later mutations of the root don't alter the saved original.
    obj.extra_data['merger_original_root'] = deepcopy(head_root)
    merged, conflicts = merge(
        head=head_record.to_dict(),
        root=head_root,
        update=update,
    )

    # The merged record replaces the workflow payload.
    obj.data = merged

    if conflicts:
        obj.extra_data['conflicts'] = conflicts
        obj.extra_data['conflicts_metadata'] = {
            'datetime': datetime.now().strftime("%b %d, %Y, %H:%M:%S %p"),
            'update_source': update_source,
        }
        # Endpoint a curator uses to resolve the recorded conflicts.
        obj.extra_data['callback_url'] = \
            get_resolve_merge_conflicts_callback_url()
    obj.save()
def start_merger(head_id, update_id, current_user_id=None):
    """Start a new ManualMerge workflow to merge two records manually.

    Args:
        head_id: the id of the first record to merge. This record is the
            one that will be updated with the new information.
        update_id: the id of the second record to merge. This record is
            the one that is going to be deleted and replaced by `head`.
        current_user_id: Id of the current user provided by the Flask app.

    Returns:
        (int): the current workflow object's id.
    """
    merge_ids = {
        'pid_type': 'lit',  # TODO: support other pid types
        'recid_head': head_id,
        'recid_update': update_id,
    }

    head_record = get_db_record('lit', head_id)
    update_record = get_db_record('lit', update_id)

    wf_object = workflow_object_class.create(
        data=None, id_user=current_user_id, data_type='hep')
    workflow_id = wf_object.id  # to retrieve it later

    wf_object.extra_data.update(merge_ids)

    # The update record's source, defaulting to 'arxiv' when absent.
    raw_source = LiteratureReader(update_record).source
    wf_object.extra_data['update_source'] = (raw_source or 'arxiv').lower()

    wf_object.extra_data['head_control_number'] = head_id
    wf_object.extra_data['update_control_number'] = update_id
    wf_object.extra_data['head_uuid'] = str(head_record.id)
    wf_object.extra_data['update_uuid'] = str(update_record.id)
    wf_object.extra_data['head'] = head_record
    wf_object.extra_data['update'] = update_record

    wf_object.save()

    start('manual_merge', object_id=workflow_id)

    return workflow_id
def populate_arxiv_document(obj, eng):
    """Attach the arXiv PDF of the record as a document.

    Tries the primary and the alternative arXiv PDF URLs; when neither serves
    a PDF, either returns silently (arXiv explicitly says no PDF exists) or
    raises ``DownloadError``.
    """
    arxiv_id = LiteratureReader(obj.data).arxiv_id

    for conf_name in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        url = current_app.config[conf_name].format(arxiv_id=arxiv_id)
        is_valid_pdf_link = is_pdf_link(url)
        if is_valid_pdf_link:
            break
        try:
            # arXiv serves a placeholder page when no PDF is available.
            if NO_PDF_ON_ARXIV in requests.get(url).content:
                obj.log.info('No PDF is available for %s', arxiv_id)
                return
        except requests.exceptions.RequestException:
            raise DownloadError("Error accessing url {url}".format(url=url))

    if not is_valid_pdf_link:
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    # Drop any previously attached document with the same key before re-adding.
    obj.data['documents'] = [
        document
        for document in obj.data.get('documents', ())
        if document.get('key') != filename
    ]

    lb = LiteratureBuilder(source='arxiv', record=obj.data)
    lb.add_document(
        filename,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = lb.record
def get_note(data, doc_type):
    """Write an addendum/errata information to the BibTeX note field.

    Traverse publication_info looking for erratum and addendum in
    `publication_info.material` field and build a string of references to
    those publication entries.

    Returns:
        string: formatted list of the errata and addenda available for a
        given record, or ``None`` when there are none.
    """
    notices = ('erratum', 'addendum')
    relevant_entries = [
        entry for entry in get_value(data, 'publication_info', [])
        if entry.get('material') in notices
    ]
    if not relevant_entries:
        return None

    formatted = []
    for entry in relevant_entries:
        year = '({})'.format(entry['year']) if 'year' in entry else ''
        pages = LiteratureReader.get_page_artid_for_publication_info(entry, '--')
        piece = text_type('{field}: {journal} {volume}, {pages} {year}').format(
            field=entry['material'].title(),
            journal=entry.get('journal_title'),
            volume=entry.get('journal_volume'),
            pages=pages,
            year=year,
        )
        formatted.append(piece.strip())

    note = '[' + ', '.join(formatted) + ']'
    note = re.sub(' +', ' ', note)  # Remove possible multiple spaces
    return re.sub(',,', ',', note)  # ... and commas
def get_note(data, doc_type):
    """Write an addendum/errata information to the BibTeX note field.

    Traverse publication_info looking for erratum and addendum in
    `publication_info.material` field and build a string of references to
    those publication entries.

    Returns:
        string: formatted list of the errata and addenda available for a
        given record, or ``None`` when there are none.
    """
    def _format_entry(entry):
        # One "<Material>: <journal> <volume>, <pages> (<year>)" fragment.
        return text_type('{field}: {journal} {volume}, {pages} {year}').format(
            field=entry['material'].title(),
            journal=entry.get('journal_title'),
            volume=entry.get('journal_volume'),
            pages=LiteratureReader.get_page_artid_for_publication_info(entry, '--'),
            year='({})'.format(entry['year']) if 'year' in entry else '',
        ).strip()

    notices = ('erratum', 'addendum')
    entries = [
        entry
        for entry in get_value(data, 'publication_info', [])
        if entry.get('material') in notices
    ]

    if not entries:
        return None

    joined = ', '.join(_format_entry(entry) for entry in entries)
    note_string = '[' + joined + ']'
    note_string = re.sub(' +', ' ', note_string)  # Remove possible multiple spaces
    return re.sub(',,', ',', note_string)  # ... and commas
def arxiv_derive_inspire_categories(obj, eng):
    """Derive ``inspire_categories`` from the arXiv categories.

    Uses side effects to populate the ``inspire_categories`` key in
    ``obj.data`` by converting its arXiv categories.

    Args:
        obj (WorkflowObject): a workflow object.
        eng (WorkflowEngine): a workflow engine.

    Returns:
        None
    """
    obj.data.setdefault('inspire_categories', [])

    for category in LiteratureReader(obj.data).arxiv_categories:
        term = classify_field(category)
        if not term:
            continue

        candidate = {
            'source': 'arxiv',
            'term': term,
        }
        # Avoid duplicating an entry that is already present.
        if candidate not in obj.data['inspire_categories']:
            obj.data['inspire_categories'].append(candidate)
def _is_art(record):
    """Tell whether the record is a published article-like document.

    Returns the record's published flag when the document types overlap
    with ``ARTICLE_LIKE_DOCUMENT_TYPES``, a falsy value otherwise.
    """
    reader = LiteratureReader(record)
    overlap = ARTICLE_LIKE_DOCUMENT_TYPES.intersection(reader.document_types)
    published = reader.is_published
    return overlap and published
def reply_ticket_context(user, obj):
    """Context for literature replies."""
    extra = obj.extra_data
    return {
        'object': obj,
        'user': user,
        'title': LiteratureReader(obj.data).title,
        'reason': extra.get("reason", ""),
        'record_url': extra.get("url", ""),
    }
def conference_information(self):
    """Conference information.

    Returns a list with information about conferences related to the
    record.
    """
    conf_info = []
    # NOTE(review): assumes 'publication_info' is present in the record —
    # raises KeyError otherwise; confirm against callers.
    for pub_info in self['publication_info']:
        conference_recid = None
        parent_recid = None
        parent_rec = {}
        conference_rec = {}
        if 'conference_record' in pub_info:
            conference_rec = replace_refs(pub_info['conference_record'], 'es')
            if conference_rec and conference_rec.get('control_number'):
                conference_recid = conference_rec['control_number']
            else:
                # Resolved record without a control number: treat as missing.
                conference_rec = {}
        if 'parent_record' in pub_info:
            parent_rec = replace_refs(pub_info['parent_record'], 'es')
            if parent_rec and parent_rec.get('control_number'):
                parent_recid = parent_rec['control_number']
            else:
                parent_rec = {}
        conf_info.append({
            "conference_recid": conference_recid,
            "conference_title": LiteratureReader(conference_rec).title,
            "parent_recid": parent_recid,
            # Strip the leading "Proceedings, " prefix (first occurrence only).
            "parent_title": LiteratureReader(parent_rec).title.replace(
                "Proceedings, ", "", 1),
            "page_start": pub_info.get('page_start'),
            "page_end": pub_info.get('page_end'),
            "artid": pub_info.get('artid'),
        })
    return conf_info
def test_manual_merge_existing_records(workflow_app):
    """Run a full manual merge of two existing records and check the result."""
    json_head = fake_record('This is the HEAD', 1)
    json_update = fake_record('While this is the update', 2)

    # these two fields will create a merging conflict
    json_head['core'] = True
    json_update['core'] = False

    head = InspireRecord.create_or_update(json_head, skip_files=False)
    head.commit()
    update = InspireRecord.create_or_update(json_update, skip_files=False)
    update.commit()

    head_id = head.id
    update_id = update.id

    obj_id = start_merger(
        head_id=1,
        update_id=2,
        current_user_id=1,
    )

    do_resolve_manual_merge_wf(workflow_app, obj_id)

    # retrieve it again, otherwise Detached Instance Error
    obj = workflow_object_class.get(obj_id)

    assert obj.status == ObjectStatus.COMPLETED
    assert obj.extra_data['approved'] is True
    assert obj.extra_data['auto-approved'] is False

    # no root present before
    last_root = read_wf_record_source(head_id, 'arxiv')
    assert last_root is None

    update_source = LiteratureReader(update).source
    root_update = read_wf_record_source(update_id, update_source)
    assert root_update is None

    # check that head's content has been replaced by merged
    deleted_record = RecordMetadata.query.filter_by(id=update_id).one()

    latest_record = get_db_record('lit', 1)

    assert deleted_record.json['deleted'] is True

    # check deleted record is linked in the latest one
    deleted_rec_ref = {'$ref': 'http://localhost:5000/api/literature/2'}
    assert [deleted_rec_ref] == latest_record['deleted_records']

    # check the merged record is linked in the deleted one
    new_record_metadata = {'$ref': 'http://localhost:5000/api/literature/1'}
    assert new_record_metadata == deleted_record.json['new_record']

    del latest_record['deleted_records']
    assert latest_record == obj.data  # -> resulted merged record
def get_institution_papers_datatables_rows(hits):
    """Row used by datatables to render institution papers."""
    rows = []
    link_template = "<a href='/literature/{id}'>{name}</a>"

    for hit in hits:
        title = LiteratureReader(hit.to_dict()).title
        current_row = [
            link_template.format(
                id=hit.control_number,
                name=title.encode('utf8'),
            )
        ]

        authors_ctx = {
            'record': hit.to_dict(),
            'is_brief': 'true',
            'number_of_displayed_authors': 1,
            'show_affiliations': 'false',
            'collaboration_only': 'true',
        }
        current_row.append(render_macro_from_template(
            name="render_record_authors",
            template="inspirehep_theme/format/record/Inspire_Default_HTML_general_macros.tpl",
            ctx=authors_ctx,
        ))

        # Hits without publication info render an empty journal title.
        try:
            current_row.append(hit.publication_info[0].journal_title)
        except AttributeError:
            current_row.append('')

        # Missing citation counts default to zero.
        try:
            current_row.append(hit.citation_count)
        except AttributeError:
            current_row.append(0)

        current_row.append(hit.earliest_date.split('-')[0])
        rows.append(current_row)

    return rows
def get_institution_papers_datatables_rows(hits):
    """Row used by datatables to render institution papers."""
    anchor = "<a href='/literature/{id}'>{name}</a>"
    result = []

    for hit in hits:
        row = []

        # Title cell, rendered as a link to the literature record.
        hit_title = LiteratureReader(hit.to_dict()).title
        row.append(anchor.format(id=hit.control_number,
                                 name=hit_title.encode('utf8')))

        # Authors cell, rendered through the shared Jinja macro.
        row.append(render_macro_from_template(
            name="render_record_authors",
            template="inspirehep_theme/format/record/Inspire_Default_HTML_general_macros.tpl",
            ctx={
                'record': hit.to_dict(),
                'is_brief': 'true',
                'number_of_displayed_authors': 1,
                'show_affiliations': 'false',
                'collaboration_only': 'true',
            },
        ))

        try:
            journal = hit.publication_info[0].journal_title
        except AttributeError:
            journal = ''
        row.append(journal)

        try:
            citations = hit.citation_count
        except AttributeError:
            citations = 0
        row.append(citations)

        # Year cell: keep only the year part of the earliest date.
        row.append(hit.earliest_date.split('-')[0])

        result.append(row)

    return result
def _author_list(obj, eng):
    """Extract authors from the arXiv source tarball's author-list XML.

    Looks for XML files inside the record's tarball, converts any matching
    author list to MARCXML and replaces ``obj.data['authors']`` with the
    extracted authors.
    """
    arxiv_id = LiteratureReader(obj.data).arxiv_id
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        tarball = obj.files[filename]
    except KeyError:
        # No tarball attached to the workflow: nothing to extract.
        obj.log.info(
            'Skipping author list extraction, no tarball with name "%s" found' % filename)
        return

    with TemporaryDirectory(prefix='author_list') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            file_list = untar(tarball_file, scratch_space)
        except InvalidTarball:
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        obj.log.info('Extracted tarball to: {0}'.format(scratch_space))

        xml_files_list = [
            path for path in file_list if path.endswith('.xml')
        ]
        obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

        extracted_authors = []
        for xml_file in xml_files_list:
            with open(xml_file, 'r') as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info('Found a match for author extraction')
                try:
                    authors_xml = convert(xml_content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    authors_xml = convert(
                        xml_content.split('\n', 1)[1],
                        stylesheet,
                    )
                extracted_authors.extend(
                    marcxml2record(authors_xml).get('authors', []))

        if extracted_authors:
            # Normalize LaTeX escapes in names before storing them.
            for author in extracted_authors:
                author['full_name'] = decode_latex(author['full_name'])
            obj.data['authors'] = extracted_authors
def _build_publication(record):
    """Map a literature record onto the publication payload dict."""
    reader = LiteratureReader(record)
    publication = {
        'abstract': reader.abstract,
        'authors': _get_authors(record),
        'collaborations': reader.collaborations,
        'keywords': reader.keywords,
        'publication_id': record['control_number'],
        'title': reader.title,
        'topics': reader.inspire_categories,
    }
    return publication
def test_that_db_changes_are_mirrored_in_es(app):
    """Create, update and delete a record in the DB and check ES follows."""
    search = LiteratureSearch()
    json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'document_type': [
            'article',
        ],
        'titles': [
            {
                'title': 'foo'
            },
        ],
        '_collections': ['Literature']
    }

    # When a record is created in the DB, it is also created in ES.

    record = InspireRecord.create(json)
    record.commit()
    db.session.commit()
    es_record = search.get_source(record.id)

    assert LiteratureReader(es_record).title == 'foo'

    # When a record is updated in the DB, it is also updated in ES.

    record['titles'][0]['title'] = 'bar'
    record.commit()
    db.session.commit()
    es_record = search.get_source(record.id)

    assert LiteratureReader(es_record).title == 'bar'

    # When a record is deleted in the DB, it is also deleted in ES.

    record._delete(force=True)
    db.session.commit()

    with pytest.raises(NotFoundError):
        es_record = search.get_source(record.id)
def is_submission(obj, eng):
    """Check if a workflow contains a submission.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: whether the workflow contains a submission.
    """
    return LiteratureReader(obj.data).method == 'submitter'
def refextract(obj, eng):
    """Extract references from various sources and add them to the workflow.

    Runs ``refextract`` on both the PDF attached to the workflow and the
    references provided by the submitter, if any, then chooses the one
    that generated the most and attaches them to the workflow object.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None
    """
    # Raw references already on the record take precedence: extract,
    # dedupe, match and stop.
    if 'references' in obj.data:
        extracted_raw_references = dedupe_list(
            extract_references_from_raw_refs(obj.data['references']))
        obj.log.info('Extracted %d references from raw refs.',
                     len(extracted_raw_references))
        obj.data['references'] = match_references_based_on_flag(
            extracted_raw_references)
        return

    matched_pdf_references, matched_text_references = [], []
    source = LiteratureReader(obj.data).source

    # Extract from the attached PDF, when one exists.
    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            pdf_references = dedupe_list(
                extract_references_from_pdf(tmp_document, source))
            matched_pdf_references = match_references_based_on_flag(
                pdf_references)

    # Extract from the submitter-provided free-text references, if any.
    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        text_references = dedupe_list(
            extract_references_from_text(text, source))
        matched_text_references = match_references_based_on_flag(
            text_references)

    # Keep whichever extraction yielded more references; ties go to text.
    if len(matched_pdf_references) == len(matched_text_references) == 0:
        obj.log.info('No references extracted.')
    elif len(matched_pdf_references) > len(matched_text_references):
        obj.log.info('Extracted %d references from PDF.',
                     len(matched_pdf_references))
        obj.data['references'] = matched_pdf_references
    elif len(matched_text_references) >= len(matched_pdf_references):
        obj.log.info('Extracted %d references from text.',
                     len(matched_text_references))
        obj.data['references'] = matched_text_references
def _get_comm_context(record):
    """Build the template context for a conference communication record.

    Args:
        record (dict): a literature record.

    Returns:
        dict: template variables for the communication export, including
        details of the related conference.
    """
    lit_reader = LiteratureReader(record)

    abstract = lit_reader.abstract
    try:
        abstract_language = detect(abstract)
    except LangDetectException:
        # Detection can fail on empty/odd abstracts; leave it blank.
        abstract_language = ""

    conference_record = get_conference_record(record)
    conference_title = get_value(conference_record, "titles.title[0]")
    conf_reader = ConferenceReader(conference_record)

    context = {
        "abstract": abstract,
        "abstract_language": abstract_language,
        "arxiv_id": lit_reader.arxiv_id,
        "authors": get_authors(record),
        "collaborations": lit_reader.collaborations,
        "conference_city": conf_reader.city,
        "conference_country": conf_reader.country,
        "conference_end_date": conf_reader.end_date,
        "conference_start_date": conf_reader.start_date,
        "conference_title": conference_title,
        "divulgation": get_divulgation(record),
        "doi": lit_reader.doi,
        "domains": get_domains(record),
        "inspire_id": lit_reader.inspire_id,
        "journal_issue": lit_reader.journal_issue,
        "journal_title": lit_reader.journal_title,
        "journal_volume": lit_reader.journal_volume,
        "keywords": lit_reader.keywords,
        "language": lit_reader.language,
        "page_artid": lit_reader.get_page_artid(),
        "peer_reviewed": 1 if lit_reader.peer_reviewed else 0,
        "publication_date": lit_reader.publication_date,
        "subtitle": lit_reader.subtitle,
        "title": lit_reader.title,
    }
    return context
def _get_comm_context(record):
    """Assemble the export context for a conference communication.

    Args:
        record (dict): a literature record.

    Returns:
        dict: the values needed by the communication export template.
    """
    reader = LiteratureReader(record)

    summary = reader.abstract
    # Best-effort language detection; empty string on failure.
    try:
        summary_language = detect(summary)
    except LangDetectException:
        summary_language = ''

    conference = get_conference_record(record)
    conference_reader = ConferenceReader(conference)

    return {
        'abstract': summary,
        'abstract_language': summary_language,
        'arxiv_id': reader.arxiv_id,
        'authors': get_authors(record),
        'collaborations': reader.collaborations,
        'conference_city': conference_reader.city,
        'conference_country': conference_reader.country,
        'conference_end_date': conference_reader.end_date,
        'conference_start_date': conference_reader.start_date,
        'conference_title': get_value(conference, 'titles.title[0]'),
        'divulgation': get_divulgation(record),
        'doi': reader.doi,
        'domains': get_domains(record),
        'inspire_id': reader.inspire_id,
        'journal_issue': reader.journal_issue,
        'journal_title': reader.journal_title,
        'journal_volume': reader.journal_volume,
        'keywords': reader.keywords,
        'language': reader.language,
        'page_artid': reader.get_page_artid(),
        'peer_reviewed': 1 if reader.peer_reviewed else 0,
        'publication_date': reader.publication_date,
        'subtitle': reader.subtitle,
        'title': reader.title,
    }
def new_ticket_context(user, obj):
    """Context for literature new tickets."""
    title = LiteratureReader(obj.data).title
    formdata = obj.extra_data.get('formdata', {})
    identifiers = get_value(obj.data, "external_system_numbers.value") or []

    return dict(
        email=user.email,
        title=title,
        identifier=identifiers or "",
        user_comment=formdata.get('extra_comments', ''),
        references=formdata.get('references'),
        object=obj,
        subject=u"Your suggestion to INSPIRE: {0}".format(title),
    )
def has_fully_harvested_category(record):
    """Check if the record in `obj.data` has fully harvested categories.

    Arguments:
        record(dict): the ingested article.

    Return:
        bool: True when the record belongs to an arXiv category that is
        fully harvested, otherwise False.
    """
    record_categories = set(LiteratureReader(record).arxiv_categories)
    harvested_categories = current_app.config.get('ARXIV_CATEGORIES', {})
    # Default each key to [] so a partial/empty ARXIV_CATEGORIES config does
    # not raise a TypeError (the old `.get('core') + .get('non-core')` was
    # `None + None` whenever the config fell back to {}).
    fully_harvested = set(
        harvested_categories.get('core', []) +
        harvested_categories.get('non-core', [])
    )
    return len(record_categories & fully_harvested) > 0
def arxiv_package_download(obj, eng):
    """Perform the package download step for arXiv records.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = LiteratureReader(obj.data).arxiv_id
    tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball_url = current_app.config['ARXIV_TARBALL_URL'].format(
        arxiv_id=arxiv_id)

    downloaded = download_file_to_workflow(
        workflow=obj,
        name=tarball_name,
        url=tarball_url,
    )

    if not downloaded:
        obj.log.error('Cannot retrieve tarball from arXiv for %s', arxiv_id)
        return

    obj.log.info('Tarball retrieved from arXiv for %s', arxiv_id)
def render_contributions(hits):
    """Render a list of conferences to HTML."""
    link_template = u"<a href='/literature/{id}'>{name}</a>"
    rows = []

    for hit in hits:
        record_title = LiteratureReader(hit.to_dict()).title
        row = [
            link_template.format(id=hit.control_number, name=record_title)
        ]

        # Authors cell, rendered through the shared Jinja macro.
        row.append(render_macro_from_template(
            name="render_record_authors",
            template="inspirehep_theme/format/record/Inspire_Default_HTML_general_macros.tpl",
            ctx={
                'record': hit.to_dict(),
                'is_brief': 'true',
                'number_of_displayed_authors': 1,
                'show_affiliations': 'false',
                'collaboration_only': 'true',
            },
        ))

        try:
            journal = hit.publication_info[0].journal_title
        except AttributeError:
            journal = ''
        row.append(journal)

        try:
            citations = hit.citation_count
        except AttributeError:
            citations = 0
        row.append(citations)

        rows.append(row)

    return rows, hits.total
def is_arxiv_paper(obj, eng):
    """Check if a workflow contains a paper from arXiv.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: whether the workflow contains a paper from arXiv.
    """
    reader = LiteratureReader(obj.data)
    method = reader.method
    source = reader.source

    # Either it was submitted with arXiv eprints attached...
    if method == 'submitter' and 'arxiv_eprints' in obj.data:
        return True
    # ...or it was harvested from the arXiv source.
    return method == 'hepcrawl' and source.lower() == 'arxiv'
def build(cls, record):
    """Build Publication object from record dictionary

    Args:
        record (dict): dictionary containing record data

    Returns:
        Publication: Object built from provided data
    """
    reader = LiteratureReader(record)
    fields = {
        "abstract": reader.abstract,
        "authors": get_authors_full_names(record),
        "collaborations": reader.collaborations,
        "keywords": reader.keywords,
        "publication_id": record["control_number"],
        "title": reader.title,
        "topics": reader.inspire_categories,
    }
    return cls(**fields)
def is_experimental_paper(obj, eng):
    """Check if a workflow contains an experimental paper.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: whether the workflow contains an experimental paper.
    """
    reader = LiteratureReader(obj.data)

    # Experimental when the record's categories overlap the experimental
    # ones on either the arXiv or the INSPIRE side.
    experimental_arxiv = (
        set(reader.arxiv_categories) & set(EXPERIMENTAL_ARXIV_CATEGORIES)
    )
    experimental_inspire = (
        set(reader.inspire_categories) & set(EXPERIMENTAL_INSPIRE_CATEGORIES)
    )

    return len(experimental_arxiv) > 0 or len(experimental_inspire) > 0
def test_manual_merge_existing_records(mock_put_record_to_hep, mock_store_records, workflow_app):
    """Manual merge of two existing records with the HEP push mocked out."""
    json_head = fake_record('This is the HEAD', 1)
    json_update = fake_record('While this is the update', 2)

    # these two fields will create a merging conflict
    json_head['core'] = True
    json_update['core'] = False

    head = InspireRecord.create_or_update(json_head, skip_files=False)
    head.commit()
    update = InspireRecord.create_or_update(json_update, skip_files=False)
    update.commit()

    head_id = head.id
    update_id = update.id

    obj_id = start_merger(
        head_id=1,
        update_id=2,
        current_user_id=1,
    )

    do_resolve_manual_merge_wf(workflow_app, obj_id)

    # the merged record must have been pushed to HEP
    mock_put_record_to_hep.assert_called()

    # retrieve it again, otherwise Detached Instance Error
    obj = workflow_object_class.get(obj_id)

    assert obj.status == ObjectStatus.COMPLETED
    assert obj.extra_data['approved'] is True
    assert obj.extra_data['auto-approved'] is False

    # no root present before
    last_root = read_wf_record_source(head_id, 'arxiv')
    assert last_root is None

    update_source = LiteratureReader(update).source
    root_update = read_wf_record_source(update_id, update_source)
    assert root_update is None
def _get_preprint_context(record):
    """Build the template context used to export a preprint record.

    Args:
        record (dict): a literature record.

    Returns:
        dict: the values needed by the preprint export template.
    """
    reader = LiteratureReader(record)

    abstract = reader.abstract
    # Best-effort language detection; empty string on failure.
    try:
        abstract_language = detect(abstract)
    except LangDetectException:
        abstract_language = ''

    context = {
        'abstract': abstract,
        'abstract_language': abstract_language,
        'arxiv_id': reader.arxiv_id,
        'authors': get_authors(record),
        'collaborations': reader.collaborations,
        'divulgation': get_divulgation(record),
        'domains': get_domains(record),
        'inspire_id': reader.inspire_id,
        'keywords': reader.keywords,
        'language': reader.language,
        'subtitle': reader.subtitle,
        'title': reader.title,
    }
    return context
def _get_preprint_context(record):
    """Assemble the export context for a preprint record.

    Args:
        record (dict): a literature record.

    Returns:
        dict: template variables describing the preprint.
    """
    reader = LiteratureReader(record)
    summary = reader.abstract

    try:
        summary_language = detect(summary)
    except LangDetectException:
        # Detection can fail on empty/odd abstracts; leave it blank.
        summary_language = ""

    return {
        "abstract": summary,
        "abstract_language": summary_language,
        "arxiv_id": reader.arxiv_id,
        "authors": get_authors(record),
        "collaborations": reader.collaborations,
        "divulgation": get_divulgation(record),
        "domains": get_domains(record),
        "inspire_id": reader.inspire_id,
        "keywords": reader.keywords,
        "language": reader.language,
        "subtitle": reader.subtitle,
        "title": reader.title,
    }
def store_root(obj, eng):
    """Insert or update the current record head's root into the
    ``WorkflowsRecordSources`` table."""
    if not current_app.config.get('FEATURE_FLAG_ENABLE_MERGER', False):
        obj.log.info(
            'skipping storing source root, feature flag ``FEATURE_FLAG_ENABLE_MERGER`` is disabled.'
        )
        return

    merger_root = obj.extra_data['merger_root']
    head_uuid = obj.extra_data['head_uuid']

    # Nothing to store when the root declares no source.
    root_source = LiteratureReader(merger_root).source.lower()
    if not root_source:
        return

    entry = WorkflowsRecordSources(
        source=get_source_for_root(root_source),
        record_uuid=head_uuid,
        json=merger_root,
    )
    # merge() inserts or updates the row keyed on (source, record_uuid).
    db.session.merge(entry)
    db.session.commit()
def get_pages(data, doc_type):
    """Return the page range (or article id) of the best publication info,
    with pages joined by ``--`` as BibTeX expects."""
    best_pub_info = get_best_publication_info(data)
    return LiteratureReader.get_page_artid_for_publication_info(
        best_pub_info, '--')