def insert_urls(pmid, reference_id, doi_url, pmc_url, source_id, created_by):
    """Attach URL rows to a reference.

    A PubMed URL is always created from the pmid; DOI and PMC full-text
    URLs are created only when provided. Flushes the session and
    refreshes the last row added.

    Parameters
    ----------
    pmid: int
    reference_id: int
    doi_url: str or None
    pmc_url: str or None
    source_id: int
    created_by: str
    """
    # (label, url) pairs; label doubles as both display_name and url_type.
    url_rows = [('PubMed', pubmed_root + str(pmid))]
    if doi_url:
        url_rows.append(('DOI full text', doi_url))
    if pmc_url:
        url_rows.append(('PMC full text', pmc_url))
    for label, url in url_rows:
        entry = ReferenceUrl(display_name=label,
                             obj_url=url,
                             reference_id=reference_id,
                             url_type=label,
                             source_id=source_id,
                             created_by=created_by)
        DBSession.add(entry)
    DBSession.flush()
    DBSession.refresh(entry)
def insert_abstract(pmid, reference_id, record, source_id, journal_abbrev,
                    journal_title, issn_print, created_by):
    """ Add abstract to Referencedocument table

    This method does not return anything, just does the necessary
    CRUD operations. Skips records with no 'AB' (abstract) field.

    Parameters
    ----------
    pmid: int
    reference_id: int
    record: dict
        Medline record; 'AB' holds the abstract text
    source_id: int
    journal_abbrev: str
    journal_title: str
    issn_print: str
    created_by: str

    Return
    ------
    empty
        does not return anything
    """
    text = record.get('AB', '')
    if text == '':
        return
    locus_names_ids = DBSession.query(Locusdbentity.display_name,
                                      Locusdbentity.sgdid).all()
    # Hyperlink gene names appearing in the abstract for the HTML copy.
    html = link_gene_names(text, locus_names_ids)
    x = Referencedocument(document_type='Abstract',
                          source_id=source_id,
                          reference_id=reference_id,
                          text=text,
                          html=html,
                          created_by=created_by)
    DBSession.add(x)
    entries = create_bibentry(pmid, record, journal_abbrev, journal_title,
                              issn_print)
    # Build the Medline document body once (the original computed the same
    # join twice, once for text and once for html).
    medline = '\n'.join(key + ' - ' + str(value)
                        for key, value in entries if value is not None)
    y = Referencedocument(document_type='Medline',
                          source_id=source_id,
                          reference_id=reference_id,
                          text=medline,
                          html=medline,
                          created_by=created_by)
    DBSession.add(y)
    DBSession.flush()
    DBSession.refresh(x)
def insert_pubtypes(pmid, reference_id, pubtypes, source_id, created_by):
    """Add one Referencetype row per publication type.

    Parameters
    ----------
    pmid: int
    reference_id: int
    pubtypes: list of str
    source_id: int
    created_by: str
    """
    # Guard: the original raised NameError on refresh(x) for an empty list.
    if not pubtypes:
        return
    # Renamed loop variable: the original shadowed the builtin `type`.
    for pubtype in pubtypes:
        x = Referencetype(display_name=pubtype,
                          obj_url='/referencetype/' + pubtype.replace(' ', '_'),
                          source_id=source_id,
                          reference_id=reference_id,
                          created_by=created_by)
        DBSession.add(x)
    DBSession.flush()
    DBSession.refresh(x)
def get_journal_id(record, created_by):
    """Resolve (or create) the journal for a Medline record.

    Lookup order: existing Journal by print ISSN, then by Medline
    abbreviation; when neither matches, a new Journal row is created.

    Parameters
    ----------
    record: dict
        Medline record; reads 'TA' (abbreviation), 'JT' (full title)
        and 'IS' (ISSN string).
    created_by: str

    Returns
    -------
    tuple
        (journal_id, med_abbr, journal_full_name, issn_print);
        (None, '', '', '') when the record has no journal abbreviation.
    """
    journal_abbr = record.get('TA', '')
    journal_full_name = record.get('JT', '')
    # 'IS' examples (qualifier in parentheses after each ISSN):
    #   1469-221X (Print) 1469-221X (Linking)
    #   1573-6881 (Electronic) 0145-479X (Linking)
    issn_list = record.get('IS', '').split(') ')
    issn_print = ''
    issn_electronic = ''
    for issn in issn_list:
        if "Print" in issn or "Linking" in issn:
            issn_print = issn.split(' ')[0]
        if "Electronic" in issn:
            issn_electronic = issn.split(' ')[0]
    if issn_print:
        journals = DBSession.query(Journal).filter_by(
            issn_print=issn_print).all()
        if len(journals) > 0:
            return journals[0].journal_id, journals[
                0].med_abbr, journal_full_name, issn_print
    if journal_abbr == '':
        return None, '', '', ''
    # journal_abbr is guaranteed non-empty here, so the original's second
    # `if journal_abbr:` re-check was redundant and has been removed.
    journals = DBSession.query(Journal).filter_by(
        med_abbr=journal_abbr).all()
    if len(journals) > 0:
        return journals[0].journal_id, journals[
            0].med_abbr, journal_full_name, issn_print
    source_id = 824  # 'PubMed'
    # Column limits: display_name/title capped at 200, format_name at 100.
    shortened_full_name = (
        journal_full_name[:197] +
        '...') if len(journal_full_name) > 200 else journal_full_name
    format_name = journal_full_name.replace(' ', '_') + journal_abbr.replace(
        ' ', '_')
    j = Journal(issn_print=issn_print,
                issn_electronic=issn_electronic,
                display_name=shortened_full_name,
                format_name=(format_name[:97] + '...')
                if len(format_name) > 100 else format_name,
                title=shortened_full_name,
                med_abbr=journal_abbr,
                source_id=source_id,
                obj_url='/journal/' + format_name,
                created_by=created_by)
    DBSession.add(j)
    DBSession.flush()
    DBSession.refresh(j)
    return j.journal_id, j.med_abbr, journal_full_name, issn_print
def insert_authors(reference_id, authors, source_id, created_by):
    """Add one Referenceauthor row per author, preserving author order.

    Parameters
    ----------
    reference_id: int
    authors: list of str
        Author display names in citation order.
    source_id: int
    created_by: str
    """
    # Guard: refresh(x) below would NameError on an empty author list.
    if not authors:
        return
    # enumerate replaces the original hand-rolled counter; order is 1-based.
    for order, author in enumerate(authors, start=1):
        x = Referenceauthor(display_name=author,
                            obj_url='/author/' + author.replace(' ', '_'),
                            source_id=source_id,
                            reference_id=reference_id,
                            author_order=order,
                            author_type='Author',
                            created_by=created_by)
        DBSession.add(x)
    DBSession.flush()
    DBSession.refresh(x)
def insert_relations(pmid, reference_id, record, created_by):
    """Create parent/child ReferenceRelation rows from Medline linkage tags.

    "In" tags (CIN, EIN, ...) mean this reference is the PARENT of the
    PMID named in the tag text; "on" tags (CON, EFR, ...) mean it is the
    CHILD of the named PMID. Rows are only created when the tag text
    contains a "PMID:" and the other reference already exists locally.

    Parameters
    ----------
    pmid: int
    reference_id: int
    record: dict
        Medline record carrying the linkage tags.
    created_by: str
    """
    tag_to_type = {
        "CON": "Comment",
        "CIN": "Comment",
        "EIN": "Erratum",
        "EFR": "Erratum",
        "CRI": "Corrected and Republished",
        "CRF": "Corrected and Republished",
        "PRIN": "Partial retraction",
        "PROF": "Partial retraction",
        "RPI": "Republished",
        "RPF": "Republished",
        "RIN": "Retraction",
        "ROF": "Retraction",
        "UIN": "Update",
        "UOF": "Update",
        "SPIN": "Summary for patients",
        "ORI": "Original report"
    }
    # Track the relation type per direction: the original used a single
    # `rtype`, so when both an "in" and an "on" tag were present the second
    # loop's type silently overwrote the first and the parent relation was
    # stored with the wrong correction_type.
    inText = None
    inType = None
    onText = None
    onType = None
    for tag in [
            'CIN', 'EIN', 'CRI', 'PRIN', 'RPI', 'RIN', 'UIN', 'SPIN', 'ORI'
    ]:
        if record.get(tag):
            inText = record[tag]
            inType = tag_to_type[tag]
            break
    for tag in ['CON', 'EFR', 'CRF', 'PROF', 'RPF', 'ROF', 'UOF']:
        if record.get(tag):
            onText = record[tag]
            onType = tag_to_type[tag]
            break
    if inText is None and onText is None:
        return
    source_id = 834  # 'SGD'
    x = None
    if isinstance(inText, list):
        inText = inText[0]
    if inText is not None and "PMID:" in inText:
        # This reference is the parent; the tag text names the child PMID.
        # Split on "PMID:" (not "PMID: ") so a missing space after the
        # colon no longer raises IndexError; strip() absorbs the space.
        child_pmid = inText.split("PMID:")[1].strip()
        child_reference_id = get_reference_id(int(child_pmid))
        if child_reference_id is not None:
            x = ReferenceRelation(parent_id=reference_id,
                                  child_id=child_reference_id,
                                  source_id=source_id,
                                  correction_type=inType,
                                  created_by=created_by)
            DBSession.add(x)
    if isinstance(onText, list):
        onText = onText[0]
    if onText is not None and "PMID:" in onText:
        # This reference is the child; the tag text names the parent PMID.
        parent_pmid = onText.split("PMID:")[1].strip()
        parent_reference_id = get_reference_id(int(parent_pmid))
        if parent_reference_id is not None:
            x = ReferenceRelation(parent_id=parent_reference_id,
                                  child_id=reference_id,
                                  source_id=source_id,
                                  correction_type=onType,
                                  created_by=created_by)
            DBSession.add(x)
    # Guard: the original called refresh(x) unconditionally and raised
    # NameError when no relation row had been created.
    if x is None:
        return
    DBSession.flush()
    DBSession.refresh(x)
def insert_referencedbentity(pmid, source_id, record, created_by,
                             method_obtained="Curator triage"):
    """ Inserts referencedbentity object into table referencedbentity

    Also inserts the abstract/Medline documents via insert_abstract.

    Parameters
    ----------
    pmid: int
    source_id: int
    record: dict
        Medline record (DP, TI, AU, VI, IP, PG, PMC, ... fields)
    created_by: str
    method_obtained: str, optional

    Returns
    --------
    list
        [dbentity_id, authors, doi_url, pmc_url, sgdid, dbentity]
    """
    pubstatus, date_revised = get_pubstatus_date_revised(record)
    journal_id, journal, journal_title, issn_print = get_journal_id(
        record, created_by)
    pubdate = record.get('DP', None)
    # Guard: records without a 'DP' field previously crashed on .split().
    year = pubdate.split(' ')[0] if pubdate else None
    title = record.get('TI', None)
    authors = record.get('AU', [])
    volume = record.get('VI', None)
    issue = record.get('IP', None)
    pages = record.get('PG', None)
    citation = set_cite(title, authors, year, journal, volume, issue, pages)
    doi, doi_url = get_doi(record)
    pmcid = record.get('PMC', None)
    pmc_url = pmc_root + pmcid + '/' if pmcid else None
    publication_status = status
    fulltext_status = pdf_status
    # Ahead-of-print articles get the epub status pair instead.
    if pubstatus == 'aheadofprint':
        publication_status = epub_status
        fulltext_status = epub_pdf_status
    if year:
        year = int(year)
    if journal_id:
        journal_id = int(journal_id)
    # display_name is the citation truncated after the "(YEAR)" part.
    x = Referencedbentity(display_name=citation.split(')')[0] + ')',
                          source_id=source_id,
                          subclass='REFERENCE',
                          dbentity_status='Active',
                          method_obtained=method_obtained,
                          publication_status=publication_status,
                          fulltext_status=fulltext_status,
                          citation=citation,
                          year=year,
                          pmid=int(pmid),
                          pmcid=pmcid,
                          date_published=pubdate,
                          date_revised=date_revised,
                          issue=issue,
                          page=pages,
                          volume=volume,
                          title=title,
                          doi=doi,
                          journal_id=journal_id,
                          created_by=created_by)
    DBSession.add(x)
    DBSession.flush()
    DBSession.refresh(x)
    dbentity_id = x.dbentity_id
    ## insert into REFERENCEDOCUMENT
    insert_abstract(pmid, dbentity_id, record, source_id, journal,
                    journal_title, issn_print, created_by)
    return [dbentity_id, authors, doi_url, pmc_url, x.sgdid, x]
def upload_file_obj_db_s3():
    """ Upload file metadata to database and s3

    Iterates the parsed upload spec (one dict per file, sorted by file
    extension). For each entry: resolves its README row, then either
    creates a new Filedbentity (via upload_file_helper) or updates the
    existing row's metadata and re-uploads its content to S3, finally
    attaching path entries, PMIDs and keywords. Any exception is logged
    and swallowed (top-level boundary); nothing is returned.
    """
    readme_file_id = None
    file_content_list = file_upload_to_obj()
    try:
        if file_content_list:
            sorted_content = sorted(file_content_list,
                                    key=itemgetter('file_extension'))
            for item in sorted_content:
                # Resolve the README this file should reference, if named.
                # NOTE: readme_file_id persists across loop iterations, so a
                # file without a readme_name inherits the previous file's
                # README id — presumably intentional (files grouped under a
                # shared README); confirm with the spec format.
                if item['readme_name']:
                    readme = DBSession.query(Filedbentity).filter(
                        Filedbentity.display_name ==
                        item['readme_name']).one_or_none()
                    if readme is None:
                        print('unable to find README ' + item['readme_name'])
                        logging.warning('unable to find README ' +
                                        item['readme_name'])
                    else:
                        readme_file_id = readme.dbentity_id
                # see if file_meta already exists, else create
                existing_file_meta_data = DBSession.query(Filedbentity).filter(
                    Filedbentity.display_name ==
                    item['display_name']).one_or_none()
                # [0] unpacks the single-column row; raises TypeError if the
                # source name is unknown (one_or_none() returned None).
                source_id = DBSession.query(Source.source_id).filter(
                    Source.display_name == item['source']).one_or_none()[0]
                d_name = item['display_name']
                f_ext = item['file_extension']
                temp_file_path = get_file_from_path_collection(f_ext, d_name)
                if not existing_file_meta_data:
                    # New file: resolve EDAM ontology ids, then create the
                    # row and upload via the shared helper.
                    try:
                        data_id = DBSession.query(Edam.edam_id).filter(
                            Edam.edamid ==
                            item['data_edam_id']).one_or_none()[0]
                        format_id = DBSession.query(Edam.edam_id).filter(
                            Edam.edamid ==
                            item['format_edam_id']).one_or_none()[0]
                        topic_id = DBSession.query(Edam.edam_id).filter(
                            Edam.edamid ==
                            item['topic_edam_id']).one_or_none()[0]
                        item["data_id"] = data_id
                        item["format_id"] = format_id
                        item["topic_id"] = topic_id
                        item["source_id"] = source_id
                        item["readme_file_id"] = readme_file_id
                    except TypeError:
                        # one_or_none() returned None for an EDAM id; the
                        # upload below still runs with whatever ids resolved.
                        logging.error('invalid EDAM id or source in row ' +
                                      ' val in ' + item['data_edam_id'] +
                                      ', ' + item['format_edam_id'] + ', ' +
                                      item['topic_edam_id'])
                    if temp_file_path:
                        with open(temp_file_path, 'rb') as remote_file:
                            upload_file_helper(CREATED_BY, remote_file, item,
                                               temp_file_path)
                    DBSession.flush()
                else:
                    # Existing file: refresh its metadata in place.
                    existing_file_meta_data.display_name = item['display_name']
                    existing_file_meta_data.description = item['description']
                    existing_file_meta_data.status = item['status']
                    existing_file_meta_data.is_public = item['is_public']
                    existing_file_meta_data.is_in_spell = item['is_in_spell']
                    existing_file_meta_data.is_in_browser = item[
                        'is_in_browser']
                    existing_file_meta_data.readme_file_id = readme_file_id
                    existing_file_meta_data.source_id = source_id
                    if temp_file_path:
                        with open(temp_file_path, 'rb') as remote_file:
                            #update file size
                            # Only backfill size when the row already has an
                            # s3_url but no recorded size.
                            if not existing_file_meta_data.file_size and existing_file_meta_data.s3_url:
                                remote_file.seek(0, os.SEEK_END)
                                file_size = remote_file.tell()
                                remote_file.seek(0)
                                existing_file_meta_data.file_size = file_size
                            if item['file_date']:
                                existing_file_meta_data.file_date = item[
                                    'file_date']
                                existing_file_meta_data.year = item[
                                    'file_date'].year
                            existing_file_meta_data.readme_file_id = readme_file_id
                            remote_file.seek(0, os.SEEK_END)
                            #transaction.commit()
                            # Re-query to get a fresh instance before the S3
                            # upload mutates it.
                            existing_file_meta_data = DBSession.query(
                                Filedbentity).filter(
                                    Filedbentity.display_name ==
                                    item['display_name']).one_or_none()
                            # only upload s3 file if not defined
                            existing_file_meta_data.upload_file_to_s3(
                                file=remote_file,
                                filename=item['display_name'],
                                file_path=temp_file_path,
                                flag=False)
                    # Link supplementary data to the (updated) file row.
                    add_path_entries(item['display_name'], item['new_path'],
                                     SGD_SOURCE_ID, CREATED_BY)
                    add_pmids(item['display_name'], item['pmids'],
                              SGD_SOURCE_ID, CREATED_BY)
                    add_keywords(item['display_name'], item['keywords'],
                                 SGD_SOURCE_ID, CREATED_BY)
                    if item['display_name'].endswith('.README'):
                        update_readme_files_with_urls(item['display_name'])
                    transaction.commit()
                    DBSession.flush()
                logging.info('finished processing file: ' +
                             item['display_name'])
    except Exception as e:
        # Top-level boundary: log with traceback, swallow the error.
        logging.error("Exception occurred", exc_info=True)