Exemple #1
0
def insert_urls(pmid, reference_id, doi_url, pmc_url, source_id, created_by):
    """Add the URL rows of a reference to the session and flush them.

    A 'PubMed' row (``pubmed_root`` followed by the PMID) is always added.
    'DOI full text' and 'PMC full text' rows are added only when the
    corresponding URL argument is truthy. Each row uses the same label for
    both ``display_name`` and ``url_type``.

    Parameters
    ----------
    pmid: int
    reference_id: int
    doi_url: str or None
    pmc_url: str or None
    source_id: int
    created_by: str

    Return
    ------
    empty
        does not return anything
    """
    links = [('PubMed', pubmed_root + str(pmid))]
    if doi_url:
        links.append(('DOI full text', doi_url))
    if pmc_url:
        links.append(('PMC full text', pmc_url))

    latest = None
    for label, url in links:
        latest = ReferenceUrl(display_name=label,
                              obj_url=url,
                              reference_id=reference_id,
                              url_type=label,
                              source_id=source_id,
                              created_by=created_by)
        DBSession.add(latest)

    DBSession.flush()
    # Only the row added last is refreshed after the flush.
    DBSession.refresh(latest)
Exemple #2
0
def insert_abstract(pmid, reference_id, record, source_id, journal_abbrev,
                    journal_title, issn_print, created_by):
    """ Add abstract to Referencedocument table

    Adds two Referencedocument rows for the reference: an 'Abstract' row
    holding the raw abstract plus its gene-linked HTML, and a 'Medline' row
    holding the "KEY - value" lines built from ``create_bibentry``.
    Nothing is added when the record has no abstract.

    This method does not return anything, just does the necessary CRUD operations

    Parameters
    ----------
    pmid: int
    reference_id: int
    record: dict
        parsed record; the abstract is read from its 'AB' key
    source_id: int
    journal_abbrev: str
    journal_title: str
    issn_print: str
    created_by: str

    Return
    ------
    empty
        does not return anything


    """

    text = record.get('AB', '')

    # A missing, empty or None abstract means there is nothing to store.
    if not text:
        return

    locus_names_ids = DBSession.query(Locusdbentity.display_name,
                                      Locusdbentity.sgdid).all()
    abstract_doc = Referencedocument(document_type='Abstract',
                                     source_id=source_id,
                                     reference_id=reference_id,
                                     text=text,
                                     html=link_gene_names(
                                         text, locus_names_ids),
                                     created_by=created_by)
    DBSession.add(abstract_doc)

    entries = create_bibentry(pmid, record, journal_abbrev, journal_title,
                              issn_print)
    # Built once and reused: the Medline row stores the same content as both
    # its text and its html, and entries without a value are left out.
    medline_text = '\n'.join(key + ' - ' + str(value)
                             for key, value in entries
                             if value is not None)
    medline_doc = Referencedocument(document_type='Medline',
                                    source_id=source_id,
                                    reference_id=reference_id,
                                    text=medline_text,
                                    html=medline_text,
                                    created_by=created_by)
    DBSession.add(medline_doc)
    DBSession.flush()
    DBSession.refresh(abstract_doc)
Exemple #3
0
def insert_pubtypes(pmid, reference_id, pubtypes, source_id, created_by):
    """Add one Referencetype row per publication type and flush them.

    Parameters
    ----------
    pmid: int
        unused here; kept so the signature stays unchanged for callers
    reference_id: int
    pubtypes: list of str
        publication type names; may be empty or None
    source_id: int
    created_by: str

    Return
    ------
    empty
        does not return anything
    """
    last_added = None
    for pubtype in pubtypes or ():
        last_added = Referencetype(display_name=pubtype,
                                   obj_url='/referencetype/' +
                                   pubtype.replace(' ', '_'),
                                   source_id=source_id,
                                   reference_id=reference_id,
                                   created_by=created_by)
        DBSession.add(last_added)

    # With no publication types there is no row to flush or refresh;
    # refreshing an unbound variable used to raise here.
    if last_added is None:
        return

    DBSession.flush()
    DBSession.refresh(last_added)
Exemple #4
0
def get_journal_id(record, created_by):
    """Look up the journal of a record, creating the Journal row if needed.

    The journal is searched first by print ISSN and then by abbreviation
    ('TA'). When neither matches and an abbreviation is present, a new
    Journal row is added and flushed.

    Parameters
    ----------
    record: dict
        parsed record; reads the 'TA', 'JT' and 'IS' keys
    created_by: str

    Returns
    -------
    tuple
        (journal_id, med_abbr, full journal title, print ISSN), or
        (None, '', '', '') when no journal matched and the record has an
        empty abbreviation
    """
    abbrev = record.get('TA', '')
    full_name = record.get('JT', '')

    # Example 'IS' values:
    #   1469-221X (Print) 1469-221X (Linking)
    #   1573-6881 (Electronic) 0145-479X (Linking)
    issn_print = ''
    issn_electronic = ''
    for chunk in record.get('IS', '').split(') '):
        number = chunk.split(' ')[0]
        if "Print" in chunk or "Linking" in chunk:
            issn_print = number
        if "Electronic" in chunk:
            issn_electronic = number

    if issn_print:
        by_issn = DBSession.query(Journal).filter_by(
            issn_print=issn_print).all()
        if by_issn:
            hit = by_issn[0]
            return hit.journal_id, hit.med_abbr, full_name, issn_print

    if abbrev == '':
        return None, '', '', ''

    if abbrev:
        by_abbrev = DBSession.query(Journal).filter_by(med_abbr=abbrev).all()
        if by_abbrev:
            hit = by_abbrev[0]
            return hit.journal_id, hit.med_abbr, full_name, issn_print

    source_id = 824  # 'PubMed'

    # Titles longer than 200 characters are cut to 197 plus an ellipsis.
    if len(full_name) > 200:
        short_name = full_name[:197] + '...'
    else:
        short_name = full_name

    format_name = full_name.replace(' ', '_') + abbrev.replace(' ', '_')
    if len(format_name) > 100:
        short_format_name = format_name[:97] + '...'
    else:
        short_format_name = format_name

    # obj_url is built from the untruncated format_name.
    journal = Journal(issn_print=issn_print,
                      issn_electronic=issn_electronic,
                      display_name=short_name,
                      format_name=short_format_name,
                      title=short_name,
                      med_abbr=abbrev,
                      source_id=source_id,
                      obj_url='/journal/' + format_name,
                      created_by=created_by)
    DBSession.add(journal)
    DBSession.flush()
    DBSession.refresh(journal)

    return journal.journal_id, journal.med_abbr, full_name, issn_print
Exemple #5
0
def insert_authors(reference_id, authors, source_id, created_by):
    """Add one Referenceauthor row per author, numbered from 1, and flush.

    Parameters
    ----------
    reference_id: int
    authors: list of str
        author names in citation order; nothing happens when it is empty
    source_id: int
    created_by: str

    Return
    ------
    empty
        does not return anything
    """
    if len(authors) < 1:
        return

    for position, name in enumerate(authors, start=1):
        row = Referenceauthor(display_name=name,
                              obj_url='/author/' + name.replace(' ', '_'),
                              source_id=source_id,
                              reference_id=reference_id,
                              author_order=position,
                              author_type='Author',
                              created_by=created_by)
        DBSession.add(row)

    DBSession.flush()
    # Only the row of the last author is refreshed after the flush.
    DBSession.refresh(row)
Exemple #6
0
def _first_relation_text(record, tags, tag_to_type):
    """Return (text, relation type) for the first tag in ``tags`` that has a
    truthy value in ``record``, or (None, None) when none has one."""
    for tag in tags:
        value = record.get(tag)
        if value:
            # A tag may hold several entries; only the first one is used.
            if isinstance(value, list):
                value = value[0]
            return value, tag_to_type[tag]
    return None, None


def _pmid_from_relation_text(text):
    """Return the digits following 'PMID:' in ``text`` as a string, or None
    when ``text`` is None or carries no PMID."""
    import re

    if text is None:
        return None
    match = re.search(r'PMID:\s*(\d+)', text)
    return match.group(1) if match else None


def insert_relations(pmid, reference_id, record, created_by):
    """Add ReferenceRelation rows linking this reference to related papers.

    The record's "in" tags (e.g. 'CIN', 'EIN') name a paper that becomes the
    child of this reference; the "on"/"for" tags (e.g. 'CON', 'EFR') name a
    paper that becomes its parent. Only the first populated tag of each
    group is used, and a relation is added only when its text contains a
    PMID that ``get_reference_id`` resolves. Each relation is labelled with
    the type of the tag it came from.

    Parameters
    ----------
    pmid: int
        unused here; kept so the signature stays unchanged for callers
    reference_id: int
    record: dict
        parsed record; tag values may be a string or a list of strings
    created_by: str

    Return
    ------
    empty
        does not return anything
    """
    tag_to_type = {
        "CON": "Comment",
        "CIN": "Comment",
        "EIN": "Erratum",
        "EFR": "Erratum",
        "CRI": "Corrected and Republished",
        "CRF": "Corrected and Republished",
        "PRIN": "Partial retraction",
        "PROF": "Partial retraction",
        "RPI": "Republished",
        "RPF": "Republished",
        "RIN": "Retraction",
        "ROF": "Retraction",
        "UIN": "Update",
        "UOF": "Update",
        "SPIN": "Summary for patients",
        "ORI": "Original report"
    }
    in_tags = ['CIN', 'EIN', 'CRI', 'PRIN', 'RPI', 'RIN', 'UIN', 'SPIN', 'ORI']
    on_tags = ['CON', 'EFR', 'CRF', 'PROF', 'RPF', 'ROF', 'UOF']

    # Each direction keeps its own relation type so that a record carrying
    # both kinds of tag does not label the "in" relation with the "on" type.
    in_text, in_type = _first_relation_text(record, in_tags, tag_to_type)
    on_text, on_type = _first_relation_text(record, on_tags, tag_to_type)

    if in_text is None and on_text is None:
        return

    source_id = 834  # 'SGD'
    last_added = None

    child_pmid = _pmid_from_relation_text(in_text)
    if child_pmid is not None:
        print(in_text)
        print(child_pmid)
        child_reference_id = get_reference_id(int(child_pmid))
        print('is there a child?')
        print((child_pmid, child_reference_id))
        if child_reference_id is not None:
            last_added = ReferenceRelation(parent_id=reference_id,
                                           child_id=child_reference_id,
                                           source_id=source_id,
                                           correction_type=in_type,
                                           created_by=created_by)
            DBSession.add(last_added)

    parent_pmid = _pmid_from_relation_text(on_text)
    if parent_pmid is not None:
        parent_reference_id = get_reference_id(int(parent_pmid))
        print('is there a parent?')
        print((parent_pmid, parent_reference_id))
        if parent_reference_id is not None:
            last_added = ReferenceRelation(parent_id=parent_reference_id,
                                           child_id=reference_id,
                                           source_id=source_id,
                                           correction_type=on_type,
                                           created_by=created_by)
            DBSession.add(last_added)

    # Nothing was added (no PMID in the text, or the related paper is not in
    # the database): there is no row to flush or refresh.
    if last_added is None:
        return

    DBSession.flush()
    DBSession.refresh(last_added)
Exemple #7
0
def insert_referencedbentity(pmid,
                             source_id,
                             record,
                             created_by,
                             method_obtained="Curator triage"):
    """ Inserts referencedbentity object into table referencedbentity

    Builds the citation and publication metadata from the record, adds and
    flushes a Referencedbentity row, then stores the abstract documents for
    it through ``insert_abstract``.

    Parameters
    ----------
    pmid: int
    source_id: int
    record: dict
        parsed record; reads the 'DP', 'TI', 'AU', 'VI', 'IP', 'PG' and
        'PMC' keys directly
    created_by: str
    method_obtained: str, optional

    Returns
    --------
    list
        [dbentity_id, authors, doi_url, pmc_url, sgdid, the new
        Referencedbentity object]

    """

    pubstatus, date_revised = get_pubstatus_date_revised(record)
    journal_id, journal, journal_title, issn_print = get_journal_id(
        record, created_by)
    pubdate = record.get('DP', None)
    # The year is the first space-separated token of the publication date.
    # A record without 'DP' yields an empty year instead of failing on
    # None.split().
    year_text = pubdate.split(' ')[0] if pubdate else ''
    title = record.get('TI', None)
    authors = record.get('AU', [])
    volume = record.get('VI', None)
    issue = record.get('IP', None)
    pages = record.get('PG', None)
    citation = set_cite(title, authors, year_text, journal, volume, issue,
                        pages)
    doi, doi_url = get_doi(record)
    pmcid = record.get('PMC', None)
    pmc_url = pmc_root + pmcid + '/' if pmcid else None

    # Papers published ahead of print get the epub status values.
    if pubstatus == 'aheadofprint':
        publication_status = epub_status
        fulltext_status = epub_pdf_status
    else:
        publication_status = status
        fulltext_status = pdf_status

    # An unknown year is stored as None rather than as an empty string.
    year = int(year_text) if year_text else None
    if journal_id:
        journal_id = int(journal_id)

    x = Referencedbentity(display_name=citation.split(')')[0] + ')',
                          source_id=source_id,
                          subclass='REFERENCE',
                          dbentity_status='Active',
                          method_obtained=method_obtained,
                          publication_status=publication_status,
                          fulltext_status=fulltext_status,
                          citation=citation,
                          year=year,
                          pmid=int(pmid),
                          pmcid=pmcid,
                          date_published=pubdate,
                          date_revised=date_revised,
                          issue=issue,
                          page=pages,
                          volume=volume,
                          title=title,
                          doi=doi,
                          journal_id=journal_id,
                          created_by=created_by)

    DBSession.add(x)
    DBSession.flush()
    # Refresh so the attributes read below (dbentity_id, sgdid) reflect the
    # flushed row.
    DBSession.refresh(x)
    dbentity_id = x.dbentity_id
    ## insert into REFERENCEDOCUMENT
    insert_abstract(pmid, dbentity_id, record, source_id, journal,
                    journal_title, issn_print, created_by)

    return [dbentity_id, authors, doi_url, pmc_url, x.sgdid, x]
def upload_file_obj_db_s3():
    """ Upload file metadata to database and s3

    Reads the rows returned by ``file_upload_to_obj()``, sorts them by
    'file_extension', and for each row either creates the file (through
    ``upload_file_helper``) or updates the existing Filedbentity that has
    the same display name and uploads the file again. Path, PMID and
    keyword entries are then added, and the transaction is committed once
    per row.

    Any exception raised while processing is logged and swallowed, which
    also stops the processing of the remaining rows.

    Return
    ------
    empty
        does not return anything
    """
    # NOTE(review): initialised once, outside the loop, so a row without a
    # (found) README keeps the id resolved for an earlier row -- confirm
    # that this carry-over is intended.
    readme_file_id = None
    file_content_list = file_upload_to_obj()

    try:
        if file_content_list:
            sorted_content = sorted(file_content_list,
                                    key=itemgetter('file_extension'))
            for item in sorted_content:
                # Resolve the README this file points to, if it names one.
                if item['readme_name']:
                    readme = DBSession.query(Filedbentity).filter(
                        Filedbentity.display_name ==
                        item['readme_name']).one_or_none()

                    if readme is None:
                        print('unable to find README ' + item['readme_name'])
                        logging.warning('unable to find README ' +
                                        item['readme_name'])
                    else:
                        readme_file_id = readme.dbentity_id

                # see if file_meta already exists, else create
                existing_file_meta_data = DBSession.query(Filedbentity).filter(
                    Filedbentity.display_name ==
                    item['display_name']).one_or_none()
                # NOTE(review): indexing the result raises TypeError when no
                # Source row matches; this lookup sits outside the inner
                # try below, so that error reaches the outer handler and
                # ends the whole run.
                source_id = DBSession.query(Source.source_id).filter(
                    Source.display_name == item['source']).one_or_none()[0]

                d_name = item['display_name']
                f_ext = item['file_extension']
                temp_file_path = get_file_from_path_collection(f_ext, d_name)
                if not existing_file_meta_data:
                    # New file: resolve the EDAM ids and attach them, the
                    # source id and the README id to the row.
                    try:
                        data_id = DBSession.query(Edam.edam_id).filter(
                            Edam.edamid ==
                            item['data_edam_id']).one_or_none()[0]

                        format_id = DBSession.query(Edam.edam_id).filter(
                            Edam.edamid ==
                            item['format_edam_id']).one_or_none()[0]
                        topic_id = DBSession.query(Edam.edam_id).filter(
                            Edam.edamid ==
                            item['topic_edam_id']).one_or_none()[0]
                        item["data_id"] = data_id
                        item["format_id"] = format_id
                        item["topic_id"] = topic_id
                        item["source_id"] = source_id
                        item["readme_file_id"] = readme_file_id

                    except TypeError:
                        # Raised by the [0] index when an EDAM lookup finds
                        # no row. The error is only logged: the upload below
                        # still runs, without the ids set on the row.
                        logging.error('invalid EDAM id or source in row ' +
                                      ' val in ' + item['data_edam_id'] +
                                      ', ' + item['format_edam_id'] + ', ' +
                                      item['topic_edam_id'])

                    if temp_file_path:
                        with open(temp_file_path, 'rb') as remote_file:
                            upload_file_helper(CREATED_BY, remote_file, item,
                                               temp_file_path)

                    DBSession.flush()
                else:
                    # Existing file: overwrite its metadata from the row.
                    existing_file_meta_data.display_name = item['display_name']
                    existing_file_meta_data.description = item['description']
                    existing_file_meta_data.status = item['status']
                    existing_file_meta_data.is_public = item['is_public']
                    existing_file_meta_data.is_in_spell = item['is_in_spell']
                    existing_file_meta_data.is_in_browser = item[
                        'is_in_browser']
                    existing_file_meta_data.readme_file_id = readme_file_id
                    existing_file_meta_data.source_id = source_id

                    if temp_file_path:
                        with open(temp_file_path, 'rb') as remote_file:
                            # Fill in a missing file size (only when an S3
                            # URL is already set) by seeking to the end of
                            # the local file, then rewind.
                            if not existing_file_meta_data.file_size and existing_file_meta_data.s3_url:
                                remote_file.seek(0, os.SEEK_END)
                                file_size = remote_file.tell()
                                remote_file.seek(0)
                                existing_file_meta_data.file_size = file_size

                            if item['file_date']:
                                existing_file_meta_data.file_date = item[
                                    'file_date']
                                existing_file_meta_data.year = item[
                                    'file_date'].year
                            existing_file_meta_data.readme_file_id = readme_file_id
                            # NOTE(review): this leaves the file position at
                            # the end of the file right before the upload --
                            # confirm upload_file_to_s3 rewinds or reads
                            # from file_path instead.
                            remote_file.seek(0, os.SEEK_END)

                            #transaction.commit()
                            # The entity is queried again by display name
                            # before the upload call.
                            existing_file_meta_data = DBSession.query(
                                Filedbentity).filter(
                                    Filedbentity.display_name ==
                                    item['display_name']).one_or_none()
                            # only upload s3 file if not defined
                            existing_file_meta_data.upload_file_to_s3(
                                file=remote_file,
                                filename=item['display_name'],
                                file_path=temp_file_path,
                                flag=False)

                # These run for both new and existing files.
                add_path_entries(item['display_name'], item['new_path'],
                                 SGD_SOURCE_ID, CREATED_BY)
                add_pmids(item['display_name'], item['pmids'], SGD_SOURCE_ID,
                          CREATED_BY)
                add_keywords(item['display_name'], item['keywords'],
                             SGD_SOURCE_ID, CREATED_BY)
                if item['display_name'].endswith('.README'):
                    update_readme_files_with_urls(item['display_name'])

                # One commit per row.
                transaction.commit()
                DBSession.flush()
                logging.info('finished processing file: ' +
                             item['display_name'])

    except Exception as e:
        # Logged with the traceback and not re-raised; rows after the
        # failing one are not processed.
        logging.error("Exception occurred", exc_info=True)