def insert_locussummary(nex_session, fw, locus_id, summary, source_id, created_by, date_created, summary_type=None):
    """Insert one Locussummary row and record the insert in the log file.

    Parameters
    ----------
    nex_session : SQLAlchemy session used for the add/commit.
    fw : open writable file object that receives a one-line log entry.
    locus_id : locus dbentity_id the summary belongs to.
    summary : str, summary text (stored as both ``text`` and ``html``).
    source_id : source_id for the new row.
    created_by : curator username.
    date_created : creation timestamp for the row.
    summary_type : Locussummary.summary_type value.
        BUG FIX: the original body referenced a name ``summary_type`` that
        was neither a parameter nor defined locally (NameError unless a
        module-level global happened to exist); it is now an explicit,
        backward-compatible keyword parameter.
    """
    row = Locussummary(locus_id=locus_id,
                       summary_type=summary_type,
                       text=summary,
                       html=summary,
                       source_id=source_id,
                       date_created=date_created,
                       created_by=created_by)
    nex_session.add(row)
    nex_session.commit()

    fw.write("insert summary for locus_id=" + str(locus_id) + ": TEXT=" + summary + "\n")
# Esempio n. 2
# 0
def insert_summary(nex_session, load_summary_holder, source_id, x, created_by):
    """Insert a Locussummary row built from the field dict *x* and commit.

    Parameters
    ----------
    nex_session : SQLAlchemy session used for the add/commit.
    load_summary_holder : dict with a running ``'summary_added'`` counter,
        incremented on success.
    source_id : source_id for the new row.
    x : dict with keys 'locus_id', 'summary_type', 'summary_order',
        'text', 'html'.
    created_by : curator username.

    Returns
    -------
    The ``summary_id`` assigned to the freshly inserted row.
    """
    # Use a distinct name for the ORM object instead of rebinding the
    # parameter `x` (the original shadowed the input dict).
    new_row = Locussummary(locus_id=x['locus_id'],
                           summary_type=x['summary_type'],
                           summary_order=x['summary_order'],
                           text=x['text'],
                           html=x['html'],
                           source_id=source_id,
                           created_by=created_by)
    nex_session.add(new_row)
    nex_session.commit()

    load_summary_holder['summary_added'] += 1

    # fw.write("insert summary:" + str(x['locus_id']) + ", " + x['summary_type'] + ", " + str(x['summary_order']) + ", " + x['text'] + ", " + x['html'])

    return new_row.summary_id
def upload_db(obj, row_num):
    """Insert a Disease Locussummary for the dbentity named by obj['sgdid'],
    unless one already exists.

    Parameters
    ----------
    obj : dict with keys 'sgdid' and 'summary'.
    row_num : int, input-file row number (used only in log messages).

    On any error the transaction is rolled back, the session is closed and
    the whole process exits (sys.exit), matching the original behavior.
    """
    try:
        temp_engine = create_engine(NEX2_URI)
        session_factory = sessionmaker(bind=temp_engine,
                                       extension=ZopeTransactionExtension(),
                                       expire_on_commit=False)
        db_session = scoped_session(session_factory)

        # BUG FIX: one_or_none() returns None when there is no match; the
        # original indexed [0] unconditionally and raised TypeError.
        dbentity_row = db_session.query(Dbentity.dbentity_id).filter(
            Dbentity.sgdid == obj['sgdid']).one_or_none()
        source_row = db_session.query(Source.source_id).filter(
            Source.display_name == 'SGD').one_or_none()
        dbentity_id = dbentity_row[0] if dbentity_row else None
        source_id = source_row[0] if source_row else None

        if dbentity_id:
            locus_summary_id = db_session.query(Locussummary.locus_id).filter(
                Locussummary.locus_id == dbentity_id,
                Locussummary.summary_type == 'Disease').one_or_none()
            if not locus_summary_id:
                new_summary_row = Locussummary(locus_id=dbentity_id,
                                               source_id=source_id,
                                               text=obj['summary'],
                                               html=obj['summary'],
                                               summary_type=SUMMARY_TYPE,
                                               summary_order='1',
                                               created_by=CREATED_BY)
                # BUG FIX: add() was outside this `if`, so when a Disease
                # summary already existed, new_summary_row was unbound and
                # the bare except silently skipped the row.
                db_session.add(new_summary_row)
            #db_session.query(Locusdbentity).filter_by(dbentity_id = dbentity_id).update({'has_disease': 'true'})
            transaction.commit()
            db_session.flush()
            logging.info('finished ' + obj['sgdid'] + ', line ' + str(row_num))
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed SystemExit /
        # KeyboardInterrupt); the handling itself is unchanged.
        logging.error('error with ' + obj['sgdid'] + ' in line ' +
                      str(row_num))
        traceback.print_exc()
        db_session.rollback()
        db_session.close()
        sys.exit()
# Esempio n. 4
# 0
def validate_file_content_and_process(file_content, nex_session, username):
    ''' Check file content, process and save to db

    Parameters
    ----------
    file_content: csv-reader object
                  csv-reader reads a tvs file and returns an object
    nex_session: database_session object
    username: str
              authorized user to make CRUD operations

    Returns
    -------
    dictionary
        number of entries
        number of updates
        database entries(dictionary)

    Note:
    Accepted summary types: Phenotype, Regulation, Disease, Interaction
                            Sequence, Protein
    Checks correct number of columns in the header and valid IDs
    '''

    header_literal = [
        '# Feature',
        'Summary Type (phenotype, regulation, disease, interaction, sequence, protein )',
        'Summary', 'PMIDs'
    ]
    accepted_summary_types = [
        'Phenotype', 'Regulation', 'Disease', 'Interaction', 'Sequence',
        'Protein'
    ]
    file_gene_ids = []
    file_pmids = []
    copied = []
    already_used_genes = []
    clear_target_urls = []
    # use regex to get keys from dictionary (defined once; the original
    # duplicated these definitions)
    key_feature = re.compile(r".*feature$", re.IGNORECASE)
    key_summary_type = re.compile(r"^summary type.*", re.IGNORECASE)

    try:
        for item in file_content:
            if (len(item) != len(header_literal)):
                raise ValueError(
                    'Row or header has incorrect number of columns.')
            #TODO: abstract the loop below in the next release
            gene_id = ''
            summary_type = ''
            for k, v in item.items():
                if key_feature.match(k):
                    gene_id = item.get(k)
                if key_summary_type.match(k):
                    summary_type = item.get(k)

            pmid_temp = item.get('PMIDs', None)
            if pmid_temp:
                pmids = str(pmid_temp).replace(' ', '').replace('0.0', '')
            else:
                pmids = ''
            summary_text = item.get('Summary', '')

            if gene_id:
                file_gene_ids.append(gene_id.strip())
            if summary_type:
                gene_id_with_summary = gene_id + summary_type
                if gene_id_with_summary in already_used_genes:
                    raise ValueError(
                        'The same gene summary cannot be updated twice in the same\
                        file: ' + str(gene_id))
                already_used_genes.append(gene_id_with_summary)
                # BUG FIX: the original tested membership in the *joined*
                # string of accepted types, so any substring (e.g. "ease")
                # passed validation.  Compare against the type list instead.
                if summary_type.lower() not in [
                        t.lower() for t in accepted_summary_types
                ]:
                    raise ValueError(
                        'Unaccepted summary type. Must be one of ' +
                        ', '.join(accepted_summary_types))
            if len(pmids) > 0:
                pmids = re.split(r'\||,', pmids)
                for pmid in pmids:
                    file_pmids.append(str(pmid))

            copied.append(item)
    except IndexError:
        raise ValueError(
            'The file is not a valid TSV with the correct number of columns. Check the file and try again.'
        )

    nex_session.execute('SET LOCAL ROLE ' + username)

    # check that gene names are valid
    valid_genes = nex_session.query(Locusdbentity.format_name).filter(
        Locusdbentity.format_name.in_(file_gene_ids)).all()
    valid_genes = [str(d[0]) for d in valid_genes]
    invalid_genes = [d for d in file_gene_ids if d not in valid_genes]
    if len(invalid_genes):
        raise ValueError('Invalid gene identifier: ' +
                         ', '.join(invalid_genes))
    # must be valid PMIDs in last column or nothing
    matching_refs = nex_session.query(Referencedbentity).filter(
        Referencedbentity.pmid.in_(file_pmids)).all()
    temp_matching_refs = [str(d.pmid) for d in matching_refs]
    invalid_refs = [d for d in file_pmids if d not in temp_matching_refs]
    if len(invalid_refs):
        # raise ValueError('Invalid PMID: ' + ', '.join(invalid_refs) + '. Must be a pipe-separated list of PMIDs from SGD.')
        print(len(invalid_refs))
    # update
    receipt_entries = []
    locus_names_ids = nex_session.query(Locusdbentity.display_name,
                                        Locusdbentity.sgdid).all()
    inserts = 0
    updates = 0

    for item in copied:
        if item:
            # Reset per-row state so a row missing a column cannot silently
            # reuse values left over from the previous row (the original
            # left these unbound / stale across iterations).
            file_id = ''
            file_summary_type = ''
            gene = None
            summary = None
            for k, v in item.items():
                if key_feature.match(k):
                    file_id = item.get(k)
                if key_summary_type.match(k):
                    file_summary_type = item.get(k)
            file_summary_val = item.get('Summary', '')

            file_summary_html = link_gene_names(file_summary_val,
                                                locus_names_ids)
            if file_id:
                gene = nex_session.query(Locusdbentity).filter_by(
                    format_name=file_id).one_or_none()
            if gene is None:
                # No resolvable feature for this row: nothing to attach the
                # summary to (the original dereferenced a stale/unbound
                # `gene` here).
                continue
            if file_summary_type:
                summaries = nex_session.query(
                    Locussummary.summary_type, Locussummary.summary_id,
                    Locussummary.html, Locussummary.date_created).filter_by(
                        locus_id=gene.dbentity_id,
                        summary_type=file_summary_type).all()
                # update
                if len(summaries):
                    # Existing summary: update text/html in place.  The row
                    # already carries the columns the receipt needs, so no
                    # re-query is required on this path (the original
                    # re-queried with `mod_summary_type`, a name never
                    # assigned on this branch).
                    summary = summaries[0]
                    nex_session.query(Locussummary).filter_by(
                        summary_id=summary.summary_id).update({
                            'text': file_summary_val,
                            'html': file_summary_html
                        })
                    updates += 1
                else:
                    mod_summary_type = file_summary_type.lower().capitalize()
                    new_summary = Locussummary(locus_id=gene.dbentity_id,
                                               summary_type=mod_summary_type,
                                               text=file_summary_val,
                                               html=file_summary_html,
                                               created_by=username,
                                               source_id=SGD_SOURCE_ID)
                    nex_session.add(new_summary)
                    inserts += 1
                    # Re-read the freshly inserted row.
                    summary = nex_session.query(
                        Locussummary.summary_type, Locussummary.summary_id,
                        Locussummary.html,
                        Locussummary.date_created).filter_by(
                            locus_id=gene.dbentity_id,
                            summary_type=mod_summary_type).all()[0]
                # add LocussummaryReference(s)
            if item:
                if item.get('PMIDs'):
                    pmids = item.get('PMIDs').replace(' ', '')
                else:
                    pmids = ''
                if len(pmids) > 0:
                    pmids = re.split(r'\||,', pmids)
                    for idx, pmid in enumerate(pmids):
                        matching_ref = [
                            x for x in matching_refs if x.pmid == int(pmid)
                        ][0]
                        summary_id = summary.summary_id
                        reference_id = matching_ref.dbentity_id
                        # BUG FIX: the original read `_idx`, an undefined
                        # name (the loop variable is `idx`) -> NameError on
                        # any row carrying PMIDs.
                        order = idx + 1
                        # look for matching LocussummaryReference
                        matching_locussummary_refs = nex_session.query(
                            LocussummaryReference).filter_by(
                                summary_id=summary_id,
                                reference_id=reference_id).all()
                        if len(matching_locussummary_refs):
                            nex_session.query(LocussummaryReference).filter_by(
                                summary_id=summary_id,
                                reference_id=reference_id).update(
                                    {'reference_order': order})
                        else:
                            new_locussummaryref = LocussummaryReference(
                                summary_id=summary_id,
                                reference_id=reference_id,
                                reference_order=order,
                                source_id=SGD_SOURCE_ID,
                                created_by=username)
                            nex_session.add(new_locussummaryref)

            # add receipt
            summary_type_url_segment = file_summary_type.lower()
            if summary_type_url_segment not in [
                    'phenotype', 'regulation', 'interaction', 'sequence',
                    'disease', 'protein'
            ]:
                summary_type_url_segment = ''
            preview_url = '/locus/' + gene.sgdid + '/' + summary_type_url_segment
            clear_target_urls.append(preview_url)
            if summary:
                summary_obj = {
                    'display_name': gene.format_name,
                    'obj_url': preview_url,
                    'activity_category': summary.summary_type,
                    'json': json.dumps({
                        'summary_data': item,
                        'modified_date': str(datetime.now())
                    }),
                    'created_by': username,
                    'dbentity_id': gene.dbentity_id
                }
                message = 'added'
                new_curate_activity = CuratorActivity(
                    display_name=summary_obj['display_name'],
                    obj_url=summary_obj['obj_url'],
                    activity_category=summary_obj['activity_category'],
                    dbentity_id=summary_obj['dbentity_id'],
                    message=message,
                    json=summary_obj['json'],
                    created_by=summary_obj['created_by'])
                nex_session.add(new_curate_activity)
            receipt_entries.append({
                'category': 'locus',
                'href': preview_url,
                'name': gene.display_name,
                'type': file_summary_type,
                'value': file_summary_val
            })
    transaction.commit()
    nex_session.close()
    if len(clear_target_urls) > 0:
        ban_from_cache(clear_target_urls)
    return {'inserts': inserts, 'updates': updates, 'entries': receipt_entries}
def validate_file_content_and_process(file_content, nex_session, username):
    """Validate a positional-row TSV of gene summaries and apply it to the db.

    Parameters
    ----------
    file_content : iterable of rows (lists); row 0 must equal the expected
        header, later rows are [feature, summary type, summary, PMIDs?].
    nex_session : database session object.
    username : str, authorized user; also set as the local db role.

    Returns
    -------
    dict with 'inserts', 'updates' and 'entries' (receipt dictionaries).

    Accepted summary types here: Phenotype, Regulation.
    """
    header_literal = [
        '# Feature', 'Summary Type (phenotype, regulation)', 'Summary', 'PMIDs'
    ]
    accepted_summary_types = ['Phenotype', 'Regulation']
    file_gene_ids = []
    file_pmids = []
    copied = []
    already_used_genes = []
    try:
        for i, val in enumerate(file_content):
            # match header
            # BUG FIX: was `i is 0` -- identity comparison on an int, which
            # only works via CPython small-int caching and warns on 3.8+.
            if i == 0:
                is_header_match = header_literal == val
                if not is_header_match:
                    raise ValueError(
                        'File header does not match expected format. Please make your file match the template file linked below.'
                    )
            else:
                gene_id = val[0]
                file_gene_ids.append(gene_id.strip())
                gene_id_with_summary = gene_id + val[1]
                if gene_id_with_summary in already_used_genes:
                    raise ValueError(
                        'The same gene summary cannot be updated twice in the same file: '
                        + str(gene_id))
                already_used_genes.append(gene_id_with_summary)
                # match summary types
                if val[1] not in accepted_summary_types:
                    raise ValueError(
                        'Unaccepted summary type. Must be one of ' +
                        ', '.join(accepted_summary_types))
                # collect PMIDs
                if len(val) == 4:
                    pmids = val[3].replace(' ', '')
                    if len(pmids):
                        pmids = re.split(r'\||,', pmids)
                        for d in pmids:
                            file_pmids.append(str(d))
            # match length of each row
            if (len(val) != len(header_literal)
                    and len(val) != len(header_literal) - 1):
                raise ValueError('Row has incorrect number of columns.')
            copied.append(val)
    except IndexError:
        raise ValueError(
            'The file is not a valid TSV with the correct number of columns. Check the file and try again.'
        )
    nex_session.execute('SET LOCAL ROLE ' + username)
    # check that gene names are valid
    valid_genes = nex_session.query(Locusdbentity.format_name).filter(
        Locusdbentity.format_name.in_(file_gene_ids)).all()
    valid_genes = [str(d[0]) for d in valid_genes]
    invalid_genes = [d for d in file_gene_ids if d not in valid_genes]
    if len(invalid_genes):
        raise ValueError('Invalid gene identifier: ' +
                         ', '.join(invalid_genes))
    # must be valid PMIDs in last column or nothing
    matching_refs = nex_session.query(Referencedbentity).filter(
        Referencedbentity.pmid.in_(file_pmids)).all()
    temp_matching_refs = [str(d.pmid) for d in matching_refs]
    invalid_refs = [d for d in file_pmids if d not in temp_matching_refs]
    if len(invalid_refs):
        raise ValueError('Invalid PMID: ' + ', '.join(invalid_refs) +
                         '. Must be a pipe-separated list of PMIDs from SGD.')
    # update
    receipt_entries = []
    locus_names_ids = nex_session.query(Locusdbentity.display_name,
                                        Locusdbentity.sgdid).all()
    inserts = 0
    updates = 0
    for i, val in enumerate(copied):
        if i != 0:
            file_id = val[0]
            file_summary_type = val[1]
            file_summary_val = val[2]
            # (typo fix: was `file_summy_html`; local name only)
            file_summary_html = link_gene_names(file_summary_val,
                                                locus_names_ids)
            gene = nex_session.query(Locusdbentity).filter_by(
                format_name=file_id).one_or_none()
            summaries = nex_session.query(
                Locussummary.summary_type, Locussummary.summary_id,
                Locussummary.html, Locussummary.date_created).filter_by(
                    locus_id=gene.dbentity_id,
                    summary_type=file_summary_type).all()
            # update
            summary = None
            if len(summaries):
                summary = summaries[0]
                nex_session.query(Locussummary).filter_by(
                    summary_id=summary.summary_id).update({
                        'text': file_summary_val,
                        'html': file_summary_html
                    })
                updates += 1
            else:
                new_summary = Locussummary(locus_id=gene.dbentity_id,
                                           summary_type=file_summary_type,
                                           text=file_summary_val,
                                           html=file_summary_html,
                                           created_by=username,
                                           source_id=SGD_SOURCE_ID)
                nex_session.add(new_summary)
                inserts += 1
            # re-read so `summary` reflects the row just written
            summary = nex_session.query(
                Locussummary.summary_type, Locussummary.summary_id,
                Locussummary.html, Locussummary.date_created).filter_by(
                    locus_id=gene.dbentity_id,
                    summary_type=file_summary_type).all()[0]
            # add LocussummaryReference(s)
            if len(val) == 4:
                pmids = val[3].replace(' ', '')
                if len(pmids):
                    pmids = re.split(r'\||,', pmids)
                    for _i, p in enumerate(pmids):
                        matching_ref = [
                            x for x in matching_refs if x.pmid == int(p)
                        ][0]
                        summary_id = summary.summary_id
                        reference_id = matching_ref.dbentity_id
                        order = _i + 1
                        # look for matching LocussummaryReference
                        matching_locussummary_refs = nex_session.query(
                            LocussummaryReference).filter_by(
                                summary_id=summary_id,
                                reference_id=reference_id).all()
                        if len(matching_locussummary_refs):
                            nex_session.query(LocussummaryReference).filter_by(
                                summary_id=summary_id,
                                reference_id=reference_id).update(
                                    {'reference_order': order})
                        else:
                            new_locussummaryref = LocussummaryReference(
                                summary_id=summary_id,
                                reference_id=reference_id,
                                reference_order=order,
                                source_id=SGD_SOURCE_ID,
                                created_by=username)
                            nex_session.add(new_locussummaryref)

            # add receipt
            summary_type_url_segment = file_summary_type.lower()
            if summary_type_url_segment not in ['phenotype', 'regulation']:
                summary_type_url_segment = ''
            preview_url = '/locus/' + gene.sgdid + '/' + summary_type_url_segment
            receipt_entries.append({
                'category': 'locus',
                'href': preview_url,
                'name': gene.display_name,
                'type': file_summary_type,
                'value': file_summary_val
            })
    transaction.commit()
    nex_session.close()
    return {'inserts': inserts, 'updates': updates, 'entries': receipt_entries}