예제 #1
0
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(LOGFILE)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    #download(args)
    start_time = time.time()
    load(args, dba, logger, logfile)
    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'TIGA',
예제 #2
0
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(LOGFILE)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    load_ids_classifications(args, dba, logger, logfile)
    dto = parse_dto_owl(args, DTO_OWL_FILE)
    load_dto(args, dba, logfile, dto)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
예제 #3
0
        logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
        fh = logging.FileHandler(logfile)
        fmtr = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S')
        fh.setFormatter(fmtr)
        logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    print "\nWorking on JensenLab DISEASES..."
    download_DISEASES(args)
    start_time = time.time()
    load_DISEASES(args, dba, logger, logfile)
    elapsed = time.time() - start_time
    print "Done with DISEASES. Elapsed time: {}".format(slmf.secs2str(elapsed))
예제 #4
0
def load(args):
    """Load HGNC xrefs, symbols and NCBI Gene IDs into TCRD.

    Configures file logging, connects to the database, registers the HGNC
    dataset and its provenance rows, then processes ``HGNC_TSV_FILE`` row by
    row.  Targets are looked up by approved symbol, falling back to the
    UniProt accession and then the NCBI Gene ID.  For every matched target
    an HGNC xref (and an MGI xref when the column is populated) is inserted,
    and a missing ``sym`` / ``geneid`` on the protein is filled in.  Values
    that disagree with what is already stored are only logged and counted.

    Args:
        args: option mapping keyed by flag name; reads ``--loglevel``,
            ``--logfile``, ``--debug``, ``--dbhost``, ``--dbname`` and
            ``--quiet``.

    Exits the process with status 1 when the dataset row or any provenance
    row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    conn_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(conn_msg)
    if not args['--quiet']:
        print("\n" + conn_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'HGNC',
        'source': 'Custom download file from https://www.genenames.org/download/custom/',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.genenames.org/',
        'comments': 'File downloaded with the following column data: HGNC ID Approved symbol Approved name   Status  UniProt ID NCBI Gene ID    Mouse genome database ID'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile {} for details.".format(
            logfile))
        sys.exit(1)
    # Provenance
    hgnc_fallback_note = "This is only updated with HGNC data if data from UniProt is absent."
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'protein',
        'column_name': 'sym',
        'comment': hgnc_fallback_note
    }, {
        'dataset_id': dataset_id,
        'table_name': 'protein',
        'column_name': 'geneid',
        'comment': hgnc_fallback_note
    }, {
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print("WARNING: Error inserting provenance. See logfile {} for details.".format(
                logfile))
            sys.exit(1)

    line_ct = slmf.wcl(HGNC_TSV_FILE)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, HGNC_TSV_FILE))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    annotated_pids = set()  # protein ids that received at least one xref attempt
    hgnc_ct = 0
    mgi_ct = 0
    sym_ct = 0
    symdiscr_ct = 0
    geneid_ct = 0
    geneiddiscr_ct = 0
    nf_ct = 0
    db_err_ct = 0
    with open(HGNC_TSV_FILE, 'rU') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        next(tsvreader)  # skip header line
        ct += 1
        for row in tsvreader:
            # 0: HGNC ID
            # 1: Approved symbol
            # 2: Approved name
            # 3: Status
            # 4: NCBI Gene ID
            # 5: UniProt ID
            # 6: Mouse genome database ID
            ct += 1
            pbar.update(ct)
            sym = row[1]
            geneid = int(row[4]) if row[4] != '' else None
            up = row[5] if row[5] != '' else None
            # Lookup order: symbol, then UniProt accession, then Gene ID.
            targets = dba.find_targets({'sym': sym})
            if not targets and up:
                targets = dba.find_targets({'uniprot': up})
            if not targets and geneid:
                targets = dba.find_targets({'geneid': geneid})
            if not targets:
                nf_ct += 1
                logger.warning("No target found for {}|{}|{}".format(
                    sym, up, geneid))
                continue
            for t in targets:
                p = t['components']['protein'][0]
                pid = p['id']
                annotated_pids.add(pid)
                # HGNC xref
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'HGNC',
                    'dataset_id': dataset_id,
                    'value': row[0]
                })
                if rv:
                    hgnc_ct += 1
                else:
                    db_err_ct += 1
                # MGI xref
                if row[6] != '':
                    rv = dba.ins_xref({
                        'protein_id': pid,
                        'xtype': 'MGI ID',
                        'dataset_id': dataset_id,
                        'value': row[6]
                    })
                    if rv:
                        mgi_ct += 1
                    else:
                        db_err_ct += 1
                # Add missing syms
                if p['sym'] is None:
                    rv = dba.upd_protein(pid, 'sym', sym)
                    if rv:
                        logger.info(
                            "Inserted new sym {} for protein {}, {}".format(
                                sym, pid, p['uniprot']))
                        sym_ct += 1
                    else:
                        db_err_ct += 1
                elif p['sym'] != sym:
                    # Existing symbol wins; the discrepancy is only recorded.
                    logger.warning("Symbol discrepancy: UniProt=%s, HGNC=%s" %
                                   (p['sym'], sym))
                    symdiscr_ct += 1
                if geneid:
                    # Add missing geneids
                    if p['geneid'] is None:
                        rv = dba.upd_protein(pid, 'geneid', geneid)
                        if rv:
                            logger.info(
                                "Inserted new geneid {} for protein {}, {}".
                                format(geneid, pid, p['uniprot']))
                            geneid_ct += 1
                        else:
                            db_err_ct += 1
                    elif p['geneid'] != geneid:
                        # Existing Gene ID wins; the discrepancy is only recorded.
                        logger.warning(
                            "GeneID discrepancy: UniProt={}, HGNC={}".
                            format(p['geneid'], geneid))
                        geneiddiscr_ct += 1
    pbar.finish()
    print("Processed {} lines - {} targets annotated.".format(
        ct, len(annotated_pids)))
    print("No target found for {} lines.".format(nf_ct))
    print("  Inserted {} HGNC ID xrefs".format(hgnc_ct))
    print("  Inserted {} MGI ID xrefs".format(mgi_ct))
    if sym_ct > 0:
        print("  Added {} new HGNC symbols".format(sym_ct))
    if symdiscr_ct > 0:
        print("WARNING: {} discrepant HGNC symbols. See logfile {} for details".format(
            symdiscr_ct, logfile))
    if geneid_ct > 0:
        print("  Added {} new NCBI Gene IDs".format(geneid_ct))
    if geneiddiscr_ct > 0:
        print("WARNING: {} discrepant NCBI Gene IDs. See logfile {} for details".format(
            geneiddiscr_ct, logfile))
    if db_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            db_err_ct, logfile))
예제 #5
0
  if args['--logfile']:
    logfile =  args['--logfile']
  else:
    logfile = LOGFILE
  loglevel = int(args['--loglevel'])
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(LOGFILE)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    
  download(args)
  start_time = time.time()
  load(args, dba, logger, logfile)
  elapsed = time.time() - start_time

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'HomoloGene', 'source': 'File %s'%BASE_URL+FILENAME, 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ncbi.nlm.nih.gov/homologene', 'comments': 'Only Human, Mouse and Rat members of HomoloGene groups are loaded. These relate protein to nhprotein.'} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'homology'})
예제 #6
0
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(LOGFILE)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    download(args)
    start_time = time.time()
    load(args, dba, logger, logfile)
    elapsed = time.time() - start_time

    # Dataset
    dataset_id = dba.ins_dataset({
예제 #7
0
def load(args):
    """Load NCBI GI xrefs from the UniProt ID-mapping file into TCRD.

    Configures file logging, connects to the database, registers the
    "NCBI GI Numbers" dataset and its provenance row, then reads the
    uncompressed mapping file line by line.  For each line that carries GI
    numbers and whose UniProt accession matches a target, one ``NCBI GI``
    xref is inserted per GI for the first matched target's first protein.

    Args:
        args: option mapping keyed by flag name; reads ``--loglevel``,
            ``--logfile``, ``--debug``, ``--dbhost``, ``--dbname`` and
            ``--quiet``.

    Exits the process with status 1 when the dataset row or the provenance
    row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    conn_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(conn_msg)
    if not args['--quiet']:
        print("\n" + conn_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'NCBI GI Numbers',
        'source': 'UniProt ID Mapping file %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.uniprot.org/'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    })
    if not rv:
        print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
        sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    infile = (DOWNLOAD_DIR + FILENAME).replace('.gz', '')
    line_ct = slmf.wcl(infile)
    # ID Mapping fields
    # 1. UniProtKB-AC
    # 2. UniProtKB-ID
    # 3. GeneID (EntrezGene)
    # 4. RefSeq
    # 5. GI
    # 6. PDB
    # 7. GO
    # 8. UniRef100
    # 9. UniRef90
    # 10. UniRef50
    # 11. UniParc
    # 12. PIR
    # 13. NCBI-taxon
    # 14. MIM
    # 15. UniGene
    # 16. PubMed
    # 17. EMBL
    # 18. EMBL-CDS
    # 19. Ensembl
    # 20. Ensembl_TRS
    # 21. Ensembl_PRO
    # 22. Additional PubMed
    if not args['--quiet']:
        print("\nProcessing {} rows in file {}".format(line_ct, infile))
    ct = 0
    annotated_tids = set()
    xref_ct = 0
    nogi_ct = 0    # rows whose GI column is empty
    notfnd_ct = 0  # rows whose UniProt accession matches no target
    dba_err_ct = 0
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for line in tsv:
            data = line.split('\t')
            ct += 1
            # Update up front so skipped rows still advance the bar.
            pbar.update(ct)
            up = data[0]
            if not data[4]:  # no gi
                nogi_ct += 1
                continue
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                notfnd_ct += 1
                continue
            target = targets[0]
            annotated_tids.add(target['id'])
            pid = target['components']['protein'][0]['id']
            for gi in data[4].split('; '):
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'NCBI GI',
                    'dataset_id': dataset_id,
                    'value': gi
                })
                if rv:
                    xref_ct += 1
                else:
                    dba_err_ct += 1
    pbar.finish()
    print("\n{} rows processed".format(ct))
    print("  Inserted {} new GI xref rows for {} targets".format(
        xref_ct, len(annotated_tids)))
    print("  Skipped {} rows with no GI".format(nogi_ct))
    print("  Skipped {} rows with no matching target".format(notfnd_ct))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
예제 #8
0
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(LOGFILE)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    download_mappings(args)
    start_time = time.time()
    load(args, dba, logfile, logger)
    elapsed = time.time() - start_time

    # Dataset
    dataset_id = dba.ins_dataset({
예제 #9
0
def load(args):
    """Flag IDG-eligible targets and set their families from the IDG list.

    Configures file logging, connects to the database, registers the
    "IDG Eligible Targets List" dataset and its provenance rows, then reads
    ``IDG_LIST_FILE`` as CSV.  Every target matching the symbol in column 0
    gets ``idg`` set to 1 and ``fam`` set from column 2; ``famext`` is set
    from column 3 when that column is non-empty.

    Args:
        args: option mapping keyed by flag name; reads ``--loglevel``,
            ``--logfile``, ``--debug``, ``--dbhost``, ``--dbname`` and
            ``--quiet``.

    Raises:
        AssertionError: if the dataset row or a provenance row cannot be
            inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print("\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'IDG Eligible Targets List',
        'source': 'IDG generated data in file %s.' % IDG_LIST_FILE,
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'IDG Flags and Families set from list of targets on GitHub.',
        'url': 'https://github.com/druggablegenome/IDGTargets'
    })
    # Raised explicitly (rather than via ``assert``) so the check survives
    # ``python -O`` while callers still see the same exception type.
    if not dataset_id:
        raise AssertionError(
            "Error inserting dataset See logfile {} for details.".format(
                logfile))
    # Provenance
    # The fam/famext entries used to list 'where_clause' twice; only the last
    # value of a duplicated dict key is kept, so 'idg == 1' is what was
    # actually being stored and is what is kept here.
    # NOTE(review): the 'idg' entry's where_clause looks inconsistent with the
    # other two -- confirm the intended value.
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'idg',
        'where_clause': 'column_name == "idg"'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'fam',
        'where_clause': 'idg == 1'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'famext',
        'where_clause': 'idg == 1'
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            raise AssertionError(
                "Error inserting provenance. See logfile {} for details.".format(
                    logfile))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    line_ct = slmf.wcl(IDG_LIST_FILE)
    print('\nProcessing {} lines in list file {}'.format(
        line_ct, IDG_LIST_FILE))
    logger.info("Processing {} lines in list file {}".format(
        line_ct, IDG_LIST_FILE))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    notfnd = []
    multfnd = []
    ct = 0
    idg_ct = 0
    fam_ct = 0
    famext_ct = 0
    dba_err_ct = 0
    with open(IDG_LIST_FILE, 'rU') as ifh:
        csvreader = csv.reader(ifh)
        # The list file is read from its first line; no header row is skipped.
        for row in csvreader:
            ct += 1
            # Update up front so rows without a matching target still
            # advance the bar.
            pbar.update(ct)
            sym = row[0]
            fam = row[2]
            targets = dba.find_targets({'sym': sym},
                                       idg=False,
                                       include_annotations=False)
            if not targets:
                notfnd.append(sym)
                continue
            if len(targets) > 1:
                # Recorded for the report, but every match is still updated.
                multfnd.append(sym)
            for t in targets:
                rv = dba.upd_target(t['id'], 'idg', 1)
                if rv:
                    idg_ct += 1
                else:
                    dba_err_ct += 1
                rv = dba.upd_target(t['id'], 'fam', fam)
                if rv:
                    fam_ct += 1
                else:
                    dba_err_ct += 1
                if row[3]:
                    famext = row[3]
                    rv = dba.upd_target(t['id'], 'famext', famext)
                    if rv:
                        famext_ct += 1
                    else:
                        dba_err_ct += 1
    pbar.finish()
    print("{} lines processed".format(ct))
    print("{} targets updated with IDG flags".format(idg_ct))
    print("{} targets updated with fams".format(fam_ct))
    print("  {} targets updated with famexts".format(famext_ct))
    if notfnd:
        print("No target found for {} symbols: {}".format(
            len(notfnd), ", ".join(notfnd)))
    if multfnd:
        print("Multiple targets found for {} symbols: {}".format(
            len(multfnd), ", ".join(multfnd)))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
예제 #10
0
def load(args):
    """Load NCBI Gene annotations for TCRD targets via the EUtils API.

    Configures file logging, connects to the database, registers the
    "NCBI Gene" dataset and its provenance rows, then fetches and loads the
    gene record of every target that has a Gene ID.  Progress state
    (``loaded``, ``retries``, ``counts``) is kept in a shelf at
    ``SHELF_FILE`` and is reset at the start of each run.  Targets whose
    fetch or XML parse fails are queued and retried for up to five passes.

    Args:
        args: option mapping keyed by flag name; reads ``--loglevel``,
            ``--logfile``, ``--debug``, ``--dbhost``, ``--dbname``,
            ``--quiet`` and ``--pastid``.

    Exits the process with status 1 when the dataset row or any provenance
    row cannot be inserted.
    """
    max_retry_loops = 5

    def start_pbar(maxval):
        # Build a fresh widget list for every bar, as each pass did before.
        widgets = [
            'Progress: ',
            Percentage(), ' ',
            Bar(marker='#', left='[', right=']'), ' ',
            ETA()
        ]
        return ProgressBar(widgets=widgets, maxval=maxval).start()

    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    # DBAdaptor uses same logger as main()
    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print("\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'NCBI Gene',
        'source': 'EUtils web API at %s' % EFETCH_GENE_URL,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.ncbi.nlm.nih.gov/gene'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'NCBI Gene Summary'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'NCBI Gene PubMed Count'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'generif'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    }, {
        'dataset_id': dataset_id,
        'table_name': 'alias',
        'where_clause': "dataset_id = %d" % dataset_id
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
            sys.exit(1)

    s = shelve.open(SHELF_FILE, writeback=True)
    try:
        s['loaded'] = []
        s['retries'] = {}
        s['counts'] = defaultdict(int)

        ct = 0
        skip_ct = 0
        if args['--pastid']:
            past_id = args['--pastid']
            tct = dba.get_target_count(past_id=past_id)
        else:
            past_id = None
            tct = dba.get_target_count()
        pbar = start_pbar(tct)
        if not args['--quiet']:
            print("\nLoading NCBI Gene annotations for %d TCRD targets" % tct)
        logger.info("Loading NCBI Gene annotations for %d TCRD targets\n" % tct)
        for t in dba.get_targets(past_id=past_id):
            ct += 1
            # Update up front so skipped/failed targets still advance the bar.
            pbar.update(ct)
            tid = t['id']
            if tid in s['loaded']:
                logger.info("Skipping previously loaded target %d" % tid)
                continue
            p = t['components']['protein'][0]
            if p['geneid'] is None:
                skip_ct += 1
                continue
            geneid = str(p['geneid'])
            logger.info("Processing target %d: geneid %s" % (tid, geneid))
            (status, headers, xml) = get_ncbigene(geneid)
            if not status:
                logger.warning("Failed getting Gene ID %s" % geneid)
                s['retries'][tid] = True
                continue
            if status != 200:
                logger.warning("Bad API response for Gene ID %s: %s" %
                               (geneid, status))
                s['retries'][tid] = True
                continue
            gene_annotations = parse_genexml(xml)
            if not gene_annotations:
                s['counts']['xml_err'] += 1
                logger.error("XML Error for Gene ID %s" % geneid)
                s['retries'][tid] = True
                continue
            load_annotations(dba, t, dataset_id, gene_annotations, s)
            time.sleep(0.5)  # pause between successful API requests
        pbar.finish()
        print("Processed %d targets." % ct)
        if skip_ct > 0:
            print("Skipped %d targets with no geneid" % skip_ct)
        print("Loaded NCBI annotations for %d targets" % len(s['loaded']))
        if len(s['retries']) > 0:
            print("Total targets remaining for retries: %d " % len(s['retries']))

        loop = 1
        while len(s['retries']) > 0 and loop <= max_retry_loops:
            print("\nRetry loop %d: Loading NCBI Gene annotations for %d TCRD targets" % (
                loop, len(s['retries'])))
            logger.info(
                "Retry loop %d: Loading NCBI Gene annotations for %d TCRD targets"
                % (loop, len(s['retries'])))
            pbar = start_pbar(len(s['retries']))
            ct = 0
            act = 0
            # Iterate over a snapshot of the keys: successful targets are
            # removed from the retry map inside the loop.
            for tid in list(s['retries']):
                ct += 1
                pbar.update(ct)
                t = dba.get_target(tid, include_annotations=False)
                geneid = str(t['components']['protein'][0]['geneid'])
                logger.info("Processing target %d: geneid %s" % (tid, geneid))
                (status, headers, xml) = get_ncbigene(geneid)
                if not status:
                    logger.warning("Failed getting Gene ID %s" % geneid)
                    continue
                if status != 200:
                    logger.warning("Bad API response for Gene ID %s: %s" %
                                   (geneid, status))
                    continue
                gene_annotations = parse_genexml(xml)
                if not gene_annotations:
                    s['counts']['xml_err'] += 1
                    logger.error("XML Error for Gene ID %s" % geneid)
                    continue
                load_annotations(dba, t, dataset_id, gene_annotations, s)
                act += 1
                del s['retries'][tid]
                time.sleep(0.5)  # pause between successful API requests
            # Finish the bar and report every pass, including the last one.
            pbar.finish()
            print("Processed %d targets." % ct)
            print("  Annotated %d additional targets" % act)
            print("  Total annotated targets: %d" % len(s['loaded']))
            if len(s['retries']) > 0:
                print("Total targets remaining for retries: %d " % len(
                    s['retries']))
            loop += 1
        if len(s['retries']) > 0:
            print("Completed %d retry loops. Aborting." % max_retry_loops)

        print("\nInserted %d aliases" % s['counts']['alias'])
        print("Inserted %d NCBI Gene Summary tdl_infos" % s['counts']['summary'])
        print("Inserted %d NCBI Gene PubMed Count tdl_infos" % s['counts']['pmc'])
        print("Inserted %d GeneRIFs" % s['counts']['generif'])
        print("Inserted %d PubMed xrefs" % s['counts']['pmxr'])
        if s['counts']['xml_err'] > 0:
            print("WARNING: %d XML parsing errors occurred. See logfile %s for details." % (
                s['counts']['xml_err'], logfile))
        if s['counts']['dba_err'] > 0:
            print("WARNING: %d DB errors occurred. See logfile %s for details." % (
                s['counts']['dba_err'], logfile))
    finally:
        # With writeback=True, cached entries are only written out on
        # sync()/close(), so always close the shelf.
        s.close()
예제 #11
0
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(LOGFILE)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    start_time = time.time()
    load(args, dba, logfile, logger)
    elapsed = time.time() - start_time

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
예제 #12
0
def load(args):
    """Load IDG-KMC family designations from ``INFILE`` into TCRD.

    Connects to the database, registers the "IDG Families" dataset and its
    provenance row, then reads ``INFILE`` as CSV (first row is a header).
    For each row with a family value whose UniProt accession (column 2)
    matches exactly one target, ``fam`` is set from column 3 and ``famext``
    from column 4 when present.  Targets that already have a ``fam`` value
    are left untouched.

    Args:
        args: option mapping keyed by flag name; reads ``--dbhost``,
            ``--dbname`` and ``--quiet`` (and ``--logfile`` if present).

    Exits the process with status 1 when the dataset row or the provenance
    row cannot be inserted.
    """
    # NOTE(review): ``logfile`` was referenced below without ever being bound
    # in this function.  Resolve it defensively so the error messages cannot
    # raise NameError: prefer the --logfile option, then a module-level
    # ``logfile`` / ``LOGFILE`` if one exists.  Confirm against the module.
    logfile = (args.get('--logfile') or globals().get('logfile')
               or globals().get('LOGFILE') or '<unknown>')

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    if not args['--quiet']:
        print("\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'IDG Families',
        'source': 'IDG-KMC generated data from file %s' % os.path.basename(INFILE),
        'app': PROGRAM,
        'app_version': __version__,
        'comments': "Target family designations generated by IDG-KMC groups at UNM and UMiami."
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'tiofam'
    })
    if not rv:
        print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
        sys.exit(1)

    line_ct = slmf.wcl(INFILE)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, INFILE))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    idg_ct = 0
    upd_ct1 = 0
    upd_ct2 = 0
    null_ct = 0
    notfnd = []
    mulfnd = []
    dba_err_ct = 0
    with open(INFILE, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)  # skip header line
        ct += 1
        for row in csvreader:
            ct += 1
            pbar.update(ct)
            up = row[2].strip()
            fam = row[3].strip()
            famext = row[4].strip()
            if not fam:
                null_ct += 1
                continue
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                notfnd.append(up)
                continue
            if len(targets) > 1:
                mulfnd.append(up)
                continue
            t = targets[0]
            # only update fam for non-IDG targets
            # IDG target fams are set by load-IDGList.py
            if t['fam']:
                idg_ct += 1
                continue
            rv = dba.upd_target(t['id'], 'fam', fam)
            if rv:
                upd_ct1 += 1
            else:
                dba_err_ct += 1
                print("ERROR updating target.fam: %d to %s" % (t['id'], fam))
            if famext:
                rv = dba.upd_target(t['id'], 'famext', famext)
                if rv:
                    upd_ct2 += 1
                else:
                    dba_err_ct += 1
                    print("ERROR updating target.famext: %d to %s" % (t['id'],
                                                                      famext))
    pbar.finish()
    print("{} rows processed.".format(ct))
    print("{} IDG family designations loaded into TCRD.".format(upd_ct1))
    print("{} IDG extended family designations loaded into TCRD.".format(
        upd_ct2))
    print("Skipped {} IDG2 targets.".format(idg_ct))
    if null_ct > 0:
        print("Skipped {} rows with no family designation.".format(null_ct))
    if notfnd:
        print("[WARNING] No target found for {} UniProt accessions: {}".format(
            len(notfnd), ", ".join(notfnd)))
    if mulfnd:
        print("[WARNING] Multiple targets found for {} UniProt accessions: {}".format(
            len(mulfnd), ", ".join(mulfnd)))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))