def main(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(LOGLEVEL) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(LOGFILE) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) if not args['--quiet']: print "\nLoading project info from pickle file {}".format(PROJECTS_P) projects = pickle.load(open(PROJECTS_P, 'rb')) if not args['--quiet']: print "\nCreating Tagger..." tgr = Tagger() tgr.load_names(ENTITIES_FILE, NAMES_FILE) tgr.load_global(GLOBAL_FILE) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] for year in [str(yr) for yr in range(2000, 2018)]: # 2000-2017 pct = len(projects[year]) print "\nTagging {} projects from {}".format(pct, year) logger.info("Tagging {} projects from {}".format(pct, year)) pbar = ProgressBar(widgets=pbar_widgets, maxval=pct).start() start_time = time.time() ct = 0 ttag_ct = 0 abstag_ct = 0 skip_ct = 0 ttagsnotfnd = set() ttag2targetid = {} appid2targets = defaultdict(set) target2appids = defaultdict(set) for appid in projects[year].keys(): ct += 1 logger.debug(" Processing appid {}".format(appid)) ginfo = projects[year][appid] # if there's no $$, we're not interested if ginfo['TOTAL_COST']: gcost = int(ginfo['TOTAL_COST']) elif ginfo['TOTAL_COST_SUB_PROJECT']: gcost = int(ginfo['TOTAL_COST_SUB_PROJECT']) else: continue # also, if there's less than $10k we're not interested if gcost < 10000: skip_ct += 1 continue # # tag titles # matches = tgr.get_matches(projects[year][appid]['PROJECT_TITLE'], appid, [9606]) if matches: ttag_ct += 1 # the same tag can match multiple times, so get a set of ENSPs ensps = set() for m in matches: ensps.add(m[2][0][1]) ensps = list(ensps) for ensp in ensps: if ensp in ttag2targetid: tid = ttag2targetid[ensp] elif ensp in ttagsnotfnd: continue else: targets = dba.find_targets({'stringid': ensp}, idg=False) if not targets: targets = dba.find_targets_by_xref( { 'xtype': 'Ensembl', 'value': ensp }, idg=False) if not targets: ttagsnotfnd.add(ensp) continue tid = targets[0]['id'] ttag2targetid[ ensp] = tid # save this so we don't look up the targets again appid2targets[appid].add(tid) target2appids[tid].add(appid) # # tag abstracts # if 'ABSTRACT' in projects[year][appid]: matches = tgr.get_matches(projects[year][appid]['ABSTRACT'], appid, [9606]) if matches: abstag_ct += 1 # the same tag can match multiple times, so get a set of ENSPs ensps = set() for m in matches: ensps.add(m[2][0][1]) ensps = list(ensps) for ensp in ensps: if ensp in ttag2targetid: tid = ttag2targetid[ensp] elif ensp in ttagsnotfnd: continue else: targets = dba.find_targets({'stringid': ensp}, idg=False) if not targets: targets = dba.find_targets_by_xref( { 'xtype': 'Ensembl', 'value': ensp }, idg=False) if not targets: ttagsnotfnd.add(ensp) continue tid = targets[0]['id'] ttag2targetid[ ensp] = tid # save this so we don't look up the targets again appid2targets[appid].add(tid) target2appids[tid].add(appid) pbar.update(ct) pbar.finish() del_ct = 0 for appid, tidset in appid2targets.items(): if len(tidset) > 10: del_ct += 1 del (appid2targets[appid]) logger.info("{} projects processed.".format(ct)) logger.info(" Removed {} projects with > 10 targets" % del_ct) logger.info( " Skipped {} projects with funds less than $10k:".format(skip_ct)) logger.info(" {} titles have tagging result(s)".format(ttag_ct)) logger.info(" {} abstracts have tagging result(s)".format(abstag_ct)) logger.info("{} total tags map to {}/{} distinct targets".format( len(ttag2targetid.keys()), len(set(ttag2targetid.values())), len(target2appids.keys()))) logger.info("{} project applications map to target(s)".format( len(appid2targets.keys()))) if ttagsnotfnd: logger.info(" No target found for {} tags".format( len(ttagsnotfnd))) pfile = "%s/AppID2Targets%s.p" % (TAGGING_RESULTS_DIR, year) pickle.dump(appid2targets, open(pfile, 'wb')) logger.info("Tagging results saved to pickle {} for {}".format( pfile, year)) pfile = "%s/Target2AppIDs%s.p" % (TAGGING_RESULTS_DIR, year) pickle.dump(target2appids, open(pfile, 'wb')) logger.info("Tagging results saved to pickle {} for {}".format( pfile, year)) print "{} projects processed. See logfile {} for details.".format( ct, LOGFILE)