## Shared imports for the examples below. @evaluation_metric and @fmax are
## project modules; helpers such as @eval (which shadows Python's builtin and
## refers to the project's evaluation routine), @get_label_by_count,
## @MakeGroundTruthText2Dict, @load_true_data, @tally_over_n_template and
## @order_go_score are defined elsewhere in the repo.
import os
import re
import pickle

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from tqdm import tqdm

import evaluation_metric
import fmax


def get_acc(prediction_dict, label_seen_pos, label_unseen_pos, prefix=""):
    print('\nmodel type {}'.format(prefix))
    print('\nsize {}\n'.format(prediction_dict['prediction'].shape))
    print('\nwhole')
    evaluation_metric.print_metrics(eval(prediction_dict))
    print('\noriginal')
    evaluation_metric.print_metrics(eval(prediction_dict, label_seen_pos))
    print('\nunseen')
    evaluation_metric.print_metrics(eval(prediction_dict, label_unseen_pos))
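
## Hedged usage sketch for @get_acc (not part of the original code); the index
## arrays are hypothetical and only illustrate that @label_seen_pos and
## @label_unseen_pos select columns of the num_prot x num_label prediction matrix.
# prediction_dict = pickle.load(open("save_prediction_expand.pickle", "rb"))  ## hypothetical path
# label_seen_pos = np.array([0, 1, 2])   ## columns of labels seen in training (hypothetical)
# label_unseen_pos = np.array([3, 4])    ## columns of labels only in the expanded GO set (hypothetical)
# get_acc(prediction_dict, label_seen_pos, label_unseen_pos, prefix='BertNotFtAARawSeqGO')
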
Example #2
def submitJobs(onto, label_original, count_file, method, path, filter_down):

    if filter_down == 'none':
        filter_down = False
    else:
        filter_down = True

    # label_original = pd.read_csv('/u/scratch/d/datduong/deepgo/data/train/deepgo.'+onto+'.csv',sep="\t",header=None)
    label_original = pd.read_csv(label_original, sep="\t", header=None)
    label_original = sorted(list(
        label_original[0]))  ## we sort labels in training

    #### compute accuracy by frequency

    low, middle, high = get_label_by_count(count_file + '/CountGoInTrain-' +
                                           onto + '.tsv')
    low_index = np.array(
        [index for index, value in enumerate(label_original) if value in low])
    middle_index = np.array([
        index for index, value in enumerate(label_original) if value in middle
    ])
    high_index = np.array(
        [index for index, value in enumerate(label_original) if value in high])

    prediction_dict = pickle.load(open(method, "rb"))
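    ## note: @method here is the path to the prediction pickle, which holds at least the 'prediction' matrix used below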

    print('\nsize {}\n'.format(prediction_dict['prediction'].shape))

    print('\nwhole {}'.format(onto))
    evaluation_metric.print_metrics(
        eval(prediction_dict, path=path, add_name='whole'))

    print('\nlow {}'.format(onto))
    evaluation_metric.print_metrics(
        eval(prediction_dict,
             low_index,
             path=path,
             add_name='low',
             filter_down=filter_down))

    print('\nmiddle {}'.format(onto))
    evaluation_metric.print_metrics(
        eval(prediction_dict, middle_index, path=path, add_name='middle'))

    print('\nhigh {}'.format(onto))
    evaluation_metric.print_metrics(
        eval(prediction_dict, high_index, path=path, add_name='high'))
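
## Hedged driver sketch (an assumption, not from the original script): these
## submitJobs variants read like command-line entry points, so a plausible
## wrapper is shown below; the argv order and example values are hypothetical.
# if __name__ == '__main__':
#     import sys
#     ## e.g. python eval_by_frequency.py mf deepgo.mf.csv /path/to/counts \
#     ##      /path/to/test-mf-prediction.pickle /path/to/output none
#     submitJobs(onto=sys.argv[1], label_original=sys.argv[2], count_file=sys.argv[3],
#                method=sys.argv[4], path=sys.argv[5], filter_down=sys.argv[6])
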
Example #3
def submitJobs(where, count_file, method, save_file_type, filter_down):

    if filter_down == 'none':
        filter_down = False
    else:
        filter_down = True

    os.chdir(where)

    for onto in ['cc', 'mf', 'bp']:

        print('\n\ntype {}'.format(onto))

        if save_file_type == 'prediction_train_all_on_test':  ####
            data_type = 'data'
        else:
            data_type = 'dataExpandGoSet16Jan2020'

        label_original = pd.read_csv('/u/scratch/d/datduong/deepgo/' +
                                     data_type + '/train/deepgo.' + onto +
                                     '.csv',
                                     sep="\t",
                                     header=None)
        label_original = sorted(list(
            label_original[0]))  ## we sort labels in training

        #### compute accuracy by frequency

        low, middle, high = get_label_by_count(count_file +
                                               '/CountGoInTrain-' + onto +
                                               '.tsv')
        low_index = np.array([
            index for index, value in enumerate(label_original) if value in low
        ])
        middle_index = np.array([
            index for index, value in enumerate(label_original)
            if value in middle
        ])
        high_index = np.array([
            index for index, value in enumerate(label_original)
            if value in high
        ])

        prediction_dict = pickle.load(
            open(
                "/u/scratch/d/datduong/deepgo/" + data_type +
                "/train/fold_1/" + method + "/test-" + onto +
                "-prediction.pickle", "rb"))

        path = "/u/scratch/d/datduong/deepgo/" + data_type + "/train/fold_1/" + method + "/" + onto
        if not os.path.exists(path):
            os.mkdir(path)

        print('\nsize {}\n'.format(prediction_dict['prediction'].shape))

        print('\nwhole {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, path=path, add_name='whole'))

        print('\nlow {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict,
                 low_index,
                 path=path,
                 add_name='low',
                 filter_down=filter_down))

        print('\nmiddle {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, middle_index, path=path, add_name='middle'))

        print('\nhigh {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, high_index, path=path, add_name='high'))
    def do_eval(self, prot_loader, **kwargs):

        torch.cuda.empty_cache()
        self.eval()

        tr_loss = 0
        preds = []
        true_label = []

        for step, batch in enumerate(prot_loader):

            with torch.no_grad():  ## no gradient for everything in this section

                batch = tuple(t for t in batch)
                if self.args.has_ppi_emb:
                    prot_idx, prot_len, mask, label_ids, prot_interact_emb = batch  ## @label_ids must be of size @args.num_label_to_test
                else:
                    prot_idx, prot_len, mask, label_ids, _ = batch

                prot_idx = prot_idx[:, 0:int(max(prot_len))]  ## trim down
                mask = mask[:, 0:int(max(prot_len))]

                if self.args.has_ppi_emb and (self.args.prot_interact_vec_dim >
                                              0):
                    prot_interact_emb = prot_interact_emb.cuda()
                else:
                    prot_interact_emb = None

                pred, loss = self.forward(prot_idx.cuda(),
                                          mask.cuda(), prot_interact_emb,
                                          label_ids.cuda(), **kwargs)

                # loss = self.classify_loss ( pred, label_ids.cuda() )

            tr_loss = tr_loss + loss

            ## take sigmoid here, if sigmoid was not taken inside @forward
            if self.loss_type == 'BCEWithLogitsLoss':
                pred = torch.sigmoid(pred)  ## @F.sigmoid is deprecated in newer torch

            if len(preds) == 0:
                preds.append(pred.detach().cpu().numpy())
                true_label.append(label_ids.detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0],
                                     pred.detach().cpu().numpy(),
                                     axis=0)
                true_label[0] = np.append(true_label[0],
                                          label_ids.detach().cpu().numpy(),
                                          axis=0)  # row array

        # end eval
        true_label = true_label[0]
        preds = preds[0]

        print('loss {}'.format(tr_loss))

        print('pred label')
        print(preds)

        print('true label')
        print(true_label)

        trackF1macro = {}
        trackF1micro = {}  # metrics["f1_micro"]

        trackMacroPrecision = {}  # [MACRO] accuracy, precision, recall
        trackMacroRecall = {}

        trackMicroPrecision = {}
        trackMicroRecall = {}

        ##!! DO NOT NEED TO DO THIS ALL THE TIME DURING TRAINING
        # if self.args.not_train:
        #   rounding = np.arange(.1,1,.4)
        # else:
        rounding = [0.5]

        for round_cutoff in rounding:

            print('\n\nround cutoff {}'.format(round_cutoff))

            preds_round = 1.0 * (round_cutoff < preds)  ## converted into 0/1

            result = evaluation_metric.all_metrics(
                preds_round, true_label, yhat_raw=preds,
                k=[10, 20])  ## we can pass vector of P@k and R@k
            evaluation_metric.print_metrics(result)

            if 'full_data' not in trackF1macro:
                trackF1macro['full_data'] = [result["f1_macro"]]
                trackF1micro['full_data'] = [result["f1_micro"]]
                trackMacroPrecision['full_data'] = [result["prec_macro"]]
                trackMicroPrecision['full_data'] = [result["prec_micro"]]
                trackMacroRecall['full_data'] = [result["rec_macro"]]
                trackMicroRecall['full_data'] = [result["rec_micro"]]
            else:
                trackF1macro['full_data'].append(result["f1_macro"])
                trackF1micro['full_data'].append(result["f1_micro"])
                trackMacroPrecision['full_data'].append(result["prec_macro"])
                trackMicroPrecision['full_data'].append(result["prec_micro"])
                trackMacroRecall['full_data'].append(result["rec_macro"])
                trackMicroRecall['full_data'].append(result["rec_micro"])

            if ('GoCount' in kwargs) and self.args.not_train:  ## do not need to do this all the time
                print(
                    '\n\nsee if method improves accuracy conditioned on frequency of GO terms'
                )

                ## frequency below the 25th quantile and above the 75th quantile
                ## indexing must be computed ahead of time to avoid redundant calculation

                for cutoff in ['quant25', 'quant75', 'betweenQ25Q75']:
                    ## indexing of the column to pull out , @pred is num_prot x num_go
                    result = evaluation_metric.all_metrics(
                        preds_round[:, kwargs[cutoff]],
                        true_label[:, kwargs[cutoff]],
                        yhat_raw=preds[:, kwargs[cutoff]],
                        k=[10, 20])
                    print("\nless than {} count".format(cutoff))
                    evaluation_metric.print_metrics(result)

                    if cutoff not in trackF1macro:
                        trackF1macro[cutoff] = [result["f1_macro"]]
                        trackF1micro[cutoff] = [result["f1_micro"]]
                        trackMacroPrecision[cutoff] = [result["prec_macro"]]
                        trackMicroPrecision[cutoff] = [result["prec_micro"]]
                        trackMacroRecall[cutoff] = [result["rec_macro"]]
                        trackMicroRecall[cutoff] = [result["rec_micro"]]
                    else:
                        trackF1macro[cutoff].append(result["f1_macro"])
                        trackF1micro[cutoff].append(result["f1_micro"])
                        trackMacroPrecision[cutoff].append(
                            result["prec_macro"])
                        trackMicroPrecision[cutoff].append(
                            result["prec_micro"])
                        trackMacroRecall[cutoff].append(result["rec_macro"])
                        trackMicroRecall[cutoff].append(result["rec_micro"])

        ##
        if self.args.not_train:
            print('\n\ntracking f1 compile into list\n')

            # print ('\nmacro f1 prec rec')
            for k, v in trackF1macro.items():
                print('macroF1 ' + k + " " + " ".join(str(s) for s in v))

            for k, v in trackMacroPrecision.items():
                print('macroPrec ' + k + " " + " ".join(str(s) for s in v))

            for k, v in trackMacroRecall.items():
                print('macroRec ' + k + " " + " ".join(str(s) for s in v))

            # print ('\nmicro f1 prec rec')
            for k, v in trackF1micro.items():
                print('microF1 ' + k + " " + " ".join(str(s) for s in v))

            for k, v in trackMicroPrecision.items():
                print('microPrec ' + k + " " + " ".join(str(s) for s in v))

            for k, v in trackMicroRecall.items():
                print('microRec ' + k + " " + " ".join(str(s) for s in v))

        output = {
            'prediction': preds,
            'truth': true_label
        }  ##!! make life easier if we have both
        return result, output, tr_loss
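
    ## Hedged usage sketch (assumption, not part of the original class): a caller
    ## might invoke @do_eval roughly as below, where @model, @test_loader and the
    ## quantile index arrays are hypothetical names, then pickle @output in the
    ## {'prediction': ..., 'truth': ...} layout returned above.
    # result, output, test_loss = model.do_eval(test_loader, GoCount=go_count,
    #                                           quant25=idx_q25, quant75=idx_q75,
    #                                           betweenQ25Q75=idx_between)
    # pickle.dump(output, open(prediction_pickle_path, "wb"))  ## hypothetical path
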
def submitJobs(where, method, save_file_type, filter_down):

    os.chdir(where)

    if filter_down == 'none':
        filter_down = False
    else:
        filter_down = True

    for onto in ['cc', 'mf', 'bp']:

        print('\n\ntype {}'.format(onto))

        label_original = pd.read_csv(
            '/u/scratch/d/datduong/deepgo/data/train/deepgo.' + onto + '.csv',
            sep="\t",
            header=None)
        label_original = set(list(label_original[0]))

        label_large = pd.read_csv(
            '/u/scratch/d/datduong/deepgo/dataExpandGoSet16Jan2020/train/deepgo.'
            + onto + '.csv',
            sep="\t",
            header=None)
        label_large = set(list(label_large[0]))

        label_unseen = sorted(list(label_large - label_original))
        label_large = sorted(
            list(label_large))  ## by default we sort label for the model
        label_original = sorted(list(label_original))

        label_lookup = {
            value: counter
            for counter, value in enumerate(label_large)
        }
        label_unseen_pos = np.array(
            [label_lookup[v] for v in label_lookup if v in label_unseen])
        label_seen_pos = np.array(
            [label_lookup[v] for v in label_lookup if v in label_original])

        #### compute accuracy on the original set of labels, then on the unseen labels
        #### predictions on the original set may change because we do joint prediction, so the attention weights can affect the outcome

        ##!! prediction_train_all_on_test.pickle save_prediction_expand
        try:
            print("/u/scratch/d/datduong/deepgo/data/BertNotFtAARawSeqGO/" +
                  onto + "/" + method + "/" + save_file_type + ".pickle")
            prediction_dict = pickle.load(
                open(
                    "/u/scratch/d/datduong/deepgo/data/BertNotFtAARawSeqGO/" +
                    onto + "/" + method + "/" + save_file_type + ".pickle",
                    "rb"))
        except (OSError, pickle.UnpicklingError):  ## skip this ontology if its prediction pickle is missing or unreadable
            print('\npass {}'.format(onto))
            continue

        path = "/u/scratch/d/datduong/deepgo/data/BertNotFtAARawSeqGO/" + onto + "/" + method

        print('\nsize {}\n'.format(prediction_dict['prediction'].shape))

        # if save_file_type == 'prediction_train_all_on_test':
        print('\nwhole {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, path=path, add_name='whole'))

        # if save_file_type == 'save_prediction_expand':

        print('\noriginal {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict,
                 label_seen_pos,
                 path=path,
                 add_name='original'))

        print('\nunseen {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict,
                 label_unseen_pos,
                 path=path,
                 add_name='unseen',
                 filter_down=filter_down))
def submitJobs(where, count_file, method, save_file_type, filter_down):

    os.chdir(where)

    if filter_down == 'none':
        filter_down = False
    else:
        filter_down = True

    for onto in ['cc', 'mf', 'bp']:

        print('\n\ntype {}'.format(onto))

        label_original = pd.read_csv(
            '/u/scratch/d/datduong/deepgo/data/train/deepgo.' + onto + '.csv',
            sep="\t",
            header=None)
        label_original = sorted(list(
            label_original[0]))  ## we sort labels in training

        #### compute accuracy by frequency

        low, middle, high = get_label_by_count(count_file +
                                               '/CountGoInTrain-' + onto +
                                               '.tsv')
        low_index = np.array([
            index for index, value in enumerate(label_original) if value in low
        ])
        middle_index = np.array([
            index for index, value in enumerate(label_original)
            if value in middle
        ])
        high_index = np.array([
            index for index, value in enumerate(label_original)
            if value in high
        ])

        prediction_dict = pickle.load(
            open(
                "/u/scratch/d/datduong/deepgo/data/BertNotFtAARawSeqGO/" +
                onto + "/" + method + "/" + save_file_type + ".pickle", "rb"))

        path = "/u/scratch/d/datduong/deepgo/data/BertNotFtAARawSeqGO/" + onto + "/" + method

        print('\nsize {}\n'.format(prediction_dict['prediction'].shape))

        print('\nwhole {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, path=path, add_name='whole'))

        print('\nlow {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict,
                 low_index,
                 path=path,
                 add_name='low',
                 filter_down=filter_down))

        print('\nmiddle {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, middle_index, path=path, add_name='middle'))

        print('\nhigh {}'.format(onto))
        evaluation_metric.print_metrics(
            eval(prediction_dict, high_index, path=path, add_name='high'))
def submitJobs (where_train,set_type,where_test, add_name, do_split ): ## @do_split is needed if we use metaGO data

  if add_name == 'none':
    add_name = ""

  os.chdir ( '/u/flashscratch/d/datduong/goAndGeneAnnotationDec2018/')

  test_gene_annot = pickle.load(open(where_test+"/"+set_type+"_gene_annot.pickle","rb"))
  print ('num of gene to be tested {}'.format(len(test_gene_annot)))

  print ('\n\nmust use the prot names from the annotation, not the psiblast output\n\n')
  genes = list (test_gene_annot.keys())
  genes.sort() ## alphabet

  prediction = pickle.load ( open(where_train+"/seq_seq_predict_go_"+add_name+".pickle","rb") )

  ## for each gene, fill in the prediction matrix
  label_index_map = pickle.load ( open (where_train+"/label_index_map.pickle","rb") )
  prediction_np = np.zeros( (len(genes), len(label_index_map)) )

  for g in genes :
    if g not in prediction:
      continue
    go_assign = list ( prediction[g].keys() )
    go_assign.sort()
    score = [prediction[g][go] for go in go_assign]
    location = [label_index_map[go] for go in go_assign]
    ## assign the score
    prediction_np [ genes.index(g), location ] = score

  ## convert np into pd to get row names
  df = pd.DataFrame(prediction_np, index=genes)
  pickle.dump ( df, open(where_train+"/seq_seq_predict_go_"+add_name+".pd.pickle","wb"))

  ## filter out to only go terms in training set
  truth_np = np.zeros( (len(genes), len(label_index_map)) )

  for g in genes :
    if do_split == 1:
      if ";" in test_gene_annot[g][0]:
        go_assign = test_gene_annot[g][0].strip().split(";")
      else:
        go_assign = test_gene_annot[g][0].strip().split(",")
    else:
      go_assign = test_gene_annot[g]

    #
    go_assign.sort()
    go_assign = [re.sub("GO:","",go) for go in go_assign]

    location = [label_index_map[go] for go in go_assign if go in label_index_map ] ## !! record only GO we saw in training
    ## assign the score
    truth_np [ genes.index(g), location ] = 1

  print ('animo GO prediction')
  print (prediction_np)
  track_prec = []
  track_rec = []
  for k in [5,10,15,20,25,30,35,40]:
    animo_go_metric = evaluation_metric.all_metrics ( np.round(prediction_np), truth_np, yhat_raw=prediction_np, k=k ) ##  [ 0:(16*3) , :]
    if k == 5 :
      evaluation_metric.print_metrics( animo_go_metric )
    track_prec.append(animo_go_metric['prec_at_'+str(k)])
    track_rec.append(animo_go_metric['rec_at_'+str(k)])

  #
  fmax_val = fmax.f_max ( truth_np, prediction_np, threshold=np.arange(0,1,.02) )
  print ('fmax value {}'.format ( fmax_val ) )
  print ('precision/recall at K')
  print (track_prec)
  print (track_rec)


  label_bio_type = pickle.load( open( where_train+'/label_bio_type.pickle','rb') )
  # common30 = pickle.load ( open(where_train+"/common_index30.pickle","rb"))
  # label_bio_type['common30'] = common30

  for bio_type in label_bio_type:
    index = label_bio_type [ bio_type ]
    print ( "\n\n"+bio_type)
    print ( index[0:10] )
    track_prec = []
    track_rec = []
    for k in [5,10,15,20,25,30,35,40]:
      animo_go_metric = evaluation_metric.all_metrics ( np.round(prediction_np[: , index]), truth_np[: , index], yhat_raw=prediction_np[: , index], k=k)
      if k == 5 :
        evaluation_metric.print_metrics( animo_go_metric )
      track_prec.append(animo_go_metric['prec_at_'+str(k)])
      track_rec.append(animo_go_metric['rec_at_'+str(k)])

    fmax_val = fmax.f_max ( truth_np[: , index], prediction_np[: , index], threshold=np.arange(0,1,.02) )
    print ('fmax value {}'.format ( fmax_val ) )
    print ('precision/recall at K')
    print (track_prec)
    print (track_rec)
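
  ## Hedged note on the pickled inputs above (structure inferred from how they are
  ## used; the example values are hypothetical):
  # label_index_map = {'0003677': 0, '0005515': 1}      ## GO id -> column index, apparently without the "GO:" prefix since it is stripped before lookup
  # test_gene_annot = {'P12345': ['GO:0003677;GO:0005515']}  ## when @do_split == 1 the first element is a ";"- or ","-separated string
  # label_bio_type  = {'molecular_function': [0, 5, 9]}  ## GO-branch name -> column indices (hypothetical keys)
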
def submitJobs (main_dir, data_dir, blast_result_dir, what_set, ontology_type, all_test_label,add_name='none') :

  if add_name=='none':
    add_name = ""

  #### blast and psi-blast will have the same format.
  ## @all_test_label is file of all labels to be tested, adding this so that we return a matrix num_ob x num_label
  os.chdir(main_dir)

  ## labels to be tested
  all_test_label = pd.read_csv(all_test_label,header=None)
  print ('\nsort labels to be tested; we do the same when using the NN model.')
  all_test_label = sorted ( list(all_test_label[0]) )
  label_lookup = {value:index for index,value in enumerate(all_test_label)}

  ## prot annotation train set, will be used later to infer assignment in testset
  ## can only predict what is found in train set if we use blast
  print ('load go annotation for train data')
  ## we can convert text into dict on-the-fly
  # try:
  #   prot_annot = pickle.load ( open (data_dir+'train-'+ontology_type+'.TrueLabel.pickle','rb') )
  # except:
  # train-mf.tsv

  prot_annot, prot_name_train = MakeGroundTruthText2Dict(data_dir+'train-'+ontology_type+add_name+'.tsv')
  print ('\nnum of prots in train data {}\n'.format(len(prot_annot)))

  print ('load go annotation for test data')
  ## COMMENT get true labels
  ## COMMENT 'test-'+ontology_type+'.tsv' has different ordering than 'test-'+ontology_type+'-input.tsv'
  print ('test file name {}'.format(data_dir+'test-'+ontology_type+add_name+'.tsv')) ##!!##!!

  ground_truth, prot_name_test = load_true_data (data_dir+'test-'+ontology_type+add_name+'.tsv',label_lookup) ##!!##!!
  print ('\nnum of prots in test data {}\n'.format(len(prot_name_test)))

  print ('\nread psiblast result')
  df_psiblast = pd.read_csv ( blast_result_dir+what_set+"-"+ontology_type+".psiblast.txt" , header=None, skip_blank_lines=True )
  df_psiblast = df_psiblast.dropna()
  df_psiblast = df_psiblast.reset_index(drop=True)

  prot_name_in_psi = sorted ( list ( set (list ( df_psiblast[0] ) ) ) )
  print ('\nnum of prots from test found in psiblast {}, we may not find a match for every test sequence\n'.format(len(prot_name_in_psi)))

  print ('\nread blast result')
  df_blast = pd.read_csv ( blast_result_dir+what_set+"-"+ontology_type+".blast.txt" , header=None,skip_blank_lines=True )

  ## should make prediction as a matrix
  # prediction = {}
  prediction = np.zeros([len(prot_name_test),len(label_lookup)])

  in_psi = set(df_psiblast[0])
  in_blast = set(df_blast[0])

  for index,this_prot in tqdm(enumerate(prot_name_test)) :

    if (this_prot not in in_psi) and (this_prot not in in_blast):
      print ('not found in either blast or psiblast {}'.format(this_prot))
      continue

    df_psiblast_g = df_psiblast[ df_psiblast[0] == this_prot ]
    df_psiblast_g = df_psiblast_g[ df_psiblast_g[1] != this_prot ] ## don't compare to self

    df_blast_g = df_blast[ df_blast[0] == this_prot ]
    df_blast_g = df_blast_g[ df_blast_g[1] != this_prot ] ## don't compare to self

    psiblast_go_score_array, w_psiblast = tally_over_n_template ( df_psiblast_g, prot_annot )
    blast_go_score_array, _ = tally_over_n_template ( df_blast_g, prot_annot )

    final_score = {}
    psiblast_go = list ( psiblast_go_score_array.keys() )
    blast_go = list ( blast_go_score_array.keys() )

    go_found = set ( psiblast_go + blast_go )
    if len(go_found) == 0: ## unexpected case
      print ('passed 1st screen in blast+psiblast but did not find any GO term {}'.format(this_prot))
      final_score[this_prot] = None
      continue

    for g in go_found: ## average between psiblast and blast
      if (g in psiblast_go_score_array) and (g in blast_go_score_array) :
        x1 = psiblast_go_score_array[g] * (1-w_psiblast) + blast_go_score_array[g] * (w_psiblast)
      if (g in psiblast_go_score_array) and (g not in blast_go_score_array) :
        x1 = psiblast_go_score_array[g]
      if (g not in psiblast_go_score_array) and (g in blast_go_score_array) :
        x1 = blast_go_score_array[g]
      final_score[g] = x1 ## each GO term has a score for this one protein

    ## done with this one protein
    prediction [index] = order_go_score (final_score,label_lookup)

    ## filter down original set so things run faster
    # df[~df.countries.isin(countries)]
    # df_psiblast = df_psiblast[ ~df_psiblast[0].isin([this_prot]) ]
    # df_blast = df_blast[ ~df_blast[0].isin([this_prot]) ]

    # if index > 10:
    #   print (prediction[0:10])
    #   exit()

  ## finish all proteins

  pickle.dump ( {'prediction':prediction, 'true_label':ground_truth}, open(blast_result_dir+what_set+"-"+ontology_type+"-prediction.pickle","wb") )

  result = evaluation_metric.all_metrics ( np.round(prediction) , ground_truth, yhat_raw=prediction, k=[5,10,15,20,25]) ## we can pass vector of P@k and R@k
  evaluation_metric.print_metrics( result )
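
  ## Hedged sketch (assumption): @order_go_score is defined elsewhere in the repo;
  ## from its use above it presumably spreads a {go_term: score} dict into a vector
  ## ordered by @label_lookup, roughly:
  # def order_go_score(final_score, label_lookup):
  #     row = np.zeros(len(label_lookup))
  #     for go, score in final_score.items():
  #         if go in label_lookup:  ## keep only GO terms in the tested label set
  #             row[label_lookup[go]] = score
  #     return row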