def make_tsvfile_predictions(namefile,
                             modfile,
                             ucsc_exe,
                             ucsc_dbs,
                             web=False,
                             win=2,
                             dbfasta='hg38.2bit',
                             dbpps=['hg38.phyloP7way.bw', 'hg38.phyloP100way.bw'],
                             pklcod='hg38_coding.pkl',
                             fprog='twoBitToFa',
                             cprog='bigWigToBedGraph'):
    """Score every variant listed in a whitespace-separated file.

    The file is read through ``prog_cat -f`` (``prog_cat`` is defined
    outside this function).  Each non-empty line must start with four
    columns: chromosome, position, reference allele, alternative allele.
    A header line is printed to stdout, followed by one line per scored
    variant: the original input line plus coding flag, predicted class,
    score, FDR, the conservation value at index ``win`` of the second
    conservation track and the average of that track.  Lines that cannot
    be processed are reported on stderr and skipped.

    Args:
        namefile: path of the input file.
        modfile: sequence whose first two items are paths loadable with
            ``joblib.load``; item 0 is used for single-nucleotide
            variants, item 1 for everything else.
        ucsc_exe, ucsc_dbs, web, dbfasta, dbpps, pklcod, fprog, cprog:
            forwarded unchanged to ``parse_variants``, ``get_snv_input``
            and ``get_indel_input``.
        win: forwarded to the input builders; also the index used to
            pick the value reported in the PhyloP100 column and the
            half-width of the zero-filled fallback for the first track.

    Exits with status 1 (``sys.exit``) when the models cannot be loaded.
    """

    def _err(*items):
        # Same text as ``print >> sys.stderr, a, b, ...`` but valid syntax
        # on both Python 2 and Python 3.
        sys.stderr.write(' '.join(str(i) for i in items) + '\n')

    nucs = 'ACGTN'
    try:
        model1 = joblib.load(modfile[0])
        model2 = joblib.load(modfile[1])
    except Exception:
        _err('ERROR: Program not able to load modfile. Please check that you have installed a compatible version joblib.')
        sys.exit(1)

    proc = subprocess.Popen([prog_cat, '-f', namefile],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        # Do not abort: whatever was produced is still processed below,
        # but a failing reader should not go unnoticed.
        _err('WARNING: Command', prog_cat, 'exited with status',
             proc.returncode, 'while reading', namefile)

    c = 0
    print("#CHROM\tPOS\tREF\tALT\tCODING\tPREDICTION\tSCORE\tFDR\tPhyloP100\tAvgPhyloP100")
    for line in stdout.split('\n'):
        if line == '':
            continue
        v = line.rstrip().split()
        c += 1
        if len(v) < 4:
            _err('ERROR: Incorrect line ', c)
            _err('ERROR:', line)
            continue
        (ichr, pos, wt, nw) = tuple(v[:4])
        # Identical alleles and comma-separated (multi-allelic) ALT values
        # are rejected.
        if wt == nw or nw.find(',') > -1:
            _err('ERROR: Incorrect input line.', ichr, pos, wt, nw)
            _err('ERROR:', line)
            continue
        nchr = ichr
        if nchr.find('chr') == -1:
            nchr = 'chr' + ichr
        try:
            ipos = int(pos)
        except ValueError:
            _err('ERROR: Incorrect input data. The tsv input file should have has four columns (chr,position,ref,alt).')
            _err('ERROR:', line)
            continue

        # Allele lengths handed to the second model; '-' marks an empty
        # allele and shifts both lengths by one.
        lwt = len(wt)
        lnw = len(nw)
        if wt == '-':
            lwt = 1
            lnw += 1
        if nw == '-':
            lwt += 1
            lnw = 1

        n_wt, n_nw, n_pos = parse_variants(nchr, ipos, wt, nw, ucsc_exe,
                                           ucsc_dbs, web, dbfasta, fprog)
        if n_wt == '' or n_nw == '':
            _err('ERROR: Incorrect mutation mapping. Check position', ichr, ipos, wt, nw)
            _err('ERROR:', line)
            continue
        if nucs.find(n_wt) == -1 or nucs.find(n_nw) == -1:
            _err('ERROR: Incorrect wild-type or mutant nucleotide', wt, nw)
            _err('ERROR:', line)
            continue

        single_base = len(wt) == 1 and len(nw) == 1
        if single_base:
            build_input = get_snv_input
        else:
            build_input = get_indel_input
        (_nuc, seq, seq_input, cons_input, r_cod) = build_input(
            nchr, n_pos, n_wt, n_nw, ucsc_exe, ucsc_dbs, web, win, dbfasta,
            dbpps, pklcod, fprog, cprog)

        if seq == '':
            _err('ERROR: Sequence not found for line ' + str(c) + '. Genome location:', ichr, pos)
            _err('ERROR:', line)
            continue
        if seq_input == []:
            _err('ERROR: Incorrect nucleotide in line ' + str(c) + '. Genome location:', ichr, pos)
            _err('ERROR:', line)
            continue
        if cons_input == []:
            _err('ERROR: Incorrect conservation data for line ' + str(c) + '. Genome location:', ichr, pos)
            _err('ERROR:', line)
            continue
        cons_input1 = cons_input[0]
        cons_input2 = cons_input[1]
        # Only the second track is mandatory; the first falls back to zeros.
        if cons_input2 == []:
            _err('ERROR: Incorrect conservation data for line ' + str(c) + '. Genome location:', ichr, pos)
            _err('ERROR:', line)
            continue
        if cons_input1 == []:
            cons_input1 = [0.0 for i in range(2 * win + 1)]
            _err('WARNING: PhyloP7 data not found for line', c, 'mutated site', ichr, pos)

        p_cod = 0
        cod = 'No'
        if r_cod != []:
            p_cod = 1
            cod = 'Yes'

        if single_base and wt != '-' and nw != '-':
            X = [seq_input + cons_input1 + cons_input2]
            y_pred, y_fdrs, c_pred = prediction(X, model1)
            fdr_cols = (0, 1)
        else:
            X = [seq_input + cons_input1 + cons_input2 + [lwt, lnw, p_cod]]
            y_pred, y_fdrs, c_pred = prediction(X, model2)
            fdr_cols = (2, 3)
        # Test for an empty result before indexing into y_fdrs.
        if y_pred == []:
            _err('WARNING: Variant not scored. Check modfile and input')
            _err('WARNING:', line)
            continue
        v_fdr = [y_fdrs[0][fdr_cols[0]], y_fdrs[0][fdr_cols[1]]]

        pp100 = cons_input2[win]
        avgpp100 = sum(cons_input2) / float(len(cons_input2))
        # Reset per line so an unexpected label can never reuse the FDR of
        # the previous variant.
        d_fdr = float('nan')
        if c_pred[0] == "Pathogenic":
            d_fdr = v_fdr[0]
        elif c_pred[0] == "Benign":
            d_fdr = v_fdr[1]
        print(line + '\t' + '%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f' %
              (cod, c_pred[0], y_pred[0], d_fdr, pp100, avgpp100))
    return
def make_file_predictions(namefile,
                          modfile,
                          ucsc_exe,
                          ucsc_dbs,
                          web=False,
                          win=2,
                          s='\t',
                          dbfasta='hg38.2bit',
                          dbpps=['hg38.phyloP7way.bw', 'hg38.phyloP100way.bw'],
                          pklcod='hg38_coding.pkl',
                          fprog='twoBitToFa',
                          cprog='bigWigToBedGraph'):
    """Score every variant listed in a delimited text file.

    Each line of ``namefile`` is split on ``s`` and must provide at least
    four fields: chromosome, position, reference allele, alternative
    allele.  A header line is printed to stdout, followed by one
    tab-separated line per scored variant (chromosome, position, alleles,
    coding flag, predicted class, score, FDR, the conservation value at
    index ``win`` of the second conservation track and the average of
    that track).  Lines that cannot be processed are reported on stderr
    together with their 1-based line number and skipped.

    Args:
        namefile: path of the input file.
        modfile: sequence whose first two items are paths loadable with
            ``joblib.load``; item 0 is used for single-nucleotide
            variants, item 1 for everything else.
        s: field separator of the input file.
        ucsc_exe, ucsc_dbs, web, dbfasta, dbpps, pklcod, fprog, cprog:
            forwarded unchanged to ``parse_variants``, ``get_snv_input``
            and ``get_indel_input``.
        win: forwarded to the input builders; also the index used to
            pick the value reported in the PhyloP100 column and the
            half-width of the zero-filled fallback for the first track.

    Exits with status 1 (``sys.exit``) when the models cannot be loaded.
    """

    def _err(*items):
        # Same text as ``print >> sys.stderr, a, b, ...`` but valid syntax
        # on both Python 2 and Python 3.
        sys.stderr.write(' '.join(str(i) for i in items) + '\n')

    try:
        model1 = joblib.load(modfile[0])
        model2 = joblib.load(modfile[1])
    except Exception:
        _err('ERROR: Program not able to load modfile. Please check that you have installed a compatible version joblib.')
        sys.exit(1)

    print("#CHROM\tPOS\tREF\tALT\tCODING\tPREDICTION\tSCORE\tFDR\tPhyloP100\tAvgPhyloP100")
    with open(namefile) as f:
        # enumerate keeps the reported line number in step with the file;
        # the counter was previously never advanced.
        for c, line in enumerate(f, 1):
            v = line.rstrip().split(s)
            if len(v) < 4:
                _err('ERROR: Incorrect line ', c, line.rstrip())
                _err(line)
                continue
            (ichr, pos, wt, nw) = v[:4]
            try:
                ipos = int(pos)
            except ValueError:
                _err('ERROR: Incorrect genome location', pos, '. Check your input file')
                _err('ERROR:', line)
                continue
            # Reject identical alleles and comma-separated ALT values
            # before any external lookup is attempted.
            if wt == nw or nw.find(',') > -1:
                _err('ERROR: Incorrect wild-type or mutant nucleotide', wt, nw)
                _err('ERROR:', line)
                continue

            # Chromosome name used for the lookups, built the same way as
            # in make_tsvfile_predictions; the output keeps the input name.
            nchr = ichr
            if nchr.find('chr') == -1:
                nchr = 'chr' + ichr

            # Allele lengths handed to the second model; '-' marks an
            # empty allele and shifts both lengths by one.
            lwt = len(wt)
            lnw = len(nw)
            if wt == '-':
                lwt = 1
                lnw += 1
            if nw == '-':
                lwt += 1
                lnw = 1

            n_wt, n_nw, n_pos = parse_variants(nchr, ipos, wt, nw, ucsc_exe,
                                               ucsc_dbs, web, dbfasta, fprog)
            if n_wt == '' or n_nw == '':
                _err('ERROR: Incorrect mutation mapping. Check position', ichr, ipos, wt, nw)
                _err('ERROR:', line)
                continue
            if 'ACGTN'.find(n_wt) == -1 or 'ACGTN'.find(n_nw) == -1:
                _err('ERROR: Incorrect wild-type or mutant nucleotide', wt, nw)
                _err('ERROR:', line)
                continue

            single_base = len(wt) == 1 and len(nw) == 1
            if single_base:
                build_input = get_snv_input
            else:
                build_input = get_indel_input
            (_nuc, seq, seq_input, cons_input, r_cod) = build_input(
                nchr, n_pos, n_wt, n_nw, ucsc_exe, ucsc_dbs, web, win,
                dbfasta, dbpps, pklcod, fprog, cprog)

            if seq == '':
                _err('ERROR: Sequence not found for line', c, ichr, pos)
                _err('ERROR:', line)
                continue
            if seq_input == []:
                _err('ERROR: Incorrect nucleotide in line ' + str(c) + '. Genome location:', ichr, pos)
                _err('ERROR:', line)
                continue
            if cons_input == []:
                _err('ERROR: Incorrect conservation data for line', c, ichr, pos)
                _err('ERROR:', line)
                continue
            cons_input1 = cons_input[0]
            cons_input2 = cons_input[1]
            # Only the second track is mandatory; the first falls back to
            # zeros.
            if cons_input2 == []:
                _err('ERROR: Incorrect conservation data for line', c, ichr, pos)
                _err('ERROR:', line)
                continue
            if cons_input1 == []:
                cons_input1 = [0.0 for i in range(2 * win + 1)]
                _err('WARNING: PhyloP7 data not found for line', c, ichr, pos)

            p_cod = 0
            cod = 'No'
            if r_cod != []:
                p_cod = 1
                cod = 'Yes'

            if single_base and wt != '-' and nw != '-':
                X = [seq_input + cons_input1 + cons_input2]
                y_pred, y_fdrs, c_pred = prediction(X, model1)
                fdr_cols = (0, 1)
            else:
                X = [seq_input + cons_input1 + cons_input2 + [lwt, lnw, p_cod]]
                y_pred, y_fdrs, c_pred = prediction(X, model2)
                fdr_cols = (2, 3)
            # Test for an empty result before indexing into y_fdrs.
            if y_pred == []:
                _err('WARNING: Variant not scored. Check modfile and input')
                _err('WARNING:', line)
                continue
            v_fdr = [y_fdrs[0][fdr_cols[0]], y_fdrs[0][fdr_cols[1]]]

            pp100 = cons_input2[win]
            avgpp100 = sum(cons_input2) / float(len(cons_input2))
            # Reset per line so an unexpected label can never reuse the
            # FDR of the previous variant.
            d_fdr = float('nan')
            if c_pred[0] == "Pathogenic":
                d_fdr = v_fdr[0]
            elif c_pred[0] == "Benign":
                d_fdr = v_fdr[1]
            # Conservation columns use the same %.3f format as the other
            # prediction functions in this file.
            print('\t'.join(
                str(i) for i in [
                    ichr, ipos, wt, nw, cod, c_pred[0],
                    '%.3f' % y_pred[0],
                    '%.3f' % d_fdr,
                    '%.3f' % pp100,
                    '%.3f' % avgpp100
                ]))
    return
def make_prediction(ichr,
                    ipos,
                    wt,
                    nw,
                    modfile,
                    ucsc_exe,
                    ucsc_dbs,
                    web=False,
                    win=2,
                    dbfasta='hg38.2bit',
                    dbpps=['hg38.phyloP7way.bw', 'hg38.phyloP100way.bw'],
                    pklcod='',
                    fprog='twoBitToFa',
                    cprog='bigWigToBedGraph'):
    """Score a single variant and print the result to stdout.

    When ``pklcod`` is the empty string the input is built with
    ``get_snv_input`` and the first two FDR columns are used; otherwise
    ``get_indel_input`` is used, the allele lengths and coding flag are
    appended to the feature vector and FDR columns 2 and 3 are used.

    On success a header line and one tab-separated result line are
    printed.  If ``prediction`` returns an empty score list, a single
    line with ``NA`` in every result column is printed instead (without
    header).

    Args:
        ichr, ipos, wt, nw: chromosome, position, reference allele and
            alternative allele of the variant.
        modfile: path of a model loadable with ``joblib.load``.
        ucsc_exe, ucsc_dbs, web, dbfasta, dbpps, pklcod, fprog, cprog:
            forwarded unchanged to ``parse_variants`` and to the input
            builder.
        win: forwarded to the input builder; also the index used to pick
            the value reported in the PhyloP100 column.

    Exits with status 1 (``sys.exit``) on any mapping, sequence,
    conservation-data or model-loading failure.
    """

    def _err(*items):
        # Same text as ``print >> sys.stderr, a, b, ...`` but valid syntax
        # on both Python 2 and Python 3.
        sys.stderr.write(' '.join(str(i) for i in items) + '\n')

    # Allele lengths handed to the model in the indel branch; '-' marks an
    # empty allele and shifts both lengths by one.
    lwt = len(wt)
    lnw = len(nw)
    if wt == '-':
        lwt = 1
        lnw += 1
    if nw == '-':
        lwt += 1
        lnw = 1

    # Chromosome name used for the lookups, built the same way as in
    # make_tsvfile_predictions; the output keeps the name as given.
    nchr = ichr
    if nchr.find('chr') == -1:
        nchr = 'chr' + ichr

    n_wt, n_nw, n_pos = parse_variants(nchr, ipos, wt, nw, ucsc_exe,
                                       ucsc_dbs, web, dbfasta, fprog)
    if n_wt == '' or n_nw == '':
        _err('ERROR: Incorrect mutation mapping. Check position', ichr, ipos, wt, nw)
        sys.exit(1)
    if 'ACGTN'.find(n_wt) == -1 or 'ACGTN'.find(n_nw) == -1:
        _err('ERROR: Incorrect wild-type or mutant nucleotide', wt, nw)
        sys.exit(1)

    snv_mode = pklcod == ''
    if snv_mode:
        build_input = get_snv_input
    else:
        build_input = get_indel_input
    (_nuc, seq, seq_input, cons_input, r_cod) = build_input(
        nchr, n_pos, n_wt, n_nw, ucsc_exe, ucsc_dbs, web, win, dbfasta,
        dbpps, pklcod, fprog, cprog)

    if seq == '':
        _err('ERROR: Sequence not found for position', ichr, ipos)
        sys.exit(1)
    if seq_input == []:
        _err('ERROR: Incorrect nucleotide in position', ichr, ipos)
        sys.exit(1)
    # NOTE(review): any empty conservation track is fatal here, whereas the
    # file-based functions tolerate a missing first track by zero-filling
    # it.  The zero-fill fallback that used to follow this check could
    # never run and was removed; confirm the strict behaviour is intended.
    if cons_input.count([]) > 0:
        _err('ERROR: Incorrect conservation data in position', ichr, ipos)
        sys.exit(1)
    if cons_input == []:
        _err('ERROR: Incorrect conservation data for position', ichr, ipos)
        sys.exit(1)
    cons_input1 = cons_input[0]
    cons_input2 = cons_input[1]

    try:
        model = joblib.load(modfile)
    except Exception:
        _err('ERROR: Program not able to load modfile. Please check that you have installed a compatible version joblib.')
        sys.exit(1)

    p_cod = 0
    cod = 'No'
    if r_cod != []:
        p_cod = 1
        cod = 'Yes'

    if snv_mode:
        X = [seq_input + cons_input1 + cons_input2]
        fdr_cols = (0, 1)
    else:
        X = [seq_input + cons_input1 + cons_input2 + [lwt, lnw, p_cod]]
        fdr_cols = (2, 3)
    y_pred, y_fdrs, c_pred = prediction(X, model)

    # Test for an empty result before indexing into y_fdrs.
    if y_pred == []:
        _err('WARNING: Variants not scored. Check modfile and input')
        print('\t'.join([str(i) for i in [ichr, ipos, wt, nw]]) +
              '\tNA\tNA\tNA\tNA\tNA\tNA')
    else:
        v_fdr = [y_fdrs[0][fdr_cols[0]], y_fdrs[0][fdr_cols[1]]]
        print("#CHROM\tPOS\tREF\tALT\tCODING\tPREDICTION\tSCORE\tFDR\tPhyloP100\tAvgPhyloP100")
        pp100 = cons_input2[win]
        avgpp100 = sum(cons_input2) / float(len(cons_input2))
        # An unexpected label yields 'nan' instead of an unbound variable.
        d_fdr = float('nan')
        if c_pred[0] == "Pathogenic":
            d_fdr = v_fdr[0]
        elif c_pred[0] == "Benign":
            d_fdr = v_fdr[1]
        print('\t'.join(
            str(i) for i in [
                ichr, ipos, wt, nw, cod, c_pred[0],
                '%.3f' % y_pred[0],
                '%.3f' % d_fdr,
                '%.3f' % pp100,
                '%.3f' % avgpp100
            ]))
    return