Python get_indel_input Exemples, score_variants.get_indel_input Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : predict_variants.py Projet : ginnyintifa/PhD-SNPg

def make_tsvfile_predictions(
        namefile,
        modfile,
        ucsc_exe,
        ucsc_dbs,
        web=False,
        win=2,
        dbfasta='hg38.2bit',
        dbpps=['hg38.phyloP7way.bw', 'hg38.phyloP100way.bw'],
        pklcod='hg38_coding.pkl',
        fprog='twoBitToFa',
        cprog='bigWigToBedGraph'):
    nucs = 'ACGTN'
    try:
        model1 = joblib.load(modfile[0])
        model2 = joblib.load(modfile[1])
    except:
        print >> sys.stderr, 'ERROR: Program not able to load modfile. Please check that you have installed a compatible version joblib.'
        sys.exit(1)
    proc = subprocess.Popen([prog_cat, '-f', namefile],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    c = 0
    print "#CHROM\tPOS\tREF\tALT\tCODING\tPREDICTION\tSCORE\tFDR\tPhyloP100\tAvgPhyloP100"
    for line in stdout.split('\n'):
        if line == '': continue
        v = line.rstrip().split()
        c = c + 1
        if len(v) < 4:
            print >> sys.stderr, 'ERROR: Incorrect line ', c
            print >> sys.stderr, 'ERROR:', line
            continue
        (ichr, pos, wt, nw) = tuple(v[:4])
        if wt == nw or nw.find(',') > -1:
            print >> sys.stderr, 'ERROR: Incorrect input line.', ichr, pos, wt, nw
            print >> sys.stderr, 'ERROR:', line
            continue
        nchr = ichr
        if nchr.find('chr') == -1: nchr = 'chr' + ichr
        try:
            ipos = int(pos)
        except:
            print >> sys.stderr, 'ERROR: Incorrect input data. The tsv input file should have has four columns (chr,position,ref,alt).'
            print >> sys.stderr, 'ERROR:', line
            continue
            sys.exit(1)
        lwt = len(wt)
        lnw = len(nw)
        ##Option for selecting snv
        #if lwt!=1 or lnw!=1 or nucs.find(wt)==-1 or nucs.find(nw)==-1 or wt==nw:
        #	print >> sys.stderr, 'ERROR: Not single nucloetide variant',','.join([nchr,pos,wt,nw])
        #	continue
        if wt == '-':
            lwt = 1
            lnw += 1
        if nw == '-':
            lwt += 1
            lnw = 1
        n_wt, n_nw, n_pos = parse_variants(nchr, ipos, wt, nw, ucsc_exe,
                                           ucsc_dbs, web, dbfasta, fprog)
        if n_wt == '' or n_nw == '':
            print >> sys.stderr, 'ERROR: Incorrect mutation mapping. Check position', ichr, ipos, wt, nw
            print >> sys.stderr, 'ERROR:', line
            continue
        if 'ACGTN'.find(n_wt) == -1 or 'ACGTN'.find(n_nw) == -1:
            print >> sys.stderr, 'ERROR: Incorrect wild-type or mutant nucleotide', wt, nw
            print >> sys.stderr, 'ERROR:', line
            continue
        if len(wt) == 1 and len(nw) == 1:
            r_cod = []
            (nuc, seq, seq_input, cons_input,
             r_cod) = get_snv_input(nchr, n_pos, n_wt, n_nw, ucsc_exe,
                                    ucsc_dbs, web, win, dbfasta, dbpps, pklcod,
                                    fprog, cprog)
        else:
            (nuc, seq, seq_input, cons_input,
             r_cod) = get_indel_input(nchr, n_pos, n_wt, n_nw, ucsc_exe,
                                      ucsc_dbs, web, win, dbfasta, dbpps,
                                      pklcod, fprog, cprog)
        if seq == '':
            print >> sys.stderr, 'ERROR: Sequence not found for line ' + str(
                c) + '. Genome location:', ichr, pos
            print >> sys.stderr, 'ERROR:', line
            continue
        if seq_input == []:
            print >> sys.stderr, 'ERROR: Incorrect nucleotide in line ' + str(
                c) + '. Genome location:', ichr, pos
            print >> sys.stderr, 'ERROR:', line
            continue
        if cons_input != []:
            cons_input1 = cons_input[0]
            cons_input2 = cons_input[1]
        else:
            print >> sys.stderr, 'ERROR: Incorrect conservation data for line ' + str(
                c) + '. Genome location:', ichr, pos
            print >> sys.stderr, 'ERROR:', line
            continue
        #if cons_input1==[] or cons_input2==[]:
        #Check only P100
        if cons_input2 == []:
            print >> sys.stderr, 'ERROR: Incorrect conservation data for line ' + str(
                c) + '. Genome location:', ichr, pos
            print >> sys.stderr, 'ERROR:', line
            continue
        if cons_input1 == []:
            cons_input1 = [0.0 for i in range(2 * win + 1)]
            print >> sys.stderr, 'WARNING: PhyloP7 data not found for line', c, 'mutated site', ichr, pos
        p_cod = 0
        cod = 'No'
        if r_cod != []:
            p_cod = 1
            cod = 'Yes'
        if len(wt) == 1 and len(nw) == 1 and wt != '-' and nw != '-':
            X = [seq_input + cons_input1 + cons_input2]
            y_pred, y_fdrs, c_pred = prediction(X, model1)
            v_fdr = [y_fdrs[0][0], y_fdrs[0][1]]
        else:
            X = [seq_input + cons_input1 + cons_input2 + [lwt, lnw, p_cod]]
            y_pred, y_fdrs, c_pred = prediction(X, model2)
            v_fdr = [y_fdrs[0][2], y_fdrs[0][3]]
        if y_pred == []:
            print >> sys.stderr, 'WARNING: Variant not scored. Check modfile and input'
            print >> sys.stderr, 'WARNING:', line
            continue
        pp100 = cons_input2[win]
        avgpp100 = sum(cons_input2) / float(len(cons_input2))
        #print pp100,avgpp100,cons_input2
        if c_pred[0] == "Pathogenic": d_fdr = v_fdr[0]
        if c_pred[0] == "Benign": d_fdr = v_fdr[1]
        print line + '\t' + '%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f' % (
            cod, c_pred[0], y_pred[0], d_fdr, pp100, avgpp100)
        #print '\t'.join(str(i) for i in [ichr,ipos,wt,nw,'%.4f' %y_pred[0]])
    return

Exemple #2

0

Afficher le fichier

Fichier : predict_variants.py Projet : ginnyintifa/PhD-SNPg

def make_file_predictions(namefile,
                          modfile,
                          ucsc_exe,
                          ucsc_dbs,
                          web=False,
                          win=2,
                          s='\t',
                          dbfasta='hg38.2bit',
                          dbpps=['hg38.phyloP7way.bw', 'hg38.phyloP100way.bw'],
                          pklcod='hg38_coding.pkl',
                          fprog='twoBitToFa',
                          cprog='bigWigToBedGraph'):
    try:
        model1 = joblib.load(modfile[0])
        model2 = joblib.load(modfile[1])
    except:
        print >> sys.stderr, 'ERROR: Program not able to load modfile. Please check that you have installed a compatible version joblib.'
        sys.exit(1)
    f = open(namefile)
    c = 1
    print "#CHROM\tPOS\tREF\tALT\tCODING\tPREDICTION\tSCORE\tFDR\tPhyloP100\tAvgPhyloP100"
    for line in f:
        v = line.rstrip().split(s)
        if len(v) < 4:
            print >> sys.stderr, 'ERROR: Incorrect line ', c, line.rstrip()
            print >> sys.stderr, line
            continue
        (ichr, pos, wt, nw) = v[:4]
        try:
            ipos = int(pos)
        except:
            print >> sys.stderr, 'ERROR: Incorrect genome location', pos, '. Check your input file'
            print >> sys.stderr, 'ERROR:', line
            continue
        lwt = len(wt)
        lnw = len(nw)
        if wt == '-':
            lwt = 1
            lnw += 1
        if nw == '-':
            lwt += 1
            lnw = 1
        n_wt, n_nw, n_pos = parse_variants(ochr, ipos, wt, nw, ucsc_exe,
                                           ucsc_dbs, web, dbfasta, fprog)
        if n_wt == '' or n_nw == '':
            print >> sys.stderr, 'ERROR: Incorrect mutation mapping. Check position', ichr, ipos, wt, nw
            print >> sys.stderr, 'ERROR:', line
            continue
        if 'ACGTN'.find(n_wt) == -1 or 'ACGTN'.find(n_nw) == -1:
            print >> sys.stderr, 'ERROR: Incorrect wild-type or mutant nucleotide', wt, nw
            print >> sys.stderr, 'ERROR:', line
            continue
        if wt == nw or nw.find(',') > -1:
            print >> sys.stderr, 'ERROR: Incorrect wild-type or mutant nucleotide', wt, nw
            print >> sys.stderr, 'ERROR:', line
            continue
        if len(wt) == 1 and len(nw) == 1:
            r_cod = []
            (nuc, seq, seq_input, cons_input,
             r_cod) = get_snv_input(ichr, n_pos, n_wt, n_nw, ucsc_exe,
                                    ucsc_dbs, web, win, dbfasta, dbpps, pklcod,
                                    fprog, cprog)
        else:
            (nuc, seq, seq_input, cons_input,
             r_cod) = get_indel_input(ichr, n_pos, n_wt, n_nw, ucsc_exe,
                                      ucsc_dbs, web, win, dbfasta, dbpps,
                                      pklcod, fprog, cprog)
        if seq == '':
            print >> sys.stderr, 'ERROR: Sequence not found for line', c, ichr, pos
            print >> sys.stderr, 'ERROR:', line
            continue
        if seq_input == []:
            print >> sys.stderr, 'ERROR: Incorrect nucleotide in line ' + str(
                c) + '. Genome location:', ichr, pos
            print >> sys.stderr, 'ERROR:', line
            continue
        if cons_input != []:
            cons_input1 = cons_input[0]
            cons_input2 = cons_input[1]
        else:
            print >> sys.stderr, 'ERROR: Incorrect conservation data for line', c, ichr, pos
            print >> sys.stderr, 'ERROR:', line
            continue
        if cons_input2 == []:
            print >> sys.stderr, 'ERROR: Incorrect conservation data for line', c, ichr, pos
            print >> sys.stderr, 'ERROR:', line
            continue
        if cons_input1 == []:
            cons_input1 = [0.0 for i in range(2 * win + 1)]
            print >> sys.stderr, 'WARNING: PhyloP7 data not found for line', c, ichr, pos

        p_cod = 0
        cod = 'No'
        if r_cod != []:
            p_cod = 1
            cod = 'Yes'
        if len(wt) == 1 and len(nw) == 1 and wt != '-' and nw != '-':
            X = [seq_input + cons_input1 + cons_input2]
            y_pred, y_fdrs, c_pred = prediction(X, model1)
            v_fdr = [y_fdrs[0][0], y_fdrs[0][1]]
        else:
            X = [seq_input + cons_input1 + cons_input2 + [lwt, lnw, p_cod]]
            y_pred, y_fdrs, c_pred = prediction(X, model2)
            v_fdr = [y_fdrs[0][2], y_fdrs[0][3]]
        if y_pred == []:
            print >> sys.stderr, 'WARNING: Variant not scored. Check modfile and input'
            print >> sys.stderr, 'WARNING:', line
            continue
        pp100 = cons_input2[win]
        avgpp100 = sum(cons_input2) / float(len(cons_input2))
        if c_pred[0] == "Pathogenic": d_fdr = v_fdr[0]
        if c_pred[0] == "Benign": d_fdr = v_fdr[1]
        print '\t'.join(
            str(i) for i in [
                ichr, ipos, wt, nw, cod, c_pred[0],
                '%.3f' % y_pred[0],
                '%.3f' % d_fdr, pp100, avgpp100
            ])
    return

Exemple #3

0

Afficher le fichier

Fichier : predict_variants.py Projet : ginnyintifa/PhD-SNPg

def make_prediction(ichr,
                    ipos,
                    wt,
                    nw,
                    modfile,
                    ucsc_exe,
                    ucsc_dbs,
                    web=False,
                    win=2,
                    dbfasta='hg38.2bit',
                    dbpps=['hg38.phyloP7way.bw', 'hg38.phyloP100way.bw'],
                    pklcod='',
                    fprog='twoBitToFa',
                    cprog='bigWigToBedGraph'):
    lwt = len(wt)
    lnw = len(nw)
    if wt == '-':
        lwt = 1
        lnw += 1
    if nw == '-':
        lwt += 1
        lnw = 1
    n_wt, n_nw, n_pos = parse_variants(ochr, ipos, wt, nw, ucsc_exe, ucsc_dbs,
                                       web, dbfasta, fprog)
    if n_wt == '' or n_nw == '':
        print >> sys.stderr, 'ERROR: Incorrect mutation mapping. Check position', ichr, ipos, wt, nw
        sys.exit(1)
    if 'ACGTN'.find(n_wt) == -1 or 'ACGTN'.find(n_nw) == -1:
        print >> sys.stderr, 'ERROR: Incorrect wild-type or mutant nucleotide', wt, nw
        sys.exit(1)
    if pklcod == '':
        (nuc, seq, seq_input, cons_input,
         r_cod) = get_snv_input(ichr, n_pos, n_wt, n_nw, ucsc_exe, ucsc_dbs,
                                web, win, dbfasta, dbpps, pklcod, fprog, cprog)
    else:
        (nuc, seq, seq_input, cons_input,
         r_cod) = get_indel_input(ichr, n_pos, n_wt, n_nw, ucsc_exe, ucsc_dbs,
                                  web, win, dbfasta, dbpps, pklcod, fprog,
                                  cprog)
    if seq == '':
        print >> sys.stderr, 'ERROR: Sequence not found for position', ichr, ipos
        sys.exit(1)
    if seq_input == []:
        print >> sys.stderr, 'ERROR: Incorrect nucleotide in position', ichr, ipos
        sys.exit(1)
    if cons_input.count([]) > 0:
        print >> sys.stderr, 'ERROR: Incorrect conservation data in position', ichr, ipos
        sys.exit(1)
    if cons_input != []:
        cons_input1 = cons_input[0]
        cons_input2 = cons_input[1]
    else:
        print >> sys.stderr, 'ERROR: Incorrect conservation data for position', ichr, ipos
        sys.exit(1)
    if cons_input2 == []:
        print >> sys.stderr, 'ERROR: Incorrect conservation data for position', ichr, pos
        sys.exit(1)
    if cons_input1 == []:
        cons_input1 = [0.0 for i in range(2 * win + 1)]
        print >> sys.stderr, 'WARNING: PhyloP7 data not found for position', ichr, pos

    try:
        model = joblib.load(modfile)
    except:
        print >> sys.stderr, 'ERROR: Program not able to load modfile. Please check that you have installed a compatible version joblib.'
        sys.exit(1)
    p_cod = 0
    cod = 'No'
    if r_cod != []:
        p_cod = 1
        cod = 'Yes'
    if pklcod == '':
        X = [seq_input + cons_input1 + cons_input2]
        y_pred, y_fdrs, c_pred = prediction(X, model)
        v_fdr = [y_fdrs[0][0], y_fdrs[0][1]]
    else:
        X = [seq_input + cons_input1 + cons_input2 + [lwt, lnw, p_cod]]
        y_pred, y_fdrs, c_pred = prediction(X, model)
        v_fdr = [y_fdrs[0][2], y_fdrs[0][3]]
    if y_pred == []:
        print >> sys.stderr, 'WARNING: Variants not scored. Check modfile and input'
        print '\t'.join([str(i) for i in [ichr, ipos, wt, nw]
                         ]) + '\tNA\tNA\tNA\tNA\tNA\tNA'
    else:
        print "#CHROM\tPOS\tREF\tALT\tCODING\tPREDICTION\tSCORE\tFDR\tPhyloP100\tAvgPhyloP100"
        pp100 = cons_input2[win]
        avgpp100 = sum(cons_input2) / float(len(cons_input2))
        if c_pred[0] == "Pathogenic": d_fdr = v_fdr[0]
        if c_pred[0] == "Benign": d_fdr = v_fdr[1]
        print '\t'.join(
            str(i) for i in [
                ichr, ipos, wt, nw, cod, c_pred[0],
                '%.3f' % y_pred[0],
                '%.3f' % d_fdr,
                '%.3f' % pp100,
                '%.3f' % avgpp100
            ])
    return