Ejemplo n.º 1
0
def varpileup_line2vcf_line(pileupfields):
    """Convert one parsed variant-pileup record into a VCF-style data line.

    Args:
        pileupfields: Indexable record. Positions 0-4 and 6-8 are read as
            chromosome, position, reference base, consensus/alt symbol,
            consensus quality, RMS mapping quality, read depth and the
            column handed to ``count_alt``.

    Returns:
        A tab-separated string (no trailing newline) with ten columns:
        chromosome, position, ``.``, ref, alt, mapping quality, ``PASS``,
        ``.``, ``GT:GQ:DP:AD`` and the matching sample values.
    """
    chrom = str(pileupfields[0])
    pos = str(pileupfields[1])
    ref = str(pileupfields[2])
    alt = str(pileupfields[3])
    consqual = str(pileupfields[4])
    mapqual = str(pileupfields[6])
    depth = str(pileupfields[7])
    alt_count = str(count_alt(depth, pileupfields[8]))

    # A symbol listed in HETERO is reported as heterozygous and replaced by
    # the base returned from hetero2homo; anything else is homozygous alt.
    genotype = '1/1'
    if fu.isOnTheList(HETERO.keys(), alt):
        genotype = '0/1'
        alt = hetero2homo(ref, alt)

    # Pileup quality columns:
    # 4 - Phred-scaled likelihood that the genotype is wrong ("consensus
    #     quality"); emitted as GQ.
    # 5 - Phred-scaled likelihood that the genotype is identical to the
    #     reference ("SNP quality"); not emitted. With reference A and reads
    #     of 17 G / 3 A, consensus quality is low (A/G vs G/G is hard to
    #     tell apart) while SNP quality is high (strong evidence of a SNP).
    # 6 - root mean square (RMS) mapping quality; emitted in the sixth
    #     output column.
    sample = ':'.join((genotype, consqual, depth, alt_count))
    return '\t'.join(
        (chrom, pos, '.', ref, alt, mapqual, 'PASS', '.', 'GT:GQ:DP:AD',
         sample))
Ejemplo n.º 2
0
def varpileup_line2vcf_line(pileupfields):
    """Convert one parsed variant-pileup record into a VCF-style data line.

    Args:
        pileupfields: Indexable record. Positions 0-4 and 6-8 are read as
            chromosome, position, reference base, consensus/alt symbol,
            consensus quality, RMS mapping quality, read depth and the
            column handed to ``count_alt``.

    Returns:
        A tab-separated string (no trailing newline) with ten columns:
        chromosome, position, ``.``, ref, alt, mapping quality, ``PASS``,
        ``.``, ``GT:GQ:DP:AD`` and the matching sample values.
    """
    chrom = str(pileupfields[0])
    pos = str(pileupfields[1])
    ref = str(pileupfields[2])
    alt = str(pileupfields[3])
    consqual = str(pileupfields[4])
    mapqual = str(pileupfields[6])
    depth = str(pileupfields[7])
    alt_count = str(count_alt(depth, pileupfields[8]))

    # Symbols found in HETERO mark a heterozygous call; they are swapped for
    # the base chosen by hetero2homo. Everything else stays homozygous alt.
    genotype = '1/1'
    if fu.isOnTheList(HETERO.keys(), alt):
        genotype = '0/1'
        alt = hetero2homo(ref, alt)

    # Pileup quality columns:
    # 4 - Phred-scaled likelihood that the genotype is wrong ("consensus
    #     quality"); emitted as GQ.
    # 5 - Phred-scaled likelihood that the genotype is identical to the
    #     reference ("SNP quality"); not emitted. With reference A and reads
    #     of 17 G / 3 A, consensus quality is low (A/G vs G/G is hard to
    #     tell apart) while SNP quality is high (strong evidence of a SNP).
    # 6 - root mean square (RMS) mapping quality; emitted in the sixth
    #     output column.
    sample = ':'.join((genotype, consqual, depth, alt_count))
    columns = (chrom, pos, '.', ref, alt, mapqual, 'PASS', '.',
               'GT:GQ:DP:AD', sample)
    return '\t'.join(columns)
Ejemplo n.º 3
0
def hetero2homo(ref, alt):
    """Resolve a heterozygous pileup symbol to a single base.

    ``alt`` is returned unchanged when it is not reported as a key of
    ``HETERO``. Otherwise the ``HETERO`` entry is read as a string: if its
    first character equals ``ref`` the second character is returned,
    else the first character is returned.
    """
    listed = fu.isOnTheList(HETERO.keys(), alt)
    # The explicit comparison is intentional: only a result that compares
    # equal to False takes the pass-through path.
    if listed == False:
        return alt

    bases = str(HETERO[alt])
    return bases[1] if str(ref) == bases[0] else bases[0]
Ejemplo n.º 4
0
def hetero2homo(ref, alt):
    """Pick the non-reference base encoded by a heterozygous symbol.

    When ``alt`` is not reported as a key of ``HETERO`` it is handed back
    untouched. Otherwise the string form of ``HETERO[alt]`` is inspected:
    its second character is returned if the first one matches ``ref``,
    and its first character is returned in every other case.
    """
    # Keep the equality test against False (rather than ``not``) so the
    # pass-through happens for exactly the same helper results as before.
    if fu.isOnTheList(HETERO.keys(), alt) == False:
        return alt

    pair = str(HETERO[alt])
    if pair[0] == str(ref):
        return pair[1]
    return pair[0]
Ejemplo n.º 5
0
def varpileup_line2vcf_line(pileupfields):
    """Convert one parsed variant-pileup record into a VCF-style data line.

    Args:
        pileupfields: Indexable record. Positions 0-4 and 6-8 are read as
            chromosome, position, reference base, consensus/alt symbol,
            consensus quality, RMS mapping quality, read depth and the
            column handed to ``count_alt``. Position 5 is not used.

    Returns:
        A tab-separated string (no trailing newline) with ten columns:
        chromosome, position, ``.``, ref, alt, mapping quality, ``PASS``,
        ``.``, ``GT:GQ:DP:AD`` and the matching sample values
        (genotype, consensus quality, depth, alt count).
    """
    chrom = str(pileupfields[0])
    pos = str(pileupfields[1])
    ref = str(pileupfields[2])
    alt = str(pileupfields[3])
    consqual = str(pileupfields[4])
    mapqual = str(pileupfields[6])
    depth = str(pileupfields[7])
    alt_count = str(count_alt(depth, pileupfields[8]))

    # Symbols found in HETERO mark a heterozygous call; they are swapped for
    # the base chosen by hetero2homo. Everything else stays homozygous alt.
    genotype = '1/1'
    if fu.isOnTheList(HETERO.keys(), alt):
        genotype = '0/1'
        alt = hetero2homo(ref, alt)

    sample = ':'.join((genotype, consqual, depth, alt_count))
    return '\t'.join(
        (chrom, pos, '.', ref, alt, mapqual, 'PASS', '.', 'GT:GQ:DP:AD',
         sample))
Ejemplo n.º 6
0
def addOverlapWithGadAll(vcf,
                         format='vcf',
                         table='gadAll',
                         tmpextin='',
                         tmpextout='.1',
                         sep='\t'):
    """Append overlapping ``table`` records to each variant line of a file.

    Reads ``vcf + tmpextin`` and writes ``vcf + tmpextout``. Lines starting
    with ``##``, ``CHROM`` or ``#CHROM`` are copied through. For every other
    line the chromosome and position columns (located through
    ``getFormatSpecificIndices``) are used to query ``table`` for rows whose
    ``chromStart``/``chromEnd`` range contains the position. Each distinct
    value of result column 3 is appended to field 7 of the line as
    ``<table>=<value>``, separated by ``;``. Lines without a hit are copied
    through unchanged. A summary line is appended to ``vcf + '.count.log'``.

    Args:
        vcf: Base path of the variant file.
        format: Passed to ``getFormatSpecificIndices`` to locate columns.
        table: Database table to query; also used as the annotation key.
        tmpextin: Suffix appended to ``vcf`` to form the input path.
        tmpextout: Suffix appended to ``vcf`` to form the output path.
        sep: Separator used to split input lines into fields.

    Returns:
        None. Results are written to the output and log files.
    """
    basefile = vcf
    vcf = basefile + tmpextin
    outfile = basefile + tmpextout
    logcountfile = basefile + '.count.log'

    var_count = 0   # matching database rows seen, duplicates included
    line_count = 0  # input lines with at least one matching row

    # Context managers guarantee the files are closed even when a query or
    # a malformed line raises part-way through.
    with open(outfile, "w") as fh_out, open(vcf) as fh, \
            open(logcountfile, 'a') as fh_log:
        inds = getFormatSpecificIndices(format=format)
        conn = u.db_connect()
        try:
            cursor = conn.cursor()

            for line in fh:
                line = line.strip()
                # Meta lines and the column header are copied verbatim.
                if line.startswith(("##", 'CHROM', '#CHROM')):
                    fh_out.write(line + '\n')
                    continue

                fields = line.split(sep)
                chrom = fields[inds[0]].strip()
                # For some reason this table has no "chr" preceding the
                # chromosome number.
                if chrom.startswith("chr"):
                    chrom = chrom.replace("chr", "")
                pos = fields[inds[1]].strip()

                # NOTE(review): the query is assembled from file content by
                # string concatenation. Switch to driver-side parameter
                # binding once the paramstyle of u.db_connect() is confirmed.
                sql = 'select * from ' + table + ' where chromosome="' + \
                    str(chrom) + '" AND (chromStart <= ' + str(pos) + \
                    ' AND ' + str(pos) + ' <= chromEnd);'
                cursor.execute(sql)
                rows = cursor.fetchall()

                if not rows:
                    fh_out.write(line + '\n')
                    continue

                line_count += 1
                seen = []
                records = []
                for row in rows:
                    var_count += 1
                    value = str(row[3])
                    if not fu.isOnTheList(seen, value):
                        seen.append(value)
                        records.append(str(table) + '=' + value)

                annotation = ';'.join(records)
                if str(fields[7]).endswith(';'):
                    fields[7] = fields[7] + annotation
                else:
                    fields[7] = fields[7] + ';' + annotation
                # Plain tab separator: the former '\t ' put a stray space in
                # front of every column after the first.
                fh_out.write('\t'.join(fields) + '\n')
        finally:
            conn.close()

        fh_log.write(
            f"In {table}: {var_count} in {line_count} variants\n")
Ejemplo n.º 7
0
def addOverlapWitHUGOGeneNomenclature(vcf,
                                      format='vcf',
                                      table='hugo',
                                      tmpextin='',
                                      tmpextout='.1',
                                      sep='\t'):
    """Append overlapping ``table`` gene records to each variant line.

    Reads ``vcf + tmpextin`` and writes ``vcf + tmpextout``. Lines starting
    with ``##``, ``CHROM`` or ``#CHROM`` are copied through. For every other
    line the chromosome (prefixed with ``chr`` when missing) and position
    columns, located through ``getFormatSpecificIndices``, are used to query
    ``table`` for rows whose ``chromStart``/``chromEnd`` range contains the
    position. For each distinct ``row[5],row[6]`` pair an
    ``HGNC_GeneAnnotation=<pair>`` entry is built; the entries are joined
    with ``,`` (any ``;`` inside them becomes ``,``) and appended to field 7
    of the line. Lines without a hit are copied through unchanged. A summary
    line is appended to ``vcf + '.count.log'``.

    Args:
        vcf: Base path of the variant file.
        format: Passed to ``getFormatSpecificIndices`` to locate columns.
        table: Database table to query.
        tmpextin: Suffix appended to ``vcf`` to form the input path.
        tmpextout: Suffix appended to ``vcf`` to form the output path.
        sep: Separator used to split input lines into fields.

    Returns:
        None. Results are written to the output and log files.
    """
    basefile = vcf
    vcf = basefile + tmpextin
    outfile = basefile + tmpextout
    logcountfile = basefile + '.count.log'

    var_count = 0   # matching database rows seen, duplicates included
    line_count = 0  # input lines with at least one matching row

    # Context managers guarantee the files are closed even when a query or
    # a malformed line raises part-way through.
    with open(outfile, "w") as fh_out, open(vcf) as fh, \
            open(logcountfile, 'a') as fh_log:
        inds = getFormatSpecificIndices(format=format)
        conn = u.db_connect()
        try:
            cursor = conn.cursor()

            for line in fh:
                line = line.strip()
                # Meta lines and the column header are copied verbatim.
                if line.startswith(("##", 'CHROM', '#CHROM')):
                    fh_out.write(line + '\n')
                    continue

                fields = line.split(sep)
                chrom = fields[inds[0]].strip()
                # The query below matches on "chr"-prefixed names.
                if not chrom.startswith("chr"):
                    chrom = "chr" + chrom
                pos = fields[inds[1]].strip()

                # NOTE(review): the query is assembled from file content by
                # string concatenation. Switch to driver-side parameter
                # binding once the paramstyle of u.db_connect() is confirmed.
                sql = 'select * from ' + table + ' where chrom="' + \
                    str(chrom) + '" AND (chromStart <= ' + str(pos) + \
                    ' AND ' + str(pos) + ' <= chromEnd);'
                cursor.execute(sql)
                rows = cursor.fetchall()

                if not rows:
                    fh_out.write(line + '\n')
                    continue

                line_count += 1
                seen = []
                records = []
                for row in rows:
                    var_count += 1
                    pair = (str(row[5]) + ',' + str(row[6])).strip()
                    if not fu.isOnTheList(seen, pair):
                        seen.append(pair)
                        records.append('HGNC_GeneAnnotation' + '=' + pair)

                # ';' separates entries in field 7, so it must not appear
                # inside the appended annotation text.
                records_str = ','.join(records).replace(';', ',')

                if str(fields[7]).endswith(';'):
                    fields[7] = fields[7] + records_str
                else:
                    fields[7] = fields[7] + ';' + records_str
                fh_out.write('\t'.join(fields) + '\n')
        finally:
            conn.close()

        fh_log.write(
            f"In {table}: {var_count} in {line_count} variants\n")