Example #1
import gzip
import math
import os
import sys

# FileUtils, Fasta, Variant and AnalyzeSequence are helper modules shipped
# with Dindel; all examples on this page assume they are importable.
import FileUtils
import Fasta
import Variant
import AnalyzeSequence


def loadGLFFiles(inputGLFFiles=''):

    glffiles = []
    fg = open(inputGLFFiles, mode='r')
    for line in fg.readlines():
        glffiles.append(line.rstrip("\n").split()[0])
    fg.close()

    fp_to_fname = {}

    for glffile in glffiles:
        if not os.path.exists(glffile):
            sys.stderr.write("File %s does not exist\n" % glffile)
            continue
        fg = FileUtils.FileWithHeader(fname=glffile, mode='r')
        while True:
            dat = fg.readline()
            if dat == {}:
                # reached end of file without finding a realigned position
                break
            if dat['realigned_position'] != 'NA':
                firstpos = int(dat['realigned_position'])
                if firstpos in fp_to_fname:
                    raise NameError('Duplicate first position %d in %s' %
                                    (firstpos, glffile))

                fp_to_fname[firstpos] = glffile
                break
        fg.close()

    newglffiles = []
    for pos in sorted(fp_to_fname.keys()):
        print "pos:", pos, "glffile:", fp_to_fname[pos]
        newglffiles.append(fp_to_fname[pos])

    return newglffiles
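
These examples use FileUtils.FileWithHeader throughout; the class ships with
Dindel and is not reproduced on this page. The sketch below captures only the
interface the examples rely on (the behaviour is inferred from the call sites
above and below, not taken from the original source):

class FileWithHeader(object):
    # Assumed interface: a text file whose first line holds column labels;
    # records can be read back as dicts or as plain field lists.
    def __init__(self, fname='', mode='r', joinChar=' '):
        self.f = open(fname, mode)
        self.joinChar = joinChar
        self.labels = self.f.readline().rstrip("\n").split(self.joinChar)

    def readlineList(self):
        # one record as a list of fields; [] at end of file
        line = self.f.readline()
        if line == '':
            return []
        return line.rstrip("\n").split(self.joinChar)

    def readline(self):
        # one record as a {column label: value} dict; {} at end of file
        dat = self.readlineList()
        return dict(zip(self.labels, dat)) if dat else {}

    def close(self):
        self.f.close()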
Example #2
def getCalls(callFile=''):

    vcf = FileUtils.FileWithHeader(fname=callFile, mode='r', joinChar="\t")

    calls = {}
    numcalls = 0
    while True:
        dat = vcf.readline()
        if dat == {}:
            break

        if dat['FILTER'] == "PASS" or (dat['FILTER'] == "q20"
                                       and float(dat['QUAL']) >= 10):
            chrom = dat['CHROM']
            pos = int(dat['POS'])
            ref = dat['REF']
            alt = dat['ALT']
            if alt.find(',') != -1:
                raise NameError("Cannot deal with these entries")

            var = Variant.Variant4(ref=ref, alt=alt)
            # note: newpos must be a zero-based position
            newpos = pos + var.offset - 1
            newstr = var.str

            if chrom not in calls:
                calls[chrom] = {}
            if newpos not in calls[chrom]:
                calls[chrom][newpos] = {}
            if newstr in calls[chrom][newpos]:
                raise NameError('Multiple identical variants at %s:%d' %
                                (chrom, newpos))

            calls[chrom][newpos][newstr] = dat.copy()
            numcalls += 1

    vcf.close()
    print "Number of calls imported:", numcalls
    return calls
def processPooledGLFFiles(bamFilesFile='',
                          glfFilesFile='',
                          refFile='',
                          outputVCFFile='',
                          maxHPLen=10,
                          minForwardReverse=1,
                          minDist=10,
                          dbSNPWindow=50,
                          newVarCov=False,
                          doNotFilterOnFR=False,
                          filterQual=20,
                          numSamples=1,
                          numBamFiles=1):
    coverageRange = [20, 10000]

    # read file with glf files
    allFiles = []
    headerLabels = []

    f = open(glfFilesFile, 'r')
    idx = 0
    for line in f.readlines():
        idx += 1
        dat = line.rstrip("\n").split()
        for gf in dat:
            if not os.path.exists(gf):
                sys.stderr.write("WARNING: GLF file %s does not exist.\n" % gf)
            else:
                if os.path.splitext(gf)[-1] == '.gz':
                    # read gzipped GLF files in text mode
                    fgf = gzip.open(gf, 'rt')
                else:
                    fgf = open(gf, 'r')
                # separate name so the outer loop variable is not shadowed
                headerLine = fgf.readline()
                if headerLine == '':
                    sys.stderr.write("WARNING: GLF file %s is empty.\n" % gf)
                else:
                    d = headerLine.rstrip("\n").split()
                    if headerLabels == []:
                        headerLabels = d[:]
                        allFiles.append(gf)
                    else:
                        if d != headerLabels:
                            sys.stderr.write(
                                "Inconsistent header in GLF file %s\n" % gf)
                        else:
                            allFiles.append(gf)

                fgf.close()

    f.close()

    fa = Fasta.Fasta(fname=refFile)

    # read precall files
    # make hash table [pos][variant][fname]

    numInds = numSamples
    minFreq = 1.0 / (float(2 * numInds) * 5)

    nf = 0

    try:
        realpos_col = headerLabels.index('realigned_position')
        var_col = headerLabels.index('nref_all')

        # apply filters across individuals

        tcFilter = "tc%d" % minDist

        col_num_reads = headerLabels.index('num_reads')

        col_num_forward_old = headerLabels.index('num_cover_forward')
        col_num_reverse_old = headerLabels.index('num_cover_reverse')

        col_num_forward = headerLabels.index('var_coverage_forward')
        col_num_reverse = headerLabels.index('var_coverage_reverse')

        col_post_prob = headerLabels.index('post_prob_variant')
        chr_col = headerLabels.index('tid')
        idx_col = headerLabels.index('indidx')
        ana_col = headerLabels.index('analysis_type')
    except ValueError:
        raise NameError(
            "GLF files are corrupt. Could not find all required columns.")

    pass_filters = {}
    varStat = {}
    nr = 0
    num_pass = 0

    # read depth histo
    rdhist = {}
    for glffile in allFiles:
        fglf = FileUtils.FileWithHeader(fname=glffile, mode='r', joinChar=' ')
        print "Reading", glffile
        done = False
        while True:
            pos = -1
            var = ''
            nr += 1

            if nr % 10000 == 9999:
                print "Number of lines read:", nr + 1

            num_ind_with_data = 0
            tot_coverage = 0
            tot_num_forward = 0
            tot_num_reverse = 0

            tot_num_forward_old = 0
            tot_num_reverse_old = 0

            skip = False

            for fidx in range(0, numBamFiles):
                try:
                    dat = fglf.readlineList()
                except IOError:
                    sys.stderr.write("WARNING: IOError in %s\n" % glffile)
                    done = True
                    break

                if dat == []:
                    done = True
                    break
                if dat[realpos_col] == 'NA':
                    skip = True
                    break
                if dat[ana_col] != "singlevariant":
                    skip = True
                    break

                if dat[idx_col] != 'NA' and int(dat[idx_col]) >= numBamFiles:
                    raise NameError(
                        'Error. Is the number of BAM files correctly specified?'
                    )

                if pos == -1:
                    pos = int(dat[realpos_col])
                    var = dat[var_col]
                    chr = dat[chr_col]
                else:
                    if int(dat[realpos_col]) != pos:
                        raise NameError(
                            'Inconsistent glf files! Is the number of BAM files correctly specified?'
                        )

                if int(dat[idx_col]) != fidx:
                    sys.stderr.write(
                        "Error reading this variant: %s %d %s in %s\n" %
                        (chr, pos, var, glffile))

                tot_num_forward_old += int(dat[col_num_forward_old])
                tot_num_reverse_old += int(dat[col_num_reverse_old])

                if fidx == 0:
                    # only record for first individual
                    tot_num_forward = int(dat[col_num_forward])
                    tot_num_reverse = int(dat[col_num_reverse])

                numreads = int(dat[col_num_reads])
                if numreads > 0:
                    num_ind_with_data += 1

                tot_coverage += numreads
            if skip:
                continue
            if done:
                break

            prob = float(dat[col_post_prob])
            freq = float(dat[headerLabels.index('est_freq')])
            rdhist[tot_coverage] = rdhist.get(tot_coverage, 0) + 1

            if prob > 0.20:
                if chr not in varStat:
                    varStat[chr] = {}
                if pos not in varStat[chr]:
                    varStat[chr][pos] = {}
                # hplen
                seq = fa.get(chr, pos + 1 - 25, 50)
                hplen = AnalyzeSequence.HomopolymerLength(seq=seq, pos=25)

                varStat[chr][pos][var] = {
                    'QUAL': prob,
                    'NF': tot_num_forward,
                    'NR': tot_num_reverse,
                    'NFS': tot_num_forward_old,
                    'NRS': tot_num_reverse_old,
                    'DP': tot_coverage,
                    'NS': num_ind_with_data,
                    'AF': freq,
                    'HP': hplen
                }

            del dat
        # finished reading this one
        fglf.close()
    #print "Number of variants passing filters:", num_pass

    # apply haplotype coverage and other filters

    coverageRange = getPercentiles(rdhist, [1, 99])

    fqp = 1.0 - math.pow(10.0, -float(filterQual) / 10.0)
    fqp_str = "q%d" % filterQual

    for chr in varStat.keys():
        for pos in varStat[chr].keys():
            for varseq, var in varStat[chr][pos].items():

                filters = []
                prob = var['QUAL']
                num_ind_with_data = var['NS']
                hplen = var['HP']
                freq = var['AF']
                tot_coverage = var['DP']
                tot_num_forward = var['NF']
                tot_num_reverse = var['NR']
                if prob < fqp:
                    filters.append(fqp_str)
                if ((tot_num_forward < minForwardReverse
                     or tot_num_reverse < minForwardReverse)
                        and not doNotFilterOnFR):
                    filters.append('fr0')
                if (tot_coverage < coverageRange[0]
                        or tot_coverage > coverageRange[1]):
                    filters.append('ocr')
                if num_ind_with_data < numInds // 2:  # integer division, as in the original
                    filters.append('s50')
                if hplen > maxHPLen:
                    filters.append("hp%d" % (maxHPLen))
                if freq < minFreq:
                    filters.append("mf")

                if filters == []:
                    if chr not in pass_filters:
                        pass_filters[chr] = {}
                    if pos not in pass_filters[chr]:
                        pass_filters[chr][pos] = []
                    pass_filters[chr][pos].append(varseq)
                    num_pass += 1
                    varStat[chr][pos][varseq]['filter'] = ''
                else:
                    varStat[chr][pos][varseq]['filter'] = ';'.join(filters)

    # now visit each chromosome and apply closeness filter
    chromosomes = [str(c) for c in range(1, 23)]
    chromosomes.extend(['X', 'Y'])

    other_chr = list(set(varStat.keys()) - set(chromosomes))
    chromosomes.extend(other_chr)

    # create VCF file
    print "Writing VCF"

    fv = open(outputVCFFile, 'w')
    fv.write("##fileformat=VCFv4.0\n")
    fv.write("##source=Dindel\n")
    fv.write("##reference=%s\n" % refFile)
    fv.write(
        "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">\n"
    )
    fv.write(
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total number of reads in haplotype window\">\n"
    )
    fv.write(
        "##INFO=<ID=HP,Number=1,Type=Integer,Description=\"Reference homopolymer tract length\">\n"
    )
    fv.write(
        "##INFO=<ID=NFS,Number=1,Type=Integer,Description=\"Number of reads covering non-ref variant site on forward strand\">\n"
    )
    fv.write(
        "##INFO=<ID=NRS,Number=1,Type=Integer,Description=\"Number of reads covering non-ref variant site on reverse strand\">\n"
    )
    fv.write(
        "##INFO=<ID=NF,Number=1,Type=Integer,Description=\"Number of reads covering non-ref variant on forward strand\">\n"
    )
    fv.write(
        "##INFO=<ID=NR,Number=1,Type=Integer,Description=\"Number of reads covering non-ref variant on reverse strand\">\n"
    )
    fv.write(
        "##INFO=<ID=AF,Number=-1,Type=Float,Description=\"Allele frequency\">\n"
    )
    fv.write(
        "##INFO=<ID=DB,Number=0,Type=Flag,Description=\"dbSNP membership build 129 - type match and indel sequence length match within %d bp\">\n"
        % dbSNPWindow)
    fv.write("##FILTER=<ID=q%d,Description=\"Quality below %d\">\n" %
             (filterQual, filterQual))
    fv.write(
        "##FILTER=<ID=s50,Description=\"Less than 50% of samples have data\">\n"
    )
    fv.write(
        "##FILTER=<ID=tc%d,Description=\"Indel site was closer than %d base pairs from another site with higher posterior probability\">\n"
        % (minDist, minDist))
    fv.write(
        "##FILTER=<ID=hp%d,Description=\"Reference homopolymer length was longer than %d\">\n"
        % (maxHPLen, maxHPLen))
    if not doNotFilterOnFR:
        fv.write(
            "##FILTER=<ID=fr0,Description=\"Non-ref allele is not covered by at least one read on both strands\">\n"
        )
    fv.write(
        "##FILTER=<ID=ocr,Description=\"Number of reads in haplotype window outside coverage range %d %d\">\n"
        % (coverageRange[0], coverageRange[1]))
    fv.write(
        "##FILTER=<ID=mf,Description=\"Too low non-ref allele frequency\">\n")

    fv.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

    for chr in chromosomes:
        if chr not in pass_filters:
            continue
        # filter out variants that are too close
        totSites = 0
        positions = sorted(pass_filters[chr].keys())
        newPosition = positions[:]

        done = False
        while not done:
            done = True
            for p in range(1, len(positions)):
                if (newPosition[p] != newPosition[p - 1]
                        and newPosition[p] - positions[p - 1] <= minDist):
                    newPosition[p] = newPosition[p - 1]
                    done = False

        newSites = {}
        for p in range(0, len(newPosition)):
            newPos = newPosition[p]
            pos = positions[p]

            if newPos not in newSites:
                newSites[newPos] = {}

            if pos not in newSites[newPos]:
                newSites[newPos][pos] = []

            for var in varStat[chr][pos].keys():
                newSites[newPos][pos].append(var)

        print "New number of sites:", len(newSites.keys())
        print "Number of sites filtered:", len(pass_filters[chr].keys()) - len(
            newSites.keys())

        # select best call for double sites

        filtered = []
        for newPos in newSites.keys():
            old = newSites[newPos].keys()

            pos_probs = []
            pos_vars = []
            pos_pos = []
            for oldPos in old:
                # find the highest-probability variant at this candidate position
                max_prob = -1.0
                max_var = ''
                for var in newSites[newPos][oldPos]:
                    prob = varStat[chr][oldPos][var]['QUAL']
                    if prob > max_prob:
                        max_prob = prob
                        max_var = var
                pos_probs.append(max_prob)
                pos_vars.append(max_var)
                pos_pos.append(oldPos)

            idx = pos_probs.index(max(pos_probs))
            okpos = pos_pos[idx]
            filtered.append(okpos)

            for duppos in set(old) - set([okpos]):
                for var in varStat[chr][duppos].keys():

                    if varStat[chr][duppos][var]['filter'] == '':
                        varStat[chr][duppos][var]['filter'] = tcFilter
                    else:
                        varStat[chr][duppos][var]['filter'] += ';' + tcFilter

        print "Number of indel sites:", len(filtered)

        for pos in sorted(varStat[chr].keys()):
            for var in varStat[chr][pos].keys():

                indel_report_pos = pos
                #refall = fa.get(chr, pos+1, 1)
                qual = -int(10.0 * math.log10(
                    max(1.0 - float(varStat[chr][pos][var]['QUAL']), 1e-10)))
                infofield = []
                for tag in ['AF', 'NS', 'DP', 'HP', 'NF', 'NR', 'NFS', 'NRS']:
                    val = (varStat[chr][pos][var][tag])
                    infofield.append("%s=%s" % (tag, val))

                vnref = Variant.Variant(varString=var)
                max_del_len = 0
                if vnref.type == "del":
                    if vnref.length > max_del_len:
                        max_del_len = vnref.length

                seqlen = 1 + max_del_len
                refseq = ''.join(fa.get(chr, indel_report_pos, seqlen))
                if vnref.type == "del":
                    altseq = refseq[0] + refseq[(1 + vnref.length):]
                elif vnref.type == "ins":
                    altseq = refseq[0] + vnref.seq + refseq[1:]
                elif vnref.type == "snp":
                    indel_report_pos += 1
                    refseq = refseq[1]
                    altseq = vnref.seq[0]

                infostr = ';'.join(infofield)
                filterstr = varStat[chr][pos][var]['filter']
                if filterstr == '':
                    filterstr = 'PASS'
                vcf_id = '.'  # renamed from `id` to avoid shadowing the builtin
                outstr = "%s\t%d\t%s\t%s\t%s\t%d\t%s\t%s\n" % (
                    chr, indel_report_pos, vcf_id, refseq, altseq, qual,
                    filterstr, infostr)
                fv.write(outstr)
    fv.close()
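
The getPercentiles() helper called above is not part of this excerpt. A
minimal sketch of a compatible implementation, assuming it returns the read
depths at the requested percentiles of a {depth: count} histogram:

def getPercentiles(hist, percentiles):
    # total number of observations in the histogram
    total = sum(hist.values())
    result = []
    for perc in percentiles:
        cutoff = total * perc / 100.0
        cum = 0
        value = 0
        for depth in sorted(hist.keys()):
            value = depth
            cum += hist[depth]
            if cum >= cutoff:
                break
        result.append(value)
    return result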
def processDiploidGLFFile(glfFile='',
                          variants=None,
                          refFile='',
                          maxHPLen=10,
                          isHomozygous=False,
                          doNotFilterOnFR=False,
                          newVarCov=False,
                          filterQual=20):

    # use None instead of a mutable default argument; new calls are added
    # to the dict passed in (or to a fresh one) and returned at the end
    if variants is None:
        variants = {}

    # setup reference sequence

    fa = Fasta.Fasta(fname=refFile)

    fglf = FileUtils.FileWithHeader(fname=glfFile)

    numSkipped = 0  # number of windows that were skipped by Dindel

    # read line by line, aggregate results for identical windows

    prevPos = -1
    prevChr = -1
    while True:
        dat = fglf.readline()
        if dat == {}:
            break

        errcode = dat['msg']
        index = dat['index']  # index of window in original variant file

        if errcode != "ok":
            numSkipped += 1
            continue

        if dat['analysis_type'] != 'dip.map':
            continue

        if dat['was_candidate_in_window'] != '1':
            continue

        glf = {}
        chrom = dat['tid']
        if chrom != prevChr:
            prevPos = -1
            prevChr = chrom

        glf['chr'] = dat['tid']
        glf['pos'] = dat['realigned_position']

        pos = int(glf['pos'])

        glf['qual'] = int(float(dat['qual']))

        if float(glf['qual']) < 1.0:
            continue

        glf['nref_all'] = dat['nref_all'].split(',')
        if glf['nref_all'] == ['R=>D']:
            continue
        nfa = dat['var_coverage_forward'].split(',')
        nra = dat['var_coverage_reverse'].split(',')
        ai = 0

        glf['num_cover_forward'] = int(nfa[ai])
        glf['num_cover_reverse'] = int(nra[ai])

        glf['num_cover_forward_old'] = int(dat['num_cover_forward'])
        glf['num_cover_reverse_old'] = int(dat['num_cover_reverse'])

        glf['num_hap_reads'] = dat['num_reads']
        glf['genotype'] = dat['glf']

        (vcf_str, report_pos) = getVCFString(glf=glf,
                                             fa=fa,
                                             filterQual=filterQual)
        if chrom not in variants:
            variants[chrom] = {}

        if report_pos not in variants[chrom]:
            variants[chrom][report_pos] = []

        variants[chrom][report_pos].append(vcf_str)

        prevPos = pos

    fglf.close()
    return variants
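
getVCFString() is defined elsewhere in the Dindel scripts and is not shown
here. The stub below only illustrates its assumed interface (format one GLF
record as a VCF data line and return it together with the 1-based position it
is reported at); the real implementation also rebuilds left-anchored REF/ALT
alleles and applies the same filters as the pooled analysis above:

def getVCFString(glf, fa=None, filterQual=20):
    # illustrative stub, not the original implementation
    chrom = glf['chr']
    pos = int(glf['pos'])  # zero-based realigned position
    qual = glf['qual']
    filterstr = 'PASS' if qual >= filterQual else 'q%d' % filterQual
    refbase = ''.join(fa.get(chrom, pos + 1, 1))
    alt = glf['nref_all'][0]
    info = "DP=%s;NF=%d;NR=%d" % (glf['num_hap_reads'],
                                  glf['num_cover_forward'],
                                  glf['num_cover_reverse'])
    vcf_str = "%s\t%d\t.\t%s\t%s\t%d\t%s\t%s\n" % (
        chrom, pos + 1, refbase, alt, qual, filterstr, info)
    return (vcf_str, pos + 1)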
Example #5
def makeGLF(inputGLFFiles='', outputFile='', callFile='', bamfilesFile=''):

    # get VCF calls

    sys.stdout.write("Reading VCF file\n")
    calls = getCalls(callFile=callFile)
    sys.stdout.write("done\n")
    sys.stdout.flush()

    # read through glf files

    glffiles = loadGLFFiles(inputGLFFiles=inputGLFFiles)

    # get BAMfiles file

    bamfiles = []
    fb = open(bamfilesFile, 'r')
    for line in fb.readlines():
        bamfiles.append(line.rstrip("\n").split()[0])
    fb.close()

    # check each GLF file

    numwritten = 0

    # open output file

    fout = open(outputFile, 'w')

    for glffile in glffiles:
        sys.stdout.write("Checking %s\n" % glffile)

        fg = FileUtils.FileWithHeader(fname=glffile, mode='r')

        buffer = {}
        curr_index = '-1'
        while True:
            dat = fg.readline()
            if dat == {}:
                break

            newindex = "%s.%s.%s" % (dat['index'], dat['realigned_position'],
                                     dat['nref_all'])
            if newindex not in buffer:
                buffer[newindex] = []
            buffer[newindex].append(dat)

            if newindex != curr_index:
                if curr_index != '-1':
                    result = emptyBuffer(index=curr_index,
                                         buffer=buffer,
                                         calls=calls,
                                         outputFileHandle=fout,
                                         bamfiles=bamfiles)

                    if result == "a-ok":
                        numwritten += 1

                curr_index = newindex

        result = emptyBuffer(index=curr_index,
                             buffer=buffer,
                             calls=calls,
                             outputFileHandle=fout,
                             bamfiles=bamfiles)

        if result == "a-ok":
            numwritten += 1

        fg.close()

        print "Number written:", numwritten
        sys.stdout.flush()

    # finish up

    fout.close()
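
A minimal end-to-end usage sketch (all file names are hypothetical
placeholders; emptyBuffer() is assumed to be defined elsewhere in the Dindel
scripts):

if __name__ == '__main__':
    # merge per-window GLF output with a filtered VCF call set
    makeGLF(inputGLFFiles='glf_files.txt',
            outputFile='merged.glf.txt',
            callFile='calls.vcf',
            bamfilesFile='bam_files.txt')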