Example #1
0
    def __init__(self, vcf_file, fast_forward=0) :
        """Open *vcf_file* and prepare the record iterator.

        vcf_file     -- path of the file to read; it is opened in binary
                        mode and the handle is kept on self.fin.
        fast_forward -- passed unchanged to self.iterate (default 0).
        """
        column_map = broad.COLUMN_MAP
        self.indexOf = column_map
        globes.printColumnWarning( vcf_file, column_map )

        # The same handle is handed to broad.getPatients and then retained
        # on the instance.
        handle = open( vcf_file, "rb" )
        self.fin = handle
        self.patients = broad.getPatients( handle )

        # Both option flags start out switched off.
        self.allow_absent = self.group_repeats = False
        self.iterator = self.iterate(fast_forward)
Example #2
0
def parseForINDEL( indel_file ) :
    """Write a SIFT indel input CSV for the records of *indel_file*.

    The output is written to
    <globes.DATA_DIR>/intermediate_data/sift/input/<name>_sift_input.csv,
    where <name> is the basename of *indel_file* up to its first '.'.
    Each data line produces one row "chrom,start,end,strand,allele":

      * insertion (len(ref) == 1 and len(mut) > 1): end equals start and
        allele is the mutant sequence;
      * deletion (len(ref) > 1 and len(mut) == 1): end is start plus the
        length difference between ref and mut, and allele is '-/'.

    The strand is read from the "refseq.transcriptStrand" info field,
    falling back to "refseq.transcriptStrand_1" and finally to '+'; it is
    written as 1 for '+' and -1 for '-'.

    Raises:
        Exception: the strand value is neither '+' nor '-'.
        AssertionError: a record is neither an insertion nor a deletion
            as defined above.
    """
    import os.path

    print(indel_file)

    # Take the name from the basename only, so that dots in directory
    # components ("./x.vcf", "/a.b/x.vcf") cannot truncate it and a
    # dot-less file name no longer fails to unpack.
    fname = os.path.basename( indel_file ).split('.',1)[0]
    out_path = "%s/intermediate_data/sift/input/%s_sift_input.csv" \
                    % (globes.DATA_DIR, fname)

    indexOf = broad.COLUMN_MAP
    col_keys = ['chrom','pos','mut','ref']
    strand_keys = ("refseq.transcriptStrand", "refseq.transcriptStrand_1")

    # Context managers guarantee both handles are closed even when a
    # malformed record raises part-way through the file.
    with open( indel_file ) as fin :
        with open( out_path, 'wb' ) as fout :
            print(fout)

            #fast-forward through header lines (return value not needed)
            broad.getPatients( fin )

            for dataline in fin :
                splt = dataline.strip().split('\t')
                chrom,pos,mut,ref = [ splt[ indexOf[k] ] for k in col_keys ]

                dinfo = broad.makeInfoDict( splt[ indexOf["info"] ] )
                for key in strand_keys :
                    try :
                        strand = dinfo[key]
                        break
                    except KeyError :
                        pass
                else :
                    #dont have it, guess '+'
                    strand = '+'

                if strand == "+" : strand = 1
                elif strand == "-" : strand = -1
                else : raise Exception("Strand is not + or - ??")

                if len(ref) == 1 and len(mut) > 1 :
                    # insertion: zero-length span, report inserted bases
                    span = 0
                    allele = mut
                elif len(ref) > 1 and len(mut) == 1 :
                    # deletion: span covers the removed bases
                    span = len(ref) - len(mut)
                    allele = '-/'
                else :
                    # Raised explicitly (not via `assert`) so the check
                    # survives `python -O`; otherwise values left over from
                    # the previous record would be written out.
                    raise AssertionError(
                        "record is neither an insertion nor a deletion: "
                        "ref=%r mut=%r" % (ref, mut) )

                start = int(pos)
                end = start + span

                fout.write( "%s,%d,%d,%d,%s\n"
                            % (chrom,start,end,strand,allele) )
Example #3
0
def separateOutputToFamilies() :
    """Split the Seattle indel annotation output into one TSV per patient.

    Reads <globes.INT_DIR>/seattle/output/indel_output.tsv in step with
    <globes.INT_DIR>/seattle/input/indel_input.vcf and writes
    <globes.OUT_DIR>/indels_by_fam/<patient>.tsv for every patient returned
    by broad.getPatients ('/' in a patient name is replaced by '-').

    Every output file starts with the annotation header, with an extra
    "originalBroadCall" column inserted in front of "sampleAlleles".  Each
    annotation row is written to the file of each patient whose genotype
    counts as mutated, with "sampleGenotype" replaced by that patient's
    genotype and the patient's call from the input VCF in the new column.

    NOTE(review): `indexOf` is not defined in this function; it is
    presumably a module-level column map for the annotation output --
    confirm.

    Raises:
        AssertionError: more than one input line had to be skipped to find
            the record for an annotation row, or a row is not mutated in
            exactly one patient.
        IndexError: the input VCF ended before the matching record was
            found.
    """
    vcf_path = "%s/seattle/input/indel_input.vcf" % (globes.INT_DIR)
    tsv_path = "%s/seattle/output/indel_output.tsv" % (globes.INT_DIR)

    out_keys = ["chromosome","position","refBase","sampleGenotype"]
    in_keys = ["chrom","pos"]

    fouts = []
    finin = open( vcf_path )
    try :
        # A single getPatients call supplies the patient list and moves
        # finin past the header, so the input is opened only once.
        patients = broad.getPatients( finin )
        for pat in patients :
            fouts.append( open( "%s/indels_by_fam/%s.tsv" \
                                % (globes.OUT_DIR, pat.replace('/','-')),
                                'wb' ) )
        finin_splt = finin.readline().strip().split('\t')

        with open( tsv_path ) as fin :
            column_splt = fin.readline().strip().split('\t')
            bp = indexOf["sampleAlleles"]
            column_splt = column_splt[:bp] + ["originalBroadCall"] \
                          + column_splt[bp:]
            new_columns = "\t".join( column_splt )
            for fout in fouts :
                fout.write( "%s\n" % new_columns )

            for line in fin :
                #ignore the comment lines at the end
                if '#' in line : continue

                #get the necessary column values
                splt = line.strip().split('\t')
                chrom,pos,refBase,sampleGTs = \
                        [ splt[ indexOf[c] ] for c in out_keys ]
                sampleGTs = sampleGTs.split(',')

                #find line in input file that corresponds to the output line
                num_incs = 0
                while True :
                    finin_chrom, finin_pos = \
                            [ finin_splt[ broad.COLUMN_MAP[c] ] for c in in_keys ]
                    if pos == finin_pos and chrom == finin_chrom :
                        break
                    num_incs += 1
                    next_line = finin.readline()
                    if not next_line :
                        # Same exception type the list lookup raised on an
                        # exhausted file, but with a usable message.
                        raise IndexError(
                            "input VCF ended before a record for %s:%s "
                            "was found" % (chrom, pos) )
                    finin_splt = next_line.strip().split('\t')
                finin_calls = finin_splt[ broad.COLUMN_MAP["calls"]: ]

                #The output may have multiple lines for each input line,
                #corresponding to the different transcripts. This means that
                #if 'line' no longer matches the finin_line, we should only
                #have to jump next once
                assert num_incs <= 1, \
                       "skipped %d input lines to reach %s:%s" \
                       % (num_incs, chrom, pos)

                #isMutated takes a GT from the output file and determines
                #if it is a mutation
                if '-' in refBase :
                    # insertion: mutated unless homozygous for the part of
                    # refBase before the '-' (or uncalled)
                    left, _unused = refBase.split('-')
                    hom_ref = '%s/%s' % (left,left)
                    def isMutated( gt, hom_ref=hom_ref ) :
                        return gt != hom_ref and gt != "N/N"
                else :
                    # otherwise: mutated when refBase does not occur in the
                    # second allele (and the genotype is not uncalled)
                    def isMutated( gt, refBase=refBase ) :
                        return refBase not in gt.split('/')[1] \
                               and gt != 'N/N'

                num_mutations = 0
                for i,(fout,gt) in enumerate( zip(fouts,sampleGTs) ) :
                    if isMutated(gt) :
                        num_mutations += 1
                        splt[ indexOf["sampleGenotype"] ] = "%s" % (gt)
                        newline = "\t".join( splt[:bp] + [ finin_calls[i] ] \
                                             + splt[bp:] )
                        fout.write( "%s\n" % newline )

                #because of indel.indelUniqueToDisease
                assert num_mutations == 1, \
                       "%s:%s is mutated in %d patients, expected exactly 1" \
                       % (chrom, pos, num_mutations)
    finally :
        # Close everything that was opened, including on error paths.
        for f in fouts :
            f.close()
        finin.close()