def __init__(self, vcf_file, fast_forward=0) :
    """Open vcf_file and set up the state used to iterate over it.

    vcf_file     -- path of the file to read; it is opened in "rb" mode and
                    the handle is kept on self.fin (nothing here closes it).
    fast_forward -- handed unchanged to self.iterate(); its meaning is
                    defined there (presumably a number of rows to skip --
                    TODO confirm).
    """
    # Both reader options start switched off.
    self.allow_absent = False
    self.group_repeats = False

    # Column-name -> index mapping shared with the rest of the broad code.
    column_map = broad.COLUMN_MAP
    self.indexOf = column_map
    globes.printColumnWarning( vcf_file, column_map )

    handle = open( vcf_file, "rb" )
    self.fin = handle
    # NOTE(review): getPatients appears to consume the header lines as well
    # as returning the patient names -- confirm against broad.getPatients.
    self.patients = broad.getPatients( handle )

    self.iterator = self.iterate( fast_forward )
def parseForINDEL( indel_file ) :
    """Rewrite the indel rows of indel_file as a sift input CSV.

    For every data row one line "chrom,start,end,strand,allele" is written to
    <globes.DATA_DIR>/intermediate_data/sift/input/<name>_sift_input.csv,
    where <name> is the file name of indel_file (directories removed) up to
    its first '.'.

    indel_file -- path of a tab-separated file readable by
                  broad.getPatients / broad.COLUMN_MAP.

    Raises:
        Exception      -- the strand annotation is neither '+' nor '-'.
        AssertionError -- a row is neither a simple insertion (1-base ref,
                          longer mut) nor a simple deletion (longer ref,
                          1-base mut).
    """
    print(indel_file)

    # Strip the directories BEFORE looking for the first '.', so a dot in a
    # directory name ("./data/x.vcf") cannot swallow the file name, and a
    # name without any '.' no longer fails to unpack.
    fname = indel_file.split('/')[-1].split('.',1)[0]
    out_path = "%s/intermediate_data/sift/input/%s_sift_input.csv" \
               % (globes.DATA_DIR, fname)

    indexOf = broad.COLUMN_MAP
    col_keys = ['chrom','pos','mut','ref']
    # Info keys that may carry the transcript strand, in order of preference.
    strand_keys = ["refseq.transcriptStrand", "refseq.transcriptStrand_1"]

    # 'with' closes both files even when a row raises part-way through.
    with open( indel_file ) as fin :
        with open( out_path, 'wb' ) as fout :
            print(fout)

            #fast-forward through header lines (the returned patients are
            #not needed here)
            broad.getPatients( fin )

            for dataline in fin :
                splt = dataline.strip().split('\t')
                chrom,pos,mut,ref = [ splt[ indexOf[k] ] for k in col_keys ]
                dinfo = broad.makeInfoDict( splt[ indexOf["info"] ] )

                #dont have it guess '+'
                strand = '+'
                for key in strand_keys :
                    try :
                        strand = dinfo[key]
                    except KeyError :
                        continue
                    break

                if strand == "+" :
                    strand = 1
                elif strand == "-" :
                    strand = -1
                else :
                    raise Exception("Strand is not + or - ??")

                isInsertion = len(ref) == 1 and len(mut) > 1
                isDeletion = len(ref) > 1 and len(mut) == 1
                if isInsertion :
                    start = int(pos)
                    end = start
                    allele = mut
                elif isDeletion :
                    start = int(pos)
                    end = start + (len(ref)-len(mut))
                    allele = '-/'
                else :
                    # Raised explicitly (not 'assert False') so the check
                    # still runs under python -O instead of writing stale
                    # or undefined start/end/allele values.
                    raise AssertionError(
                        "row is neither an insertion nor a deletion: "
                        "%s:%s ref=%r mut=%r" % (chrom,pos,ref,mut) )

                fout.write( "%s,%d,%d,%d,%s\n" \
                            % (chrom,start,end,strand,allele) )
def separateOutputToFamilies() :
    """Split the seattle indel output table into one TSV per patient.

    Reads <globes.INT_DIR>/seattle/output/indel_output.tsv together with the
    VCF it was produced from (<globes.INT_DIR>/seattle/input/indel_input.vcf)
    and writes <globes.OUT_DIR>/indels_by_fam/<patient>.tsv for every patient
    returned by broad.getPatients ('/' in a patient name becomes '-').

    Every file starts with the output table's header plus an extra
    "originalBroadCall" column inserted just before "sampleAlleles".  An
    output row is then copied to the file of each patient whose genotype
    counts as mutated, with "sampleGenotype" replaced by that patient's own
    genotype and that patient's call column from the matching VCF row placed
    in "originalBroadCall".

    Raises:
        IndexError     -- the VCF ends before a row with the same
                          chromosome/position as the output row is found.
        AssertionError -- more than one VCF row had to be skipped to reach
                          the matching one, or an output row is not mutated
                          in exactly one patient.
    """
    vcf_path = "%s/seattle/input/indel_input.vcf" % (globes.INT_DIR)
    tsv_path = "%s/seattle/output/indel_output.tsv" % (globes.INT_DIR)

    # NOTE(review): indexOf is never assigned in this function, so it has to
    # be a module-level name -> column mapping for the seattle output table
    # (it is NOT broad.COLUMN_MAP, which is used separately below) -- confirm.
    bp = indexOf["sampleAlleles"]
    gt_col = indexOf["sampleGenotype"]
    row_cols = [ indexOf[c] for c in
                 ["chromosome","position","refBase","sampleGenotype"] ]

    chrom_col = broad.COLUMN_MAP["chrom"]
    pos_col = broad.COLUMN_MAP["pos"]
    calls_col = broad.COLUMN_MAP["calls"]

    fouts = []
    try :
        # Both inputs are opened (once each) before any output file is
        # created, and every handle is closed even if a row fails.
        with open( vcf_path ) as finin :
            with open( tsv_path ) as fin :
                # NOTE(review): getPatients is relied on to leave finin at
                # the first data row -- confirm against broad.getPatients.
                patients = broad.getPatients( finin )
                for pat in patients :
                    fouts.append(
                        open( "%s/indels_by_fam/%s.tsv" \
                              % (globes.OUT_DIR, pat.replace('/','-')),
                              'wb' ) )

                #so I can re-get out the original read data
                finin_splt = _splitNextLine( finin )

                column_splt = fin.readline().strip().split('\t')
                column_splt = column_splt[:bp] + ["originalBroadCall"] \
                              + column_splt[bp:]
                new_columns = "\t".join( column_splt )
                for fout in fouts :
                    fout.write( "%s\n" % new_columns )

                for line in fin :
                    #ignore the comment lines at the end
                    if '#' in line : continue

                    #get the necessary column values
                    splt = line.strip().split('\t')
                    chrom,pos,refBase,sampleGTs = [ splt[c] for c in row_cols ]
                    sampleGTs = sampleGTs.split(',')

                    #find line in input file that corresponds to the
                    #output line
                    num_incs = 0
                    while True :
                        if finin_splt is None :
                            raise IndexError(
                                "input VCF ended before a row matching "
                                "%s:%s was found" % (chrom,pos) )
                        if pos == finin_splt[pos_col] \
                           and chrom == finin_splt[chrom_col] :
                            break
                        num_incs += 1
                        finin_splt = _splitNextLine( finin )
                    finin_calls = finin_splt[calls_col:]

                    #The output may have multiple lines for each input
                    #line, corresponding to the different transcripts.
                    #This means that if 'line' no longer matches the
                    #finin line, we should only have to jump next once.
                    #(explicit raise so the check survives python -O)
                    if num_incs > 1 :
                        raise AssertionError(
                            "skipped %d input rows to reach %s:%s"
                            % (num_incs,chrom,pos) )

                    isMutated = _mutationTest( refBase )
                    num_mutations = 0
                    for i,(fout,gt) in enumerate( zip(fouts,sampleGTs) ) :
                        if not isMutated(gt) : continue
                        num_mutations += 1
                        # Report this patient's genotype rather than the
                        # comma-separated list of every patient's genotype.
                        splt[gt_col] = gt
                        newline = "\t".join( splt[:bp] + [ finin_calls[i] ] \
                                             + splt[bp:] )
                        fout.write( "%s\n" % newline )

                    #because of indel.indelUniqueToDisease
                    if num_mutations != 1 :
                        raise AssertionError(
                            "%s:%s is mutated in %d patients, expected 1"
                            % (chrom,pos,num_mutations) )
    finally :
        for fout in fouts :
            fout.close()


def _splitNextLine( handle ) :
    """Return the next line of handle split on tabs, or None at end of file."""
    line = handle.readline()
    if not line :
        return None
    return line.strip().split('\t')


def _mutationTest( refBase ) :
    """Return a predicate telling whether an output-file genotype is a mutation."""
    if '-' in refBase :
        # A refBase containing '-' is treated as an insertion: the genotype
        # is unmutated when both alleles equal the part left of the '-'.
        left,_right = refBase.split('-')
        unmutated = '%s/%s' % (left,left)
        return lambda gt : gt != unmutated and gt != "N/N"
    return lambda gt : refBase not in gt.split('/')[1] and gt != 'N/N'