def parseForINDEL( indel_file ) :
    """Convert the indel records in indel_file into a SIFT input CSV.

    Every non-blank data line is split on tabs and written as one row of the
    form ``chrom,start,end,strand,allele`` to
    ``<globes.DATA_DIR>/intermediate_data/sift/input/<fname>_sift_input.csv``,
    where <fname> is the last path component of indel_file up to its first
    '.'.

    The strand is read from the "refseq.transcriptStrand" info key, then from
    "refseq.transcriptStrand_1", and defaults to '+' when neither is present.
    It is written as 1 for '+' and -1 for '-'.

    Raises:
        AssertionError: if a record is neither an insertion (one-base ref,
            longer mut) nor a deletion (longer ref, one-base mut).
        Exception: if the annotated strand is neither '+' nor '-'.
    """
    print(indel_file)

    # Isolate the file name before cutting at the first '.', so that dots in
    # directory components (e.g. "./data/x.vcf") cannot truncate or empty it.
    fname = indel_file.split('/')[-1].split('.',1)[0]
    out_path = "%s/intermediate_data/sift/input/%s_sift_input.csv" \
               % (globes.DATA_DIR, fname)

    indexOf = broad.COLUMN_MAP
    col_keys = ['chrom','pos','mut','ref']
    strand_keys = ("refseq.transcriptStrand", "refseq.transcriptStrand_1")

    # Context managers guarantee both handles are closed even when a record
    # raises part-way through the file.
    with open( indel_file ) as fin :
        with open( out_path, 'wb' ) as fout :
            print(fout)

            #fast-forward through header lines (return value is not needed)
            broad.getPatients( fin )

            for dataline in fin :
                stripped = dataline.strip()
                if not stripped :
                    # tolerate blank lines, e.g. a trailing newline at EOF
                    continue
                splt = stripped.split('\t')
                chrom,pos,mut,ref = [ splt[ indexOf[k] ] for k in col_keys ]
                dinfo = broad.makeInfoDict( splt[ indexOf["info"] ] )

                #dont have it guess '+'
                strand = '+'
                for key in strand_keys :
                    try :
                        strand = dinfo[key]
                    except KeyError :
                        continue
                    break

                if strand == "+" :
                    strand = 1
                elif strand == "-" :
                    strand = -1
                else :
                    raise Exception("Strand is not + or - ??")

                isInsertion = len(ref) == 1 and len(mut) > 1
                isDeletion = len(ref) > 1 and len(mut) == 1
                start = int(pos)
                if isInsertion :
                    end = start
                    allele = mut
                elif isDeletion :
                    end = start + (len(ref)-len(mut))
                    allele = '-/'
                else :
                    # Explicit raise rather than `assert False`: asserts are
                    # stripped under -O, which would let stale values from
                    # the previous record be written out.
                    raise AssertionError(
                        "record is neither an insertion nor a deletion: "
                        "ref=%s mut=%s" % (ref, mut) )

                fout.write( "%s,%d,%d,%d,%s\n"
                            % (chrom,start,end,strand,allele) )
def integrator( self, target, splts ) :
    """Build a Variant from the single split data line held in splts.

    Parameters:
        target: not used by this implementation.
            NOTE(review): presumably kept so the signature matches other
            integrators -- confirm against the caller.
        splts: sized iterable containing exactly one split line (a sequence
            of column strings).

    Returns:
        Variant( fields, base_calls ) where
          * base_calls holds a BaseCall(split_call, patient_index) for every
            call column (from broad.CALL_START onward) whose genotype
            satisfies broad.isMutated or broad.noInf;
          * fields maps each key of broad.COLUMN_MAP to its column value,
            except: "chrom" is passed through globes.chromNum, "info"
            contributes only fields["AF"], and "dbSNP" is stored as the
            integer of the first rs number (omitted when the column is '.').
            fields['strand'] is always True.

    Raises:
        AssertionError: if splts does not contain exactly one line, or if
            the dbSNP column is neither '.' nor starts with 'rs'.
        KeyError: if the info column has no "AF" entry.
    """
    # The original wrote `assert "len isn't right"`, which can never fail
    # because a non-empty string is always true; enforce the check for real.
    if len(splts) != 1 :
        raise AssertionError("len isn't right")
    # exactly one element; next(iter(...)) avoids requiring indexing support
    splt = next(iter(splts))

    ##TODO generalize this to make it vendor independent, call start column is feature of VCF, not broad???
    calls = splt[ broad.CALL_START: ]
    base_calls = []
    for pat_ix,c in enumerate(calls) :
        sc = broad.splitCall(c)
        gt = broad.convertGT( sc )
        if broad.isMutated( gt ) or broad.noInf( gt ) :
            base_calls.append( BaseCall(sc,pat_ix) )

    fields = {}
    for k in broad.COLUMN_MAP :
        if k == "chrom" :
            fields[k] = globes.chromNum( splt[self.indexOf[k]] )
        elif k == "info" :
            ##TODO generalize this to make it vendor independent
            dinfo = broad.makeInfoDict( splt[ self.indexOf[k] ] )
            fields["AF"] = dinfo["AF"]
        elif k == "dbSNP" :
            value = splt[self.indexOf[k]]
            if value == '.' :
                #when we getFields, 'dbsnp' will be missing and yield null
                pass
            elif value.startswith('rs') :
                #right now just take the first rs number if multiple;
                #only that one is parsed, so later ids cannot break the record
                first_id = value.split(';')[0]
                fields[k] = int( first_id.strip()[2:] )
            else :
                print("malformed rs number? %s" % (splt,))
                # explicit raise: `assert False` would vanish under -O and
                # silently drop the field
                raise AssertionError("malformed rs number: %s" % (value,))
        else :
            fields[k] = splt[ self.indexOf[k] ]

    #according to: http://www.broadinstitute.org/gsa/wiki/index.php/Understanding_the_Unified_Genotyper's_VCF_files
    #ref and alt are always given for the forward strand
    fields['strand'] = True
    return Variant( fields, base_calls )