def pickOutFamilies(orig_filename, outdir, family_groups,
                    callToString=lambda x: x,
                    lineFilter=lambda x: True,
                    cols_to_use=range(len(COLUMN_MAP)),
                    ):
    """Split one tab-delimited input file into one output file per group.

    For every key in `family_groups` a file "<outdir>/<name>.vcf" is
    written, where <name> is sanitizePatientName(key).  Each output gets
    the header lines returned by getColumnsAndHeaders, then a tab-joined
    column-name row, then one row per accepted data line.  Every row holds
    the fields selected by `cols_to_use` followed by the fields of the
    input columns whose name appears in that group's member collection.

    Arguments:
      orig_filename -- path of the input file (opened in binary mode)
      outdir        -- output directory; created, with any missing
                       parents, when it does not exist yet
      family_groups -- mapping of group key -> collection of column names
                       belonging to that group
      callToString  -- applied to each group-specific field before it is
                       written (default: identity)
      lineFilter    -- called with the list of fields of a data line; the
                       line is written only when the result is true
      cols_to_use   -- indexes of the columns copied to every output file;
                       any iterable of ints (list, tuple, range, ...)

    Side effects: prints diagnostic information about the column matching
    to stdout and calls globes.printColumnWarning for the input file.
    Returns None.  All file handles are closed even when an error occurs.
    """
    fin = open(orig_filename, "rb")
    fouts = {}
    try:
        # makedirs rather than mkdir so that nested output paths work.
        if not os.path.isdir(outdir):
            os.makedirs(outdir)

        # One output handle and one (initially empty) list of matching
        # input column indexes per group.
        groupIXs = {}
        for group in family_groups:
            safe_group = sanitizePatientName(group)
            fouts[group] = open("%s/%s.vcf" % (outdir, safe_group), 'wb')
            groupIXs[group] = []

        (columns, headers) = getColumnsAndHeaders(fin)
        header_string = "\n".join(headers)

        # Record, per group, the index of every input column whose name is
        # one of the group's members.  The print calls are diagnostic
        # output; a single pre-formatted argument prints the same text
        # under both the print statement and the print function.
        for i, column in enumerate(columns):
            for group in family_groups:
                family_names = family_groups[group]
                print("%s %s" % (column, family_names))
                if column in family_names:
                    groupIXs[group].append(i)
        print(groupIXs)

        # Materialise once so that any iterable can be concatenated with
        # the per-group index lists and reused for every data line.
        shared_cols = list(cols_to_use)

        # Header lines first, then the column-name row for this group.
        for group in family_groups:
            fout = fouts[group]
            fout.write("%s\n" % header_string)
            out_header = '\t'.join(
                [columns[i] for i in shared_cols + groupIXs[group]])
            fout.write("%s\n" % out_header)

        globes.printColumnWarning(orig_filename, COLUMN_MAP)

        # Process the data lines.  Iterating the handle streams the file
        # instead of loading every remaining line into memory at once.
        for dataline in fin:
            # NOTE(review): strip() also removes leading whitespace and
            # trailing tabs, so empty trailing fields are dropped; kept
            # because lineFilter receives exactly this split result.
            splt = dataline.strip().split('\t')
            if not lineFilter(splt):
                continue
            data_string = '\t'.join([splt[ix] for ix in shared_cols])
            for group in family_groups:
                calls = [callToString(splt[ix]) for ix in groupIXs[group]]
                fouts[group].write(
                    "%s\t%s\n" % (data_string, '\t'.join(calls)))
    finally:
        # Close every handle opened so far, whether or not an error occurred.
        for fout in fouts.values():
            fout.close()
        fin.close()
def __init__(self, vcf_file, fast_forward=0):
    """Open `vcf_file` and set up iteration over its contents.

    vcf_file     -- path of the file to read; it is opened in binary mode
                    and the handle is kept on self.fin
    fast_forward -- forwarded to globes.splitIterator as its `burn`
                    argument (presumably the number of leading records to
                    skip -- TODO confirm against globes.splitIterator)

    Sets self.indexOf, self.fin, self.patients, self.allow_absent,
    self.group_repeats and self.iterator.  If reading the patients or
    building the iterator fails, the file is closed before the exception
    propagates so that no handle is leaked.
    """
    self.indexOf = COLUMN_MAP
    globes.printColumnWarning(vcf_file, self.indexOf)

    self.fin = open(vcf_file, "rb")
    try:
        # getPatients reads from the freshly opened handle, so it has to
        # run before the iterator takes over the same handle.
        self.patients = getPatients(self.fin)
        # Progress message on stdout; a single string argument prints the
        # same text under the print statement and the print function.
        print("after getPatients")
        self.allow_absent = False
        self.group_repeats = False
        self.iterator = globes.splitIterator(self.fin, burn=fast_forward)
    except Exception:
        # Do not leave the handle open on a half-constructed object.
        self.fin.close()
        raise
def __init__(self, vcf_file, fast_forward=0):
    """Open `vcf_file` and set up iteration over its contents.

    vcf_file     -- path of the file to read; it is opened in binary mode
                    and the handle is kept on self.fin
    fast_forward -- forwarded unchanged to self.iterate (presumably the
                    number of leading records to skip -- TODO confirm
                    against the iterate method)

    Sets self.indexOf, self.fin, self.patients, self.allow_absent,
    self.group_repeats and self.iterator.  If reading the patients or
    building the iterator fails, the file is closed before the exception
    propagates so that no handle is leaked.
    """
    self.indexOf = broad.COLUMN_MAP
    globes.printColumnWarning(vcf_file, self.indexOf)

    self.fin = open(vcf_file, "rb")
    try:
        # broad.getPatients reads from the freshly opened handle.
        self.patients = broad.getPatients(self.fin)
        # The flags are assigned before self.iterate is called, matching
        # the original statement order.
        self.allow_absent = False
        self.group_repeats = False
        self.iterator = self.iterate(fast_forward)
    except Exception:
        # Do not leave the handle open on a half-constructed object.
        self.fin.close()
        raise