def data_iterator(self, keys=('tumor_bam', 'normal_bam', 'data_filename',
                              'project', 'dataset', 'sample',
                              'evidence_type')):
    """Yield variant dicts from every file listed in the submission table,
    each merged with that file's metadata columns."""
    table_file = open(self.filename, 'rU')
    reader = csv.DictReader(table_file, delimiter='\t')

    logging.getLogger(__name__).info(
        "Gathering variants from table of files: " + self.filename)

    for file_data in reader:
        # Skip rows that carry a non-empty FILTER value.
        if 'FILTER' in file_data and len(file_data['FILTER']) > 0:
            continue

        meta_data_dict = get_entries_from_dict(file_data, keys=keys,
                                               return_type=dict)

        logging.getLogger(__name__).info(
            "Gathering variants from individual file: "
            + meta_data_dict['data_filename'])

        D = DatabaseParser(meta_data_dict['data_filename'])
        self.current_file = meta_data_dict['data_filename']
        self.new_file = True

        for variant_dict in D.get_variants():
            yield merge_dicts(variant_dict, meta_data_dict)
            if self.new_file:
                self.new_file = False

    self.current_file = None
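# Usage sketch (the TSV path is hypothetical; DataGatherer is this module's
# class whose data_iterator method is defined above):
#
#   gatherer = DataGatherer('submission_files.tsv')
#   for variant in gatherer.data_iterator():
#       print variant['chromosome'], variant['start'], variant['sample']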
def survey(filename):
    """Tally known-true SNP and indel counts per project/dataset/sample and
    write the table to `filename`."""
    logging.getLogger(__name__).info("Beginning survey.")
    query = "{'project' : { '$exists' : 'true' } }"
    tally = defaultdict(int)
    collector = connect_to_mongo()

    # collect query information
    n = 0
    for record in collector.find(ast.literal_eval(query)):
        sample_information = get_entries_from_dict(
            record,
            keys=['project', 'dataset', 'sample', 'evidence_type'],
            return_type=dict)

        feature = None
        if is_snp(record):
            feature = 'snp'
        if is_indel(record):
            feature = 'indel'
        if feature is None:
            print record
            continue

        if sample_information['evidence_type'] == 'TP':
            n += 1
            project = sample_information['project']
            dataset = sample_information['dataset']
            sample = sample_information['sample']

            # Roll counts up at every level of aggregation; blank fields
            # mark the aggregated levels.
            tally[(project, dataset, sample, feature)] += 1
            tally[(project, dataset, '', feature)] += 1
            tally[(project, '', '', feature)] += 1
            tally[('', '', '', feature)] += 1

            if not (n % 10000):
                logging.getLogger(__name__).info("Variants seen: " + str(n))

    fp = csv.DictWriter(
        open(filename, 'w'),
        fieldnames=['project', 'dataset', 'sample', 'feature', 'count'],
        delimiter='\t')
    fp.writeheader()
    for item in tally:
        fp.writerow({'project': item[0],
                     'dataset': item[1],
                     'sample': item[2],
                     'feature': item[3],
                     'count': tally[item]})
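# Usage sketch (output filename is hypothetical). The resulting TSV tallies
# 'TP' snp/indel counts at sample, dataset, project, and global levels:
#
#   survey('variant_survey.tsv')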
def variant_extract(query, output_filename, max_number_of_records):
    """Extract variants matching `query` and write them to a TSV file, or
    print them as CSV when `output_filename` is "<stdout>"."""
    query = query_processor(query)
    output = []
    collection = connect_to_mongo()

    for record in collection.find(ast.literal_eval(query)):
        sample_information = get_entries_from_dict(
            record,
            keys=['chromosome', 'start', 'ref', 'alt',
                  'project', 'dataset', 'sample', 'evidence_type'],
            return_type=dict)
        for key in sample_information:
            sample_information[key] = str(sample_information[key])
        output.append(sample_information)

    output = pd.DataFrame(output)

    if max_number_of_records is not None and len(output) > max_number_of_records:
        output = output[:max_number_of_records]

    if output_filename == "<stdout>":
        print "project,dataset,sample,evidence_type,chromosome,start,ref,alt"
        for k, row in output.iterrows():
            fields = [row['project'], row['dataset'], row['sample'],
                      row['evidence_type'], row['chromosome'], row['start'],
                      row['ref'], row['alt']]
            print ",".join(fields)
    else:
        output.to_csv(output_filename, index=False,
                      columns=["project", "dataset", "sample",
                               "evidence_type", "chromosome", "start",
                               "ref", "alt"],
                      sep='\t')
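# Usage sketch (query value and record limit are hypothetical; the query
# string is the Python-dict literal form consumed by query_processor and
# ast.literal_eval elsewhere in this module):
#
#   variant_extract("{'project': 'dream'}", '<stdout>',
#                   max_number_of_records=100)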
def VariantUploader(tsv, submit_to_filesystem=False):
    """Upload every variant in the submission TSV to the database, optionally
    mirroring the data files into the master filesystem."""
    gather = DataGatherer(tsv)
    variants = connect_to_mongo()

    if submit_to_filesystem:
        filesystem = SomaticFileSystem('/dsde/working/somaticDB/master/data')
        S = SubmissionFile(tsv)
        S.change_file_dir()
        S.to_csv(os.path.join('/dsde/working/somaticDB/master/records',
                              os.path.basename(tsv)))
    else:
        filesystem = None

    bulk_count = 0
    bulk = variants.initialize_unordered_bulk_op()
    start_time = time.time()

    n = 0
    for variant_dict in gather.data_iterator():
        n += 1
        bulk_count += 1

        additional_data_dict = {}
        mongo_submission = merge_dicts(variant_dict, additional_data_dict)

        # If the user is uploading a MuTect1 MAF, rename fields.
        if 'contig' in mongo_submission:
            mongo_submission['chromosome'] = mongo_submission.pop('contig')
        if 'position' in mongo_submission:
            mongo_submission['start'] = mongo_submission.pop('position')
        if 'ref_allele' in mongo_submission:
            mongo_submission['ref'] = mongo_submission.pop('ref_allele')
        if 'alt_allele' in mongo_submission:
            mongo_submission['alt'] = mongo_submission.pop('alt_allele')

        unique_data = get_entries_from_dict(
            mongo_submission,
            keys=['chromosome', 'start', 'ref', 'alt',
                  'project', 'dataset', 'evidence_type'],
            return_type=dict)

        if filesystem:
            project = mongo_submission['project']
            dataset = mongo_submission['dataset']

            filesystem.add_project(project)
            filesystem[project].add_dataset(dataset)
            filesystem[project][dataset].add_file(
                mongo_submission['data_filename'])

            mongo_submission['data_filename'] = change_data_filename(
                "/dsde/working/somaticDB/master/data/%s/%s/" % (project,
                                                                dataset),
                mongo_submission['data_filename'])

        bulk.insert(mongo_submission)

        # Flush the bulk operation every 10000 inserts.
        if bulk_count == 10000:
            print "variants uploaded: %d (%.2f seconds since start of upload)." \
                % (n, time.time() - start_time)
            bulk_count = 0
            bulk.execute()
            bulk = variants.initialize_unordered_bulk_op()

    if bulk_count > 0:
        print "variants uploaded: %d (%.2f seconds since start of upload)." \
            % (n, time.time() - start_time)
        bulk.execute()
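# Usage sketch (the submission TSV path is hypothetical). With
# submit_to_filesystem=True the data files are also mirrored under
# /dsde/working/somaticDB/master:
#
#   VariantUploader('new_submission.tsv', submit_to_filesystem=False)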
def VariantAssessor(query, tsv, output_file, outdir=""):
    """Compare caller output in `tsv` against known variants in the database
    and write per-sample performance metrics to `output_file`."""
    collection = connect_to_mongo()
    caller_output = pd.read_csv(tsv, sep='\t')

    known_true = {'snp': defaultdict(set), 'indel': defaultdict(set)}
    known_false = {'snp': defaultdict(set), 'indel': defaultdict(set)}
    found_variants = {'snp': defaultdict(set), 'indel': defaultdict(set)}

    query = query_processor(query)

    logging.getLogger(__name__).info("Querying database for variants.")

    # collect query information
    for record in collection.find(ast.literal_eval(query)):
        sample_information = get_entries_from_dict(
            record, keys=['project', 'dataset', 'sample'], return_type=tuple)
        variant = get_entries_from_dict(
            record, keys=['chromosome', 'start', 'ref', 'alt'],
            return_type=tuple)

        sample_information = tuple(map(str, sample_information))
        variant = tuple(map(str, variant))

        evidence_type = record['evidence_type']

        if is_snp(record):
            if 'TP' in evidence_type:
                known_true['snp'][sample_information].add(variant)
            if 'FP' in evidence_type:
                known_false['snp'][sample_information].add(variant)
        elif is_indel(record):
            if 'TP' in evidence_type:
                known_true['indel'][sample_information].add(variant)
            if 'FP' in evidence_type:
                known_false['indel'][sample_information].add(variant)

    normal_normal = set([])
    cm = set([])

    # Index the type of assessment to be done for each sample; by default,
    # ROC-like (CM) assessment is used.
    for k, row in caller_output.iterrows():
        sample_information = (row['project'], row['dataset'], row['sample'])
        if row['evidence_type'] == 'NN':
            normal_normal.add(sample_information)
        else:
            cm.add(sample_information)

    gather = DataGatherer(tsv)

    logging.getLogger(__name__).info(
        "Collecting variants from user-submitted files.")

    found_feature_data = defaultdict(dict)

    # data from file (algorithm being tested)
    for variant_dict in gather.data_iterator():
        sample_information = get_entries_from_dict(
            variant_dict, keys=['project', 'dataset', 'sample'],
            return_type=tuple)
        variant = get_entries_from_dict(
            variant_dict, keys=['chromosome', 'start', 'ref', 'alt'],
            return_type=tuple)
        print sample_information, variant
        #found_feature_data[sample_information][variant] = get_entries_from_dict(variant_dict, keys=['ECNT','HCNT','NLOD','TLOD'], return_type=dict)

        if is_snp(variant_dict):
            found_variants['snp'][sample_information].add(variant)
        elif is_indel(variant_dict):
            found_variants['indel'][sample_information].add(variant)

    caller_samples = caller_output[['project', 'dataset',
                                    'sample']].values.tolist()

    data = []
    fieldnames = ['project', 'dataset', 'sample', 'chromosome', 'start',
                  'ref', 'alt', 'variant_type']

    for feature in ['snp', 'indel']:
        all_dict = {}
        fp_fn = {}
        fp_fp = {}
        fp_tp = {}

        fp_fn[feature] = csv.DictWriter(
            open(os.path.join(outdir, feature + ".false_negatives.tsv"), 'w'),
            delimiter='\t', fieldnames=fieldnames)
        fp_fp[feature] = csv.DictWriter(
            open(os.path.join(outdir, feature + ".false_positives.tsv"), 'w'),
            delimiter='\t', fieldnames=fieldnames)
        fp_tp[feature] = csv.DictWriter(
            open(os.path.join(outdir, feature + ".true_positives.tsv"), 'w'),
            delimiter='\t', fieldnames=fieldnames)

        fp_fn[feature].writeheader()
        fp_fp[feature].writeheader()
        fp_tp[feature].writeheader()

        for eval_type in ['CM', 'NN']:
            all_dict[eval_type] = {'project': 'all',
                                   'dataset': 'all',
                                   'sample': 'all',
                                   'false_positives': 0,
                                   'true_positives': 0,
                                   'false_negatives': 0,
                                   'tpr': np.nan,
                                   'fpr': np.nan,
                                   'precision': np.nan,
                                   'evidence_type': eval_type,
                                   'variant_type': feature}

        for sample_information in map(tuple, caller_samples):
            if sample_information in normal_normal:
                assessment_type = 'NN'
            else:
                assessment_type = 'CM'

            row_dict = {'project': sample_information[0],
                        'dataset': sample_information[1],
                        'sample': sample_information[2],
                        'false_positives': 0,
                        'true_positives': 0,
                        'false_negatives': 0,
                        'tpr': np.nan,
                        'fpr': np.nan,
                        'precision': np.nan,
                        'evidence_type': assessment_type,
                        'variant_type': feature}

            if assessment_type == 'NN':
                # In a normal-normal comparison, every call is a false
                # positive.
                FP = len(found_variants[feature][sample_information])
                row_dict['false_positives'] = FP
                row_dict['precision'] = 0

            if assessment_type == 'CM':
                TP = np.float(len(
                    found_variants[feature][sample_information].intersection(
                        known_true[feature][sample_information])))
                FN = np.float(len(
                    known_true[feature][sample_information].difference(
                        found_variants[feature][sample_information])))
                FP = np.float(len(
                    found_variants[feature][sample_information].difference(
                        known_true[feature][sample_information])))

                print TP, FN, FP

                try:
                    row_dict['tpr'] = TP / (TP + FN)
                except ZeroDivisionError:
                    row_dict['tpr'] = np.nan

                row_dict['true_positives'] = TP
                row_dict['false_negatives'] = FN
                row_dict['false_positives'] = FP

                all_dict['CM']['true_positives'] += TP
                all_dict['CM']['false_negatives'] += FN
                all_dict['CM']['false_positives'] += FP

                try:
                    row_dict['precision'] = TP / (TP + FP)
                except ZeroDivisionError:
                    row_dict['precision'] = np.nan

                row_dict['dream_accuracy'] = (row_dict['tpr'] +
                                              row_dict['precision']) / 2.0

                print row_dict['tpr'], row_dict['precision'], \
                    row_dict['dream_accuracy']

            row_dict['variant_type'] = feature
            data.append(row_dict)

            true_positives = list(
                found_variants[feature][sample_information].intersection(
                    known_true[feature][sample_information]))
            false_positives = list(
                found_variants[feature][sample_information].difference(
                    known_true[feature][sample_information]))
            false_negatives = list(
                known_true[feature][sample_information].difference(
                    found_variants[feature][sample_information]))

            for variant in true_positives:
                fp_tp[feature].writerow({'project': sample_information[0],
                                         'dataset': sample_information[1],
                                         'sample': sample_information[2],
                                         'chromosome': variant[0],
                                         'start': variant[1],
                                         'ref': variant[2],
                                         'alt': variant[3],
                                         #'ECNT': found_feature_data[sample_information][variant]['ECNT'],
                                         #'HCNT': found_feature_data[sample_information][variant]['HCNT'],
                                         #'NLOD': found_feature_data[sample_information][variant]['NLOD'],
                                         #'TLOD': found_feature_data[sample_information][variant]['TLOD'],
                                         'variant_type': feature})

            for variant in false_positives:
                fp_fp[feature].writerow({'project': sample_information[0],
                                         'dataset': sample_information[1],
                                         'sample': sample_information[2],
                                         'chromosome': variant[0],
                                         'start': variant[1],
                                         'ref': variant[2],
                                         'alt': variant[3],
                                         #'ECNT': found_feature_data[sample_information][variant]['ECNT'],
                                         #'HCNT': found_feature_data[sample_information][variant]['HCNT'],
                                         #'NLOD': found_feature_data[sample_information][variant]['NLOD'],
                                         #'TLOD': found_feature_data[sample_information][variant]['TLOD'],
                                         'variant_type': feature})

            for variant in false_negatives:
                fp_fn[feature].writerow({'project': sample_information[0],
                                         'dataset': sample_information[1],
                                         'sample': sample_information[2],
                                         'chromosome': variant[0],
                                         'start': variant[1],
                                         'ref': variant[2],
                                         'alt': variant[3],
                                         'variant_type': feature})

        try:
            all_dict['CM']['tpr'] = all_dict['CM']['true_positives'] / (
                all_dict['CM']['true_positives'] +
                all_dict['CM']['false_negatives'])
        except ZeroDivisionError:
            all_dict['CM']['tpr'] = np.nan

        try:
            all_dict['CM']['precision'] = all_dict['CM']['true_positives'] / (
                all_dict['CM']['true_positives'] +
                all_dict['CM']['false_positives'])
        except ZeroDivisionError:
            all_dict['CM']['precision'] = np.nan

        # dream_accuracy can only be computed once both tpr and precision
        # are known.
        all_dict['CM']['dream_accuracy'] = (all_dict['CM']['tpr'] +
                                            all_dict['CM']['precision']) / 2.0
        all_dict['CM']['variant_type'] = feature

        #data.append(all_dict['CM'])
        #data.append(all_dict['NN'])

    fieldnames = ['project', 'dataset', 'sample', 'false_positives',
                  'true_positives', 'false_negatives', 'tpr', 'precision',
                  'evidence_type', 'dream_accuracy', 'variant_type']
    pd.DataFrame(data).to_csv(output_file, sep='\t', index=False,
                              columns=fieldnames, na_rep='nan')
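# Usage sketch (query and paths are hypothetical). Writes per-sample tpr,
# precision, and dream_accuracy to the output file, plus per-feature
# {snp,indel}.{true_positives,false_positives,false_negatives}.tsv in outdir:
#
#   VariantAssessor("{'project': 'dream'}", 'caller_output.tsv',
#                   'assessment.tsv', outdir='assessment_dir')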
def BamAggregator(query, normal_bam_list_name, tumor_bam_list_name,
                  interval_list_name, metadata_list_name, folder):
    """Aggregate tumor/normal BAM pairs matching `query`, writing BAM lists,
    per-pair interval lists, and a metadata table."""
    collection = connect_to_mongo()
    query = query_processor(query)

    interval_list = defaultdict(set)
    metadata_list = {}

    query = query.strip('"')
    query = ast.literal_eval(query)

    # Debug output:
    print query
    print type(query)
    print "query dictionary:", query.items()
    print "directory:" + os.getcwd()

    doesrecordloop = False
    for record in collection.find(query):
        if not doesrecordloop:
            print "Contains at least one record."
            doesrecordloop = True

        if 'tumor_bam' not in record:
            print record
            continue

        record['tumor_bam'] = picard_version_to_current(record['tumor_bam'])
        record['normal_bam'] = picard_version_to_current(record['normal_bam'])

        tumor_bam = record['tumor_bam']
        normal_bam = record['normal_bam']

        interval = "%s:%s-%s" % (record['chromosome'], record['start'],
                                 record['end'])
        interval_list[(tumor_bam, normal_bam)].add(interval)

        field_names = ['tumor_bam', 'normal_bam', 'data_filename', 'project',
                       'dataset', 'sample']
        metadata_list[(tumor_bam, normal_bam)] = get_entries_from_dict(
            record, keys=field_names, return_type=dict)
        metadata_list[(tumor_bam, normal_bam)]['evidence_type'] = '.'
        metadata_list[(tumor_bam, normal_bam)]['author'] = '.'

    print 'OPENING FILES HERE.'
    print 'tumor_bam_file: ' + tumor_bam_list_name
    print 'normal_bam_file: ' + normal_bam_list_name
    print 'interval file: ' + interval_list_name

    tumor_bam_file = open(tumor_bam_list_name, 'w')
    normal_bam_file = open(normal_bam_list_name, 'w')
    interval_file = open(interval_list_name, 'w')

    location = os.path.dirname(tumor_bam_list_name)

    # Debug: confirm the output directory is writable and list any interval
    # lists already present.
    fname = os.path.join(location, 'test.txt')
    print "test:" + fname
    f = open(fname, 'w')
    f.close()
    for filename in os.listdir(location):
        if filename.endswith("list"):
            print filename

    fieldnames = ['tumor_bam', 'normal_bam', 'data_filename', 'project',
                  'dataset', 'sample', 'evidence_type', 'author']
    metadata_file = csv.DictWriter(open(metadata_list_name, 'w'),
                                   fieldnames=fieldnames, delimiter='\t')
    metadata_file.writeheader()

    intervals_dir = folder

    for pair in interval_list:
        tumor_bam, normal_bam = pair

        tumor_bam_file.write(tumor_bam + '\n')
        normal_bam_file.write(normal_bam + '\n')
        metadata_file.writerow(metadata_list[(tumor_bam, normal_bam)])

        current_filename = ".".join([
            "intervals",
            os.path.splitext(os.path.basename(tumor_bam))[0],
            os.path.splitext(os.path.basename(normal_bam))[0],
            "list"])
        current_filename = os.path.join(intervals_dir, current_filename)

        if not os.path.exists(intervals_dir):
            os.mkdir(intervals_dir)
            print "made this folder:", intervals_dir, \
                os.path.exists(intervals_dir)

        current_interval_file = open(current_filename, 'w')

        # Sort by start position first, then by contig; the stable sort
        # keeps intervals in start order within each contig.
        sorted_intervals = sorted(
            list(interval_list[pair]),
            key=lambda x: int(x.split(':')[1].split('-')[0]))
        sorted_intervals = sorted(sorted_intervals,
                                  key=lambda x: x.split(':')[0])

        for interval in sorted_intervals:
            current_interval_file.write(interval + "\n")
        current_interval_file.close()

        interval_file.write(current_filename + '\n')

    for thing in os.listdir(intervals_dir):
        print "file in dir:", thing

    tumor_bam_file.close()
    normal_bam_file.close()
    interval_file.close()
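# Usage sketch (all filenames hypothetical; argument order is query, normal
# BAM list, tumor BAM list, interval list, metadata list, intervals folder):
#
#   BamAggregator("{'project': 'dream'}", 'normal_bams.list',
#                 'tumor_bams.list', 'intervals.list', 'metadata.tsv',
#                 'interval_files')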