def data_iterator(self, keys=('tumor_bam', 'normal_bam', 'data_filename',
                              'project', 'dataset', 'sample', 'evidence_type')):
    """Yield one dict per variant across every data file listed in the TSV.

    self.filename is read as a tab-delimited table in which each row names a
    variant file plus its metadata.  For every (unfiltered) row, the variants
    of the referenced file are parsed with DatabaseParser and each variant
    dict is merged with the row's metadata (restricted to `keys`) before
    being yielded.

    Parameters:
        keys: metadata columns copied from the table row onto every variant.

    Yields:
        dict: one merged variant + metadata record.
    """
    log = logging.getLogger(__name__)
    log.info("Gathering variants from table of files:" + self.filename)
    # 'with' guarantees the table handle is closed even if the consumer
    # abandons the generator (original leaked it and shadowed the builtin
    # 'file').
    with open(self.filename, 'rU') as table_fh:
        reader = csv.DictReader(table_fh, delimiter='\t')
        for file_data in reader:
            # Skip rows carrying a non-empty FILTER value.
            # BUG FIX: the original evaluated len(filter) -- the *builtin*
            # filter function, a TypeError -- instead of the row's own
            # FILTER column.
            if 'FILTER' in file_data and len(file_data['FILTER']) > 0:
                continue
            meta_data_dict = get_entries_from_dict(file_data, keys=keys,
                                                   return_type=dict)
            log.info("Gathering variants from individual file:"
                     + meta_data_dict['data_filename'])
            parser = DatabaseParser(meta_data_dict['data_filename'])
            self.current_file = meta_data_dict['data_filename']
            self.new_file = True
            for variant_dict in parser.get_variants():
                yield merge_dicts(variant_dict, meta_data_dict)
                # new_file flags only the first variant of each file.
                if self.new_file:
                    self.new_file = False
    # NOTE(review): placement reconstructed from mangled source -- clearing
    # the marker after all files are processed; confirm against history.
    self.current_file = None
def data_iterator(self, keys=('tumor_bam', 'normal_bam', 'data_filename',
                              'project', 'dataset', 'sample', 'evidence_type')):
    """Yield one dict per variant across every data file listed in the TSV.

    self.filename is read as a tab-delimited table in which each row names a
    variant file plus its metadata.  For every (unfiltered) row, the variants
    of the referenced file are parsed with DatabaseParser and each variant
    dict is merged with the row's metadata (restricted to `keys`) before
    being yielded.

    Parameters:
        keys: metadata columns copied from the table row onto every variant.

    Yields:
        dict: one merged variant + metadata record.
    """
    log = logging.getLogger(__name__)
    log.info("Gathering variants from table of files:" + self.filename)
    # 'with' guarantees the table handle is closed even if the consumer
    # abandons the generator (original leaked it and shadowed the builtin
    # 'file').
    with open(self.filename, 'rU') as table_fh:
        reader = csv.DictReader(table_fh, delimiter='\t')
        for file_data in reader:
            # Skip rows carrying a non-empty FILTER value.
            # BUG FIX: the original evaluated len(filter) -- the *builtin*
            # filter function, a TypeError -- instead of the row's own
            # FILTER column.
            if 'FILTER' in file_data and len(file_data['FILTER']) > 0:
                continue
            meta_data_dict = get_entries_from_dict(file_data, keys=keys,
                                                   return_type=dict)
            log.info("Gathering variants from individual file:"
                     + meta_data_dict['data_filename'])
            parser = DatabaseParser(meta_data_dict['data_filename'])
            self.current_file = meta_data_dict['data_filename']
            self.new_file = True
            for variant_dict in parser.get_variants():
                yield merge_dicts(variant_dict, meta_data_dict)
                # new_file flags only the first variant of each file.
                if self.new_file:
                    self.new_file = False
    # NOTE(review): placement reconstructed from mangled source -- clearing
    # the marker after all files are processed; confirm against history.
    self.current_file = None
def VariantUploader(tsv, submit_to_filesystem=False):
    """Bulk-upload every variant listed in a submission TSV into MongoDB.

    Streams variants from DataGatherer(tsv), normalizes mutect1-MAF field
    names to the canonical schema, and inserts documents through an
    unordered bulk op, flushing every 10000 inserts.  When
    submit_to_filesystem is True, the referenced data files are also
    mirrored into the somaticDB master filesystem and each document's
    data_filename is rewritten to its mirrored location.

    Parameters:
        tsv: path to the tab-separated submission file.
        submit_to_filesystem: mirror data files under
            /dsde/working/somaticDB/master and archive the submission sheet.
    """
    gather = DataGatherer(tsv)
    variants = connect_to_mongo()
    if submit_to_filesystem:
        filesystem = SomaticFileSystem('/dsde/working/somaticDB/master/data')
        # Archive the submission sheet (with rewritten file paths) under records/.
        S = SubmissionFile(tsv)
        S.change_file_dir()
        S.to_csv(os.path.join('/dsde/working/somaticDB/master/records',
                              os.path.basename(tsv)))
    else:
        filesystem = None

    bulk_count = 0
    bulk = variants.initialize_unordered_bulk_op()
    start_time = time.time()
    n = 0
    for variant_dict in gather.data_iterator():
        n += 1
        bulk_count += 1
        additional_data_dict = {}
        mongo_submission = merge_dicts(variant_dict, additional_data_dict)
        # If the user is uploading a mutect1 maf, rename fields to the
        # canonical schema.
        if 'contig' in mongo_submission:
            mongo_submission['chromosome'] = mongo_submission.pop('contig')
        if 'position' in mongo_submission:
            mongo_submission['start'] = mongo_submission.pop('position')
        if 'ref_allele' in mongo_submission:
            mongo_submission['ref'] = mongo_submission.pop('ref_allele')
        if 'alt_allele' in mongo_submission:
            mongo_submission['alt'] = mongo_submission.pop('alt_allele')
        # NOTE(review): a 'unique_data' projection (chromosome/start/ref/alt/
        # project/dataset/evidence_type) was computed here but never used;
        # removed as dead code.
        if filesystem:
            project = mongo_submission['project']
            dataset = mongo_submission['dataset']
            filesystem.add_project(project)
            filesystem[project].add_dataset(dataset)
            filesystem[project][dataset].add_file(mongo_submission['data_filename'])
            mongo_submission['data_filename'] = change_data_filename(
                "/dsde/working/somaticDB/master/data/%s/%s/" % (project, dataset),
                mongo_submission['data_filename'])
        bulk.insert(mongo_submission)
        # Flush a full batch and start a new bulk op.
        if bulk_count == 10000:
            print("variants uploaded: %d (%.2f seconds since start of upload)."
                  % (n, time.time() - start_time))
            bulk_count = 0
            bulk.execute()
            bulk = variants.initialize_unordered_bulk_op()
    # Flush the final partial batch.
    if bulk_count > 0:
        print("variants uploaded: %d (%.2f seconds since start of upload)."
              % (n, time.time() - start_time))
        bulk.execute()
def get_variants(self, dataset_type=None):
    """Yield normalized variant dicts parsed from self.file.

    The format is taken from `dataset_type` or inferred from the filename
    extension: .table(.gz) -> VCF_TABLE, .vcf(.gz) -> VCF, .maf(.gz) -> MAF.
    Multi-allelic records are split into one dict per ALT allele; records
    failing caller filters (non-PASS) or flagged as structural variants are
    skipped.  All values are stringified before being yielded.

    Parameters:
        dataset_type: optional explicit format ('VCF', 'MAF', 'VCF_TABLE');
            inferred from the filename when None.

    Yields:
        dict with at least chromosome/start/end/ref/alt plus the
        per-format annotation columns.

    Raises:
        Exception: if the file format cannot be determined.
    """
    log = logging.getLogger(__name__)
    log.info("Opening file:" + self.filename)
    # Infer the format from the extension when not given explicitly.
    if dataset_type is None:
        if self.filename.endswith((".table", ".table.gz")):
            dataset_type = "VCF_TABLE"
        if self.filename.endswith((".vcf", ".vcf.gz")):
            dataset_type = "VCF"
        if self.filename.endswith((".maf", ".maf.gz")):
            dataset_type = "MAF"
    log.info("Using vcf module: " + vcf.__file__)
    log.info("File is of type:" + dataset_type)
    if dataset_type not in ['VCF', 'MAF', 'VCF_TABLE']:
        raise Exception("Bad file format: %s" % self.filename)

    if dataset_type == 'VCF':
        self.file = vcf.Reader(self.file)
    else:
        # MAF / VCF_TABLE are plain TSV; strip '#'-comment lines first.
        self.file = ifilter(lambda line: not line.startswith('#'), self.file)
        self.file = csv.DictReader(self.file, delimiter='\t')

    for record in self.file:
        if dataset_type == 'VCF':
            # Caller filter: keep only PASS / unfiltered records.
            # (Hoisted out of the per-allele loop; FILTER is per-record.)
            if not (record.FILTER is None or record.FILTER == '.'
                    or (not record.FILTER) or record.FILTER == 'PASS'):
                continue
            # Filter for the dream challenge: drop structural variants.
            if 'SVTYPE' in record.INFO:
                continue
            for k, alt_allele in enumerate(record.ALT):  # split multi-allelic sites
                start, end, ref, alt = adjustIndelFormat(record.POS, record.REF,
                                                         str(alt_allele))
                # Renamed local: original bound 'filter', shadowing the builtin.
                rec_filter = record.FILTER
                core_data = {"chromosome": record.CHROM, "start": start,
                             "end": end, "ref": ref, "alt": alt,
                             "FILTER": rec_filter}
                for key in record.INFO:
                    # Per-allele INFO fields take the k-th entry.
                    if key in ['MLEAC', 'MLEAF', 'AC', 'AF']:
                        core_data[key] = record.INFO[key][k]
                    else:
                        core_data[key] = record.INFO[key]
                # NOTE(review): kept from original -- merge_dicts may clobber
                # the per-allele values above with the raw INFO lists,
                # depending on its precedence; confirm intent.
                core_data = merge_dicts(core_data, record.INFO)
                core_data = stringify_dict(core_data)
                yield core_data

        elif dataset_type == 'VCF_TABLE':
            # Caller filter: keep only '.'/PASS rows when FILTER is present.
            if 'FILTER' in record and record['FILTER'] not in ('.', 'PASS'):
                continue
            # Filter for the dream challenge.
            # BUG FIX: the original called record.INFO.get(...) but record
            # is a plain csv.DictReader dict here (AttributeError).
            if record.get('SVTYPE') in ('IGN', 'MSK'):
                continue
            alts = record['ALT'].split(",")
            # Per-allele columns are comma-separated in the table; guard
            # against missing/short columns instead of raising KeyError.
            mleac = (record.get('MLEAC') or '').split(',')
            mleaf = (record.get('MLEAF') or '').split(',')
            ac = (record.get('AC') or '').split(',')
            af = (record.get('AF') or '').split(',')
            for k, alt_allele in enumerate(alts):  # split multi-allelic sites
                start, end, ref, alt = adjustIndelFormat(record['POS'],
                                                         record['REF'],
                                                         alt_allele)
                core_data = {"chromosome": record['CHROM'], "start": start,
                             "end": end, "ref": ref, "alt": alt}
                for key in record:
                    # BUG FIX: the original skip-list also contained
                    # MLEAF/MLEAC/AC/AF, which made the per-allele branches
                    # below unreachable and left the split lists dead.
                    if key in ['CHROM', 'POS', 'REF']:
                        continue
                    if key == 'MLEAC':
                        core_data['MLEAC'] = mleac[k]
                    elif key == 'MLEAF':
                        core_data['MLEAF'] = mleaf[k]
                    elif key == 'AC':
                        core_data['AC'] = ac[k]
                    elif key == 'AF':
                        core_data['AF'] = af[k]
                    else:
                        core_data[key] = record[key]
                        # Generic per-allele split for any other
                        # comma-separated column.
                        if "," in core_data[key]:
                            core_data[key] = core_data[key].split(',')[k]
                core_data = stringify_dict(core_data)
                yield core_data

        elif dataset_type == "MAF":
            # Normalize mutect1-style column names to MAF conventions.
            if 'contig' in record:
                record['Chromosome'] = record.pop('contig')
            if 'position' in record:
                record['Start_position'] = record.pop('position')
            if 'ref_allele' in record:
                record['Reference_Allele'] = record.pop('ref_allele')
            if 'alt_allele' in record:
                record['Tumor_Seq_Allele2'] = record.pop('alt_allele')
            if 'End_position' not in record:
                record['End_position'] = str(int(record['Start_position'])
                                             + len(record['Reference_Allele']) - 1)
            # Keep only variants the caller judged KEEP.
            if 'judgement' in record and record['judgement'] != 'KEEP':
                continue
            core_data = {"chromosome": record['Chromosome'],
                         "start": record['Start_position'],
                         "end": record['End_position'],
                         "ref": record['Reference_Allele'],
                         "alt": record['Tumor_Seq_Allele2']}
            for key in record:
                # BUG FIX: the original excluded 'Start_Position'/
                # 'End_Position' (capital P), which never match the
                # 'Start_position'/'End_position' keys written above, so
                # those columns were duplicated into the output.
                if key not in ['Chromosome', 'Start_position', 'End_position',
                               'Reference_Allele', 'Tumor_Seq_Allele1',
                               'Tumor_Seq_Allele2']:
                    core_data[key] = record[key]
            core_data = stringify_dict(core_data)
            yield core_data
def VariantUploader(tsv, submit_to_filesystem=False):
    """Bulk-upload every variant listed in a submission TSV into MongoDB.

    Streams variants from DataGatherer(tsv), normalizes mutect1-MAF field
    names to the canonical schema, and inserts documents through an
    unordered bulk op, flushing every 10000 inserts.  When
    submit_to_filesystem is True, the referenced data files are also
    mirrored into the somaticDB master filesystem and each document's
    data_filename is rewritten to its mirrored location.

    Parameters:
        tsv: path to the tab-separated submission file.
        submit_to_filesystem: mirror data files under
            /dsde/working/somaticDB/master and archive the submission sheet.
    """
    gather = DataGatherer(tsv)
    variants = connect_to_mongo()
    if submit_to_filesystem:
        filesystem = SomaticFileSystem("/dsde/working/somaticDB/master/data")
        # Archive the submission sheet (with rewritten file paths) under records/.
        S = SubmissionFile(tsv)
        S.change_file_dir()
        S.to_csv(os.path.join("/dsde/working/somaticDB/master/records",
                              os.path.basename(tsv)))
    else:
        filesystem = None

    bulk_count = 0
    bulk = variants.initialize_unordered_bulk_op()
    start_time = time.time()
    n = 0
    for variant_dict in gather.data_iterator():
        n += 1
        bulk_count += 1
        additional_data_dict = {}
        mongo_submission = merge_dicts(variant_dict, additional_data_dict)
        # If the user is uploading a mutect1 maf, rename fields to the
        # canonical schema.
        if "contig" in mongo_submission:
            mongo_submission["chromosome"] = mongo_submission.pop("contig")
        if "position" in mongo_submission:
            mongo_submission["start"] = mongo_submission.pop("position")
        if "ref_allele" in mongo_submission:
            mongo_submission["ref"] = mongo_submission.pop("ref_allele")
        if "alt_allele" in mongo_submission:
            mongo_submission["alt"] = mongo_submission.pop("alt_allele")
        # NOTE(review): a 'unique_data' projection (chromosome/start/ref/alt/
        # project/dataset/evidence_type) was computed here but never used;
        # removed as dead code.
        if filesystem:
            project = mongo_submission["project"]
            dataset = mongo_submission["dataset"]
            filesystem.add_project(project)
            filesystem[project].add_dataset(dataset)
            filesystem[project][dataset].add_file(mongo_submission["data_filename"])
            mongo_submission["data_filename"] = change_data_filename(
                "/dsde/working/somaticDB/master/data/%s/%s/" % (project, dataset),
                mongo_submission["data_filename"])
        bulk.insert(mongo_submission)
        # Flush a full batch and start a new bulk op.
        if bulk_count == 10000:
            print("variants uploaded: %d (%.2f seconds since start of upload)."
                  % (n, time.time() - start_time))
            bulk_count = 0
            bulk.execute()
            bulk = variants.initialize_unordered_bulk_op()
    # Flush the final partial batch.
    if bulk_count > 0:
        print("variants uploaded: %d (%.2f seconds since start of upload)."
              % (n, time.time() - start_time))
        bulk.execute()
def get_variants(self, dataset_type=None):
    """Yield normalized variant dicts parsed from self.file.

    The format is taken from `dataset_type` or inferred from the filename
    extension: .table(.gz) -> VCF_TABLE, .vcf(.gz) -> VCF, .maf(.gz) -> MAF.
    Multi-allelic records are split into one dict per ALT allele; records
    failing caller filters (non-PASS) or flagged as structural variants are
    skipped.  All values are stringified before being yielded.

    Parameters:
        dataset_type: optional explicit format ('VCF', 'MAF', 'VCF_TABLE');
            inferred from the filename when None.

    Yields:
        dict with at least chromosome/start/end/ref/alt plus the
        per-format annotation columns.

    Raises:
        Exception: if the file format cannot be determined.
    """
    log = logging.getLogger(__name__)
    log.info("Opening file:" + self.filename)
    # Infer the format from the extension when not given explicitly.
    if dataset_type is None:
        if self.filename.endswith((".table", ".table.gz")):
            dataset_type = "VCF_TABLE"
        if self.filename.endswith((".vcf", ".vcf.gz")):
            dataset_type = "VCF"
        if self.filename.endswith((".maf", ".maf.gz")):
            dataset_type = "MAF"
    log.info("Using vcf module: " + vcf.__file__)
    log.info("File is of type:" + dataset_type)
    if dataset_type not in ['VCF', 'MAF', 'VCF_TABLE']:
        raise Exception("Bad file format: %s" % self.filename)

    if dataset_type == 'VCF':
        self.file = vcf.Reader(self.file)
    else:
        # MAF / VCF_TABLE are plain TSV; strip '#'-comment lines first.
        self.file = ifilter(lambda line: not line.startswith('#'), self.file)
        self.file = csv.DictReader(self.file, delimiter='\t')

    for record in self.file:
        if dataset_type == 'VCF':
            # Caller filter: keep only PASS / unfiltered records.
            # (Hoisted out of the per-allele loop; FILTER is per-record.)
            if not (record.FILTER is None or record.FILTER == '.'
                    or (not record.FILTER) or record.FILTER == 'PASS'):
                continue
            # Filter for the dream challenge: drop structural variants.
            if 'SVTYPE' in record.INFO:
                continue
            for k, alt_allele in enumerate(record.ALT):  # split multi-allelic sites
                start, end, ref, alt = adjustIndelFormat(record.POS, record.REF,
                                                         str(alt_allele))
                # Renamed local: original bound 'filter', shadowing the builtin.
                rec_filter = record.FILTER
                core_data = {"chromosome": record.CHROM, "start": start,
                             "end": end, "ref": ref, "alt": alt,
                             "FILTER": rec_filter}
                for key in record.INFO:
                    # Per-allele INFO fields take the k-th entry.
                    if key in ['MLEAC', 'MLEAF', 'AC', 'AF']:
                        core_data[key] = record.INFO[key][k]
                    else:
                        core_data[key] = record.INFO[key]
                # NOTE(review): kept from original -- merge_dicts may clobber
                # the per-allele values above with the raw INFO lists,
                # depending on its precedence; confirm intent.
                core_data = merge_dicts(core_data, record.INFO)
                core_data = stringify_dict(core_data)
                yield core_data

        elif dataset_type == 'VCF_TABLE':
            # Caller filter: keep only '.'/PASS rows when FILTER is present.
            if 'FILTER' in record and record['FILTER'] not in ('.', 'PASS'):
                continue
            # Filter for the dream challenge.
            # BUG FIX: the original called record.INFO.get(...) but record
            # is a plain csv.DictReader dict here (AttributeError).
            if record.get('SVTYPE') in ('IGN', 'MSK'):
                continue
            alts = record['ALT'].split(",")
            # Per-allele columns are comma-separated in the table; guard
            # against missing/short columns instead of raising KeyError.
            mleac = (record.get('MLEAC') or '').split(',')
            mleaf = (record.get('MLEAF') or '').split(',')
            ac = (record.get('AC') or '').split(',')
            af = (record.get('AF') or '').split(',')
            for k, alt_allele in enumerate(alts):  # split multi-allelic sites
                start, end, ref, alt = adjustIndelFormat(record['POS'],
                                                         record['REF'],
                                                         alt_allele)
                core_data = {"chromosome": record['CHROM'], "start": start,
                             "end": end, "ref": ref, "alt": alt}
                for key in record:
                    # BUG FIX: the original skip-list also contained
                    # MLEAF/MLEAC/AC/AF, which made the per-allele branches
                    # below unreachable and left the split lists dead.
                    if key in ['CHROM', 'POS', 'REF']:
                        continue
                    if key == 'MLEAC':
                        core_data['MLEAC'] = mleac[k]
                    elif key == 'MLEAF':
                        core_data['MLEAF'] = mleaf[k]
                    elif key == 'AC':
                        core_data['AC'] = ac[k]
                    elif key == 'AF':
                        core_data['AF'] = af[k]
                    else:
                        core_data[key] = record[key]
                        # Generic per-allele split for any other
                        # comma-separated column.
                        if "," in core_data[key]:
                            core_data[key] = core_data[key].split(',')[k]
                core_data = stringify_dict(core_data)
                yield core_data

        elif dataset_type == "MAF":
            # Normalize mutect1-style column names to MAF conventions.
            if 'contig' in record:
                record['Chromosome'] = record.pop('contig')
            if 'position' in record:
                record['Start_position'] = record.pop('position')
            if 'ref_allele' in record:
                record['Reference_Allele'] = record.pop('ref_allele')
            if 'alt_allele' in record:
                record['Tumor_Seq_Allele2'] = record.pop('alt_allele')
            if 'End_position' not in record:
                record['End_position'] = str(int(record['Start_position'])
                                             + len(record['Reference_Allele']) - 1)
            # Keep only variants the caller judged KEEP.
            if 'judgement' in record and record['judgement'] != 'KEEP':
                continue
            core_data = {"chromosome": record['Chromosome'],
                         "start": record['Start_position'],
                         "end": record['End_position'],
                         "ref": record['Reference_Allele'],
                         "alt": record['Tumor_Seq_Allele2']}
            for key in record:
                # BUG FIX: the original excluded 'Start_Position'/
                # 'End_Position' (capital P), which never match the
                # 'Start_position'/'End_position' keys written above, so
                # those columns were duplicated into the output.
                if key not in ['Chromosome', 'Start_position', 'End_position',
                               'Reference_Allele', 'Tumor_Seq_Allele1',
                               'Tumor_Seq_Allele2']:
                    core_data[key] = record[key]
            core_data = stringify_dict(core_data)
            yield core_data