def testAnnotSVParser(self):
    """parse_annotsv should emit the expected prefixes, fieldnames and variant keys."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    # Samples should appear in the output in this order.
    sort_order = [
        '0228T_CON_OPXv4_INT', '5437_E05_OPXv4_NA12878_MA0013',
        '6037_E05_OPXv4_NA12878_HA0201'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than building an expression string
    # and eval()-ing it -- clearer, and removes the eval hazard.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_annotsv(files, specimens, annotation, prefixes,
                              variant_keys, sort_order)
    self.assertListEqual(
        prefixes, ['0228T', '5437_NA12878', '6037_NA12878', 'Count'])
    self.assertListEqual(fieldnames, [
        'Event1', 'Event2', 'Gene1', 'Gene2', 'location1', 'location2',
        'NM', '1000g_event', '1000g_max_AF', 'Repeats1', 'Repeats2',
        'DGV_GAIN_found|tested', 'DGV_LOSS_found|tested', '0228T',
        '5437_NA12878', '6037_NA12878', 'Count'
    ])
    self.assertListEqual(variant_keys, ['Event1', 'Event2'])
    self.assertEqual(len(specimens), 19)
def testSNPParser(self):
    """Test for correct fieldname parsing by parse_snp."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    # Samples should appear in the output in this order.
    sort_order = [
        '5437_E05_OPXv4_NA12878_MA0013', '6037_E05_OPXv4_NA12878_HA0201',
        '0228T_CON_OPXv4_INT'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than building an expression string
    # and eval()-ing it.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_snp(files, specimens, annotation, prefixes,
                          variant_keys, sort_order)
    self.assertListEqual(prefixes, [
        '5437_NA12878_Ref|Var', '6037_NA12878_Ref|Var', '0228T_Ref|Var',
        'Count'
    ])
    self.assertListEqual(fieldnames, [
        'Position', 'Ref_Base', 'Var_Base', 'Gene', 'Variant_Type',
        'Transcripts', 'Clinically_Flagged', 'Cosmic', 'Segdup',
        'Polyphen', 'Sift', 'Mutation_Taster', 'Gerp', 'UW_Freq',
        'UW_Count', 'UW_DEC_p', '1000g_ALL', 'EVS_esp6500_ALL',
        '1000g_AMR', 'EVS_esp6500_AA', '1000g_EUR', 'EVS_esp6500_EU',
        '1000g_SAS', '1000g_EAS', '1000g_AFR', 'ADA_Alter_Splice',
        'RF_Alter_Splice', 'mutalyzer_errors', '5437_NA12878_Ref|Var',
        '6037_NA12878_Ref|Var', '0228T_Ref|Var', 'Count'
    ])
    self.assertListEqual(variant_keys, ['Position', 'Ref_Base', 'Var_Base'])
def action(args): specimens = collections.defaultdict(dict) annotation = {} prefixes = [] variant_keys = ['Position', 'Ref_Base', 'Var_Base'] files = ifilter(filters.any_analysis, walker(args.path)) files = ifilter(filters.only_analysis, files) #sort the files so that the output in the workbook is sorted files=sorted(files) for pth in files: pfx = munge_pfx(pth.fname) reads_pfx=pfx['mini-pfx']+'_Ref|Var' prefixes.append(reads_pfx) with open(os.path.join(pth.dir, pth.fname)) as fname: print pth.fname reader = csv.DictReader(fname, delimiter='\t') for row in reader: variant = tuple(row[k] for k in variant_keys) specimens[variant][reads_pfx] = row['Ref_Reads']+'|'+row['Var_Reads'] annotation[variant] = row annotation_headers = [ 'Gene', 'Variant_Type', 'Transcripts', 'Clinically_Flagged', 'Cosmic', 'Segdup', 'Polyphen', 'Sift', 'Mutation_Taster', 'Gerp', 'HiSeq_Freq', 'HiSeq_Count', 'MiSeq_Freq', 'MiSeq_Count', '1000g_ALL', 'EVS_esp6500_ALL', '1000g_AMR', 'EVS_esp6500_AA', '1000g_EUR', 'EVS_esp6500_EU', '1000g_ASN', '1000g_AFR'] writer = csv.DictWriter(args.outfile, fieldnames = variant_keys + annotation_headers + prefixes, extrasaction = 'ignore', delimiter = '\t') writer.writeheader() for variant in sorted(specimens.keys()): d = {k:v for k,v in zip(variant_keys,variant)} d.update({pfx:specimens[variant].get(pfx) for pfx in prefixes}) d.update(annotation[variant]) writer.writerow(d)
def testQualityParser(self):
    """parse_quality should emit the expected prefixes, fieldnames and variant keys."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = [
        '0228T_CON_OPXv4_INT', '5437_E05_OPXv4_NA12878_MA0013',
        '6037_E05_OPXv4_NA12878_HA0201'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_quality(files, specimens, annotation, prefixes,
                              variant_keys, sort_order)
    self.assertListEqual(
        fieldnames,
        ['MEAN_TARGET_COVERAGE', '0228T', '5437_NA12878', '6037_NA12878'])
    self.assertListEqual(variant_keys, ['MEAN_TARGET_COVERAGE'])
    self.assertListEqual(prefixes, ['0228T', '5437_NA12878', '6037_NA12878'])
def testClinFlaggedParser(self):
    """parse_clin_flagged should emit the expected prefixes, fieldnames and variant keys."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = [
        '0228T_CON_OPXv4_INT', '5437_E05_OPXv4_NA12878_MA0013',
        '6037_E05_OPXv4_NA12878_HA0201'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_clin_flagged(files, specimens, annotation, prefixes,
                                   variant_keys, sort_order)
    self.assertListEqual(prefixes, [
        '0228T_Variants', '5437_NA12878_Variants', '6037_NA12878_Variants'
    ])
    self.assertListEqual(fieldnames, [
        'Position', 'Ref_Base', 'Var_Base', 'Clinically_Flagged',
        '0228T_Variants', '5437_NA12878_Variants', '6037_NA12878_Variants'
    ])
    self.assertListEqual(variant_keys, ['Position', 'Ref_Base', 'Var_Base'])
def testCNVExonParser(self):
    """parse_cnv_exon should emit the expected prefixes, fieldnames and variant keys."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = [
        '0228T_CON_OPXv4_INT', '5437_E05_OPXv4_NA12878_MA0013',
        '6037_E05_OPXv4_NA12878_HA0201'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_cnv_exon(files, specimens, annotation, prefixes,
                               variant_keys, sort_order)
    self.assertListEqual(
        prefixes, ['0228T_Log', '5437_NA12878_Log', '6037_NA12878_Log'])
    self.assertListEqual(fieldnames, [
        'Position', 'Gene', 'Transcripts', '0228T_Log', '5437_NA12878_Log',
        '6037_NA12878_Log'
    ])
    self.assertListEqual(variant_keys, ['Position', 'Gene'])
def testAnnotSVParser(self):
    """parse_annotsv should emit the expected prefixes, fieldnames and variant keys."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = [
        '0228T_CON_OPXv4_INT', '5437_E05_OPXv4_NA12878_MA0013',
        '6037_E05_OPXv4_NA12878_HA0201'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_annotsv(files, specimens, annotation, prefixes,
                              variant_keys, sort_order)
    self.assertListEqual(
        prefixes, ['0228T', '5437_NA12878', '6037_NA12878', 'Count'])
    self.assertListEqual(fieldnames, [
        'Event1', 'Event2', 'Gene1', 'Gene2', 'NM', '0228T',
        '5437_NA12878', '6037_NA12878', 'Count'
    ])
    self.assertListEqual(variant_keys,
                         ['Event1', 'Event2', 'Gene1', 'Gene2', 'NM'])
    self.assertEqual(len(specimens), 19)
def testQualityMetricsParser(self):
    """parse_qualitymetrics should extract the expected keys and values."""
    variant_keys = []
    # Use a context manager so the metrics file is closed even if an
    # assertion fails (the original leaked the open handle).
    with open(
            path.join(qualityfiles,
                      '6037_E05_OPXv4_NA12878_HA0201.quality_metrics'),
            'rU') as metrics_file:
        lines = metrics_file.readlines()
    output_dict, variant_keys = parsers.parse_qualitymetrics(
        lines, variant_keys)
    self.assertListEqual(
        sorted(variant_keys),
        sorted([
            'UNPAIRED_READS_EXAMINED', 'READ_PAIRS_EXAMINED',
            'UNMAPPED_READS', 'UNPAIRED_READ_DUPLICATES',
            'READ_PAIR_DUPLICATES', 'READ_PAIR_OPTICAL_DUPLICATES',
            'PERCENT_DUPLICATION', 'ESTIMATED_LIBRARY_SIZE'
        ]))
    self.assertDictContainsSubset({'PERCENT_DUPLICATION': '0.130625'},
                                  output_dict)
def testHotSpotFlaggedParser(self):
    """parse_hotspot_flagged should emit expected columns and per-variant status/VAF calls."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = [
        '6037_E05_OPXv4_NA12878_HA0201', '0228T_CON_OPXv4_INT',
        '5437_E05_OPXv4_NA12878_MA0013'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_hotspot_flagged(files, specimens, annotation,
                                      prefixes, variant_keys, sort_order)
    self.assertListEqual(prefixes, [
        '6037_NA12878_Variants|Total', '6037_NA12878_VAF',
        '6037_NA12878_Status', '0228T_Variants|Total', '0228T_VAF',
        '0228T_Status', '5437_NA12878_Variants|Total', '5437_NA12878_VAF',
        '5437_NA12878_Status'
    ])
    self.assertListEqual(fieldnames, [
        'Position', 'Ref_Base', 'Var_Base', 'Clinically_Flagged',
        '6037_NA12878_Variants|Total', '6037_NA12878_VAF',
        '6037_NA12878_Status', '0228T_Variants|Total', '0228T_VAF',
        '0228T_Status', '5437_NA12878_Variants|Total', '5437_NA12878_VAF',
        '5437_NA12878_Status'
    ])
    self.assertListEqual(variant_keys, ['Position', 'Ref_Base', 'Var_Base'])
    self.assertEqual(specimens[('chr7:55259524', 'T', 'A')]['0228T_Status'],
                     'REVIEW')  #less than 100 reads
    self.assertEqual(specimens[('chr3:37034946', 'G', 'A')]['0228T_Status'],
                     'HET')  #.2-.7
    self.assertEqual(
        specimens[('chr2:215661788', 'C', 'T')]['0228T_Status'],
        'H**O')  #>.7
    self.assertEqual(
        specimens[('chr13:32936674', 'C', 'T')]['0228T_Status'],
        'NEG')  #<.1
    self.assertEqual(specimens[('chr7:55259524', 'T', 'A')]['0228T_VAF'],
                     '0.0000')
    self.assertEqual(specimens[('chr3:37034946', 'G', 'A')]['0228T_VAF'],
                     '0.5062')
    self.assertEqual(specimens[('chr2:215661788', 'C', 'T')]['0228T_VAF'],
                     '0.9924')
    self.assertEqual(specimens[('chr13:32936674', 'C', 'T')]['0228T_VAF'],
                     '0.0163')
def testHotSpotFlaggedParser(self):
    """parse_hotspot_flagged should emit expected columns and per-variant status calls."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = [
        '6037_E05_OPXv4_NA12878_HA0201', '0228T_CON_OPXv4_INT',
        '5437_E05_OPXv4_NA12878_MA0013'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_hotspot_flagged(files, specimens, annotation,
                                      prefixes, variant_keys, sort_order)
    self.assertListEqual(prefixes, [
        '6037_NA12878_Variants|Total', '6037_NA12878_Status',
        '0228T_Variants|Total', '0228T_Status',
        '5437_NA12878_Variants|Total', '5437_NA12878_Status'
    ])
    self.assertListEqual(fieldnames, [
        'Position', 'Ref_Base', 'Var_Base', 'Clinically_Flagged',
        '6037_NA12878_Variants|Total', '6037_NA12878_Status',
        '0228T_Variants|Total', '0228T_Status',
        '5437_NA12878_Variants|Total', '5437_NA12878_Status'
    ])
    self.assertListEqual(variant_keys, ['Position', 'Ref_Base', 'Var_Base'])
    self.assertEqual(specimens[('chr7:55259524', 'T', 'A')]['0228T_Status'],
                     'REVIEW')  #less than 100 reads
    self.assertEqual(specimens[('chr3:37034946', 'G', 'A')]['0228T_Status'],
                     'HET')  #.2-.7
    self.assertEqual(
        specimens[('chr2:215661788', 'C', 'T')]['0228T_Status'],
        'H**O')  #>.7
    self.assertEqual(
        specimens[('chr13:32936674', 'C', 'T')]['0228T_Status'],
        'NEG')  #<.1
def testHSParser(self):
    """parse_hsmetrics should extract the expected keys and values."""
    variant_keys = []
    # Use a context manager so the metrics file is closed even if an
    # assertion fails (the original leaked the open handle).
    with open(
            path.join(qualityfiles,
                      '6037_E05_OPXv4_NA12878_HA0201.hs_metrics'),
            'rU') as metrics_file:
        lines = metrics_file.readlines()
    output_dict, variant_keys = parsers.parse_hsmetrics(lines, variant_keys)
    self.assertListEqual(
        sorted(variant_keys),
        sorted([
            'PF_READS', 'PF_UNIQUE_READS', 'PCT_PF_UQ_READS',
            'PF_UQ_READS_ALIGNED', 'PCT_SELECTED_BASES', 'PCT_OFF_BAIT',
            'MEAN_TARGET_COVERAGE', 'PCT_USABLE_BASES_ON_TARGET',
            'ZERO_CVG_TARGETS_PCT', 'AT_DROPOUT', 'GC_DROPOUT'
        ]))
    self.assertDictContainsSubset({'MEAN_TARGET_COVERAGE': '614.820203'},
                                  output_dict)
def testHSParser(self):
    """parse_hsmetrics should extract the expected keys and values."""
    variant_keys = []
    # Use a context manager so the metrics file is closed even if an
    # assertion fails (the original leaked the open handle).
    with open(
            path.join(qualityfiles,
                      '6037_E05_OPXv4_NA12878_HA0201.hs_metrics'),
            'rU') as metrics_file:
        lines = metrics_file.readlines()
    output_dict, variant_keys = parsers.parse_hsmetrics(
        lines, variant_keys)
    self.assertListEqual(
        sorted(variant_keys),
        sorted([
            'PF_READS', 'PF_UNIQUE_READS', 'PCT_PF_UQ_READS',
            'PF_UQ_READS_ALIGNED', 'PCT_SELECTED_BASES', 'PCT_OFF_BAIT',
            'MEAN_TARGET_COVERAGE', 'PCT_USABLE_BASES_ON_TARGET',
            'ZERO_CVG_TARGETS_PCT', 'AT_DROPOUT', 'GC_DROPOUT'
        ]))
    self.assertDictContainsSubset({'MEAN_TARGET_COVERAGE': '614.820203'},
                                  output_dict)
def testQualityMetricsParser(self):
    """parse_qualitymetrics should extract the expected keys and values."""
    variant_keys = []
    # Use a context manager so the metrics file is closed even if an
    # assertion fails (the original leaked the open handle).
    with open(
            path.join(qualityfiles,
                      '6037_E05_OPXv4_NA12878_HA0201.quality_metrics'),
            'rU') as metrics_file:
        lines = metrics_file.readlines()
    output_dict, variant_keys = parsers.parse_qualitymetrics(
        lines, variant_keys)
    self.assertListEqual(
        sorted(variant_keys),
        sorted([
            'UNPAIRED_READS_EXAMINED', 'READ_PAIRS_EXAMINED',
            'UNMAPPED_READS', 'UNPAIRED_READ_DUPLICATES',
            'READ_PAIR_DUPLICATES', 'READ_PAIR_OPTICAL_DUPLICATES',
            'PERCENT_DUPLICATION', 'ESTIMATED_LIBRARY_SIZE'
        ]))
    self.assertDictContainsSubset({'PERCENT_DUPLICATION': '0.130625'},
                                  output_dict)
def action(args):
    """Merge run-level amplicon coverage with the amplicon BED and write
    a top-level table plus one per-sample Amplicon_Analysis.txt file."""
    #get run-amplicon file
    files = ifilter(filters.amplicon_coverage, walker(args.run_metrics_dir))
    files = sorted(files)
    # `is 1` compared object identity and only worked by accident of
    # CPython small-int caching; use numeric equality.
    assert len(files) == 1, 'expected exactly one amplicon coverage file'
    for pth in files:
        with open(path.join(pth.dir, pth.fname)) as fname:
            run_metrics = pd.read_csv(fname, delimiter='\t')
    #the target column header is empty, so add a name
    run_metrics.rename(columns={'Unnamed: 0': 'Target'}, inplace=True)
    #clean up the metrics file to remove empty columns
    clean_metrics = run_metrics.dropna(axis='columns', how='all')
    #Grab samples from file, removing 'Target' columns
    sample_names = list(clean_metrics.columns.values)
    sample_names.remove('Target')
    #grab amplicon bed
    amplicons = pd.read_csv(args.amplicons)
    #merge metrics on the amplicon targets
    merged = pd.merge(amplicons,
                      clean_metrics,
                      how='inner',
                      left_on='Target',
                      right_on='Target')
    #Print top-level-output
    merged.to_csv(args.top_output, index=False, sep='\t')
    #Print sample level output
    for sample in sample_names:
        pfx = sample.replace('_', '-') + '-' + args.project
        header = ['Target', 'Gene', 'Position']
        header.append(sample)
        sample_info = merged[header]
        #Expected : project/pfx/pfx.Amplicon_Analysis.txt
        outdir = path.join('output', pfx)
        if not path.exists(outdir):
            makedirs(outdir)
        sample_out = path.join(outdir, pfx + '.Amplicon_Analysis.txt')
        sample_info.to_csv(sample_out, index=False, sep='\t')
def testCoverageKitParser(self):
    """parse_coveragekit should emit the expected prefixes, fieldnames and variant keys."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = ['NA12878-HP998-HHv3', 'OCIAML3-HP998-HHv3']
    files = ifilter(filters.exon_coverage_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_coveragekit(files, specimens, annotation, prefixes,
                                  variant_keys, sort_order)
    self.assertListEqual(prefixes, [
        'NA12878-HP998-HHv3_AveCoverage', 'OCIAML3-HP998-HHv3_AveCoverage'
    ])
    self.assertListEqual(fieldnames, [
        'RegionID', 'Position', 'NA12878-HP998-HHv3_AveCoverage',
        'OCIAML3-HP998-HHv3_AveCoverage'
    ])
    self.assertListEqual(variant_keys, ['RegionID', 'Position'])
def testQualityParser(self):
    """parse_quality should emit the expected prefixes, fieldnames and variant keys."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = [
        '0228T_CON_OPXv4_INT', '5437_E05_OPXv4_NA12878_MA0013',
        '6037_E05_OPXv4_NA12878_HA0201'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_quality(files, specimens, annotation, prefixes,
                              variant_keys, sort_order)
    self.assertListEqual(
        fieldnames,
        ['MEAN_TARGET_COVERAGE', '0228T', '5437_NA12878', '6037_NA12878'])
    self.assertListEqual(variant_keys, ['MEAN_TARGET_COVERAGE'])
    self.assertListEqual(prefixes, ['0228T', '5437_NA12878', '6037_NA12878'])
def testBreakDancerParser(self):
    """parse_breakdancer should emit the expected prefixes, fieldnames and variant keys."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = [
        '0228T_CON_OPXv4_INT', '5437_E05_OPXv4_NA12878_MA0013',
        '6037_E05_OPXv4_NA12878_HA0201'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_breakdancer(files, specimens, annotation, prefixes,
                                  variant_keys, sort_order)
    self.assertListEqual(prefixes, ['0228T', '6037_NA12878', 'Count'])
    self.assertListEqual(fieldnames, [
        'Event_1', 'Event_2', 'Type', 'Size', 'Gene_1', 'Gene_2', '0228T',
        '6037_NA12878', 'Count'
    ])
    self.assertListEqual(variant_keys, ['Event_1', 'Event_2'])
def testSNPParser(self):
    """Test for correct fieldname parsing by parse_snp."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = [
        '5437_E05_OPXv4_NA12878_MA0013', '6037_E05_OPXv4_NA12878_HA0201',
        '0228T_CON_OPXv4_INT'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_snp(files, specimens, annotation, prefixes,
                          variant_keys, sort_order)
    self.assertListEqual(prefixes, [
        '5437_NA12878_Ref|Var', '6037_NA12878_Ref|Var', '0228T_Ref|Var',
        'Count'
    ])
    self.assertListEqual(fieldnames, [
        'Position', 'Ref_Base', 'Var_Base', 'Gene', 'Variant_Type',
        'Transcripts', 'Clinically_Flagged', 'Cosmic', 'Segdup',
        'Polyphen', 'Sift', 'Mutation_Taster', 'Gerp', 'UW_Freq',
        'UW_Count', 'UW_DEC_p', '1000g_ALL', 'EVS_esp6500_ALL',
        '1000g_AMR', 'EVS_esp6500_AA', '1000g_EUR', 'EVS_esp6500_EU',
        '1000g_SAS', '1000g_EAS', '1000g_AFR', 'ADA_Alter_Splice',
        'RF_Alter_Splice', '5437_NA12878_Ref|Var', '6037_NA12878_Ref|Var',
        '0228T_Ref|Var', 'Count'
    ])
    self.assertListEqual(variant_keys, ['Position', 'Ref_Base', 'Var_Base'])
def testCNVExonParser(self):
    """parse_cnv_exon should emit the expected prefixes, fieldnames and variant keys."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = [
        '0228T_CON_OPXv4_INT', '5437_E05_OPXv4_NA12878_MA0013',
        '6037_E05_OPXv4_NA12878_HA0201'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_cnv_exon(files, specimens, annotation, prefixes,
                               variant_keys, sort_order)
    self.assertListEqual(
        prefixes, ['0228T_Log', '5437_NA12878_Log', '6037_NA12878_Log'])
    self.assertListEqual(fieldnames, [
        'Position', 'Gene', 'Transcripts', '0228T_Log', '5437_NA12878_Log',
        '6037_NA12878_Log'
    ])
    self.assertListEqual(variant_keys, ['Position', 'Gene'])
def action(args):
    """Merge run-level amplicon coverage with the amplicon BED and write
    a top-level table plus one per-sample Amplicon_Analysis.txt file."""
    #get run-amplicon file
    files = ifilter(filters.amplicon_coverage, walker(args.run_metrics_dir))
    files = sorted(files)
    # `is 1` compared object identity and only worked by accident of
    # CPython small-int caching; use numeric equality.
    assert len(files) == 1, 'expected exactly one amplicon coverage file'
    for pth in files:
        with open(path.join(pth.dir, pth.fname)) as fname:
            run_metrics = pd.read_csv(fname, delimiter='\t')
    #the target column header is empty, so add a name
    run_metrics.rename(columns={'Unnamed: 0': 'Target'}, inplace=True)
    #clean up the metrics file to remove empty columns
    clean_metrics = run_metrics.dropna(axis='columns', how='all')
    #Grab samples from file, removing 'Target' columns
    sample_names = list(clean_metrics.columns.values)
    sample_names.remove('Target')
    #grab amplicon bed
    amplicons = pd.read_csv(args.amplicons)
    #merge metrics on the amplicon targets
    merged = pd.merge(amplicons,
                      clean_metrics,
                      how='inner',
                      left_on='Target',
                      right_on='Target')
    #Print top-level-output
    merged.to_csv(args.top_output, index=False, sep='\t')
    #Print sample level output
    for sample in sample_names:
        pfx = sample.replace('_', '-') + '-' + args.project
        header = ['Target', 'Gene', 'Position']
        header.append(sample)
        sample_info = merged[header]
        #Expected : project/pfx/pfx.Amplicon_Analysis.txt
        outdir = path.join('output', pfx)
        if not path.exists(outdir):
            makedirs(outdir)
        sample_out = path.join(outdir, pfx + '.Amplicon_Analysis.txt')
        sample_info.to_csv(sample_out, index=False, sep='\t')
def action(args):
    """Concatenate per-sample polyhunter result files into one table,
    ordered by the pipeline manifest, and write it to args.outfile."""
    #Grab all analysis files from the path
    files = ifilter(filters.any_analysis, walker(args.path))
    files = filter(filters.polyhunter_analysis, files)
    df = pd.DataFrame()
    # Sample ordering comes from the pipeline manifest.
    sort_order = [
        x['barcode_id'] for x in csv.DictReader(args.pipeline_manifest)
    ]
    for sample in sort_order:
        #Grab the file for each sample, in specified sort order
        pfx_file = [s for s in files if sample in s.fname]
        if pfx_file:
            pfx_file = pfx_file[0]
            pfx = munge_pfx(pfx_file.fname)
            data = pd.read_csv(os.path.join(pfx_file.dir, pfx_file.fname),
                               sep='\t')
            # Index every row of this sample's data by its short prefix.
            data.index = [pfx['mini-pfx']] * len(data)
            df = df.append(data)
    # Natural sort keeps e.g. col2 before col10 in the output.
    cols = natsorted(df.columns)
    df.to_csv(args.outfile, sep='\t', na_rep='0', columns=cols)
def testAmpliconParser(self):
    """parse_amplicon should emit the expected prefixes, fieldnames and variant keys."""
    specimens = defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = []
    sort_order = [
        '0228T_CON_OPXv4_INT', '5437_E05_OPXv4_NA12878_MA0013',
        '6037_E05_OPXv4_NA12878_HA0201'
    ]
    files = ifilter(filters.any_analysis, walker(testfiles))
    # Call the parser directly rather than eval()-ing a built-up string.
    specimens, annotation, prefixes, fieldnames, variant_keys = \
        parsers.parse_amplicon(files, specimens, annotation, prefixes,
                               variant_keys, sort_order)
    self.assertListEqual(prefixes,
                         ['0228T', '5437_NA12878', '6037_NA12878'])
    self.assertListEqual(
        fieldnames,
        ['Position', 'Probe', '0228T', '5437_NA12878', '6037_NA12878'])
    self.assertListEqual(variant_keys, ['Position', 'Probe'])
    #Should have 753 entries for hhv3, the only assay running this parser
    self.assertEqual(len(specimens), 753)
def action(args):
    """Combine pindel analysis files into one tab-delimited table keyed
    on Position and Gene, one column per specimen prefix."""
    candidate_files = ifilter(filters.any_analysis, walker(args.path))
    candidate_files = ifilter(filters.pindel_analysis, candidate_files)
    #sort the files so that the output in the workbook is sorted
    pindel_files = sorted(candidate_files)
    variant_keys = ['Position', 'Gene']
    specimens, annotation, prefixes = parse_pindel(variant_keys,
                                                   pindel_files, args.path)
    annotation_headers = [
        'Gene_Region',
        'Event_Type',
        'Size',
        'Transcripts',
    ]
    writer = csv.DictWriter(args.outfile,
                            fieldnames=variant_keys + annotation_headers + prefixes,
                            extrasaction='ignore',
                            delimiter='\t')
    writer.writeheader()
    # One output row per variant: key columns, per-specimen values, then
    # the shared annotation.
    for variant in sorted(specimens.keys()):
        out_row = dict(zip(variant_keys, variant))
        for pfx in prefixes:
            out_row[pfx] = specimens[variant].get(pfx)
        out_row.update(annotation[variant])
        writer.writerow(out_row)
def action(args): specimens = collections.defaultdict(dict) annotation = {} prefixes = [] variant_keys = [] #Get sort order from pipeline manifest. For TGC, this is alpha numeric. For others it is not. sort_order = [ x['barcode_id'] for x in csv.DictReader(args.pipeline_manifest) ] files = ifilter(filters.any_analysis, walker(args.path)) if args.type == 'indel': parser_type = 'snp' elif args.type == 'exon_cov': parser_type = 'coveragekit' files = list(filter(filters.exon_coverage_analysis, files)) elif args.type == 'gene_cov': parser_type = 'coveragekit' files = list(filter(filters.gene_coverage_analysis, files)) else: parser_type = args.type analysis_type = '_'.join(['parsers.parse', parser_type]) print "analysis type:", analysis_type chosen_parser = '{}(files, specimens, annotation, prefixes, variant_keys, sort_order)'.format( analysis_type) specimens, annotation, prefixes, fieldnames, variant_keys = eval( chosen_parser) writer = csv.DictWriter(args.outfile, fieldnames=fieldnames, extrasaction='ignore', delimiter='\t') writer.writeheader() for variant in sorted(specimens.keys()): d = {k: v for k, v in zip(variant_keys, variant)} d.update({pfx: specimens[variant].get(pfx) for pfx in prefixes}) d.update(annotation[variant]) writer.writerow(d)
def action(args): specimens = collections.defaultdict(dict) annotation = {} prefixes = [] variant_keys = [] #Get sort order from pipeline manifest. For TGC, this is alpha numeric. For others it is not. sort_order = [x['barcode_id'] for x in csv.DictReader(args.pipeline_manifest)] files = ifilter(filters.any_analysis, walker(args.path)) if args.type == 'indel': parser_type = 'snp' else: parser_type = args.type analysis_type='_'.join(['parsers.parse',parser_type]) print "analysis type:",analysis_type chosen_parser='{}(files, specimens, annotation, prefixes, variant_keys, sort_order)'.format(analysis_type) specimens, annotation, prefixes, fieldnames, variant_keys=eval(chosen_parser) writer = csv.DictWriter(args.outfile, fieldnames = fieldnames, extrasaction = 'ignore', delimiter = '\t') writer.writeheader() for variant in sorted(specimens.keys()): d = {k:v for k,v in zip(variant_keys,variant)} d.update({pfx:specimens[variant].get(pfx) for pfx in prefixes}) d.update(annotation[variant]) writer.writerow(d)
def action(args): specimens = collections.defaultdict(dict) annotation = {} prefixes = [] # apply a series of filters to files files = ifilter(filters.any_analysis, walker(args.path)) if args.type == "Exon": files = ifilter(filters.cnv_exon_analysis, files) elif args.type == "Gene": files = ifilter(filters.cnv_gene_analysis, files) variant_keys = ["Position", "Gene"] # sort the files so that the output in the workbook is sorted for pth in files: pfx = munge_pfx(pth.fname) log_pfx = pfx["mini-pfx"] + "_Log" prefixes.append(log_pfx) with open(os.path.join(pth.dir, pth.fname)) as fname: print pth.fname reader = csv.DictReader(fname, delimiter="\t") for row in reader: variant = tuple(row[k] for k in variant_keys) specimens[variant][log_pfx] = row["Ave_Adjusted_Log_Ratio"] annotation[variant] = row annotation_headers = ["Transcripts"] writer = csv.DictWriter( args.outfile, fieldnames=variant_keys + annotation_headers + prefixes, extrasaction="ignore", delimiter="\t" ) writer.writeheader() for variant in sorted(specimens.keys()): d = {k: v for k, v in zip(variant_keys, variant)} d.update({pfx: specimens[variant].get(pfx) for pfx in prefixes}) d.update(annotation[variant]) writer.writerow(d)
def testClinFlaggedParser(self): specimens = defaultdict(dict) annotation = {} prefixes = [] variant_keys = [] sort_order = [ '0228T_CON_OPXv4_INT', '5437_E05_OPXv4_NA12878_MA0013', '6037_E05_OPXv4_NA12878_HA0201' ] analysis_type = 'parsers.parse_clin_flagged' files = ifilter(filters.any_analysis, walker(testfiles)) chosen_parser = '{}(files, specimens, annotation, prefixes, variant_keys, sort_order)'.format( analysis_type) specimens, annotation, prefixes, fieldnames, variant_keys = eval( chosen_parser) self.assertListEqual(prefixes, [ '0228T_Variants', '5437_NA12878_Variants', '6037_NA12878_Variants' ]) self.assertListEqual(fieldnames, [ 'Position', 'Ref_Base', 'Var_Base', 'Clinically_Flagged', '0228T_Variants', '5437_NA12878_Variants', '6037_NA12878_Variants' ]) self.assertListEqual(variant_keys, ['Position', 'Ref_Base', 'Var_Base'])
def action(args):
    """Build an offline plotly page with a stacked on/off-target read bar
    chart and a QC summary table from the per-sample hs metrics files."""
    #read in the data, adding the name into the df and skipping the 'version'
    filelist = ifilter(filters.hs_file_finder, walker(args.path))
    df_list = []
    pd.set_option('display.width', 100)
    for pfx_file in filelist:
        pfx = munge_pfx(pfx_file.fname)
        log_pfx = pfx['mini-pfx']
        data = pd.read_csv(os.path.join(pfx_file.dir, pfx_file.fname),
                           sep='\t',
                           comment='#',
                           error_bad_lines=False).assign(SAMPLE=log_pfx)
        # Only the first metrics row per sample is wanted.
        df_list.append(data[0:1])
    #concatenate them together
    big_df = pd.concat(df_list, ignore_index=True)
    #now, lets grab just the data we want; .copy() so the derived-column
    #assignments below modify a real frame, not a view (avoids pandas
    #SettingWithCopy warnings)
    qc_df = big_df[[
        'SAMPLE',
        'MEAN_TARGET_COVERAGE',
        # 'MEDIAN_TARGET_COVERAGE','MAX_TARGET_COVERAGE',
        'PCT_USABLE_BASES_ON_TARGET',
        'PF_UNIQUE_READS',
    ]].copy()
    #Setup the values we wish to plot
    qc_df['On Target Reads'] = qc_df['PF_UNIQUE_READS'] * qc_df[
        'PCT_USABLE_BASES_ON_TARGET']
    qc_df['Off Target Reads'] = qc_df['PF_UNIQUE_READS'] - qc_df[
        'On Target Reads']
    qc_df['On Target Reads'] = qc_df['On Target Reads'].astype(int)
    qc_df['Off Target Reads'] = qc_df['Off Target Reads'].astype(int)
    qc_df = qc_df.sort_values(by=['SAMPLE'])
    #Setup the plot
    data1 = go.Bar(
        x=qc_df['SAMPLE'],  # assign x as the dataframe column 'x'
        y=qc_df['Off Target Reads'],
        name='Off Target Reads',
        xaxis='x1',
        yaxis='y1')
    data2 = go.Bar(x=qc_df['SAMPLE'],
                   y=qc_df['On Target Reads'],
                   name='On Target Reads',
                   xaxis='x1',
                   yaxis='y1')
    layout = {
        'title': 'QC Metrics',
        'xaxis': {
            'type': 'category',  #required so mini sample names are strings instead of numbers
            'domain': [0, 1]
        },
        'yaxis': {
            'hoverformat': ',f',  #print real numbers, not 149.786k
            'domain': [.7, 1]
        },  #only take bottom portion of screen},
        'barmode': 'stack'
    }
    #setup table
    table = go.Table(
        header=dict(values=[
            'Sample ID', 'Mean Target Coverage', 'Total Read Pairs',
            'On Target Reads', 'Off Target Reads'
        ],
                    line=dict(color='#7D7F80'),
                    fill=dict(color='#a1c3d1'),
                    align=['left'] * 5),
        cells=dict(
            values=[
                qc_df['SAMPLE'], qc_df['MEAN_TARGET_COVERAGE'],
                qc_df['PF_UNIQUE_READS'], qc_df['On Target Reads'],
                qc_df['Off Target Reads']
            ],
            line=dict(color='#7D7F80'),
            fill=dict(color=[
                'rgb(245,245,245)',  #color for the first column, red if Target Coverage below 500
                [
                    'rgba(0,250,0, 0.8)'
                    if val >= 500 else 'rgba(250,0,0, 0.8)'
                    for val in qc_df['MEAN_TARGET_COVERAGE']
                ]
            ]),
            align=['left'] * 5),
        domain=dict(
            x=[0, 1],  #above/belowe
            y=[0, .5]))
    #Make the plot
    fig = go.Figure(data=[data1, data2, table], layout=layout)
    # BUG FIX: the original passed `auto_open=false` (lowercase), which
    # raises NameError at runtime; the boolean constant is `False`.
    plotly.offline.plot(fig, filename=args.outfile, auto_open=False)