def testSplitStringinTwo(self):
    """Test splitting a string into two on | or ,.

    split_string_in_two returns ('-1', '-1') when the input is None.
    Used to split columns of data in summary.
    """
    result01 = split_string_in_two("1|99")
    result02 = split_string_in_two(None)
    # assertEqual, not the deprecated assertEquals alias
    # (removed in Python 3.12)
    self.assertEqual(result01[0], '1')
    self.assertEqual(result01[1], '99')
    self.assertEqual(result02[0], '-1')
    self.assertEqual(result02[1], '-1')
def testSplitStringinTwo(self):
    """Test splitting a string into two on | or ,.

    ann.split_string_in_two returns ('-1', '-1') when the input is None.
    Used to split columns of data in summary.
    """
    result01 = ann.split_string_in_two("1|99")
    result02 = ann.split_string_in_two(None)
    # assertEqual, not the deprecated assertEquals alias
    # (removed in Python 3.12)
    self.assertEqual(result01[0], '1')
    self.assertEqual(result01[1], '99')
    self.assertEqual(result02[0], '-1')
    self.assertEqual(result02[1], '-1')
def action(args):
    """Aggregate per-variant annotation files into one tab-delimited summary.

    Builds a RefSeq preferred-transcript lookup from args.RefSeqs, merges the
    columns of every recognized input file keyed on variant position
    (chrm:str:stp:ref:var), then writes one row per variant with munged and
    defaulted fields.

    args.type selects the file-type table (SNP/INDEL/PINDEL) and, for PINDEL,
    a reduced header set. args.strict exits on an unrecognized file type.
    """
    (infiles, ) = args.infiles

    # Map transcript accession without its version -> full versioned accession,
    # e.g. 'NM_000059' -> 'NM_000059.3'.
    RefSeqs = {}
    if args.RefSeqs:
        refs = csv.DictReader(args.RefSeqs, delimiter='\t')
        for row in refs:
            if row['RefSeq']:
                for transcript in row['RefSeq'].split('/'):
                    RefSeqs[transcript.split('.')[0]] = transcript

    # Output column order; variant_headers[3:5] supplies the two base columns
    # (presumably Ref_Base/Var_Base — defined elsewhere in this module).
    headers = ['Position'] + variant_headers[3:5] + [
        'Clinically_Flagged', 'Variant_Type', 'UW_Freq', 'UW_DEC_p',
        'Filter', '1000g_ALL', 'EVS_esp6500_ALL', 'EXAC', 'Gene', 'p.',
        'c.', 'Faves_Y/N', 'Ref_Reads', 'Var_Reads', 'Allele_Frac',
        'Variant_Phred', 'Cosmic', 'CADD', 'ClinVar', 'Polyphen', 'Sift',
        'Mutation_Taster', 'Gerp', '1000g_AMR', '1000g_EUR', '1000g_SAS',
        '1000g_EAS', '1000g_AFR', 'EVS_esp6500_AA', 'EVS_esp6500_EU',
        'Transcripts', 'Zygosity', 'Segdup', 'NCI60', 'dbSNP_ID',
        'UW_Count', 'GATK_Score', 'ADA_Alter_Splice', 'RF_Alter_Splice',
    ]
    if args.type == 'PINDEL':
        # PINDEL output carries far fewer annotations.
        headers = ['Position'] + variant_headers[3:5] + [
            'Clinically_Flagged', 'Variant_Type', 'UW_Freq', 'Gene', 'p.',
            'c.', 'Faves_Y/N', 'Ref_Reads', 'Var_Reads', 'Allele_Frac',
            'Transcripts', 'UW_Count',
        ]

    writer = csv.DictWriter(args.outfile,
                            fieldnames=headers,
                            quoting=csv.QUOTE_MINIMAL,
                            extrasaction='ignore',
                            delimiter='\t')
    writer.writeheader()

    # args.type does not change between files, so pick the file-type table
    # once instead of re-testing it inside the loop.
    if args.type == 'SNP':
        file_types = snp_file_types
    elif args.type == 'INDEL':
        file_types = indel_file_types
    elif args.type == 'PINDEL':
        file_types = pindel_file_types

    # accumulate data from all input files for each variant
    output = defaultdict(dict)
    for fname in infiles:
        try:
            _, file_type = path.basename(fname).split('.', 1)
        except ValueError:
            # no extension: not an input we recognize
            continue
        try:
            # if the file type matches one we want,
            # header ids are output columns,
            # var_key_ids are chrm:str:stp:ref:var
            header_ids, var_key_ids = file_types[file_type]
        except KeyError:
            # 'dropped' files are expected leftovers; skip silently.
            if re.search('dropped', fname):
                continue
            log.warning('no match: %s' % fname)
            if args.strict:
                sys.exit(1)
            continue
        for var_key, data in map_headers(fname, header_ids, var_key_ids):
            if var_key in output:
                output[var_key] = merge_data(output[var_key], data)
            else:
                output[var_key].update(data)
            # 'in' instead of dict.has_key(), which was removed in Python 3.
            if 'Reads' in output[var_key] and 'Var_Reads' not in output[var_key]:
                (output[var_key]['Ref_Reads'],
                 output[var_key]['Var_Reads'],
                 output[var_key]['Variant_Phred']) = get_reads(
                     data.get('Read_Headers'), data.get('Reads'))

    def sort_key(row):
        # Genomic position first, then the alleles.
        return [row[k] for k in ['chr', 'start', 'stop', 'Ref_Base', 'Var_Base']]

    # write each row (with all data aggregated), modifying fields as necessary
    for data in sorted(output.values(), key=sort_key):
        variants = [data.get('var_type_2'), data.get('var_type_1')]
        data['Variant_Type'] = ','.join(filter(None, variants))
        data['Gene'], data['Transcripts'] = munge_gene_and_Transcripts(data, RefSeqs)
        data['c.'], data['p.'] = munge_transcript(data, RefSeqs)
        data['Polyphen'], data['Sift'], data['Mutation_Taster'], data['Gerp'] = munge_ljb_scores(data)
        data['dbSNP_ID'] = data.get('rsid_1') or data.get('rsid_2')
        # Population frequencies default to -1 when absent (or falsy).
        data['1000g_ALL'] = data.get('1000g_ALL') or -1
        data['1000g_AMR'] = data.get('1000g_AMR') or -1
        data['1000g_SAS'] = data.get('1000g_SAS') or -1
        data['1000g_EAS'] = data.get('1000g_EAS') or -1
        data['1000g_AFR'] = data.get('1000g_AFR') or -1
        data['1000g_EUR'] = data.get('1000g_EUR') or -1
        data['UW_DEC_p'] = data.get('UW_DEC_p') or -1
        # These fields may carry comma-separated values; keep the first.
        data['EXAC'] = data.get('EXAC').split(',')[0] if data.get('EXAC') else -1
        data['EVS_esp6500_ALL'] = data.get('EVS_esp6500_ALL').split(',')[0] if data.get('EVS_esp6500_ALL') else -1
        data['EVS_esp6500_AA'] = data.get('EVS_esp6500_AA').split(',')[0] if data.get('EVS_esp6500_AA') else -1
        data['EVS_esp6500_EU'] = data.get('EVS_esp6500_EU').split(',')[0] if data.get('EVS_esp6500_EU') else -1
        # CADD is raw score, phred score. We only care about phred.
        _, data['CADD'] = split_string_in_two(data.get('CADD'))
        data['ADA_Alter_Splice'], data['RF_Alter_Splice'] = split_string_in_two(data.get('splicing'))
        data['UW_Freq'], data['UW_Count'] = split_string_in_two(data.get('UW_Freq_list'))
        data['Allele_Frac'] = get_allele_freq(data)
        writer.writerow(data)
def action(args):
    """Aggregate per-variant annotation files into one tab-delimited summary.

    Builds a RefSeq preferred-transcript lookup from args.RefSeqs, merges the
    columns of every recognized input file keyed on variant position
    (chrm:str:stp:ref:var), then writes one row per variant with munged and
    defaulted fields.

    args.type selects the file-type table (SNP/INDEL/PINDEL) and, for PINDEL,
    a reduced header set. args.strict exits on an unrecognized file type.
    """
    (infiles, ) = args.infiles

    # Map transcript accession without its version -> full versioned accession,
    # e.g. 'NM_000059' -> 'NM_000059.3'.
    RefSeqs = {}
    if args.RefSeqs:
        refs = csv.DictReader(args.RefSeqs, delimiter='\t')
        for row in refs:
            if row['RefSeq']:
                for transcript in row['RefSeq'].split('/'):
                    RefSeqs[transcript.split('.')[0]] = transcript

    # Output column order; variant_headers[3:5] supplies the two base columns
    # (presumably Ref_Base/Var_Base — defined elsewhere in this module).
    headers = ['Position'] + variant_headers[3:5] + [
        'Clinically_Flagged', 'Variant_Type', 'UW_Freq', 'UW_DEC_p',
        'Filter', '1000g_ALL', 'EVS_esp6500_ALL', 'EXAC', 'Gene', 'p.',
        'c.', 'Faves_Y/N', 'Ref_Reads', 'Var_Reads', 'Allele_Frac',
        'Variant_Phred', 'Cosmic', 'CADD', 'ClinVar', 'Polyphen', 'Sift',
        'Mutation_Taster', 'Gerp', '1000g_AMR', '1000g_EUR', '1000g_SAS',
        '1000g_EAS', '1000g_AFR', 'EVS_esp6500_AA', 'EVS_esp6500_EU',
        'Transcripts', 'Zygosity', 'Segdup', 'NCI60', 'dbSNP_ID',
        'UW_Count', 'GATK_Score', 'ADA_Alter_Splice', 'RF_Alter_Splice',
    ]
    if args.type == 'PINDEL':
        # PINDEL output carries far fewer annotations.
        headers = ['Position'] + variant_headers[3:5] + [
            'Clinically_Flagged', 'Variant_Type', 'UW_Freq', 'Gene', 'p.',
            'c.', 'Faves_Y/N', 'Ref_Reads', 'Var_Reads', 'Allele_Frac',
            'Transcripts', 'UW_Count',
        ]

    writer = csv.DictWriter(args.outfile,
                            fieldnames=headers,
                            quoting=csv.QUOTE_MINIMAL,
                            extrasaction='ignore',
                            delimiter='\t')
    writer.writeheader()

    # args.type does not change between files, so pick the file-type table
    # once instead of re-testing it inside the loop.
    if args.type == 'SNP':
        file_types = snp_file_types
    elif args.type == 'INDEL':
        file_types = indel_file_types
    elif args.type == 'PINDEL':
        file_types = pindel_file_types

    # accumulate data from all input files for each variant
    output = defaultdict(dict)
    for fname in infiles:
        try:
            _, file_type = path.basename(fname).split('.', 1)
        except ValueError:
            # no extension: not an input we recognize
            continue
        try:
            # if the file type matches one we want,
            # header ids are output columns,
            # var_key_ids are chrm:str:stp:ref:var
            header_ids, var_key_ids = file_types[file_type]
        except KeyError:
            # 'dropped' files are expected leftovers; skip silently.
            if re.search('dropped', fname):
                continue
            log.warning('no match: %s' % fname)
            if args.strict:
                sys.exit(1)
            continue
        for var_key, data in map_headers(fname, header_ids, var_key_ids):
            if var_key in output:
                output[var_key] = merge_data(output[var_key], data)
            else:
                output[var_key].update(data)
            # 'in' instead of dict.has_key(), which was removed in Python 3.
            if 'Reads' in output[var_key] and 'Var_Reads' not in output[var_key]:
                (output[var_key]['Ref_Reads'],
                 output[var_key]['Var_Reads'],
                 output[var_key]['Variant_Phred']) = get_reads(
                     data.get('Read_Headers'), data.get('Reads'))

    def sort_key(row):
        # Genomic position first, then the alleles.
        return [row[k] for k in ['chr', 'start', 'stop', 'Ref_Base', 'Var_Base']]

    # write each row (with all data aggregated), modifying fields as necessary
    for data in sorted(output.values(), key=sort_key):
        variants = [data.get('var_type_2'), data.get('var_type_1')]
        data['Variant_Type'] = ','.join(filter(None, variants))
        data['Gene'], data['Transcripts'] = munge_gene_and_Transcripts(data, RefSeqs)
        data['c.'], data['p.'] = munge_transcript(data, RefSeqs)
        data['Polyphen'], data['Sift'], data['Mutation_Taster'], data['Gerp'] = munge_ljb_scores(data)
        data['dbSNP_ID'] = data.get('rsid_1') or data.get('rsid_2')
        # Population frequencies default to -1 when absent (or falsy).
        data['1000g_ALL'] = data.get('1000g_ALL') or -1
        data['1000g_AMR'] = data.get('1000g_AMR') or -1
        data['1000g_SAS'] = data.get('1000g_SAS') or -1
        data['1000g_EAS'] = data.get('1000g_EAS') or -1
        data['1000g_AFR'] = data.get('1000g_AFR') or -1
        data['1000g_EUR'] = data.get('1000g_EUR') or -1
        data['UW_DEC_p'] = data.get('UW_DEC_p') or -1
        # These fields may carry comma-separated values; keep the first.
        data['EXAC'] = data.get('EXAC').split(',')[0] if data.get('EXAC') else -1
        data['EVS_esp6500_ALL'] = data.get('EVS_esp6500_ALL').split(',')[0] if data.get('EVS_esp6500_ALL') else -1
        data['EVS_esp6500_AA'] = data.get('EVS_esp6500_AA').split(',')[0] if data.get('EVS_esp6500_AA') else -1
        data['EVS_esp6500_EU'] = data.get('EVS_esp6500_EU').split(',')[0] if data.get('EVS_esp6500_EU') else -1
        # CADD is raw score, phred score. We only care about phred.
        _, data['CADD'] = split_string_in_two(data.get('CADD'))
        data['ADA_Alter_Splice'], data['RF_Alter_Splice'] = split_string_in_two(data.get('splicing'))
        data['UW_Freq'], data['UW_Count'] = split_string_in_two(data.get('UW_Freq_list'))
        data['Allele_Frac'] = get_allele_freq(data)
        writer.writerow(data)
def action(args):
    """Aggregate per-variant annotation files into one tab-delimited summary.

    Older HiSeq/MiSeq variant of this command: builds a RefSeq
    preferred-transcript lookup from args.RefSeqs, merges the columns of every
    recognized input file keyed on variant position, then writes one row per
    variant with the munged/defaulted fields below.
    """
    (infiles, ) = args.infiles

    # Map transcript accession without its version -> full versioned
    # accession, e.g. 'NM_000059' -> 'NM_000059.3'.
    RefSeqs = {}
    if args.RefSeqs:
        refs = csv.DictReader(args.RefSeqs, delimiter='\t')
        for row in refs:
            if row['RefSeq'] :
                for transcript in row['RefSeq'].split('/'):
                    RefSeqs[transcript.split('.')[0]] = transcript

    # Output column order; variant_headers[3:5] supplies two base columns
    # (presumably Ref_Base/Var_Base — defined elsewhere in this module;
    # confirm).
    headers = ['Position'] + variant_headers[3:5] + [
        'Clinically_Flagged', 'Variant_Type', 'HiSeq_Freq', '1000g_ALL',
        'EVS_esp6500_ALL', 'Gene', 'p.', 'c.', 'Faves_Y/N', 'Ref_Reads',
        'Var_Reads', 'Allele_Freq', 'Variant_Phred', 'Cosmic', 'CADD',
        'ClinVar', 'Polyphen', 'Sift', 'Mutation_Taster', 'Gerp',
        '1000g_AMR', '1000g_EUR', '1000g_ASN', '1000g_AFR',
        'EVS_esp6500_AA', 'EVS_esp6500_EU', 'Transcripts', 'Zygosity',
        'Segdup', 'NCI60', 'dbSNP_ID', 'HiSeq_Count', 'MiSeq_Freq',
        'MiSeq_Count', 'GATK_Score'
    ]

    # accumulate data from all input files for each variant
    output = defaultdict(dict)
    for fname in infiles:
        try:
            _, file_type = path.basename(fname).split('.', 1)
        except ValueError:
            # no extension: not an input we recognize
            continue
        try:
            #if the file type matches one we want,
            #header ids are output columns
            #var_key_ids are chrm:str:stp:ref:var
            # NOTE(review): file_types is not defined in this function —
            # presumably a module-level table; confirm.
            header_ids, var_key_ids = file_types[file_type]
        except KeyError:
            # NOTE(review): here the warning/strict-exit fires only for
            # filenames containing 'dropped'; the newer SNP/INDEL variant of
            # this function does the opposite (skips 'dropped' silently and
            # warns on everything else) — confirm which nesting is intended.
            if re.search('dropped', fname):
                log.warning('no match: %s' % fname)
                if args.strict:
                    sys.exit(1)
            continue
        # Later files overwrite overlapping keys for the same variant.
        for var_key, data in map_headers(fname, header_ids, var_key_ids):
            output[var_key].update(data)

    writer = csv.DictWriter(args.outfile, fieldnames=headers,
                            quoting=csv.QUOTE_MINIMAL,
                            extrasaction='ignore', delimiter='\t')
    writer.writeheader()

    # Order rows by genomic position.
    sort_key = lambda row: [(row[k]) for k in ['chr', 'start', 'stop']]

    # write each row (with all data aggregated), modifying fields as necessary
    for data in sorted(output.values(), key=sort_key):
        #
        # modify any specific fields here
        data['Variant_Type'] = data.get('var_type_2') if data.get('var_type_2', '').strip() else data.get('var_type_1')
        data['Gene'], data['Transcripts'] = munge_gene_and_Transcripts(data, RefSeqs)
        data['c.'], data['p.'] = munge_transcript(data, RefSeqs)
        data['dbSNP_ID'] = data.get('rsid_1') or data.get('rsid_2')
        # Population frequencies default to -1 when absent (or falsy).
        data['1000g_ALL'] = data.get('1000g_ALL') or -1
        data['1000g_AMR'] = data.get('1000g_AMR') or -1
        data['1000g_ASN'] = data.get('1000g_ASN') or -1
        data['1000g_AFR'] = data.get('1000g_AFR') or -1
        data['1000g_EUR'] = data.get('1000g_EUR') or -1
        data['EVS_esp6500_ALL'] = data.get('EVS_esp6500_ALL') or -1
        data['EVS_esp6500_AA'] = data.get('EVS_esp6500_AA') or -1
        data['EVS_esp6500_EU'] = data.get('EVS_esp6500_EU') or -1
        #CADD is raw score, phred score. We only care about phred
        _, data['CADD'] = split_string_in_two(data.get('CADD'))
        data['Ref_Reads'], data['Var_Reads'], data['Variant_Phred'] = get_reads(data.get('Reads'))
        data['MiSeq_Freq'], data['MiSeq_Count'] = split_string_in_two(data.get('Mi_Freq_list'))
        data['HiSeq_Freq'], data['HiSeq_Count'] = split_string_in_two(data.get('Hi_Freq_list'))
        data['Allele_Freq'] = get_allele_freq(data)
        writer.writerow(data)