Example #1
0
 def testSplitStringinTwo(self):
     """
     Test splitting a string into two parts.

     Exercises split_string_in_two with a "|"-separated value and with
     None; the None case is expected to yield '-1' for both parts.
     The helper is used to split columns of data in the summary.
     """
     result01 = split_string_in_two("1|99")
     result02 = split_string_in_two(None)
     # assertEqual replaces the deprecated assertEquals alias, which
     # no longer exists in Python 3.12+.
     self.assertEqual(result01[0], '1')
     self.assertEqual(result01[1], '99')
     self.assertEqual(result02[0], '-1')
     self.assertEqual(result02[1], '-1')
Example #2
0
 def testSplitStringinTwo(self):
     """
     Test splitting a string into two parts.

     Exercises ann.split_string_in_two with a "|"-separated value and
     with None; the None case is expected to yield '-1' for both parts.
     The helper is used to split columns of data in the summary.
     """
     result01 = ann.split_string_in_two("1|99")
     result02 = ann.split_string_in_two(None)
     # assertEqual replaces the deprecated assertEquals alias, which
     # no longer exists in Python 3.12+.
     self.assertEqual(result01[0], '1')
     self.assertEqual(result01[1], '99')
     self.assertEqual(result02[0], '-1')
     self.assertEqual(result02[1], '-1')
Example #3
0
def action(args):
    """
    Merge per-variant annotation files into one tab-delimited summary.

    Each file in ``args.infiles`` is matched, by the part of its base
    name after the first '.', against the file-type table selected by
    ``args.type`` ('SNP', 'INDEL' or 'PINDEL'). Rows from matching
    files are accumulated per variant key, then written to
    ``args.outfile`` sorted by chr, start, stop, Ref_Base and Var_Base.

    Attributes read from ``args``:
        infiles  -- one-element sequence holding the list of file names
        RefSeqs  -- optional tab-delimited file with a 'RefSeq' column
        type     -- 'SNP', 'INDEL' or 'PINDEL'
        outfile  -- writable file object for the summary
        strict   -- if true, exit with status 1 on an unrecognised file

    Raises:
        ValueError -- if ``args.type`` is not one of the supported types
        SystemExit -- in strict mode, when a file whose name does not
                      contain 'dropped' has an unrecognised file type
    """
    (infiles, ) = args.infiles

    # Map the part of each transcript id before the first '.' to the
    # full id, for every '/'-separated entry in the RefSeq column.
    RefSeqs = {}
    if args.RefSeqs:
        for row in csv.DictReader(args.RefSeqs, delimiter='\t'):
            if row['RefSeq']:
                for transcript in row['RefSeq'].split('/'):
                    RefSeqs[transcript.split('.')[0]] = transcript

    if args.type == 'PINDEL':
        # PINDEL output carries a reduced set of columns
        headers = ['Position'] + variant_headers[3:5] + [
            'Clinically_Flagged',
            'Variant_Type',
            'UW_Freq',
            'Gene',
            'p.',
            'c.',
            'Faves_Y/N',
            'Ref_Reads',
            'Var_Reads',
            'Allele_Frac',
            'Transcripts',
            'UW_Count',
        ]
    else:
        headers = ['Position'] + variant_headers[3:5] + [
            'Clinically_Flagged',
            'Variant_Type',
            'UW_Freq',
            'UW_DEC_p',
            'Filter',
            '1000g_ALL',
            'EVS_esp6500_ALL',
            'EXAC',
            'Gene',
            'p.',
            'c.',
            'Faves_Y/N',
            'Ref_Reads',
            'Var_Reads',
            'Allele_Frac',
            'Variant_Phred',
            'Cosmic',
            'CADD',
            'ClinVar',
            'Polyphen',
            'Sift',
            'Mutation_Taster',
            'Gerp',
            '1000g_AMR',
            '1000g_EUR',
            '1000g_SAS',
            '1000g_EAS',
            '1000g_AFR',
            'EVS_esp6500_AA',
            'EVS_esp6500_EU',
            'Transcripts',
            'Zygosity',
            'Segdup',
            'NCI60',
            'dbSNP_ID',
            'UW_Count',
            'GATK_Score',
            'ADA_Alter_Splice',
            'RF_Alter_Splice',
        ]

    # extrasaction='ignore' drops every key that is not an output column
    writer = csv.DictWriter(args.outfile,
                            fieldnames=headers,
                            quoting=csv.QUOTE_MINIMAL,
                            extrasaction='ignore',
                            delimiter='\t')

    writer.writeheader()

    # The file-type table depends only on args.type, so pick it once
    # and fail with a clear message instead of an unbound-name error.
    file_types = {
        'SNP': snp_file_types,
        'INDEL': indel_file_types,
        'PINDEL': pindel_file_types,
    }.get(args.type)
    if file_types is None:
        raise ValueError('unsupported variant type: %r' % (args.type,))

    # accumulate data from all input files for each variant
    output = defaultdict(dict)
    for fname in infiles:
        try:
            _, file_type = path.basename(fname).split('.', 1)
        except ValueError:
            # no '.' in the base name, so there is no file type to match
            continue

        try:
            #if the file type matches one we want,
            #header ids are output columns
            #var_key_ids are chrm:str:stp:ref:var
            header_ids, var_key_ids = file_types[file_type]
        except KeyError:
            # files with 'dropped' in the name are skipped silently
            if re.search('dropped', fname):
                continue
            # previously unreachable (it followed the `continue` above)
            log.warning('no match: %s' % fname)
            if args.strict:
                sys.exit(1)
            continue

        for var_key, data in map_headers(fname, header_ids, var_key_ids):
            if var_key in output:
                output[var_key] = merge_data(output[var_key], data)
            else:
                entry = output[var_key]
                entry.update(data)
                # `in` replaces dict.has_key, which Python 3 removed
                if 'Reads' in entry and 'Var_Reads' not in entry:
                    (entry['Ref_Reads'],
                     entry['Var_Reads'],
                     entry['Variant_Phred']) = get_reads(
                         data.get('Read_Headers'), data.get('Reads'))

    def sort_key(row):
        return [row[k]
                for k in ['chr', 'start', 'stop', 'Ref_Base', 'Var_Base']]

    # columns that fall back to -1 when missing or empty
    default_fields = (
        '1000g_ALL',
        '1000g_AMR',
        '1000g_SAS',
        '1000g_EAS',
        '1000g_AFR',
        '1000g_EUR',
        'UW_DEC_p',
    )
    # columns that keep only the text before the first ',' (or -1)
    first_item_fields = (
        'EXAC',
        'EVS_esp6500_ALL',
        'EVS_esp6500_AA',
        'EVS_esp6500_EU',
    )

    # write each row (with all data aggregated), modifying fields as necessary
    for data in sorted(output.values(), key=sort_key):
        variants = [data.get('var_type_2'), data.get('var_type_1')]
        data['Variant_Type'] = ','.join(filter(None, variants))
        data['Gene'], data['Transcripts'] = munge_gene_and_Transcripts(
            data, RefSeqs)
        data['c.'], data['p.'] = munge_transcript(data, RefSeqs)
        (data['Polyphen'],
         data['Sift'],
         data['Mutation_Taster'],
         data['Gerp']) = munge_ljb_scores(data)
        data['dbSNP_ID'] = data.get('rsid_1') or data.get('rsid_2')
        for field in default_fields:
            data[field] = data.get(field) or -1
        for field in first_item_fields:
            value = data.get(field)
            data[field] = value.split(',')[0] if value else -1
        #CADD is raw score, phred score. We only care about phred
        _, data['CADD'] = split_string_in_two(data.get('CADD'))
        data['ADA_Alter_Splice'], data['RF_Alter_Splice'] = \
            split_string_in_two(data.get('splicing'))
        data['UW_Freq'], data['UW_Count'] = split_string_in_two(
            data.get('UW_Freq_list'))
        data['Allele_Frac'] = get_allele_freq(data)
        writer.writerow(data)
Example #4
0
def action(args):
    """
    Merge per-variant annotation files into one tab-delimited summary.

    Each file in ``args.infiles`` is matched, by the part of its base
    name after the first '.', against the file-type table selected by
    ``args.type`` ('SNP', 'INDEL' or 'PINDEL'). Rows from matching
    files are accumulated per variant key, then written to
    ``args.outfile`` sorted by chr, start, stop, Ref_Base and Var_Base.

    Attributes read from ``args``:
        infiles  -- one-element sequence holding the list of file names
        RefSeqs  -- optional tab-delimited file with a 'RefSeq' column
        type     -- 'SNP', 'INDEL' or 'PINDEL'
        outfile  -- writable file object for the summary
        strict   -- if true, exit with status 1 on an unrecognised file

    Raises:
        ValueError -- if ``args.type`` is not one of the supported types
        SystemExit -- in strict mode, when a file whose name does not
                      contain 'dropped' has an unrecognised file type
    """
    (infiles, ) = args.infiles

    # Map the part of each transcript id before the first '.' to the
    # full id, for every '/'-separated entry in the RefSeq column.
    RefSeqs = {}
    if args.RefSeqs:
        for row in csv.DictReader(args.RefSeqs, delimiter='\t'):
            if row['RefSeq']:
                for transcript in row['RefSeq'].split('/'):
                    RefSeqs[transcript.split('.')[0]] = transcript

    if args.type == 'PINDEL':
        # PINDEL output carries a reduced set of columns
        headers = ['Position'] + variant_headers[3:5] + [
            'Clinically_Flagged',
            'Variant_Type',
            'UW_Freq',
            'Gene',
            'p.',
            'c.',
            'Faves_Y/N',
            'Ref_Reads',
            'Var_Reads',
            'Allele_Frac',
            'Transcripts',
            'UW_Count',
        ]
    else:
        headers = ['Position'] + variant_headers[3:5] + [
            'Clinically_Flagged',
            'Variant_Type',
            'UW_Freq',
            'UW_DEC_p',
            'Filter',
            '1000g_ALL',
            'EVS_esp6500_ALL',
            'EXAC',
            'Gene',
            'p.',
            'c.',
            'Faves_Y/N',
            'Ref_Reads',
            'Var_Reads',
            'Allele_Frac',
            'Variant_Phred',
            'Cosmic',
            'CADD',
            'ClinVar',
            'Polyphen',
            'Sift',
            'Mutation_Taster',
            'Gerp',
            '1000g_AMR',
            '1000g_EUR',
            '1000g_SAS',
            '1000g_EAS',
            '1000g_AFR',
            'EVS_esp6500_AA',
            'EVS_esp6500_EU',
            'Transcripts',
            'Zygosity',
            'Segdup',
            'NCI60',
            'dbSNP_ID',
            'UW_Count',
            'GATK_Score',
            'ADA_Alter_Splice',
            'RF_Alter_Splice',
        ]

    # extrasaction='ignore' drops every key that is not an output column
    writer = csv.DictWriter(args.outfile,
                            fieldnames=headers,
                            quoting=csv.QUOTE_MINIMAL,
                            extrasaction='ignore',
                            delimiter='\t')

    writer.writeheader()

    # The file-type table depends only on args.type, so pick it once
    # and fail with a clear message instead of an unbound-name error.
    file_types = {
        'SNP': snp_file_types,
        'INDEL': indel_file_types,
        'PINDEL': pindel_file_types,
    }.get(args.type)
    if file_types is None:
        raise ValueError('unsupported variant type: %r' % (args.type,))

    # accumulate data from all input files for each variant
    output = defaultdict(dict)
    for fname in infiles:
        try:
            _, file_type = path.basename(fname).split('.', 1)
        except ValueError:
            # no '.' in the base name, so there is no file type to match
            continue

        try:
            #if the file type matches one we want,
            #header ids are output columns
            #var_key_ids are chrm:str:stp:ref:var
            header_ids, var_key_ids = file_types[file_type]
        except KeyError:
            # files with 'dropped' in the name are skipped silently
            if re.search('dropped', fname):
                continue
            # previously unreachable (it followed the `continue` above)
            log.warning('no match: %s' % fname)
            if args.strict:
                sys.exit(1)
            continue

        for var_key, data in map_headers(fname, header_ids, var_key_ids):
            if var_key in output:
                output[var_key] = merge_data(output[var_key], data)
            else:
                entry = output[var_key]
                entry.update(data)
                # `in` replaces dict.has_key, which Python 3 removed
                if 'Reads' in entry and 'Var_Reads' not in entry:
                    (entry['Ref_Reads'],
                     entry['Var_Reads'],
                     entry['Variant_Phred']) = get_reads(
                         data.get('Read_Headers'), data.get('Reads'))

    def sort_key(row):
        return [row[k]
                for k in ['chr', 'start', 'stop', 'Ref_Base', 'Var_Base']]

    # columns that fall back to -1 when missing or empty
    default_fields = (
        '1000g_ALL',
        '1000g_AMR',
        '1000g_SAS',
        '1000g_EAS',
        '1000g_AFR',
        '1000g_EUR',
        'UW_DEC_p',
    )
    # columns that keep only the text before the first ',' (or -1)
    first_item_fields = (
        'EXAC',
        'EVS_esp6500_ALL',
        'EVS_esp6500_AA',
        'EVS_esp6500_EU',
    )

    # write each row (with all data aggregated), modifying fields as necessary
    for data in sorted(output.values(), key=sort_key):
        variants = [data.get('var_type_2'), data.get('var_type_1')]
        data['Variant_Type'] = ','.join(filter(None, variants))
        data['Gene'], data['Transcripts'] = munge_gene_and_Transcripts(
            data, RefSeqs)
        data['c.'], data['p.'] = munge_transcript(data, RefSeqs)
        (data['Polyphen'],
         data['Sift'],
         data['Mutation_Taster'],
         data['Gerp']) = munge_ljb_scores(data)
        data['dbSNP_ID'] = data.get('rsid_1') or data.get('rsid_2')
        for field in default_fields:
            data[field] = data.get(field) or -1
        for field in first_item_fields:
            value = data.get(field)
            data[field] = value.split(',')[0] if value else -1
        #CADD is raw score, phred score. We only care about phred
        _, data['CADD'] = split_string_in_two(data.get('CADD'))
        data['ADA_Alter_Splice'], data['RF_Alter_Splice'] = \
            split_string_in_two(data.get('splicing'))
        data['UW_Freq'], data['UW_Count'] = split_string_in_two(
            data.get('UW_Freq_list'))
        data['Allele_Frac'] = get_allele_freq(data)
        writer.writerow(data)
Example #5
0
def action(args):
    """
    Combine per-variant annotation files into one tab-delimited summary.

    Every file in ``args.infiles`` whose type (the part of its base
    name after the first '.') is a key of ``file_types`` is read with
    ``map_headers`` and its fields are merged per variant key. The
    merged rows are written to ``args.outfile`` sorted by chr, start
    and stop; the header line is written only after all input files
    have been processed.

    Attributes read from ``args``: infiles, RefSeqs, strict, outfile.
    In strict mode an unrecognised file type ends the program via
    ``sys.exit(1)``.
    """
    (infiles, ) = args.infiles

    # transcript id up to the first '.'  ->  full transcript id
    RefSeqs = {}
    if args.RefSeqs:
        for ref_row in csv.DictReader(args.RefSeqs, delimiter='\t'):
            refseq_field = ref_row['RefSeq']
            if not refseq_field:
                continue
            for transcript in refseq_field.split('/'):
                accession = transcript.split('.')[0]
                RefSeqs[accession] = transcript

    annotation_columns = [
        'Clinically_Flagged',
        'Variant_Type',
        'HiSeq_Freq',
        '1000g_ALL',
        'EVS_esp6500_ALL',
        'Gene',
        'p.',
        'c.',
        'Faves_Y/N',
        'Ref_Reads',
        'Var_Reads',
        'Allele_Freq',
        'Variant_Phred',
        'Cosmic',
        'CADD',
        'ClinVar',
        'Polyphen',
        'Sift',
        'Mutation_Taster',
        'Gerp',
        '1000g_AMR',
        '1000g_EUR',
        '1000g_ASN',
        '1000g_AFR',
        'EVS_esp6500_AA',
        'EVS_esp6500_EU',
        'Transcripts',
        'Zygosity',
        'Segdup',
        'NCI60',
        'dbSNP_ID',
        'HiSeq_Count',
        'MiSeq_Freq',
        'MiSeq_Count',
        'GATK_Score',
    ]
    headers = ['Position'] + variant_headers[3:5] + annotation_columns

    # gather the fields of every variant across all input files
    output = defaultdict(dict)
    for fname in infiles:
        _, dot, file_type = path.basename(fname).partition('.')
        if not dot:
            # base name has no '.', so there is no file type to look up
            continue

        try:
            # header ids are output columns,
            # var_key_ids are chrm:str:stp:ref:var
            header_ids, var_key_ids = file_types[file_type]
        except KeyError:
            # NOTE(review): the warning is emitted only when the name
            # contains 'dropped' -- confirm this is the intended case.
            if re.search('dropped', fname):
                log.warning('no match: %s' % fname)
            if args.strict:
                sys.exit(1)
            continue

        for var_key, fields in map_headers(fname, header_ids, var_key_ids):
            output[var_key].update(fields)

    writer = csv.DictWriter(args.outfile,
                            fieldnames=headers,
                            quoting=csv.QUOTE_MINIMAL,
                            extrasaction='ignore',
                            delimiter='\t')
    writer.writeheader()

    def sort_key(row):
        return [row[name] for name in ['chr', 'start', 'stop']]

    # columns replaced by -1 when missing or empty
    minus_one_defaults = (
        '1000g_ALL',
        '1000g_AMR',
        '1000g_ASN',
        '1000g_AFR',
        '1000g_EUR',
        'EVS_esp6500_ALL',
        'EVS_esp6500_AA',
        'EVS_esp6500_EU',
    )

    # emit one aggregated row per variant, adjusting fields on the way
    for data in sorted(output.values(), key=sort_key):
        # prefer var_type_2 unless it is missing or blank
        if data.get('var_type_2', '').strip():
            data['Variant_Type'] = data.get('var_type_2')
        else:
            data['Variant_Type'] = data.get('var_type_1')

        gene, transcripts = munge_gene_and_Transcripts(data, RefSeqs)
        data['Gene'] = gene
        data['Transcripts'] = transcripts

        coding, protein = munge_transcript(data, RefSeqs)
        data['c.'] = coding
        data['p.'] = protein

        data['dbSNP_ID'] = data.get('rsid_1') or data.get('rsid_2')

        for column in minus_one_defaults:
            data[column] = data.get(column) or -1

        # CADD holds a raw score and a phred score; only phred is kept
        data['CADD'] = split_string_in_two(data.get('CADD'))[1]

        ref_reads, var_reads, variant_phred = get_reads(data.get('Reads'))
        data['Ref_Reads'] = ref_reads
        data['Var_Reads'] = var_reads
        data['Variant_Phred'] = variant_phred

        miseq_freq, miseq_count = split_string_in_two(
            data.get('Mi_Freq_list'))
        data['MiSeq_Freq'] = miseq_freq
        data['MiSeq_Count'] = miseq_count

        hiseq_freq, hiseq_count = split_string_in_two(
            data.get('Hi_Freq_list'))
        data['HiSeq_Freq'] = hiseq_freq
        data['HiSeq_Count'] = hiseq_count

        data['Allele_Freq'] = get_allele_freq(data)
        writer.writerow(data)