Example 1
0
 def testMapHeaders(self):
     """
     map_headers yields (position, data) pairs for every row of the
     control GATK variant_function file, with data keyed by header name.
     """
     fname = path.join(summary_testfiles, '{}_gatk.variant_function').format(control)
     # column index -> output header name for this file type
     header_ids = {0: 'var_type_1',
                   1: 'gene',
                   7: 'zygosity',
                   12: 'rsid_1',
                   8: 'GATK_Score'}
     # columns that together identify the variant
     variant_columns = [2, 3, 4, 5, 6]
     rows = list(summary.map_headers(fname, header_ids, variant_columns))
     self.assertEqual(len(rows), 1713)
     expected_keys = set(header_ids.values())
     # every output row must carry all of the requested header keys
     for _, row_data in rows:
         self.assertTrue(expected_keys.issubset(set(row_data.keys())))
Example 2
0
def action(args):
    """
    Aggregate annotation data from all input files and write one combined
    tab-delimited summary row per variant to args.outfile.
    """

    (infiles, ) = args.infiles

    # output column order for the summary table
    headers = ['Position'] + variant_headers[3:5] + [
        'Gene',
        'dbSNP_ID',
        'Variant_Type',
        'Transcripts',
        'Clinically_Flagged',
        'NCI60',
        'Cosmic',
        'Segdup',
        'Polyphen',
        'Sift',
        'Mutation_Taster',
        'Gerp',
        '1000g_ALL',
        'EVS_esp6500_ALL',
        '1000g_AMR',
        'EVS_esp6500_AA',
        '1000g_EUR',
        'EVS_esp6500_EU',
        '1000g_ASN',
        '1000g_AFR']

    # accumulate data from all input files for each variant; rows for the
    # same variant key are merged across files
    output = defaultdict(dict)
    for infile in infiles:
        _, file_type = path.basename(infile).split('.', 1)
        try:
            header_ids, var_key_ids = file_types[file_type]
        except KeyError:
            # unknown file type: warn, and abort entirely in strict mode
            log.warning('no match: %s' % infile)
            if args.strict:
                sys.exit(1)
            continue

        for var_key, row in map_headers(infile, header_ids, var_key_ids):
            output[var_key].update(row)

    writer = csv.DictWriter(args.outfile,
                            fieldnames=headers,
                            quoting=csv.QUOTE_MINIMAL,
                            extrasaction='ignore',
                            delimiter='\t')
    writer.writeheader()

    def sort_key(row):
        # order rows by chromosome, then start/stop coordinates
        return [row[k] for k in ('chr', 'start', 'stop')]

    # write each row (with all data aggregated), modifying fields as necessary
    for data in sorted(output.values(), key=sort_key):
        # prefer the secondary variant-type annotation when it is non-blank
        data['Variant_Type'] = data.get('var_type_2') if data.get('var_type_2', '').strip() else data.get('var_type_1')
        data['Gene'], data['Transcripts'] = munge_gene_and_Transcripts(data)
        data['dbSNP_ID'] = data.get('rsid_1') or data.get('rsid_2')
        # population-frequency columns default to -1 when missing or falsy
        for freq_col in ('1000g_ALL', '1000g_AMR', '1000g_ASN', '1000g_AFR',
                         '1000g_EUR', 'EVS_esp6500_ALL', 'EVS_esp6500_AA',
                         'EVS_esp6500_EU'):
            data[freq_col] = data.get(freq_col) or -1
        writer.writerow(data)