Example #1
def main(args):
    reader = hgsc_vcf.Reader(args.INPUT)

    ##
    # The field names and processing functions must be established based on the command options
    fields = []
    fields += STANDARD_FIELDS
    process_functions_sample = VTE_CORE_SAMPLE
    process_functions_header = VTE_CORE_HEADER
    process_functions_record = VTE_CORE_RECORD
    for _switch, _fields, _sfuns, _hfuns, _rfuns in [
        (args.vep, VEP_FIELDS, VEP_FUNS_SAMPLE, VEP_FUNS_HEADER,
         VEP_FUNS_RECORD),
        (args.cosmic, COSMIC_FIELDS, COSMIC_FUNS_SAMPLE, COSMIC_FUNS_HEADER,
         COSMIC_FUNS_RECORD),
        (args.canannot, CAN_FIELDS, CAN_FUNS_SAMPLE, CAN_FUNS_HEADER,
         CAN_FUNS_RECORD),
        (args.wheeljack, WJ_FIELDS, WJ_FUNS_SAMPLE, WJ_FUNS_HEADER,
         WJ_FUNS_RECORD),
        (args.gatk, GATK_FIELDS, GATK_FUNS_SAMPLE, GATK_FUNS_HEADER,
         GATK_FUNS_RECORD)
    ]:
        if _switch:
            fields += _fields
            process_functions_sample += _sfuns
            process_functions_header += _hfuns
            process_functions_record += _rfuns
    with bz2.BZ2File(args.OUTPUT, 'w') as fo:
        writer = csv.DictWriter(fo,
                                delimiter='\t',
                                fieldnames=fields,
                                extrasaction='ignore')
        writer.writeheader()

        _output_header = {}
        for _f in process_functions_header:
            _f(reader.header, _output_header)
        if args.subject:
            _output_header['SUBJECT'] = args.subject

        for record in reader:
            if len(record['REF']) > 50:
                continue  # skip records with very long REF alleles
            if 'NONE' in record['INFO']['VTE'][0]:
                continue  # skip records with no variant type exclusivity call
            output = {}
            output.update(_output_header)
            for _f in process_functions_record:
                _f(record, output)
            simplify_allele(output)
            for sample in record['SAMPLES']:
                sample_output = {}
                sample_output.update(output)
                for _f in process_functions_sample:
                    _f(record, sample, sample_output)
                for k, v in sample_output.items():
                    if not v:
                        sample_output[k] = 'NA'
                writer.writerow(sample_output)
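
A minimal argparse setup consistent with the attributes this main() reads (args.INPUT, args.OUTPUT, args.vep, and so on). The flag spellings and help text here are assumptions, not the original script's definitions:

import argparse

# positional and optional arguments inferred from the args.* accesses above
parser = argparse.ArgumentParser()
parser.add_argument('INPUT', type=argparse.FileType('r'), help='input VCF')
parser.add_argument('OUTPUT', help='path for the bz2-compressed TSV output')
parser.add_argument('--vep', action='store_true', help='include VEP fields')
parser.add_argument('--cosmic', action='store_true', help='include COSMIC fields')
parser.add_argument('--canannot', action='store_true', help='include CanAnnot fields')
parser.add_argument('--wheeljack', action='store_true', help='include Wheeljack fields')
parser.add_argument('--gatk', action='store_true', help='include GATK fields')
parser.add_argument('--subject', help='subject id stamped on every output row')
main(parser.parse_args())
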
Example #2
def main(args):
    reader = hgsc_vcf.Reader(args.INFILE)
    header = reader.header
    writer = hgsc_vcf.Writer(args.OUTFILE, header)
    writer.header.add_header('##COMMAND=<ID=filter-alts,ARGS="%s">' % re.escape(' '.join(sys.argv)))
    writer.write_header()
    config = json.load(args.CONFIG)
    process_vcf(reader, writer, config)
    logger.info("Done")
Example #3
def _merge_contig(self, mergefiles):
    # k-way merge of the per-contig fragment files: keep the live readers
    # sorted by their next position, then drain the lowest reader until it
    # passes the runner-up
    holder = [hgsc_vcf.Reader(open(f, 'r')) for f in mergefiles]
    for r in holder:
        if r.peek() is None:
            r.take()  # will iterate if it can
    while len(holder) > 1:
        logger.info("Sorting")
        holder = sorted([r for r in holder if r.peek() is not None],
                        cmp=self.position_compare)
        if len(holder) < 2:
            break
        h0 = holder[0]
        h1 = holder[1]
        while h0.peek() is not None and self.position_compare(h0, h1) < 1:
            self.writer.write_record(h0.take())
    # only one reader still has records; drain it
    last_r = holder[0]
    while last_r.peek() is not None:
        self.writer.write_record(last_r.take())
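
The cmp-style comparator is not shown in this snippet; here is a sketch of what _merge_contig assumes about it. This hypothetical position_compare is an assumption, with Python 2 cmp semantics over the peeked records of two readers:

def position_compare(self, a, b):
    # compare two readers by the POS of their buffered records; the files
    # merged here each hold a single contig, so CHROM is assumed constant
    return cmp(a.peek()['POS'], b.peek()['POS'])
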
Example #4
def __init__(self, fobj):
    self.reader = hgsc_vcf.Reader(fobj)
    self.caller = fobj.name
    # get the normal and primary sample ids
    sampleMapping = {
        l.fields.get('ID'): l.fields.get('SampleTCGABarcode')
        for l in self.reader.header.get_headers('SAMPLE')
    }
    if 'PRIMARY' not in sampleMapping and 'METASTATIC' in sampleMapping:
        sampleMapping['PRIMARY'] = sampleMapping['METASTATIC']
    elif 'PRIMARY' not in sampleMapping and 'RECURRANCE' in sampleMapping:
        sampleMapping['PRIMARY'] = sampleMapping['RECURRANCE']
    logger.info("Sample mapping for %s: %s", fobj.name, sampleMapping)
    self.normal = sampleMapping['NORMAL']
    self.primary = sampleMapping['PRIMARY']
    self._next = None
    # this first call to take() returns None but fast-forwards the reader
    # to the next position
    self.take()
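
The peek()/take() contract this __init__ relies on, reconstructed from the comments above rather than from hgsc_vcf itself; a buffered-iterator sketch under that assumption:

def peek(self):
    # inspect the buffered record without advancing
    return self._next

def take(self):
    # return the buffered record (None on the very first call) and
    # fast-forward the underlying reader by one record
    current = self._next
    try:
        self._next = self.reader.next()  # Python 2 iterator protocol
    except StopIteration:
        self._next = None
    return current
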
Example #5
def main(args):
    # get the seqdict
    seqdict = SeqDict(args.seqdict)

    # split the file
    splitter = FileSplitter(hgsc_vcf.Reader(open(args.input, 'r')), seqdict)
    splitfiles = splitter.split()
    splitter.reader.fobj.close()

    # write the new files
    merger = FileMerger(
        hgsc_vcf.Writer(open(args.output, 'w'), splitter.reader.header),
        splitfiles, seqdict)
    merger.writer.write_header()
    merger.merge()
    merger.writer.fobj.close()

    shutil.rmtree(splitter.tmpdir)

    logger.info("Done")
Example #6
def main(args):
    if args.level == 'all':
        convert = SORTED_TIERS
        filter = []
    elif args.level in ['1', '2', '3', '4', '5']:
        cut = int(args.level) - 1
        convert = SORTED_TIERS[:cut]
        filter = SORTED_TIERS[cut:]
    else:
        raise ValueError("%s is not a valid level" % args.level)

    reader = hgsc_vcf.Reader(args.INPUT)
    header = reader.header
    header.add_header(
        '##INFO=<ID=OF,Number=1,Type=String,Description="original tiering call for this variant in this sample">'
    )
    header.add_header('##COMMAND=<ID=filter_muse.py,Params="%s">' %
                      ' '.join(sys.argv))
    writer = hgsc_vcf.Writer(args.OUTPUT, header)
    writer.write_header()
    for record in reader:
        convert_record(record, convert, filter)
        writer.write_record(record)
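
The slicing above, worked through on a hypothetical five-tier list (the real SORTED_TIERS labels are not shown in this snippet):

SORTED_TIERS = ['Tier1', 'Tier2', 'Tier3', 'Tier4', 'Tier5']  # assumed labels
# args.level == '3'   ->  cut = 2
# convert = SORTED_TIERS[:2]  ->  ['Tier1', 'Tier2']
# filter  = SORTED_TIERS[2:]  ->  ['Tier3', 'Tier4', 'Tier5']
# args.level == 'all' puts every tier in convert and filters nothing
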
Example #7
parser.add_argument('OUTPUT', type=argparse.FileType('w'), help='output file')

args = parser.parse_args()

if args.level == 'all':
    convert = SORTED_TIERS
    filter = []

elif args.level in ['1', '2', '3', '4', '5']:
    cut = int(args.level) - 1
    convert = SORTED_TIERS[:cut]
    filter = SORTED_TIERS[cut:]

else:
    raise ValueError("%s is not a valid level" % args.level)

reader = hgsc_vcf.Reader(args.INPUT)
header = reader.header

header.add_header(
    '##INFO=<ID=OF,Number=1,Type=String,Description="original tiering call for this variant in this sample">'
)
header.add_header('##COMMAND=<ID=filter_muse.py,Params="%s">' %
                  ' '.join(sys.argv))

writer = hgsc_vcf.Writer(args.OUTPUT, header)
writer.write_header()
for record in reader:
    convert_record(record, convert, filter)
    writer.write_record(record)
Example #8
def main(args):
    reader = hgsc_vcf.Reader(args.INFILE)
    writer = csv.DictWriter(
        args.OUTFILE,
        delimiter='\t',
        fieldnames=[
            'CHROM', 'INPOS', 'INREF', 'INALT', 'VALPOS', 'VALREF', 'VALALT',
            'SUM_TUMOR_REF', 'SUM_TUMOR_ALT', 'SUM_TUMOR_DP', 'SUM_NORMAL_REF',
            'SUM_NORMAL_ALT', 'SUM_NORMAL_DP', 'SUM_TUMOR_VAL_REF',
            'SUM_TUMOR_VAL_ALT', 'SUM_TUMOR_VAL_DP', 'SUM_NORMAL_VAL_REF',
            'SUM_NORMAL_VAL_ALT', 'SUM_NORMAL_VAL_DP', 'VALKEY'
        ])
    writer.writeheader()

    def batch(reader):
        # group consecutive records that share the same INFO OC key into a
        # single validation batch
        first = reader.next()
        valbatch = new_valbatch(first)
        for record in reader:
            if not valbatch.oc_match(record['INFO']['OC'][0]):
                yield valbatch
                valbatch = new_valbatch(record)
            else:
                valbatch.add_record(record)
        yield valbatch

    for b in batch(reader):
        b.clean()
        for r in b.get_records():
            tumor_val_samples = get_samples(r, 'TUMOR_VALIDATION')
            tumor_samples = get_samples(r, r'^TUMOR$')
            normal_val_samples = get_samples(r, 'NORMAL_VALIDATION')
            normal_samples = get_samples(r, r'^NORMAL$')
            o = {
                'CHROM': r['CHROM'],
                'INPOS': b.pos,
                'INREF': b.ref,
                'INALT': b.alt,
                'VALPOS': r['POS'],
                'VALREF': r['REF'],
                'VALALT': ','.join(r['ALT']),
                'SUM_TUMOR_REF': str(get_allele_count(r, 0, tumor_samples)),
                'SUM_TUMOR_ALT': str(get_allele_count(r, 1, tumor_samples)),
                'SUM_TUMOR_DP': str(sum(int(s['DP'][0]) for s in tumor_samples.values())),
                'SUM_NORMAL_REF': str(get_allele_count(r, 0, normal_samples)),
                'SUM_NORMAL_ALT': str(get_allele_count(r, 1, normal_samples)),
                'SUM_NORMAL_DP': str(sum(int(s['DP'][0]) for s in normal_samples.values())),
                'SUM_TUMOR_VAL_REF': str(get_allele_count(r, 0, tumor_val_samples)),
                'SUM_TUMOR_VAL_ALT': str(get_allele_count(r, 1, tumor_val_samples)),
                'SUM_TUMOR_VAL_DP': str(sum(int(s['DP'][0]) for s in tumor_val_samples.values())),
                'SUM_NORMAL_VAL_REF': str(get_allele_count(r, 0, normal_val_samples)),
                'SUM_NORMAL_VAL_ALT': str(get_allele_count(r, 1, normal_val_samples)),
                'SUM_NORMAL_VAL_DP': str(sum(int(s['DP'][0]) for s in normal_val_samples.values())),
                'VALKEY': extract_key(r)
            }
            writer.writerow(o)
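
get_samples is not defined in this snippet; the following hypothetical version is consistent with its call sites (regex selection over a sample-id to FORMAT-dict mapping). The real helper may instead match on the TYPE recorded in the ##SAMPLE headers:

import re

def get_samples(record, pattern):
    # select the samples whose id matches the given regular expression
    return {sid: fmt for sid, fmt in record['SAMPLES'].items()
            if re.search(pattern, sid)}
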
Example #9
def main(args):
    vcf_reader = hgsc_vcf.Reader(args.INPUTVCF)
    vcf_container_cosmic = VCFContainer(hgsc_vcf.Reader(args.COSMICVCF),
                                        args.buffer)
    # read in the dbsnp data
    # connect to the reference file
    ifasta = IndexedFastaSequenceFile(File(args.reference))
    add_command_to_reader(
        vcf_reader, '##COMMAND=<ID=annotate_vcf_cosmic.py,Params="%s">' %
        " ".join(sys.argv))
    # add the COSMIC header info
    add_info_to_reader(
        vcf_reader, '##INFO=<ID=COSMIC,Number=.,Type=String,Description="' +
        'COSMIC info, can be one of NONE, BUFFER, CODON, SITE.  ' +
        'All but NONE are accompanied by AA|CDS|CNT.  BUFFER indicates the COSMIC site is within %(buffer)sbp of the position.  example: '
        +
        'SITE|p.P228fs*227|c.682_683insT|3 or NONE.  VCF file used was %(cosmicvcf)s.">\n'
        % {
            'buffer': str(args.buffer),
            'cosmicvcf': args.COSMICVCF
        })

    # add the context
    add_info_to_reader(
        vcf_reader,
        "##INFO=<ID=CONTEXT,Number=1,Type=String,Description=\"Base context around variant. [POS - 5, POS + len(REF) + 4]\">\n"
    )
    # add the validation status info
    add_info_to_reader(
        vcf_reader,
        "##INFO=<ID=DBVS,Number=1,Type=String,Description=\"dbSNP validation status, | separated\">\n"
    )
    # get the format for the vep annotations
    _vep_format = get_csq_format([
        h for h in vcf_reader.header.get_headers('INFO', 'CSQ')
    ][0].fields['Description'])

    vcf_writer = hgsc_vcf.Writer(args.OUTPUTVCF, vcf_reader.header)
    vcf_writer.write_header()
    for record in vcf_reader:
        try:
            ## check that the position is annotated with CSQ; if not, the record is written through unchanged
            if 'CSQ' in record['INFO']:
                # matches are intersecting hits in the VCF

                _matches = vcf_container_cosmic.intersect(record)
                _csq_l = [
                    dict(zip(_vep_format, _csq.split('|')))
                    for _csq in record['INFO'].get('CSQ')
                ]
                _info = generate_cosmic_info(_matches, _csq_l, record)
                record['INFO']['COSMIC'] = _info
                # extract the dbsnp validation rsids
                _existing_ids = [
                    _id for _csq in _csq_l
                    for _id in _csq['Existing_variation'].split('&')
                ]
                record['INFO']['DBVS'] = [
                    generate_valstatus_info(_existing_ids, args.DBSNPVAL)
                ]
            record['INFO']['CONTEXT'] = [
                str(
                    String(
                        ifasta.getSubsequenceAt(
                            record['CHROM'], record['POS'] - 5, record['POS'] +
                            len(record['REF']) + 4).getBases()))
            ]

        except Exception:
            logger.exception("Error in record modification")
        vcf_writer.write_record(record)
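
Note that IndexedFastaSequenceFile, File and String are Java classes, so this snippet appears to run under Jython against htsjdk. The CONTEXT window it extracts, worked through for a hypothetical record (1-based, inclusive coordinates, as getSubsequenceAt expects):

# REF = 'AT' at POS = 100
# window = [POS - 5, POS + len(REF) + 4] = [95, 106]
#   95..99    five bases of left flank
#   100..101  the reference allele itself
#   102..106  five bases of right flank
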
Example #10
def main(args):
    if os.path.isfile(args.MODEL):
        with open(args.MODEL, 'r') as fi:
            _model = json.load(fi)
    else:
        _model = json.loads(args.MODEL)

    if os.path.isfile(args.TYPEMAP):
        with open(args.TYPEMAP, 'r') as fi:
            _map = json.load(fi)
    else:
        _map = json.loads(args.TYPEMAP)
    logger.info("Input map: %s", _map)
    _config = json.load(args.CONFIG)

    vcf = hgsc_vcf.Reader(args.INPUT)
    
    # reduce the map to just the samples used in this study
    _map = {k:v for k, v in _map.items() if k in vcf.header.samples}
    logger.info("Revised map: %s", _map)
    logger.info("Direct Mapping")
    sample_model_direct_map = make_sample_direct_map(_map, _model)
    logger.info(sample_model_direct_map)
    sample_type_direct_mapping = set([v for v in get_samples_from_model_map(sample_model_direct_map)])
    logger.info(sample_type_direct_mapping)
    sample_model_regex_map = make_sample_map({k:v for k, v in _map.items() if k not in [s for s, m in sample_type_direct_mapping]}, _model)
    logger.info(sample_model_regex_map)
    # check that there are no duplicates
    sample_type_regex_mapping = set([v for v in get_samples_from_model_map(sample_model_regex_map)])
    logger.info("Regex Mapping")
    logger.info(sample_type_regex_mapping)
    # take the mappings that are direct first, then the ones that are satisfied via regex
    direct_map_samples = [s for s, k in sample_type_direct_mapping]
    regex_only_mappings = [(s, k) for s, k in sample_type_regex_mapping if s not in direct_map_samples]
    sample_type_mapping = list(sample_type_direct_mapping) + list(set(regex_only_mappings))
    sample_model_map = merge_map(sample_model_direct_map, sample_model_regex_map)
    logger.info("Final Mapping")
    logger.info(sample_model_map)
    logger.info(sample_type_mapping)
    _used_samples = [s for s in sample_type_mapping]
    if len(_used_samples) != len(set(_used_samples)):
        raise ValueError("You have samples that are listed in two branches, indicates a role collision: %s", _used_samples)
    if len(_used_samples) != len(vcf.header.samples):
        logger.error("Used samples: %s", _used_samples)
        logger.error("Header samples: %s", vcf.header.samples)
        _us = [v[0] for v in _used_samples]
        for s in vcf.header.samples:
            if s not in _us:
                logger.error("No match for %s",s)
        raise ValueError("You have not mapped all samples in the vcf, check that the roles for each sample in the TYPEMAP are used in the MODELMAP")
        
    
    # build the _sample_affinity_f
    _sample_affinity_f = build_sample_affinity_function(_config)
    _record_f = build_record_mod_function(_config)
    
    writer = hgsc_vcf.Writer(args.OUTPUT, vcf.header)
    writer.header.add_header('##INFO=<ID=VTE,Number=1,Type=String,Description="Variant type exclusivity based on the input models and mappings.  If multiple types are detected they are separated by |.">')
    writer.header.add_header('##FORMAT=<ID=VTES,Number=1,Type=Integer,Description="Variant type exclusivity score">')
    writer.header.add_header('##COMMAND=<ID=vartype-exclusivity,ARGS="%s">' % re.escape(' '.join(sys.argv)))
    header_samples = {s['ID']: s for s in writer.header.get_headers('SAMPLE')}
    for s, t in sample_type_mapping:
        if s in header_samples:
            header_samples[s]['TYPE'] = t
            header_samples[s]['ROLE'] = _map[s]
        else:
            writer.header.add_header('##SAMPLE=<ID=%s,TYPE=%s,ROLE=%s>' % (s, t, _map[s]))
            header_samples = {s['ID']: s for s in writer.header.get_headers('SAMPLE')}
    if args.subject:
        subject = args.subject
    else:
        subject = base64.urlsafe_b64encode(hashlib.md5("".join(sorted(header_samples.keys()))).digest())
    if len([h for h in writer.header.get_headers('SUBJECT')]) < 1:
        writer.header.add_header('##SUBJECT=<ID="%s">' % subject.replace('=', ''))
    writer.write_header()
    for record in vcf:
        for b_record in hgsc_vcf.select_allele(record, lambda x: [hgsc_vcf.best_alt_index(x)]):
            b_record['INFO']['VTE'] = [vartype_exclusivity(sample_model_map, b_record, _sample_affinity_f) + _record_f(b_record)]
            writer.write_record(b_record)
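
The anonymous subject id above, in isolation; the sample ids here are made up, everything else mirrors the code (Python 2, where md5 accepts a str):

import base64, hashlib

sample_ids = ['TCGA-AA-0001-01', 'TCGA-AA-0001-10']  # hypothetical ids
digest = hashlib.md5("".join(sorted(sample_ids))).digest()
subject = base64.urlsafe_b64encode(digest).replace('=', '')
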