def main(args):
    reader = hgsc_vcf.Reader(args.INPUT)
    # The field names and processing functions must be established based on the command options
    fields = []
    fields += STANDARD_FIELDS
    process_functions_sample = VTE_CORE_SAMPLE
    process_functions_header = VTE_CORE_HEADER
    process_functions_record = VTE_CORE_RECORD
    # each optional annotation source contributes its own columns and hooks
    for _switch, _fields, _sfuns, _hfuns, _rfuns in [
            (args.vep, VEP_FIELDS, VEP_FUNS_SAMPLE, VEP_FUNS_HEADER, VEP_FUNS_RECORD),
            (args.cosmic, COSMIC_FIELDS, COSMIC_FUNS_SAMPLE, COSMIC_FUNS_HEADER, COSMIC_FUNS_RECORD),
            (args.canannot, CAN_FIELDS, CAN_FUNS_SAMPLE, CAN_FUNS_HEADER, CAN_FUNS_RECORD),
            (args.wheeljack, WJ_FIELDS, WG_FUNS_SAMPLE, WJ_FUNS_HEADER, WJ_FUNS_RECORD),
            (args.gatk, GATK_FIELDS, GATK_FUNS_SAMPLE, GATK_FUNS_HEADER, GATK_FUNS_RECORD)]:
        if _switch:
            fields += _fields
            process_functions_sample += _sfuns
            process_functions_header += _hfuns
            process_functions_record += _rfuns
    with bz2.BZ2File(args.OUTPUT, 'w') as fo:
        writer = csv.DictWriter(fo, delimiter='\t', fieldnames=fields, extrasaction='ignore')
        writer.writeheader()
        _output_header = {}
        for _f in process_functions_header:
            _f(reader.header, _output_header)
        if args.subject:
            _output_header['SUBJECT'] = args.subject
        for record in reader:
            # skip very long REF alleles and records whose VTE call is NONE
            if len(record['REF']) > 50:
                continue
            if 'NONE' in record['INFO']['VTE'][0]:
                continue
            output = {}
            output.update(_output_header)
            for _f in process_functions_record:
                _f(record, output)
            simplify_allele(output)
            # one output row per sample in the record
            for sample in record['SAMPLES']:
                sample_output = {}
                sample_output.update(output)
                for _f in process_functions_sample:
                    _f(record, sample, sample_output)
                for k, v in sample_output.items():
                    if not v:
                        sample_output[k] = 'NA'
                writer.writerow(sample_output)
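# The hooks composed above share a simple convention, visible from the calls in
# main(): header hooks take (header, output), record hooks take (record, output),
# and sample hooks take (record, sample, sample_output). A minimal sketch of a
# record-level hook (the function and list names here are illustrative, not from
# the source):
def _example_vte_record(record, output):
    # copy the VTE INFO annotation into its output column
    output['VTE'] = record['INFO']['VTE'][0]

EXAMPLE_RECORD_FUNS = [_example_vte_record]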
def main(args):
    reader = hgsc_vcf.Reader(args.INFILE)
    header = reader.header
    writer = hgsc_vcf.Writer(args.OUTFILE, header)
    writer.header.add_header('##COMMAND=<ID=filter-alts,ARGS="%s">' % re.escape(' '.join(sys.argv)))
    writer.write_header()
    config = json.load(args.CONFIG)
    process_vcf(reader, writer, config)
    logger.info("Done")
def _merge_contig(self, mergefiles):
    holder = [hgsc_vcf.Reader(open(f, 'r')) for f in mergefiles]
    for r in holder:
        if r.peek() is None:
            r.take()  # will iterate if it can
    while len(holder) > 1:
        logger.info("Sorting")
        holder = sorted([r for r in holder if r.peek() is not None], cmp=self.position_compare)
        if len(holder) < 2:
            break
        h0 = holder[0]
        h1 = holder[1]
        # write from the lowest-positioned reader until it passes the runner-up
        while h0.peek() is not None and self.position_compare(h0, h1) < 1:
            self.writer.write_record(h0.take())
    # drain the last remaining reader
    last_r = holder[0]
    while last_r.peek() is not None:
        self.writer.write_record(last_r.take())
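# A minimal sketch of the comparator assumed above (position_compare is
# referenced but not shown here). Since _merge_contig merges files from a single
# contig, it plausibly orders readers by the POS of their buffered record;
# peek() returning a record dict is an assumption based on the surrounding code.
def position_compare(self, a, b):
    return cmp(a.peek()['POS'], b.peek()['POS'])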
def __init__(self, fobj):
    self.reader = hgsc_vcf.Reader(fobj)
    self.caller = fobj.name
    # get the normal and primary sample ids, falling back to the metastatic or
    # recurrence barcode when no primary sample is present
    sampleMapping = {l.fields.get('ID'): l.fields.get('SampleTCGABarcode')
                     for l in self.reader.header.get_headers('SAMPLE')}
    if 'PRIMARY' not in sampleMapping and 'METASTATIC' in sampleMapping:
        sampleMapping['PRIMARY'] = sampleMapping['METASTATIC']
    elif 'PRIMARY' not in sampleMapping and 'RECURRANCE' in sampleMapping:
        sampleMapping['PRIMARY'] = sampleMapping['RECURRANCE']
    logger.info("Sample mapping for %s: %s", fobj.name, sampleMapping)
    self.normal = sampleMapping['NORMAL']
    self.primary = sampleMapping['PRIMARY']
    self._next = None
    # the first call to take() returns None but fast-forwards the reader to the
    # next position
    self.take()
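# A minimal sketch of the one-record lookahead implied by _next and take() above
# (and by the peek()/take() calls in _merge_contig); the exact buffering and
# StopIteration handling are assumptions:
def peek(self):
    return self._next

def take(self):
    current = self._next
    try:
        self._next = self.reader.next()
    except StopIteration:
        self._next = None
    return current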
def main(args):
    # get the seqdict
    seqdict = SeqDict(args.seqdict)
    # split the file into temporary per-contig files
    splitter = FileSplitter(hgsc_vcf.Reader(open(args.input, 'r')), seqdict)
    splitfiles = splitter.split()
    splitter.reader.fobj.close()
    # merge the split files back together in reference order
    merger = FileMerger(hgsc_vcf.Writer(open(args.output, 'w'), splitter.reader.header),
                        splitfiles, seqdict)
    merger.writer.write_header()
    merger.merge()
    merger.writer.fobj.close()
    shutil.rmtree(splitter.tmpdir)
    logger.info("Done")
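# A hedged sketch of SeqDict (used above and referenced by the merge step): it
# plausibly reads a Picard-style .dict file (SAM @SQ header lines) and records
# the contig order so split files can be merged in reference order. The file
# format and the index() method are assumptions.
class SeqDict(object):
    def __init__(self, path):
        self.contigs = []
        with open(path, 'r') as fi:
            for line in fi:
                if line.startswith('@SQ'):
                    # e.g. "@SQ\tSN:chr1\tLN:249250621" -> {'SN': 'chr1', 'LN': '249250621'}
                    fields = dict(f.split(':', 1) for f in line.rstrip('\n').split('\t')[1:])
                    self.contigs.append(fields['SN'])

    def index(self, contig):
        return self.contigs.index(contig)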
def main(args):
    if args.level == 'all':
        convert = SORTED_TIERS
        filter = []
    elif args.level in ['1', '2', '3', '4', '5']:
        cut = int(args.level) - 1
        convert = SORTED_TIERS[:cut]
        filter = SORTED_TIERS[cut:]
    else:
        raise ValueError("%s is not a valid level" % args.level)
    reader = hgsc_vcf.Reader(args.INPUT)
    header = reader.header
    header.add_header('##INFO=<ID=OF,Number=1,Type=String,Description="original tiering call for this variant in this sample">')
    header.add_header('##COMMAND=<ID=filter_muse.py,Params="%s">' % ' '.join(sys.argv))
    writer = hgsc_vcf.Writer(args.OUTPUT, header)
    writer.write_header()
    for record in reader:
        convert_record(record, convert, filter)
        writer.write_record(record)
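# For example, with SORTED_TIERS assumed to be ['Tier1', ..., 'Tier5'] (the
# constant is defined elsewhere), --level 3 gives cut = 2, so convert =
# ['Tier1', 'Tier2'] and filter = ['Tier3', 'Tier4', 'Tier5']. One plausible
# reading of convert_record (referenced above but not shown): the original tier
# call is preserved in the OF INFO field declared above, tiers in `convert` are
# accepted, and tiers in `filter` are rejected. The FILTER layout (a list of
# strings) is an assumption about hgsc_vcf records.
def convert_record(record, convert, filter):
    original = record['FILTER'][0]
    record['INFO']['OF'] = [original]
    record['FILTER'] = ['PASS'] if original in convert else ['TIERED']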
if __name__ == '__main__':
    # script entry point: delegate to main() rather than repeating its body.
    # Only the OUTPUT argument appeared here in the original; the other
    # arguments are assumed from their use in main() above.
    parser = argparse.ArgumentParser(description='filter MuSE calls by tier level')
    parser.add_argument('--level', default='all', help="tier level to keep: 'all' or 1-5")
    parser.add_argument('INPUT', type=argparse.FileType('r'), help='input file')
    parser.add_argument('OUTPUT', type=argparse.FileType('w'), help='output file')
    args = parser.parse_args()
    main(args)
def main(args):
    reader = hgsc_vcf.Reader(args.INFILE)
    writer = csv.DictWriter(
        args.OUTFILE, delimiter='\t',
        fieldnames=[
            'CHROM', 'INPOS', 'INREF', 'INALT', 'VALPOS', 'VALREF', 'VALALT',
            'SUM_TUMOR_REF', 'SUM_TUMOR_ALT', 'SUM_TUMOR_DP',
            'SUM_NORMAL_REF', 'SUM_NORMAL_ALT', 'SUM_NORMAL_DP',
            'SUM_TUMOR_VAL_REF', 'SUM_TUMOR_VAL_ALT', 'SUM_TUMOR_VAL_DP',
            'SUM_NORMAL_VAL_REF', 'SUM_NORMAL_VAL_ALT', 'SUM_NORMAL_VAL_DP',
            'VALKEY'])
    writer.writeheader()

    def batch(reader):
        # group consecutive records that share the same OC (original call) key
        first = reader.next()
        valbatch = new_valbatch(first)
        for record in reader:
            if not valbatch.oc_match(record['INFO']['OC'][0]):
                yield valbatch
                valbatch = new_valbatch(record)
            else:
                valbatch.add_record(record)
        yield valbatch

    for b in batch(reader):
        b.clean()
        for r in b.get_records():
            tumor_val_samples = get_samples(r, 'TUMOR_VALIDATION')
            tumor_samples = get_samples(r, r'^TUMOR$')
            normal_val_samples = get_samples(r, 'NORMAL_VALIDATION')
            normal_samples = get_samples(r, r'^NORMAL$')
            o = {
                'CHROM': r['CHROM'],
                'INPOS': b.pos,
                'INREF': b.ref,
                'INALT': b.alt,
                'VALPOS': r['POS'],
                'VALREF': r['REF'],
                'VALALT': ','.join(r['ALT']),
                'SUM_TUMOR_REF': str(get_allele_count(r, 0, tumor_samples)),
                'SUM_TUMOR_ALT': str(get_allele_count(r, 1, tumor_samples)),
                'SUM_TUMOR_DP': str(sum([int(s['DP'][0]) for s in tumor_samples.values()])),
                'SUM_NORMAL_REF': str(get_allele_count(r, 0, normal_samples)),
                'SUM_NORMAL_ALT': str(get_allele_count(r, 1, normal_samples)),
                'SUM_NORMAL_DP': str(sum([int(s['DP'][0]) for s in normal_samples.values()])),
                'SUM_TUMOR_VAL_REF': str(get_allele_count(r, 0, tumor_val_samples)),
                'SUM_TUMOR_VAL_ALT': str(get_allele_count(r, 1, tumor_val_samples)),
                'SUM_TUMOR_VAL_DP': str(sum([int(s['DP'][0]) for s in tumor_val_samples.values()])),
                'SUM_NORMAL_VAL_REF': str(get_allele_count(r, 0, normal_val_samples)),
                'SUM_NORMAL_VAL_ALT': str(get_allele_count(r, 1, normal_val_samples)),
                'SUM_NORMAL_VAL_DP': str(sum([int(s['DP'][0]) for s in normal_val_samples.values()])),
                'VALKEY': extract_key(r)}
            writer.writerow(o)
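# A minimal sketch of get_samples (used above): it plausibly selects entries of
# the record's SAMPLES mapping whose sample id matches the given regular
# expression, so r'^TUMOR$' matches only the exact TUMOR sample while
# 'TUMOR_VALIDATION' matches the validation samples. The SAMPLES layout
# (sample id -> FORMAT dict) is inferred from the DP lookups above.
import re

def get_samples(record, pattern):
    return {sid: s for sid, s in record['SAMPLES'].items() if re.search(pattern, sid)}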
def main(args):
    vcf_reader = hgsc_vcf.Reader(args.INPUTVCF)
    vcf_container_cosmic = VCFContainer(hgsc_vcf.Reader(args.COSMICVCF), args.buffer)
    # read in the dbsnp data
    # connect to the reference file
    ifasta = IndexedFastaSequenceFile(File(args.reference))
    add_command_to_reader(
        vcf_reader,
        '##COMMAND=<ID=annotate_vcf_cosmic.py,Params="%s">' % " ".join(sys.argv))
    # add the COSMIC header info
    add_info_to_reader(
        vcf_reader,
        '##INFO=<ID=COSMIC,Number=.,Type=String,Description="'
        + 'COSMIC info, can be one of NONE, BUFFER, CODON, SITE. '
        + 'All but NONE are accompanied by AA|CDS|CNT. BUFFER indicates the COSMIC site is within %(buffer)sbp of the position. example: '
        + 'SITE|p.P228fs*227|c.682_683insT|3 or NONE. VCF file used was %(cosmicvcf)s.">\n' % {
            'buffer': str(args.buffer),
            'cosmicvcf': args.COSMICVCF})
    # add the context
    add_info_to_reader(
        vcf_reader,
        '##INFO=<ID=CONTEXT,Number=1,Type=String,Description="Base context around variant. [POS - 5, POS + len(REF) + 4]">\n')
    # add the validation status info
    add_info_to_reader(
        vcf_reader,
        '##INFO=<ID=DBVS,Number=1,Type=String,Description="dbSNP validation status, | separated">\n')
    # get the format for the vep annotations
    _vep_format = get_csq_format(
        [h for h in vcf_reader.header.get_headers('INFO', 'CSQ')][0].fields['Description'])
    vcf_writer = hgsc_vcf.Writer(args.OUTPUTVCF, vcf_reader.header)
    vcf_writer.write_header()
    for record in vcf_reader:
        try:
            # check that the position is annotated with CSQ; if not, this is a write-through
            if 'CSQ' in record['INFO']:
                # matches are intersecting hits in the COSMIC VCF
                _matches = vcf_container_cosmic.intersect(record)
                _csq_l = [dict(zip(_vep_format, _csq.split('|'))) for _csq in record['INFO'].get('CSQ')]
                _info = generate_cosmic_info(_matches, _csq_l, record)
                record['INFO']['COSMIC'] = _info
                # extract the dbsnp validation rsids
                _existing_ids = [_id for _csq in _csq_l for _id in _csq['Existing_variation'].split('&')]
                record['INFO']['DBVS'] = [generate_valstatus_info(_existing_ids, args.DBSNPVAL)]
                record['INFO']['CONTEXT'] = [str(String(ifasta.getSubsequenceAt(
                    record['CHROM'],
                    record['POS'] - 5,
                    record['POS'] + len(record['REF']) + 4).getBases()))]
        except Exception:
            logger.exception("Error in record modification")
        vcf_writer.write_record(record)
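# The reference lookup above uses htsjdk's IndexedFastaSequenceFile together
# with java.io.File and java.lang.String, which implies this script runs under
# Jython. A plausible import block (an assumption; the real imports are not
# shown in this section):
#
#     from java.io import File
#     from java.lang import String
#     from htsjdk.samtools.reference import IndexedFastaSequenceFile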
def main(args):
    # the model and type map may be given inline as JSON or as paths to JSON files
    if os.path.isfile(args.MODEL):
        with open(args.MODEL, 'r') as fi:
            _model = json.load(fi)
    else:
        _model = json.loads(args.MODEL)
    if os.path.isfile(args.TYPEMAP):
        with open(args.TYPEMAP, 'r') as fi:
            _map = json.load(fi)
    else:
        _map = json.loads(args.TYPEMAP)
    logger.info("Input map: %s", _map)
    _config = json.load(args.CONFIG)
    vcf = hgsc_vcf.Reader(args.INPUT)
    # reduce the map to just the samples used in this study
    _map = {k: v for k, v in _map.items() if k in vcf.header.samples}
    logger.info("Revised map: %s", _map)
    logger.info("Direct Mapping")
    sample_model_direct_map = make_sample_direct_map(_map, _model)
    logger.info(sample_model_direct_map)
    sample_type_direct_mapping = set([v for v in get_samples_from_model_map(sample_model_direct_map)])
    logger.info(sample_type_direct_mapping)
    sample_model_regex_map = make_sample_map(
        {k: v for k, v in _map.items() if k not in [s for s, m in sample_type_direct_mapping]},
        _model)
    logger.info(sample_model_regex_map)
    # check that there are no duplicates
    sample_type_regex_mapping = set([v for v in get_samples_from_model_map(sample_model_regex_map)])
    logger.info("Regex Mapping")
    logger.info(sample_type_regex_mapping)
    # take the mappings that are direct first, then the ones that are satisfied via regex
    direct_map_samples = [s for s, k in sample_type_direct_mapping]
    regex_only_mappings = [(s, k) for s, k in sample_type_regex_mapping if s not in direct_map_samples]
    sample_type_mapping = list(sample_type_direct_mapping) + list(set(regex_only_mappings))
    sample_model_map = merge_map(sample_model_direct_map, sample_model_regex_map)
    logger.info("Final Mapping")
    logger.info(sample_model_map)
    logger.info(sample_type_mapping)
    # sample names only; a name appearing twice means it was mapped in two branches
    _used_samples = [s for s, t in sample_type_mapping]
    if len(_used_samples) != len(set(_used_samples)):
        raise ValueError("You have samples that are listed in two branches, indicates a role collision: %s" % _used_samples)
    if len(_used_samples) != len(vcf.header.samples):
        logger.error("Used samples: %s", _used_samples)
        logger.error("Header samples: %s", vcf.header.samples)
        for s in vcf.header.samples:
            if s not in _used_samples:
                logger.error("No match for %s", s)
        raise ValueError("You have not mapped all samples in the vcf, check that the roles for each sample in the TYPEMAP are used in the MODELMAP")
    # build the _sample_affinity_f
    _sample_affinity_f = build_sample_affinity_function(_config)
    _record_f = build_record_mod_function(_config)
    writer = hgsc_vcf.Writer(args.OUTPUT, vcf.header)
    writer.header.add_header(
        '##INFO=<ID=VTE,Number=1,Type=String,Description="Variant type exclusivity based on the input models and mappings. '
        'If multiple types are detected they are separated by |.">')
    writer.header.add_header('##FORMAT=<ID=VTES,Number=1,Type=Integer,Description="Variant type exclusivity score">')
    writer.header.add_header('##COMMAND=<ID=vartype-exclusivity,ARGS="%s">' % re.escape(' '.join(sys.argv)))
    header_samples = {s['ID']: s for s in writer.header.get_headers('SAMPLE')}
    for s, t in sample_type_mapping:
        if s in header_samples:
            header_samples[s]['TYPE'] = t
            header_samples[s]['ROLE'] = _map[s]
        else:
            writer.header.add_header('##SAMPLE=<ID=%s,TYPE=%s,ROLE=%s>' % (s, t, _map[s]))
    header_samples = {s['ID']: s for s in writer.header.get_headers('SAMPLE')}
    if args.subject:
        subject = args.subject
    else:
        # derive a stable anonymous subject id from the sorted sample names
        subject = base64.urlsafe_b64encode(hashlib.md5("".join(sorted(header_samples.keys()))).digest())
    if len([h for h in writer.header.get_headers('SUBJECT')]) < 1:
        writer.header.add_header('##SUBJECT=<ID="%s">' % subject.replace('=', ''))
    writer.write_header()
    for record in vcf:
        for b_record in hgsc_vcf.select_allele(record, lambda x: [hgsc_vcf.best_alt_index(x)]):
            b_record['INFO']['VTE'] = [vartype_exclusivity(sample_model_map, b_record, _sample_affinity_f) + _record_f(b_record)]
            writer.write_record(b_record)