def test_illumina_writer(self):
    """Check the text IlluminaWriter produces for a small bgzipped VCF.

    The writer is run twice over the same records: once with
    ``vcf_fpath`` set and once without it.
    """
    # Sketch carried over from the original test:
    # 01234
    # 1234567890
    # CCTGATTT-A
    # TAACGA
    # - C -A
    # The literal is written with single spaces and converted to tabs
    # below, so its lines must start at column 0.
    vcf = '''##fileformat=VCFv4.1
#CHROM POS ID REF ALT QUAL FILTER INFO
ref 1 . C T 10 PASS .
ref 2 . CT CA,C 10 PASS .
ref 3 . T A 10 PASS .
ref 4 . G C 10 PASS .
ref 5 . A G 10 PASS .
ref 6 . T A,C 10 PASS .
ref 7 . TT T 10 PASS .
ref 8 . T TA 10 PASS .
ref 10 . A C 10 PASS .
'''
    vcf = vcf.replace(' ', '\t')
    vcf_fhand = NamedTemporaryFile(suffix='.vcf')
    vcf_fhand.write(vcf)
    vcf_fhand.flush()
    vcf_compressed = NamedTemporaryFile(suffix='.vcf.gz')
    compress_with_bgzip(vcf_fhand, vcf_compressed)
    index_vcf_with_tabix(vcf_compressed.name)
    index_fpath = vcf_compressed.name + '.tbi'

    try:
        ref_fhand = NamedTemporaryFile(suffix='.fasta')
        ref_fhand.write('>ref\nACTGATTTA\n')
        ref_fhand.flush()

        out_fhand1 = StringIO()
        writer = IlluminaWriter(ref_fhand.name, out_fhand1, min_length=0,
                                vcf_fpath=vcf_compressed.name)
        for snp in Reader(filename=vcf_compressed.name):
            writer.write(snp)

        # With no SNPs converted to IUPAC around
        out_fhand2 = StringIO()
        writer = IlluminaWriter(ref_fhand.name, out_fhand2, min_length=0)
        for snp in Reader(filename=vcf_compressed.name):
            writer.write(snp)
    finally:
        # The index is not a NamedTemporaryFile, so it has to be removed
        # by hand; do it even when one of the writer passes raises.
        remove(index_fpath)

    expected = u'CHROM\tPOS\tID\tseq\n'
    expected += u'ref\t1\t.\t[C/T]*WSRHT-^A\n'
    expected += u'ref\t2\t.\tY[CT/CA/C]SRHT-^A\n'
    expected += u'ref\t3\t.\tYC[T/A]SRHT-^A\n'
    expected += u'ref\t4\t.\tY*W[G/C]RHT-^A\n'
    expected += u'ref\t5\t.\tY*WS[A/G]HT-^A\n'
    expected += u'ref\t6\t.\tY*WSR[T/A/C]T-^A\n'
    expected += u'ref\t7\t.\tY*WSRH[TT/T]A\n'
    expected += u'ref\t8\t.\tY*WSRHT[T/TA]A\n'
    expected += u'ref\t10\t.\tY*WSRHT-^A[A/C]\n'
    assert expected == out_fhand1.getvalue()

    # NOTE(review): out_fhand2 is filled above but never asserted; this
    # check still reads out_fhand1. Presumably it was meant to target
    # out_fhand2 -- confirm the expected output before switching it.
    expected = u'CHROM\tPOS\tID\tseq\nref\t1\t.\t[C/T]'
    assert expected in out_fhand1.getvalue()
def assertVcfHasVariantWithCall(self, vcf, chrom, pos, sample, call):
    """
    Assert that a call is made for a given sample in a given position.

    `call` is a dict corresponding to elements in the vcf sample field.
    Every key/value pair in `call` must be matched by the sample's call
    data; fields not named in `call` are ignored.

    Example:

        self.assertVcfHasVariantWithCall(my_vcf, 1, 3184885, 'B',
                                         call={'GT': '1/2', 'DP': 10})

    Raises AssertionError when no matching call is found.
    """
    self.assertVcfHasSample(vcf, sample)

    # Sentinel so a field that is absent from the call data never
    # compares equal to an expected value (not even to None).
    missing = object()

    reader = Reader(filename=vcf)
    for variant in reader.fetch(chrom=chrom, start=pos - 1, end=pos):
        if variant.CHROM != str(chrom) or variant.POS != pos:
            continue
        for cc in variant.samples:
            if cc.sample != sample:
                continue
            # Compare field by field through attribute access instead of
            # building sets from cc.data.__dict__: that attribute is not
            # reliably present on the call-data tuple, and set() fails
            # for unhashable (list-valued) fields.
            if all(getattr(cc.data, key, missing) == value
                   for key, value in call.items()):
                return

    raise AssertionError(
        "Call {} not present for sample {} at {}:{} in {}".format(
            call, sample, chrom, pos, vcf))
def test_vcf2sfs(self):
    """Compare vcf2sfs output for the YRI panel rows with self.result.

    The three elements of the result are checked one by one.
    """
    # Local import: array_equal accepts arrays, lists and scalars alike.
    # The previous `all(a == b)` raises TypeError whenever `a == b` is a
    # plain bool, e.g. for the integer in self.result[1].
    from numpy import array_equal

    vcf_file = Reader(filename=self.filename, compressed=True,
                      encoding='utf-8')
    panel = read_csv(self.panelname, sep=None, engine='python',
                     skipinitialspace=True, index_col=0)
    panel = panel[panel['pop'] == 'YRI']
    result = vcf2sfs(vcf_file, panel, self.chrom, self.start, self.end,
                     select_chr=True)

    assert array_equal(self.result[0], result[0]), \
        "Failed test of vcf2sfs (sfs)"
    assert array_equal(self.result[1], result[1]), \
        "Failed test of vcf2sfs (sample size)"
    assert array_equal(self.result[2], result[2]), \
        "Failed test of vcf2sfs (common variant)"
def test_calculate_statistics(self):
    """Check VCFcomparisons.calculate_statistics on two fixture VCFs."""
    stat_keys = ('common', 'uncalled', 'different', 'common_snps_prc')
    scenarios = (
        # with freebayes
        (FREEBAYES_VCF_PATH, None, (944, 0, 0, 100)),
        # with varscan
        (VARSCAN_VCF_PATH, ('mu16',), (107, 69, 0, 100)),
    )

    for vcf_path, sample_names, expected_values in scenarios:
        reader = Reader(filename=vcf_path)
        if sample_names is None:
            vcf_to_compare = VCFcomparisons(vcf_path)
            stats = vcf_to_compare.calculate_statistics(reader)
        else:
            # A fresh list per call, as in the hand-written version.
            vcf_to_compare = VCFcomparisons(vcf_path,
                                            samples=list(sample_names))
            stats = vcf_to_compare.calculate_statistics(
                reader, samples=list(sample_names))

        for key, expected in zip(stat_keys, expected_values):
            assert stats[key] == expected
def assertVcfHasVariantAt(self, vcf, chrom, pos):
    """Assert that `vcf` holds a record at `chrom`:`pos`.

    Raises AssertionError when the fetched region contains no record
    whose CHROM equals ``str(chrom)`` and whose POS equals `pos`.
    """
    reader = Reader(filename=vcf)
    region = reader.fetch(chrom=chrom, start=pos - 1, end=pos)
    wanted_chrom = str(chrom)

    if any(rec.CHROM == wanted_chrom and rec.POS == pos for rec in region):
        return

    raise AssertionError("Variant at {}:{} not present in {}".format(
        chrom, pos, vcf))
def vcf_to_hgvs(build_name, input_handle, output_handle):
    """
    Write one HGVS description per alternate allele found in a VCF file.

    :arg str build_name: Build name.
    :arg stream input_handle: Open readable handle to a VCF file.
    :arg stream output_handle: Open writeable handle to a text file.
    """
    converter = Mutalyzer(build_name)
    reader = Reader(input_handle)

    for rec in reader:
        for allele in rec.ALT:
            description = converter.vcf_to_hgvs(
                rec.CHROM, rec.POS, rec.REF, allele)
            # One description per output line.
            output_handle.write('{}\n'.format(description))
def parse_vcf(assembly, vcf_infile, compressed=True, verbose=True,
              by_id=True, **tabix_params):
    """Yield documents parsed from the records of a tabix-indexed VCF.

    Each record is converted with ``parse_one_rec(assembly, rec)``.
    Records whose ``_id`` is falsy are skipped and counted. With
    ``by_id`` true and a list-valued ``_id``, one shallow copy of the
    document is yielded per id, taking the matching element of ``alt``
    and of ``doc[assembly]``; otherwise the document is yielded as-is.

    :param assembly: passed to ``parse_one_rec`` and used as a doc key.
    :param vcf_infile: path of the VCF file.
    :param compressed: forwarded to ``Reader``; also switched on when
        the file name ends in ``.gz``.
    :param verbose: log rsid and _id of every yielded document.
    :param by_id: split documents that carry several ids.
    :param tabix_params: if given, restrict iteration to
        ``vcf_r._tabix.fetch(**tabix_params)``.
    """
    t0 = time.time()
    # This used to be `compressed == ...`, a comparison whose result was
    # thrown away. Turn compression on for *.gz names while still
    # honouring an explicit compressed=True for other file names.
    compressed = compressed or vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)  # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)

    # cnt_1: records read, cnt_2: docs yielded, cnt_3: records skipped
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(assembly, rec)
        doc_id = doc['_id']
        if not doc_id:
            cnt_3 += 1
        elif by_id and isinstance(doc_id, list):
            # one hgvs id, one doc
            for i, _id in enumerate(doc_id):
                _doc = copy.copy(doc)
                _doc['alt'] = doc['alt'][i]
                _doc[assembly] = doc[assembly][i]
                _doc['_id'] = _id
                yield _doc
                cnt_2 += 1
                if verbose:
                    logging.info("%s\t%s", _doc['rsid'], _doc['_id'])
        else:
            # single id, or one rsid per doc when by_id is false
            yield doc
            cnt_2 += 1
            if verbose:
                logging.info("%s\t%s", doc['rsid'], doc['_id'])
        cnt_1 += 1

    logging.info("Done. [{}]".format(timesofar(t0)))
    logging.info("Total rs: {}; total docs: {}; skipped rs: {}".format(
        cnt_1, cnt_2, cnt_3))
def __init__(self):
    """Set up fixture paths, the opened inputs and the expected values."""
    vcf_path = projdir + '/chr1vcftest.gz'
    panel_path = projdir + '/testpanel.panel'

    self.filename = vcf_path
    self.panelname = panel_path
    self.vcf_file = Reader(filename=vcf_path, compressed=True,
                           encoding='utf-8')
    self.panel = read_csv(panel_path, sep=None, engine='python',
                          skipinitialspace=True, index_col=0)

    # Region handed to the code under test.
    self.chrom = '1'
    self.start = 159173097
    self.end = 159176290

    # Expected result tuple: (array, 108, ['rs2814778']).
    expected_sfs = array([
        3, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0])
    self.result = (expected_sfs, 108, ['rs2814778'])
    self.rho = 0.9864313429651763
def vcf_to_db(build_name, input_handle, output_handle):
    """
    Write every alternate allele of a VCF file in database format.

    :arg str build_name: Build name.
    :arg stream input_handle: Open readable handle to a VCF file.
    :arg stream output_handle: Open writeable handle to a text file.
    """
    converter = Mutalyzer(build_name)
    reader = Reader(input_handle)

    for rec in reader:
        for allele in rec.ALT:
            db_fields = converter.vcf_to_db(
                rec.CHROM, rec.POS, rec.REF, allele)
            # The conversion result is unpacked into _write_db's
            # positional arguments.
            _write_db(output_handle, *db_fields)
def assertVcfHasVariantWithChromPosRefAlt(self, vcf, chrom, pos, ref, alt):
    """Assert that `vcf` holds a record matching chrom, pos, ref and alt.

    A record matches when CHROM equals ``str(chrom)``, POS equals `pos`,
    REF equals `ref` and `alt` is among its ALT entries. Raises
    AssertionError when the fetched region has no such record.
    """
    reader = Reader(filename=vcf)
    region = reader.fetch(chrom=chrom, start=pos - 1, end=pos)

    def _matches(rec):
        if rec.CHROM != str(chrom):
            return False
        if rec.POS != pos:
            return False
        if rec.REF != ref:
            return False
        return alt in rec.ALT

    if any(_matches(rec) for rec in region):
        return

    raise AssertionError(
        "Variant at {}:{} {}/{} not present in {}".format(
            chrom, pos, ref, alt, vcf))
def _extract(cls, file_path: Union[str, Path]) -> EvaluationData:
    """Read a VCF file into an EvaluationData object.

    One EvaluationDataEntry is built per record from CHROM, POS, REF,
    the ALT sequence, the lower-cased first ``CLNSIG`` INFO value, the
    record's ``var_type`` and the genome resolved from the ``reference``
    metadata entry.

    :param file_path: path of the VCF file to read.
    :return: ``EvaluationData.from_records`` applied to all entries.
    """
    records = []
    # Keep the handle open only while the reader is being consumed; it
    # was previously opened inline and never closed.
    with open(file_path, "r") as vcf_handle:
        vcf_reader = Reader(vcf_handle)
        for vcf_record in vcf_reader:
            chrom = str(vcf_record.CHROM)
            pos = vcf_record.POS
            ref = vcf_record.REF
            # NOTE(review): with more than one ALT this takes `.sequence`
            # of the whole ALT collection, which presumably fails --
            # confirm that inputs always carry exactly one ALT.
            alt = (vcf_record.ALT[0] if len(vcf_record.ALT) == 1
                   else vcf_record.ALT).sequence
            clnsig = PathogencityClass(vcf_record.INFO["CLNSIG"][0].lower())
            variation_type = VariationType(vcf_record.var_type)
            rg = ReferenceGenome.resolve(vcf_reader.metadata["reference"])
            records.append(
                EvaluationDataEntry(chrom, pos, ref, alt, clnsig,
                                    variation_type, rg))
    return EvaluationData.from_records(records)
def main():
    """Rewrite a VCF, setting wholly uncalled sample groups to 0/0.

    Command line: ``<infile|-> <outfile|-> <comma-separated group sizes>``.
    Samples are split into consecutive groups of the given sizes. For
    every record, each group in which no sample is called has the GT of
    all its calls replaced by "0/0", keeping their DP, GQ and PL values.
    "-" selects stdin or stdout.
    """
    infile = sys.argv[1]
    outfile = sys.argv[2]
    group_sizes = tuple(int(size) for size in sys.argv[3].split(","))
    # Group g covers samples[group_ixs[g]:group_ixs[g + 1]].
    group_ixs = (0,) + tuple(cumsum(group_sizes))

    # NOTE(review): the "U" open mode is rejected by recent Python 3
    # releases; left as in the original pending confirmation of the
    # interpreter this script targets.
    in_handle = sys.stdin if infile == "-" else open(infile, "rU")
    try:
        out_handle = sys.stdout if outfile == "-" else open(outfile, "w")
        try:
            reader = Reader(in_handle)
            writer = Writer(out_handle, reader)
            for rec in reader:
                fix = []
                for lo, hi in zip(group_ixs, group_ixs[1:]):
                    calls = rec.samples[lo:hi]
                    if not any(c.called for c in calls):
                        fix.extend(calls)
                for c in fix:
                    # This is a hack because PyVCF _Call objects are not
                    # mutable
                    c.data = vcf.model.make_calldata_tuple(c.data._fields)(
                        GT="0/0", DP=c.data.DP, GQ=c.data.GQ, PL=c.data.PL)
                writer.write_record(rec)
        finally:
            # Only close what this function opened. The old code tested a
            # misspelled flag here (NameError).
            if out_handle is not sys.stdout:
                out_handle.close()
    finally:
        # The old code reused `i` as a loop index, so its final
        # `i.close()` was called on an integer instead of the file.
        if in_handle is not sys.stdin:
            in_handle.close()
def BisSNP_vcf2bed(INfile, OUTfile, mextDir, my_session, logobject):
    """Flatten a single-sample VCF into a tab-separated table.

    One output row is written per VCF record with the columns listed in
    ``colN``: CHROM, POS, the ``CS`` and ``MQ0`` INFO values, REF, QUAL,
    and the sample's DP, DP4 (four columns) and BRC6 (six columns).

    :param INfile: path of the input VCF.
    :param OUTfile: path of the table to write (tab-separated, 'NA' for
        missing values, no index column).
    :param mextDir: unused here; kept for interface compatibility.
    :param my_session: unused here; kept for interface compatibility.
    :param logobject: logger whose ``info`` is called on completion.
    """
    # Count data lines in Python instead of `grep | wc -l` through a
    # shell: no quoting problems with unusual paths, and only header
    # lines (leading '#') are excluded, not every line containing '#'.
    with open(INfile, 'r') as count_handle:
        rowNum = sum(1 for line in count_handle
                     if not line.startswith('#'))

    colN = [
        'CHROM', 'POS', 'STRAND', 'REF', 'QUAL', 'MQ0', 'DP', 'DP_FREF',
        'DP_RREF', 'DP_FALT', 'DP_RALT', 'C_Cstrand', 'T_Cstrand',
        'Illegal_Cstrand', 'G_Gstrand', 'A_Gstrand', 'Illegal_Gstrand'
    ]
    # create empty table of desired size
    dummyA = np.empty(shape=(rowNum, 17), dtype='object')
    CpG_df = pd.DataFrame(dummyA, index=range(0, rowNum), columns=colN)

    # read in vcf
    with open(INfile, 'r') as vcf_handle:
        vcf_reader = Reader(vcf_handle)
        for record, i in zip(vcf_reader, range(0, rowNum)):
            CHROM = pd.Series(record.CHROM, name='CHROM')
            POS = pd.Series(record.POS, name='POS')
            # skip ALT for CpG calls; skip FILTER
            STRAND = pd.Series(record.INFO['CS'], name='STRAND')
            REF = pd.Series(record.REF, name='REF')
            QUAL = pd.Series(record.QUAL, name='QUAL')
            MQ0 = pd.Series(record.INFO['MQ0'], name='MQ0')
            # Positional unpacking: relies on the sample data holding
            # exactly these eleven fields in this order.
            # skip GT as non-homozygous CGs have been filtered out
            GT, BQ, BRC6, CM, CP, CU, DP, DP4, GP, GQ, SS = \
                record.samples[0].data
            DP = pd.Series(DP, name='DP')
            DP4_dict = cll.OrderedDict(
                zip(['DP_FREF', 'DP_RREF', 'DP_FALT', 'DP_RALT'], DP4))
            DP4_df = pd.DataFrame.from_dict(
                DP4_dict, orient='index').transpose()
            BRC6_dict = cll.OrderedDict(
                zip([
                    'C_Cstrand', 'T_Cstrand', 'Illegal_Cstrand',
                    'G_Gstrand', 'A_Gstrand', 'Illegal_Gstrand'
                ], BRC6))
            BRC6_df = pd.DataFrame.from_dict(
                BRC6_dict, orient='index').transpose()
            comb_df = pd.concat(
                [CHROM, POS, STRAND, REF, QUAL, MQ0, DP, DP4_df, BRC6_df],
                axis=1, join='inner')
            CpG_df.iloc[i, ] = comb_df.iloc[0, ]

    CpG_df.to_csv(OUTfile, sep='\t', na_rep='NA', index=False)
    logobject.info('CpG vcf to txt conversion complete')
    return
def test_neutrality_from_vcf(vcf_name, panel_name, coord, start, end, sel,
                             reps, select_chr):
    """Calculate the log odds ratio of the data specified by PyVCF file
    VCF_NAME, sample details PANEL_NAME and region defined by CHROM,
    START and END."""
    # Docstring wording is kept as written: it may be shown as CLI help.
    reader = Reader(filename=vcf_name, compressed=True, encoding='utf-8')
    samples = pd.read_csv(panel_name, sep=None, engine='python',
                          skipinitialspace=True, index_col=0)

    # sel is a (column, value) pair; (None, None) means "no filtering".
    if sel != (None, None):
        column, wanted = sel[0], sel[1]
        samples = samples[samples[column] == wanted]

    # Only the first of the three returned values is used here.
    spectrum, _n, _non_seg_snps = selectiontest.vcf2sfs(
        reader, samples, coord, start, end, select_chr)
    log_odds = selectiontest.test_neutrality(
        spectrum, variates0=None, variates1=None, reps=reps)
    click.echo(log_odds)
def test_errors(self):
    """IlluminaWriter.write must raise NotEnoughAdjacentSequenceError.

    The writer is built without ``min_length`` and fed the first record
    of a small bgzipped VCF.
    """
    import os

    # Sketch carried over from the original test:
    # 01234
    # 1234567890
    # CCTGATTT-A
    # TAACGA
    # - C -A
    # The literal is written with single spaces and converted to tabs
    # below, so its lines must start at column 0.
    vcf = '''##fileformat=VCFv4.1
#CHROM POS ID REF ALT QUAL FILTER INFO
ref 1 . C T 10 PASS .
ref 2 . CT CA,C 10 PASS .
ref 3 . T A 10 PASS .
ref 4 . G C 10 PASS .
ref 5 . A G 10 PASS .
ref 6 . T A,C 10 PASS .
ref 7 . TT T 10 PASS .
ref 8 . T TA 10 PASS .
ref 10 . A C 10 PASS .
'''
    vcf = vcf.replace(' ', '\t')
    vcf_fhand = NamedTemporaryFile(suffix='.vcf')
    vcf_fhand.write(vcf)
    vcf_fhand.flush()
    vcf_compressed = NamedTemporaryFile(suffix='.vcf.gz')
    compress_with_bgzip(vcf_fhand, vcf_compressed)
    index_vcf_with_tabix(vcf_compressed.name)
    # Presumably the tabix index is written next to the compressed file
    # with a '.tbi' suffix; removal below is guarded in case it is not.
    index_fpath = vcf_compressed.name + '.tbi'

    try:
        ref_fhand = NamedTemporaryFile(suffix='.fasta')
        ref_fhand.write('>ref\nACTGATTTA\n')
        ref_fhand.flush()

        out_fhand = StringIO()
        writer = IlluminaWriter(ref_fhand.name, out_fhand,
                                vcf_fpath=vcf_compressed.name)
        snps = Reader(filename=vcf_compressed.name)
        # Builtin next() instead of the Python-2-only .next() method.
        snp = next(snps)
        try:
            writer.write(snp)
        except IlluminaWriter.NotEnoughAdjacentSequenceError:
            pass
        else:
            self.fail('NotEnoughAdjacentSequenceError expected')
    finally:
        # The index is not a temporary-file object, so nothing else
        # deletes it.
        if os.path.exists(index_fpath):
            os.remove(index_fpath)
def _grid_export_vcf(filename, genome_build, colmodels, items, sample_ids,
                     sample_names_by_id):
    """Build a streaming response that serves `items` as a VCF download.

    A header template is produced from the column models and parsed by
    a Reader, which the Writer then uses as its template. Records are
    written into a StashFile one at a time and its ``value`` is yielded
    after each write.
    """
    sample_names = [sample_names_by_id[sample_id]
                    for sample_id in sample_ids]
    info_dict = _get_colmodel_info_dict(colmodels)
    header_file = _colmodels_to_vcf_header(genome_build, info_dict,
                                           sample_names)
    template_reader = Reader(header_file, strict_whitespace=True)

    stash = StashFile()
    writer = Writer(stash, template_reader)

    def _stream_rows():
        # Lazily convert and write one grid item per iteration.
        for item in items:
            vcf_record = _grid_item_to_vcf_record(
                info_dict, item, sample_ids, sample_names)
            writer.write_record(vcf_record)
            yield stash.value

    response = StreamingHttpResponse(_stream_rows(),
                                     content_type="text/csv")
    response['Content-Disposition'] = f'attachment; filename="{filename}.vcf"'
    return response
def parse(json_filename, vcf_file):
    """Dump VCF records whose first AF value exceeds 0.01 to JSON.

    Selected records are converted with ``to_json(record, index)`` and
    grouped per chromosome in a dict that is written to `json_filename`.
    A summary with the selected and total record counts is written to
    ``Output/data_split_<second path component of json_filename>.txt``.
    ``errors.txt`` is truncated as a side effect.

    :param json_filename: output path; must contain at least one "/".
    :param vcf_file: path of the VCF file to read.
    """
    amount = 0
    # Initialised up front so an empty VCF reports 0 instead of failing
    # on an unbound loop variable.
    total_data = 0
    with open("errors.txt", "w"):
        pass

    json_chr = {}
    with open(vcf_file) as vcf_handle:
        vcf_reader = Reader(vcf_handle)
        for index, record in enumerate(vcf_reader):
            total_data = index + 1
            try:
                if record.INFO["AF"][0] > 0.01:
                    amount += 1
                    # Convert first so a KeyError from to_json leaves no
                    # empty chromosome entry behind.
                    entry = to_json(record, index)
                    json_chr.setdefault(str(record.CHROM), []).append(entry)
            # Just skip errors.
            except KeyError:
                pass

    # Write the file used for MongoDB
    with open(json_filename, "w") as outjson:
        json.dump(json_chr, outjson)

    # Extra information about how much data is stored.
    with open("Output/data_split_{}.txt".format(json_filename.split("/")[1]),
              "w") as dp:
        dp.write("Amount of data: {}\nTotal data in file: {}".format(
            amount, total_data))
def assertVcfHasSample(self, vcf, sample):
    """Assert that `sample` is among the sample names of `vcf`.

    Raises AssertionError otherwise.
    """
    known_samples = Reader(filename=vcf).samples
    if sample in known_samples:
        return
    raise AssertionError("Sample {} not present in {}".format(
        sample, vcf))
import sys
from vcf import Reader
import gzip

# Count the records of the VCF named on the command line that have at
# most one ALT allele, a QUAL of at least 20 and a first `aaf` value of
# at most 0.05, then print that count.
n = 0
with open(sys.argv[1], 'r') as handle:
    vcf = Reader(handle)
    for v in vcf:
        if len(v.ALT) > 1:
            continue
        # A missing QUAL is treated as failing the filter; comparing
        # None with a number raises TypeError on Python 3.
        if v.QUAL is None or v.QUAL < 20:
            continue
        if v.aaf[0] > 0.05:
            continue
        n += 1
print(n)
def records_from_vcf(vcf_file: str) -> list:
    """Creates a list of VCF Record objects from a VCF file

    Records are read one at a time. A record is kept when the ratio of
    alternate alleles (as counted by ``get_genotypes``) to the number of
    samples is at least ``MAF_THRESHOLD``. A RuntimeError while handling
    a record counts it as having too many unknown genotypes; a
    ValueError counts it as failed. A summary is printed at the end.

    :param vcf_file: path of the VCF file to read.
    :return: list of ``VCFRecordObj`` for the records that were kept.
    """
    # Create the list of records
    records: list = []
    total: int = 0
    n_failed: int = 0
    n_with_too_many_unknown_genotypes: int = 0
    n_below_threshold: int = 0
    # Taken from the first sample of the first record that is read.
    data_set_ploidity = None

    # Load the VCF; the handle is closed once all records are consumed.
    with open(vcf_file, 'r') as vcf_handle:
        vcf_reader: Reader = Reader(vcf_handle)
        while True:
            try:
                total += 1
                r = next(vcf_reader)
                chromosome: str = r.CHROM
                position: int = r.POS
                ref_allele: str = r.REF
                alt_alleles: list = r.ALT
                if data_set_ploidity is None:
                    data_set_ploidity = r.samples[0].ploidity
                    assert data_set_ploidity == 1 or data_set_ploidity == 2
                genotypes, n_alternates = get_genotypes(
                    data_set_ploidity=data_set_ploidity,
                    r=r,
                    ref_allele=ref_allele,
                    alt_alleles=alt_alleles
                )
                assert len(genotypes) == len(r.samples)
                # Only add records if the ratio of alternate alleles to
                # total alleles exceeds the (MAF) filter threshold
                if n_alternates / len(r.samples) >= MAF_THRESHOLD:
                    records.append(VCFRecordObj(chromosome=chromosome,
                                                position=position,
                                                genotypes=genotypes))
                else:
                    n_below_threshold += 1
            except RuntimeError:
                # There was a missing genotype in one of the samples of
                # the current record
                n_with_too_many_unknown_genotypes += 1
                continue
            except ValueError:
                # The current record failed, e.g. because the call to
                # next above raised an exception
                n_failed += 1
                continue
            except StopIteration:
                # Decrement the total number of records since the latest
                # increment occurred after the last iteration
                total -= 1
                break

    assert (len(records) + n_with_too_many_unknown_genotypes
            + n_below_threshold + n_failed) == total
    print('For {}:'.format(vcf_file))
    print('Minor Allele Frequency Threshold: {0:.2f}'.format(MAF_THRESHOLD))
    # Print the percentages of the records that will actually be used, the
    # records with missing genotypes, the records that didn't have enough
    # alternate alleles, and the records that flat out failed
    print_percentage(msg='records successfully added',
                     amount=len(records), total=total)
    print_percentage(
        msg='records with too many missing genotypes',
        amount=n_with_too_many_unknown_genotypes,
        total=total
    )
    print_percentage(msg='records below the minor allele frequency threshold',
                     amount=n_below_threshold, total=total)
    print_percentage(msg='records that failed to parse',
                     amount=n_failed, total=total)
    print('Total number of records:', total)
    return records
#! /usr/bin/python
from vcf import Reader
from sys import argv
from sys import stderr

# Print one tab-separated line per record of the VCF given as the first
# command-line argument. Only CHROM, POS, REF, ALT and SAMPLE are
# filled in; the remaining columns are emitted empty.
COLUMNS = ('CHROM', 'POS', 'REF', 'ALT', 'SAMPLE', 'INDEL', 'HW', 'MASKED',
           'INFORM', 'REP', 'A_INDEL', 'A_SNP', 'ANNO')

path = argv[1]
reader = Reader(filename=path)
print('\t'.join(COLUMNS))
calls = len(reader.samples)
# The sample count used to be written to an undefined `output` object
# (NameError). It goes to stderr so the table on stdout stays clean.
stderr.write('calls:' + str(calls) + '\n')

for record in reader:
    info = {'CHROM': record.CHROM,
            'POS': record.POS,
            'REF': record.REF,
            'ALT': record.ALT,
            'SAMPLE': [],
            'INDEL': '',
            'HW': '',
            'MASKED': '',
            'INFORM': '',
            'REP': '',
            'A_INDEL': '',
            'A_SNP': '',
            'ANNO': ''}
    for sample in record.samples:
        # (sample name, first data field, second data field or None)
        if len(sample.data) > 1:
            info['SAMPLE'].append(
                (sample.sample, sample.data[0], sample.data[1]))
        else:
            info['SAMPLE'].append((sample.sample, sample.data[0], None))
    print('\t'.join(str(info[column]) for column in COLUMNS))