def main(opts):
    """Filter the records of a VCF file and write the survivors to a new VCF.

    The caller type ("samtools" / "freeBayes") is detected from the third
    header line of the input. Each record then goes through, in order:
    ``genotype_depth``, ``filter_qual`` + ``filter_depth``,
    ``filter_sample_depth``, ``filter_site``, ``sample_het``, optionally
    ``check_pairs`` (only when a pair file was given), and a final
    ``genotype_depth`` check on the rewritten record.

    Args:
        opts: parsed options; the attributes read here are ``vcf_file``,
            ``output``, ``pair_file``, ``call_rate``, ``record_qual``,
            ``record_depth``, ``sample_depth`` and ``fraction_het``.
    """
    # NOTE(review): binary mode is kept from the original; confirm it matches
    # the kind of input (plain vs. gzipped) this tool is expected to receive.
    with open(opts.vcf_file, "rb") as vcf_handle, \
            open(opts.output, 'w') as out_handle:
        vcf_reader = vcf.Reader(vcf_handle)
        writer_willem = vcf.Writer(out_handle, vcf_reader,
                                   lineterminator='\n')

        # The producing tool is looked up in the third header line; a shorter
        # header is treated like an unrecognised tool instead of crashing.
        header_lines = vcf_reader._header_lines
        tool_line = header_lines[2] if len(header_lines) > 2 else ""
        if "samtools" in tool_line:
            vcf_type = "samtools"
        elif "freeBayes" in tool_line:
            vcf_type = "freeBayes"
        else:
            print("Unknown vcf type, tool only handels vcf's produced by samtools or freeBayes")
            vcf_type = "unknown"

        # only if a pair file is specified
        pairs = None
        if opts.pair_file is not None:
            print("its not none")
            with open(opts.pair_file, "r") as pair_handle:
                pairs = [line.rstrip().split(",") for line in pair_handle]

        # for every record apply the filters
        for record in vcf_reader:
            if not genotype_depth(record, opts.call_rate):
                continue

            # Both record-level filters are always evaluated, as before.
            qual_pass = filter_qual(record, opts.record_qual)
            depth_pass = filter_depth(record, opts.record_depth)
            if not (qual_pass and depth_pass):
                continue

            new_record = filter_sample_depth(record, opts.sample_depth,
                                             vcf_type)
            if not filter_site(new_record, opts.fraction_het):
                continue

            new_record = sample_het(new_record, vcf_type)
            if opts.pair_file is not None:
                new_record = check_pairs(new_record, pairs, vcf_type)

            # The call rate is re-checked on the rewritten record.
            if genotype_depth(new_record, opts.call_rate):
                writer_willem.write_record(new_record)

        writer_willem.close()
sampleToBam[fields[0]] = fields[1] # Parse sample list unmappedToBam = dict() if args.unmappedToBam: with open(args.unmappedToBam) as f: for line in f: fields = line.strip().split('\t') unmappedToBam[fields[0]] = fields[1] # Parse VCF sv = collections.defaultdict(list) observedSamples = set() if args.vcfFile: vcf_reader = vcf.Reader( open(args.vcfFile), 'r', compressed=True) if args.vcfFile.endswith('.gz') else vcf.Reader( open(args.vcfFile), 'r', compressed=False) for record in vcf_reader: svlen = record.INFO['END'] - record.POS if (svlen <= maxSize) and ((not args.siteFilter) or (len(record.FILTER) == 0)): if record.INFO['SVTYPE'] == "TRA": continue carrier = set() dupBounds = set() delBounds = set() for call in record.samples: if (call.sample in sampleToBam.keys()) and (call.called) and ( call.gt_type != 0): carrier.add(call.sample)
# inputs vcf_file = snakemake.input["vcf_file"] merged_vcf = snakemake.input["merged_vcf"] gbk_file = snakemake.input["gbk_file"] reference = snakemake.params["reference"] # rename reference if assembled genome if "_assembled_genome" in reference: reference = re.sub("_assembled_genome", "", reference) # output report_file = snakemake.output["html_file"] # parse vcf merged_vcf_records = [ i for i in vcf.Reader(codecs.open(merged_vcf, 'r', 'latin-1')) ] def parse_gbk(gbk_file): with open(gbk_file, "r") as f: record_dict = SeqIO.to_dict(SeqIO.parse(f, 'genbank')) return record_dict def get_neiboring_orf(position, feature_list): ''' Identify neiboring feature of variant position. Input: SeqRecord and position (integer) Output: List of two strings with the closest features located before and after the input position (either locus_tags/gene names).
def test_issue_16(self):
    """The first record of issue-16.vcf must have QUAL set to None."""
    reader = vcf.Reader(fh('issue-16.vcf'))
    # The builtin next() works on both Python 2 and Python 3 iterators.
    record = next(reader)
    assert record.QUAL is None
REG_START, REG_END = None, None print('CHROM = {}'.format(REG_CHROM)) print('START = {}'.format(REG_START)) print('END = {}'.format(REG_END)) # CHECK HERE if args.nonorm: NORM = False DEBUG = args.debug PRINT_ALLELES = args.alleles COUNTFILTERS = args.countfilters if args.filter: FILTER = True VCFFILE = args.vcf OUTFILE = args.out # If region is specified, just parse through that. vcf_reader = vcf.Reader(open(VCFFILE, "rb")) if args.region is not None: try: vcf_reads = vcf_reader.fetch(str(REG_CHROM), int(REG_START), int(REG_END)) except: vcf_reads = vcf_reader.fetch(str(REG_CHROM)) else: vcf_reads = vcf_reader if SAMPLES == []: SAMPLES = vcf_reads.samples SAMPLES = [item for item in SAMPLES if item in vcf_reads.samples] counters = { "numloci": 0, "minmaf": 0, "minsamples": 0,
def testOpenFilehandle(self):
    """A reader built from an open handle exposes the samples and the file name."""
    handle = fh('example-4.0.vcf')
    reader = vcf.Reader(handle)
    self.assertEqual(self.samples, reader.samples)
    base_name = os.path.basename(reader.filename)
    self.assertEqual('example-4.0.vcf', base_name)
def testOpenFilehandleGzipped(self):
    """A reader built from a binary handle on tb.vcf.gz exposes the samples."""
    gz_handle = fh('tb.vcf.gz', 'rb')
    reader = vcf.Reader(gz_handle)
    self.assertEqual(self.samples, reader.samples)
def setUp(self):
    """Open the VCF named by ``self.filename`` and keep the reader on the instance."""
    handle = fh(self.filename)
    self.reader = vcf.Reader(handle)
def testParse(self):
    """samtools.vcf has exactly one sample and eleven records."""
    reader = vcf.Reader(fh('samtools.vcf'))
    sample_count = len(reader.samples)
    self.assertEqual(sample_count, 1)

    record_count = 0
    for _record in reader:
        record_count += 1
    self.assertEqual(record_count, 11)
dest='vcfFile1', type=str, help='vcfFile1', nargs='?', default=None) parser.add_argument('-R', '--keepRefCalls', action="store_true", dest='keepRefCalls', help='do not remove calls in which only ref is called', default=False) args = parser.parse_args() vcfFile1 = open(args.vcfFile1, 'r') reader1 = vcf.Reader(vcfFile1) vcfoutF1 = replace(args.vcfFile1, '.vcf', '.MKSNGL.vcf') vcfoutF1 = replace(vcfoutF1, '.vcf.gz', '.vcf') print >> sys.stdout, "VCFOUT1", vcfoutF1 vcfoutF1 = open(vcfoutF1, 'w') _Filter = collections.namedtuple('Filter', ['id', 'desc']) reader1.filters['Singleton'] = _Filter(id='Singleton', desc='only one minor variant at locus') reader2 = copy.copy(reader1) vcfout1 = vcf.Writer(vcfoutF1, reader2) #makeCallData = vcf.model.make_calldata_tuple(("GT","ALTP","REFP","GP"))
def test_meta(self):
    """Both GATKCommandLine metadata entries are parsed without raising."""
    # expect no exceptions raised
    reader = vcf.Reader(fh('gatk_26_meta.vcf'))
    assert 'GATKCommandLine' in reader.metadata

    expected_options = (
        '"analysis_type=LeftAlignAndTrimVariants"',
        '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"',
    )
    for index, expected in enumerate(expected_options):
        entry = reader.metadata['GATKCommandLine'][index]
        self.assertEqual(entry['CommandLineOptions'], expected)
def simulate_error_sample(self,
                          alpha,
                          beta,
                          error_vcf_path,
                          tumor_bam_path,
                          normal_bam_path,
                          output_bam_path,
                          margin=700):
    """Write randomly sampled read groups around every record of an error VCF.

    For each record a (tumor, normal) proportion pair is drawn from
    ``Dirichlet(alpha, beta)``. Reads within ``margin`` bases of the record
    position are collected from both inputs; every read group holding at
    least two reads is written with probability equal to the proportion of
    its source. Written reads get their name prefixed with
    ``error_<running pair index>_``.

    Args:
        alpha, beta: Dirichlet concentration parameters.
        error_vcf_path: VCF whose samples carry RCT/RCN/ACT/ACN fields.
        tumor_bam_path, normal_bam_path: inputs handed to ``BamReader.prepare``.
        output_bam_path: destination alignment file.
        margin: half-width of the window searched around each position.
    """
    print("============= simulate_error_sample =============")  # debug
    tumor_reader = BamReader()
    normal_reader = BamReader()
    tumor_reads = ReadCollector()
    normal_reads = ReadCollector()
    num_output_pair = 0

    # Context managers guarantee that the VCF handle and the output alignment
    # file are closed even when a record fails half-way through.
    with open(error_vcf_path, 'r') as vcf_handle, \
            tumor_reader.prepare(tumor_bam_path), \
            normal_reader.prepare(normal_bam_path):
        vcf_reader = vcf.Reader(vcf_handle)
        # NOTE(review): per the pysam docs mode 'w' writes SAM text; use 'wb'
        # if a binary BAM is intended. Kept as-is to preserve the output.
        with pysam.AlignmentFile(output_bam_path, 'w',
                                 header=tumor_reader.bam.header) as output_bam:
            for record in vcf_reader:
                Chr = record.CHROM
                pos = record.POS
                p = list(np.random.dirichlet([alpha, beta], 1).flat)
                tumor_proportoin_here = p[0]
                normal_proportion_here = p[1]

                # debug: predicted counts from the first sample only
                rct, rcn, act, acn = 0, 0, 0, 0
                for sample in record.samples:
                    rct += sample["RCT"]
                    rcn += sample["RCN"]
                    act += sample["ACT"]
                    acn += sample["ACN"]
                    break
                ref_prediction = int(rct * tumor_proportoin_here +
                                     rcn * normal_proportion_here)
                alt_prediction = int(act * tumor_proportoin_here +
                                     acn * normal_proportion_here)
                print("{}:{}, (tumor proportion, normal proportion): {}"
                      ", (ref, alt) predicted: {}".format(
                          record.CHROM, record.POS,
                          (tumor_proportoin_here, normal_proportion_here),
                          (ref_prediction, alt_prediction)))  # debug

                tumor_reads.clear()
                normal_reads.clear()
                # presumably f_flag/F_flag are required/excluded SAM flag
                # masks; verify against BamReader.search
                for read in tumor_reader.search(Chr, pos - margin, pos + margin,
                                                f_flag=0, F_flag=2816):
                    tumor_reads.push(read)
                for read in normal_reader.search(Chr, pos - margin,
                                                 pos + margin,
                                                 f_flag=0, F_flag=2816):
                    normal_reads.push(read)

                # Tumor groups are processed before normal groups so the order
                # of random draws and of written reads is unchanged.
                sources = ((tumor_reads, tumor_proportoin_here),
                           (normal_reads, normal_proportion_here))
                for collector, keep_probability in sources:
                    for _group_id, read_group in collector:
                        reads = read_group[1]
                        # A random number is only drawn for groups with >= 2 reads.
                        if len(reads) >= 2 and \
                                keep_probability >= np.random.rand():
                            for read in reads:
                                read.query_name = ('error_' +
                                                   str(num_output_pair) + '_' +
                                                   read.query_name)
                                output_bam.write(read)
                            num_output_pair += 1
def simulate_sample(self,
                    proportion,
                    vcf_path,
                    tumor_bam_path,
                    normal_bam_path,
                    output_bam_path,
                    min_vaf=0.0,
                    max_vaf=1.0,
                    margin=700,
                    node_info_tag='NODE'):
    """Write randomly sampled read groups around every accepted VCF record.

    For each record the tumor proportion is the sum of ``proportion[node]``
    over the '/'-separated node ids stored in ``INFO[node_info_tag]``; the
    normal proportion is one minus that sum. Records whose variant allele
    frequency ``ACT / (RCT + ACT)`` lies outside ``[min_vaf, max_vaf]`` are
    skipped. For accepted records, reads within ``margin`` bases are collected
    from both inputs and every read group with at least two reads is written
    with probability equal to the proportion of its source. Written reads get
    their name prefixed with ``tumor_<running pair index>_``.

    Args:
        proportion: indexable by integer node id, giving that node's share.
        vcf_path: answer VCF; every record must have exactly one sample
            carrying RCT/RCN/ACT/ACN fields.
        tumor_bam_path, normal_bam_path: inputs handed to ``BamReader.prepare``.
        output_bam_path: destination alignment file.
        min_vaf, max_vaf: inclusive VAF range of records to simulate.
        margin: half-width of the window searched around each position.
        node_info_tag: INFO key holding the node ids.

    Raises:
        Exception: if a record does not have exactly one sample.
    """
    sys.stderr.write("min_vaf: " + str(min_vaf) + ", max_vaf: " +
                     str(max_vaf) + "\n")
    tumor_reader = BamReader()
    normal_reader = BamReader()
    tumor_reads = ReadCollector()
    normal_reads = ReadCollector()
    num_output_pair = 0

    # Context managers guarantee that the VCF handle and the output alignment
    # file are closed even when a record fails half-way through.
    with open(vcf_path, 'r') as vcf_handle, \
            tumor_reader.prepare(tumor_bam_path), \
            normal_reader.prepare(normal_bam_path):
        vcf_reader = vcf.Reader(vcf_handle)
        # NOTE(review): per the pysam docs mode 'w' writes SAM text; use 'wb'
        # if a binary BAM is intended. Kept as-is to preserve the output.
        with pysam.AlignmentFile(output_bam_path, 'w',
                                 header=tumor_reader.bam.header) as output_bam:
            for record in vcf_reader:
                Chr = record.CHROM
                pos = record.POS

                tumor_proportoin_here = 0.0
                normal_proportion_here = 1.0
                for node in map(int, re.split('/', record.INFO[node_info_tag])):
                    tumor_proportoin_here += proportion[node]
                    normal_proportion_here -= proportion[node]

                if len(record.samples) != 1:
                    raise Exception(
                        "Unexpected number of samples in answer vcf.")
                sample = record.samples[0]
                rct = int(sample["RCT"])
                act = int(sample["ACT"])
                depth = rct + act
                # A site without any tumor reads has no defined VAF; fall back
                # to 0.0 instead of dividing by zero.
                vaf = (1.0 * act) / (depth * 1.0) if depth else 0.0
                in_range = min_vaf <= vaf <= max_vaf

                # debug: the same report is emitted for kept and skipped records
                rcn = int(sample["RCN"])
                acn = int(sample["ACN"])
                ref_prediction = int(rct * tumor_proportoin_here +
                                     rcn * normal_proportion_here)
                alt_prediction = int(act * tumor_proportoin_here +
                                     acn * normal_proportion_here)
                sys.stderr.write(
                    ("passed: " if in_range else "filtered: ") +
                    str(record.CHROM) + ":" + str(record.POS) +
                    ", (tumor proportion, normal proportion): " +
                    str((tumor_proportoin_here, normal_proportion_here)) +
                    ", (ref, alt) predicted: " +
                    str((ref_prediction, alt_prediction)) + "\n")
                if not in_range:
                    continue

                tumor_reads.clear()
                normal_reads.clear()
                # presumably f_flag/F_flag are required/excluded SAM flag
                # masks; verify against BamReader.search
                for read in tumor_reader.search(Chr, pos - margin, pos + margin,
                                                f_flag=0, F_flag=2816):
                    tumor_reads.push(read)
                for read in normal_reader.search(Chr, pos - margin,
                                                 pos + margin,
                                                 f_flag=0, F_flag=2816):
                    normal_reads.push(read)

                # Tumor groups are processed before normal groups so the order
                # of random draws and of written reads is unchanged. Both
                # sources use the 'tumor_' name prefix, as in the original.
                sources = ((tumor_reads, tumor_proportoin_here),
                           (normal_reads, normal_proportion_here))
                for collector, keep_probability in sources:
                    for _group_id, read_group in collector:
                        reads = read_group[1]
                        # A random number is only drawn for groups with >= 2 reads.
                        if len(reads) >= 2 and \
                                keep_probability >= np.random.rand():
                            for read in reads:
                                read.query_name = ('tumor_' +
                                                   str(num_output_pair) + '_' +
                                                   read.query_name)
                                output_bam.write(read)
                            num_output_pair += 1
'ALAA20-3_DNA366', 'BELA18-1_DNA57', 'BELA18-3_DNA58', 'BELA18-4_DNA59', 'BELC18-1_DNA127', 'BELC18-2_DNA128', 'BELC18-4_DNA129' ] filelist = glob.glob("test_data/*.filter.vcf") print(filelist) data = {} destf = open('db_alt.json', 'w') tracking = open('tracking.txt', 'w') for individual in individuals: curr_individual = {} print("reading " + individual) vcf_reader = vcf.Reader( open('test_data/' + individual + '.filter.vcf', 'r')) total_count = 0 num_duplicates = 0 for record in vcf_reader: annotations = record.INFO['ANN'] for ann in annotations: fields = ann.split('|') duplicate = False # According to SnpEff docs, fields are (1-indexed): # 1. allele # 2. effect # 4. gene name # 5. gene ID
def test_dunder_eq(self):
    """Comparing a call with None, in either operand order, is False."""
    reader = vcf.Reader(fh('example-4.0.vcf'))
    first_record = reader.next()
    call = first_record.samples[0]
    # "== None" is intentional here: the equality operator itself is under test.
    self.assertFalse(call == None)
    self.assertFalse(None == call)
def testParse(self):
    """bcftools.vcf has one sample and ``phased`` is readable on every call."""
    reader = vcf.Reader(fh('bcftools.vcf'))
    self.assertEqual(len(reader.samples), 1)
    for record in reader:
        for call in record.samples:
            # Only the attribute access matters; it must not raise.
            call.phased
def setUp(self):
    """Create a reader on tb.vcf.gz and note whether ``vcf.parser.pysam`` is set."""
    gz_handle = fh('tb.vcf.gz', 'rb')
    self.reader = vcf.Reader(gz_handle)
    self.run = vcf.parser.pysam is not None
def testParse(self):
    """Every record of gonl.chr20.release4.gtc.vcf can be iterated without error."""
    handle = fh('gonl.chr20.release4.gtc.vcf')
    for _record in vcf.Reader(handle):
        continue
def testOpenFilename(self):
    """A reader built from a file name exposes the expected samples."""
    path = self.fp('example-4.0.vcf')
    reader = vcf.Reader(filename=path)
    self.assertEqual(self.samples, reader.samples)
def test_contig_line(self):
    """The contig header entry for '1' reports a length of 249250621."""
    reader = vcf.Reader(fh('gonl.chr20.release4.gtc.vcf'))
    contig = reader.contigs['1']
    self.assertEqual(contig.length, 249250621)
def testOpenFilenameGzipped(self):
    """A reader built from the tb.vcf.gz file name exposes the expected samples."""
    path = self.fp('tb.vcf.gz')
    reader = vcf.Reader(filename=path)
    self.assertEqual(self.samples, reader.samples)
def test_samples(self):
    """With strict_whitespace=True the reader still reports the expected samples."""
    handle = fh(self.filename)
    self.reader = vcf.Reader(handle, strict_whitespace=True)
    self.assertEqual(self.reader.samples, self.samples)
ann.append('1/1') # Rank / total ann.append('') # HGVS.c ann.append('') # HGVS.p ann.append('') # cDNA_position ann.append('') # CDS_position ann.append('') # Protein_position ann.append(dist) # Distance to feature ann.append('') # Errors, Warnings or Information messages anns.append ('|'.join(ann)) INFO['ANN'] = anns record = vcf.model._Record(CHROM, POS, ID, REF, alts, QUAL, FILTER, INFO, FORMAT, snames) record.samples = reader._parse_samples (samples, FORMAT, record) return record reader = vcf.Reader(filename="{{i.infile}}") reader.infos["ANN"] = vcf.parser._Info("ANN", 1, "String", "Annotation by ANNOVAR", "", "") snames = {v:k for k,v in enumerate(reader.samples)} writer = vcf.Writer(open(outfile, 'w'), reader) f2conv = "{{o.outfile | prefix}}.variant_function" lastvid= '' lastr = [] with open (f2conv) as f: for line in f: line = line.strip("\r\n") if not line: continue parts = line.split("\t") varid = parts[2] + '|' + parts[3] + '|' + parts[12] + '|' + parts[5] if lastvid != varid and lastvid:
def test_num_calls(self):
    """For every record the four call-type counts add up to the sample count."""
    reader = vcf.Reader(fh('example-4.0.vcf'))
    for record in reader:
        counted = sum((record.num_hom_ref, record.num_hom_alt,
                       record.num_het, record.num_unknown))
        self.assertEqual(len(record.samples), counted)
writer = csv.writer(csv_file, delimiter="\t") for key, value in used_dict.items(): writer.writerow([key, value]) for key, value in file_dict.items(): writer.writerow([key, value]) ########Single_SNP_probability and Multi_SNP_probability (1 hour)############ pos = [] snp1 = [] snp2 = [] allcombination = [] allpos = [] for file in file_list: pos = [] with open(file) as vcffile: vcfReader = vcf.Reader(vcffile) for record in vcfReader: pos.append(record.POS) allpos.append(record.POS) i = 0 Single_SNP_prob = {} Single_SNP_proba = {} Single_SNP_probs = [] Single_SNP_probp = [] counts = Counter(allpos) Single_SNP_proba = dict(counts) for element in Single_SNP_proba: if (float(Single_SNP_proba[element]) / len(file_list)) >= 0.01: Single_SNP_prob[str(element)] = (float(Single_SNP_proba[element]) / len(file_list))
def test_dunder_eq(self):
    """Comparing a record with None, in either operand order, is False."""
    reader = vcf.Reader(fh('example-4.0.vcf'))
    record = reader.next()
    # "== None" is intentional here: the equality operator itself is under test.
    self.assertFalse(record == None)
    self.assertFalse(None == record)
def main():
    """Recode the genotypes of a VCF relative to two founder lines and write a CSV.

    Command-line driven. The founders table is read for the requested
    chromosome; every VCF record whose position is present in that table and
    whose REF/ALT agree with it is recoded per sample to '0', '1', '2' or
    'NA'. The coding direction depends on which parent is '0/0' and which is
    '1/1'; positions with any other parental combination are skipped with a
    warning. The result is written to ``<prefix>.<chrom>.genos.csv`` with one
    row per sample and one column per retained position.

    :return: None. Exits via ``sys.exit(message)`` when the founders header is
        invalid, when REF/ALT alleles disagree, or when the output table fails
        its integrity check.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-i', '--inVCF', required=True,
        help='The input VCF file name including full/relative path')
    parser.add_argument(
        '-f', '--founders', required=True,
        help='The parental genotypes in TSV format [CHROM, POS, REF, ALT, GT1, GT2]')
    parser.add_argument('-c', '--chrom', required=True,
                        help='specify which chromosome the VCF file is from')
    parser.add_argument('-p', '--prefix', default='out',
                        help='the prefix for the output files')
    parser.add_argument('-o', '--outputDir', default='',
                        help='the name of the output directory')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='print more information')
    args = parser.parse_args()

    FORMAT = "[%(filename)s:%(lineno)4s - %(funcName)20s() ] %(levelname)10s - %(message)s"
    log_level = logging.INFO if args.verbose else logging.WARNING
    logging.basicConfig(level=log_level, format=FORMAT)

    with openIOFile(args.founders) as parentalGenosInput:
        parentalGenosHeader = parentalGenosInput.readline().strip().split()
        logging.info(f'header: {parentalGenosHeader}')
        # Validation must not rely on ``assert`` (stripped under ``python -O``).
        # An AssertionError raised inside the helper is still treated as a
        # failed check, exactly as before.
        try:
            header_ok = verifyParentalGenosFileStructure(parentalGenosHeader)
        except AssertionError:
            header_ok = False
        if not header_ok:
            message = f'the header of the parental genotypes file [{args.founders}] is missing. ' \
                      f'Add a header: CHROM, POS, REF, ALT, Parent1_Geno, Parent2_Geno'
            logging.critical(message)
            sys.exit(message)
        parents = parentalGenosHeader[4:6]
        logging.info(f'parents: {parents}')
        parentalGenosDict = extractParentGenosForGivenChrom(
            parentalGenosInput, args.chrom, parentalGenosHeader)
        logging.info("parental genotypes have been stored in memory")

    # Parent 1 homozygous reference: allele counts are kept as-is.
    genotypeTranslationDictProper = {
        '0/0': '0',
        '0/1': '1',
        '1/1': '2',
        './.': 'NA'
    }
    # Parent 1 homozygous alternate: allele counts are flipped.
    genotypeTranslationDictInv = {
        '0/0': '2',
        '0/1': '1',
        '1/1': '0',
        './.': 'NA'
    }

    logging.info("opening the input VCF file...")
    # The input stays open only while records are being read.
    with openIOFile(args.inVCF) as vcfFileInput:
        vcf_reader = vcf.Reader(vcfFileInput)
        samples = vcf_reader.samples
        nSamples = len(samples)
        logging.info(f"calculated number of samples: [{nSamples}]")

        # prepare the data structures for the output file
        outputHeader = ['sample']
        outputGenosDict = OrderedDict()
        outputGenosDict['positions'] = []
        for sample in samples:
            outputGenosDict[sample] = []

        logging.info("iterating through the VCF records")
        for i, record in enumerate(vcf_reader, start=1):
            if i % 10000 == 0:
                logging.info(f"processing record # {i}")

            # ensure that the current VCF position is in the parental genotypes table
            pos_key = str(record.POS)
            if pos_key not in parentalGenosDict:
                message = f'position {record.POS} was not found in the founding SNPs'
                logging.warning(message)
                continue
            logging.debug(f'found {record.POS}')
            founder = parentalGenosDict[pos_key]

            # check whether the REF allele matches between the VCF and the parental genos table
            if founder['REF'] != record.REF:
                message = f'REF alleles for position {record.POS} don\'t match in founding SNPs and STITCH VCF'
                logging.critical(message)
                sys.exit(message)

            # check whether the ALT allele matches between the VCF and the parental genos table
            # (operand order kept so the same __eq__/__ne__ fallback applies)
            if not (founder['ALT'] == record.ALT[0]):
                message = f'ALT alleles for position {record.POS} don\'t match in founding SNPs and STITCH VCF'
                logging.critical(message)
                sys.exit(message)

            # determine which translation dictionary will be used
            parental_pair = (founder[parents[0]], founder[parents[1]])
            if parental_pair == ('0/0', '1/1'):
                translateGeno = genotypeTranslationDictProper
            elif parental_pair == ('1/1', '0/0'):
                translateGeno = genotypeTranslationDictInv
            else:
                message = f'An unexpected genotype combination was encountered in the parents at position [{record.POS}]'
                logging.warning(message)
                warnings.warn(message, Warning)
                continue

            outputGenosDict['positions'].append(pos_key)
            # iterate through the samples in the VCF
            # re-code genos as 0, 1, or 2
            for sample in samples:
                trGeno = translateGeno[record.genotype(sample)['GT']]
                outputGenosDict[sample].append(trGeno)

    logging.debug(pprint.pformat(outputGenosDict))

    try:
        output_ok = verifyOutputGenoIntegrity(outputGenosDict, samples)
    except AssertionError:
        output_ok = False
    if not output_ok:
        message = 'the output genotype dictionary is not correct'
        logging.critical(message)
        sys.exit(message)

    outputFN = f'{args.prefix}.{args.chrom}.genos.csv'
    outputHeader += outputGenosDict['positions']
    # Closing the output explicitly guarantees the table is flushed to disk.
    with openIOFile(outputFN, args.outputDir, 'w') as outputFile:
        outputFile.write(','.join(map(str, outputHeader)) + '\n')
        for sample in samples:
            outputLine = [sample] + outputGenosDict[sample]
            outputFile.write(','.join(map(str, outputLine)) + '\n')
def test_pickle(self):
    """Every record survives a pickle round trip unchanged."""
    reader = vcf.Reader(fh('example-4.0.vcf'))
    for record in reader:
        payload = cPickle.dumps(record)
        restored = cPickle.loads(payload)
        self.assertEqual(restored, record)
def parse_vcf(vcf_file, gbk_file):
    '''
    Given a vcf input file and the gbk of the reference genome,
    return an html table of identified variants.

    One row is produced per record whose first sample has a genotype other
    than '.' or '0' and whose contig is present in the GenBank file. Each row
    holds the contig, its length, the position, REF/ALT with their depths,
    the affected feature, the neighbouring ORFs for intergenic variants and
    one status column per filter declared in the VCF header. When the file
    name contains 'assembled', two extra columns from
    check_reference_mapping_GT are appended.

    Input: path of the VCF file, path of the GenBank file
    Output: HTML table as a string, continuation lines indented by 10 spaces
    '''
    table_rows = []
    # self-mapping columns are only added for assembled-genome comparisons
    is_assembled = 'assembled' in vcf_file

    # the handle is closed as soon as all records have been read
    with codecs.open(vcf_file, 'r', 'latin-1') as vcf_handle:
        vcf_reader = vcf.Reader(vcf_handle)
        gbk_dico = parse_gbk(gbk_file)

        header = [
            "contig", "length", "position", "REF", "ALT", "location", "type",
            "ORF", "gene", "orf_before", "orf_after"
        ]
        header += ['%s' % (flt.id) for flt in vcf_reader.filters.values()]
        if is_assembled:
            header.append("InRef")
            header.append("Fail Others")

        for vcf_record in vcf_reader:
            try:
                contig = gbk_dico[vcf_record.CHROM]
            except KeyError:
                print("Missing contig", vcf_record.CHROM)
                continue

            variant_feature = search_mutated_feature(vcf_record, gbk_dico)
            if variant_feature["mut_location"] == 'Intergenic':
                orf_before, orf_after = get_neiboring_orf(
                    int(vcf_record.POS), contig.features)
            else:
                orf_before, orf_after = ['-', '-']
            contig_name = vcf_record.CHROM

            # skip positions with genotype identical to REF
            call = vcf_record.samples[0]
            if call['GT'] in ['.', '0']:
                continue
            position = vcf_record.POS

            # REF and ALT with respective depth in parenthesis
            ref = "%s (%s/%s)" % (vcf_record.REF, call['AD'][0], call['DP'])
            first_alt = vcf_record.ALT[0]
            if len(first_alt) == 1:
                alt = "%s (%s/%s)" % (first_alt, call['AD'][1], call['DP'])
            else:
                # long alleles are reported by their length only
                alt = "%sbp (%s/%s)" % (len(first_alt), call['AD'][1],
                                        call['DP'])

            # FILTER is None when the column is '.', so len() on it would
            # raise; treat that as "no filter failed".
            failed_filters = list(vcf_record.FILTER or [])
            # if any of the test failed, set PASS as failed
            if failed_filters:
                failed_filters.append('PASS')

            filter_status = []
            for filter_name in vcf_reader.filters:
                failed = filter_name in failed_filters
                if filter_name == 'PASS':
                    filter_status.append('NO' if failed else 'YES')
                else:
                    filter_status.append('-' if failed else '+')

            row = [
                contig_name,
                len(contig), position, ref, alt,
                variant_feature["mut_location"], variant_feature["mut_type"],
                variant_feature["orf_name"], variant_feature["gene"],
                orf_before, orf_after
            ]
            row += filter_status

            # if comparison to assembled genome, add data about self mapping
            # (IF A VARIANT IS ALSO IDENTIFIED IN THAT MAPPING, PROBABLY A FALSE POSITIVE)
            if is_assembled:
                GT, PASS = check_reference_mapping_GT(merged_vcf_records,
                                                      contig_name, position,
                                                      reference)
                row.append(GT)
                row.append(PASS)
            table_rows.append(row)

    df = pandas.DataFrame(table_rows, columns=header)

    # cell content is truncated unless the column width limit is lifted;
    # current pandas spells "no limit" as None, pandas < 1.0 only accepts -1
    try:
        pandas.set_option('display.max_colwidth', None)
    except ValueError:
        pandas.set_option('display.max_colwidth', -1)

    df_str = df.to_html(index=False,
                        bold_rows=False,
                        classes=["dataTable"],
                        table_id="snps_table",
                        escape=False,
                        border=0)

    return df_str.replace("\n", "\n" + 10 * " ")
def test_parser(self):
    """Basic tests for the parser.

    Loads the test VCF into an alignment group, runs
    parse_alignment_group_vcf, and checks the resulting Variant,
    VariantCallerCommonData, VariantEvidence and VariantAlternate rows.
    """
    VCF_DATATYPE = Dataset.TYPE.VCF_FREEBAYES
    genome = self.reference_genome

    alignment_group = AlignmentGroup.objects.create(
        label='test alignment', reference_genome=genome)
    copy_and_add_dataset_source(alignment_group, VCF_DATATYPE,
                                VCF_DATATYPE, TEST_GENOME_SNPS)

    Chromosome.objects.create(
        reference_genome=genome,
        label='Chromosome',
        num_bases=9001)

    # The experiment samples are created up front with the UIDs found in the
    # vcf header; in the real pipeline the vcf is generated from the samples
    # instead (see the add_groups() stage).
    with open(TEST_GENOME_SNPS) as fh:
        experiment_sample_uids = vcf.Reader(fh).samples
    num_experiment_samples = len(experiment_sample_uids)
    for sample_uid in experiment_sample_uids:
        ExperimentSample.objects.create(
            uid=sample_uid,
            project=self.project,
            label='fakename:' + sample_uid
        )

    # Reference count of records, taken directly from the vcf file.
    with open(TEST_GENOME_SNPS) as fh:
        record_count = sum(1 for _record in vcf.Reader(fh))

    # Parse the vcf
    parse_alignment_group_vcf(alignment_group, VCF_DATATYPE)

    variant_list = Variant.objects.filter(reference_genome=genome)

    # One Variant object is expected per record.
    self.assertEqual(record_count, len(variant_list))

    # Spot-check a few variants.
    at_376 = Variant.objects.filter(reference_genome=genome, position=376)
    self.assertEqual(1, len(at_376))

    v_453 = Variant.objects.get(reference_genome=genome, position=453)
    self.assertEqual(['G'], v_453.get_alternates())

    # Check false negatives.
    at_454 = Variant.objects.filter(reference_genome=genome, position=454)
    self.assertEqual(0, len(at_454))

    # One VariantCallerCommonData object is expected per record.
    common_data = VariantCallerCommonData.objects.filter(
        variant__reference_genome=genome)
    self.assertEqual(record_count, len(common_data))

    # One VariantEvidence object is expected per Variant x Sample.
    for variant in variant_list:
        vccd = variant.variantcallercommondata_set.all()[0]
        evidence = vccd.variantevidence_set.all()
        self.assertEqual(num_experiment_samples, len(evidence))

    # Check that alternate data is populated. The record under test is the
    # multi-allelic line of the test vcf that starts with:
    #Chromosome 1330 . CG C,GC,AG 126.036 . AB=0.5,0.5,1;ABP=3.0103,3.0103,7.35324;...
    v_1330 = Variant.objects.get(reference_genome=genome, position=1330)
    self.assertEqual(set(v_1330.get_alternates()), {'C', 'GC', 'AG'})

    v_1330_c = VariantAlternate.objects.get(variant=v_1330, alt_value='C')
    self.assertTrue(len(v_1330_c.variantevidence_set.all()))

    v_1330_gc = VariantAlternate.objects.get(variant=v_1330,
                                             alt_value='GC')
    self.assertTrue(len(v_1330_gc.variantevidence_set.all()))

    self.assertEqual(v_1330_c.data['INFO_ABP'],
                     v_1330_gc.data['INFO_ABP'])