def parse_jsons(jsonfile, stats, count_sv=False, count_all=False):
    '''
    Parse a VarSim JSON report and accumulate T/TP/FP counts per variant type.
    Adapted from Roger Liu's code.

    :param jsonfile: path to the JSON report (opened via utils.versatile_open,
        so it may be compressed)
    :param stats: dict of {variant_type: {metric: count}}; updated in place
    :param count_sv: accumulate only the SV counts (svSumCount)
    :param count_all: accumulate all counts (sum_count); takes precedence
        over count_sv
    :return: None (stats is mutated in place)
    '''
    # Nothing to accumulate for an empty stats dict (also avoids indexing
    # into an empty key list below).
    if not stats:
        return
    # Materialize the key views so indexing works on Python 3 as well
    # (dict.keys() returns a non-subscriptable view there).
    var_types = list(stats.keys())
    metrics = list(stats[var_types[0]].keys())
    with utils.versatile_open(jsonfile, 'r') as fh:
        data = json.load(fh)
        per_type = data['num_true_correct']['data']
        for vt in var_types:
            if vt not in per_type:
                continue
            for mt in metrics:
                try:
                    if count_all:
                        delta = per_type[vt]['sum_count'][mt]
                    elif count_sv:
                        delta = per_type[vt]['svSumCount'][mt]
                    else:
                        # non-SV counts = all counts minus SV counts.
                        # Computing delta before mutating stats avoids the
                        # partial update the original could make when
                        # sum_count had the metric but svSumCount did not.
                        delta = (per_type[vt]['sum_count'][mt]
                                 - per_type[vt]['svSumCount'][mt])
                except KeyError as err:
                    # Missing field: report it and leave the total untouched.
                    print("error in {}. No {} field".format(jsonfile, err))
                else:
                    stats[vt][mt] += delta
def parse_jsons(jsonfile, stats, count_sv=False, count_all=False):
    '''
    Parse a VarSim comparison JSON and add its T/TP/FP counts into stats.
    Adapted from Roger Liu's code.

    :param jsonfile: JSON report path, opened through utils.versatile_open
    :param stats: nested dict {variant_type: {metric: count}} updated in place
    :param count_sv: add only SV counts (svSumCount)
    :param count_all: add all counts (sum_count); wins over count_sv
    :return: None
    '''
    if not stats:
        # empty stats: nothing to do (also avoids indexing an empty key list)
        return
    # list() keeps this working on Python 3, where dict.keys() is a view
    # that cannot be indexed
    var_types = list(stats)
    metrics = list(stats[var_types[0]])
    with utils.versatile_open(jsonfile, 'r') as fh:
        data = json.load(fh)
        type_data = data['num_true_correct']['data']
        for vt in var_types:
            if vt not in type_data:
                continue
            for mt in metrics:
                try:
                    if count_all:
                        increment = type_data[vt]['sum_count'][mt]
                    elif count_sv:
                        increment = type_data[vt]['svSumCount'][mt]
                    else:
                        # non-SV = total minus SV; computed before mutating
                        # stats so a missing svSumCount cannot leave a
                        # half-applied update behind
                        increment = (type_data[vt]['sum_count'][mt]
                                     - type_data[vt]['svSumCount'][mt])
                except KeyError as err:
                    print("error in {}. No {} field".format(jsonfile, err))
                else:
                    stats[vt][mt] += increment
def convertCN(filenames, operation):
    """
    Convert between two-allele copy-number ('2/1') and single-number ('2')
    CN representations in VCF files, in place.

    two2one: a '2/1'-like CN becomes the max of its numbers (e.g. '2');
        0 will be considered same as 1.
    one2two: a single CN is expanded along the GT, where a '0' allele is
        given copy number '1'.

    :param filenames: iterable of VCF paths to rewrite in place
    :param operation: either "two2one" or "one2two"
    :raises ValueError: for any other operation
    :return: None
    """
    logger = logging.getLogger(convertCN.__name__)
    logger.info("convertCN started")
    if operation not in ("two2one", "one2two"):
        raise ValueError("Only two2one or one2two allowed")
    two2one = operation == "two2one"
    delimiter = re.compile('[/|]')
    for name in filenames:
        logger.info("processing {}".format(name))
        with versatile_open(name, 'r') as file_fd:
            # 'w+' replaces the original invalid mode 'r+w', which Python 3's
            # io layer rejects ("must have exactly one of read/write/append")
            output = tempfile.NamedTemporaryFile(mode='w+', delete=False)
            for l in file_fd:
                l = l.rstrip()
                fields = l.split("\t")
                # assumes data lines have >= 9 columns (FORMAT at index 8)
                # -- malformed short lines would raise IndexError here
                if l.startswith("#") or 'CN' not in fields[8]:
                    # header or CN-less record: pass through, but keep the
                    # ##FORMAT CN Type declaration in sync with the new format
                    if l.startswith('##FORMAT=<ID=CN'):
                        if two2one:
                            l = l.replace("Type=String", "Type=Integer")
                        else:
                            l = l.replace("Type=Integer", "Type=String")
                    output.write(l + "\n")
                else:
                    info = fields[8].split(':')
                    cnIndex = info.index('CN')
                    gtIndex = info.index('GT')
                    #change CN field in all samples
                    for sampleIndex in range(9, len(fields)):
                        sampleInfo = fields[sampleIndex].split(':')
                        if two2one:
                            #here cn is a list of strings like ['2', '1']
                            cn = delimiter.split(sampleInfo[cnIndex])
                            sampleInfo[cnIndex] = str(max(map(int, cn)))
                        elif len(delimiter.split(sampleInfo[cnIndex])) == 1:
                            #only split when there is only one number
                            gt = delimiter.split(sampleInfo[gtIndex])
                            cn = sampleInfo[cnIndex]
                            for i in range(len(gt)):
                                gt[i] = '1' if gt[i] == '0' else cn
                            # preserve the GT separator style ('/' vs '|')
                            if sampleInfo[gtIndex].find('/') >= 0:
                                sampleInfo[cnIndex] = '/'.join(gt)
                            else:
                                sampleInfo[cnIndex] = '|'.join(gt)
                        fields[sampleIndex] = ":".join(sampleInfo)
                    output.write("\t".join(fields) + "\n")
            output.close()
            # overwrite the input with the rewritten copy, then drop the temp
            shutil.copyfile(output.name, name)
            os.remove(output.name)
    logger.info("convertCN done")
    return
def convertCN(filenames, operation):
    """
    Rewrite the CN (copy number) FORMAT field of the given VCF files in place.

    "two2one" collapses a '2/1'-style CN to the max number ('2'); "one2two"
    expands a single CN along the GT alleles, mapping a '0' allele to CN '1'.

    :param filenames: VCF paths, each rewritten in place
    :param operation: "two2one" or "one2two"
    :raises ValueError: on any other operation string
    :return: None
    """
    logger = logging.getLogger(convertCN.__name__)
    logger.info("convertCN started")
    if operation not in ("two2one", "one2two"):
        raise ValueError("Only two2one or one2two allowed")
    two2one = operation == "two2one"
    allele_sep = re.compile('[/|]')
    for name in filenames:
        logger.info("processing {}".format(name))
        with versatile_open(name, 'r') as file_fd:
            # fix: the original mode 'r+w' is invalid and rejected by
            # Python 3 ("must have exactly one of read/write/append mode")
            output = tempfile.NamedTemporaryFile(mode='w+', delete=False)
            for line in file_fd:
                line = line.rstrip()
                fields = line.split("\t")
                if line.startswith("#") or 'CN' not in fields[8]:
                    # pass headers / CN-less records through, updating the
                    # ##FORMAT Type declaration to match the new CN encoding
                    if line.startswith('##FORMAT=<ID=CN'):
                        if two2one:
                            line = line.replace("Type=String", "Type=Integer")
                        else:
                            line = line.replace("Type=Integer", "Type=String")
                    output.write(line + "\n")
                else:
                    format_keys = fields[8].split(':')
                    cnIndex = format_keys.index('CN')
                    gtIndex = format_keys.index('GT')
                    #change CN field in all samples
                    for sampleIndex in range(9, len(fields)):
                        sampleInfo = fields[sampleIndex].split(':')
                        if two2one:
                            # cn is a list of number strings, e.g. ['2', '1']
                            cn = allele_sep.split(sampleInfo[cnIndex])
                            sampleInfo[cnIndex] = str(max(map(int, cn)))
                        elif len(allele_sep.split(sampleInfo[cnIndex])) == 1:
                            #only split when there is only one number
                            gt = allele_sep.split(sampleInfo[gtIndex])
                            cn = sampleInfo[cnIndex]
                            for i in range(len(gt)):
                                gt[i] = '1' if gt[i] == '0' else cn
                            # keep the same phasing separator as GT uses
                            if sampleInfo[gtIndex].find('/') >= 0:
                                sampleInfo[cnIndex] = '/'.join(gt)
                            else:
                                sampleInfo[cnIndex] = '|'.join(gt)
                        fields[sampleIndex] = ":".join(sampleInfo)
                    output.write("\t".join(fields) + "\n")
            output.close()
            shutil.copyfile(output.name, name)
            os.remove(output.name)
    logger.info("convertCN done")
    return
def run(self):
    '''
    Compare self.vcfs[0] against self.true_vcf with RTG vcfeval and store
    the tp/tp_predict/fn/fp output paths on the instance.

    :raises ValueError: if more than one prediction VCF was given
    :raises Exception: if an expected vcfeval output file is missing
    :return: None
    '''
    #command example
    #rtg-tools-3.8.4-bdba5ea_install/rtg vcfeval --baseline truth.vcf.gz \
    #--calls compare1.vcf.gz -o vcfeval_split_snp -t ref.sdf --output-mode=annotate --sample xx --squash-ploidy --regions ?? \
    if len(self.vcfs) != 1:
        raise ValueError('vcfeval only takes 1 prediction VCF and 1 truth VCF: {0}'.format(self.vcfs))
    cmd = ['java', utils.JAVA_XMX, '-jar', utils.RTGJAR, 'vcfeval',
           '-o', self.prefix,
           '--baseline', self.true_vcf,
           '-t', self.reference]
    # inverted flags: vcfeval options are added when the corresponding
    # behavior is NOT requested
    if not self.exclude_filtered:
        cmd.append('--all-records')
    if not self.match_geno:
        cmd.append('--squash-ploidy')
    if self.sample:
        cmd += ['--sample', self.sample]
    if self.regions:
        cmd += ['--bed-regions', self.regions]
    if self.opts:
        cmd.append(self.opts)
    cmd += ['--calls', self.vcfs[0]]
    tp, tp_predict, fn, fp = [os.path.join(self.prefix, base)
                              for base in ('tp-baseline.vcf.gz', 'tp.vcf.gz',
                                           'fn.vcf.gz', 'fp.vcf.gz')]
    if utils.count_variants(self.true_vcf) == 0 and utils.count_variants(self.vcfs[0]) == 0:
        # both truth and prediction are empty: fabricate the outputs
        # instead of invoking vcfeval
        utils.makedirs([self.prefix])
        for source, destination in ((self.true_vcf, tp),
                                    (self.true_vcf, fn),
                                    (self.vcfs[0], tp_predict),
                                    (self.vcfs[0], fp)):
            shutil.copyfile(source, destination)
    else:
        if self.log_to_file:
            with utils.versatile_open(self.log_to_file, 'a') as logout:
                utils.run_shell_command(cmd, sys.stderr, logout)
        else:
            utils.run_shell_command(cmd, sys.stderr, sys.stderr)
    for expected in (tp, tp_predict, fn, fp):
        if not os.path.exists(expected):
            raise Exception('{0} was not generated by vcfeval. Please check and rerun.'.format(expected))
    self.tp, self.tp_predict, self.fn, self.fp = tp, tp_predict, fn, fp
def run(self):
    '''
    Execute VarSim vcfcompare and record the TP/FN/FP output paths.

    Builds the java command line from this comparator's configuration,
    streams tool output to self.log_to_file (when set) or stderr, and then
    verifies that the expected <prefix>_{TP,FN,FP}.vcf files exist.

    :raises Exception: if any expected output file is missing afterwards
    :return: None
    '''
    cmd = [self.java, utils.JAVA_XMX, '-jar', utils.VARSIMJAR, 'vcfcompare',
           '-prefix', self.prefix,
           '-true_vcf', self.true_vcf,
           '-reference', self.reference]
    if self.exclude_filtered:
        cmd.append('-exclude_filtered')
    if self.match_geno:
        cmd.append('-match_geno')
    if self.sample:
        cmd += ['-sample', self.sample]
    if self.regions:
        cmd += ['-bed', self.regions]
    if self.disallow_partial_fp:
        cmd.append('-disallow_partial_fp')
    # NOTE(review): str() of most values (including 0 and None) is truthy,
    # so this only skips an empty-string sv_length -- confirm intent
    if str(self.sv_length):
        # flag and value go in as one list element; presumably
        # utils.run_shell_command joins/re-splits the command -- confirm
        cmd.append('-sv_length {}'.format(self.sv_length))
    if self.opts:
        cmd.append(self.opts)
    cmd += list(self.vcfs)
    if self.log_to_file:
        with utils.versatile_open(self.log_to_file, 'a') as logout:
            utils.run_shell_command(cmd, sys.stdout, logout)
    else:
        utils.run_shell_command(cmd, sys.stdout, sys.stderr)
    expected = [self.prefix + suffix
                for suffix in ('_TP.vcf', '_FN.vcf', '_FP.vcf')]
    for result in expected:
        if not os.path.exists(result):
            raise Exception('{0} was not generated by VarSim vcfcompare. Please check and rerun.'.format(result))
    self.tp, self.fn, self.fp = expected
def generate_sdf(reference, log):
    '''
    Create an RTG SDF directory for the given reference FASTA.

    The SDF is written next to the reference as <reference>.sdf. If that
    path already exists, the conversion is skipped.

    :param reference: path to the reference FASTA
    :param log: optional file to append the rtg format output to
    :return: path of the SDF directory
    '''
    sdf = reference + '.sdf'
    if os.path.exists(sdf):
        LOGGER.info('{0} exists, doing nothing'.format(sdf))
        LOGGER.info('to rerun SDF generation, please remove or rename {0}'.format(sdf))
        return sdf
    format_cmd = ['java', utils.JAVA_XMX, '-jar', utils.RTGJAR,
                  'format', '-o', sdf, reference]
    if log:
        with utils.versatile_open(log, 'a') as logout:
            utils.run_shell_command(format_cmd, logout, logout)
    else:
        utils.run_shell_command(format_cmd, sys.stdout, sys.stderr)
    return sdf
def generate_sdf(reference, log):
    '''
    Build (or reuse) the RTG SDF representation of a reference FASTA.

    :param reference: reference FASTA path; the SDF goes to <reference>.sdf
    :param log: when set, rtg output is appended to this file, otherwise it
        goes to stdout/stderr
    :return: the SDF directory path
    '''
    sdf = '{0}.sdf'.format(reference)
    if not os.path.exists(sdf):
        cmd = ['java', utils.JAVA_XMX, '-jar', utils.RTGJAR,
               'format', '-o', sdf, reference]
        if log:
            with utils.versatile_open(log, 'a') as logout:
                utils.run_shell_command(cmd, logout, logout)
        else:
            utils.run_shell_command(cmd, sys.stdout, sys.stderr)
        return sdf
    # already formatted: reuse the existing SDF
    LOGGER.info('{0} exists, doing nothing'.format(sdf))
    LOGGER.info('to rerun SDF generation, please remove or rename {0}'.format(sdf))
    return sdf
def run(self):
    '''
    Run VarSim vcfcompare on self.vcfs against self.true_vcf and record
    the resulting TP/FN/FP VCF paths on the instance.

    :raises Exception: if an expected output file was not produced
    :return: None
    '''
    cmd = ['java', utils.JAVA_XMX, '-jar', utils.VARSIMJAR, 'vcfcompare',
           '-prefix', self.prefix,
           '-true_vcf', self.true_vcf,
           '-reference', self.reference]
    # simple on/off switches
    for flag, enabled in (('-exclude_filtered', self.exclude_filtered),
                          ('-match_geno', self.match_geno)):
        if enabled:
            cmd.append(flag)
    if self.sample:
        cmd += ['-sample', self.sample]
    if self.regions:
        cmd += ['-bed', self.regions]
    if self.disallow_partial_fp:
        cmd.append('-disallow_partial_fp')
    if self.opts:
        cmd.append(self.opts)
    cmd += list(self.vcfs)
    if self.log_to_file:
        with utils.versatile_open(self.log_to_file, 'a') as logout:
            utils.run_shell_command(cmd, sys.stdout, logout)
    else:
        utils.run_shell_command(cmd, sys.stdout, sys.stderr)
    tp, fn, fp = (self.prefix + suffix
                  for suffix in ('_TP.vcf', '_FN.vcf', '_FP.vcf'))
    for result in (tp, fn, fp):
        if not os.path.exists(result):
            raise Exception('{0} was not generated by VarSim vcfcompare. Please check and rerun.'.format(result))
    self.tp, self.fn, self.fp = tp, fn, fp
def run(self):
    '''
    Run RTG vcfeval for the configured truth/prediction pair and record
    the locations of the tp-baseline/tp/fn/fp result files on self.

    :raises ValueError: when more than one prediction VCF was supplied
    :raises Exception: when an expected result file is missing afterwards
    :return: None
    '''
    #command example
    #rtg-tools-3.8.4-bdba5ea_install/rtg vcfeval --baseline truth.vcf.gz \
    #--calls compare1.vcf.gz -o vcfeval_split_snp -t ref.sdf --output-mode=annotate --sample xx --squash-ploidy --regions ?? \
    base_cmd = ['java', utils.JAVA_XMX, '-jar', utils.RTGJAR, 'vcfeval',
                '-o', self.prefix, '--baseline', self.true_vcf,
                '-t', self.reference]
    extras = []
    if not self.exclude_filtered:
        extras.append('--all-records')
    if not self.match_geno:
        extras.append('--squash-ploidy')
    if self.sample:
        extras.extend(['--sample', self.sample])
    if self.regions:
        extras.extend(['--bed-regions', self.regions])
    if self.opts:
        extras.append(self.opts)
    cmd = base_cmd + extras
    if len(self.vcfs) != 1:
        raise ValueError('vcfeval only takes 1 prediction VCF and 1 truth VCF: {0}'.format(self.vcfs))
    cmd.extend(['--calls', self.vcfs[0]])
    tp = os.path.join(self.prefix, 'tp-baseline.vcf.gz')
    tp_predict = os.path.join(self.prefix, 'tp.vcf.gz')
    fn = os.path.join(self.prefix, 'fn.vcf.gz')
    fp = os.path.join(self.prefix, 'fp.vcf.gz')
    truth_is_empty = utils.count_variants(self.true_vcf) == 0
    if truth_is_empty and utils.count_variants(self.vcfs[0]) == 0:
        #both truth and prediction are empty, fabricate empty outputs
        utils.makedirs([self.prefix])
        shutil.copyfile(self.true_vcf, tp)
        shutil.copyfile(self.true_vcf, fn)
        shutil.copyfile(self.vcfs[0], tp_predict)
        shutil.copyfile(self.vcfs[0], fp)
    elif self.log_to_file:
        with utils.versatile_open(self.log_to_file, 'a') as logout:
            utils.run_shell_command(cmd, sys.stderr, logout)
    else:
        utils.run_shell_command(cmd, sys.stderr, sys.stderr)
    missing = [path for path in (tp, tp_predict, fn, fp)
               if not os.path.exists(path)]
    if missing:
        raise Exception('{0} was not generated by vcfeval. Please check and rerun.'.format(missing[0]))
    self.tp, self.tp_predict, self.fn, self.fp = tp, tp_predict, fn, fp
def match_false(augmented_file, files_to_pair_with, out_dir, sample, log_to_file, vcfeval_options, sdf, java="java"):
    """Pair up each false call in augmented_file with variants from other VCFs.

    Tries to pair every false call in augmented_file with a variant in each
    of the files in files_to_pair_with, producing an annotated copy of
    augmented_file. By default the 1st pairing file is used to derive an AF,
    the 2nd to determine the simulated variant (for false positives) and the
    3rd to determine whether a false positive is a pure false positive (not
    simulated) or not (wrong genotype).

    :param augmented_file: VCF of false calls to annotate
    :param files_to_pair_with: list of up to 3 VCF paths (entries may be falsy)
    :param out_dir: directory for temporary and output files
    :param sample: sample name passed through to vcfeval
    :param log_to_file: optional log file for vcfeval output
    :param vcfeval_options: extra options for vcfeval
    :param sdf: RTG SDF reference directory
    :param java: java executable to use
    :return: None; writes <augmented_file stem>_annotated.vcf under out_dir
    """
    files_to_pair_with_clean = [utils.make_clean_vcf(item, out_dir)
                                for item in files_to_pair_with]
    content = []
    annotated_content = []
    with utils.versatile_open(augmented_file, "rt") as augmented_file_handle:
        for line in augmented_file_handle:
            line_strip = line.strip()
            if not line_strip:
                # robustness: a blank line used to crash on line_strip[0]
                continue
            line_split = line_strip.split()
            if line_strip[0] == "#":
                annotated_content.append(line_strip)
                content.append(line_strip)
                continue
            # content holds all header lines plus exactly one variant line
            if content and content[-1][0] != "#":
                del content[-1]
            content.append(line_strip)
            single_var_file = utils.write_vcf(
                content, os.path.join(out_dir, "single.vcf"))
            single_var_file = utils.sort_and_compress(single_var_file)
            info = ''
            for i, item in enumerate(files_to_pair_with_clean):
                equivalent_variant = None
                if item:
                    vcfeval_prefix = os.path.join(
                        out_dir, 'vcfeval_compare_results_annotate')
                    vcfeval_comparator = RTGVCFComparator(
                        prefix=vcfeval_prefix,
                        true_vcf=item,
                        reference=sdf,
                        regions=None,
                        sample=sample,
                        vcfs=[single_var_file],
                        exclude_filtered=False,
                        match_geno=False,
                        log_to_file=log_to_file,
                        opts=vcfeval_options,
                        java=java)
                    equivalent_variant = utils.get_equivalent_variant(
                        line_split, vcfeval_comparator.get_tp())
                    # clean up vcfeval's scratch directory for the next round
                    if os.path.exists(vcfeval_prefix):
                        LOGGER.warning('{0} exists, removing ...'.format(
                            vcfeval_prefix))
                        shutil.rmtree(vcfeval_prefix)
                if i == 0:
                    # AF = AO / (AO + RO) from the last (sample) column;
                    # assumes RO at FORMAT index 2 and AO at index 4
                    # (freebayes-style) -- TODO confirm against callers
                    if equivalent_variant:
                        try:
                            sample_fields = equivalent_variant[-1].split(':')
                            AO = int(sample_fields[4].split(',')[0])
                            RO = int(sample_fields[2].split(',')[0])
                        except (IndexError, ValueError):
                            # was a bare except; only parse errors are expected
                            info = "N/A;"
                        else:
                            # guard AO + RO == 0, which previously raised an
                            # uncaught ZeroDivisionError
                            if AO + RO == 0:
                                info = "N/A;"
                            else:
                                info = str(float(AO) / (AO + RO)) + ';'
                    else:
                        info = "N/A;"
                elif i == 1:
                    # describe the paired variant as chrom_pos_ref_alt_<last column>
                    if equivalent_variant:
                        info += '_'.join([equivalent_variant[0],
                                          equivalent_variant[1],
                                          equivalent_variant[3],
                                          equivalent_variant[4],
                                          equivalent_variant[-1]]) + ';'
                    else:
                        info += "N/A;"
                elif i == 2:
                    info += "pure;" if not equivalent_variant else "not;"
            line_split[6] = info
            annotated_content.append('\t'.join(line_split))
            # clean up the per-variant temporary VCF
            if os.path.isfile(single_var_file):
                os.remove(single_var_file)
                os.remove(single_var_file + ".tbi")
    annotated_file = utils.write_vcf(
        annotated_content,
        os.path.join(
            out_dir,  # fix: the original referenced the undefined global args.out_dir
            "{}_annotated.vcf".format(
                os.path.splitext(
                    os.path.splitext(
                        os.path.basename(augmented_file))[0])[0])))
    annotated_file = utils.sort_and_compress(annotated_file)
    # clean up the cleaned copies of the pairing files
    for item in files_to_pair_with_clean:
        if item and os.path.isfile(item):
            os.remove(item)
            os.remove(item + ".tbi")
def run(self):
    '''
    Run RTG vcfeval, comparing the single prediction VCF (self.vcfs[0])
    against the truth VCF (self.true_vcf).

    Sets self.tp, self.tp_predict, self.fn and self.fp to the result file
    paths under self.prefix. When the truth VCF contains no variants
    (vcfeval refuses to run then), the result files are fabricated by
    copying inputs/headers instead of invoking vcfeval.

    :raises ValueError: if more than one prediction VCF was supplied
    :raises Exception: if an expected result file is missing afterwards
    :return: None
    '''
    #command example
    #rtg-tools-3.8.4-bdba5ea_install/rtg vcfeval --baseline truth.vcf.gz \
    #--calls compare1.vcf.gz -o vcfeval_split_snp -t ref.sdf --output-mode=annotate --sample xx --squash-ploidy --regions ?? \
    cmd = [
        self.java, utils.JAVA_XMX, '-jar', utils.RTGJAR, 'vcfeval', '-o',
        self.prefix, '--baseline', self.true_vcf, '-t', self.reference,
    ]
    # note the inverted flags: --all-records / --squash-ploidy are added
    # when filtering / genotype matching are NOT requested
    if not self.exclude_filtered:
        cmd.append('--all-records')
    if not self.match_geno:
        cmd.append('--squash-ploidy')
    if self.sample:
        cmd.append('--sample')
        cmd.append(self.sample)
    if self.regions:
        cmd.append('--bed-regions')
        cmd.append(self.regions)
    if self.opts:
        # self.opts goes in as a single list element; presumably
        # utils.run_shell_command joins/re-splits the command -- confirm
        cmd.append(self.opts)
    if len(self.vcfs) != 1:
        raise ValueError(
            'vcfeval only takes 1 prediction VCF and 1 truth VCF: {0}'.format(self.vcfs))
    cmd.append('--calls')
    cmd.append(self.vcfs[0])
    # expected vcfeval output files inside the self.prefix directory
    tp = os.path.join(self.prefix, 'tp-baseline.vcf.gz')
    tp_predict = os.path.join(self.prefix, 'tp.vcf.gz')
    fn = os.path.join(self.prefix, 'fn.vcf.gz')
    fp = os.path.join(self.prefix, 'fp.vcf.gz')
    #vcfeval refuses to run if true_vcf contains 0 variants
    if utils.count_variants(self.true_vcf) == 0:
        utils.makedirs([self.prefix])
        #because there is 0 ground truth variants, TP and FN will be empty
        shutil.copyfile(self.true_vcf, tp)
        shutil.copyfile(self.true_vcf, fn)
        if utils.count_variants(self.vcfs[0]) == 0:
            #if calls are empty, then TP_PREDICT and FP will for sure be empty
            shutil.copyfile(self.vcfs[0], tp_predict)
            shutil.copyfile(self.vcfs[0], fp)
        else:
            #if calls are not empty, then all calls will be FP due to 0 ground truth, TP_PREDICT will be empty
            shutil.copyfile(self.vcfs[0], fp)
            # copy only the header of the calls VCF into TP_PREDICT
            # ('output'/'input' shadow builtins; kept as-is in this doc pass)
            with utils.versatile_open(tp_predict, "w") as output, utils.versatile_open(
                    self.vcfs[0], "r") as input:
                for i in input:
                    if i.startswith('#'):
                        output.write(i)
                    else:
                        # header lines come first; stop at the first record
                        break
    else:
        if self.log_to_file:
            with utils.versatile_open(self.log_to_file, 'a') as logout:
                utils.run_shell_command(cmd, sys.stderr, logout)
        else:
            utils.run_shell_command(cmd, sys.stderr, sys.stderr)
    for i in (tp, tp_predict, fn, fp):
        if not os.path.exists(i):
            raise Exception(
                '{0} was not generated by vcfeval. Please check and rerun.'.format(i))
    self.tp, self.tp_predict, self.fn, self.fp = tp, tp_predict, fn, fp
def _af_dp_annotation(variant):
    """Return the 'AF(s);DP;' annotation text for a paired variant (or for None)."""
    info_fields = {"AO": None, "RO": None, "DP": None, "AD": None}
    if variant:
        for key in info_fields:
            info_fields[key] = utils.get_info(variant, key)
    if info_fields["AD"]:
        # gatk4 format: AD = ref_count,alt_count[,alt_count...]
        ad_split = info_fields["AD"].split(',')
        ref_count = int(ad_split[0])
        ratios = []
        for alt_count in map(int, ad_split[1:]):
            depth = alt_count + ref_count
            ratios.append("0.0" if depth == 0
                          else str(float(alt_count) / depth))
        af_text = ','.join(ratios)
    elif info_fields["AO"] and info_fields["RO"]:
        # freebayes format: one AO per alt allele, a single RO
        ref_count = int(info_fields["RO"])
        ratios = []
        for alt_count in info_fields["AO"].split(','):
            denominator = int(alt_count) + ref_count
            ratios.append("0.0" if denominator == 0
                          else str(float(alt_count) / denominator))
        af_text = ','.join(ratios)
    else:
        af_text = "N/A"
    dp_text = "N/A" if not info_fields["DP"] else str(info_fields["DP"])
    return af_text + ';' + dp_text + ';'


def match_false(augmented_file, files_to_pair_with, out_dir, sample, log_to_file, vcfeval_options, sdf, java="java"):
    """Annotate each false call in augmented_file using other VCFs.

    Tries to pair up every false call with a variant in each of the files in
    files_to_pair_with. By default the 1st pairing file supplies AF/DP
    information, the 2nd the simulated variant (for false positives) and the
    3rd tells whether a false positive is a pure false positive (not
    simulated) or not (wrong genotype).

    :param augmented_file: VCF of false calls to annotate
    :param files_to_pair_with: list of up to 3 VCF paths (entries may be falsy)
    :param out_dir: directory for temporary and output files
    :param sample: sample name passed through to vcfeval
    :param log_to_file: optional log file for vcfeval output
    :param vcfeval_options: extra options for vcfeval
    :param sdf: RTG SDF reference directory
    :param java: java executable to use
    :return: None; writes <augmented_file stem>_annotated.vcf under out_dir
    """
    files_to_pair_with_clean = [utils.make_clean_vcf(item, out_dir)
                                for item in files_to_pair_with]
    content = []
    annotated_content = []
    # fix: must be defined even when no pairing file is usable, otherwise the
    # per-variant cleanup loop below raised NameError
    filtered_true_vcf = None
    with utils.versatile_open(augmented_file, "rt") as augmented_file_handle:
        for line in augmented_file_handle:
            line_strip = line.strip()
            if not line_strip:
                # robustness: a blank line used to crash on line_strip[0]
                continue
            line_split = line_strip.split()
            if line_strip[0] == "#":
                annotated_content.append(line_strip)
                content.append(line_strip)
                continue
            # content holds all header lines plus exactly one variant line
            if content and content[-1][0] != "#":
                del content[-1]
            content.append(line_strip)
            single_var_file = utils.write_vcf(
                content, os.path.join(out_dir, "single.vcf"))
            single_var_file = utils.sort_and_compress(single_var_file)
            single_var_chr = line_split[0]
            info = ''
            for i, item in enumerate(files_to_pair_with_clean):
                nonmatching_gt_variant = None
                if item:
                    vcfeval_prefix = os.path.join(
                        out_dir, 'vcfeval_compare_results_annotate')
                    # Restrict the comparison to just the chromosome of the
                    # single variant by creating a filtered comparison file
                    filtered_true_vcf = utils.write_filtered_vcf(
                        item, single_var_chr,
                        os.path.join(out_dir, "filtered.vcf"))
                    filtered_true_vcf = utils.sort_and_compress(
                        filtered_true_vcf)
                    vcfeval_comparator = RTGVCFComparator(
                        prefix=vcfeval_prefix,
                        true_vcf=filtered_true_vcf,
                        reference=sdf,
                        regions=None,
                        sample=sample,
                        vcfs=[single_var_file],
                        exclude_filtered=False,
                        match_geno=False,
                        log_to_file=log_to_file,
                        opts=vcfeval_options,
                        java=java)
                    nonmatching_gt_variant = utils.get_closest_variant(
                        line_split, vcfeval_comparator.get_tp())
                    # vcfeval will not pair up variants at the same locus with
                    # the same alt and ref even with match_geno=False (e.g. a
                    # 0/0 call), so fall back to a same-position alt/ref match
                    if not nonmatching_gt_variant:
                        nonmatching_gt_variant = utils.get_matching_alt_ref(
                            line_split, filtered_true_vcf)
                    # clean up vcfeval's scratch directory for the next round
                    if os.path.exists(vcfeval_prefix):
                        LOGGER.warning('{0} exists, removing ...'.format(
                            vcfeval_prefix))
                        shutil.rmtree(vcfeval_prefix)
                if i == 0:
                    # helper avoids the original's shadowing of this loop's
                    # i/item by the inner allele loops
                    info += _af_dp_annotation(nonmatching_gt_variant)
                elif i == 1:
                    # describe the paired variant as chrom_pos_ref_alt_<last column>
                    if nonmatching_gt_variant:
                        info += '_'.join([nonmatching_gt_variant[0],
                                          nonmatching_gt_variant[1],
                                          nonmatching_gt_variant[3],
                                          nonmatching_gt_variant[4],
                                          nonmatching_gt_variant[-1]]) + ';'
                    else:
                        info += "N/A;"
                elif i == 2:
                    info += "pure;" if not nonmatching_gt_variant else "not;"
            line_split[6] = info
            annotated_content.append('\t'.join(line_split))
            # clean up per-variant temporary files; filtered_true_vcf may be
            # None when no pairing file was usable
            for fil in (single_var_file, filtered_true_vcf):
                if fil and os.path.isfile(fil):
                    os.remove(fil)
                    os.remove(fil + ".tbi")
    annotated_file = utils.write_vcf(
        annotated_content,
        os.path.join(
            out_dir,  # fix: the original referenced the undefined global args.out_dir
            "{}_annotated.vcf".format(
                os.path.splitext(
                    os.path.splitext(
                        os.path.basename(augmented_file))[0])[0])))
    annotated_file = utils.sort_and_compress(annotated_file)
    # clean up the cleaned copies of the pairing files
    for item in files_to_pair_with_clean:
        if item and os.path.isfile(item):
            os.remove(item)
            os.remove(item + ".tbi")