def add_snpeff_to_svm(svm_file, snpeff_file, outfile):
    """Merge SnpEff annotations into a SimpleVariantMatrix file.

    Rows of snpeff_file are matched to rows of svm_file on their
    (Chrom, Pos, Ref, Alt) values, compared without any conversion.
    Every column of snpeff_file after the first four (assumed to be
    the coordinate columns) is inserted after the first four columns
    of the matrix under the header "SnpEff______<name>".  Variants
    without a matching SnpEff row get empty strings.

    svm_file     Path of the SimpleVariantMatrix file to annotate.
    snpeff_file  Path of the SnpEff table.  Rows are read with
                 filelib.read_row and must expose Chrom, Pos, Ref, Alt.
    outfile      Path to write the merged matrix to.

    If filelib.exists_nz() rejects snpeff_file, or the file yields no
    data rows, svm_file is copied to outfile unchanged.

    """
    import shutil
    from genomicode import filelib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix

    NUM_COORD_COLS = 4  # Chrom, Pos, Ref, Alt

    if not filelib.exists_nz(snpeff_file):
        shutil.copy2(svm_file, outfile)
        return

    # Read the annotations.
    header = None   # includes Chrom, Pos, Ref, Alt
    coord2d = {}    # (chrom, pos, ref, alt) -> row
    for d in filelib.read_row(snpeff_file, header=1):
        if header is None:
            header = d._header
        coord2d[(d.Chrom, d.Pos, d.Ref, d.Alt)] = d

    if header is None:
        # The file had no data rows, so the column names are unknown
        # and there is nothing to add.  Treat it like an empty file.
        shutil.copy2(svm_file, outfile)
        return

    svm = SimpleVariantMatrix.read_as_am(svm_file)
    CHROM = svm.header2annots["______Chrom"]
    POS = svm.header2annots["______Pos"]
    REF = svm.header2annots["______Ref"]
    ALT = svm.header2annots["______Alt"]

    snpeff_header = header[NUM_COORD_COLS:]
    snpeff_matrix = []  # Row major.
    for coord in zip(CHROM, POS, REF, ALT):
        d = coord2d.get(coord)
        if d:
            row = d._cols[NUM_COORD_COLS:]
        else:
            row = [""] * len(snpeff_header)
        assert len(row) == len(snpeff_header)
        snpeff_matrix.append(row)
    assert len(snpeff_matrix) == len(CHROM)

    # AnnotationMatrix is column major.  Build one list per header so
    # that an empty matrix still yields one (empty) column per header.
    snpeff_annots = [
        [row[j] for row in snpeff_matrix]
        for j in range(len(snpeff_header))]
    # Convert the headers to SVM format.
    snpeff_header = ["SnpEff______%s" % h for h in snpeff_header]

    # Make the new SimpleVariantMatrix.
    headers = (
        svm.headers[:NUM_COORD_COLS] + snpeff_header +
        svm.headers[NUM_COORD_COLS:])
    svm_annots = [svm.header2annots[h] for h in svm.headers_h]
    all_annots = (
        svm_annots[:NUM_COORD_COLS] + snpeff_annots +
        svm_annots[NUM_COORD_COLS:])
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=svm.headerlines)
    SimpleVariantMatrix.write_from_am(outfile, merged)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Drop calls that fail the read-depth and VAF thresholds.

    Reads the SimpleVariantMatrix at in_data.identifier and examines
    every non-empty "<sample>___<caller>___Ref/Alt/VAF" entry.  A call
    is blanked out when any enabled filter rejects it:

      filter_by_min_alt_reads    fewer alt reads than this (0 disables)
      filter_by_min_total_reads  fewer total reads than this (0 disables)
      filter_by_min_vaf          VAF below this (< 1E-6 disables)

    A call whose value for an enabled filter is None is rejected too.
    Rows left without any passing call are written, with their
    original calls, to "discarded.txt" in the current directory.  The
    remaining rows are written to out_filename.  At least one filter
    must be enabled.

    network, out_attributes and num_cores are not used.  Returns the
    metadata dict, which is always empty.

    """
    import itertools
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix
    from Betsy import module_utils as mlib

    summary_file = in_data.identifier
    metadata = {}

    min_alt_reads = mlib.get_user_option(
        user_options, "filter_by_min_alt_reads", not_empty=True, type=int)
    assert min_alt_reads >= 0 and min_alt_reads < 10000
    min_total_reads = mlib.get_user_option(
        user_options, "filter_by_min_total_reads", not_empty=True, type=int)
    assert min_total_reads >= 0 and min_total_reads < 10000
    min_vaf = mlib.get_user_option(
        user_options, "filter_by_min_vaf", not_empty=True, type=float)
    assert min_vaf >= 0.0 and min_vaf < 1.0

    filter_alt = min_alt_reads > 0
    filter_total = min_total_reads > 0
    filter_vaf = min_vaf >= 1E-6
    # The VAF threshold is a filter in its own right, so it must count
    # here; otherwise a VAF-only configuration is rejected.
    assert filter_alt or filter_total or filter_vaf, "No filter"

    matrix = SimpleVariantMatrix.read_as_am(summary_file)

    # copy.deepcopy is very slow.  Try to avoid it.
    # Strategy:
    # 1.  Make a list of the changes to be made.
    # 2.  Save the filtered rows.
    # 3.  Make the changes.
    # 4.  Save the non-filtered rows.

    # Optimization: normalize the headers for the samples and callers
    # once, rather than for every row.
    sample_callers = list(itertools.product(matrix.samples, matrix.callers))
    sc2header = {}  # (sample, caller) -> header_h
    for sc in sample_callers:
        sample, caller = sc
        header = "%s___%s___Ref/Alt/VAF" % (sample, caller)
        header_h = matrix.normalize_header(header)
        assert header_h
        sc2header[sc] = header_h

    I_remove = []     # indexes of rows with no passing calls, ascending
    call_remove = {}  # i -> (sample, caller) -> 1
    for i in range(matrix.num_annots()):
        has_calls = False  # whether this row has any passing calls.
        for sc in sample_callers:
            call_str = matrix.header2annots[sc2header[sc]][i]
            if not call_str:
                continue
            call = SimpleVariantMatrix._parse_call(call_str)
            failed = (
                (filter_alt and
                 (call.num_alt is None or call.num_alt < min_alt_reads)) or
                (filter_total and
                 (call.total is None or call.total < min_total_reads)) or
                (filter_vaf and
                 (call.vaf is None or call.vaf < min_vaf)))
            if failed:
                call_remove.setdefault(i, {})[sc] = 1
            else:
                has_calls = True
        # If this coordinate has no more calls, then remove the whole
        # row.
        if not has_calls:
            I_remove.append(i)

    # Write out a matrix of the discarded rows.  This must happen
    # before the calls are blanked so the file shows the original calls.
    discarded_matrix = AnnotationMatrix.rowslice(matrix, I_remove)
    SimpleVariantMatrix.write_from_am("discarded.txt", discarded_matrix)

    # Remove the calls.
    for i in call_remove:
        for sc in call_remove[i]:
            header_h = sc2header[sc]
            assert matrix.header2annots[header_h][i]
            matrix.header2annots[header_h][i] = ""

    # Which rows to keep.
    removed = set(I_remove)
    I_keep = [i for i in range(matrix.num_annots()) if i not in removed]
    kept_matrix = AnnotationMatrix.rowslice(matrix, I_keep)
    SimpleVariantMatrix.write_from_am(out_filename, kept_matrix)

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Add "Perc Linked" and "Score" linkage columns to a variant matrix.

    The table named by the "linked_variants_file" user option is
    matched to the SimpleVariantMatrix at in_data.identifier on
    (Chrom, Pos), with Pos compared as an integer.  Score is
    -10*log10(p), or 1000 when p is below the p-value that corresponds
    to a score of 1000.  Variants without a linkage row get empty
    strings.  The two columns are inserted after the first four
    columns under "Linkage______" headers, and the result is written
    to outfile.

    network, out_attributes and num_cores are not used.

    """
    import math
    from genomicode import filelib
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    svm_node = in_data
    filelib.assert_exists_nz(svm_node.identifier)

    linked_file = mlib.get_user_option(
        user_options, "linked_variants_file", not_empty=True,
        check_file=True)

    # Load the variants and remember which coordinates they cover.
    SVM = SimpleVariantMatrix.read_as_am(svm_node.identifier)
    chroms = SVM["______Chrom"]
    positions = [int(p) for p in SVM["______Pos"]]
    svm_coords = set(zip(chroms, positions))

    # Load the linkage rows that hit one of those coordinates.
    # Columns: Chrom  Pos  Perc Linked  p
    coord2info = {}  # (chrom, pos) -> row
    for d in filelib.read_row(linked_file, header=1):
        pos = int(d.Pos)
        coord = (d.Chrom, pos)
        if coord in svm_coords:
            coord2info[coord] = d

    # Build one annotation row per variant, in matrix order.
    MAX_SCORE = 1000
    min_p = 10**-(MAX_SCORE / 10)
    linked_headers = ["Perc Linked", "Score"]
    annotations = []
    for coord in zip(chroms, positions):
        d = coord2info.get(coord)
        if d is None:
            annotations.append([""] * len(linked_headers))
            continue
        p_value = float(d.p)
        if p_value >= min_p:
            score = -10 * math.log(p_value, 10)
        else:
            # p is too small to take the log of safely; cap the score.
            score = MAX_SCORE
        row = d.Perc_Linked, score
        assert len(row) == len(linked_headers)
        annotations.append(row)

    # Switch to SVM headers and column-major annotations.
    linked_headers = ["Linkage______%s" % h for h in linked_headers]
    linked_annotations = jmath.transpose(annotations)

    # Splice the new columns in right after the coordinate columns.
    INDEX = 4
    headers = SVM.headers[:INDEX] + linked_headers + SVM.headers[INDEX:]
    svm_annots = [SVM.header2annots[h] for h in SVM.headers_h]
    all_annots = (
        svm_annots[:INDEX] + linked_annotations + svm_annots[INDEX:])
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=SVM.headerlines)
    SimpleVariantMatrix.write_from_am(outfile, merged)
def main():
    """Command-line entry point: annotate and filter a variant matrix.

    Reads the SimpleVariantMatrix file named on the command line,
    applies the annotation step and then each filter in a fixed order
    (min callers, min total reads, linked percent, exonic only), and
    writes the resulting matrix to standard output.  Each step is
    always called and receives its option value, which is None or
    False when the option was not given.

    """
    import sys
    import argparse
    from genomicode import SimpleVariantMatrix

    parser = argparse.ArgumentParser(
        description="Perform operations on a SimpleVariantMatrix file.")
    parser.add_argument("filename", nargs=1, help="Annotation file.")
    parser.add_argument(
        "--ignore_germline", action="append", default=[],
        help="Ignore these germline samples.  Can use multiple times.  "
        "Affects: --filter_min_callers.")

    group = parser.add_argument_group(title="Filter Calls")
    group.add_argument(
        "--filter_min_total_reads", type=int,
        help="Discard calls that have fewer than this many total reads.")

    group = parser.add_argument_group(title="Filter Variants")
    group.add_argument(
        "--filter_min_callers", type=int,
        help="Discard variants if no samples have at least this many "
        "callers.")
    group.add_argument(
        "--filter_linked_perc", type=float,
        help="Discard variants if their linkage percent is more than this.  "
        '(e.g. "50.0" will discard anything with Perc Linked > 50.0).')
    group.add_argument(
        "--exonic_only", action="store_true",
        help="Keep variants only if they are exonic.")

    group = parser.add_argument_group(title="Annotation")
    group.add_argument(
        "--annotate_linked_variants",
        help="Add a column that shows the linkage score for each variant.  "
        "Format: <linkage file>.")

    args = parser.parse_args()
    # nargs=1 makes argparse return a one-element list.
    assert len(args.filename) == 1
    FILENAME = args.filename[0]

    # Read the matrix.
    MATRIX = SimpleVariantMatrix.read_as_am(FILENAME)

    # Annotation
    MATRIX = annotate_linked_variants(MATRIX, args.annotate_linked_variants)

    # Filters
    MATRIX = filter_min_callers(
        MATRIX, args.filter_min_callers, args.ignore_germline)
    MATRIX = filter_min_total_reads(MATRIX, args.filter_min_total_reads)
    MATRIX = filter_linked_perc(MATRIX, args.filter_linked_perc)
    MATRIX = exonic_only(MATRIX, args.exonic_only)

    # Write the matrix back out.
    SimpleVariantMatrix.write_from_am(sys.stdout, MATRIX)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Write a table of the variants that enough callers agree on.

    Reads the SimpleVariantMatrix at in_data.identifier and counts,
    for every coordinate and sample, the callers that made a real
    call (one where num_ref, num_alt, total or vaf is truthy).  A
    (coordinate, sample) pair is kept when that count is at least the
    "num_callers" user option.

    out_filename receives a tab-delimited table: a header line, then
    one line for each variant with at least one kept sample.  Each
    line holds the annotation columns followed by one column per
    sample containing the caller count, or an empty string if the
    sample was not kept.

    network, out_attributes and num_cores are not used.  Returns the
    metadata dict, which is always empty.

    """
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    simple_file = in_data.identifier
    metadata = {}

    num_callers = mlib.get_user_option(
        user_options, "num_callers", not_empty=True, type=int)
    assert num_callers >= 0 and num_callers < 100

    var_matrix = SimpleVariantMatrix.read(simple_file)
    annot_matrix = var_matrix.annot_matrix
    call_matrix = var_matrix.call_matrix

    # For each coord and sample, count the number of callers.
    coord2sample2nc = {}  # (chrom, pos, ref, alt) -> sample -> num callers
    for coord, samplecaller2call in \
            call_matrix.coord2samplecaller2call.iteritems():
        sample2nc = coord2sample2nc.setdefault(coord, {})
        for (sample, caller), call in samplecaller2call.iteritems():
            # Make sure this is a real call.
            if not (call.num_ref or call.num_alt or call.total or call.vaf):
                continue
            sample2nc[sample] = sample2nc.get(sample, 0) + 1

    # Make a list of the coordinates that have the right number of calls.
    calls = {}  # coord -> sample -> nc
    for coord, sample2nc in coord2sample2nc.iteritems():
        for sample, nc in sample2nc.iteritems():
            if nc < num_callers:
                continue
            calls.setdefault(coord, {})[sample] = nc

    header = annot_matrix.headers + var_matrix.samples

    # Cache the annotation columns, in header order, for convenience.
    columns = [annot_matrix.header2annots[h] for h in annot_matrix.headers_h]

    chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
    ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
    pos = [int(x) for x in pos]

    # Use a context manager so the file is flushed and closed even if
    # a row fails the length check below.
    with open(out_filename, 'w') as handle:
        handle.write("\t".join(header) + "\n")
        for i, coord in enumerate(zip(chrom, pos, ref, alt)):
            if coord not in calls:
                continue
            sample2nc = calls[coord]
            row0 = [annots[i] for annots in columns]
            row1 = [sample2nc.get(sample, "") for sample in var_matrix.samples]
            row = row0 + row1
            assert len(row) == len(header)
            handle.write("\t".join(map(str, row)) + "\n")

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Apply the user-selected variant filters to a SimpleVariantMatrix.

    Reads the matrix at in_data.identifier, runs each requested
    filter in a fixed order, and writes the result to out_filename.
    The filters are chosen with these user options:

      nonsynonymous_and_stopgain_only      "yes" or "no"
      sift_polyphen_damaging               "yes" or "no"
      min_coverage_in_every_sample         int, optional
      min_callers_in_every_sample          int, optional
      min_callers_in_any_sample            int, optional
      min_gene_expression_in_every_sample  float, optional

    An optional value that comes back as "" is treated as not given.
    The two min_callers options cannot both be set, and at least one
    filter must be requested.

    network, out_attributes and num_cores are not used.  Returns a
    metadata dict whose "commands" entry lists a description of each
    filter that was applied.

    """
    from genomicode import filelib
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    def yes_no_option(name):
        # True if the option is set to "yes".
        value = mlib.get_user_option(
            user_options, name, allowed_values=["no", "yes"])
        return value == "yes"

    def optional_number(name, cast):
        # The option's value, or None if it was left blank.
        value = mlib.get_user_option(user_options, name, type=cast)
        if value != "":
            return value
        return None

    simplematrix_file = in_data.identifier
    filelib.assert_exists_nz(simplematrix_file)
    metadata = {}

    nonsynonymous_and_stopgain_only = yes_no_option(
        "nonsynonymous_and_stopgain_only")
    sift_polyphen_damaging = yes_no_option("sift_polyphen_damaging")

    min_coverage_in_every_sample = optional_number(
        "min_coverage_in_every_sample", int)
    min_callers_in_every_sample = optional_number(
        "min_callers_in_every_sample", int)
    min_callers_in_any_sample = optional_number(
        "min_callers_in_any_sample", int)
    min_gene_expression_in_every_sample = optional_number(
        "min_gene_expression_in_every_sample", float)

    assert not (min_callers_in_every_sample and min_callers_in_any_sample)
    requested = (
        nonsynonymous_and_stopgain_only or
        sift_polyphen_damaging or
        min_callers_in_every_sample or
        min_callers_in_any_sample or
        min_gene_expression_in_every_sample or
        min_coverage_in_every_sample)
    assert requested, "No filters"

    MATRIX = SimpleVariantMatrix.read_as_am(simplematrix_file)

    # Rather than checking the input attributes, each filter looks in
    # the file itself for the annotations it needs.
    commands = []
    if nonsynonymous_and_stopgain_only:
        MATRIX = filter_nonsynonymous(MATRIX)
        commands.append("Keep only nonsynonymous and stopgain variants.")
    if sift_polyphen_damaging:
        MATRIX = filter_sift_polyphen_damaging(MATRIX)
        commands.append(
            "Keep only if predicted to be damaging by SIFT or Polyphen2.")
    if min_coverage_in_every_sample is not None:
        MATRIX = filter_min_coverage_in_every_sample(
            MATRIX, min_coverage_in_every_sample)
        commands.append(
            "Keep only variants with coverage >= %d in every sample." %
            min_coverage_in_every_sample)
    if min_callers_in_every_sample is not None:
        MATRIX = filter_min_callers_in_every_sample(
            MATRIX, min_callers_in_every_sample)
        commands.append(
            "Keep only variants called with >= %d callers in every sample." %
            min_callers_in_every_sample)
    if min_callers_in_any_sample is not None:
        MATRIX = filter_min_callers_in_any_sample(
            MATRIX, min_callers_in_any_sample)
        commands.append(
            "Keep only variants called with >= %d callers "
            "in at least one sample." % min_callers_in_any_sample)
    if min_gene_expression_in_every_sample is not None:
        MATRIX = filter_min_gene_expression_in_every_sample(
            MATRIX, min_gene_expression_in_every_sample)
        commands.append(
            "Keep only variants with gene expression >= %g "
            "in every sample." % min_gene_expression_in_every_sample)

    metadata["commands"] = commands
    SimpleVariantMatrix.write_from_am(out_filename, MATRIX)
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Flag variants whose genes appear in a cancer gene table.

    The table named by the "cancer_genes_file" user option has
    "Gene ID" and "Gene Symbol" columns followed by one column per
    gene set, whose cells must be "" or "1".  For each variant in the
    SimpleVariantMatrix at in_data.identifier, the comma-separated
    symbols in "Annovar______Gene.refGene" are looked up, and a gene
    set column is set to 1 if any of the variant's genes has "1"
    there.  Otherwise the cell is left as an empty string.

    The new columns get "Cancer Genes______" headers and are placed
    after the last Annovar, SnpEff or COSMIC column, or after the
    first four columns if there are none.  The merged matrix is
    written to outfile.

    network, out_attributes and num_cores are not used.

    """
    from genomicode import filelib
    from genomicode import hashlib
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    simple_node = in_data
    filelib.assert_exists_nz(simple_node.identifier)

    gene_file = mlib.get_user_option(
        user_options, "cancer_genes_file", not_empty=True, check_file=True)

    # Load the cancer gene table.
    # <Gene ID>  <Gene Symbol>  <Dataset> ...
    symbol2info = {}  # symbol -> row
    header = None     # the <Dataset> column names
    for d in filelib.read_row(gene_file, header=1):
        assert "Gene Symbol" in d._header
        if header is None:
            header = [
                name for name in d._header
                if name not in ["Gene ID", "Gene Symbol"]]
        if d.Gene_Symbol:
            symbol2info[d.Gene_Symbol] = d

    # Load the variants.
    SVM = SimpleVariantMatrix.read_as_am(simple_node.identifier)

    GENE_H = "Annovar______Gene.refGene"
    assert GENE_H in SVM.headers, "Missing annotation: %s" % GENE_H
    GENES = SVM[GENE_H]

    # Build one row of flags per variant.
    gene_headers = header
    gene_annotations = []
    for gene_str in GENES:
        # Format of genes:
        # PFN1P2
        # PMS2P2,PMS2P7
        values = [""] * len(gene_headers)
        known = [
            symbol2info[gene] for gene in gene_str.split(",")
            if gene in symbol2info]
        for d in known:
            for j, name in enumerate(gene_headers):
                attr = hashlib.hash_var(name)
                assert hasattr(d, attr)
                flag = getattr(d, attr)
                assert flag in ["", "1"]
                if flag == "1":
                    values[j] = 1
        gene_annotations.append(values)

    # Switch to SVM headers and column-major annotations.
    gene_headers = ["Cancer Genes______%s" % name for name in gene_headers]
    gene_annotations = jmath.transpose(gene_annotations)

    # Insert after the last column of any of these annotation groups,
    # and never before the coordinate columns.
    INDEX = 4
    for prefix in ["ANNOVAR", "SNPEFF", "COSMIC"]:
        I = [i for (i, name) in enumerate(SVM.headers)
             if name.upper().startswith(prefix)]
        if I:
            INDEX = max(INDEX, max(I)+1)

    headers = SVM.headers[:INDEX] + gene_headers + SVM.headers[INDEX:]
    svm_annots = [SVM.header2annots[h] for h in SVM.headers_h]
    all_annots = svm_annots[:INDEX] + gene_annotations + svm_annots[INDEX:]
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=SVM.headerlines)
    SimpleVariantMatrix.write_from_am(outfile, merged)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Annotate a SimpleVariantMatrix with Annovar.

    Writes the variant coordinates from in_data.identifier to
    "pos.txt", runs the command built by
    alignlib.make_annovar_command on it, and reads the resulting
    "annotations.<buildver>_multianno.txt".  Every multianno column
    after Chr/Start/End/Ref/Alt is matched back to the variants on
    (chrom, pos, ref, alt) and added to the matrix as a named matrix
    called "Annovar", placed first.  Variants that Annovar did not
    report get empty strings.  The result is written to out_filename.

    Working files ("pos.txt", "annovar.log" and the Annovar output)
    are created in the current directory.

    network, out_attributes and num_cores are not used.  Returns a
    metadata dict whose "commands" entry holds the Annovar command.

    """
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix
    from Betsy import module_utils as mlib

    NUM_COORD_COLS = 5  # Chr Start End Ref Alt

    summary_node = in_data
    summary_filename = summary_node.identifier
    metadata = {}

    buildver = mlib.get_user_option(
        user_options, "annovar_buildver", allowed_values=["hg19"],
        not_empty=True)

    # Name files.
    annovar_infile = "pos.txt"
    log_filename = "annovar.log"
    # Annovar takes a filestem, without the ".vcf".
    annovar_outstem = "annotations"
    # Produces file:
    # <annovar_outstem>.<buildver>_multianno.txt
    multianno_file = "%s.%s_multianno.txt" % (annovar_outstem, buildver)

    # Make the infile for Annovar.
    # <chrom> <start> <end> <ref> <alt>
    # The file must be closed before Annovar reads it.
    with open(annovar_infile, 'w') as handle:
        for d in filelib.read_row(summary_filename, skip=2, header=1):
            x = d.Chrom, d.Pos, d.Pos, d.Ref, d.Alt
            handle.write("\t".join(x) + "\n")

    cmd = alignlib.make_annovar_command(
        annovar_infile, log_filename, annovar_outstem, buildver,
        vcf_input=False)
    parallel.sshell(cmd)
    metadata["commands"] = [cmd]

    filelib.assert_exists_nz(log_filename)
    filelib.assert_exists_nz(multianno_file)

    matrix = SimpleVariantMatrix.read(summary_filename)
    annot_matrix = matrix.annot_matrix
    chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
    ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
    pos = [int(x) for x in pos]

    # Read in the multianno output file.
    pos2d = {}  # (chrom, start, ref, alt) -> d
    anno_header = None
    for d in filelib.read_row(multianno_file, header=1):
        key = d.Chr, int(d.Start), d.Ref, d.Alt
        assert key not in pos2d, "Duplicate pos: %s" % str(key)
        pos2d[key] = d
        if not anno_header:
            anno_header = d._header
    assert anno_header

    # Multianno starts with:
    # Chr Start End Ref Alt
    # Ignore these.
    assert anno_header[:NUM_COORD_COLS] == [
        "Chr", "Start", "End", "Ref", "Alt"]
    headers = anno_header[NUM_COORD_COLS:]

    # Look up each variant's multianno row once, then read every
    # column from it.
    matched = [pos2d.get(coord) for coord in zip(chrom, pos, ref, alt)]
    all_annots = []
    for i in range(NUM_COORD_COLS, len(anno_header)):
        annots = []
        for d in matched:
            if d:
                annots.append(d._cols[i])
            else:
                annots.append("")
        all_annots.append(annots)

    x = AnnotationMatrix.create_from_annotations(headers, all_annots)
    matrix.named_matrices.insert(0, ("Annovar", x))

    SimpleVariantMatrix.write(out_filename, matrix)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_filename):
    """Add gene expression values to a SimpleVariantMatrix.

    antecedents is a (variant matrix node, expression node) pair.
    For each variant, the comma-separated symbols in the
    "Gene.refGene" annotation are looked up in the expression matrix.
    Every sample in the expression file gets a column holding the
    expression of each gene found, comma-separated in the same order
    as the symbols.  Genes missing from the expression matrix are
    skipped, and when a gene matches several rows the maximum value
    is used.  Values are formatted with _pretty_gxp.

    The columns are appended as a named matrix called
    "Gene Expression" and the result is written to out_filename.  The
    expression values for the genes in the variant file are also
    written to "expression.txt" in the current directory, for
    debugging.

    network, out_attributes, user_options and num_cores are not used.

    """
    import arrayio
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix

    simple_node, signal_node = antecedents
    for node in (simple_node, signal_node):
        filelib.assert_exists_nz(node.identifier)

    # Load the variants and the expression data.
    SVM = SimpleVariantMatrix.read(simple_node.identifier)
    GXP = arrayio.read(signal_node.identifier)

    # The two files only need to share some samples.  An expression
    # profile might not exist for every sample (e.g. the germline
    # sample), so missing ones are tolerated.
    GXP_samples = GXP.col_names(arrayio.COL_ID)
    num_missing = len([s for s in SVM.samples if s not in GXP_samples])
    assert num_missing < len(SVM.samples), (
        "SimpleVariantMatrix and gene expression file have "
        "no common samples.")

    # Report every sample from the gene expression file.
    SAMPLES = GXP_samples

    # Locate the gene annotation for each row.
    GENE_H = "Gene.refGene"
    annovar_matrix = None
    for (name, named_matrix) in SVM.named_matrices:
        if GENE_H in named_matrix.headers:
            annovar_matrix = named_matrix
            break
    assert annovar_matrix, "Missing annotation: %s" % GENE_H
    GENES = annovar_matrix[GENE_H]

    # Collect the distinct gene symbols.
    # Format of genes:
    # PFN1P2
    # PMS2P2,PMS2P7
    unique_genes = set()
    for gene_str in GENES:
        unique_genes.update(gene_str.split(","))
    all_genes = sorted(unique_genes)

    # Pull the rows for those genes out of the expression matrix.
    GXP_a = GXP.matrix(all_genes, None)

    # Write out the expression matrix for debugging purposes.
    arrayio.write(GXP_a, "expression.txt")

    # Find the rows of each gene in the reduced matrix.
    gene2I = {}  # gene -> list of row indexes
    for gene in all_genes:
        I_row, i_col = GXP_a._index(row=gene)
        if I_row:
            gene2I[gene] = I_row

    # Line the expression values up with the variants.
    # Format of genes:      Format of output
    # PFN1P2                5.2
    # PMS2P2,PMS2P7         2.2,8.6
    matrix = []  # one row per variant, one column per sample
    for gene_str in GENES:
        # If a gene is missing, then skip it.
        found = [g for g in gene_str.split(",") if g in gene2I]
        row = []
        for j in range(len(SAMPLES)):
            # If there are multiple instances of a gene, pick the one
            # with the maximum expression.
            values = [
                max([GXP_a._X[l][j] for l in gene2I[g]]) for g in found]
            row.append(",".join([_pretty_gxp(v) for v in values]))
        matrix.append(row)

    # Add the matrix back to the simple variant matrix, column major.
    headers = SAMPLES
    all_annots = [
        [row[j] for row in matrix] for j in range(len(headers))]
    gxp_matrix = AnnotationMatrix.create_from_annotations(
        headers, all_annots)
    SVM.named_matrices.append(("Gene Expression", gxp_matrix))

    # Write to file.
    SimpleVariantMatrix.write(out_filename, SVM)
def add_coverage_to_svm(svm_file, coverage_file, outfile, is_rna_cov):
    """Add per-sample coverage columns to a SimpleVariantMatrix file.

    svm_file       Path of the SimpleVariantMatrix file to annotate.
    coverage_file  Path of a table whose first two columns are Chrom
                   and Pos (1-based) and whose remaining columns are
                   samples.
    outfile        Path to write the annotated matrix to.
    is_rna_cov     If true, the new named matrix is called
                   "RNA Coverage" rather than "Coverage".

    Coverage is matched to the variants on (Chrom, Pos) and is copied
    as it appears in coverage_file.  Only samples that are in the
    variant matrix and have at least one non-empty coverage value are
    reported.  Cells without a value are left as empty strings.  The
    two files must have at least one sample in common.

    """
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix

    # Load the variants.
    SVM = SimpleVariantMatrix.read(svm_file)
    AM = SVM.annot_matrix
    for required in ["Chrom", "Pos"]:
        assert required in AM.headers
    CHROM = AM["Chrom"]
    POS = [int(p) for p in AM["Pos"]]

    # Load the coverage table.
    # Chrom  Pos  <Sample> [<Sample> ...]
    # Pos is 1-based.
    coord2sample2cov = {}  # (chrom, pos) -> sample -> coverage value
    cov_samples = set()    # samples with at least one value
    for d in filelib.read_row(coverage_file, header=1):
        coord = d.Chrom, int(d.Pos)
        sample2cov = coord2sample2cov.setdefault(coord, {})
        for i in range(2, len(d._header)):
            cov = d._cols[i]
            if not cov:
                continue
            sample = d._header[i]
            sample2cov[sample] = cov
            cov_samples.add(sample)

    # If the samples aren't sequenced at high coverage, it's possible
    # they just don't have reads at these positions.  Be lenient and
    # only require that some of the samples overlap.
    num_missing = len([s for s in SVM.samples if s not in cov_samples])
    assert num_missing < len(SVM.samples), (
        "SimpleVariantMatrix and coverage file have "
        "no common samples.")

    # Report the coverage for the samples at the intersection.
    SAMPLES = [s for s in SVM.samples if s in cov_samples]

    # Line the coverage up with the variants, one row per variant.
    matrix = []
    for i in range(AM.num_annots()):
        sample2cov = coord2sample2cov.get((CHROM[i], POS[i]), {})
        matrix.append([sample2cov.get(s, "") for s in SAMPLES])

    # Add the matrix back to the simple variant matrix.
    headers = SAMPLES
    all_annots = jmath.transpose(matrix)
    # RNA coverage gets its own name so that it can sit next to the
    # ordinary coverage.
    if is_rna_cov:
        name = "RNA Coverage"
    else:
        name = "Coverage"
    cov_matrix = AnnotationMatrix.create_from_annotations(
        headers, all_annots)
    SVM.named_matrices.append((name, cov_matrix))

    # Write to file.
    SimpleVariantMatrix.write(outfile, SVM)
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    outfile):
    """Annotate a SimpleVariantMatrix with COSMIC variant information.

    Reads the matrix named by in_data.identifier and the file given by
    the "cosmic_variants_file" user option, then writes to outfile a
    copy of the matrix with seven "COSMIC______*" columns inserted.
    The columns go after the last Annovar/SnpEff column if any exist,
    and otherwise at index 4 (after the first four columns).

    network, out_attributes and num_cores are not used here.
    """
    from genomicode import filelib
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    svm_path = in_data.identifier
    filelib.assert_exists_nz(svm_path)
    cosmic_file = mlib.get_user_option(
        user_options, "cosmic_variants_file", not_empty=True,
        check_file=True)

    # Load the variants and index their (chrom, pos) coordinates.
    svm = SimpleVariantMatrix.read_as_am(svm_path)
    chroms = svm["______Chrom"]
    positions = [int(p) for p in svm["______Pos"]]
    variant_coords = set(zip(chroms, positions))

    # Load the COSMIC records that touch a variant position.
    #   Chrom  Start  End  GRCh  Count  SNP
    #   Mutation CDS  Mutation AA
    #   FATHMM prediction  FATHMM score  Mutation somatic status
    # NOTE: a record is stored only under the FIRST position in
    # [Start, End] that is present in the matrix, and a later record
    # replaces an earlier one stored at the same coordinate.
    cosmic_at = {}  # (chrom, pos) -> COSMIC row
    for rec in filelib.read_row(cosmic_file, header=1):
        first, last = int(rec.Start), int(rec.End)
        hit = next(
            (p for p in range(first, last + 1)
             if (rec.Chrom, p) in variant_coords),
            None)
        if hit is None:
            continue
        cosmic_at[(rec.Chrom, hit)] = rec

    # Build one row of COSMIC values per variant, blank when unmatched.
    cosmic_headers = [
        "SNP", "Num Tumors", "Mutation CDS", "Mutation AA",
        "FATHMM prediction", "FATHMM score", "Mutation somatic status"]
    rows = []
    for coord in zip(chroms, positions):
        rec = cosmic_at.get(coord)
        if rec is None:
            rows.append([""] * len(cosmic_headers))
        else:
            rows.append((
                rec.SNP, rec.Count, rec.Mutation_CDS, rec.Mutation_AA,
                rec.FATHMM_prediction, rec.FATHMM_score,
                rec.Mutation_somatic_status))

    # Convert the headers and annotations to SVM format.
    cosmic_headers = ["COSMIC______%s" % h for h in cosmic_headers]
    cosmic_annotations = jmath.transpose(rows)

    # Choose the insertion point: after the first four columns, or
    # after the last Annovar / SnpEff column when those are present.
    insert_at = 4
    for idx, header in enumerate(svm.headers):
        if header.upper().startswith(("ANNOVAR", "SNPEFF")):
            insert_at = max(insert_at, idx + 1)

    # Splice the COSMIC columns in and write the merged matrix.
    headers = (
        svm.headers[:insert_at] + cosmic_headers + svm.headers[insert_at:])
    existing = [svm.header2annots[h] for h in svm.headers_h]
    all_annots = (
        existing[:insert_at] + cosmic_annotations + existing[insert_at:])
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=svm.headerlines)
    SimpleVariantMatrix.write_from_am(outfile, merged)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Pivot a long table of variant calls into a SimpleVariantMatrix file.

    Each row of the input (in_data.identifier) supplies Chrom, Pos, Ref,
    Alt, Sample, Caller, Source, Num_Ref, Num_Alt and VAF.  The output
    written to out_filename has one row per distinct
    (Chrom, Pos, Ref, Alt) and these columns:

      - Chrom, Pos, Ref, Alt
      - one "Num Callers" column per sample
      - one "Ref/Alt/VAF" column per (sample, caller) pair

    Only rows whose Source is "DNA" contribute calls; any Source other
    than "DNA" or "RNA" raises an AssertionError.  Samples, callers and
    coordinates are collected from every row, whatever its Source.

    network, out_attributes, user_options and num_cores are not used
    here.  Returns the (empty) metadata dict.
    """
    from genomicode import filelib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix

    simple_file = in_data.identifier
    metadata = {}

    # Read all in memory.  Hopefully, not too big.
    ds = list(filelib.read_row(simple_file, header=-1))

    # MuSE sometimes has alternates.
    #   Alt      A,C
    #   Num_Alt  13,0
    #   VAF      0.19,0.0
    # Detect this and fix it.  Keep the Num_Alt/VAF pair with the
    # highest VAF (the first one on ties).  The Alt field itself is
    # left unchanged.
    for d in ds:
        if "," not in d.Num_Alt:
            continue
        num_alts = d.Num_Alt.split(",")
        vafs = d.VAF.split(",")
        assert len(num_alts) == len(vafs)
        # Build real lists: these are indexed below, which would fail on
        # the iterator that map() returns under Python 3.
        num_alts = [int(x) for x in num_alts]
        vafs = [float(x) for x in vafs]
        best = max(range(len(vafs)), key=lambda i: vafs[i])
        d.Num_Alt = str(num_alts[best])
        d.VAF = str(vafs[best])

    # Sorted, de-duplicated callers, samples and coordinates.
    callers = sorted(set(d.Caller for d in ds))
    samples = sorted(set(d.Sample for d in ds))
    coord_data = sorted(
        set((d.Chrom, int(d.Pos), d.Ref, d.Alt) for d in ds))

    # Make a list of all DNA calls.
    call_data = []
    for d in ds:
        assert d.Source in ["DNA", "RNA"]
        if d.Source != "DNA":
            continue
        num_ref = num_alt = vaf = None
        if d.Num_Ref:
            num_ref = int(d.Num_Ref)
        if d.Num_Alt:
            num_alt = int(d.Num_Alt)
        if d.VAF:
            vaf = float(d.VAF)
        # A row with no read counts and no VAF carries no call.
        if num_ref is None and num_alt is None and vaf is None:
            continue
        call = SimpleVariantMatrix.Call(num_ref, num_alt, vaf)
        call_data.append(
            (d.Chrom, int(d.Pos), d.Ref, d.Alt, d.Sample, d.Caller, call))

    # sample -> caller -> (chrom, pos, ref, alt) -> call
    samp2caller2coord2call = {}
    for chrom, pos, ref, alt, sample, caller, call in call_data:
        coord = chrom, pos, ref, alt
        caller2coord2call = samp2caller2coord2call.setdefault(sample, {})
        coord2call = caller2coord2call.setdefault(caller, {})

        # A (sample, caller, coord) may have multiple calls.  For
        # example, for germline samples that are called with each
        # tumor sample.  If this is the case, then take the call
        # with the highest coverage.
        if coord in coord2call:
            old_call = coord2call[coord]
            cov = old_cov = None
            if call.num_ref is not None and call.num_alt is not None:
                cov = call.num_ref + call.num_alt
            if old_call.num_ref is not None and \
               old_call.num_alt is not None:
                old_cov = old_call.num_ref + old_call.num_alt
            if cov is None and old_cov is not None:
                call = old_call
            elif cov is not None and old_cov is not None and \
                 cov < old_cov:
                call = old_call
        coord2call[coord] = call

    # Count the number of callers that called a variant at each
    # position for each sample.  Collect the callers in a set first so
    # that each caller is counted at most once, even when a germline
    # sample is called by the same caller multiple times.
    samp2coord2callers = {}  # sample -> coord -> set of callers
    for chrom, pos, ref, alt, sample, caller, call in call_data:
        coord = chrom, pos, ref, alt
        coord2callers = samp2coord2callers.setdefault(sample, {})
        coord2callers.setdefault(coord, set()).add(caller)
    samp2coord2nc = {}  # sample -> coord -> num_callers
    for sample, coord2callers in samp2coord2callers.items():
        samp2coord2nc[sample] = dict(
            (coord, len(names)) for (coord, names) in coord2callers.items())

    # Format everything into an annotation matrix.  Each column header
    # is made of three parts that are joined with "___" at the end.
    headers0 = []
    headers1 = []
    headers2 = []
    all_annots = []

    # Add the positions.
    headers0 += ["", "", "", ""]
    headers1 += ["", "", "", ""]
    headers2 += ["Chrom", "Pos", "Ref", "Alt"]
    for i in range(4):
        all_annots.append([str(coord[i]) for coord in coord_data])

    # Add the number of callers information.
    headers0 += ["Num Callers"] * len(samples)
    headers1 += [""] * len(samples)
    headers2 += samples
    for sample in samples:
        coord2nc = samp2coord2nc.get(sample, {})
        all_annots.append([coord2nc.get(coord, "") for coord in coord_data])

    # Add information about calls.
    for sample in samples:
        caller2coord2call = samp2caller2coord2call.get(sample, {})
        for i, caller in enumerate(callers):
            # Only the first caller column of a sample carries the
            # sample name in its header.
            h0 = ""
            if not i:
                h0 = sample
            headers0.append(h0)
            headers1.append(caller)
            headers2.append("Ref/Alt/VAF")

            coord2call = caller2coord2call.get(caller, {})
            annots = []
            for coord in coord_data:
                x = ""
                call = coord2call.get(coord)
                if call:
                    x = SimpleVariantMatrix._format_call(call)
                annots.append(x)
            all_annots.append(annots)

    # Set the headers.
    assert len(headers0) == len(headers1)
    assert len(headers0) == len(headers2)
    assert len(headers0) == len(all_annots)
    headers = [
        "___".join(parts) for parts in zip(headers0, headers1, headers2)]
    matrix = AnnotationMatrix.create_from_annotations(headers, all_annots)
    SimpleVariantMatrix.write_from_am(out_filename, matrix)

    return metadata