def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Keep the variants that were called by enough callers.

    Reads the SimpleVariantMatrix stored in `in_data.identifier`, counts
    for every coordinate and sample how many callers made a real call,
    and writes a tab-delimited table to `out_filename`.  The table has
    one header line (the annotation headers followed by the sample
    names) and one line for every coordinate where at least one sample
    reached `num_callers` calls.  In the sample columns, a cell holds
    the number of callers if that sample reached the threshold, and is
    empty otherwise.

    The threshold comes from the "num_callers" user option and must
    satisfy 0 <= num_callers < 100 (checked with an assert).

    `network`, `out_attributes` and `num_cores` are not used here.

    Returns the metadata dictionary, which this module leaves empty.
    """
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    simple_file = in_data.identifier
    metadata = {}

    num_callers = mlib.get_user_option(
        user_options, "num_callers", not_empty=True, type=int)
    assert num_callers >= 0 and num_callers < 100

    var_matrix = SimpleVariantMatrix.read(simple_file)
    annot_matrix = var_matrix.annot_matrix
    call_matrix = var_matrix.call_matrix

    # For each coord and sample, count the number of callers.
    # (chrom, pos, ref, alt) -> sample -> num callers
    coord2sample2nc = {}
    for coord, samplecaller2call in \
            call_matrix.coord2samplecaller2call.items():
        sample2nc = coord2sample2nc.setdefault(coord, {})
        for (sample, caller), call in samplecaller2call.items():
            # Make sure this is a real call.
            if not (call.num_ref or call.num_alt or call.total or call.vaf):
                continue
            sample2nc[sample] = sample2nc.get(sample, 0) + 1

    # Keep only the (coord, sample) pairs with enough callers.
    calls = {}  # coord -> sample -> nc
    for coord, sample2nc in coord2sample2nc.items():
        for sample, nc in sample2nc.items():
            if nc < num_callers:
                continue
            calls.setdefault(coord, {})[sample] = nc

    header = annot_matrix.headers + var_matrix.samples

    # One list of annotations per annotation column, in header order.
    annot_columns = [
        annot_matrix.header2annots[h] for h in annot_matrix.headers_h]

    chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
    ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
    pos = [int(x) for x in pos]

    # The with-block guarantees the output is flushed and closed, even
    # if writing a row fails.
    with open(out_filename, 'w') as handle:
        handle.write("\t".join(header) + "\n")
        for i, coord in enumerate(zip(chrom, pos, ref, alt)):
            sample2nc = calls.get(coord)
            if sample2nc is None:
                continue
            row0 = [column[i] for column in annot_columns]
            # Samples below the threshold get an empty cell.
            row1 = [sample2nc.get(sample, "")
                    for sample in var_matrix.samples]
            row = row0 + row1
            assert len(row) == len(header)
            handle.write("\t".join(map(str, row)) + "\n")

    return metadata
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_filename):
    """Attach gene expression values to a SimpleVariantMatrix.

    `antecedents` is a pair (simple_node, signal_node) whose
    `identifier` attributes name the variant matrix file and the gene
    expression file.  For every variant row, the genes listed in the
    "Gene.refGene" annotation (comma separated) are looked up in the
    expression matrix, and the values for each expression sample are
    stored, comma separated, in a new named matrix called
    "Gene Expression".  Genes that are not found in the expression
    matrix are skipped.  The result is written to `out_filename`.

    As a side effect, the expression matrix restricted to the genes of
    interest is written to "expression.txt" for debugging.

    Relies on `_pretty_gxp`, which is defined outside this block, to
    format each expression value.
    """
    import arrayio
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix

    simple_node, signal_node = antecedents
    filelib.assert_exists_nz(simple_node.identifier)
    filelib.assert_exists_nz(signal_node.identifier)

    # Load both inputs.
    SVM = SimpleVariantMatrix.read(simple_node.identifier)
    GXP = arrayio.read(signal_node.identifier)

    # At least one variant sample must be present in the expression
    # data.  Not every sample needs to be there (e.g. a germline sample
    # may have no expression profile), so partial overlap is accepted.
    GXP_samples = GXP.col_names(arrayio.COL_ID)
    num_missing = sum(1 for s in SVM.samples if s not in GXP_samples)
    assert num_missing < len(SVM.samples), (
        "SimpleVariantMatrix and gene expression file have "
        "no common samples.")

    # Report every sample from the gene expression file.
    SAMPLES = GXP_samples

    # Locate the first named matrix that carries the gene annotation.
    GENE_H = "Gene.refGene"
    annovar_matrix = None
    for _name, candidate in SVM.named_matrices:
        if GENE_H in candidate.headers:
            annovar_matrix = candidate
            break
    assert annovar_matrix, "Missing annotation: %s" % GENE_H
    GENES = annovar_matrix[GENE_H]

    # Collect the distinct gene names.  A row may hold one gene
    # ("PFN1P2") or several ("PMS2P2,PMS2P7").
    unique_genes = set()
    for gene_str in GENES:
        unique_genes.update(gene_str.split(","))
    gene_names = sorted(unique_genes)

    # Pull out the expression rows for these genes.
    GXP_a = GXP.matrix(gene_names, None)
    # Write out the expression matrix for debugging purposes.
    arrayio.write(GXP_a, "expression.txt")

    # gene -> list of row indexes in GXP_a (only for genes found).
    gene2I = {}
    for gene in gene_names:
        row_indexes, _ = GXP_a._index(row=gene)
        if row_indexes:
            gene2I[gene] = row_indexes

    # Build one row of output per variant, one column per sample.
    #   genes in row       output cell
    #   PFN1P2             5.2
    #   PMS2P2,PMS2P7      2.2,8.6
    num_samples = len(SAMPLES)
    expr_rows = []
    for gene_str in GENES:
        found = [g for g in gene_str.split(",") if g in gene2I]
        cells = []
        for j in range(num_samples):
            # If a gene occurs on several rows, take the maximum
            # expression.
            best = [
                max([GXP_a._X[r][j] for r in gene2I[g]]) for g in found]
            pretty = [_pretty_gxp(v) for v in best]
            cells.append(",".join(pretty))
        expr_rows.append(cells)

    # Turn the rows into one annotation list per sample.
    headers = SAMPLES
    all_annots = [
        [cells[j] for cells in expr_rows] for j in range(len(headers))]
    gxp_matrix = AnnotationMatrix.create_from_annotations(
        headers, all_annots)
    SVM.named_matrices.append(("Gene Expression", gxp_matrix))

    # Write to file.
    SimpleVariantMatrix.write(out_filename, SVM)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Annotate the variants of a SimpleVariantMatrix with Annovar.

    Steps:
    1. Write the coordinates found in `in_data.identifier` to
       "pos.txt" in Annovar's <chrom> <start> <end> <ref> <alt> format.
    2. Run the command built by `alignlib.make_annovar_command`.
    3. Read the resulting multianno file and add every column after
       the leading Chr/Start/End/Ref/Alt as a named matrix called
       "Annovar", inserted before the existing named matrices.
       Variants missing from the Annovar output get empty strings.
    4. Write the augmented matrix to `out_filename`.

    The genome build comes from the "annovar_buildver" user option
    (only "hg19" is currently allowed).  Intermediate files
    ("pos.txt", "annovar.log", "annotations.<buildver>_multianno.txt")
    are created with relative paths.

    `network`, `out_attributes` and `num_cores` are not used here.

    Returns a metadata dictionary whose "commands" entry lists the
    Annovar command that was run.
    """
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix
    from Betsy import module_utils as mlib

    summary_filename = in_data.identifier
    metadata = {}

    buildver = mlib.get_user_option(
        user_options, "annovar_buildver", allowed_values=["hg19"],
        not_empty=True)

    # Name files.
    annovar_infile = "pos.txt"
    log_filename = "annovar.log"
    # Annovar takes a filestem, without the ".vcf".
    annovar_outstem = "annotations"
    # Produces file:
    # <annovar_outstem>.<buildver>_multianno.txt
    # Build the name from buildver so that it stays correct if more
    # builds are ever allowed.
    multianno_file = "%s.%s_multianno.txt" % (annovar_outstem, buildver)

    # Make the infile for Annovar.
    # <chrom> <start> <end> <ref> <alt>
    with open(annovar_infile, 'w') as handle:
        for d in filelib.read_row(summary_filename, skip=2, header=1):
            x = d.Chrom, d.Pos, d.Pos, d.Ref, d.Alt
            handle.write("\t".join(x) + "\n")

    cmd = alignlib.make_annovar_command(
        annovar_infile, log_filename, annovar_outstem, buildver,
        vcf_input=False)
    parallel.sshell(cmd)
    metadata["commands"] = [cmd]

    filelib.assert_exists_nz(log_filename)
    filelib.assert_exists_nz(multianno_file)

    matrix = SimpleVariantMatrix.read(summary_filename)
    annot_matrix = matrix.annot_matrix
    chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
    ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
    pos = [int(x) for x in pos]

    # Read in the multianno output file.
    pos2d = {}  # (chrom, start, ref, alt) -> d
    anno_header = None
    for d in filelib.read_row(multianno_file, header=1):
        key = d.Chr, int(d.Start), d.Ref, d.Alt
        assert key not in pos2d, "Duplicate pos: %s" % str(key)
        pos2d[key] = d
        if not anno_header:
            anno_header = d._header
    assert anno_header

    # Multianno starts with:
    # Chr Start End Ref Alt
    # Ignore these.
    assert anno_header[:5] == ["Chr", "Start", "End", "Ref", "Alt"]
    headers = anno_header[5:]

    # Look up the Annovar row for each variant once, rather than once
    # per annotation column.
    matched = [pos2d.get(coord) for coord in zip(chrom, pos, ref, alt)]

    all_annots = []
    for i in range(5, len(anno_header)):
        annots = []
        for d in matched:
            x = ""
            if d:
                x = d._cols[i]
            annots.append(x)
        all_annots.append(annots)

    x = AnnotationMatrix.create_from_annotations(headers, all_annots)
    matrix.named_matrices.insert(0, ("Annovar", x))

    SimpleVariantMatrix.write(out_filename, matrix)

    return metadata
def add_coverage_to_svm(svm_file, coverage_file, outfile, is_rna_cov):
    """Add per-sample coverage to a SimpleVariantMatrix file.

    svm_file       SimpleVariantMatrix file to read.
    coverage_file  Table with a header line: Chrom Pos <Sample> ...
                   (Pos is 1-based according to the original notes).
    outfile        Where the augmented SimpleVariantMatrix is written.
    is_rna_cov     If true, the new named matrix is called
                   "RNA Coverage" instead of "Coverage".

    Only the samples present in both files are reported.  Coverage
    values are copied as read from the file (no numeric conversion);
    positions or samples without a value get an empty string.  Fails
    with an AssertionError if the two files share no samples, or if
    the variant matrix lacks the "Chrom" or "Pos" annotation.
    """
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix

    # Load the variants and their coordinates.
    SVM = SimpleVariantMatrix.read(svm_file)
    AM = SVM.annot_matrix
    assert "Chrom" in AM.headers
    assert "Pos" in AM.headers
    CHROM = AM["Chrom"]
    POS = [int(p) for p in AM["Pos"]]

    # Load the coverage table.
    # (chrom, pos) -> sample -> coverage value (kept as a string)
    coord2sample2cov = {}
    cov_samples = set()   # samples with at least one coverage value
    for d in filelib.read_row(coverage_file, header=1):
        coord = d.Chrom, int(d.Pos)
        sample2cov = coord2sample2cov.setdefault(coord, {})
        # Columns 0 and 1 are Chrom and Pos; the rest are samples.
        for i in range(2, len(d._header)):
            cov = d._cols[i]
            if not cov:
                continue
            sample = d._header[i]
            sample2cov[sample] = cov
            cov_samples.add(sample)

    # Report the coverage for the samples at the intersection.
    SAMPLES = [s for s in SVM.samples if s in cov_samples]

    # Samples sequenced at low coverage may have no reads at these
    # positions, so be lenient: only require that some samples overlap.
    num_missing = len(SVM.samples) - len(SAMPLES)
    assert num_missing < len(SVM.samples), (
        "SimpleVariantMatrix and coverage file have "
        "no common samples.")

    # One row per variant, one column per shared sample.
    no_coverage = {}
    matrix = []
    for i in range(AM.num_annots()):
        sample2cov = coord2sample2cov.get((CHROM[i], POS[i]), no_coverage)
        matrix.append([sample2cov.get(s, "") for s in SAMPLES])

    # Add the matrix back to the simple variant matrix.
    headers = SAMPLES
    all_annots = jmath.transpose(matrix)
    name = "RNA Coverage" if is_rna_cov else "Coverage"
    cov_matrix = AnnotationMatrix.create_from_annotations(
        headers, all_annots)
    SVM.named_matrices.append((name, cov_matrix))

    # Write to file.
    SimpleVariantMatrix.write(outfile, SVM)