def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Align a signal file with an annotation file and merge them.

    antecedents is a (signal_node, annotation_node) pair; each node's
    `identifier` is a filename.  The two files are aligned on the
    ID_REF column with the external align_matrices script, and the
    merged table is written to outfile.

    Returns a metadata dict holding the shell command that was run.
    """
    import itertools
    from genomicode import config
    from genomicode import parallel
    from genomicode import filelib

    signal_node, annotation_node = antecedents
    signal_filename = signal_node.identifier
    annotation_filename = annotation_node.identifier
    filelib.assert_exists_nz(signal_filename)
    filelib.assert_exists_nz(annotation_filename)
    metadata = {}

    align_matrices = filelib.which_assert(config.align_matrices)

    # Make sure the signal_filename has an ID_REF header.
    header = next(filelib.read_cols(signal_filename))
    assert header[0] == "ID_REF", "Missing ID_REF header: %s" % \
        signal_filename

    signal_align_file = "signal.aligned.txt"
    annot_align_file = "annot.aligned.txt"

    # First, align the two files.  Every filename is quoted so that
    # paths containing spaces or shell metacharacters survive.
    sq = parallel.quote
    cmd = [
        sq(align_matrices),
        "--annot_file", sq(signal_filename),
        "--header", "ID_REF",
        "--annot_file", sq(annotation_filename),
        "--left_join",
        sq(signal_align_file), sq(annot_align_file),
        ]
    cmd = " ".join(cmd)
    parallel.sshell(cmd)
    metadata["command"] = cmd

    # Now merge them.  Take the first column of the expression file
    # (should be ID_REF), the whole annotation file, then the
    # remainder of the expression file.
    signal_handle = filelib.read_cols(signal_align_file)
    annot_handle = filelib.read_cols(annot_align_file)
    # Iterate lazily on Python 2 (izip); plain zip is lazy on Python 3.
    lazy_zip = getattr(itertools, "izip", zip)
    outhandle = open(outfile, 'w')
    try:
        for x1, x2 in lazy_zip(signal_handle, annot_handle):
            x = [x1[0]] + x2 + x1[1:]
            outhandle.write("\t".join(x) + "\n")
    finally:
        # Close even if a row fails to merge.
        outhandle.close()

    filelib.assert_exists_nz(outfile)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Draw one sorted bar plot per gene set into the directory outfile.

    The table named by antecedents.identifier is read, its first column
    is dropped, and it is transposed.  Row 0 of the result supplies the
    bar labels; every following row is plotted as <row name>.png, with
    bars sorted by value.
    """
    import os
    from genomicode import filelib
    from genomicode import jmath

    in_data = antecedents
    table = list(filelib.read_cols(in_data.identifier))
    table = jmath.transpose([row[1:] for row in table])
    sample = table[0][1:]
    score_rows = table[1:]

    if not os.path.exists(outfile):
        os.mkdir(outfile)

    for score_row in score_rows:
        ylabel = score_row[0]
        numbers = [float(v) for v in score_row[1:]]
        # Sort the bars by value, carrying the sample label along.
        ranked = sorted((v, sample[k]) for k, v in enumerate(numbers))
        gene_value = [v for (v, _) in ranked]
        label = [name for (_, name) in ranked]
        # Imported lazily, only once there is something to plot.
        from genomicode import mplgraph
        fig = mplgraph.barplot(
            gene_value, box_label=label, xtick_rotation=90,
            xlabel='sample', ylabel=ylabel)
        fig.savefig(os.path.join(outfile, ylabel) + '.png')

    assert filelib.exists_nz(outfile), (
        'the output file %s for plot_geneset_score_bar fails' % outfile)
def read_geneset_scores(filename):
    """Read the output from score_geneset.py and return a Matrix object.

    The file is transposed after reading, so each original column
    becomes one row:
        SAMPLE
        FILE
        [Score ...]
        [Direction ...]    name ends with " direction"
        [p value ...]      name ends with " pvalue"
        [significant ...]  name ends with " significant"
    Only the score rows are kept as data.  Values are converted with
    jmath.safe_float.

    Raises AssertionError if the file is missing, contains an empty
    column, or has fewer than two columns left after filtering.
    """
    import os
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import Matrix
    from arrayio import const
    from arrayio import tab_delimited_format as tdf

    assert os.path.exists(filename)
    matrix = list(filelib.read_cols(filename))
    matrix = jmath.transpose(matrix)
    assert matrix
    for row in matrix:
        assert row

    # Only want the scores.  Get rid of the direction, pvalue, and
    # significance lines in a single pass.
    metadata_suffixes = (" direction", " pvalue", " significant")
    matrix = [x for x in matrix if not x[0].endswith(metadata_suffixes)]

    # matrix[1] is inspected below, so make a short file fail with a
    # clear message instead of an IndexError.
    assert len(matrix) >= 2, \
        "Expected at least 2 columns in %s" % filename

    col_names = {}
    sample_row = 0
    if matrix[1][0].upper() == "SAMPLE":
        sample_row = 1
    col_names[tdf.SAMPLE_NAME] = matrix[sample_row][1:]
    row_names = {}
    row_names['geneset'] = []
    synonyms = {}
    synonyms[const.COL_ID] = tdf.SAMPLE_NAME
    data = []
    # The first two rows are treated as annotation, the rest as scores.
    for line in matrix[2:]:
        single_data = [jmath.safe_float(i) for i in line[1:]]
        data.append(single_data)
        row_names['geneset'].append(line[0])
    M = Matrix.InMemoryMatrix(
        data, row_names=row_names, col_names=col_names, synonyms=synonyms)
    return M
def label_control_probes(probe_ids, control_probe_file):
    """Return a copy of probe_ids with control probes marked by "AFFX".

    BFRM_Normalize expects control probes to start with "AFFX" in all
    upper case.  Control probes are the ones listed in
    control_probe_file if one is given.  Otherwise they are the probes
    whose ID starts with "AFFX" (case-insensitive), and if there are
    none of those, the probes in config.illumina_HUMANHT12_CONTROL.

    In the returned list every control probe starts with "AFFX" and no
    other probe does.  Raises AssertionError if a needed control probe
    file is missing or if no control probe is found among probe_ids.
    """
    # Imported locally like the other dependencies, so the function
    # does not rely on a module-level import of os.
    import os
    from genomicode import config
    from genomicode import filelib

    control_probes = {}
    # First, take a look to see if any affymetrix control probes
    # exist.
    for pid in probe_ids:
        upid = pid.upper()
        if upid.startswith("AFFX"):
            control_probes[upid] = 1

    # Use the probes from the control probe file if:
    # 1.  a control probe file is specified   OR
    # 2.  no affx probes exist (use a default control probe file).
    if not control_probes and not control_probe_file:
        control_probe_file = config.illumina_HUMANHT12_CONTROL
        assert os.path.exists(control_probe_file), \
            "I could not find any control probes."
    if control_probe_file:
        assert os.path.exists(control_probe_file), \
            "I could not find file: %s" % control_probe_file
        control_probes = {}
        for cols in filelib.read_cols(control_probe_file):
            for x in cols:
                control_probes[x.upper()] = 1

    probe_ids = probe_ids[:]
    found = False
    for i, pid in enumerate(probe_ids):
        upid = pid.upper()
        is_control_probe = upid in control_probes
        has_affx_prefix = upid.startswith("AFFX")
        if is_control_probe:
            found = True
            if not has_affx_prefix:
                # Hack: e.g. an Illumina control probe.  Prepend
                # "AFFX_" so that BFRM_Normalize will recognize it.
                pid = "AFFX_%s" % pid
            elif not pid.startswith("AFFX"):
                # The prefix is present but not in upper case (e.g.
                # "affx-...").  Upper-case just the prefix, since the
                # match has to be case-sensitive.
                pid = "AFFX" + pid[4:]
            assert pid.startswith("AFFX")
        else:
            if has_affx_prefix:
                # Not a control: mask the prefix out so that
                # BFRM_Normalize will not recognize it.
                pid = "AFF_" + pid[4:]
            assert not pid.startswith("AFFX")
        probe_ids[i] = pid
    assert found, "I could not find any control probes."
    return probe_ids
def read_fastqc_summary(filename):
    """Return a list of (<status>, <statistic>, <filename>) tuples.

    One tuple is produced per row of the file; every row must have
    exactly three columns.
    """
    import os
    from genomicode import filelib

    assert os.path.exists(filename)
    records = []
    for row in filelib.read_cols(filename):
        assert len(row) == 3
        records.append(tuple(row))
    return records
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Plot predictions as a bar chart and save it to outfile.

    The table named by antecedents.identifier needs a 'Confidence'
    column.  If every confidence is blank, or the table has a
    'Correct?' column, the 'Predicted_class' labels are plotted as
    integer codes numbered in order of first appearance.  Otherwise
    the numeric confidences are plotted.
    """
    from genomicode import mplgraph
    from genomicode import filelib

    in_data = antecedents
    matrix = list(filelib.read_cols(in_data.identifier))
    header = matrix[0]
    conf_index = header.index('Confidence')
    matrix = matrix[1:]
    sample = [row[0] for row in matrix]

    # Keep the confidences as strings until we know they will be
    # plotted.  Converting first would make the blank test impossible
    # to satisfy and would raise ValueError on blank cells.
    raw_confidence = [row[conf_index] for row in matrix]
    no_confidence = all(x == '' for x in raw_confidence)

    if no_confidence or 'Correct?' in header:
        class_index = header.index('Predicted_class')
        class_value = [row[class_index] for row in matrix]
        # Number the class labels 0, 1, 2, ... as they first appear.
        label_dict = {}
        label_list = []
        for label in class_value:
            if label not in label_dict:
                label_dict[label] = len(label_dict)
            label_list.append(label_dict[label])
        yticks = list(label_dict)
        ytick_pos = [label_dict[x] for x in yticks]
        fig = mplgraph.barplot(
            label_list, box_label=sample, ylim=(-0.5, 1.5),
            ytick_pos=ytick_pos, yticks=yticks,
            xtick_rotation='vertical', ylabel='Prediction',
            xlabel='Sample')
    else:
        confidence = [float(x) for x in raw_confidence]
        fig = mplgraph.barplot(
            confidence, box_label=sample, ylim=(-1.5, 1.5),
            xtick_rotation='vertical', ylabel='Prediction',
            xlabel='Sample')
    fig.savefig(outfile)

    assert filelib.exists_nz(outfile), (
        'the output file %s for plot_prediction_bar fails' % outfile
    )
def _convert_gene_ids_local(in_platform, out_platform):
    """Return a dict of gene_id -> list of converted_ids.

    The mapping is read from <in_platform>___<out_platform>.txt under
    config.convert_platform.  Each row is
        <in_id> <out_id1> ... <out_idn>
    Returns None if no such file exists, i.e. these platforms cannot
    be converted.
    """
    import os
    from genomicode import config
    from genomicode import filelib

    filelib.assert_exists_nz(config.convert_platform)
    basename = "%s___%s.txt" % (in_platform, out_platform)
    mapping_file = os.path.join(config.convert_platform, basename)
    if not os.path.exists(mapping_file):
        return None

    in2out = {}
    for row in filelib.read_cols(mapping_file):
        assert len(row) >= 2
        in2out[row[0]] = row[1:]
    return in2out
def merge_parsed_files(parsed_files, outfile):
    """Concatenate parsed_files into outfile, dropping duplicate lines.

    All files must have the same header row.  Because identical lines
    are written only once, the header appears a single time in the
    output.  Raises AssertionError if parsed_files is empty or the
    headers differ.
    """
    from genomicode import filelib

    # First, make sure each of the parsed files has the same header.
    assert parsed_files
    header = None
    for f in parsed_files:
        cols = next(filelib.read_cols(f))
        if not header:
            header = cols
        assert header == cols, "Mismatched headers"
    assert header

    seen = set()
    handle = open(outfile, 'w')
    try:
        for f in parsed_files:
            for line in filelib.openfh(f):
                # A file may end without a newline.  Add it so the line
                # is not fused with the next one and is still seen as
                # a duplicate of its terminated twin.
                if not line.endswith("\n"):
                    line += "\n"
                if line in seen:
                    continue
                seen.add(line)
                handle.write(line)
    finally:
        # Flush and close the output even if reading fails.
        handle.close()
def fix_cluster30_dup_header(filename):
    """Rename a duplicated "NAME" header in a Cluster30 output file.

    Cluster30 creates a file with "NAME" as the header for the third
    column.  If the infile also has a "NAME" column, then this will be
    duplicated.  When the second and third headers are both "NAME",
    the second is renamed to "NAME_" and the file is rewritten in
    place.  Otherwise the file is left untouched.
    """
    from genomicode import filelib

    filelib.assert_exists_nz(filename)
    matrix = list(filelib.read_cols(filename))
    assert matrix
    assert matrix[0]
    header = matrix[0]
    # GID  <COL0>  NAME  GWEIGHT  <COL1>  [<SAMPLES>...]
    assert len(header) >= 5

    if not (header[1] == "NAME" and header[2] == "NAME"):
        return
    # header is matrix[0], so this edits the row that gets written.
    header[1] = "NAME_"

    handle = open(filename, 'w')
    try:
        for row in matrix:
            handle.write("\t".join(row) + "\n")
    finally:
        # Close explicitly so the rewritten file is flushed to disk.
        handle.close()
def _read_vcf(filename):
    """Read a VCF file.

    Return a tuple of:
    - a list of lines.  Each line is a list of columns.
    - the index of the header row (the one starting with "#CHROM").
    - the sample names, i.e. the header columns after FORMAT.

    Raises AssertionError if there is no #CHROM row or if its first
    nine columns are not the expected fixed ones.
    """
    from genomicode import filelib

    lines = list(filelib.read_cols(filename))
    header_i = None
    for i, cols in enumerate(lines):
        # Guard against rows with no columns (e.g. blank lines).
        if cols and cols[0] == "#CHROM":
            header_i = i
            break
    assert header_i is not None, "Could not find #CHROM: %s" % filename

    header = lines[header_i]
    fixed_columns = [
        "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO",
        "FORMAT"
        ]
    assert header[:len(fixed_columns)] == fixed_columns, \
        "Unknown format: %s" % header
    samples = header[len(fixed_columns):]
    return lines, header_i, samples
def _make_intervallist_file(intervallist_file, features_bed, bam_filename):
    """Write an interval list file from a BED file and a BAM header.

    The output starts with the @HD and @SQ header lines reported by
    `samtools view -H bam_filename`, followed by one line per row of
    features_bed:
        chrom  chromStart (1-based)  chromEnd  strand  name
    """
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    # Get the header from the bam file before creating the output, so
    # a samtools failure does not leave an empty file or an open
    # handle behind.
    # samtools view -H <filename>
    samtools = filelib.which_assert(config.samtools)
    sq = parallel.quote
    cmd = " ".join([sq(samtools), "view", "-H", sq(bam_filename)])
    output = parallel.sshell(cmd)
    header_lines = [x.rstrip() for x in output.split("\n")]

    outhandle = open(intervallist_file, 'w')
    try:
        # Add the @HD and @SQ headers from the bam file.
        for line in header_lines:
            if line.startswith("@HD") or line.startswith("@SQ"):
                outhandle.write(line + "\n")

        # Add the information from the BED file.
        # BED       chrom chromStart (0-based) chromEnd name score strand
        # Interval  chrom chromStart (1-based) chromEnd strand name
        for cols in filelib.read_cols(features_bed):
            assert len(cols) >= 6
            chrom, chromStart0, chromEnd, name, _score, strand = cols[:6]
            chromStart0, chromEnd = int(chromStart0), int(chromEnd)
            chromStart1 = chromStart0 + 1
            x = chrom, chromStart1, chromEnd, strand, name
            outhandle.write("\t".join(map(str, x)) + "\n")
    finally:
        outhandle.close()
def list_snpeff_databases():
    """Return the database names listed by `snpEff.jar databases`.

    The names are taken from the "Genome" column of the command's
    output:
        Genome  Organism  Status  Bundle  Database download link
        ------  --------  ------  ------  ----------------------
    """
    import os
    import StringIO
    from genomicode import parallel
    from genomicode import filelib
    from Betsy import module_utils as mlib

    path = mlib.get_config("snp_eff_path", which_assert_file=True)
    snpeff = os.path.join(path, "snpEff.jar")
    filelib.assert_exists_nz(snpeff)

    sq = parallel.quote
    cmd = [
        "java", "-Xmx16g", "-jar", sq(snpeff),
        "databases",
        ]
    # Hand sshell one command string, as every other caller does.
    # NOTE(review): assumes sshell runs the string through a shell;
    # confirm against genomicode.parallel.
    cmd = " ".join(cmd)
    output = parallel.sshell(cmd)

    header = i_db = None
    databases = []
    for cols in filelib.read_cols(StringIO.StringIO(output)):
        cols = [x.strip() for x in cols]
        if header is None:
            # The first row names the columns.
            header = cols
            assert "Genome" in header
            i_db = header.index("Genome")
            continue
        assert len(cols) == len(header)
        # Skip the dashed separator line under the header.
        if cols[0].startswith("---"):
            continue
        databases.append(cols[i_db])
    return databases
def format_firehose_mirna(filename, output):
    """Reduce a Firehose miRNA table to one value column per sample.

    The input has two header rows.  After the ID column, each sample
    occupies three columns: read_count, reads_per_million_miRNA_mapped
    and cross-mapped.  The output keeps the ID column plus the
    reads_per_million_miRNA_mapped column of every sample.

    Raises AssertionError if the headers do not match this layout.
    """
    matrix = list(filelib.read_cols(filename))
    HYB_REF = "Hybridization REF"
    GENE_ID = "miRNA_ID"
    assert matrix
    assert len(matrix) >= 2, "Missing second header row"
    assert matrix[0][0] == HYB_REF
    assert matrix[1][0] == GENE_ID
    header0 = matrix[0]
    header1 = matrix[1]

    # Check the width first so a partial trailing group gives a clear
    # message instead of an IndexError in the loop below.
    assert (len(header1) - 1) % 3 == 0, \
        "Expected 3 columns per sample: %s" % filename
    for i in range(1, len(header1), 3):
        assert header1[i] == "read_count"
        assert header1[i + 1] == "reads_per_million_miRNA_mapped"
        assert header1[i + 2] == "cross-mapped"

    # Columns 2, 5, 8, ... hold the reads-per-million values.
    sample_name = header0[2::3]
    header = ["miRNA ID"] + sample_name

    f = open(output, 'w')
    try:
        f.write("\t".join(header) + '\n')
        for row in matrix[2:]:
            x = [row[0]] + row[2::3]
            assert len(x) == len(header)
            f.write("\t".join(x) + '\n')
    finally:
        # Close even when a row fails the width check.
        f.close()
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Add missing gene ID and gene symbol annotations to a matrix.

    The platforms already in the matrix are identified.  For each of
    GENE_ID and GENE_SYMBOL that none of them provides, the first
    known platform of that category for the same organism is added by
    running the external annotate_matrix script.  The result is
    written to outfile with prettified, unique headers.

    Returns a metadata dict.  It holds the commands run, if any.
    """
    import StringIO
    import arrayio
    from genomicode import arrayplatformlib
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from Betsy import module_utils as mlib

    M = arrayio.read(in_data.identifier)
    metadata = {}

    # Add GENE_ID, GENE_SYMBOL, and DESCRIPTION.  Figure out which
    # platforms provide each one of this.
    CATEGORIES = [
        arrayplatformlib.GENE_ID,
        arrayplatformlib.GENE_SYMBOL,
        # biomaRt doesn't convert description.  So just ignore it
        # for now.
        # TODO: implement DESCRIPTION.
        #arrayplatformlib.DESCRIPTION,
        ]

    scores = arrayplatformlib.score_matrix(M)
    scores = [x for x in scores if x.max_score >= 0.75]
    assert scores, "I could not identify any platforms."

    # Find all the platforms not in the matrix.
    platforms = [
        arrayplatformlib.find_platform_by_name(x.platform_name)
        for x in scores]
    categories = [x.category for x in platforms]
    missing = [x for x in CATEGORIES if x not in categories]

    score = scores[0]
    platform = platforms[0]
    to_add = []  # list of platform names
    for category in missing:
        candidates = [
            p for p in arrayplatformlib.PLATFORMS
            if p.category == category
            and p.bm_organism == platform.bm_organism
            and p.name != score.platform_name]
        # Take the first one, if any.
        if candidates:
            to_add.append(candidates[0].name)

    if to_add:
        annotate = mlib.get_config(
            "annotate_matrix", which_assert_file=True)
        sq = parallel.quote
        cmd = [
            "python",
            sq(annotate),
            "--no_na",
            "--header", sq(score.header),
            ]
        for name in to_add:
            cmd.extend(["--platform", sq(name)])
        cmd.append(in_data.identifier)
        cmd = " ".join(cmd)
        data = parallel.sshell(cmd)
        metadata["commands"] = [cmd]
        assert data.find("Traceback") < 0, data
    else:
        # Nothing to add.  Pass the input through unchanged.
        inhandle = open(in_data.identifier)
        try:
            data = inhandle.read()
        finally:
            inhandle.close()

    # Clean up the headers.
    platform2pretty = {
        "Entrez_ID_human": "Gene ID",
        "Entrez_Symbol_human": "Gene Symbol",
        "Entrez_ID_mouse": "Gene ID",
        "Entrez_Symbol_mouse": "Gene Symbol",
        }
    handle = open(outfile, 'w')
    try:
        header_written = False
        for cols in filelib.read_cols(StringIO.StringIO(data)):
            if not header_written:
                cols = [platform2pretty.get(x, x) for x in cols]
                cols = AnnotationMatrix.uniquify_headers(cols)
                header_written = True
            handle.write("\t".join(cols) + "\n")
    finally:
        # Close explicitly so outfile is fully written on return.
        handle.close()

    return metadata
def read_as_am(filename, is_csv=False):
    """Read a file in SVM format and return an AnnotationMatrix.

    Does no special processing on any columns (i.e. no parsing as
    integers or Call objects).  Everything is a string.

    The file has three header rows.  The returned matrix has one
    column per file column, named <header0>___<header1>___<header2>
    after the blanks in header0 and header1 are filled in.  E.g.
    "Annovar" occurs in each Annovar column in header0.

    Headers:
        ______Chrom
        ______Pos
        ______Ref
        ______Alt
        Num Callers______<Sample>
        ...

    The matrix also gets `samples` and `callers` attributes, collected
    from the "Ref/Alt/VAF" columns.
    """
    from genomicode import filelib
    from genomicode import AnnotationMatrix

    def unique_nonblank(values):
        # Drop blanks and duplicates, preserving order.
        kept = []
        for value in values:
            if value and value not in kept:
                kept.append(value)
        return kept

    delimiter = "," if is_csv else "\t"
    matrix = list(filelib.read_cols(filename, delimiter=delimiter))

    assert len(matrix) >= 3  # at least 3 rows for the header
    width = len(matrix[0])
    for row in matrix[1:]:
        assert len(row) == width
    assert width >= 4  # Chrom, Pos, Ref, Alt
    assert width >= 5, "No calls"

    header0, header1, header2 = matrix[0], matrix[1], matrix[2]
    assert header2[:4] == ["Chrom", "Pos", "Ref", "Alt"]

    # Fill in the blanks for header1.  This must come before header0
    # is filled: a blank header1 under a non-blank header0 starts a
    # new "block" and must stay blank.
    for i in range(1, len(header1)):
        if not header1[i] and not header0[i]:
            header1[i] = header1[i - 1]
    # Fill in the blanks for header0.
    for i in range(1, len(header0)):
        if not header0[i]:
            header0[i] = header0[i - 1]

    # The samples and callers come from the call columns.
    call_columns = [
        i for (i, name) in enumerate(header2) if name == "Ref/Alt/VAF"]
    assert call_columns
    samples = unique_nonblank([header0[i] for i in call_columns])
    callers = unique_nonblank([header1[i] for i in call_columns])

    headers = ["___".join(x) for x in zip(header0, header1, header2)]
    data_rows = matrix[3:]
    all_annots = [
        [row[j] for row in data_rows] for j in range(len(headers))]

    am = AnnotationMatrix.create_from_annotations(headers, all_annots)
    am.samples = samples
    am.callers = callers
    return am
def main():
    """Render `samtools tview` alignments as HTML files.

    One HTML file is generated for every combination of BAM file and
    position.  The commands are run in parallel, or just printed with
    --dry_run.  Requires samtools version 1 or later.
    """
    import os
    import argparse
    import itertools
    from genomicode import filelib
    from genomicode import config
    from genomicode import parallel
    from genomicode import alignlib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("reference_genome", help="fasta file")

    parser.add_argument(
        "-j", dest="num_procs", type=int, default=1,
        help="Number of jobs to run in parallel.")
    parser.add_argument(
        "--dry_run", action="store_true",
        help="Just display the commands, and don't generate the alignment.")
    parser.add_argument(
        "--window", default=80, type=int,
        help="Number of bases in alignment. Default: 80")

    group = parser.add_argument_group(title="Input")
    group.add_argument("--bam_file", help="Indexed BAM file.")
    group.add_argument("--bam_path", help="Path to BAM files.")
    group.add_argument(
        "--position", action="append", default=[],
        help="Specify a position to view, "
        "e.g. chr20:45,927,663 or chr20:45927663. 1-based coordinates")
    group.add_argument(
        "--position_file",
        help="Tab-delimited text file with two columns. "
        "Column 1 is chromosome, column 2 is position.")

    group = parser.add_argument_group(title="Output")
    group.add_argument(
        "--prefix", help="Pre-pend a prefix to each outfile.")
    group.add_argument(
        "--outpath",
        help="If multiple alignments are generated, this option "
        "directs where to save the output files.")
    group.add_argument(
        "--noclobber", action="store_true",
        help="If an output file already exists, don't overwrite it.")

    # Parse the input arguments.
    args = parser.parse_args()
    filelib.assert_exists_nz(args.reference_genome)
    assert args.bam_file or args.bam_path, \
        "Either --bam_file or --bam_path must be provided."
    assert not (args.bam_file and args.bam_path), \
        "Cannot specify both --bam_file or --bam_path."
    if args.bam_file:
        filelib.assert_exists_nz(args.bam_file)
    if args.bam_path:
        assert os.path.exists(args.bam_path)
    if args.position_file:
        filelib.assert_exists_nz(args.position_file)
    if args.outpath and not os.path.exists(args.outpath):
        os.mkdir(args.outpath)
    if args.num_procs < 1 or args.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    assert args.window >= 1 and args.window < 500

    if args.bam_file:
        bam_filenames = [args.bam_file]
    else:
        bam_filenames = [
            os.path.join(args.bam_path, f)
            for f in os.listdir(args.bam_path) if f.endswith(".bam")]
    assert bam_filenames, "No bam files found."

    positions = []  # list of (chrom, pos)
    for x in args.position:
        chrom, pos = _parse_position(x)
        positions.append((chrom, pos))
    if args.position_file and os.path.exists(args.position_file):
        for cols in filelib.read_cols(args.position_file):
            assert len(cols) == 2, "Position file should have 2 columns"
            chrom, pos = cols
            pos = int(pos)
            assert pos >= 1
            positions.append((chrom, pos))
    assert positions, "No positions specified."

    # Make the commands.
    assert hasattr(config, "samtools")
    filelib.assert_exists(config.samtools)

    # Make sure we have the right version of samtools.
    # 1.2 (using htslib 1.2.1)
    # 0.1.18 (r982:295)
    version = alignlib.get_samtools_version()
    x = version.split(".")
    assert len(x) >= 2
    major = x[0]
    assert major in ["0", "1"], "Unknown samtools version: %s" % version
    major = int(major)
    assert major >= 1, \
        "Requires samtools >= 1 (Current version: %s)" % version

    sq = parallel.quote
    commands = []
    for bam_filename, (chrom, pos) in itertools.product(
            bam_filenames, positions):
        p, f = os.path.split(bam_filename)
        sample, e = os.path.splitext(f)

        # Put pos in the middle of the window.  Use // so the start
        # stays an integer under Python 3 as well.
        left = max(pos - args.window // 2, 1)
        pos_str = "%s:%s" % (chrom, left)

        x = "%2s.%9s.%s.html" % (chrom, pos, sample)
        if args.prefix:
            x = "%s.%s" % (args.prefix, x)
        if args.outpath:
            x = os.path.join(args.outpath, x)
        out_filename = x

        if args.noclobber and os.path.exists(out_filename):
            continue

        # samtools tview -d h -p 7:100550778 bam01/196B-lung.bam $FA
        x = [
            sq(config.samtools),
            "tview",
            "-d", "h",
            "-p", pos_str,
            sq(bam_filename),
            sq(args.reference_genome),
            ]
        x = " ".join(x)
        # Send stdout and stderr to the outfile.  Use the POSIX form,
        # because ">& file" is a csh/bash extension that a plain
        # /bin/sh such as dash rejects.
        x = "%s > %s 2>&1" % (x, sq(out_filename))
        commands.append(x)

    if args.dry_run:
        for x in commands:
            print(x)
        return
    parallel.pshell(commands, max_procs=args.num_procs)
def merge_rppa_files(in_files, out_file):
    """Copy the RPPA data file from in_files to out_file.

    in_files must hold exactly two files, one ending with
    ".antibody_annotation.txt" and one ending with ".rppa.txt".  Only
    the data file is used.  It contains all the information we need,
    because its row names already join the gene name and the antibody
    with "|":

    OV.antibody_annotation.txt
        Gene Name   Composite Element REF
        YWHAB       14-3-3_beta
        EIF4EBP1    4E-BP1_pS65

    OV.rppa.txt
        Composite.Element.REF    TCGA-04-1335-01A-21-1561-20
        YWHAB|14-3-3_beta        -0.00855276625000018
        EIF4EBP1|4E-BP1_pS65     0.00776960074999994
    """
    import shutil

    assert len(in_files) == 2
    annotation_files = [
        x for x in in_files if x.endswith(".antibody_annotation.txt")]
    data_files = [x for x in in_files if x.endswith(".rppa.txt")]
    assert len(annotation_files) == 1
    assert len(data_files) == 1

    # The code that used to merge the two tables sat after a bare
    # return and could never run, so it has been removed.
    shutil.copy2(data_files[0], out_file)
def extract_signal(filename, outhandle):
    """Write the signal table of a series matrix file to outhandle.

    The rows between "!series_matrix_table_begin" and
    "!series_matrix_table_end" are collected.  A file may contain
    several tables, each starting with an ID_REF row.  They are merged
    side by side on the ID, with blanks where a table lacks an ID.

    Intermediate data is kept in temporary files in the current
    directory to handle large data sets.  They are removed on exit.
    """
    import os
    import tempfile
    from genomicode import filelib

    # Write stuff to file to handle large data sets.
    tmpfile1 = tmpfile2 = tmpfile3 = None
    table_files = []  # the tmpfile2.<num> files created so far
    try:
        # tmpfile1        Raw signal data from series matrix file.
        # tmpfile2.<num>  Raw data split into separate tables.
        # tmpfile3        Final merged signal table.
        fd, tmpfile1 = tempfile.mkstemp(dir=".")
        os.close(fd)
        fd, tmpfile2 = tempfile.mkstemp(dir=".")
        os.close(fd)
        fd, tmpfile3 = tempfile.mkstemp(dir=".")
        os.close(fd)

        # Get a list of all lines in the series matrix tables.
        handle = open(tmpfile1, 'w')
        try:
            in_matrix_table = 0
            for cols in filelib.read_cols(filename):
                # Some files can have blank lines.
                if not cols:
                    continue
                if cols[0] == "!series_matrix_table_begin":
                    in_matrix_table = 1
                elif cols[0] == "!series_matrix_table_end":
                    in_matrix_table = 0
                elif in_matrix_table:
                    cols = [remove_quotes(x).strip() for x in cols]
                    handle.write("\t".join(cols) + "\n")
        finally:
            handle.close()

        # Split the data into separate tables.
        handle = None
        try:
            for line in filelib.openfh(tmpfile1):
                if line.startswith("ID_REF"):
                    # A new table starts.  Finish the previous one.
                    if handle:
                        handle.close()
                    table_file = "%s.%d" % (tmpfile2, len(table_files))
                    handle = open(table_file, 'w')
                    table_files.append(table_file)
                assert handle
                handle.write(line)
        finally:
            if handle:
                handle.close()
        num_tables = len(table_files)
        assert num_tables

        # Sometimes the tables will not be aligned.
        # E.g. GSE9899-GPL570 contains two tables, and the 2nd is
        # missing some probe sets.  Get a list of the probe sets in
        # the tables.
        matrices = [FileMatrix(x) for x in table_files]
        id2indexes = []
        for matrix in matrices:
            id2index = {}
            for i, row in enumerate(matrix):
                id2index[row[0]] = i
            id2indexes.append(id2index)

        # Make a list of all the IDs, with the header row first.
        all_ids = set()
        for id2index in id2indexes:
            all_ids.update(id2index)
        all_ids.remove("ID_REF")
        all_ids = ["ID_REF"] + sorted(all_ids)

        # Merge all the pieces together into one big table.
        handle = open(tmpfile3, 'w')
        try:
            for id_ in all_ids:
                cols = []
                for k, (matrix, id2index) in enumerate(
                        zip(matrices, id2indexes)):
                    if id_ in id2index:
                        x = matrix[id2index[id_]]
                    else:
                        # If this ID is missing, then just insert
                        # blank values.  Keep the ID itself, so the
                        # row is still named when the first table is
                        # the one that lacks it.
                        x = [id_] + [""] * (len(matrix[0]) - 1)
                    if k > 0:
                        # If this is not the first matrix, then delete
                        # the row names.
                        x = x[1:]
                    cols.extend(x)
                handle.write("\t".join(cols) + "\n")
        finally:
            handle.close()

        num_rows = len(all_ids)
        num_cols = len(next(filelib.read_cols(tmpfile3)))

        # Make sure every line of the merged table is aligned before
        # anything is written out.
        for i, cols in enumerate(filelib.read_cols(tmpfile3)):
            assert len(cols) == num_cols, "line %d unaligned [%d:%d]" % (
                i, len(cols), num_cols)

        # No samples or genes are filtered out, so besides the row and
        # column names there must be at least one of each.
        assert num_cols > 1, "no data"
        assert num_rows > 1, "no data"

        # Write out the data.
        for i, cols in enumerate(filelib.read_cols(tmpfile3)):
            if i >= num_rows:
                break
            outhandle.write("\t".join(cols) + "\n")
    finally:
        # Remove the temporary files, whether or not we succeeded.
        for path in [tmpfile1, tmpfile2, tmpfile3] + table_files:
            if path and os.path.exists(path):
                os.unlink(path)
def read(filename, is_csv=False):
    """Read a variant matrix file and return make_matrix(...) of it.

    Everything are strings.  No numeric conversion, except that the
    position is converted to an int and each non-blank call is parsed
    with _parse_call.

    The file has three header rows.  The leading columns, up to the
    first non-blank header0 cell after column 0, are annotations and
    must start with Chrom, Pos, Ref, Alt.  A non-blank header0 cell
    outside the "Ref/Alt/VAF" columns starts a named matrix.  The
    "Ref/Alt/VAF" columns hold the calls, labeled with the sample in
    header0 and the caller in header1.
    """
    from genomicode import filelib

    def unique_nonblank(values):
        # Drop blanks and duplicates, preserving order.
        kept = []
        for value in values:
            if value and value not in kept:
                kept.append(value)
        return kept

    delimiter = "," if is_csv else "\t"
    matrix = list(filelib.read_cols(filename, delimiter=delimiter))

    assert len(matrix) >= 3  # at least 3 rows for the header
    width = len(matrix[0])
    for row in matrix[1:]:
        assert len(row) == width
    assert width >= 4  # Chrom, Pos, Ref, Alt
    assert width >= 5, "No calls"

    header0, header1, header2 = matrix[0], matrix[1], matrix[2]
    body = matrix[3:]
    assert header2[:4] == ["Chrom", "Pos", "Ref", "Alt"]

    # The samples and callers come from the call columns.
    I = [i for (i, name) in enumerate(header2) if name == "Ref/Alt/VAF"]
    assert I
    samples = unique_nonblank([header0[i] for i in I])
    callers = unique_nonblank([header1[i] for i in I])

    # Figure out where the annotations end.
    annot_end = None
    for i in range(1, len(header0)):
        if header0[i]:
            annot_end = i
            break
    if annot_end is None:
        raise AssertionError("No calls")

    # Make the annotation matrix.
    annot_header = header2[:annot_end]
    annot_data = [row[:annot_end] for row in body]

    # Find the coordinates of the named matrices.  Each one runs up to
    # the start of the next, and the last runs up to the first call
    # column.
    I_named = [
        i for (i, name) in enumerate(header0) if name and i not in I]
    I_coord = list(zip(I_named, I_named[1:] + [I[0]]))

    # Make the named matrices.
    named_data = []  # list of (name, named_header, named_annots)
    for (i_start, i_end) in I_coord:
        name = header0[i_start]
        assert name
        named_header = header2[i_start:i_end]
        block = [row[i_start:i_end] for row in body]
        # One list of annotations per column of the block.
        named_annots = [
            [block_row[j] for block_row in block]
            for j in range(len(named_header))]
        named_data.append((name, named_header, named_annots))

    # Make the call_data.
    # A blank sample cell inherits the sample of the column before.
    header_samples = [None] * len(header0)
    for i in I:
        if header0[i]:
            header_samples[i] = header0[i]
        else:
            header_samples[i] = header_samples[i - 1]
        assert header_samples[i]
    header_callers = [None] * len(header1)
    for i in I:
        header_callers[i] = header1[i]
        assert header_callers[i]

    call_data = []
    for row in body:
        chrom, pos, ref, alt = row[:4]
        pos = int(pos)
        for j in I:
            if not row[j]:
                continue
            call = _parse_call(row[j])
            call_data.append(
                (chrom, pos, ref, alt, header_samples[j],
                 header_callers[j], call))

    return make_matrix(
        samples, callers, annot_header, annot_data, named_data, call_data)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run `samtools mpileup` on each BAM file at the given positions.

    antecedents is a (bam_node, ref_node, pos_node) triple.  For every
    BAM file found, <sample>.pileup and <sample>.log are written into
    out_path.

    Returns a metadata dict holding the commands that were run.
    """
    import os
    from genomicode import config
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import filelib
    from Betsy import module_utils

    bam_node, ref_node, pos_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Positions file has 0-based coordinates (like BAM files).
    # But samtools requires 1-based coordinates.  Convert to
    # 1-based coordinates.
    positions_filename = "positions.txt"
    outhandle = open(positions_filename, 'w')
    try:
        for x in filelib.read_cols(pos_node.identifier):
            assert len(x) == 2
            chrom, pos = x
            pos = int(pos) + 1  # convert from 0- to 1-based coords.
            outhandle.write("%s\t%s\n" % (chrom, pos))
    finally:
        # Close even if a row is malformed.
        outhandle.close()

    # list of (in_filename, err_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        sample, ext = os.path.splitext(f)
        err_filename = os.path.join(out_path, "%s.log" % sample)
        out_filename = os.path.join(out_path, "%s.pileup" % sample)
        x = filelib.GenericObject(
            in_filename=in_filename,
            err_filename=err_filename,
            out_filename=out_filename)
        jobs.append(x)

    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["all", "snp", "indel", "consensus"]

    # samtools mpileup -l freq04.txt -R -B -q 0 -Q 0 -d10000000 \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fasta \
    #   $i > $j"
    samtools = filelib.which_assert(config.samtools)

    # Get an error if the BAM files are not indexed.
    # [W::bam_hdr_read] EOF marker is absent. The input is probably
    # truncated.
    args = [
        "-R",  # Ignore read group tags.
        "-B",  # Disable BAQ (base quality) computation.
        "-q", 0,  # Skip bases with mapQ smaller than this.
        "-Q", 0,  # Skip bases with BAQ smaller than this.
        "-d10000000",  # Allow deep reads.
        ]

    sq = parallel.quote
    commands = []
    for j in jobs:
        x = [
            sq(samtools),
            "mpileup",
            "-f", sq(ref.fasta_file_full),
            "-l", sq(positions_filename),
            ]
        x.extend(args)
        x.append(sq(j.in_filename))
        x = " ".join(map(str, x))
        # Quote the redirect targets too.  They are built from the
        # BAM file names, which may contain spaces.
        x = "%s 2> %s 1> %s" % (x, sq(j.err_filename), sq(j.out_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] = commands

    # File may be empty if there are no reads.
    x = [x.out_filename for x in jobs]
    filelib.assert_exists_many(x)

    # Make sure there's no errors in the log files.
    for j in jobs:
        check_log_file(j.err_filename)

    return metadata