def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--skip-all-nan', dest='skip_nan', action='store_true')
    parser.add_argument('--skip-all-zero', dest='skip_zero', action='store_true')
    parser.add_argument('--libs', dest="library_ids", default=None)
    parser.add_argument('--transcripts', dest="transcript_ids", default=None)
    parser.add_argument('input_dir')
    parser.add_argument('output_dir')
    args = parser.parse_args()
    if not os.path.exists(args.input_dir):
        parser.error("input directory '%s' not found" % (args.input_dir))
    if os.path.exists(args.output_dir):
        parser.error("output directory '%s' already exists" % (args.output_dir))
    # open matrix
    bm = BigCountMatrix.open(args.input_dir)
    # get library and transcript ids
    if args.library_ids is not None:
        library_ids = set([line.strip() for line in open(args.library_ids)])
    else:
        library_ids = set()
    if args.transcript_ids is not None:
        transcript_ids = set([line.strip() for line in open(args.transcript_ids)])
    else:
        transcript_ids = set(bm.rownames)
    if args.skip_nan or args.skip_zero:
        logging.debug('Checking matrix for rows of all zero and/or nan')
        skip_ids = set(find_transcripts_to_skip(args.input_dir, args.skip_nan,
                                                args.skip_zero))
        transcript_ids.difference_update(skip_ids)
    logging.debug('Creating subset with %d transcripts' % (len(transcript_ids)))
    bm.copy(args.output_dir, transcript_ids, library_ids)
    bm.close()
def worker(args):
    (input_path, matrix_dir, meta, fdr_thresholds) = args
    bm = BigCountMatrix.open(matrix_dir)
    ss_compname = os.path.basename(input_path)
    results_file = os.path.join(input_path, Config.RESULTS_JSON_FILE)
    sigup = [set() for x in fdr_thresholds]
    sigdn = [set() for x in fdr_thresholds]
    # extract data
    n = 0
    for res in parse_results(results_file):
        n += 1
        if (n % 10000) == 0:
            logging.debug('%s parsed %d' % (ss_compname, n))
        transcript_id = bm.rownames[res.t_id]
        if (meta is not None) and (transcript_id not in meta):
            continue
        for i, fdr_threshold in enumerate(fdr_thresholds):
            if res.ss_fdr_q_value > fdr_threshold:
                continue
            if res.ss_frac > 0:
                sigup[i].add(res.t_id)
            else:
                sigdn[i].add(res.t_id)
    bm.close()
    return (ss_compname, sigup, sigdn)
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--na-value', dest='matrix_na_values',
                        default=[DEFAULT_NA_VALUE], action='append',
                        help='Value to interpret as missing/invalid '
                        'in weight matrix [default=%(default)s]')
    parser.add_argument('input_tsv_file')
    parser.add_argument('output_dir')
    # parse args
    args = parser.parse_args()
    input_tsv_file = args.input_tsv_file
    output_dir = args.output_dir
    matrix_na_values = args.matrix_na_values
    # check args
    if not os.path.exists(input_tsv_file):
        parser.error('Input file "%s" not found' % (input_tsv_file))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # convert matrix
    logging.info("Converting text matrix file to binary format")
    bm = BigCountMatrix.from_tsv(input_tsv_file, output_dir,
                                 na_values=matrix_na_values)
    logging.info("Estimating size factors")
    bm.estimate_size_factors('deseq')
    bm.close()
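# BigCountMatrix.estimate_size_factors('deseq') delegates to the library. As a
# minimal, self-contained sketch of the DESeq "median of ratios" idea it uses
# (for illustration only -- this helper is an assumption, not the package's
# actual implementation):
def _sketch_deseq_size_factors(counts):
    import numpy as np
    # counts: 2D array (transcripts x libraries) of raw counts
    # use only rows where every library has a positive count
    positive_rows = np.all(counts > 0, axis=1)
    log_counts = np.log(counts[positive_rows])
    # geometric mean of each row, in log space
    log_geo_means = log_counts.mean(axis=1)
    # size factor per library = median ratio of its counts to the row geometric means
    log_ratios = log_counts - log_geo_means[:, np.newaxis]
    return np.exp(np.median(log_ratios, axis=0))

# e.g. _sketch_deseq_size_factors(np.array([[10., 20.], [100., 200.], [5., 10.]]))
# returns roughly [0.71, 1.41]: the second library is sequenced twice as deeply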
def main():
    parser = argparse.ArgumentParser()
    # parser.add_argument('--colmeta', dest='col_metadata_file',
    #                     help='file containing metadata corresponding to each '
    #                     'column of the weight matrix file')
    # parser.add_argument('--rowmeta', dest='row_metadata_file',
    #                     help='file containing metadata corresponding to each '
    #                     'row of the weight matrix file')
    parser.add_argument("-r", dest="row", action="store_true", default=False,
                        help="Print row_meta JSONs")
    parser.add_argument("-c", dest="col", action="store_true", default=False,
                        help="Print col_meta JSONs")
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    # check command line args
    matrix_dir = os.path.abspath(args.matrix_dir)
    col_metadata_file = os.path.join(matrix_dir, 'colmeta.tsv')
    row_metadata_file = os.path.join(matrix_dir, 'rowmeta.tsv')
    if not os.path.exists(col_metadata_file):
        parser.error("Column metadata file '%s' not found" % (col_metadata_file))
    if not os.path.exists(row_metadata_file):
        parser.error("Row metadata file '%s' not found" % (row_metadata_file))
    if not os.path.exists(args.matrix_dir):
        parser.error('matrix path "%s" not found' % (args.matrix_dir))
    # col_metadata_file = os.path.abspath(args.col_metadata_file)
    # row_metadata_file = os.path.abspath(args.row_metadata_file)
    # open matrix
    bm = BigCountMatrix.open(matrix_dir)
    if bm.size_factors is None:
        parser.error("Size factors not found in count matrix")
    # read metadata
    logging.info("Reading row metadata")
    row_metadata = list(Metadata.parse_tsv(row_metadata_file, bm.rownames))
    logging.info("Reading column metadata")
    col_metadata = list(Metadata.parse_tsv(col_metadata_file, bm.colnames))
    # pipe row metadata into mongoimport
    if args.row:
        logging.debug("Importing row metadata")
        for m in row_metadata:
            print >>sys.stdout, m.to_json()
    if args.col:
        logging.debug("Importing column metadata")
        for m in col_metadata:
            print >>sys.stdout, m.to_json()
    # cleanup
    bm.close()
def db_ss_printJSON(ssea_dir, matrix_dir, ss_id):
    sample_sets_json_file = os.path.join(ssea_dir, 'sample_set.json')
    bm = BigCountMatrix.open(matrix_dir)
    samples = bm.colnames
    ss = SampleSet.parse_json(sample_sets_json_file)[0]
    membership = ss.get_array(samples)
    d = ss.to_dict(membership)
    d['_id'] = int(ss_id)
    print json.dumps(d)
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('metadata_file')
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    # get args
    metadata_file = args.metadata_file
    matrix_dir = args.matrix_dir
    bm = BigCountMatrix.open(matrix_dir)
    # read transcript lengths
    logging.debug('Reading transcript_length')
    lengths = {}
    with open(metadata_file) as f:
        header_fields = f.next().strip().split('\t')
        t_id_ind = header_fields.index('transcript_id')
        length_ind = header_fields.index('transcript_length')
        for line in f:
            fields = line.strip().split('\t')
            t_id = fields[t_id_ind]
            length = int(fields[length_ind])
            lengths[t_id] = float(length) / 1000.0
    if not set(lengths.keys()).issuperset(bm.rownames):
        parser.error('Metadata does not contain all transcripts in matrix')
    # get total counts per library
    logging.debug('Getting total counts per library')
    lib_sizes = np.empty(bm.shape[1], dtype=np.float)
    for j in xrange(bm.shape[1]):
        a = bm.counts_t[j,:]
        a = a[np.isfinite(a)]
        lib_sizes[j] = a.sum()
    lib_sizes /= 1.0e6
    # normalize
    logging.debug('Normalizing and summarizing counts per transcript')
    print '\t'.join(['transcript_id', 'exprtot', 'exprmean', 'exprmedian',
                     'exprmax', 'expr9999', 'expr999', 'expr99', 'expr95',
                     'expr90'])
    for i in xrange(bm.shape[0]):
        t_id = bm.rownames[i]
        if t_id not in lengths:
            logging.warning('Transcript %s not found in metadata' % (t_id))
            continue
        length = lengths[t_id]
        a = bm.counts[i,:]
        valid = np.isfinite(a)
        anorm = (a[valid] / lib_sizes[valid]) / length
        # get stats
        fields = [t_id, np.sum(anorm), np.mean(anorm), np.median(anorm),
                  np.max(anorm), np.percentile(anorm, 99.99),
                  np.percentile(anorm, 99.9), np.percentile(anorm, 99),
                  np.percentile(anorm, 95), np.percentile(anorm, 90)]
        print '\t'.join(map(str, fields))
    bm.close()
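# A minimal worked example of the normalization above: divide raw counts by the
# library size in millions of fragments, then by transcript length in kb. The
# numbers are made up for illustration and this helper is not part of the pipeline.
def _example_norm_counts_per_kb():
    import numpy as np
    counts = np.array([200.0, 50.0, np.nan])    # raw fragment counts in 3 libraries
    lib_sizes = np.array([20.0, 5.0, 10.0])     # library sizes, millions of fragments
    length_kb = 2.0                             # transcript length in kb
    valid = np.isfinite(counts)
    anorm = (counts[valid] / lib_sizes[valid]) / length_kb
    print(anorm)                      # [ 5.  5.] -- both libraries agree after normalization
    print(np.percentile(anorm, 95))   # 5.0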
def stats_parallel(input_paths, matrix_dir, transcripts, prefix,
                   fdr_thresholds, num_processes):
    tasklist = []
    for input_path in input_paths:
        tasklist.append((input_path, matrix_dir, transcripts, fdr_thresholds))
    # create pool
    pool = Pool(processes=num_processes)
    result_iter = pool.imap_unordered(worker, tasklist)
    sigup = [set() for x in fdr_thresholds]
    sigdn = [set() for x in fdr_thresholds]
    sigall = [set() for x in fdr_thresholds]
    bm = BigCountMatrix.open(matrix_dir)
    nrows = len(bm.rownames)
    filename = prefix + '.txt'
    with open(filename, 'w') as f:
        header_fields = ['ss_compname', 'dir', 'fdr', 'count']
        print >>f, '\t'.join(header_fields)
        for ss_compname, ss_sigup, ss_sigdn in result_iter:
            for i, fdr_threshold in enumerate(fdr_thresholds):
                fields = [ss_compname, 'up', '%.1e' % (fdr_threshold),
                          len(ss_sigup[i])]
                print >>f, '\t'.join(map(str, fields))
                fields = [ss_compname, 'dn', '%.1e' % (fdr_threshold),
                          len(ss_sigdn[i])]
                print >>f, '\t'.join(map(str, fields))
                ss_sigall = ss_sigup[i].union(ss_sigdn[i])
                fields = [ss_compname, 'both', '%.1e' % (fdr_threshold),
                          len(ss_sigall)]
                print >>f, '\t'.join(map(str, fields))
                num_none = nrows - len(ss_sigall)
                fields = [ss_compname, 'none', '%.1e' % (fdr_threshold),
                          num_none]
                print >>f, '\t'.join(map(str, fields))
                sigup[i].update(ss_sigup[i])
                sigdn[i].update(ss_sigdn[i])
                sigall[i].update(ss_sigall)
    pool.close()
    pool.join()
    # global stats
    for i, fdr_threshold in enumerate(fdr_thresholds):
        filename = prefix + '_%.1e_up' % (fdr_threshold)
        with open(filename, 'w') as f:
            sig_t_ids = [bm.rownames[x] for x in sorted(sigup[i])]
            print >>f, '\n'.join(sig_t_ids)
        filename = prefix + '_%.1e_dn' % (fdr_threshold)
        with open(filename, 'w') as f:
            sig_t_ids = [bm.rownames[x] for x in sorted(sigdn[i])]
            print >>f, '\n'.join(sig_t_ids)
        filename = prefix + '_%.1e_both' % (fdr_threshold)
        sig_t_ids = [bm.rownames[x] for x in sorted(sigall[i])]
        with open(filename, 'w') as f:
            print >>f, '\n'.join(sig_t_ids)
        filename = prefix + '_%.1e_none' % (fdr_threshold)
        with open(filename, 'w') as f:
            none_t_ids = set(bm.rownames).difference(sig_t_ids)
            print >>f, '\n'.join(none_t_ids)
    bm.close()
def query_worker(args):
    (input_path, matrix_dir, meta, fdr_threshold, frac_threshold,
     fpr_threshold, prec_threshold) = args
    bm = BigCountMatrix.open(matrix_dir)
    ss_compname = os.path.basename(input_path)
    results_file = os.path.join(input_path, Config.RESULTS_JSON_FILE)
    i = 0
    sig = 0
    lines = []
    # extract data
    for res in parse_results(results_file):
        i += 1
        #if (i % 10000) == 0:
        #    logging.debug('%s %d results' % (ss_compname, i))
        transcript_id = bm.rownames[res.t_id]
        if (meta is not None) and (transcript_id not in meta):
            continue
        core_size = res.core_hits + res.core_misses
        if core_size == 0:
            prec = 0.0
        else:
            prec = res.core_hits / float(core_size)
        num_misses = res.core_misses + res.null_misses
        if num_misses == 0:
            fpr = 0.0
        else:
            fpr = res.core_misses / float(num_misses)
        if ((res.ss_fdr_q_value <= fdr_threshold) and
            (abs(res.ss_frac) >= frac_threshold) and
            (fpr <= fpr_threshold) and
            (prec >= prec_threshold)):
            if meta is None:
                fields = [transcript_id]
            else:
                fields = list(meta[transcript_id])
            fields.extend([ss_compname, res.es, res.nes, res.ss_fdr_q_value,
                           res.ss_frac, fpr, prec])
            lines.append('\t'.join(map(str, fields)))
            sig += 1
    bm.close()
    logging.debug('Found %d results for path %s' % (sig, input_path))
    return lines
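# Worked example of the precision and false positive rate computed above, using
# made-up hit/miss counts (not real SSEA output):
def _example_prec_fpr():
    core_hits, core_misses, null_misses = 40, 10, 190
    core_size = core_hits + core_misses
    prec = core_hits / float(core_size) if core_size else 0.0      # 40/50  = 0.80
    num_misses = core_misses + null_misses
    fpr = core_misses / float(num_misses) if num_misses else 0.0   # 10/200 = 0.05
    print("prec=%.2f fpr=%.3f" % (prec, fpr))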
def find_transcripts_to_skip(input_dir, skip_nan, skip_zero):
    '''
    find transcripts with 'nan' or 0.0 values
    '''
    skip_rownames = []
    bm = BigCountMatrix.open(input_dir)
    for i in xrange(bm.shape[0]):
        a = np.array(bm.counts[i, :], dtype=np.float)
        skip = False
        if skip_nan:
            num_finite = np.isfinite(a).sum()
            if num_finite == 0:
                logging.debug('Row %d t_id %s all nan' % (i, bm.rownames[i]))
                skip = True
        if skip_zero:
            num_nonzero = (a > 0).sum()
            if num_nonzero == 0:
                logging.debug('Row %d t_id %s all zeros' % (i, bm.rownames[i]))
                skip = True
        if skip:
            skip_rownames.append(bm.rownames[i])
    bm.close()
    logging.debug('Found %d rows to skip' % (len(skip_rownames)))
    return skip_rownames
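# The same all-nan / all-zero test sketched on a small in-memory array; this only
# illustrates the per-row logic above and is not a drop-in replacement for the
# memmap loop:
def _example_skip_rows():
    import numpy as np
    rows = np.array([[1.0, 0.0, 2.0],
                     [np.nan, np.nan, np.nan],   # all nan  -> skip
                     [0.0, 0.0, 0.0]])           # all zero -> skip
    all_nan = ~np.any(np.isfinite(rows), axis=1)
    all_zero = ~np.any(rows > 0, axis=1)
    print(all_nan | all_zero)   # [False  True  True] -- rows 1 and 2 would be skipped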
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("memmap_dir")
    parser.add_argument("metadata_file")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # make dict of lengths for count --> FPKM conversion
    meta_file = open(args.metadata_file)
    meta_header = meta_file.next().strip().split('\t')
    t_meta_dict = {}
    logging.info("Parsing metadata file for transcript lengths")
    for line in meta_file:
        line = line.strip().split('\t')
        t_id = line[meta_header.index('transcript_id')]
        t_meta_dict[t_id] = line
    bcm = BigCountMatrix.open(args.memmap_dir)
    rows = bcm.rownames
    cols = bcm.colnames
    matrix = bcm.counts
    matrix_t = bcm.counts_t
    # loop through matrix by column to get total # of reads for FPKM conversion
    logging.info('Looping through matrix_t for number of reads per sample')
    tot_reads_array = []
    for x in xrange(len(matrix_t[:, 1])):
        if (x % 500) == 0:
            logging.info('Finished %d/%d samples' % (x, len(matrix_t[:, 1])))
        col_expr_array = matrix_t[x, ]
        tot_reads_array.append(np.nansum(col_expr_array))
    tot_reads_np_array = np.array(tot_reads_array)
    # loop through matrix, convert to FPKM, and report stats
    logging.info('Converting to FPKM then reporting stats')
    new_fields = ['max', '99.99th', '99.9th', '99.5th', '99th', '95th',
                  '90th', 'mean', 'median']
    print '\t'.join(meta_header + new_fields)
    for x in xrange(len(rows)):
        if (x % 5000) == 0:
            logging.info('Finished %d/%d transcripts' % (x, len(rows)))
        t_id = rows[x]
        metadata = t_meta_dict[t_id]
        t_len = int(metadata[meta_header.index('transcript_length')])
        count_array = matrix[x, ]
        # convert to FPKM: counts * 1e9 / (transcript length in bp * total reads)
        fpkm_array = (count_array * 1.0e9) / (t_len * tot_reads_np_array)
        max = fpkm_array.max()
        median = np.median(fpkm_array)
        mean = np.mean(fpkm_array)
        _99_99 = np.percentile(fpkm_array, 99.99)
        _99_9 = np.percentile(fpkm_array, 99.9)
        _99_5 = np.percentile(fpkm_array, 99.5)
        _99 = np.percentile(fpkm_array, 99)
        _95 = np.percentile(fpkm_array, 95)
        _90 = np.percentile(fpkm_array, 90)
        lineo = metadata + [max, _99_99, _99_9, _99_5, _99, _95, _90,
                            mean, median]
        print '\t'.join(map(str, lineo))
    # return 0
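# Quick sanity check of the FPKM constant used above,
# FPKM = count * 1e9 / (transcript length in bp * total reads), with made-up numbers:
def _example_fpkm():
    count = 500.0        # fragments mapped to the transcript
    t_len = 2000.0       # transcript length in bp
    tot_reads = 25.0e6   # total fragments sequenced in the library
    print((count * 1.0e9) / (t_len * tot_reads))   # 10.0 FPKM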
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--colmeta', dest='col_metadata_file', default=None)
    parser.add_argument('--rowmeta', dest='row_metadata_file', default=None)
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    col_metadata_file = args.col_metadata_file
    row_metadata_file = args.row_metadata_file
    matrix_dir = args.matrix_dir
    if not os.path.exists(matrix_dir):
        parser.error('matrix directory "%s" not found' % (matrix_dir))
    # open matrix
    bm = BigCountMatrix.open(matrix_dir)
    # get pheno data
    logging.debug('Reading library metadata')
    col_inds = parse_library_table(col_metadata_file, bm.colnames)
    # get transcript metadata
    logging.debug('Reading transcript metadata')
    row_inds, lengths = parse_transcript_table(row_metadata_file, bm.rownames)
    # get total counts per library
    # logging.debug('Getting total fragments per library')
    # lib_sizes = np.empty(len(col_inds), dtype=np.float)
    # for j in xrange(len(col_inds)):
    #     a = bm.counts_t[j,:]
    #     a = a[np.isfinite(a)]
    #     lib_sizes[j] = a.sum()
    # lib_sizes /= 1.0e6
    # normalize
    logging.debug('Normalizing and summarizing counts per transcript')
    header_fields = ['transcript_id']
    header_fields.extend([bm.colnames[x] for x in col_inds])
    print '\t'.join(header_fields)
    for i, lengthkb in zip(row_inds, lengths):
        t_id = bm.rownames[i]
        counts = bm.counts[i, col_inds]
        # ignore nans
        valid_inds = np.isfinite(counts)
        # normalize counts by size factor
        a = (counts[valid_inds] / bm.size_factors[valid_inds])
        # log transform
        a = np.log2(a + 1.0)
        # z-score
        #mean = np.mean(a)
        #std = np.std(a)
        #a = (a - mean) / std
        # subtract median and divide by MAD
        med = np.median(a)
        mad = MAD_CONSTANT + (np.median(np.abs(a - med)) * MAD_SCALE_FACTOR)
        a = (a - med) / mad
        # write
        out = np.empty(len(col_inds), dtype=np.float)
        out[:] = np.nan
        out[valid_inds] = a
        fields = [t_id]
        fields.extend(map(str, out))
        print '\t'.join(fields)
        continue
        # NOTE: everything below the 'continue' is unreachable; it is kept only
        # as a record of earlier normalization experiments
        #a = (counts[valid_inds] / bm.size_factors[valid_inds])
        #a = (counts[valid_inds] / lib_sizes[valid_inds]) / lengthkb
        # center and scale
        med = np.median(a)
        mad = med + (np.median(np.abs(a - med)) * MAD_SCALE_FACTOR)
        #mad = MAD_CONSTANT + (np.median(np.abs(a - med)) * MAD_SCALE_FACTOR)
        a = (a - med) / mad
        # log transform
        a = np.sign(a) * np.log2(np.abs(a) + 1.0)
        # normalize by transcript length
        #a -= np.log2(lengthkb)
        # output result
        out = np.empty(len(col_inds), dtype=np.float)
        out[:] = np.nan
        out[valid_inds] = a
        fields = [t_id]
        fields.extend(map(str, out))
        print '\t'.join(fields)
        #normcounts = (counts[valid_inds] / lib_sizes[valid_inds]) / length
        #normlogcounts = np.log2(normcounts + 1)
        # subtract the global median and divide by the median absolute deviation
        #med = np.median(normlogcounts)
        #mad = MAD_CONSTANT + (np.median(np.abs(normlogcounts - med)) * MAD_SCALE_FACTOR)
        #normlogcounts = (normlogcounts - med) / mad
        # output final matrix
        #a = np.empty(len(col_inds), dtype=np.float)
        #a[:] = np.nan
        #a[valid_inds] = normlogcounts
        #fields = [t_id]
        #fields.extend(map(str,a))
        #print '\t'.join(fields)
    bm.close()
    return 0
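# Toy example of the robust scaling used above: log2 of size-factor-normalized
# counts, then subtract the median and divide by the MAD. MAD_CONSTANT and
# MAD_SCALE_FACTOR are defined elsewhere in this script; the values below (a small
# offset and the conventional 1.4826 consistency factor) are assumptions made only
# for this illustration:
def _example_log2_mad_scaling():
    import numpy as np
    mad_constant = 0.01        # assumed: keeps the denominator away from zero
    mad_scale_factor = 1.4826  # assumed: makes the MAD comparable to a std deviation
    norm_counts = np.array([3.0, 7.0, 15.0, 31.0, 255.0])
    a = np.log2(norm_counts + 1.0)   # [2. 3. 4. 5. 8.]
    med = np.median(a)               # 4.0
    mad = mad_constant + (np.median(np.abs(a - med)) * mad_scale_factor)
    print((a - med) / mad)           # the high-count library stands out at about +2.7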
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--fdr', dest='fdr', type=float, default=1.0)
    parser.add_argument('--frac', dest='frac', type=float, default=0.0)
    parser.add_argument('--meta', dest='metadata_file', default=None)
    parser.add_argument("-i", dest="input_paths_file", default=None)
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    # get args
    matrix_dir = args.matrix_dir
    fdr_threshold = args.fdr
    frac_threshold = args.frac
    input_paths_file = args.input_paths_file
    metadata_file = args.metadata_file
    # check args
    bm = BigCountMatrix.open(matrix_dir)
    fdr_threshold = max(0.0, min(fdr_threshold, 1.0))
    frac_threshold = max(0.0, min(frac_threshold, 1.0))
    input_paths = []
    if input_paths_file is None:
        logging.error('No input directories specified (use -i).. Exiting.')
        return 1
    if not os.path.exists(input_paths_file):
        logging.error('Input paths file "%s" not found' % (input_paths_file))
    else:
        with open(input_paths_file) as fileh:
            for line in fileh:
                path = line.strip()
                if path in input_paths:
                    continue
                if check_path(path):
                    input_paths.append(path)
    if len(input_paths) == 0:
        logging.error('No valid SSEA results directories found.. Exiting.')
        return 1
    meta = None
    if metadata_file is not None:
        logging.debug('Parsing transcript metadata')
        meta = {}
        with open(metadata_file) as f:
            meta_header_fields = f.next().strip().split()
            for line in f:
                fields = line.strip().split('\t')
                meta[fields[0]] = fields
        logging.debug('Found metadata for %d transcripts' % (len(meta)))
    else:
        meta = None
        meta_header_fields = ['transcript_id']
    # parse results
    logging.debug('SSEA results: %d' % (len(input_paths)))
    logging.debug('FDR threshold: %f' % (fdr_threshold))
    logging.debug('Frac threshold: %f' % (frac_threshold))
    header_fields = meta_header_fields + ['ss_compname', 'es', 'nes', 'fdr',
                                          'frac', 'prec']
    print '\t'.join(header_fields)
    for input_path in input_paths:
        logging.debug('Parsing path %s' % (input_path))
        results_file = os.path.join(input_path, Config.RESULTS_JSON_FILE)
        # extract data
        ss_compname = os.path.basename(input_path)
        i = 0
        sig = 0
        for res in parse_results(results_file):
            # logging
            i += 1
            if (i % 10000) == 0:
                logging.debug('Parsed %d results' % (i))
            transcript_id = bm.rownames[res.t_id]
            if meta is not None:
                if transcript_id not in meta:
                    continue
            if ((res.ss_fdr_q_value <= fdr_threshold) and
                (abs(res.ss_frac) >= frac_threshold)):
                if meta is None:
                    fields = [bm.rownames[res.t_id]]
                else:
                    fields = list(meta[transcript_id])
                core_size = res.core_hits + res.core_misses
                if core_size == 0:
                    prec = 0.0
                else:
                    prec = res.core_hits / float(res.core_hits + res.core_misses)
                fields.extend([ss_compname, res.es, res.nes,
                               res.ss_fdr_q_value, res.ss_frac, prec])
                print '\t'.join(map(str, fields))
                sig += 1
        logging.debug('Found %d results for path %s' % (sig, input_path))
    bm.close()
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--fdr', dest='fdr', type=float, default=1.0)
    parser.add_argument('-a', '--attr', dest='ssea_attrs', action='append',
                        default=[])
    parser.add_argument("-i", dest="input_paths_file", default=None)
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    # get args
    fdr_threshold = args.fdr
    matrix_dir = args.matrix_dir
    ssea_attrs = args.ssea_attrs
    input_paths_file = args.input_paths_file
    # check args
    if len(ssea_attrs) == 0:
        parser.error('Please specify one or more attributes using "-a" or "--attr"')
    input_paths = []
    if input_paths_file is None:
        logging.error('No input directories specified (use -i).. Exiting.')
        return 1
    if not os.path.exists(input_paths_file):
        logging.error('Input paths file "%s" not found' % (input_paths_file))
    else:
        with open(input_paths_file) as fileh:
            for line in fileh:
                path = line.strip()
                if path in input_paths:
                    continue
                if check_path(path):
                    input_paths.append(path)
    if len(input_paths) == 0:
        logging.error('No valid SSEA results directories found.. Exiting.')
        return 1
    # parse results
    logging.debug('SSEA results: %d' % (len(input_paths)))
    logging.debug('SSEA attributes: %s' % (','.join(ssea_attrs)))
    logging.debug('SSEA FDR threshold: %f' % fdr_threshold)
    bm = BigCountMatrix.open(matrix_dir)
    dat = collections.defaultdict(lambda: collections.defaultdict(lambda: []))
    for input_path in input_paths:
        logging.debug('Parsing path %s' % (input_path))
        results_file = os.path.join(input_path, Config.RESULTS_JSON_FILE)
        # extract data
        i = 0
        sig = 0
        for res in parse_results(results_file):
            # logging
            i += 1
            if (i % 10000) == 0:
                logging.debug('Parsed %d results' % (i))
            if res.ss_fdr_q_value > fdr_threshold:
                continue
            sig += 1
            transcript_id = bm.rownames[res.t_id]
            for a in ssea_attrs:
                dat[a][transcript_id].append(getattr(res, a))
        logging.debug('Found %d results for path %s (%d significant)' %
                      (i, input_path, sig))
    bm.close()
    # output results
    header_fields = ['transcript_id', 'attr', 'min', 'max', 'absmax', 'mean',
                     'median']
    print '\t'.join(header_fields)
    for a in ssea_attrs:
        attrdict = dat[a]
        for transcript_id in sorted(attrdict):
            arr = np.array(attrdict[transcript_id])
            # field order must match the header (mean before median)
            fields = [transcript_id, a, np.min(arr), np.max(arr),
                      np.max(np.abs(arr)), np.mean(arr), np.median(arr)]
            print '\t'.join(map(str, fields))
def ssea_serial(config, sample_set, output_basename, startrow=None,
                endrow=None):
    '''
    main SSEA loop (single processor)

    matrix_dir: numpy memmap matrix containing numeric data
    sample_set: SampleSet object
    config: Config object
    output_basename: prefix for writing result files
    '''
    # initialize random number generator
    rng = RandomState()
    # open data matrix
    bm = BigCountMatrix.open(config.matrix_dir)
    # determine range of matrix to process
    if startrow is None:
        startrow = 0
    if endrow is None:
        endrow = bm.shape[0]
    assert startrow < endrow
    # get membership array for sample set
    membership = sample_set.get_array(bm.colnames)
    valid_samples = (membership >= 0)
    # setup histograms
    hists = _init_hists()
    # setup report file
    unsorted_json_file = output_basename + JSON_UNSORTED_SUFFIX
    outfileh = open(unsorted_json_file, 'wb')
    for i in xrange(startrow, endrow):
        logging.debug("\tRow: %d (%d-%d)" % (i, startrow, endrow))
        # read from memmap
        counts = np.array(bm.counts[i, :], dtype=np.float)
        # remove 'nan' values
        valid_inds = np.logical_and(valid_samples, np.isfinite(counts))
        # subset counts, size_factors, and membership array
        counts = counts[valid_inds]
        size_factors = bm.size_factors[valid_inds]
        valid_membership = membership[valid_inds]
        # write dummy results for invalid rows
        if (valid_inds.sum() == 0) or (np.all(counts == 0)):
            res = Result.default()
        else:
            # run ssea
            res, null_nes_vals = ssea_run(counts, size_factors,
                                          valid_membership, rng, config)
            # update histograms
            null_keys = []
            obs_keys = []
            if res.es < 0:
                null_keys.append('null_nes_neg')
                obs_keys.append('obs_nes_neg')
            elif res.es > 0:
                null_keys.append('null_nes_pos')
                obs_keys.append('obs_nes_pos')
            for k in xrange(len(null_keys)):
                null_nes = np.clip(np.fabs(null_nes_vals), NES_MIN, NES_MAX)
                obs_nes = np.clip(np.fabs(res.nes), NES_MIN, NES_MAX)
                hists[null_keys[k]] += np.histogram(null_nes, NES_BINS)[0]
                hists[obs_keys[k]] += np.histogram(obs_nes, NES_BINS)[0]
        # save t_id
        res.t_id = i
        # convert to json and write
        print >>outfileh, res.to_json()
    # close report file
    outfileh.close()
    # save histograms to a file
    output_hist_file = output_basename + NPY_HISTS_SUFFIX
    np.savez(output_hist_file, **hists)
    # cleanup
    bm.close()
    # sort output json file by abs(NES)
    logging.debug("Worker %s: sorting results" % (output_basename))
    # make tmp dir for sorting
    if os.path.exists(output_basename):
        shutil.rmtree(output_basename)
    os.makedirs(output_basename)
    # call batch sort python function
    sorted_json_file = output_basename + JSON_SORTED_SUFFIX
    batch_sort(input=unsorted_json_file,
               output=sorted_json_file,
               key=_cmp_json_nes,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[output_basename])
    # remove tmp dir
    shutil.rmtree(output_basename)
    # remove unsorted json file
    os.remove(unsorted_json_file)
    logging.debug("Worker %s: done" % (output_basename))
    return 0
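# ssea_serial accepts startrow/endrow, so the matrix can be split into row chunks
# and processed by several workers at once. A minimal sketch of such a driver (the
# chunking scheme and worker basenames below are assumptions for illustration, not
# the package's actual multiprocessing code):
def ssea_parallel_sketch(config, sample_set, output_dir, nrows, num_procs):
    import os
    from multiprocessing import Process
    # split [0, nrows) into roughly equal, contiguous row ranges
    bounds = [(w * nrows) // num_procs for w in range(num_procs + 1)]
    procs = []
    for w, (startrow, endrow) in enumerate(zip(bounds[:-1], bounds[1:])):
        if startrow >= endrow:
            continue
        basename = os.path.join(output_dir, 'worker%03d' % w)
        p = Process(target=ssea_serial,
                    args=(config, sample_set, basename, startrow, endrow))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()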
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("matrix_dir")
    parser.add_argument("title")
    parser.add_argument("library_ids")
    parser.add_argument("metadata")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # open matrix
    bm = BigCountMatrix.open(args.matrix_dir)
    # get pheno data
    logging.debug('Reading library metadata')
    col_inds = parse_library_table(args.library_ids, bm.colnames)
    # get transcript metadata
    logging.debug('Reading transcript metadata')
    row_inds, lengths = parse_transcript_table(args.metadata, bm.rownames)
    # express library totals in millions of fragments
    bm.count_tots /= 1.0e6
    # per-transcript stats default to NaN for transcripts absent from the matrix
    mean_dict = collections.defaultdict(lambda: float('NaN'))
    nf_dict = collections.defaultdict(lambda: float('NaN'))
    nn_dict = collections.defaultdict(lambda: float('NaN'))
    mean_dict_f = collections.defaultdict(lambda: float('NaN'))
    nf_dict_f = collections.defaultdict(lambda: float('NaN'))
    nn_dict_f = collections.defaultdict(lambda: float('NaN'))
    logging.debug('Calculating expression stats')
    k = 0
    for i, lengthkb in zip(row_inds, lengths):
        k += 1
        if k % 10000 == 0:
            logging.debug('Finished %d transcripts' % k)
        t_id = bm.rownames[i]
        counts = bm.counts[i, col_inds]
        # ignore nans
        valid_inds = np.isfinite(counts)
        # normalize counts
        count_vals = (counts[valid_inds] / bm.size_factors[valid_inds])
        fpkm_vals = (counts[valid_inds] / (lengthkb * bm.count_tots[valid_inds]))
        if len(count_vals) == 0:
            continue
        mean = np.mean(count_vals)
        mean_f = np.mean(fpkm_vals)
        nf = np.percentile(count_vals, 95)
        nf_f = np.percentile(fpkm_vals, 95)
        nn = np.percentile(count_vals, 99)
        nn_f = np.percentile(fpkm_vals, 99)
        mean_dict[t_id] = mean
        mean_dict_f[t_id] = mean_f
        nf_dict[t_id] = nf
        nf_dict_f[t_id] = nf_f
        nn_dict[t_id] = nn
        nn_dict_f[t_id] = nn_f
    logging.debug('Printing output')
    meta_fh = open(args.metadata)
    meta_header = meta_fh.next().strip().split('\t')
    meta_header.append(args.title + '_count_mean')
    meta_header.append(args.title + '_count_95')
    meta_header.append(args.title + '_count_99')
    meta_header.append(args.title + '_fpkm_mean')
    meta_header.append(args.title + '_fpkm_95')
    meta_header.append(args.title + '_fpkm_99')
    print '\t'.join(meta_header)
    for line in meta_fh:
        line = line.strip().split('\t')
        t_id = line[meta_header.index('transcript_id')]
        # transcripts missing from the matrix fall back to the NaN defaults
        mean = mean_dict[t_id]
        mean_f = mean_dict_f[t_id]
        nf = nf_dict[t_id]
        nf_f = nf_dict_f[t_id]
        nn = nn_dict[t_id]
        nn_f = nn_dict_f[t_id]
        line.append("%.3f" % mean)
        line.append("%.3f" % nf)
        line.append("%.3f" % nn)
        line.append("%.3f" % mean_f)
        line.append("%.3f" % nf_f)
        line.append("%.3f" % nn_f)
        print '\t'.join(line)
    return 0
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("library_ids")
    parser.add_argument("transcript_ids")
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    if not os.path.exists(EXPRESSION_DIR):
        parser.error("Expression matrix directory '%s' not found" % (EXPRESSION_DIR))
    # read input library ids
    library_ids = read_lines(args.library_ids)
    library_id_set = set(library_ids)
    # read input gene ids
    gene_ids = read_lines(args.transcript_ids)
    gene_id_set = set(gene_ids)
    # load gene expression
    pheno_file = os.path.join(EXPRESSION_DIR, 'colnames.txt')
    metadata_file = os.path.join(EXPRESSION_DIR, 'rownames.txt')
    # find libraries
    ncols, lib_inds, lib_ids = \
        read_table(pheno_file, subset=library_id_set)
    if len(lib_inds) == 0:
        print "No libraries found"
        return 1
    logging.info('Acquiring data for %s libraries' % len(lib_ids))
    # find genes
    nrows, g_inds, g_ids = \
        read_table(metadata_file, subset=gene_id_set)
    if len(g_inds) == 0:
        print "No genes found"
        return 1
    logging.info('Acquiring data for %s transcripts' % len(g_ids))
    # read gene expression
    bm = BigCountMatrix.open(EXPRESSION_DIR)
    mat = bm.counts
    # get subset of matrix
    logging.info("performing gene subset")
    submat = mat[g_inds, :]
    logging.info("performing library subset")
    submat = submat[:, lib_inds]
    # write expr file
    output_expr_file = args.output_prefix + ".expr.tsv"
    fileh = open(output_expr_file, 'w')
    fields = ['gene_id']
    fields.extend(lib_ids)
    logging.debug('Printing expression file')
    print >>fileh, '\t'.join(fields)
    for i in xrange(len(g_inds)):
        if i % 1000 == 0:
            logging.debug("Finished %d/%d transcripts" % (i, len(g_inds)))
        fields = [g_ids[i]]
        for x in submat[i, :]:
            if x < 0:
                fields.append('NA')
            else:
                fields.append(str(x))
        print >>fileh, '\t'.join(fields)
    fileh.close()
    return 0