Example 1
def main():
    logging.basicConfig(level=logging.DEBUG,
                      format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--skip-all-nan', dest='skip_nan', action='store_true')
    parser.add_argument('--skip-all-zero', dest='skip_zero', action='store_true')
    parser.add_argument('--libs', dest="library_ids", default=None)
    parser.add_argument('--transcripts', dest="transcript_ids", default=None)
    parser.add_argument('input_dir')
    parser.add_argument('output_dir')
    args = parser.parse_args()
    if not os.path.exists(args.input_dir):
        parser.error("input directory '%s' not found" % (args.input_dir))
    if os.path.exists(args.output_dir):
        parser.error("output directory '%s' already exists" % (args.output_dir))
    # open matrix
    bm = BigCountMatrix.open(args.input_dir)
    # get library and transcript ids
    if args.library_ids is not None:
        library_ids = set([line.strip() for line in open(args.library_ids)])
    else:
        library_ids = set()    
    if args.transcript_ids is not None:
        transcript_ids = set([line.strip() for line in open(args.transcript_ids)])
    else:
        transcript_ids = set(bm.rownames)
    if args.skip_nan or args.skip_zero:
        logging.debug('Checking matrix for rows of all zero and/or nan')
        skip_ids = set(find_transcripts_to_skip(args.input_dir, 
                                                args.skip_nan, 
                                                args.skip_zero))
        transcript_ids.difference_update(skip_ids)
    logging.debug('Creating subset with %d transcripts' % (len(transcript_ids)))
    bm.copy(args.output_dir, transcript_ids, library_ids)
    bm.close()
Example 2
def worker(args):    
    (input_path, matrix_dir, meta, fdr_thresholds) = args 
    bm = BigCountMatrix.open(matrix_dir)
    ss_compname = os.path.basename(input_path)
    results_file = os.path.join(input_path, Config.RESULTS_JSON_FILE)
    sigup = [set() for x in fdr_thresholds]
    sigdn = [set() for x in fdr_thresholds]
    # extract data
    n = 0
    for res in parse_results(results_file):
        n += 1
        if (n % 10000) == 0:
            logging.debug('%s parsed %d' % (ss_compname, n))
        transcript_id = bm.rownames[res.t_id]
        if (meta is not None) and (transcript_id not in meta):
            continue
        for i,fdr_threshold in enumerate(fdr_thresholds):
            if res.ss_fdr_q_value > fdr_threshold:
                continue                
            if res.ss_frac > 0:
                sigup[i].add(res.t_id)
            else:
                sigdn[i].add(res.t_id)
    bm.close()
    return (ss_compname, sigup, sigdn)
Example 3
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--na-value', dest='matrix_na_values', 
                        default=[DEFAULT_NA_VALUE], action='append',
                        help='Value to interpret as missing/invalid '
                        'in weight matrix [default=%(default)s]')    
    parser.add_argument('input_tsv_file')
    parser.add_argument('output_dir')
    # parse args
    args = parser.parse_args()    
    input_tsv_file = args.input_tsv_file
    output_dir = args.output_dir
    matrix_na_values = args.matrix_na_values
    # check args
    if not os.path.exists(input_tsv_file):
        parser.error('Input file "%s" not found' % (input_tsv_file))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # convert matrix 
    logging.info("Converting text matrix file to binary format")
    bm = BigCountMatrix.from_tsv(input_tsv_file, output_dir, 
                                 na_values=matrix_na_values)
    logging.info("Estimating size factors")
    bm.estimate_size_factors('deseq')
    bm.close()
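
Note: Examples 3 and 4 convert the TSV matrix and then call bm.estimate_size_factors('deseq'). The 'deseq' method usually refers to the median-of-ratios estimator; below is a minimal standalone sketch of that calculation for a small in-memory array. The helper name and the convention of dropping rows with any zero count are illustrative, not taken from the BigCountMatrix implementation.

import numpy as np

def deseq_median_of_ratios(counts):
    # counts: 2-D array, rows = transcripts, columns = libraries
    with np.errstate(divide='ignore'):
        log_counts = np.log(counts)
    # per-row geometric mean in log space; rows containing a zero become -inf and are dropped
    log_geo_means = log_counts.mean(axis=1)
    usable = np.isfinite(log_geo_means)
    # size factor of a library = median ratio of its counts to the row geometric means
    return np.exp(np.median(log_counts[usable] - log_geo_means[usable, np.newaxis], axis=0))

counts = np.array([[100., 200.], [50., 90.], [10., 0.]])
print(deseq_median_of_ratios(counts))
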
Example 4
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--na-value',
                        dest='matrix_na_values',
                        default=[DEFAULT_NA_VALUE],
                        action='append',
                        help='Value to interpret as missing/invalid '
                        'in weight matrix [default=%(default)s]')
    parser.add_argument('input_tsv_file')
    parser.add_argument('output_dir')
    # parse args
    args = parser.parse_args()
    input_tsv_file = args.input_tsv_file
    output_dir = args.output_dir
    matrix_na_values = args.matrix_na_values
    # check args
    if not os.path.exists(input_tsv_file):
        parser.error('Input file "%s" not found' % (input_tsv_file))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # convert matrix
    logging.info("Converting text matrix file to binary format")
    bm = BigCountMatrix.from_tsv(input_tsv_file,
                                 output_dir,
                                 na_values=matrix_na_values)
    logging.info("Estimating size factors")
    bm.estimate_size_factors('deseq')
    bm.close()
Example 5
def main():
    parser = argparse.ArgumentParser()
    #     parser.add_argument('--colmeta', dest='col_metadata_file',
    #                         help='file containing metadata corresponding to each '
    #                         'column of the weight matrix file')
    #     parser.add_argument('--rowmeta', dest='row_metadata_file',
    #                         help='file containing metadata corresponding to each '
    #                         'row of the weight matrix file')
    parser.add_argument("-r",
                        dest="row",
                        action="store_true",
                        default=False,
                        help="Print row_meta JSONs")
    parser.add_argument("-c",
                        dest="col",
                        action="store_true",
                        default=False,
                        help="Print col_meta JSONs")
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    # check command line args
    matrix_dir = os.path.abspath(args.matrix_dir)
    col_metadata_file = os.path.join(matrix_dir, 'colmeta.tsv')
    row_metadata_file = os.path.join(matrix_dir, 'rowmeta.tsv')

    if not os.path.exists(col_metadata_file):
        parser.error("Column metadata file '%s' not found" %
                     (col_metadata_file))
    if not os.path.exists(row_metadata_file):
        parser.error("Row metadata file '%s' not found" %
                     (row_metadata_file))
    if not os.path.exists(args.matrix_dir):
        parser.error('matrix path "%s" not found' % (args.matrix_dir))


#     col_metadata_file = os.path.abspath(args.col_metadata_file)
#     row_metadata_file = os.path.abspath(args.row_metadata_file)
    # open matrix
    bm = BigCountMatrix.open(matrix_dir)
    if bm.size_factors is None:
        parser.error("Size factors not found in count matrix")
    # read metadata
    logging.info("Reading row metadata")
    row_metadata = list(Metadata.parse_tsv(row_metadata_file, bm.rownames))
    logging.info("Reading column metadata")
    col_metadata = list(Metadata.parse_tsv(col_metadata_file, bm.colnames))

    # pipe row metadata into mongoimport
    if args.row:
        logging.debug("Importing row metadata")
        for m in row_metadata:
            print >> sys.stdout, m.to_json()
    if args.col:
        logging.debug("Importing column metadata")
        for m in col_metadata:
            print >> sys.stdout, m.to_json()
    # cleanup
    bm.close()
Example 6
def db_ss_printJSON(ssea_dir, matrix_dir, ss_id):
    sample_sets_json_file = os.path.join(ssea_dir, 'sample_set.json')
    bm = BigCountMatrix.open(matrix_dir)
    samples = bm.colnames
    ss = SampleSet.parse_json(sample_sets_json_file)[0]
    membership = ss.get_array(samples)
    d = ss.to_dict(membership)
    d['_id'] = int(ss_id)
    print json.dumps(d)
Example 7
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('metadata_file')
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    # get args
    metadata_file = args.metadata_file
    matrix_dir = args.matrix_dir
    bm = BigCountMatrix.open(matrix_dir)
    # read transcript lengths
    logging.debug('Reading transcript_length')
    lengths = {}
    with open(metadata_file) as f:
        header_fields = f.next().strip().split('\t')
        t_id_ind = header_fields.index('transcript_id')
        length_ind = header_fields.index('transcript_length')
        for line in f:
            fields = line.strip().split('\t')
            t_id = fields[t_id_ind]
            length = int(fields[length_ind])
            lengths[t_id] = float(length) / 1000.0
    if not set(lengths.keys()).issuperset(bm.rownames):
        parser.error('Metadata does not contain all transcripts in matrix')
    # get total counts per library
    logging.debug('Getting total counts per library')
    lib_sizes = np.empty(bm.shape[1], dtype=np.float)
    for j in xrange(bm.shape[1]):
        a = bm.counts_t[j,:]
        a = a[np.isfinite(a)]
        lib_sizes[j] = a.sum()
    lib_sizes /= 1.0e6
    # normalize
    logging.debug('Normalizing and summarizing counts per transcript')
    print '\t'.join(['transcript_id', 'exprtot', 'exprmean', 'exprmedian', 'exprmax', 
                     'expr9999', 'expr999', 'expr99', 'expr95', 'expr90'])
    for i in xrange(bm.shape[0]):
        t_id = bm.rownames[i]
        if t_id not in lengths:
            logging.warning('Transcript %s not found in metadata' % (t_id))
            continue
        length = lengths[t_id]
        a = bm.counts[i,:]
        valid = np.isfinite(a)
        anorm = (a[valid] / lib_sizes[valid]) / length
        # get stats
        fields = [t_id, np.sum(anorm), np.mean(anorm), 
                  np.median(anorm), np.max(anorm), 
                  np.percentile(anorm, 99.99), 
                  np.percentile(anorm, 99.9), 
                  np.percentile(anorm, 99), 
                  np.percentile(anorm, 95),
                  np.percentile(anorm, 90)]
        print '\t'.join(map(str, fields))
    bm.close()
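
Note: Example 7 divides each count by the library size in millions of fragments and by the transcript length in kilobases, i.e. it reports FPKM-style values. A compact sketch of the same arithmetic for a single transcript, with made-up inputs:

import numpy as np

counts = np.array([120., 85., 0., 240.])        # fragments for one transcript across four libraries
lib_sizes = np.array([30e6, 25e6, 41e6, 38e6])  # total fragments per library
length_bp = 2500.0                              # transcript length in bases

# FPKM = fragments / (library size in millions * transcript length in kilobases)
fpkm = counts / ((lib_sizes / 1.0e6) * (length_bp / 1000.0))
print('\t'.join('%.3f' % x for x in fpkm))
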
Example 8
def db_ss_printJSON(ssea_dir, matrix_dir, ss_id):
    sample_sets_json_file = os.path.join(ssea_dir,
                                         'sample_set.json')
    bm = BigCountMatrix.open(matrix_dir)
    samples = bm.colnames
    ss = SampleSet.parse_json(sample_sets_json_file)[0]
    membership = ss.get_array(samples)
    d = ss.to_dict(membership)
    d['_id'] = int(ss_id)
    print json.dumps(d)
Example 9
def stats_parallel(input_paths, matrix_dir, transcripts, prefix,
                   fdr_thresholds, num_processes):
    tasklist = []
    for input_path in input_paths:
        tasklist.append((input_path, matrix_dir, transcripts, fdr_thresholds))
    # create pool
    pool = Pool(processes=num_processes)
    result_iter = pool.imap_unordered(worker, tasklist)
    sigup = [set() for x in fdr_thresholds]
    sigdn = [set() for x in fdr_thresholds]
    sigall = [set() for x in fdr_thresholds]
    bm = BigCountMatrix.open(matrix_dir)
    nrows = len(bm.rownames)
    filename = prefix + '.txt'
    with open(filename, 'w') as f:
        header_fields = ['ss_compname', 'dir', 'fdr', 'count']
        print >>f, '\t'.join(header_fields)
        for ss_compname, ss_sigup, ss_sigdn in result_iter:
            for i,fdr_threshold in enumerate(fdr_thresholds):
                fields = [ss_compname, 'up', '%.1e' % (fdr_threshold), len(ss_sigup[i])]
                print >>f, '\t'.join(map(str, fields))
                fields = [ss_compname, 'dn', '%.1e' % (fdr_threshold), len(ss_sigdn[i])]
                print >>f, '\t'.join(map(str, fields))
                ss_sigall = ss_sigup[i].union(ss_sigdn[i])
                fields = [ss_compname, 'both', '%.1e' % (fdr_threshold), len(ss_sigall)]
                print >>f, '\t'.join(map(str, fields))
                num_none = nrows - len(ss_sigall)
                fields = [ss_compname, 'none', '%.1e' % (fdr_threshold), num_none]
                print >>f, '\t'.join(map(str, fields))
                sigup[i].update(ss_sigup[i])
                sigdn[i].update(ss_sigdn[i])
                sigall[i].update(ss_sigall)
    pool.close()
    pool.join()
    # global stats
    for i,fdr_threshold in enumerate(fdr_thresholds):
        filename = prefix + '_%.1e_up' % (fdr_threshold)
        with open(filename, 'w') as f:
            sig_t_ids = [bm.rownames[x] for x in sorted(sigup[i])]
            print >>f, '\n'.join(sig_t_ids)
        filename = prefix + '_%.1e_dn' % (fdr_threshold)
        with open(filename, 'w') as f:
            sig_t_ids = [bm.rownames[x] for x in sorted(sigdn[i])]
            print >>f, '\n'.join(sig_t_ids)
        filename = prefix + '_%.1e_both' % (fdr_threshold)
        sig_t_ids = [bm.rownames[x] for x in sorted(sigall[i])]
        with open(filename, 'w') as f:
            print >>f, '\n'.join(sig_t_ids)
        filename = prefix + '_%.1e_none' % (fdr_threshold)
        with open(filename, 'w') as f:
            none_t_ids = set(bm.rownames).difference(sig_t_ids)
            print >>f, '\n'.join(none_t_ids)
    bm.close()
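
Note: Example 9 fans the per-comparison parsing out to a multiprocessing Pool and folds the per-threshold sets together as results arrive; imap_unordered is what lets the writer loop consume results in completion order. The generic shape of that pattern, with placeholder tasks and a placeholder computation:

from multiprocessing import Pool

def count_worker(task):
    # each task must be picklable and self-contained; so must the return value
    name, threshold = task
    return name, threshold * 2  # placeholder computation

if __name__ == '__main__':
    tasks = [('compA', 0.05), ('compB', 0.01)]
    pool = Pool(processes=2)
    # results are yielded as soon as any worker finishes, not in submission order
    for name, value in pool.imap_unordered(count_worker, tasks):
        print('%s -> %s' % (name, value))
    pool.close()
    pool.join()
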
Example 10
def main():
    parser = argparse.ArgumentParser()            
#     parser.add_argument('--colmeta', dest='col_metadata_file',
#                         help='file containing metadata corresponding to each '
#                         'column of the weight matrix file')
#     parser.add_argument('--rowmeta', dest='row_metadata_file',
#                         help='file containing metadata corresponding to each '
#                         'row of the weight matrix file')
    parser.add_argument("-r", dest="row", 
                        action="store_true", default=False, 
                        help="Print row_meta JSONs")
    parser.add_argument("-c", dest="col", 
                        action="store_true", default=False, 
                        help="Print col_meta JSONs")
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    # check command line args
    matrix_dir = os.path.abspath(args.matrix_dir)
    col_metadata_file = os.path.join(matrix_dir, 'colmeta.tsv')
    row_metadata_file = os.path.join(matrix_dir, 'rowmeta.tsv')
    
    if not os.path.exists(col_metadata_file):
        parser.error("Column metadata file '%s' not found" % (col_metadata_file))
    if not os.path.exists(row_metadata_file):
        parser.error("Row metadata file '%s' not found" % (row_metadata_file))
    if not os.path.exists(args.matrix_dir):
        parser.error('matrix path "%s" not found' % (args.matrix_dir))

#     col_metadata_file = os.path.abspath(args.col_metadata_file)
#     row_metadata_file = os.path.abspath(args.row_metadata_file)
    # open matrix
    bm = BigCountMatrix.open(matrix_dir)
    if bm.size_factors is None:
        parser.error("Size factors not found in count matrix")
    # read metadata
    logging.info("Reading row metadata")
    row_metadata = list(Metadata.parse_tsv(row_metadata_file, bm.rownames))
    logging.info("Reading column metadata")
    col_metadata = list(Metadata.parse_tsv(col_metadata_file, bm.colnames))
    
    # pipe row metadata into mongoimport 
    if args.row:
        logging.debug("Importing row metadata")
        for m in row_metadata:
            print >>sys.stdout, m.to_json()
    if args.col:
        logging.debug("Importing column metadata")
        for m in col_metadata:
            print >>sys.stdout, m.to_json()
    # cleanup
    bm.close()
Example 11
def query_worker(args):
    (input_path, matrix_dir, meta, fdr_threshold, frac_threshold,
     fpr_threshold, prec_threshold) = args
    bm = BigCountMatrix.open(matrix_dir)
    ss_compname = os.path.basename(input_path)
    results_file = os.path.join(input_path, Config.RESULTS_JSON_FILE)
    i = 0
    sig = 0
    lines = []
    # extract data
    for res in parse_results(results_file):
        i += 1
        #if (i % 10000) == 0:
        #    logging.debug('%s %d results' % (ss_compname, i))
        transcript_id = bm.rownames[res.t_id]
        if (meta is not None) and (transcript_id not in meta):
            continue
        core_size = res.core_hits + res.core_misses
        if core_size == 0:
            prec = 0.0
        else:
            prec = res.core_hits / float(core_size)
        num_misses = res.core_misses + res.null_misses
        if num_misses == 0:
            fpr = 0.0
        else:
            fpr = res.core_misses / float(num_misses)
        if ((res.ss_fdr_q_value <= fdr_threshold)
                and (abs(res.ss_frac) >= frac_threshold)
                and (fpr <= fpr_threshold) and (prec >= prec_threshold)):
            if meta is None:
                fields = [transcript_id]
            else:
                fields = list(meta[transcript_id])
            fields.extend([
                ss_compname, res.es, res.nes, res.ss_fdr_q_value, res.ss_frac,
                fpr, prec
            ])
            lines.append('\t'.join(map(str, fields)))
            sig += 1
    bm.close()
    logging.debug('Found %d results for path %s' % (sig, input_path))
    return lines
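
Note: the filters in this worker (and in its duplicate, Example 12) are precision = core_hits / (core_hits + core_misses) and a false positive rate fpr = core_misses / (core_misses + null_misses), each forced to 0.0 when the denominator is empty. A tiny worked example with made-up counts:

core_hits, core_misses, null_misses = 18, 2, 998

core_size = core_hits + core_misses
prec = core_hits / float(core_size) if core_size else 0.0        # 18/20 = 0.9
num_misses = core_misses + null_misses
fpr = core_misses / float(num_misses) if num_misses else 0.0     # 2/1000 = 0.002
print('prec=%.3f fpr=%.4f' % (prec, fpr))
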
Example 12
def query_worker(args):    
    (input_path, matrix_dir, meta, fdr_threshold, 
     frac_threshold, fpr_threshold, prec_threshold) = args
    bm = BigCountMatrix.open(matrix_dir)
    ss_compname = os.path.basename(input_path)
    results_file = os.path.join(input_path, Config.RESULTS_JSON_FILE)
    i = 0
    sig = 0
    lines = []
    # extract data
    for res in parse_results(results_file):
        i += 1
        #if (i % 10000) == 0:
        #    logging.debug('%s %d results' % (ss_compname, i))
        transcript_id = bm.rownames[res.t_id]
        if (meta is not None) and (transcript_id not in meta):
            continue
        core_size = res.core_hits + res.core_misses
        if core_size == 0:
            prec = 0.0
        else:
            prec = res.core_hits / float(core_size)
        num_misses = res.core_misses + res.null_misses
        if num_misses == 0:
            fpr = 0.0
        else:
            fpr = res.core_misses / float(num_misses)
        if ((res.ss_fdr_q_value <= fdr_threshold) and 
            (abs(res.ss_frac) >= frac_threshold) and
            (fpr <= fpr_threshold) and
            (prec >= prec_threshold)):
            if meta is None:
                fields = [transcript_id]
            else:
                fields = list(meta[transcript_id])
            fields.extend([ss_compname, res.es, res.nes, 
                           res.ss_fdr_q_value, res.ss_frac, fpr, prec])
            lines.append('\t'.join(map(str,fields)))
            sig += 1
    bm.close()
    logging.debug('Found %d results for path %s' % (sig, input_path))
    return lines
Example 13
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--skip-all-nan', dest='skip_nan', action='store_true')
    parser.add_argument('--skip-all-zero',
                        dest='skip_zero',
                        action='store_true')
    parser.add_argument('--libs', dest="library_ids", default=None)
    parser.add_argument('--transcripts', dest="transcript_ids", default=None)
    parser.add_argument('input_dir')
    parser.add_argument('output_dir')
    args = parser.parse_args()
    if not os.path.exists(args.input_dir):
        parser.error("input directory '%s' not found" % (args.input_dir))
    if os.path.exists(args.output_dir):
        parser.error("output directory '%s' already exists" %
                     (args.output_dir))
    # open matrix
    bm = BigCountMatrix.open(args.input_dir)
    # get library and transcript ids
    if args.library_ids is not None:
        library_ids = set([line.strip() for line in open(args.library_ids)])
    else:
        library_ids = set()
    if args.transcript_ids is not None:
        transcript_ids = set(
            [line.strip() for line in open(args.transcript_ids)])
    else:
        transcript_ids = set(bm.rownames)
    if args.skip_nan or args.skip_zero:
        logging.debug('Checking matrix for rows of all zero and/or nan')
        skip_ids = set(
            find_transcripts_to_skip(args.input_dir, args.skip_nan,
                                     args.skip_zero))
        transcript_ids.difference_update(skip_ids)
    logging.debug('Creating subset with %d transcripts' %
                  (len(transcript_ids)))
    bm.copy(args.output_dir, transcript_ids, library_ids)
    bm.close()
Example 14
def find_transcripts_to_skip(input_dir, skip_nan, skip_zero):
    '''
    find transcripts with 'nan' or 0.0 values
    '''
    skip_rownames = []
    bm = BigCountMatrix.open(input_dir)
    for i in xrange(bm.shape[0]):
        a = np.array(bm.counts[i, :], dtype=np.float)
        skip = False
        if skip_nan:
            num_finite = np.isfinite(a).sum()
            if num_finite == 0:
                logging.debug('Row %d t_id %s all nan' % (i, bm.rownames[i]))
                skip = True
        if skip_zero:
            num_nonzero = (a > 0).sum()
            if num_nonzero == 0:
                logging.debug('Row %d t_id %s all zeros' % (i, bm.rownames[i]))
                skip = True
        if skip:
            skip_rownames.append(bm.rownames[i])
    bm.close()
    logging.debug('Found %d rows to skip' % (len(skip_rownames)))
    return skip_rownames
Example 15
def find_transcripts_to_skip(input_dir, skip_nan, skip_zero):
    '''
    find transcripts with 'nan' or 0.0 values
    '''
    skip_rownames = []
    bm = BigCountMatrix.open(input_dir)
    for i in xrange(bm.shape[0]):
        a = np.array(bm.counts[i,:], dtype=np.float)
        skip = False
        if skip_nan:
            num_finite = np.isfinite(a).sum()
            if num_finite == 0:                
                logging.debug('Row %d t_id %s all nan' % (i, bm.rownames[i]))
                skip = True
        if skip_zero:
            num_nonzero = (a > 0).sum()
            if num_nonzero == 0:
                logging.debug('Row %d t_id %s all zeros' % (i, bm.rownames[i]))
                skip = True
        if skip:
            skip_rownames.append(bm.rownames[i])
    bm.close()
    logging.debug('Found %d rows to skip' % (len(skip_rownames)))
    return skip_rownames
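
Note: in Examples 14 and 15 a row is skipped when every entry is non-finite (--skip-all-nan) or when no entry is strictly positive (--skip-all-zero). The same per-row test in isolation, on an illustrative array:

import numpy as np

row = np.array([0.0, np.nan, 0.0])
all_nan = (np.isfinite(row).sum() == 0)    # True only if every value is nan/inf
all_zero = ((row > 0).sum() == 0)          # True if there is no strictly positive value
print('all_nan=%s all_zero=%s' % (all_nan, all_zero))   # all_nan=False all_zero=True
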
Example 16
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("memmap_dir")
    parser.add_argument("metadata_file")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG,
                      format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    
    #make dict of transcript metadata rows (for transcript_length lookup in the count --> FPKM conversion)
    meta_file = open(args.metadata_file)
    meta_header = meta_file.next().strip().split('\t')
    t_meta_dict = {}
    logging.info("Parsing metadata file for transcript lengths")
    for line in meta_file: 
        line = line.strip().split('\t')
        t_id = line[meta_header.index('transcript_id')]
        t_meta_dict[t_id] = line
    
        
    bcm = BigCountMatrix.open(args.memmap_dir)
    rows = bcm.rownames
    cols = bcm.colnames
    
    matrix = bcm.counts
    matrix_t = bcm.counts_t
    
    
    #loop through matrix by column to get total # of reads for FPKM conversion
    logging.info('Looping through matrix_t for number of reads per sample')
    tot_reads_array = []
    num_samples = matrix_t.shape[0]
    for x in xrange(num_samples):
        if (x%500) == 0: 
            logging.info('Finished %d/%d samples' % (x, num_samples))
        col_expr_array = matrix_t[x,]
        tot_reads_array.append(np.nansum(col_expr_array))
    tot_reads_np_array = np.array(tot_reads_array)
    
    #looping through matrix and converting to FPKM then reporting stats
    logging.info('Converting to FPKM then reporting stats')
    new_fields = [
                  'max',
                  '99.99th',
                  '99.9th',
                  '99.5th',
                  '99th',
                  '95th',
                  '90th',
                  'mean',
                  'median'
                  ]
    print '\t'.join(meta_header + new_fields)
    for x in xrange(len(rows)):
        if (x%5000) == 0: 
            logging.info('Finished %d/%d transcripts' % (x, len(rows))) 
        t_id = rows[x]
        metadata = t_meta_dict[t_id]
        t_len = metadata[meta_header.index('transcript_length')]
        t_len = int(t_len)
        count_array = matrix[x,]
        #convert to FPKM 
        fpkm_array = (count_array*10e8)/(t_len*tot_reads_np_array)
        fpkm_max = fpkm_array.max()
        median = np.median(fpkm_array)
        mean = np.mean(fpkm_array)
        _99_99 = np.percentile(fpkm_array, 99.99)
        _99_9 = np.percentile(fpkm_array, 99.9)
        _99_5 = np.percentile(fpkm_array, 99.5)
        _99 = np.percentile(fpkm_array, 99)
        _95 = np.percentile(fpkm_array, 95)
        _90 = np.percentile(fpkm_array, 90)
        lineo = metadata + [fpkm_max, _99_99, _99_9, _99_5, _99, _95, _90, mean, median]
        print '\t'.join(map(str,lineo))

    return 0
Example 17
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--colmeta', dest='col_metadata_file', default=None)
    parser.add_argument('--rowmeta', dest='row_metadata_file', default=None)
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    col_metadata_file = args.col_metadata_file
    row_metadata_file = args.row_metadata_file
    matrix_dir = args.matrix_dir
    if not os.path.exists(matrix_dir):
        parser.error('matrix directory "%s" not found' % (matrix_dir))
    # open matrix
    bm = BigCountMatrix.open(matrix_dir)
    # get pheno data
    logging.debug('Reading library metadata')
    col_inds = parse_library_table(col_metadata_file, bm.colnames)
    # get transcript metadata
    logging.debug('Reading transcript metadata')
    row_inds, lengths = parse_transcript_table(row_metadata_file, bm.rownames)
    # get total counts per library
#    logging.debug('Getting total fragments per library')
#    lib_sizes = np.empty(len(col_inds), dtype=np.float)
#    for j in xrange(len(col_inds)):
#        a = bm.counts_t[j,:]
#        a = a[np.isfinite(a)]
#        lib_sizes[j] = a.sum()
#    lib_sizes /= 1.0e6
    # normalize
    logging.debug('Normalizing and summarizing counts per transcript')
    header_fields = ['transcript_id']
    header_fields.extend([bm.colnames[x] for x in col_inds])
    print '\t'.join(header_fields)
    for i,lengthkb in zip(row_inds,lengths):
        t_id = bm.rownames[i]
        counts = bm.counts[i, col_inds]
        # ignore nans
        valid_inds = np.isfinite(counts)
        # normalize counts
        a = (counts[valid_inds] / bm.size_factors[valid_inds])
        # log transform
        a = np.log2(a + 1.0)
        # z-score
        #mean = np.mean(a)
        #std = np.std(a)
        #a = (a - mean) / std
        # subtract median and divide by MAD
        med = np.median(a)
        mad = MAD_CONSTANT + (np.median(np.abs(a - med)) * MAD_SCALE_FACTOR)
        a = (a - med) / mad
        # write
        out = np.empty(len(col_inds), dtype=np.float)
        out[:] = np.nan
        out[valid_inds] = a
        fields = [t_id]
        fields.extend(map(str,out))
        print '\t'.join(fields)
        continue
        #a = (counts[valid_inds] / bm.size_factors[valid_inds])
        #a = (counts[valid_inds] / lib_sizes[valid_inds]) / lengthkb
        # center and scale
        med = np.median(a)
        mad = med + (np.median(np.abs(a - med)) * MAD_SCALE_FACTOR)
        #mad = MAD_CONSTANT + (np.median(np.abs(a - med)) * MAD_SCALE_FACTOR)
        a = (a - med) / mad
        # log transform
        a = np.sign(a) * np.log2(np.abs(a)+1.0)
        # normalize by transcript length
        #a -= np.log2(lengthkb)
        # output result
        out = np.empty(len(col_inds), dtype=np.float)
        out[:] = np.nan
        out[valid_inds] = a
        fields = [t_id]
        fields.extend(map(str,out))
        print '\t'.join(fields)
        
        #normcounts = (counts[valid_inds] / lib_sizes[valid_inds]) / length
        #normlogcounts = np.log2(normcounts + 1)
        # subtracting global median and dividing by the median absolute deviation
        #med = np.median(normlogcounts)
        #mad = MAD_CONSTANT + (np.median(np.abs(normlogcounts - med)) * MAD_SCALE_FACTOR)
        #normlogcounts = (normlogcounts - med) / mad
        # output final matrix
        #a = np.empty(len(col_inds), dtype=np.float)
        #a[:] = np.nan
        #a[valid_inds] = normlogcounts
        #fields = [t_id]
        #fields.extend(map(str,a))
        #print '\t'.join(fields)
    bm.close()    
    return 0
Example 18
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--colmeta', dest='col_metadata_file', default=None)
    parser.add_argument('--rowmeta', dest='row_metadata_file', default=None)
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    col_metadata_file = args.col_metadata_file
    row_metadata_file = args.row_metadata_file
    matrix_dir = args.matrix_dir
    if not os.path.exists(matrix_dir):
        parser.error('matrix directory "%s" not found' % (matrix_dir))
    # open matrix
    bm = BigCountMatrix.open(matrix_dir)
    # get pheno data
    logging.debug('Reading library metadata')
    col_inds = parse_library_table(col_metadata_file, bm.colnames)
    # get transcript metadata
    logging.debug('Reading transcript metadata')
    row_inds, lengths = parse_transcript_table(row_metadata_file, bm.rownames)
    # get total counts per library
    #    logging.debug('Getting total fragments per library')
    #    lib_sizes = np.empty(len(col_inds), dtype=np.float)
    #    for j in xrange(len(col_inds)):
    #        a = bm.counts_t[j,:]
    #        a = a[np.isfinite(a)]
    #        lib_sizes[j] = a.sum()
    #    lib_sizes /= 1.0e6
    # normalize
    logging.debug('Normalizing and summarizing counts per transcript')
    header_fields = ['transcript_id']
    header_fields.extend([bm.colnames[x] for x in col_inds])
    print '\t'.join(header_fields)
    for i, lengthkb in zip(row_inds, lengths):
        t_id = bm.rownames[i]
        counts = bm.counts[i, col_inds]
        # ignore nans
        valid_inds = np.isfinite(counts)
        # normalize counts
        a = (counts[valid_inds] / bm.size_factors[valid_inds])
        # log transform
        a = np.log2(a + 1.0)
        # z-score
        #mean = np.mean(a)
        #std = np.std(a)
        #a = (a - mean) / std
        # subtract median and divide by MAD
        med = np.median(a)
        mad = MAD_CONSTANT + (np.median(np.abs(a - med)) * MAD_SCALE_FACTOR)
        a = (a - med) / mad
        # write
        out = np.empty(len(col_inds), dtype=np.float)
        out[:] = np.nan
        out[valid_inds] = a
        fields = [t_id]
        fields.extend(map(str, out))
        print '\t'.join(fields)
        continue
        #a = (counts[valid_inds] / bm.size_factors[valid_inds])
        #a = (counts[valid_inds] / lib_sizes[valid_inds]) / lengthkb
        # center and scale
        med = np.median(a)
        mad = med + (np.median(np.abs(a - med)) * MAD_SCALE_FACTOR)
        #mad = MAD_CONSTANT + (np.median(np.abs(a - med)) * MAD_SCALE_FACTOR)
        a = (a - med) / mad
        # log transform
        a = np.sign(a) * np.log2(np.abs(a) + 1.0)
        # normalize by transcript length
        #a -= np.log2(lengthkb)
        # output result
        out = np.empty(len(col_inds), dtype=np.float)
        out[:] = np.nan
        out[valid_inds] = a
        fields = [t_id]
        fields.extend(map(str, out))
        print '\t'.join(fields)

        #normcounts = (counts[valid_inds] / lib_sizes[valid_inds]) / length
        #normlogcounts = np.log2(normcounts + 1)
        # subtracting global median and dividing by the median absolute deviation
        #med = np.median(normlogcounts)
        #mad = MAD_CONSTANT + (np.median(np.abs(normlogcounts - med)) * MAD_SCALE_FACTOR)
        #normlogcounts = (normlogcounts - med) / mad
        # output final matrix
        #a = np.empty(len(col_inds), dtype=np.float)
        #a[:] = np.nan
        #a[valid_inds] = normlogcounts
        #fields = [t_id]
        #fields.extend(map(str,a))
        #print '\t'.join(fields)
    bm.close()
    return 0
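
Note: both copies of this script center each transcript's log2 values on the median and scale by the median absolute deviation (MAD) rather than by mean and standard deviation, which keeps a handful of extreme libraries from dominating the scaling. A standalone sketch of that transform; MAD_SCALE_FACTOR and MAD_CONSTANT are defined elsewhere in the project, so the values below are only placeholders (1.4826 is the usual consistency factor for normally distributed data, and the small additive constant keeps the denominator away from zero):

import numpy as np

MAD_SCALE_FACTOR = 1.4826   # placeholder value
MAD_CONSTANT = 0.01         # placeholder value

def robust_scores(values):
    a = np.log2(np.asarray(values, dtype=float) + 1.0)
    med = np.median(a)
    mad = MAD_CONSTANT + np.median(np.abs(a - med)) * MAD_SCALE_FACTOR
    return (a - med) / mad

print(robust_scores([3.0, 4.0, 5.0, 400.0]))
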
Example 19
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--fdr', dest='fdr', type=float, default=1.0)
    parser.add_argument('--frac', dest='frac', type=float, default=0.0)
    parser.add_argument('--meta', dest='metadata_file', default=None)
    parser.add_argument("-i", dest="input_paths_file", default=None)
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    # get args
    matrix_dir = args.matrix_dir
    fdr_threshold = args.fdr
    frac_threshold = args.frac
    input_paths_file = args.input_paths_file
    metadata_file = args.metadata_file
    # check args
    bm = BigCountMatrix.open(matrix_dir)
    fdr_threshold = max(0.0, min(fdr_threshold, 1.0))
    frac_threshold = max(0.0, min(frac_threshold, 1.0))
    input_paths = []
    if input_paths_file is None:
        logging.error('No input directories specified (use -i).. Exiting.')
        return 1
    if not os.path.exists(input_paths_file):
        logging.error('Input paths file "%s" not found' % (input_paths_file))
    else:
        with open(input_paths_file) as fileh:
            for line in fileh:
                path = line.strip()
                if path in input_paths:
                    continue
                if check_path(path):
                    input_paths.append(path)
    if len(input_paths) == 0:
        logging.error('No valid SSEA results directories found.. Exiting.')
        return 1
    meta = None
    if metadata_file is not None:
        logging.debug('Parsing transcript metadata')
        meta = {}
        with open(metadata_file) as f:
            meta_header_fields = f.next().strip().split('\t')
            for line in f:
                fields = line.strip().split('\t')
                meta[fields[0]] = fields
        logging.debug('Found metadata for %d transcripts' % (len(meta)))
    else:
        meta = None
        meta_header_fields = ['transcript_id']
    # parse results
    logging.debug('SSEA results: %d' % (len(input_paths)))
    logging.debug('FDR threshold: %f' % (fdr_threshold))
    logging.debug('Frac threshold: %f' % (frac_threshold))
    header_fields = meta_header_fields + ['ss_compname', 'es', 'nes', 'fdr', 'frac', 'prec']
    print '\t'.join(header_fields)
    for input_path in input_paths:
        logging.debug('Parsing path %s' % (input_path))
        results_file = os.path.join(input_path, Config.RESULTS_JSON_FILE)
        # extract data
        ss_compname = os.path.basename(input_path)
        i = 0
        sig = 0
        for res in parse_results(results_file):
            # logging
            i += 1
            if (i % 10000) == 0:
                logging.debug('Parsed %d results' % (i))
            transcript_id = bm.rownames[res.t_id]
            if meta is not None:
                if transcript_id not in meta:
                    continue
            if ((res.ss_fdr_q_value <= fdr_threshold) and 
                (abs(res.ss_frac) >= frac_threshold)):
                if meta is None:
                    fields = [bm.rownames[res.t_id]]
                else:
                    fields = list(meta[transcript_id])
                core_size = res.core_hits + res.core_misses
                if core_size == 0:
                    prec = 0.0
                else:
                    prec = res.core_hits / float(res.core_hits + res.core_misses)
                fields.extend([ss_compname,
                               res.es,
                               res.nes,
                               res.ss_fdr_q_value,
                               res.ss_frac,
                               prec])
                print '\t'.join(map(str, fields))
                sig += 1
        logging.debug('Found %d results for path %s' % (sig, input_path))
    bm.close()
Example 20
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--fdr', dest='fdr', type=float, default=1.0)
    parser.add_argument('-a', '--attr', dest='ssea_attrs', action='append', default=[])
    parser.add_argument("-i", dest="input_paths_file", default=None)
    parser.add_argument('matrix_dir')
    args = parser.parse_args()
    # get args
    fdr_threshold = args.fdr
    matrix_dir = args.matrix_dir
    ssea_attrs = args.ssea_attrs
    input_paths_file = args.input_paths_file
    # check args
    if len(ssea_attrs) == 0:
        parser.error('Please specify one or more attributes using "-a" or "--attr"')
    input_paths = []
    if input_paths_file is None:
        logging.error('No input directories specified (use -i).. Exiting.')
        return 1
    if not os.path.exists(input_paths_file):
        logging.error('Input paths file "%s" not found' % (input_paths_file))
    else:
        with open(input_paths_file) as fileh:
            for line in fileh:
                path = line.strip()
                if path in input_paths:
                    continue
                if check_path(path):
                    input_paths.append(path)
    if len(input_paths) == 0:
        logging.error('No valid SSEA results directories found.. Exiting.')
        return 1
    # parse results
    logging.debug('SSEA results: %d' % (len(input_paths)))
    logging.debug('SSEA attributes: %s' % (','.join(ssea_attrs)))
    logging.debug('SSEA FDR threshold: %f' % fdr_threshold)
    bm = BigCountMatrix.open(matrix_dir)
    dat = collections.defaultdict(lambda: collections.defaultdict(lambda: []))
    for input_path in input_paths:
        logging.debug('Parsing path %s' % (input_path))
        results_file = os.path.join(input_path, Config.RESULTS_JSON_FILE)
        # extract data
        i = 0
        sig = 0
        for res in parse_results(results_file):
            # logging
            i += 1
            if (i % 10000) == 0:
                logging.debug('Parsed %d results' % (i))
            if res.ss_fdr_q_value > fdr_threshold:
                continue
            sig += 1
            transcript_id = bm.rownames[res.t_id]
            for a in ssea_attrs:
                dat[a][transcript_id].append(getattr(res, a))
        logging.debug('Found %d results for path %s (%d significant)' % (i, input_path, sig))
    bm.close()
    # output results
    header_fields = ['transcript_id', 'attr', 'min', 'max', 'absmax', 'mean', 'median']
    print '\t'.join(header_fields)
    for a in ssea_attrs:
        attrdict = dat[a]
        for transcript_id in sorted(attrdict):
            arr = np.array(attrdict[transcript_id])
            fields = [transcript_id, a, np.min(arr), np.max(arr),
                      np.max(np.abs(arr)), np.mean(arr), np.median(arr)]
            print '\t'.join(map(str, fields))
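
Note: Example 20 accumulates attribute values per transcript with a nested defaultdict, so dat[attr][transcript_id].append(...) needs no setup for unseen keys. The idiom in isolation:

import collections

dat = collections.defaultdict(lambda: collections.defaultdict(list))
dat['nes']['T001'].append(2.5)
dat['nes']['T001'].append(-1.1)
print(dict(dat['nes']))   # {'T001': [2.5, -1.1]}
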
Example 21
def ssea_serial(config,
                sample_set,
                output_basename,
                startrow=None,
                endrow=None):
    '''
    main SSEA loop (single processor)
    
    matrix_dir: numpy memmap matrix containing numeric data 
    sample_set: SampleSet object
    config: Config object
    output_basename: prefix for writing result files
    '''
    # initialize random number generator
    rng = RandomState()
    # open data matrix
    bm = BigCountMatrix.open(config.matrix_dir)
    # determine range of matrix to process
    if startrow is None:
        startrow = 0
    if endrow is None:
        endrow = bm.shape[0]
    assert startrow < endrow
    # get membership array for sample set
    membership = sample_set.get_array(bm.colnames)
    valid_samples = (membership >= 0)
    # setup histograms
    hists = _init_hists()
    # setup report file
    unsorted_json_file = output_basename + JSON_UNSORTED_SUFFIX
    outfileh = open(unsorted_json_file, 'wb')
    for i in xrange(startrow, endrow):
        logging.debug("\tRow: %d (%d-%d)" % (i, startrow, endrow))
        # read from memmap
        counts = np.array(bm.counts[i, :], dtype=np.float)
        # remove 'nan' values
        valid_inds = np.logical_and(valid_samples, np.isfinite(counts))
        # subset counts, size_factors, and membership array
        counts = counts[valid_inds]
        size_factors = bm.size_factors[valid_inds]
        valid_membership = membership[valid_inds]
        # write dummy results for invalid rows
        if (valid_inds.sum() == 0) or (np.all(counts == 0)):
            res = Result.default()
        else:
            # run ssea
            res, null_nes_vals = ssea_run(counts, size_factors,
                                          valid_membership, rng, config)
            # update histograms
            null_keys = []
            obs_keys = []
            if res.es < 0:
                null_keys.append('null_nes_neg')
                obs_keys.append('obs_nes_neg')
            elif res.es > 0:
                null_keys.append('null_nes_pos')
                obs_keys.append('obs_nes_pos')
            for k in xrange(len(null_keys)):
                null_nes = np.clip(np.fabs(null_nes_vals), NES_MIN, NES_MAX)
                obs_nes = np.clip(np.fabs(res.nes), NES_MIN, NES_MAX)
                hists[null_keys[k]] += np.histogram(null_nes, NES_BINS)[0]
                hists[obs_keys[k]] += np.histogram(obs_nes, NES_BINS)[0]
        # save t_id
        res.t_id = i
        # convert to json and write
        print >> outfileh, res.to_json()
    # close report file
    outfileh.close()
    # save histograms to a file
    output_hist_file = output_basename + NPY_HISTS_SUFFIX
    np.savez(output_hist_file, **hists)
    # cleanup
    bm.close()
    # sort output json file by abs(NES)
    logging.debug("Worker %s: sorting results" % (output_basename))
    # make tmp dir for sorting
    if os.path.exists(output_basename):
        shutil.rmtree(output_basename)
    os.makedirs(output_basename)
    # call batch sort python function
    sorted_json_file = output_basename + JSON_SORTED_SUFFIX
    batch_sort(input=unsorted_json_file,
               output=sorted_json_file,
               key=_cmp_json_nes,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[output_basename])
    # remove tmp dir
    shutil.rmtree(output_basename)
    # remove unsorted json file
    os.remove(unsorted_json_file)
    logging.debug("Worker %s: done" % (output_basename))
    return 0
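
Note: ssea_serial never keeps the permutation NES values; it folds them into fixed-bin histograms. Because np.histogram is always called with the same bin edges, the per-row counts can simply be added together. A minimal sketch of that accumulation (the bin edges here are illustrative, not the project's NES_BINS):

import numpy as np

bins = np.linspace(0.0, 5.0, 11)            # fixed edges shared by every call
hist = np.zeros(len(bins) - 1, dtype=np.int64)

for _ in range(3):                          # e.g. one batch of |null NES| values per row
    values = np.clip(np.abs(np.random.randn(1000)), bins[0], bins[-1])
    hist += np.histogram(values, bins)[0]

print(hist.sum())                           # 3000: clipping keeps every value inside the bins
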
Example 22
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("matrix_dir")
    parser.add_argument("title")
    parser.add_argument("library_ids")
    parser.add_argument("metadata")
    
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG,
                      format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    
    # open matrix
    bm = BigCountMatrix.open(args.matrix_dir)
    # get pheno data
    logging.debug('Reading library metadata')
    col_inds = parse_library_table(args.library_ids, bm.colnames)
    # get transcript metadata
    logging.debug('Reading transcript metadata')
    row_inds, lengths = parse_transcript_table(args.metadata, bm.rownames)
    
    bm.count_tots /= 1.0e6
    
    mean_dict = collections.defaultdict(lambda: float('NaN'))
    nf_dict = collections.defaultdict(lambda: float('NaN'))
    nn_dict = collections.defaultdict(lambda: float('NaN'))
    mean_dict_f = collections.defaultdict(lambda: float('NaN'))
    nf_dict_f = collections.defaultdict(lambda: float('NaN'))
    nn_dict_f = collections.defaultdict(lambda: float('NaN'))
    
    logging.debug('Calculating expression stats')
    k = 0 
    for i,lengthkb in zip(row_inds,lengths):
        k+=1
        if k%10000 == 0: 
            logging.debug('Finished %d transcripts' % k)
        
        t_id = bm.rownames[i]
        counts = bm.counts[i, col_inds]
        # ignore nans
        valid_inds = np.isfinite(counts)
        # normalize counts
        count_vals = (counts[valid_inds] / bm.size_factors[valid_inds])
        fpkm_vals = (counts[valid_inds] / (lengthkb*bm.count_tots[valid_inds]))
        
        # skip transcripts with no valid (finite) counts
        if count_vals.size == 0:
            continue
        mean = np.mean(count_vals)
        mean_f = np.mean(fpkm_vals)
        nf = np.percentile(count_vals, 95)
        nf_f = np.percentile(fpkm_vals, 95)
        nn = np.percentile(count_vals, 99)
        nn_f = np.percentile(fpkm_vals, 99)
        
        mean_dict[t_id] = mean
        mean_dict_f[t_id] = mean_f
        nf_dict[t_id] = nf
        nf_dict_f[t_id] = nf_f
        nn_dict[t_id] = nn
        nn_dict_f[t_id] = nn_f
    
    logging.debug('Printing output')
    meta_fh = open(args.metadata)
    meta_header = meta_fh.next().strip().split('\t')
    meta_header.append(args.title+'_count_mean')
    meta_header.append(args.title+'_count_95')
    meta_header.append(args.title+'_count_99')
    meta_header.append(args.title+'_fpkm_mean')
    meta_header.append(args.title+'_fpkm_95')
    meta_header.append(args.title+'_fpkm_99')
    print '\t'.join(meta_header)
    for line in meta_fh:
        line = line.strip().split('\t')
        t_id = line[meta_header.index('transcript_id')]
        mean = mean_dict[t_id]
        mean_f = mean_dict_f[t_id]
        nf = nf_dict[t_id]
        nf_f = nf_dict_f[t_id]
        nn = nn_dict[t_id]
        nn_f = nn_dict_f[t_id]
        # transcripts absent from the matrix keep the NaN default from the defaultdicts
        if np.isnan(mean) or np.isnan(mean_f):
            line.extend(['na'] * 6)
        else:
            line.append("%.3f" % mean)
            line.append("%.3f" % nf)
            line.append("%.3f" % nn)
            line.append("%.3f" % mean_f)
            line.append("%.3f" % nf_f)
            line.append("%.3f" % nn_f)
        print '\t'.join(line)
    
    return 0
Example 23
File: algo.py Project: BioXiao/ssea
def ssea_serial(config, sample_set, output_basename, 
                startrow=None, endrow=None):
    '''
    main SSEA loop (single processor)
    
    matrix_dir: numpy memmap matrix containing numeric data 
    sample_set: SampleSet object
    config: Config object
    output_basename: prefix for writing result files
    '''
    # initialize random number generator
    rng = RandomState()
    # open data matrix
    bm = BigCountMatrix.open(config.matrix_dir)
    # determine range of matrix to process
    if startrow is None:
        startrow = 0
    if endrow is None:
        endrow = bm.shape[0]
    assert startrow < endrow
    # get membership array for sample set
    membership = sample_set.get_array(bm.colnames)
    valid_samples = (membership >= 0)
    # setup histograms
    hists = _init_hists()
    # setup report file
    unsorted_json_file = output_basename + JSON_UNSORTED_SUFFIX
    outfileh = open(unsorted_json_file, 'wb')    
    for i in xrange(startrow, endrow):
        logging.debug("\tRow: %d (%d-%d)" % (i, startrow, endrow))
        # read from memmap
        counts = np.array(bm.counts[i,:], dtype=np.float)
        # remove 'nan' values
        valid_inds = np.logical_and(valid_samples, np.isfinite(counts))
        # subset counts, size_factors, and membership array
        counts = counts[valid_inds]
        size_factors = bm.size_factors[valid_inds]
        valid_membership = membership[valid_inds]
        # write dummy results for invalid rows
        if (valid_inds.sum() == 0) or (np.all(counts == 0)):
            res = Result.default()
        else:
            # run ssea
            res, null_nes_vals = ssea_run(counts, size_factors, 
                                          valid_membership, rng, config)
            # update histograms
            null_keys = []
            obs_keys = []
            if res.es < 0:
                null_keys.append('null_nes_neg')
                obs_keys.append('obs_nes_neg')
            elif res.es > 0:
                null_keys.append('null_nes_pos')
                obs_keys.append('obs_nes_pos')
            for k in xrange(len(null_keys)):
                null_nes = np.clip(np.fabs(null_nes_vals), NES_MIN, NES_MAX)
                obs_nes = np.clip(np.fabs(res.nes), NES_MIN, NES_MAX)
                hists[null_keys[k]] += np.histogram(null_nes, NES_BINS)[0]
                hists[obs_keys[k]] += np.histogram(obs_nes, NES_BINS)[0]
        # save t_id
        res.t_id = i
        # convert to json and write
        print >>outfileh, res.to_json()
    # close report file
    outfileh.close()
    # save histograms to a file
    output_hist_file = output_basename + NPY_HISTS_SUFFIX
    np.savez(output_hist_file, **hists)
    # cleanup
    bm.close()
    # sort output json file by abs(NES)
    logging.debug("Worker %s: sorting results" % (output_basename))
    # make tmp dir for sorting
    if os.path.exists(output_basename):
        shutil.rmtree(output_basename)
    os.makedirs(output_basename) 
    # call batch sort python function
    sorted_json_file = output_basename + JSON_SORTED_SUFFIX
    batch_sort(input=unsorted_json_file,
               output=sorted_json_file,
               key=_cmp_json_nes,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[output_basename])
    # remove tmp dir
    shutil.rmtree(output_basename)
    # remove unsorted json file
    os.remove(unsorted_json_file)    
    logging.debug("Worker %s: done" % (output_basename))
    return 0
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("library_ids")
    parser.add_argument("transcript_ids")
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG,
                      format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    if not os.path.exists(EXPRESSION_DIR):
        parser.error("Expression matrix directory '%s' not found" % (args.matrix_dir))
    # read input library ids
    library_ids = read_lines(args.library_ids)
    library_id_set = set(library_ids)
    # read input gene ids
    gene_ids = read_lines(args.transcript_ids)
    gene_id_set = set(gene_ids)

    # load gene expression   
    pheno_file = os.path.join(EXPRESSION_DIR, 'colnames.txt')
    metadata_file = os.path.join(EXPRESSION_DIR, 'rownames.txt')
    
    # find libraries
    ncols, lib_inds, lib_ids = \
        read_table(pheno_file, 
                   subset=library_id_set)
    if len(lib_inds) == 0:
        print "No libraries found"
        return 1
    # find genes
    logging.info('Acquiring data for %s libraries' % len(lib_ids))
    nrows, g_inds, g_ids = \
        read_table(metadata_file, 
                   subset=gene_id_set)
    if len(g_inds) == 0:
        print "No genes found"
        return 1
    logging.info('Acquiring data for %s transcripts' % len(g_ids))
    
    # read gene expression
    bm = BigCountMatrix.open(EXPRESSION_DIR)
    mat = bm.counts
    
    # get subset of matrix
    logging.info("performing gene subset")
    submat = mat[g_inds,:]
    logging.info("performing library subset")
    submat = submat[:,lib_inds]
    
    # write expr file
    output_expr_file = args.output_prefix + ".expr.tsv"
    fileh = open(output_expr_file, 'w')
    fields = ['gene_id']
    fields.extend(lib_ids)
    logging.debug('Printing expression file')
    print >>fileh, '\t'.join(fields)
    for i in xrange(len(g_inds)):
        if i%1000==0: 
            logging.debug("Finished %d/%d transcripts" % (i, len(g_inds)))
        fields = [g_ids[i]]
        for x in submat[i, :]:
            if x<0:
                fields.append('NA')
            else: 
                fields.append(str(x))
        print >>fileh, '\t'.join(fields)
    fileh.close()
    return 0
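
Note: the final main() subsets rows first and columns second. With two integer index lists, mat[g_inds, lib_inds] would pair the indices element-wise rather than take the cross product, so the two-step indexing is what yields the intended rectangular submatrix. A small illustration:

import numpy as np

mat = np.arange(12).reshape(3, 4)
rows, cols = [0, 2], [1, 3]
print(mat[rows, cols])         # element-wise pairs: [ 1 11]
print(mat[rows, :][:, cols])   # 2x2 cross-product subset: [[ 1  3] [ 9 11]]
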