def test_download(my_expression_file, my_gene_ontology_file,
                  my_fly_gene_set_file):
    """Check that the required data files exist and load as expected.

    Each path is printed and must point to an existing file. The expression
    matrix and the gene set collection are additionally parsed and compared
    against fixed reference hashes.
    """
    # --- expression matrix: exists, parses, matches the reference hash
    print(my_expression_file)
    assert os.path.isfile(my_expression_file)
    expression = ExpMatrix.read_tsv(my_expression_file)
    assert isinstance(expression, ExpMatrix)
    expected_matrix_hash = 'aa7cc5e6e04d34e65058f059bcdfe5ea'
    assert expression.hash == expected_matrix_hash

    # --- gene ontology: only the existence of the file is verified
    print(my_gene_ontology_file)
    assert os.path.isfile(my_gene_ontology_file)
    # The content check below is disabled ("hash not stable?"):
    # ontology = GeneOntology.read_obo(my_gene_ontology_file)
    # assert isinstance(ontology, GeneOntology)
    # assert ontology.hash == '978546899cfb0196ac2005d4b177725f'

    # --- gene sets: exists, parses, matches the reference hash
    print(my_fly_gene_set_file)
    assert os.path.isfile(my_fly_gene_set_file)
    collection = GeneSetCollection.read_tsv(my_fly_gene_set_file)
    assert isinstance(collection, GeneSetCollection)
    expected_gene_set_hash = '78b4b27e9658560a8e5993154d3228fa'
    assert collection.hash == expected_gene_set_hash
def plot_read_count_distribution(barcode_count_file, output_file,
                                 xaxis_label=('# mapped reads '
                                              '(log<sub>10</sub>-scale)')):
    """Plot a histogram of the log10-transformed read counts per barcode.

    Parameters
    ----------
    barcode_count_file
        Input handed to `ExpMatrix.read_tsv`; all values of the resulting
        matrix are flattened and used as read counts.
    output_file
        Passed to `plot` as the ``filename`` of the generated figure.
    xaxis_label: str, optional
        Title of the x-axis.

    Notes
    -----
    The total shown in the figure title is computed from the raw counts,
    before values below 1 are raised to 1 for the log transform.
    """
    counts = np.float64(ExpMatrix.read_tsv(barcode_count_file).values.ravel())
    num_total_reads = int(np.sum(counts))

    # raise everything below 1 to 1, so that log10 maps those entries to 0
    counts[counts < 1] = 1
    log_counts = np.log10(counts)

    histogram = go.Histogram(x=log_counts, nbinsx=100)
    layout = go.Layout(
        title='Total number of mapped reads: %d' % num_total_reads,
        font={'size': 20, 'family': 'serif'},
        xaxis={'title': xaxis_label},
        # the y-axis (barcode counts per bin) uses a log scale
        yaxis={'title': '# barcodes', 'type': 'log'},
    )

    figure = go.Figure(data=[histogram], layout=layout)
    plot(figure, filename=output_file, show_link=False, auto_open=False)
def test_tsv(tmpdir, my_matrix):
    """Round-trip an expression matrix through a tab-delimited file.

    The matrix is written to a file inside `tmpdir` and read back; the
    reloaded object must be a distinct instance that compares equal.
    (A byte-level MD5 check of the written file is not performed.)
    """
    tsv_path = tmpdir.join('expression_matrix.tsv').strpath
    my_matrix.write_tsv(tsv_path)

    reloaded = ExpMatrix.read_tsv(tsv_path)
    assert reloaded is not my_matrix
    assert reloaded == my_matrix
def main(args=None):
    """Run GO-PCA and store the result in a `pickle` file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred; 1 if a required parameter is
        missing or the GO-PCA run returned `None`).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        # read arguments from the command line
        parser = get_argument_parser()

        # parse first with default options, in case "--help" is specified
        # ("--help" causes the program to exit at this point)
        args = parser.parse_args()

        # now remove the defaults and parse again
        # (removing the defaults is important so that we know which values
        # were specified by the user)
        no_defaults = dict.fromkeys(GOPCA.get_param_defaults())
        no_defaults.update(dict.fromkeys(GOPCAParams.get_param_defaults()))
        parser.set_defaults(**no_defaults)
        args = parser.parse_args()

    # reporting options
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # test if we can write to log_file?

    # configure root logger
    logger = util.get_logger(log_file=log_file, quiet=quiet)

    # check if required parameters were specified
    # (all missing parameters are reported before giving up)
    passed = True
    if args.expression_file is None:
        logger.error('No expression file specified!')
        passed = False
    if args.gene_set_file is None:
        logger.error('No gene set file specified!')
        passed = False
    if args.output_file is None:
        logger.error('No output file specified!')
        passed = False
    if not passed:
        logger.error('Not all required parameters were specified.')
        return 1

    # generate configuration
    if args.config_file is not None:
        # read parameter values from config file
        params = GOPCAParams.read_ini(args.config_file)
    else:
        # start with default configuration
        params = GOPCAParams()

    # overwrite parameters specified on the command line
    for p in GOPCAParams.get_param_defaults():
        v = getattr(args, p)
        if v is not None:
            logger.debug('Parameter "%s" specified on command line!', p)
            params.set_param(p, v)

    global_params = GOPCA.get_param_defaults()
    for k in list(global_params.keys()):
        v = getattr(args, k)
        if v is not None:
            # report the global parameter itself (`k`), not the stale loop
            # variable left over from the loop above
            logger.debug('Parameter "%s" specified on command line!', k)
            global_params[k] = v

    # read expression file
    matrix = ExpMatrix.read_tsv(args.expression_file)
    logger.info('Expression matrix size: '
                '(p = %d genes) x (n = %d samples).', matrix.p, matrix.n)

    # NOTE(review): this comparison assumes `args.sel_var_genes` is a number
    # at this point -- confirm it can never be None here.
    if args.sel_var_genes > 0:
        # filter genes by variance
        matrix = matrix.filter_variance(args.sel_var_genes)

    # read gene set file
    gene_sets = GeneSetCollection.read_tsv(args.gene_set_file)
    # NOTE(review): looks like leftover debugging output on stdout; consider
    # routing it through `logger` instead.
    print(args.gene_set_file, gene_sets)

    # read ontology file (if supplied)
    gene_ontology = None
    if args.gene_ontology_file is not None:
        # only let errors from the genometools logger through while parsing
        p_logger = logging.getLogger(genometools.__name__)
        p_logger.setLevel(logging.ERROR)
        try:
            gene_ontology = GeneOntology.read_obo(
                args.gene_ontology_file,
                part_of_cc_only=params.go_part_of_cc_only)
        finally:
            # reset the level even if parsing fails, so that the logger does
            # not stay silenced for callers that handle the exception
            p_logger.setLevel(logging.NOTSET)

    M = GOPCA.simple_setup(matrix, params, gene_sets, gene_ontology,
                           verbose=verbose, **global_params)
    run = M.run()

    if run is None:
        logger.error('GO-PCA run failed!')
        return 1

    # write run to pickle file
    logger.info('Storing GO-PCA run in file "%s"...', args.output_file)
    run.write_pickle(args.output_file)

    return 0
def my_matrix_filtered(my_expression_file):
    """Load the expression matrix and run it through `filter_variance`.

    `filter_variance` is called with 8000 as its second argument
    (presumably the number of most variable genes to keep -- TODO confirm).
    """
    return filter_variance(ExpMatrix.read_tsv(my_expression_file), 8000)
def main(args=None):
    """Run GO-PCA and store the result in a `pickle` file.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, the values will be obtained by
        parsing the command line arguments using the `argparse` module.

    Returns
    -------
    int
        Exit code (0 if no error occurred; 1 if a required parameter is
        missing or the GO-PCA run returned `None`).

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    """
    vinfo = sys.version_info
    if not (vinfo >= (2, 7)):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.' % (vinfo.major, vinfo.minor))

    if args is None:
        # read arguments from the command line
        parser = get_argument_parser()

        # parse first with default options, in case "--help" is specified
        # ("--help" causes the program to exit at this point)
        args = parser.parse_args()

        # now remove the defaults and parse again
        # (removing the defaults is important so that we know which values
        # were specified by the user)
        no_defaults = dict.fromkeys(GOPCA.get_param_defaults())
        no_defaults.update(dict.fromkeys(GOPCAParams.get_param_defaults()))
        parser.set_defaults(**no_defaults)
        args = parser.parse_args()

    # reporting options
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # test if we can write to log_file?

    # configure root logger
    logger = util.get_logger(log_file=log_file, quiet=quiet)

    # check if required parameters were specified
    # (all missing parameters are reported before giving up)
    passed = True
    if args.expression_file is None:
        logger.error('No expression file specified!')
        passed = False
    if args.gene_set_file is None:
        logger.error('No gene set file specified!')
        passed = False
    if args.output_file is None:
        logger.error('No output file specified!')
        passed = False
    if not passed:
        logger.error('Not all required parameters were specified.')
        return 1

    # generate configuration
    if args.config_file is not None:
        # read parameter values from config file
        params = GOPCAParams.read_ini(args.config_file)
    else:
        # start with default configuration
        params = GOPCAParams()

    # overwrite parameters specified on the command line
    for p in GOPCAParams.get_param_defaults():
        v = getattr(args, p)
        if v is not None:
            logger.debug('Parameter "%s" specified on command line!', p)
            params.set_param(p, v)

    global_params = GOPCA.get_param_defaults()
    for k in list(global_params.keys()):
        v = getattr(args, k)
        if v is not None:
            # report the global parameter itself (`k`), not the stale loop
            # variable left over from the loop above
            logger.debug('Parameter "%s" specified on command line!', k)
            global_params[k] = v

    # read expression file
    matrix = ExpMatrix.read_tsv(args.expression_file)
    logger.info('Expression matrix size: '
                '(p = %d genes) x (n = %d samples).', matrix.p, matrix.n)

    # NOTE(review): this comparison assumes `args.sel_var_genes` is a number
    # at this point -- confirm it can never be None here.
    if args.sel_var_genes > 0:
        # filter genes by variance
        matrix = matrix.filter_variance(args.sel_var_genes)

    # read gene set file
    gene_sets = GeneSetCollection.read_tsv(args.gene_set_file)
    # NOTE(review): looks like leftover debugging output on stdout; consider
    # routing it through `logger` instead.
    print(args.gene_set_file, gene_sets)

    # read ontology file (if supplied)
    gene_ontology = None
    if args.gene_ontology_file is not None:
        # only let errors from the genometools logger through while parsing
        p_logger = logging.getLogger(genometools.__name__)
        p_logger.setLevel(logging.ERROR)
        try:
            gene_ontology = GeneOntology.read_obo(
                args.gene_ontology_file,
                part_of_cc_only=params.go_part_of_cc_only)
        finally:
            # reset the level even if parsing fails, so that the logger does
            # not stay silenced for callers that handle the exception
            p_logger.setLevel(logging.NOTSET)

    M = GOPCA.simple_setup(matrix, params, gene_sets, gene_ontology,
                           verbose=verbose, **global_params)
    run = M.run()

    if run is None:
        logger.error('GO-PCA run failed!')
        return 1

    # write run to pickle file
    logger.info('Storing GO-PCA run in file "%s"...', args.output_file)
    run.write_pickle(args.output_file)

    return 0
def main(args=None):
    """Translate the Entrez IDs of an expression matrix into gene names.

    The matrix in ``args.expression_file`` is read and each of its gene
    labels is looked up in the mapping loaded from
    ``args.entrez2gene_file``. Rows without a mapping are dropped, and the
    remaining rows are restricted to genes contained in the object read from
    ``args.gene_file``. The result is written to ``args.output_file``.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    AssertionError
        If the (optionally shortened) Entrez IDs, or the gene names they map
        to, are not unique.
    """
    version = sys.version_info
    if version < (2, 7):
        raise SystemError(
            "Python interpreter version >= 2.7 required, "
            "found %d.%d instead." % (version.major, version.minor)
        )

    if args is None:
        args = get_argument_parser().parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file
    strip_affy_suffix = args.strip_affy_suffix
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # load the gene table, the expression matrix and the ID mapping
    genome = ExpGenome.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez_ids = matrix.genes
    if strip_affy_suffix:
        # drop the last three characters of every ID (the "_at" suffix)
        entrez_ids = [entrez_id[:-3] for entrez_id in entrez_ids]
    logger.debug(str(entrez_ids[:3]))

    # the Entrez IDs have to be unique
    assert len(set(entrez_ids)) == len(entrez_ids)

    # map Entrez IDs to gene names, counting the IDs that have no mapping
    num_unmapped = 0
    genes = []
    rows = []
    for row_index, entrez_id in enumerate(entrez_ids):
        if entrez_id not in e2g:
            num_unmapped += 1
            continue
        genes.append(e2g[entrez_id])
        rows.append(matrix.X[row_index, :])

    # no two Entrez IDs may point to the same gene
    assert len(set(genes)) == len(genes)

    if num_unmapped > 0:
        logger.warning(
            "Failed to convert %d / %d entrez IDs "
            "to gene symbols (%.1f%%).",
            num_unmapped,
            matrix.p,
            100 * (num_unmapped / float(matrix.p)),
        )

    # keep only the genes that are contained in the gene table
    X = np.float64(rows)
    num_converted = X.shape[0]
    logger.debug(str(X.shape))
    is_known = np.array(
        [genes[i] in genome for i in range(num_converted)], dtype=np.bool_
    )
    sel = np.nonzero(is_known)[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]

    num_unknown = num_converted - sel.size
    if num_unknown > 0:
        logger.warning(
            "Failed to find %d / %d gene symbols in list of "
            "protein-coding genes (%.1f%%)",
            num_unknown,
            num_converted,
            100 * (num_unknown / float(num_converted)),
        )

    # build the converted matrix (per the original author's note, this
    # automatically sorts the genes alphabetically)
    logger.debug(
        "Genes: %d, Samples: %d, matrix: %s",
        len(genes),
        len(matrix.samples),
        str(X.shape),
    )
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0
def main(args=None):
    """Translate the Entrez IDs of an expression matrix into gene names.

    The matrix in ``args.expression_file`` is read and each of its gene
    labels is looked up in the mapping loaded from
    ``args.entrez2gene_file``. Rows without a mapping are dropped, and the
    remaining rows are restricted to genes contained in the gene table read
    from ``args.gene_file``. The result is written to ``args.output_file``.

    Parameters
    ----------
    args: argparse.Namespace object, optional
        The argument values. If not specified, they are obtained by parsing
        the command line with the parser from `get_argument_parser`.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    SystemError
        If the version of the Python interpreter is not >= 2.7.
    AssertionError
        If the (optionally shortened) Entrez IDs, or the gene names they map
        to, are not unique.
    """
    version = sys.version_info
    if version < (2, 7):
        raise SystemError('Python interpreter version >= 2.7 required, '
                          'found %d.%d instead.'
                          % (version.major, version.minor))

    if args is None:
        args = get_argument_parser().parse_args()

    expression_file = args.expression_file
    entrez2gene_file = args.entrez2gene_file
    gene_file = args.gene_file
    output_file = args.output_file
    strip_affy_suffix = args.strip_affy_suffix
    log_file = args.log_file
    quiet = args.quiet
    verbose = args.verbose

    # configure root logger
    logger = misc.get_logger(log_file=log_file, quiet=quiet, verbose=verbose)

    # load the gene table, the expression matrix and the ID mapping
    genome = ExpGeneTable.read_tsv(gene_file)
    matrix = ExpMatrix.read_tsv(expression_file)
    e2g = dict(misc.read_all(entrez2gene_file))

    entrez_ids = matrix.genes
    if strip_affy_suffix:
        # drop the last three characters of every ID (the "_at" suffix)
        entrez_ids = [entrez_id[:-3] for entrez_id in entrez_ids]
    logger.debug(str(entrez_ids[:3]))

    # the Entrez IDs have to be unique
    assert len(set(entrez_ids)) == len(entrez_ids)

    # map Entrez IDs to gene names, counting the IDs that have no mapping
    num_unmapped = 0
    genes = []
    rows = []
    for row_index, entrez_id in enumerate(entrez_ids):
        if entrez_id not in e2g:
            num_unmapped += 1
            continue
        genes.append(e2g[entrez_id])
        rows.append(matrix.X[row_index, :])

    # no two Entrez IDs may point to the same gene
    assert len(set(genes)) == len(genes)

    if num_unmapped > 0:
        logger.warning(
            'Failed to convert %d / %d entrez IDs '
            'to gene symbols (%.1f%%).',
            num_unmapped, matrix.p, 100 * (num_unmapped / float(matrix.p)))

    # keep only the genes that are contained in the gene table
    X = np.float64(rows)
    num_converted = X.shape[0]
    logger.debug(str(X.shape))
    is_known = np.array([genes[i] in genome for i in range(num_converted)],
                        dtype=np.bool_)
    sel = np.nonzero(is_known)[0]
    genes = [genes[i] for i in sel]
    X = X[sel, :]

    num_unknown = num_converted - sel.size
    if num_unknown > 0:
        logger.warning(
            'Failed to find %d / %d gene symbols in list of '
            'protein-coding genes (%.1f%%)',
            num_unknown, num_converted,
            100 * (num_unknown / float(num_converted)))

    # build the converted matrix (per the original author's note, this
    # automatically sorts the genes alphabetically)
    logger.debug('Genes: %d, Samples: %d, matrix: %s',
                 len(genes), len(matrix.samples), str(X.shape))
    matrix_conv = ExpMatrix(genes=genes, samples=matrix.samples, X=X)

    # write output file
    matrix_conv.write_tsv(output_file)

    return 0