def test_load_find_library_analysis_file(self):
    mm10tsv = resource_filename(__name__, 'library-mm10-se.tsv')
    mm10 = models.load_library_tables([mm10tsv])
    cwd_files = list(models.find_library_analysis_file(mm10, '*.coverage'))
    self.assertGreaterEqual(len(cwd_files), 1)
    for f in cwd_files:
        self.assertTrue(isinstance(f, models.AnalysisFile))

    with TemporaryDirectory() as analysis_dir:
        with chdir(analysis_dir):
            mm10tsv = resource_filename(__name__, 'library-mm10-se.tsv')
            tmpname = os.path.join(analysis_dir, 'library-mm10-se.tsv')
            shutil.copy(mm10tsv, tmpname)
            analysis_root = os.path.dirname(mm10tsv)
            mm10 = models.load_library_tables([tmpname],
                                              analysis_root=analysis_root)
            abs_files = list(models.find_library_analysis_file(mm10, '*.coverage'))
            self.assertGreaterEqual(len(abs_files), 1)
            for f in abs_files:
                self.assertTrue(isinstance(f, models.AnalysisFile))

    self.assertEqual(len(cwd_files), len(abs_files))
    self.assertEqual(cwd_files[0].filename, abs_files[0].filename)
def test_load_library(self):
    mm10tsv = resource_filename(__name__, 'library-mm10-se.tsv')
    hg38tsv = resource_filename(__name__, 'library-hg38-se.tsv')
    mm10 = models.load_library_tables([mm10tsv])
    self.assertEqual(len(mm10), count_valid_records(mm10tsv))
    hg38 = models.load_library_tables([hg38tsv])
    both = models.load_library_tables([mm10tsv, hg38tsv])
    self.assertEqual(len(mm10) + len(hg38), len(both))
def test_load_library(self): mm10tsv = resource_filename(__name__, "library-mm10-se.tsv") hg38tsv = resource_filename(__name__, "library-hg38-se.tsv") mm10 = models.load_library_tables([mm10tsv]) self.assertEqual(len(mm10), count_valid_records(mm10tsv)) hg38 = models.load_library_tables([hg38tsv]) both = models.load_library_tables([mm10tsv, hg38tsv]) self.assertEqual(len(mm10) + len(hg38), len(both))
def test_load_library_analysis_root(self):
    with TemporaryDirectory() as analysis_dir:
        with chdir(analysis_dir):
            mm10tsv = resource_filename(__name__, 'library-mm10-se.tsv')
            tmpname = os.path.join(analysis_dir, 'library-mm10-se.tsv')
            shutil.copy(mm10tsv, tmpname)
            analysis_root = os.path.dirname(mm10tsv)
            mm10 = models.load_library_tables([mm10tsv])
            mm10tmp = models.load_library_tables([tmpname],
                                                 analysis_root=analysis_root)
            for i in mm10['analysis_dir'].index:
                self.assertEqual(mm10['analysis_dir'][i],
                                 mm10tmp['analysis_dir'][i])
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)
    sep = get_seperator(args.sep)

    if not validate_library_file_existance(args):
        parser.error('Fix incorrect library file names')

    library_filenames = args.libraries
    if len(library_filenames) == 0:
        parser.error('Need library information table')

    libraries = load_library_tables(library_filenames, sep)

    custom_tracks = []
    for library_id, library in libraries.iterrows():
        if args.bigwig:
            custom_tracks.extend(
                make_bigwig_custom_tracks(library, args.web_root, args.root))
        if args.bam:
            custom_tracks.append(
                make_bam_custom_track(library, args.web_root, args.root))

    print(os.linesep.join(custom_tracks))
def test_load_all_star_counts(self):
    mm10tsv = resource_filename(__name__, 'library-mm10-se.tsv')
    mm10 = models.load_library_tables([mm10tsv])
    scores = models.load_all_star_counts(mm10, '+')
    self.assertEqual(scores.shape, (11, 2))
    self.assertEqual(scores.index.name, 'gene_id')
    self.assertEqual(list(scores.columns), ['12304', '12305'])
def main(cmdline=None):
    parser = ArgumentParser()
    parser.add_argument('-l', '--library', required=True, action='append',
                        help="library table to load")
    parser.add_argument('-o', '--output',
                        help='filename to write report to')
    args = parser.parse_args(cmdline)

    libraries = load_library_tables(args.library)

    metrics = []
    for library_id, library in libraries.iterrows():
        genome_triple = genome_name_from_library(library)
        filename = library.analysis_name + '-' + genome_triple + '_picard_markdup.metrics'
        pathname = Path(library.analysis_dir) / filename
        if pathname.exists():
            picard_metric = parse_picard_metric(pathname, library_id=library_id)
            metrics.append(picard_metric)
        else:
            print('{} is missing. Skipping'.format(pathname))

    metrics = pandas.DataFrame(metrics)
    metrics.set_index('LIBRARY', inplace=True)
    if args.output:
        metrics.to_csv(args.output, sep='\t')
    else:
        print(metrics)
def main(cmdline=None): parser = make_parser() args = parser.parse_args(cmdline) configure_logging(args) sep = get_seperator(args.sep) experiments = models.load_experiments(args.experiments, sep=sep) libraries = models.load_library_tables(args.libraries, sep=sep) output_sep = get_seperator(args.output_format) output_extension = {"TAB": ".tsv", ",": ".csv"}[args.output_format] if args.transcriptome: # isoforms load_quantifications = madqc.load_transcriptome_quantifications quantification_extension = "_isoform_" + args.quantification + output_extension else: # genes load_quantifications = madqc.load_genomic_quantifications quantification_extension = "_gene_" + args.quantification + output_extension for name in experiments: filename = name + quantification_extension replicates = experiments[name] logger.info("%s %s: %s", name, args.quantification, ",".join(replicates)) quantifications = load_quantifications(replicates, libraries, args.quantification) quantifications.to_csv(filename, sep=output_sep)
def test_read_line_from_stream(self):
    mm10tsv = resource_filename(__name__, 'library-mm10-se.tsv')
    with open(mm10tsv) as instream:
        lines = list(models.read_line_from_stream(instream))
    mm10 = models.load_library_tables([mm10tsv])
    # add one to the mm10 dataframe length because the header is not counted in len()
    self.assertEqual(len(lines), len(mm10) + 1)
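# A hedged sketch of the generator exercised by the test above; the real
# models.read_line_from_stream is defined elsewhere. The test only requires
# that it yield one entry per line of the stream, header included.
def read_line_from_stream(stream):
    for line in stream:
        # strip the trailing newline so callers see bare records
        yield line.rstrip('\n')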
def create_quantification_cache(
        library_table, experiment_name, replicates, quantification_name,
        sep='\t'):
    score_filename = models.make_correlation_filename(experiment_name)
    quant_filename = models.make_quantification_filename(experiment_name,
                                                         quantification_name)

    libraries = models.load_library_tables([library_table], sep=sep)
    quantifications = load_genomic_quantifications(
        replicates, libraries, quantification_name)

    if os.path.exists(quant_filename):
        os.unlink(quant_filename)
    store = pandas.HDFStore(quant_filename, complevel=9, complib='blosc')
    store.append('quantifications', quantifications)
    store.close()

    scores = compute_all_vs_all_scores(quantifications)
    if os.path.exists(score_filename):
        os.unlink(score_filename)
    store = pandas.HDFStore(score_filename)
    for key in scores:
        store.append(key, scores[key])
    store.close()
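# A minimal sketch of reading the caches written above back into memory,
# assuming only pandas' standard HDF5 API and the filename helpers already
# used in create_quantification_cache. The function name is hypothetical.
def load_quantification_cache(experiment_name, quantification_name):
    quant_filename = models.make_quantification_filename(experiment_name,
                                                         quantification_name)
    score_filename = models.make_correlation_filename(experiment_name)
    # read_hdf opens the store, fetches the named table, and closes it again
    quantifications = pandas.read_hdf(quant_filename, 'quantifications')
    with pandas.HDFStore(score_filename) as store:
        # one table per score matrix, keyed as they were appended above
        scores = {key.lstrip('/'): store[key] for key in store.keys()}
    return quantifications, scores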
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARN)
    sep = get_seperator(args.sep)

    if args.experiments:
        experiments = models.load_experiments(args.experiments, sep=sep,
                                              analysis_root=args.root)
    else:
        if args.experiment_name is None:
            parser.error(
                'Please provide an experiment name. (Used as filename)')
        if len(args.replicates) == 0:
            parser.error(
                'Please provide list of replicates or experiment table')
        # build a one-row frame so both branches support .iterrows() below;
        # a plain dict would break the loop. This assumes load_experiments
        # also returns a DataFrame with a 'replicates' column.
        experiments = pandas.DataFrame(
            {'replicates': [args.replicates]},
            index=[args.experiment_name])

    if args.libraries is None:
        parser.error('Please provide library information tables')
    libraries = models.load_library_tables(args.libraries, sep=sep)

    for i, experiment in experiments.iterrows():
        logging.info('Processing: %s', experiment.name)
        create_quantification_cache(
            experiment, libraries, args.quantification, args.model, sep)
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)
    if args.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.ERROR)

    experiments = models.load_experiments(args.experiments)
    libraries = models.load_library_tables(args.libraries)
    coverage = models.load_all_coverage(libraries)

    if args.all_experiments:
        make_combined_median_normalized_summary(
            experiments, coverage, args.output_format, args.bare)
    elif args.experiment_median_summary:
        make_per_experiment_median_normalized_summary(
            experiments, coverage, args.output_format, args.bare)
    elif args.by_experiment:
        make_by_experiment_median_summary(
            experiments, coverage, args.output_format, args.bare)
    elif args.combined_median_summary:
        make_combined_experiment_median_summary(
            experiments, coverage, args.output_format, args.bare)
    else:
        make_experiment_by_library_coverage_plots(
            experiments, coverage, args.output_format, args.bare)
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)

    if not validate_path_args(args):
        parser.error('Please set required parameters')

    if not (validate_library_file_existance(args) and
            validate_experiment_file_existance(args)):
        parser.error('Fix path to files')

    sep = get_seperator(args.sep)
    library_filenames = args.libraries
    library_filenames.extend(args.other_libraries)
    libraries = models.load_library_tables(library_filenames, sep)
    read1 = dict(find_fastqs(libraries, 'read_1'))
    if 'read_2' in libraries.columns:
        read2 = dict(find_fastqs(libraries, 'read_2'))
    else:
        read2 = {}

    dags = generate_star_rsem_analysis(args, libraries, read1, read2)
    generate_combined_analysis(args, dags)

    return 0
def test_reference_prefix(self):
    spurtsv = resource_filename(__name__, 'library-spur-se.tsv')
    spur = models.load_library_tables([spurtsv])
    self.assertEqual(make_dag.get_reference_prefix(spur, '12304'), 'scaffold')
    self.assertEqual(make_dag.get_reference_prefix(spur, '12307'), 'chr')
def main(cmdline=None):
    parser = ArgumentParser()
    parser.add_argument('-n', '--experiment-name', required=True,
                        help='Experiment name to select')
    add_metadata_arguments(parser)
    add_debug_arguments(parser)
    args = parser.parse_args(cmdline)
    configure_logging(args)

    header_printed = False
    libraries = load_library_tables(args.libraries)
    experiments = load_experiments(args.experiments)
    replicates = experiments.loc[args.experiment_name, 'replicates']
    for i, (library_id, library) in enumerate(libraries.loc[replicates].iterrows()):
        filename = find_library_bam_file(library)
        LOGGER.info(' Reading %s %d/%d', filename, i + 1, len(replicates))

        mode = get_mode(filename, 'r')
        with pysam.AlignmentFile(filename, mode) as alignment:
            if not header_printed:
                print(str(alignment.header))
                header_printed = True
            for read in alignment:
                print(read.to_string())
def load_asof_run17_libraries():
    library_files = [os.path.expanduser(x.strip())
                     for x in ASOF_RUN17_library_files.split('\n')]
    libraries = models.load_library_tables(library_files)
    name = libraries.index.name
    libraries.index = [x.replace('_mm10', '').replace('_clean', '')
                       for x in libraries.index]
    libraries.index.name = name
    return libraries
def load_asof_run17_libraries():
    library_files = list(split_files_text(ASOF_RUN17_library_files))
    libraries = models.load_library_tables(library_files)
    name = libraries.index.name
    libraries.index = [sanitize_library_name(x) for x in libraries.index]
    libraries.index.name = name
    return libraries
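# Hedged sketches of the two helpers used above. The real split_files_text
# and sanitize_library_name are defined elsewhere; these are assumptions,
# reconstructed from the inline logic of the earlier version of
# load_asof_run17_libraries.
def split_files_text(files_text):
    # yield one expanded path per non-blank line of the embedded file list
    for line in files_text.split('\n'):
        line = line.strip()
        if line:
            yield os.path.expanduser(line)


def sanitize_library_name(library_id):
    # drop the genome and cleaning suffixes so ids match across tables
    return library_id.replace('_mm10', '').replace('_clean', '')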
def setUp(self):
    self.mm10tsv = resource_filename(__name__, 'library-mm10-se.tsv')
    self.mm10 = models.load_library_tables([self.mm10tsv])
    self.female = pandas.DataFrame()
    self.female.index.name = 'female'
    self.male = pandas.DataFrame()
    self.male.index.name = 'male'
def load_920cell_library_table():
    clusters = pandas.DataFrame(
        find_bigwigs.read_peng_20180710_cluster_memberships())
    asof_run17 = generate_combined_transcript_C1.ASOF_RUN17_library_files.split('\n')
    libraries = [os.path.expanduser(x.strip()) for x in asof_run17]
    library_df = load_library_tables(libraries)
    library_df = library_df.reindex(clusters['cell_id'])
    return library_df
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    experiments = load_experiments(args.experiments)
    libraries = load_library_tables(args.libraries)

    plot = MeanGeneCoverage(experiments, libraries)
    plot.use_experiment(args.use_experiment)
    return plot
def build_hash_tree(library_filename):
    table = load_library_tables([library_filename])
    hashes = {}
    for library_id, row in table.iterrows():
        analysis_dir = row.analysis_dir
        name = row.analysis_name + '-' + genome_name_from_library(row) + '_genome.bam'
        alignment = os.path.join(analysis_dir, name)
        hashes[library_id] = hash_alignments(alignment)
    return hashes
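# A minimal sketch of comparing two of the hash trees built above, for
# example to check that a rerun produced identical alignments. The function
# name is hypothetical; only build_hash_tree from this file is assumed.
def diff_hash_trees(old_filename, new_filename):
    old = build_hash_tree(old_filename)
    new = build_hash_tree(new_filename)
    # report library ids whose alignment hashes disagree or are missing
    changed = {library_id
               for library_id in set(old) | set(new)
               if old.get(library_id) != new.get(library_id)}
    return sorted(changed)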
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    libraries = []
    if args.libraries:
        libraries = load_library_tables(args.libraries, analysis_root=args.root)
        LOGGER.info("loaded %d libraries", len(libraries))

    if len(libraries) == 0 and len(args.gene_list) == 0:
        parser.error('Please specify libraries to process')

    with open(args.gtf, 'rt') as stream:
        gene_types = readGeneTypes(stream)
    LOGGER.info("Loaded %d gene types", len(gene_types))

    coverage_by_type = {}
    counts_by_type = {}
    for gene_coverage_table in load_all_gene_coverage(
            libraries, args.gene_list, args.gene_normalization):
        coverage, counts = sum_gene_coverage_by_type(gene_types,
                                                     gene_coverage_table)
        coverage_by_type[coverage.name] = coverage
        counts_by_type[coverage.name] = counts

    LOGGER.info('Preparing plot class')
    plot = GeneCoverageDetail(coverage_by_type, counts_by_type,
                              args.gene_normalization)

    if args.save:
        for library_id in plot:
            # avoid names that cause problems for file systems
            assert not library_id.startswith('..')
            assert '/' not in library_id
            assert '\\' not in library_id
            filename = '{}_gene_coverage_detail.html'.format(library_id)
            pathname = os.path.join(args.output_dir, filename)
            LOGGER.info("Saving plot for %s to %s", library_id, pathname)
            save(
                plot.make_plot(library_id),
                pathname,
                resources=resources.CDN,
                title=library_id,
            )

    return plot
def main(cmdline=None):
    parser = ArgumentParser()
    parser.add_argument('-o', '--output', help='output directory')
    parser.add_argument('--mode', default=None, choices=[
        'customtrack',
        'trackhub',
        'merge_paper_wiggles',
        'paper_median_coverage',
        'check_bedgraphs',
        'localize_tsvs',
        'paper_as_single_experiment_tsv',
        'paper_as_cluster_experiment_tsv',
    ])
    args = parser.parse_args(cmdline)

    experiment_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_experiment_files.split()
    ]
    library_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_library_files.split()
    ]
    experiments = models.load_experiments(experiment_files)
    libraries = models.load_library_tables(library_files)
    to_include = read_peng_20180710_cluster_memberships()
    #print('{} cells to include'.format(len(to_include)))

    if args.mode == 'customtrack':
        make_custom_tracks()
    elif args.mode == 'trackhub':
        make_trackhub()
    elif args.mode == 'merge_paper_wiggles':
        merge_paper_wiggles(to_include, libraries)
    elif args.mode == 'paper_median_coverage':
        make_paper_median_coverage(to_include, libraries, args.output)
    elif args.mode == 'check_bedgraphs':
        check_bedgraphs(to_include, libraries)
    elif args.mode == 'localize_tsvs':
        localize_tsvs(experiments, libraries, args.output)
    elif args.mode == 'paper_as_single_experiment_tsv':
        paper920_as_single_experiment_tsv(to_include, args.output)
    elif args.mode == 'paper_as_cluster_experiment_tsv':
        paper920_as_cluster_experiment_tsv(to_include, args.output)
    else:
        parser.error('Did you want to pick an operation mode?')
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)

    experiments = load_experiments(args.experiments)
    libraries = load_library_tables(args.libraries)

    if args.use_experiment:
        try:
            experiments = experiments.loc[[args.use_experiment]]
        except KeyError:
            print('{} was not found in {}'.format(
                args.use_experiment, ', '.join(list(experiments.index))))
            return None

    plot = DistributionPlot(experiments, libraries)
    return plot
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)
    sep = get_seperator(args.sep)

    experiments = models.load_experiments(args.experiments, sep=sep)
    libraries = models.load_library_tables(args.libraries, sep=sep)

    output_sep = get_seperator(args.output_format)
    output_extension = {
        'TAB': '.tsv',
        ',': '.csv',
    }[args.output_format]

    if args.add_names:
        if args.gtf_cache is None:
            parser.error('GTF-cache is needed to add names to the quantification file')
        else:
            logger.info('Loading GTF Cache %s', args.gtf_cache)
            annotation = models.load_gtf_cache(args.gtf_cache)
    else:
        annotation = None

    if args.transcriptome:
        # isoforms
        load_quantifications = madqc.load_transcriptome_quantifications
        lookup_ids = models.lookup_gene_name_by_transcript_id
        quantification_extension = '_isoform_' + args.quantification + output_extension
    else:
        # genes
        load_quantifications = madqc.load_genomic_quantifications
        lookup_ids = models.lookup_gene_name_by_gene_id
        quantification_extension = '_gene_' + args.quantification + output_extension

    for name in experiments:
        filename = name + quantification_extension
        replicates = experiments[name]
        logger.info("%s %s: %s", name, args.quantification, ','.join(replicates))
        quantifications = load_quantifications(
            replicates, libraries, args.quantification)
        if annotation is not None:
            quantifications = lookup_ids(annotation, quantifications)
        quantifications.to_csv(filename, sep=output_sep)
def load_filtered_transcripts():
    sep = '\t'
    cache_file = os.path.expanduser(
        '~sau/genomes/mm10-M4-male/mm10-M4-male.h5')
    #annotation = models.load_gtf_cache(cache_file)
    annotation = None
    loader = IsoformRsemLoader('FPKM', annotation)
    index_name = 'transcript_id'
    # loader = GeneRsemLoader(args.quantification, annotation)
    #index_name = 'gene_id'
    to_include = generate_to_include_asof_run17()[1:]

    experiment_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_experiment_files.split()
    ]
    library_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_library_files.split()
    ]

    quantifications = []
    for e, l in zip(experiment_files, library_files):
        print('loading', e)
        experiments = models.load_experiments([e], sep=sep)
        libraries = models.load_library_tables([l], sep=sep)
        for i, experiment in experiments.iterrows():
            print(experiment)
            quantification = loader.load(experiment, libraries)
            quantification.columns = list(
                filter_columns(quantification.columns))
            quantifications.append(quantification)

    sheets = pandas.concat(quantifications, axis=1)
    print('all', sheets.shape)
    # sheets.to_csv('C1_mouse_combined_transcript_asof_run17_unfiltred.tsv', sep='\t')
    # was crashing because of _mm10 suffix
    filtered = sheets[to_include]
    print('filtered', filtered.shape)
    return filtered
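# A hedged sketch of the filter_columns helper used above; the real one is
# defined elsewhere. The "_mm10 suffix" comment and the suffix stripping in
# load_asof_run17_libraries suggest it normalizes column names like this.
def filter_columns(columns):
    for column in columns:
        yield column.replace('_mm10', '').replace('_clean', '')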
def main(cmdline=None):
    parser = ArgumentParser()
    parser.add_argument('-o', '--output-dir')
    args = parser.parse_args(cmdline)

    experiment_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_experiment_files.split()
    ]
    library_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_library_files.split()
    ]
    experiments = load_experiments(experiment_files)
    libraries = load_library_tables(library_files)

    #link_rsem(libraries, args.output_dir)
    link_genome_bams(libraries, args.output_dir)
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)

    if not validate_args(args):
        parser.error("Please set required parameters")

    sep = get_seperator(args.sep)
    libraries = models.load_library_tables(args.libraries, sep)
    read1 = dict(find_fastqs(libraries, "read_1"))
    if "read_2" in libraries.columns:
        read2 = dict(find_fastqs(libraries, "read_2"))
    else:
        read2 = {}

    dag = generate_star_rsem_analysis(args, libraries, read1, read2)
    print(dag)

    return 0
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)

    experiments = load_experiments(args.experiments)
    libraries = load_library_tables(args.libraries)

    if args.use_experiment:
        try:
            experiments = experiments.loc[[args.use_experiment]]
        except KeyError:
            logger.error('{} was not found in {}'.format(
                args.use_experiment, ', '.join(list(experiments.index))))
            return None

    if len(args.gene_type_filter) > 0:
        logger.info('Limiting to the following gene types {}'.format(
            ','.join(args.gene_type_filter)))
    else:
        logger.info('Using all gene types')

    # ids will be None if args.gene_list_filter is None
    ids = load_gene_id_list(args.gene_list_filter)

    plot = GenesDetectedPlot(
        experiments,
        libraries,
        args.genome_dir,
        args.quantification,
        gene_type_filter=args.gene_type_filter,
        gene_list_filter=ids,
    )

    # only save a static HTML copy when this module is executed directly as
    # a script; when imported, the caller just gets the plot object back
    if __name__ == '__main__':
        curdoc().add_root(plot.static_layout())
        save(curdoc(), args.output, title=plot.title)

    return plot
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)
    sep = get_seperator(args.sep)

    experiments = models.load_experiments(args.experiments, sep=sep)
    libraries = models.load_library_tables(args.libraries, sep=sep)

    gtf_cache = None
    if args.add_names:
        if args.genome_dir is None:
            parser.error(
                'genome-dir is needed to add names to the quantification file')
        else:
            gtf_cache = GTFCache(libraries, args.genome_dir)

    if len(args.quantification) > 0:
        quantification_list = args.quantification
    else:
        quantification_list = ['FPKM']

    if args.transcriptome:
        # isoforms
        RsemLoader = IsoformRsemLoader
    else:
        # genes
        RsemLoader = GeneRsemLoader

    for quantification in quantification_list:
        logger.info('Building expression matrix for %s', quantification)
        for i, experiment in experiments.iterrows():
            loader = RsemLoader(quantification, gtf_cache)
            matrix = loader.load(experiment, libraries)
            loader.save(matrix, args.output_format)
def main(cmdline=None):
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)
    sep = get_seperator(args.sep)

    experiments = models.load_experiments(args.experiments, sep=sep)
    libraries = models.load_library_tables(args.libraries, sep=sep)

    if args.add_names:
        if args.gtf_cache is None:
            parser.error('GTF-cache is needed to add names to the quantification file')
        else:
            logger.info('Loading GTF Cache %s', args.gtf_cache)
            annotation = models.load_gtf_cache(args.gtf_cache)
    else:
        annotation = None

    loader = StarLoader(args.strand, annotation)
    for i, experiment in experiments.iterrows():
        quantification = loader.load(experiment, libraries)
        loader.save(quantification, args.output_format)
def test_load_stranded_library(self):
    mm10tsv = resource_filename(__name__, 'library-mm10-stranded.tsv')
    mm10 = models.load_library_tables([mm10tsv])
    expected = ['forward', 'reverse', 'unstranded',
                'forward', 'reverse', 'unstranded']
    for strand, (library_id, row) in zip(expected, mm10.iterrows()):
        self.assertEqual(strand, row.stranded)
def test_genome_name_from_library_series(self):
    mm10tsv = resource_filename(__name__, 'library-mm10-se.tsv')
    mm10 = models.load_library_tables([mm10tsv])
    self.assertEqual(models.genome_name_from_library(mm10.loc['12304']),
                     'mm10-M4-female')
    self.assertEqual(models.genome_name_from_library(mm10.loc['12309']),
                     'mm10-M4-male')
def test_load_all_distribution(self):
    mm10tsv = resource_filename(__name__, 'library-mm10-se.tsv')
    mm10 = models.load_library_tables([mm10tsv])
    distribution = models.load_all_distribution(mm10)
    self.assertEqual(distribution.shape, (1, 3))
    self.assertEqual(distribution.index[0], '12304')
def test_reference_prefix(self): spurtsv = resource_filename(__name__, "library-spur-se.tsv") spur = models.load_library_tables([spurtsv]) self.assertEqual(make_dag.get_reference_prefix(spur, "12304"), "scaffold") self.assertEqual(make_dag.get_reference_prefix(spur, "12307"), "chr")
def setUp(self):
    self.exp_tsv = resource_filename(__name__, 'experiments-mm10.tsv')
    self.lib_tsv = resource_filename(__name__, 'library-mm10-se.tsv')
    self.libraries = models.load_library_tables([self.lib_tsv])
    self.experiments = models.load_experiments([self.exp_tsv])
def test_reference_prefix_missing(self):
    mm10tsv = resource_filename(__name__, "library-mm10-se.tsv")
    mm10 = models.load_library_tables([mm10tsv])
    self.assertEqual(make_dag.get_reference_prefix(mm10, "12304"), "chr")