def test_load_experiment(self):
    """Loading one experiment table yields all its records; loading two
    tables concatenates them."""
    mm10tsv = resource_filename(__name__, "experiments-mm10.tsv")
    hg38tsv = resource_filename(__name__, "experiments-hg38.tsv")
    mm10 = models.load_experiments([mm10tsv])
    # BUG FIX: count_valid_records expects the tsv filename; the original
    # passed the loaded DataFrame (the sibling test calls
    # count_valid_records(mm10tsv)).
    self.assertEqual(len(mm10), count_valid_records(mm10tsv))
    hg38 = models.load_experiments([hg38tsv])
    both = models.load_experiments([mm10tsv, hg38tsv])
    # Loading both tables together should preserve every record from each.
    self.assertEqual(len(mm10) + len(hg38), len(both))
def test_load_experiment(self):
    """Experiment tables should load with their replicate lists parsed."""
    mm10_path = resource_filename(__name__, 'experiments-mm10.tsv')
    hg38_path = resource_filename(__name__, 'experiments-hg38.tsv')

    mm10 = models.load_experiments([mm10_path])
    # The parsed table carries a replicates column with one list per row.
    self.assertIn('replicates', mm10.columns)
    self.assertEqual(len(mm10), count_valid_records(mm10_path))

    hg38 = models.load_experiments([hg38_path])
    combined = models.load_experiments([mm10_path, hg38_path])
    # Loading two tables at once concatenates their records.
    self.assertEqual(len(mm10) + len(hg38), len(combined))

    # Replicate ids are parsed into a list of strings.
    self.assertEqual(mm10.loc['expm']['replicates'], ['12307', '12308'])
def test_load_experiments_analysis_root(self):
    """analysis_dir should resolve the same whether it is derived from the
    table's own location or from an explicit analysis_root."""
    with TemporaryDirectory() as analysis_dir:
        with chdir(analysis_dir):
            source_tsv = resource_filename(__name__, 'experiments-mm10.tsv')
            copied_tsv = os.path.join(analysis_dir, 'experiments-mm10.tsv')
            shutil.copy(source_tsv, copied_tsv)
            root = os.path.dirname(source_tsv)
            expected = models.load_experiments([source_tsv])
            relocated = models.load_experiments([copied_tsv],
                                                analysis_root=root)
            # Every row's analysis_dir must match between the two loads.
            for key in expected['analysis_dir'].index:
                self.assertEqual(expected['analysis_dir'][key],
                                 relocated['analysis_dir'][key])
def main(cmdline=None):
    """Build quantification caches for each experiment.

    Experiments come either from a table (--experiments) or from an
    explicit --experiment-name plus replicate list.
    """
    parser = make_parser()
    args = parser.parse_args(cmdline)
    sep = get_seperator(args.sep)
    if args.experiments:
        experiments = models.load_experiments([args.experiments], sep=sep)
    else:
        # Without a table we need both a name and a replicate list.
        if args.experiment_name is None:
            parser.error(
                "Please provide an experiment name. (Used as filename)")
        if len(args.replicates) == 0:
            parser.error(
                "Please provide list of replicates or experiment table")
        experiments = {args.experiment_name: args.replicates}

    if args.library is None:
        parser.error("Please provide library information tables")

    for experiment_name in experiments:
        replicates = experiments[experiment_name]
        # BUG FIX: logging.info takes a %-style format string; the original
        # passed extra positional args print-style, which makes the logging
        # module raise a formatting error when the record is emitted.
        logging.info('Processing: %s %s',
                     experiment_name, ','.join(replicates))
        create_quantification_cache(
            args.library, experiment_name, replicates,
            args.quantification, sep)
def main(cmdline=None):
    """Stream every read from one experiment's replicate BAMs as SAM text,
    emitting the header only once."""
    parser = ArgumentParser()
    parser.add_argument('-n', '--experiment-name', required=True,
                        help='Experiment name to select')
    add_metadata_arguments(parser)
    add_debug_arguments(parser)
    args = parser.parse_args(cmdline)
    configure_logging(args)

    libraries = load_library_tables(args.libraries)
    experiments = load_experiments(args.experiments)
    replicates = experiments.loc[args.experiment_name, 'replicates']

    emitted_header = False
    total = len(replicates)
    for index, (library_id, library) in enumerate(
            libraries.loc[replicates].iterrows()):
        filename = find_library_bam_file(library)
        LOGGER.info(' Reading %s %d/%d', filename, index + 1, total)
        mode = get_mode(filename, 'r')
        with pysam.AlignmentFile(filename, mode) as alignment:
            # Print the SAM header once, before the first file's reads.
            if not emitted_header:
                print(str(alignment.header))
                emitted_header = True
            for read in alignment:
                print(read.to_string())
def main(cmdline=None):
    """Build quantification caches for every configured experiment."""
    parser = make_parser()
    args = parser.parse_args(cmdline)
    # Logging verbosity: --debug wins over --verbose; default warnings only.
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARN)
    sep = get_seperator(args.sep)
    if args.experiments:
        experiments = models.load_experiments(args.experiments, sep=sep,
                                              analysis_root=args.root)
    else:
        # Without an experiment table we need both a name and replicates.
        if args.experiment_name is None:
            parser.error(
                "Please provide an experiment name. (Used as filename)")
        if len(args.replicates) == 0:
            parser.error(
                "Please provide list of replicates or experiment table")
        experiments = {args.experiment_name: args.replicates}
        # NOTE(review): this branch builds a plain dict, but the loop below
        # calls experiments.iterrows(), which only the DataFrame returned
        # by load_experiments provides — confirm this path is exercised.
    if args.libraries is None:
        parser.error("Please provide library information tables")
    libraries = models.load_library_tables(args.libraries, sep=sep)
    for i, experiment in experiments.iterrows():
        logging.info('Processing: %s', experiment.name)
        create_quantification_cache(experiment, libraries,
                                    args.quantification, args.model, sep)
def main(cmdline=None):
    """Render a coverage summary or plot, selecting the report style from
    the mutually exclusive command-line flags."""
    parser = make_parser()
    args = parser.parse_args(cmdline)
    logging.basicConfig(
        level=logging.INFO if args.verbose else logging.ERROR)

    experiments = models.load_experiments(args.experiments)
    libraries = models.load_library_tables(args.libraries)
    coverage = models.load_all_coverage(libraries)

    # Pick the report function; the per-library plots are the default.
    if args.all_experiments:
        report = make_combined_median_normalized_summary
    elif args.experiment_median_summary:
        report = make_per_experiment_median_normalized_summary
    elif args.by_experiment:
        report = make_by_experiment_median_summary
    elif args.combined_median_summary:
        report = make_combined_experiment_median_summary
    else:
        report = make_experiment_by_library_coverage_plots
    report(experiments, coverage, args.output_format, args.bare)
def test_create_quantification_cache_tempdir(self):
    """create_quantification_cache should write its score and
    quantification files under the experiment's analysis_root."""
    with tempfile.TemporaryDirectory() as tempdir:
        temp_experiments = models.load_experiments(
            [self.exp_tsv], analysis_root=tempdir)
        quant = 'FPKM'
        score_filename = models.make_correlation_filename(
            temp_experiments.iloc[0])
        quant_filename = models.make_quantification_filename(
            temp_experiments.iloc[0], quant, 'gene')
        # Derived filenames must live inside the temporary analysis root.
        self.assertTrue(score_filename.startswith(tempdir))
        self.assertTrue(quant_filename.startswith(tempdir))
        # CLEANUP: replaced bare asserts with unittest assertions (bare
        # asserts are stripped under python -O) and removed leftover
        # debug print() calls.
        self.assertFalse(os.path.exists(score_filename))
        self.assertFalse(os.path.exists(quant_filename))
        cache = madqc.create_quantification_cache(
            temp_experiments.iloc[0], self.libraries, quant, 'gene')
        self.assertIsInstance(cache['rafa_spearman'], pandas.DataFrame)
        # Both cache files must exist afterwards; remove them so repeated
        # runs start clean.
        self.assertTrue(os.path.exists(score_filename))
        os.remove(score_filename)
        self.assertTrue(os.path.exists(quant_filename))
        os.remove(quant_filename)
def main(cmdline=None):
    """Build and return the score-correlation plot for the experiments
    named on the command line."""
    parser = make_parser()
    args = parser.parse_args(cmdline)
    experiments = load_experiments(args.experiments)
    return ScoreCorrelationPlot(experiments)
def main(cmdline=None):
    """Write one quantification matrix file per experiment."""
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)
    sep = get_seperator(args.sep)
    experiments = models.load_experiments(args.experiments, sep=sep)
    libraries = models.load_library_tables(args.libraries, sep=sep)
    output_sep = get_seperator(args.output_format)
    output_extension = {"TAB": ".tsv", ",": ".csv"}[args.output_format]

    if args.transcriptome:
        # isoform-level quantifications
        load_quantifications = madqc.load_transcriptome_quantifications
        quantification_extension = (
            "_isoform_" + args.quantification + output_extension)
    else:
        # gene-level quantifications
        load_quantifications = madqc.load_genomic_quantifications
        quantification_extension = (
            "_gene_" + args.quantification + output_extension)

    # NOTE(review): this reads like experiments maps names to replicate
    # lists — confirm load_experiments returns a mapping here.
    for name in experiments:
        replicates = experiments[name]
        filename = name + quantification_extension
        logger.info("%s %s: %s",
                    name, args.quantification, ",".join(replicates))
        quantifications = load_quantifications(
            replicates, libraries, args.quantification)
        quantifications.to_csv(filename, sep=output_sep)
def load_asof_run17_experiments():
    """Load the as-of-run17 experiment tables with sanitized replicate names.

    Returns the experiments DataFrame with every replicate name passed
    through sanitize_library_name.
    """
    experiment_files = list(split_files_text(ASOF_RUN17_experiment_files))
    experiments = models.load_experiments(experiment_files)
    # BUG FIX: the original assigned to the row objects yielded by
    # iterrows(); those are copies, so the sanitized names never reached
    # the DataFrame (documented pandas behavior). Assign through the
    # column instead, mirroring the pattern used elsewhere in this code.
    experiments['replicates'] = experiments['replicates'].apply(
        lambda names: [sanitize_library_name(x) for x in names])
    return experiments
def main(cmdline=None):
    """Construct the mean gene coverage plot for the selected experiment."""
    parser = make_parser()
    args = parser.parse_args(cmdline)
    experiment_table = load_experiments(args.experiments)
    library_table = load_library_tables(args.libraries)
    plot = MeanGeneCoverage(experiment_table, library_table)
    plot.use_experiment(args.use_experiment)
    return plot
def main(cmdline=None):
    """Annotate libraries from a driver spreadsheet with their submission
    tranche and write the collected info to tranche.csv."""
    parser = ArgumentParser()
    parser.add_argument('-s', '--sheet', default=0, help='Sheet to use')
    parser.add_argument('--header', default=None, help="header row")
    parser.add_argument('filename', nargs=1, help='spreadsheet to look at')
    args = parser.parse_args(cmdline)
    # argparse delivers --header as a string; the reader wants int or None.
    header = int(args.header) if args.header is not None else None
    book = ODFReader(args.filename[0])
    data = book.parse(args.sheet, header=header)
    # Authenticated ENCODE portal connection (credentials from ~/.netrc).
    server = ENCODED('www.encodeproject.org')
    server.load_netrc()
    first_experiments = models.load_experiments(
        to_files(paper_433_experiment_files))
    all_experiments = models.load_experiments(
        to_files(ASOF_RUN17_experiment_files))
    first_libraries = set(parse_replicates(first_experiments['replicates']))
    all_libraries = set(parse_replicates(all_experiments['replicates']))
    #print(first_libraries)
    #print(all_libraries)
    results = []
    # First spreadsheet column is assumed to hold library ids — TODO
    # confirm against the driver spreadsheet layout.
    for i, library_id in enumerate(data[data.columns[0]]):
        # Tranche 1: first (paper 433) set; tranche 2: remainder of run17;
        # 'C' marks libraries outside both experiment lists.
        if library_id in first_libraries:
            tranche = 1
        elif library_id in all_libraries:
            tranche = 2
        else:
            tranche = 'C'
        row = find_library_info(server, library_id)
        row['tranche'] = tranche
        results.append(row)
        # Progress dots. NOTE(review): `(i + 1) % 10` is truthy for 9 of
        # every 10 rows; if one dot per 10 rows was intended this wants
        # `% 10 == 0` — confirm before changing.
        if (i + 1) % 10:
            print('.', end='', flush=True)
    df = pandas.DataFrame(results)
    df.to_csv('tranche.csv', index=False)
def main(cmdline=None):
    """Build a distribution plot, optionally limited to one experiment.

    Returns None when the requested experiment is not in the table.
    """
    parser = make_parser()
    args = parser.parse_args(cmdline)
    experiments = load_experiments(args.experiments)
    libraries = load_library_tables(args.libraries)
    if args.use_experiment:
        try:
            # Double brackets keep the result a one-row DataFrame.
            experiments = experiments.loc[[args.use_experiment]]
        except KeyError:
            available = ', '.join(list(experiments.index))
            print('{} was not found in {}'.format(args.use_experiment,
                                                  available))
            return None
    return DistributionPlot(experiments, libraries)
def main(cmdline=None):
    """Dispatch one of several track/coverage maintenance operations
    selected by --mode."""
    parser = ArgumentParser()
    parser.add_argument('-o', '--output', help='output directory')
    parser.add_argument('--mode', default=None, choices=[
        'customtrack',
        'trackhub',
        'merge_paper_wiggles',
        'paper_median_coverage',
        'check_bedgraphs',
        'localize_tsvs',
        'paper_as_single_experiment_tsv',
        'paper_as_cluster_experiment_tsv',
    ])
    args = parser.parse_args(cmdline)
    # Experiment/library table paths are embedded as whitespace-separated
    # text constants; expand ~ in each entry.
    experiment_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_experiment_files.split()
    ]
    library_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_library_files.split()
    ]
    experiments = models.load_experiments(experiment_files)
    libraries = models.load_library_tables(library_files)
    # Cluster membership list restricts several modes to the paper's cells.
    to_include = read_peng_20180710_cluster_memberships()
    #print('{} cells to include'.format(len(to_include)))
    if args.mode == 'customtrack':
        make_custom_tracks()
    elif args.mode == 'trackhub':
        make_trackhub()
    elif args.mode == 'merge_paper_wiggles':
        merge_paper_wiggles(to_include, libraries)
    elif args.mode == 'paper_median_coverage':
        make_paper_median_coverage(to_include, libraries, args.output)
    elif args.mode == 'check_bedgraphs':
        check_bedgraphs(to_include, libraries)
    elif args.mode == 'localize_tsvs':
        localize_tsvs(experiments, libraries, args.output)
    elif args.mode == 'paper_as_single_experiment_tsv':
        paper920_as_single_experiment_tsv(to_include, args.output)
    elif args.mode == 'paper_as_cluster_experiment_tsv':
        paper920_as_cluster_experiment_tsv(to_include, args.output)
    else:
        parser.error('Did you want to pick an operation mode?')
def main(cmdline=None):
    """Write one quantification matrix file per experiment, optionally
    annotated with gene names from a GTF cache."""
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)
    sep = get_seperator(args.sep)
    experiments = models.load_experiments(args.experiments, sep=sep)
    libraries = models.load_library_tables(args.libraries, sep=sep)
    output_sep = get_seperator(args.output_format)
    # Map the output-format flag to the matching file extension.
    output_extension = {
        'TAB': '.tsv',
        ',': '.csv',
    }[args.output_format]
    if args.add_names:
        if args.gtf_cache is None:
            parser.error('GTF-cache is needed to add names to the quantification file')
        else:
            logger.info('Loading GTF Cache %s', args.gtf_cache)
            annotation = models.load_gtf_cache(args.gtf_cache)
    else:
        annotation = None
    if args.transcriptome:
        # isoforms
        load_quantifications = madqc.load_transcriptome_quantifications
        lookup_ids = models.lookup_gene_name_by_transcript_id
        quantification_extension = '_isoform_' + args.quantification + output_extension
    else:
        # genes
        load_quantifications = madqc.load_genomic_quantifications
        lookup_ids = models.lookup_gene_name_by_gene_id
        quantification_extension = '_gene_' + args.quantification + output_extension
    # NOTE(review): this loop reads like experiments maps names to
    # replicate lists — confirm load_experiments returns a mapping here
    # (iterating a DataFrame would yield column names instead).
    for name in experiments:
        filename = name + quantification_extension
        replicates = experiments[name]
        logger.info("%s %s: %s", name, args.quantification,
                    ','.join(replicates))
        quantifications = load_quantifications(
            replicates, libraries, args.quantification)
        if annotation is not None:
            quantifications = lookup_ids(annotation, quantifications)
        quantifications.to_csv(filename, sep=output_sep)
def test_make_quantification_filename_other(self):
    """Does make_quantification_filename work with an alternate
    analysis_root
    """
    expected = {
        'genome': 'expf_FPKM.h5',
        'transcriptome': 'expf_transcriptome_FPKM.h5',
    }
    for reference_type, basename in expected.items():
        tsv = resource_filename(__name__, 'experiments-mm10.tsv')
        root = '/tmp'
        experiments = models.load_experiments([tsv], analysis_root=root)
        filename = models.make_quantification_filename(
            experiments.iloc[0],
            reference_type=reference_type,
        )
        # The generated filename must land under the alternate root.
        self.assertEqual(filename, os.path.join(root, basename))
def test_make_correlation_filename_default(self):
    """Does make_correlation_filename work with default analysis_root
    """
    expected = {
        'genome': 'expf_correlation.h5',
        'transcriptome': 'expf_transcriptome_correlation.h5',
    }
    for reference_type, basename in expected.items():
        tsv = resource_filename(__name__, 'experiments-mm10.tsv')
        # With no analysis_root the table's own directory is the default.
        default_root, _ = os.path.split(tsv)
        experiments = models.load_experiments([tsv])
        filename = models.make_correlation_filename(
            experiments.iloc[0],
            reference_type=reference_type,
        )
        self.assertEqual(filename, os.path.join(default_root, basename))
def load_filtered_transcripts():
    """Build a combined isoform FPKM matrix for the run17 experiments,
    restricted to the cells in the as-of-run17 inclusion list."""
    sep = '\t'
    cache_file = os.path.expanduser(
        '~sau/genomes/mm10-M4-male/mm10-M4-male.h5')
    # Annotation lookup is disabled; the loader runs without gene names.
    #annotation = models.load_gtf_cache(cache_file)
    annotation = None
    loader = IsoformRsemLoader('FPKM', annotation)
    index_name = 'transcript_id'
    # Gene-level alternative, currently disabled:
    # loader = GeneRsemLoader(args.quantification, annotation)
    #index_name = 'gene_id'
    # First entry of the inclusion list is skipped — presumably a header
    # or placeholder; TODO confirm.
    to_include = generate_to_include_asof_run17()[1:]
    experiment_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_experiment_files.split()
    ]
    library_files = [
        os.path.expanduser(x.strip())
        for x in ASOF_RUN17_library_files.split()
    ]
    quantifications = []
    # Experiment and library tables are parallel lists; load each pair and
    # collect one quantification matrix per experiment row.
    for e, l in zip(experiment_files, library_files):
        print('loading', e)
        experiments = models.load_experiments([e], sep=sep)
        libraries = models.load_library_tables([l], sep=sep)
        for i, experiment in experiments.iterrows():
            print(experiment)
            quantification = loader.load(experiment, libraries)
            # Normalize column names so they match the inclusion list.
            quantification.columns = list(
                filter_columns(quantification.columns))
            quantifications.append(quantification)
    # Concatenate per-experiment matrices side by side (columns = cells).
    sheets = pandas.concat(quantifications, axis=1)
    print('all', sheets.shape)
    # sheets.to_csv('C1_mouse_combined_transcript_asof_run17_unfiltred.tsv', sep='\t')
    # was crashing because of _mm10 suffix
    filtered = sheets[to_include]
    print('filtered', filtered.shape)
    return filtered
def main(cmdline=None):
    """Link genome BAM files for the as-of-run17 libraries into the
    requested output directory."""
    parser = ArgumentParser()
    parser.add_argument('-o', '--output-dir')
    args = parser.parse_args(cmdline)

    experiment_files = [os.path.expanduser(name.strip())
                        for name in ASOF_RUN17_experiment_files.split()]
    library_files = [os.path.expanduser(name.strip())
                     for name in ASOF_RUN17_library_files.split()]
    experiments = load_experiments(experiment_files)
    libraries = load_library_tables(library_files)
    link_genome_bams(libraries, args.output_dir)
def main(cmdline=None):
    """Prepare a fastq submission: pick the experiment tranche, locate the
    fastqs, and emit alias/barcode/metadata/merge files named after
    --name."""
    parser = ArgumentParser()
    parser.add_argument('--first-tranche', default=False, action='store_true',
                        help='Use just the first tranche as experiment list')
    parser.add_argument('--name', required=True, help='submission name')
    parser.add_argument('-s', '--sheet', default=0, help='Sheet to use')
    parser.add_argument('--header', default=None, help="header row")
    parser.add_argument('filename', nargs=1, help='driver spreadsheet')
    args = parser.parse_args(cmdline)
    root_fastq_url = 'http://jumpgate.caltech.edu/runfolders/volvox02/'
    # Path to the desplit_fastq helper script used by the condor jobs.
    desplit = os.path.expanduser('~/proj/htsworkflow/htsworkflow/pipelines/desplit_fastq.py')
    # argparse delivers --header as a string; the reader wants int or None.
    header = int(args.header) if args.header is not None else None
    data = read_spreadsheet(args.filename[0], args.sheet, header)
    print(data.shape)
    if args.first_tranche:
        experiment_file_list = paper_433_experiment_files.split('\n')
    else:
        experiment_file_list = ASOF_RUN17_experiment_files.split('\n')
    experiment_files = [
        os.path.expanduser(x.strip()) for x in experiment_file_list]
    experiments = load_experiments(experiment_files)
    # Strip genome/cleanup suffixes from replicate names so they match the
    # ids used in the driver spreadsheet.
    experiments['replicates'] = experiments['replicates'].apply(lambda l: [x.replace('_mm10', '').replace('_clean', '') for x in l])
    current_experiments = find_experiments_to_submit(experiments, data)
    aliases_tsv = '{}-aliases.tsv'.format(args.name)
    make_library_aliases(current_experiments, aliases_tsv)
    submission_fastqs_tsv = '{}-fastqs.tsv'.format(args.name)
    # Finding fastqs is slow; reuse an existing fastq list when present,
    # then reload from disk either way so both paths share one code path.
    if not os.path.exists(submission_fastqs_tsv):
        fastq_urls = find_all_fastqs(root_fastq_url, current_experiments,
                                     submission_fastqs_tsv)
    fastq_urls = pandas.read_csv(submission_fastqs_tsv, sep='\t')
    barcodes_tsv = '{}-barcodes.tsv'.format(args.name)
    make_library_barcodes(fastq_urls, barcodes_tsv)
    metadata_tsv = '{}-flowcell-details.tsv'.format(args.name)
    metadata = make_metadata(fastq_urls, root_fastq_url, metadata_tsv)
    merge_file = '{}-merge-fastqs.condor'.format(args.name)
    make_desplit_condor(fastq_urls, metadata, desplit, root_fastq_url,
                        merge_file)
def main(cmdline=None):
    """Build the genes-detected plot; returns the plot object, or None when
    the requested experiment is missing."""
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)
    experiments = load_experiments(args.experiments)
    libraries = load_library_tables(args.libraries)
    if args.use_experiment:
        try:
            # Double brackets keep the result a one-row DataFrame.
            experiments = experiments.loc[[args.use_experiment]]
        except KeyError:
            logger.error('{} was not found in {}'.format(
                args.use_experiment, ', '.join(list(experiments.index))))
            return None
    if len(args.gene_type_filter) > 0:
        logger.info('Limiting to the following gene types {}'.format(','.join(
            args.gene_type_filter)))
    else:
        logger.info('Using all gene types')
    # ids will be None if args.gene_list_filter is None
    ids = load_gene_id_list(args.gene_list_filter)
    plot = GenesDetectedPlot(
        experiments,
        libraries,
        args.genome_dir,
        args.quantification,
        gene_type_filter=args.gene_type_filter,
        gene_list_filter=ids,
    )
    # Only render/save when invoked as a script; importing callers just
    # get the plot object back.
    if __name__ == '__main__':
        curdoc().add_root(plot.static_layout())
        save(curdoc(), args.output, title=plot.title)
    return plot
def main(cmdline=None):
    """Build and save an expression matrix for each experiment and each
    requested quantification."""
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)
    sep = get_seperator(args.sep)
    experiments = models.load_experiments(args.experiments, sep=sep)
    libraries = models.load_library_tables(args.libraries, sep=sep)

    gtf_cache = None
    if args.add_names:
        if args.genome_dir is None:
            parser.error(
                'genome-dir is needed to add names to the quantification file')
        else:
            gtf_cache = GTFCache(libraries, args.genome_dir)

    # Default to FPKM when no quantification was requested.
    if len(args.quantification) > 0:
        quantification_list = args.quantification
    else:
        quantification_list = ['FPKM']

    # Choose isoform- or gene-level loading.
    loader_class = IsoformRsemLoader if args.transcriptome else GeneRsemLoader

    for quantification in quantification_list:
        logger.info('Building expression matrix for %s', quantification)
        for _, experiment in experiments.iterrows():
            loader = loader_class(quantification, gtf_cache)
            matrix = loader.load(experiment, libraries)
            loader.save(matrix, args.output_format)
def main(cmdline=None):
    """Load STAR quantifications for each experiment and save them."""
    parser = make_parser()
    args = parser.parse_args(cmdline)
    configure_logging(args)
    sep = get_seperator(args.sep)
    experiments = models.load_experiments(args.experiments, sep=sep)
    libraries = models.load_library_tables(args.libraries, sep=sep)

    annotation = None
    if args.add_names:
        if args.gtf_cache is None:
            parser.error('GTF-cache is needed to add names to the quantification file')
        else:
            logger.info('Loading GTF Cache %s', args.gtf_cache)
            annotation = models.load_gtf_cache(args.gtf_cache)

    loader = StarLoader(args.strand, annotation)
    for _, experiment in experiments.iterrows():
        quantification = loader.load(experiment, libraries)
        loader.save(quantification, args.output_format)
def setUp(self):
    """Load the shared mm10 library and experiment fixture tables."""
    self.lib_tsv = resource_filename(__name__, 'library-mm10-se.tsv')
    self.exp_tsv = resource_filename(__name__, 'experiments-mm10.tsv')
    self.libraries = models.load_library_tables([self.lib_tsv])
    self.experiments = models.load_experiments([self.exp_tsv])
def test_load_numeric_experiment(self):
    """Experiment names that look numeric should still load as strings."""
    table = resource_filename(__name__, 'experiments-numeric.tsv')
    loaded = models.load_experiments([table])
    for name in loaded.index:
        self.assertIsInstance(name, str)
def __init__(self, experiments, sep='\t', analysis_root=None):
    """Initialize from experiment table filenames.

    experiments: list of experiment table filenames to load
    sep: column separator used by the tables
    analysis_root: base directory for resolving analysis paths
    """
    self.name = None
    # BUG FIX: analysis_root was accepted but silently dropped; forward it
    # so analysis directories resolve against the requested root
    # (load_experiments supports this keyword elsewhere in the project).
    self.experiments = load_experiments(experiments, sep=sep,
                                        analysis_root=analysis_root)
    self.quantification_name = None
    self.quantification = None