def test_single_sample(self, tmpdir, path_proj_conf_file, which_sample_index):
    """ Single Sample is perfectly valid for Project and sheet. """

    # Pull out the values for the current sample.
    values = DATA[which_sample_index]

    # Write the annotations.
    anns_path = os.path.join(tmpdir.strpath, NAME_ANNOTATIONS_FILE)
    with open(anns_path, 'w') as anns_file:
        anns_file.write("{}\n".format(",".join(COLUMNS)))
        anns_file.write("{}\n".format(",".join([str(v) for v in values])))

    # Build the sheet.
    p = Project(path_proj_conf_file)
    sheet = p.build_sheet()

    # It should be a single-row DataFrame.
    assert isinstance(sheet, pd.DataFrame)
    assert 1 == len(sheet)
    assert 1 == p.num_samples

    # There will be additional values added from the Project,
    # but the core data values will have remained the same.
    sample = list(p.samples)[0]
    for attr, exp_val in zip(COLUMNS, values):
        obs_val = getattr(sample, attr)
        try:
            assert exp_val == obs_val
        except AssertionError as e:
            try:
                assert exp_val == int(obs_val)
            except AssertionError:
                raise e
def test_nonexistent_env_settings_file(
        self, tmpdir, minimal_project_conf_path,
        env_config_filepath, envconf_filename):
    """ Project doesn't require that the default environment settings file exist. """

    # Create name of a nonexistent file based on the true default file.
    envconf_dirpath, _ = os.path.split(env_config_filepath)
    misnamed_envconf = os.path.join(envconf_dirpath, envconf_filename)

    # Create and add log message handler for expected errors.
    logfile = tmpdir.join("project-error-messages.log").strpath
    expected_error_message_handler = logging.FileHandler(logfile, mode='w')
    expected_error_message_handler.setLevel(logging.ERROR)
    looper.models._LOGGER.handlers.append(expected_error_message_handler)

    # Create Project, expecting to generate error messages.
    project = Project(
        minimal_project_conf_path, default_compute=misnamed_envconf)

    # Remove the temporary message handler.
    del looper.models._LOGGER.handlers[-1]

    # Ensure nulls for all relevant Project attributes.
    self._assert_null_compute_environment(project)

    # We should have two error messages, describing the exception caught
    # during default environment parsing and that it couldn't be set.
    with open(logfile, 'r') as messages:
        exception_messages = messages.readlines()
    try:
        assert 2 == len(exception_messages)
    except AssertionError:
        print("Exception messages: {}".format(exception_messages))
        raise
def test_minimal_configuration_name_inference(
        self, tmpdir, minimal_project_conf_path, env_config_filepath):
    """ Project infers name from where its configuration lives. """
    project = Project(
        minimal_project_conf_path, default_compute=env_config_filepath)
    _, expected_name = os.path.split(tmpdir.strpath)
    assert expected_name == project.name
def test_no_default_env_settings_provided(
        self, minimal_project_conf_path, explicit_null, compute_env_attname):
    """ Project doesn't require default environment settings. """

    kwargs = {"default_compute": None} if explicit_null else {}
    project = Project(minimal_project_conf_path, **kwargs)

    observed_attribute = getattr(project, compute_env_attname)
    expected_attribute = \
        self.default_compute_settings(project)[compute_env_attname]

    if compute_env_attname == "compute":
        # 'compute' refers to a section in the default environment
        # settings file and also to a Project attribute. A Project
        # instance selects just one of the options in the 'compute'
        # section of the file as the value for its 'compute' attribute.
        expected_attribute = expected_attribute["default"]
        observed_attribute = _compute_paths_to_names(observed_attribute)
    elif compute_env_attname == "environment":
        envs_with_reduced_filepaths = \
            _env_paths_to_names(observed_attribute["compute"])
        observed_attribute = AttributeDict(
            {"compute": envs_with_reduced_filepaths})

    assert expected_attribute == observed_attribute
def project(request, tmpdir, env_config_filepath):
    """ Provide requesting test case with a basic Project instance. """

    # Write just the sample names as the annotations.
    annotations_filename = "anns-fill.csv"
    anns_path = tmpdir.join(annotations_filename).strpath
    num_samples = request.getfixturevalue("num_samples")
    df = pd.DataFrame(OrderedDict([
        ("sample_name", ["sample{}".format(i) for i in range(num_samples)]),
        ("data", range(num_samples))]))
    with open(anns_path, 'w') as anns_file:
        df.to_csv(anns_file, sep="\t", index=False)

    # Create the Project config data.
    config_data = {
        "metadata": {SAMPLE_ANNOTATIONS_KEY: annotations_filename}}
    if request.getfixturevalue(request.cls.CONFIG_DATA_PATHS_HOOK):
        config_data["paths"] = {}
        paths_dest = config_data["paths"]
    else:
        paths_dest = config_data["metadata"]

    # Add the paths data to the Project config.
    for path_name, path in PATH_BY_TYPE.items():
        paths_dest[path_name] = os.path.join(tmpdir.strpath, path)

    # Write the Project config file.
    conf_path = tmpdir.join("proj-conf.yaml").strpath
    with open(conf_path, 'w') as conf_file:
        yaml.safe_dump(config_data, conf_file)

    return Project(conf_path, default_compute=env_config_filepath)
def main():
    args = parse_arguments()

    # Start project object
    prj = Project(args.project_config_file)

    if "trackhubs" not in prj:
        raise ValueError(
            "Project configuration does not have a trackhub section.")
    if "trackhub_dir" not in prj.trackhubs:
        raise ValueError(
            "Project trackhub configuration does not have a "
            "trackhub_dir attribute.")

    # Setup paths and hub files
    bigwig_dir = os.path.join(prj.trackhubs.trackhub_dir)
    track_hub = os.path.join(bigwig_dir, "hub.txt")
    genomes_hub = os.path.join(bigwig_dir, "genomes.txt")
    with open(genomes_hub, 'w') as handle:
        handle.write("")

    # Setup attributes
    proj_name = prj['project_name'] if "project_name" in prj \
        else os.path.basename(prj['paths']['output_dir'])
    proj_desc = prj['project_description'] \
        if "project_description" in prj else proj_name
    user_email = prj['email'] if "email" in prj else ""

    # In the future there will be more actions than this
    make_ucsc_trackhub(
        args, prj, track_hub, bigwig_dir, genomes_hub,
        proj_name, proj_desc, user_email)

    track_file = os.path.join(bigwig_dir, "igv", "index.html")
    track_url = os.path.join(prj['trackhubs']['url'], "igv")
    make_igv_tracklink(prj, track_file, track_url)
def test_sample_name_availability(
        self, path_project_conf, path_sample_anns, lazy):
    """ Sample names always available on Project. """
    with open(path_sample_anns, 'r') as anns_file:
        expected_sample_names = \
            [l.split(",")[0] for l in anns_file.readlines()[1:] if l]
    p = Project(path_project_conf, defer_sample_construction=lazy)
    assert expected_sample_names == list(p.sample_names)
def test_samples_are_generic(path_anns_file, path_proj_conf_file):
    """ Regardless of protocol, Samples for sheet are generic. """
    # The annotations filepath fixture also writes that file, so it's
    # needed here even though its return value isn't used locally.
    p = Project(path_proj_conf_file)
    assert len(SAMPLE_NAMES) == p.num_samples
    samples = list(p.samples)
    assert p.num_samples == len(samples)
    assert all([Sample is type(s) for s in samples])
def project(self, tmpdir, minimal_project_conf_path):
    """ Create a Project with base/default environment. """
    # Write base/default environment data to disk.
    env_config_filename = "env-conf.yaml"
    env_config_filepath = tmpdir.join(env_config_filename).strpath
    with open(env_config_filepath, 'w') as env_conf:
        yaml.safe_dump(self.ENVIRONMENT_CONFIG_DATA, env_conf)
    return Project(
        minimal_project_conf_path, default_compute=env_config_filepath)
def test_multiple_samples(
        self, protocols, path_anns_file, path_proj_conf_file):
    """ Project also handles multiple Samples fine. """

    p = Project(path_proj_conf_file)

    # Total sample count is constant.
    assert len(SAMPLE_NAMES) == sum(1 for _ in p.samples)

    # But the sheet permits filtering to specific protocol(s).
    exp_num_samples = len(SAMPLE_NAMES) if not protocols else \
        sum(sum(1 for lib in LIBRARIES if lib == proto) for proto in protocols)
    sheet = p.build_sheet(*protocols)
    assert exp_num_samples == len(sheet)

    if protocols:
        fuzzy_protos = {alpha_cased(proto) for proto in protocols}
        for _, sample_data in sheet.iterrows():
            assert alpha_cased(sample_data.library) in fuzzy_protos
def interactive(
        prj_lines=PROJECT_CONFIG_LINES,
        iface_lines=PIPELINE_INTERFACE_CONFIG_LINES,
        merge_table_lines=MERGE_TABLE_LINES,
        annotation_lines=SAMPLE_ANNOTATION_LINES,
        project_kwargs=None, logger_kwargs=None):
    """
    Create Project and PipelineInterface instances from default or given data.

    This is intended to provide easy access to instances of fundamental looper
    objects for interactive, test-authorship-motivated work in an iPython
    interpreter or Notebook. Test authorship is simplified if we provide
    easy access to viable instances of these objects.

    :param Iterable[str] prj_lines: project config lines
    :param Iterable[str] iface_lines: pipeline interface config lines
    :param Iterable[str] merge_table_lines: lines for a merge table file
    :param Iterable[str] annotation_lines: lines for a sample annotations file
    :param dict project_kwargs: keyword arguments for Project constructor
    :param dict logger_kwargs: keyword arguments for logging configuration
    :return (Project, PipelineInterface): one Project and one PipelineInterface
    """

    # Establish logging for interactive session.
    looper_logger_kwargs = {"level": "DEBUG"}
    looper_logger_kwargs.update(logger_kwargs or {})
    setup_looper_logger(**looper_logger_kwargs)

    # TODO: don't work with tempfiles once ctors tolerate Iterable.
    dirpath = tempfile.mkdtemp()
    path_conf_file = _write_temp(
        prj_lines, dirpath=dirpath, fname=P_CONFIG_FILENAME)
    path_iface_file = _write_temp(
        iface_lines, dirpath=dirpath, fname="pipeline_interface.yaml")
    path_merge_table_file = _write_temp(
        merge_table_lines, dirpath=dirpath, fname=MERGE_TABLE_FILENAME)
    path_sample_annotation_file = _write_temp(
        annotation_lines, dirpath=dirpath, fname=ANNOTATIONS_FILENAME)

    prj = Project(path_conf_file, **(project_kwargs or {}))
    iface = PipelineInterface(path_iface_file)

    for path in [path_conf_file, path_iface_file,
                 path_merge_table_file, path_sample_annotation_file]:
        os.unlink(path)

    return prj, iface
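# A minimal usage sketch, assuming an interactive iPython session in which
# this module's names are importable; the attribute access shown is limited
# to Project API already exercised elsewhere in these tests:
#
#     >>> prj, iface = interactive()
#     >>> prj.num_samples
#     >>> sheet = prj.build_sheet()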
def observed_argstring_elements(
        self, confdata, pipeline, confpath, envpath):
    """
    Write config, build project, and validate argstring for pipeline.

    :param dict confdata: project configuration data
    :param str pipeline: name of pipeline for which to build argstring
    :param str confpath: where to write project config file
    :param str envpath: pointer to default environment file
    :return Iterable[str]: argstring components
    """
    conf_file_path = _write_project_config(confdata, dirpath=confpath)
    # Subvert requirement for sample annotations file.
    with mock.patch("looper.models.check_sheet"):
        project = Project(conf_file_path, default_compute=envpath)
    argstring = project.get_arg_string(pipeline)
    return argstring.split(" ")
def test_counting_samples_doesnt_create_samples(
        self, sample_annotation_lines, path_project_conf, path_sample_anns):
    """ User can ask about sample count without creating samples. """
    # We're not parameterized in terms of Sample creation laziness here
    # because a piece of the test's essence is Sample collection absence.
    p = Project(path_project_conf, defer_sample_construction=True)
    assert p._samples is None
    expected_sample_count = sum(1 for _ in sample_annotation_lines) - 1
    assert expected_sample_count == p.num_samples
    assert p._samples is None
def test_lacks_sample_annotations(
        self, project_config_data, env_config_filepath, tmpdir):
    """ Lack of sample annotations precludes Project construction. """
    # Remove sample annotations KV pair from config data for this test.
    del project_config_data["metadata"][SAMPLE_ANNOTATIONS_KEY]
    # Write the config and assert the expected exception for Project ctor.
    conf_path = _write_project_config(
        project_config_data, dirpath=tmpdir.strpath)
    with pytest.raises(_MissingMetadataException):
        Project(conf_path, default_compute=env_config_filepath)
def create_project(
        self, project_config_data, default_env_path, case_type, dirpath):
    """
    For a test case, determine expectations and create Project instance.

    :param dict project_config_data: the actual data to write to the
        Project configuration file
    :param str default_env_path: path to the default environment config
        file to pass to Project constructor
    :param str case_type: type of test case to execute; this determines
        how to specify the derived columns in the config file
    :param str dirpath: path in which to write config file
    :return (Iterable[str], Project): collection of names of derived
        columns to expect, along with Project instance with which to test
    """

    # Ensure valid parameterization.
    if case_type not in self.DERIVED_COLUMNS_CASE_TYPES:
        raise ValueError(
            "Unexpected derived_columns case type: '{}' (known={})".format(
                case_type, self.DERIVED_COLUMNS_CASE_TYPES))

    # Parameterization specifies expectation and explicit specification.
    expected_derived_columns = copy.copy(Project.DERIVED_COLUMNS_DEFAULT)
    if case_type == "implicit":
        # Negative control; ensure config data lacks derived columns.
        assert "derived_columns" not in project_config_data
    else:
        explicit_derived_columns = \
            copy.copy(self.ADDITIONAL_DERIVED_COLUMNS)
        expected_derived_columns.extend(self.ADDITIONAL_DERIVED_COLUMNS)
        # Determine explicit inclusion of default derived columns.
        if case_type == "intersection":
            explicit_derived_columns.extend(
                Project.DERIVED_COLUMNS_DEFAULT)
        project_config_data["derived_columns"] = explicit_derived_columns

    # Write the config and build the Project.
    conf_file_path = _write_project_config(
        project_config_data, dirpath=dirpath)
    with mock.patch("looper.models.check_sheet"):
        project = Project(conf_file_path, default_compute=default_env_path)

    return expected_derived_columns, project
def test_sample_creation_laziness(
        self, path_project_conf, path_sample_anns, lazy):
    """ Project offers control over whether to create base Sample(s). """

    p = Project(path_project_conf, defer_sample_construction=lazy)

    if lazy:
        # Samples should remain null during lazy Project construction.
        assert p._samples is None
    else:
        # Eager Project construction builds Sample objects.
        assert p._samples is not None
        with open(path_sample_anns, 'r') as anns_file:
            anns_file_lines = anns_file.readlines()
        # Sum excludes the header line.
        num_samples_expected = sum(1 for l in anns_file_lines[1:] if l)
        assert num_samples_expected == len(p._samples)
        assert all([Sample == type(s) for s in p._samples])
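# A minimal sketch of the laziness contract exercised above; the config path
# below is illustrative rather than a real fixture. With
# defer_sample_construction=True the Project reports its sample count from
# the annotations sheet without building Sample objects, whereas eager
# construction builds them up front:
#
#     >>> lazy = Project("project_config.yaml", defer_sample_construction=True)
#     >>> lazy.num_samples   # counted from the sheet; lazy._samples stays None
#     >>> eager = Project("project_config.yaml")
#     >>> len(list(eager.samples))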
def test_no_merge_table_in_config(
        self, tmpdir, spec_type, lazy, proj_conf_data, path_sample_anns):
    """ Merge table attribute remains null if config lacks merge_table. """

    metadata = proj_conf_data["metadata"]
    try:
        assert "merge_table" in metadata
    except AssertionError:
        print("Project metadata section lacks 'merge_table'")
        print("All config data: {}".format(proj_conf_data))
        print("Config metadata section: {}".format(metadata))
        raise

    if spec_type == "as_null":
        metadata["merge_table"] = None
    elif spec_type == "missing":
        del metadata["merge_table"]
    else:
        raise ValueError(
            "Unknown way to specify no merge table: {}".format(spec_type))

    path_config_file = os.path.join(tmpdir.strpath, "project_config.yaml")
    with open(path_config_file, 'w') as conf_file:
        yaml.safe_dump(proj_conf_data, conf_file)

    p = Project(path_config_file, defer_sample_construction=lazy)
    assert p.merge_table is None
sep="\t", header=None, skiprows=1, names=["gene_name", "chr", "strand", sample.name]).set_index("gene_name")[sample.name] except IOError: print("Sample {} is missing.".format(sample.name)) continue # add gene index if first: expr = pd.DataFrame(counts) first = False else: expr[sample.name] = counts return expr prj = Project(os.path.join("metadata", "config.yaml")) prj.add_sample_sheet() prj.paths.results_dir = os.path.join("results") # get guide annotation guide_annotation = pd.read_csv(os.path.join("metadata", "guide_annotation.csv")) # Gather gRNA assignment info across all samples used for experiment, rows in prj.sheet.df.groupby(['experiment']): # merge gRNA data reads = pd.DataFrame() scores = pd.DataFrame() assignment = pd.DataFrame() for sample_name in rows["sample_name"]:
"duplets_assignment_overlap.bothlog.svg"), bbox_inches="tight") plt.close("all") sns.jointplot(overlap_others, overlap_assignment, xlim=(-100, overlap_assignment.max() + 100), ylim=(-100, overlap_assignment.max() + 100), alpha=0.1) plt.savefig(os.path.join(output_dir, "duplets_assignment_overlap.lims.svg"), bbox_inches="tight") plt.close("all") # Start project, add samples prj = Project(os.path.join("metadata", "config.yaml")) # only used in older versions of looper # prj.add_sample_sheet() # get guide annotation guide_annotation = pd.read_csv(os.path.join("metadata", "guide_annotation.csv")) for sample in [s for s in prj.samples if hasattr(s, "replicate") ]: # [s for s in prj.samples if hasattr(s, "replicate")] output_dir = os.path.join(sample.paths.sample_root, "gRNA_assignment") # select gRNAs in respective sample library sel_guide_annotation = guide_annotation[guide_annotation['library'] == sample.grna_library]
def main():
    parser = ArgumentParser(
        prog="ngs_analysis_recipe", description="NGS analysis recipe.")
    parser = add_args(parser)
    args = parser.parse_args()
    # args = parser.parse_args('-t ATAC-seq metadata/project_config.yaml'.split(" "))

    # Start project
    print("Starting looper project with project configuration file: "
          "'{}'".format(args.config_file))
    prj = Project(args.config_file)
    print("Changing directory to project root directory: '{}'.".format(
        prj.metadata.output_dir))
    os.chdir(prj.metadata.output_dir)
    if args.pass_qc:
        print("Filtering out samples which didn't pass QC, as specified in "
              "the sample annotation column 'pass_qc'")
        prj._samples = [
            s for s in prj._samples
            if s.pass_qc not in ['0', 0, 'False', False]]

    print("Setting location of sample files dependent on sample types.")
    for sample in prj.samples:
        if hasattr(sample, "protocol"):
            sample.library = sample.protocol
        if sample.library in ["ATAC-seq", "ChIP-seq", "ChIPmentation"]:
            sample.mapped = os.path.join(
                sample.paths.sample_root, "mapped",
                sample.name + ".trimmed.bowtie2.bam")
            sample.filtered = os.path.join(
                sample.paths.sample_root, "mapped",
                sample.name + ".trimmed.bowtie2.filtered.bam")
            sample.peaks = os.path.join(
                sample.paths.sample_root, "peaks",
                sample.name + "_peaks.narrowPeak")
        elif sample.library == "RNA-seq":
            sample.bitseq_counts = os.path.join(
                sample.paths.sample_root,
                "bowtie1_{}".format(sample.transcriptome),
                "bitSeq", sample.name + ".counts")

    # ANALYSIS
    if args.data_type is None:
        print("Type of analysis not specified. Will run an independent "
              "analysis for all types of data in the sample annotation sheet.")
        data_types = sorted(list(set([s.library for s in prj.samples])))
        print("Sample data types: '{}'.".format(",".join(data_types)))
    else:
        print("Type of analysis specified. Will run only the analysis for "
              "samples of type '{}'.".format(args.data_type))
        data_types = [args.data_type]
        print("Sample data types: '{}'.".format(",".join(data_types)))
    if args.name is None:
        print("Analysis name not specified, will use name in project "
              "configuration file: '{}'.".format(prj.project_name))
        args.name = prj.project_name

    for data_type in data_types:
        print("Starting analysis for samples of type: '{}'.".format(data_type))
        samples = [s for s in prj.samples if (s.library == data_type)]
        if len(samples) > 0:
            print("Samples under consideration: '{}'. ".format(
                ",".join([s.name for s in samples])) +
                "Total of {} samples.".format(len(samples)))
        else:
            raise ValueError(
                "There were no valid samples for this analysis type!")

        if data_type in ["ATAC-seq"]:
            print("Initializing ATAC-seq analysis")
            analysis = ATACSeqAnalysis(
                name=args.name + "_atacseq", prj=prj,
                samples=samples, results_dir=args.results_dir)
        elif data_type in ["ChIP-seq"]:
            print("Initializing ChIP-seq analysis")
            analysis = ChIPSeqAnalysis(
                name=args.name + "_chipseq", prj=prj,
                samples=samples, results_dir=args.results_dir)
        elif data_type in ["RNA-seq"]:
            print("Initializing RNA-seq analysis")
            analysis = RNASeqAnalysis(
                name=args.name + "_rnaseq", prj=prj,
                samples=samples, results_dir=args.results_dir)

        if hasattr(prj, "sample_attributes"):
            print("Using sample attributes from project configuration file: "
                  "'{}'.".format(",".join(prj.sample_attributes)))
            sample_attributes = prj.sample_attributes
        else:
            print("Project configuration file does not contain a "
                  "'sample_attributes' section.")
            print("Sample annotation will be minimal!")
            sample_attributes = ['sample_name']
        if hasattr(prj, "group_attributes"):
            print(
                "Using group attributes from project configuration file: '{}'."
.format(",".join(prj.group_attributes))) group_attributes = prj.group_attributes else: print( "Project configuration file does not contain a 'group_attributes' section." ) print("Group-wise labeling of samples will not be possible!") group_attributes = ['sample_name'] return main_analysis_pipeline(analysis, data_type=data_type, sample_attributes=sample_attributes, plotting_attributes=group_attributes, alpha=args.alpha, abs_fold_change=args.abs_fold_change)
def test_minimal_configuration_doesnt_fail(
        self, minimal_project_conf_path, env_config_filepath):
    """ Project ctor requires minimal config and default environment. """
    Project(config_file=minimal_project_conf_path,
            default_compute=env_config_filepath)
def test_minimal_configuration_output_dir(
        self, tmpdir, minimal_project_conf_path, env_config_filepath):
    """ Project infers output path from its configuration location. """
    project = Project(
        minimal_project_conf_path, default_compute=env_config_filepath)
    assert tmpdir.strpath == project.output_dir
def test_no_samples(self, protocols, delimiter, path_empty_project):
    """ Lack of Samples is unproblematic for the sheet build. """
    # Regardless of protocol(s), the sheet should be empty.
    p = Project(path_empty_project)
    sheet = p.build_sheet(*protocols)
    assert sheet.empty
def test_no_samples(self, path_empty_project):
    """ Lack of Samples is unproblematic. """
    p = Project(path_empty_project)
    assert 0 == p.num_samples
    assert [] == list(p.samples)
def test_empty_project(self, path_empty_project):
    """ It's unproblematic to create a Project that lacks samples. """
    Project(path_empty_project)
    interactions_TF.to_csv(
        os.path.join(foots_dir, label + ".piq.TF-TF_interactions.tsv"),
        sep="\t", index=False)

    # Filter for TF-TF interactions stronger than 1
    interactions_TF_filtered = interactions_TF[
        interactions_TF['interaction_score'] >= 1]
    interactions_TF_filtered.to_csv(
        os.path.join(
            foots_dir, label + ".piq.TF-TF_interactions.filtered.tsv"),
        sep="\t", index=False)


# INIT
# Get path configuration
data_dir = os.path.join('.', "data")
scratch_dir = os.path.join("/scratch/users/arendeiro/piq")
results_dir = os.path.join('.', "results")
plots_dir = os.path.join(results_dir, "plots")

# Start project
prj = Project("metadata/project_config.yaml")
prj.add_sample_sheet()

# annotate samples with a few more things:
prj.samples = annotate_samples(prj.samples, prj.sheet.df.columns.tolist())
samples = prj.samples

# MOTIFS FOR PIQ
motifs_file = "data/external/jaspar_human_motifs.txt"
n_motifs = 366

# prepare motifs for footprinting (done once)
cmds = piq_prepare_motifs(motifs_file, n_motifs)
for cmd in cmds:
    cmd2 = tk.slurmHeader(
        "PIQ_preparemotifs",
        os.path.join("/home/arendeiro/", "piq_preparemotifs.slurm.log"),
        cpusPerTask=1, queue="shortq")