Exemple #1
0
 def __init__(self,
              exp_seqs_fname,
              control_seqs_fname,
              kmer_lens,
              output_dir,
              exp_coords_fname=None,
              control_coords_fname=None,
              genome="mm9"):
     """
     Record the inputs of an experimental-vs-control kmer comparison
     and create the "MotifSet" logger.

     The constructor only stores its arguments; the per-length kmer
     table lists start out empty.
     """
     # Output location, genome name and kmer lengths to consider
     self.output_dir = output_dir
     self.genome = genome
     self.kmer_lens = kmer_lens
     # Sequence files (FASTA) for the exp and control conditions
     self.exp_seqs_fname = exp_seqs_fname
     self.control_seqs_fname = control_seqs_fname
     # Optional coordinates files for the exp and control conditions
     self.exp_coords_fname = exp_coords_fname
     self.control_coords_fname = control_coords_fname
     # Kmer tables, one list per condition (empty until populated)
     self.exp_kmer_tables = []
     self.control_kmer_tables = []
     # Log into the "logs" subdirectory of the output directory
     logs_dir = os.path.join(self.output_dir, "logs")
     self.logger = utils.get_logger("MotifSet", logs_dir)
Exemple #2
0
 def __init__(self,
              exp_seqs_fname,
              control_seqs_fname,
              kmer_lens,
              output_dir,
              exp_coords_fname=None,
              control_coords_fname=None,
              genome="mm9"):
     """
     Store the sequence/coordinate filenames, kmer lengths, output
     directory and genome name, then set up the "MotifSet" logger.

     No files are read here; the kmer table lists are initialized
     empty.
     """
     # Experimental condition: sequences (FASTA) and optional coordinates
     self.exp_seqs_fname = exp_seqs_fname
     self.exp_coords_fname = exp_coords_fname
     # Control condition: sequences (FASTA) and optional coordinates
     self.control_seqs_fname = control_seqs_fname
     self.control_coords_fname = control_coords_fname
     # Kmer lengths to consider and where output is written
     self.kmer_lens = kmer_lens
     self.output_dir = output_dir
     # Optional genome name
     self.genome = genome
     # Kmer tables for each kmer length, filled in later
     self.exp_kmer_tables = []
     self.control_kmer_tables = []
     log_dirname = os.path.join(self.output_dir, "logs")
     self.logger = utils.get_logger("MotifSet", log_dirname)
Exemple #3
0
 def __init__(self, sample, pipeline):
     """
     Set up quality-control state for one sample of a pipeline.

     Creates the sample's QC directory and its "regions"
     subdirectory, then finishes by calling load_qc_from_file().
     """
     # Pipeline instance that the sample is attached to
     self.pipeline = pipeline
     self.sample = sample
     self.settings_info = pipeline.settings_info
     # Per-sample logger, written to the pipeline's logs directory
     logger_name = "QualityControl.%s" % (sample.label)
     self.logger = utils.get_logger(logger_name,
                                    self.pipeline.pipeline_outdirs["logs"])
     # Order in which the QC fields are outputted
     self.regions_header = [
         "num_ribo", "num_exons", "num_cds", "num_introns",
         "num_3p_utr", "num_5p_utr", "num_tRNAs", "num_junctions",
     ]
     self.qc_stats_header = [
         "percent_mapped", "percent_unique", "percent_ribo",
         "percent_exons", "percent_cds", "percent_introns",
         "percent_3p_utr", "percent_5p_utr", "percent_tRNAs",
         "3p_to_cds", "5p_to_cds", "3p_to_5p", "exon_intron_ratio",
     ]
     # Full header: read counts, then the stats, then the region counts
     self.qc_header = ["num_reads", "num_mapped", "num_ribosub_mapped",
                       "num_unique_mapped"]
     self.qc_header += self.qc_stats_header
     self.qc_header += self.regions_header
     # QC results; missing fields fall back to the current na_val
     self.na_val = "NA"
     self.qc_results = defaultdict(lambda: self.na_val)
     # Output directories: <qc>/<label> and <qc>/<label>/regions
     self.qc_outdir = self.pipeline.pipeline_outdirs["qc"]
     self.sample_outdir = os.path.join(self.qc_outdir, self.sample.label)
     utils.make_dir(self.sample_outdir)
     self.regions_outdir = os.path.join(self.sample_outdir, "regions")
     utils.make_dir(self.regions_outdir)
     # QC filename for this sample
     qc_basename = "%s.qc.txt" % (self.sample.label)
     self.qc_filename = os.path.join(self.sample_outdir, qc_basename)
     self.qc_loaded = False
     # The ensGene gene table is used for QC computations
     self.gene_table = self.pipeline.rna_base.gene_tables["ensGene"]
     # Load QC information if a file corresponding to the sample
     # already exists
     self.load_qc_from_file()
Exemple #4
0
 def __init__(self, settings_filename, log_output_dir, curr_sample=None):
     """
     Initialize pipeline.

     Resets the instance state and then runs the setup steps in
     order: load_pipeline_settings(), init_outdirs(), logger
     creation, load_cluster(), load_rna_base(),
     load_pipeline_samples() and init_qc().
     """
     # Sample the pipeline was invoked on, if any
     self.curr_sample = curr_sample
     self.genome = None
     # Logging directory versus directory for actual pipeline output
     self.log_output_dir = log_output_dir
     self.output_dir = None
     # Settings file and the values loaded from it
     self.settings_filename = settings_filename
     self.sequence_filenames = None
     self.parsed_settings = None
     self.settings_info = None
     self.data_type = None
     # Directory where pipeline init files are
     self.init_dir = None
     # Paired-end or not
     self.is_paired_end = None
     # Sample <-> group mappings and the samples themselves
     self.sample_to_group = None
     self.group_to_samples = None
     self.samples = []
     # Cluster object to use
     self.my_cluster = None
     # Check settings are correct
     self.load_pipeline_settings()
     # Pipeline output subdirectories
     self.pipeline_outdirs = {}
     # RPKM directory for the pipeline
     self.rpkm_dir = None
     # QC objects for each sample in pipeline
     self.qc_objects = {}
     # Top-level output dirs
     self.toplevel_dirs = ["rawdata", "mapping", "qc", "analysis", "logs"]
     self.init_outdirs()
     # Logger name carries the sample when running on a single sample
     log_name = ("Pipeline"
                 if self.curr_sample is None
                 else "Pipeline.%s" % (self.curr_sample))
     self.logger = utils.get_logger(log_name, self.pipeline_outdirs["logs"])
     self.load_cluster()
     # RNA Base: object storing all the relevant initialization
     # information
     self.rna_base = None
     self.load_rna_base()
     # Samples
     self.load_pipeline_samples()
     # QC for samples; qc_header is the order of QC fields to be
     # outputted
     self.qc_header = []
     self.init_qc()
     self.na_val = "NA"
Exemple #5
0
 def __init__(self, sample, pipeline):
     """
     Prepare QC bookkeeping for a sample attached to a pipeline.

     Builds the QC field headers, creates the per-sample QC and
     regions directories, and ends by calling load_qc_from_file().
     """
     # Pipeline instance that the sample is attached to
     self.pipeline = pipeline
     self.sample = sample
     self.settings_info = pipeline.settings_info
     # Define logger
     logs_dir = self.pipeline.pipeline_outdirs["logs"]
     self.logger = utils.get_logger("QualityControl.%s" % (sample.label),
                                    logs_dir)
     # QC header: order of QC fields to be outputted
     self.regions_header = [
         "num_ribo",
         "num_exons",
         "num_cds",
         "num_introns",
         "num_3p_utr",
         "num_5p_utr",
         "num_tRNAs",
         "num_junctions",
     ]
     self.qc_stats_header = [
         "percent_mapped",
         "percent_unique",
         "percent_ribo",
         "percent_exons",
         "percent_cds",
         "percent_introns",
         "percent_3p_utr",
         "percent_5p_utr",
         "percent_tRNAs",
         "3p_to_cds",
         "5p_to_cds",
         "3p_to_5p",
         "exon_intron_ratio",
     ]
     # Read-count fields come first in the combined header
     count_fields = [
         "num_reads",
         "num_mapped",
         "num_ribosub_mapped",
         "num_unique_mapped",
     ]
     self.qc_header = (count_fields
                       + self.qc_stats_header
                       + self.regions_header)
     # QC results: unknown fields default to the current na_val
     self.na_val = "NA"
     self.qc_results = defaultdict(lambda: self.na_val)
     # QC output dir
     self.qc_outdir = self.pipeline.pipeline_outdirs["qc"]
     # Output dir for this sample
     self.sample_outdir = os.path.join(self.qc_outdir, self.sample.label)
     utils.make_dir(self.sample_outdir)
     # Regions output dir
     self.regions_outdir = os.path.join(self.sample_outdir, "regions")
     utils.make_dir(self.regions_outdir)
     # QC filename for this sample
     self.qc_filename = os.path.join(
         self.sample_outdir,
         "%s.qc.txt" % (self.sample.label),
     )
     self.qc_loaded = False
     # use ensGene gene table for QC computations
     self.gene_table = self.pipeline.rna_base.gene_tables["ensGene"]
     # Pick up QC information from a previous run if the file
     # corresponding to the sample already exists
     self.load_qc_from_file()
Exemple #6
0
 def __init__(self, results_dir, output_dir,
              label=None):
     """
     Record the results/output directories and create the logger.

     The logger is named after label, or "BindnSeq" when no label
     is given, and logs into output_dir.
     """
     self.output_dir = output_dir
     # Fall back to a generic logger name when unlabeled
     self.logger_label = "BindnSeq" if label is None else label
     self.logger = utils.get_logger(self.logger_label, self.output_dir)
     self.results_dir = results_dir
     self.label = label
     # All kmer lengths to load: 4 through 9
     self.kmer_lens = list(range(4, 10))
     # Odds ratios (DataFrames indexed by kmer length)
     self.odds_ratios = {}
     # Counts (DataFrames indexed by kmer length)
     self.counts = {}
Exemple #7
0
 def load_settings(self):
     """
     Load settings for misowrap.

     Parses the file named by self.settings_filename and copies the
     relevant values onto the instance. Creates the MISO output,
     comparisons and logs directories, loads the cluster object when
     self.use_cluster is set, creates the logger and builds the
     paths to the MISO scripts.

     Prints an error and exits with status 1 if the "ucsc" tables
     directory or the constitutive exons GFF does not exist.
     """
     settings_info, _parsed_settings = \
         misowrap_settings.load_misowrap_settings(self.settings_filename)
     self.settings_info = settings_info
     settings = self.settings_info["settings"]
     # Load basic settings about data
     self.read_len = settings["readlen"]
     self.overhang_len = settings["overhanglen"]
     self.miso_bin_dir = utils.pathify(settings["miso_bin_dir"])
     self.miso_settings_filename = \
         utils.pathify(settings["miso_settings_filename"])
     self.miso_events_dir = utils.pathify(settings["miso_events_dir"])
     self.miso_outdir = utils.pathify(settings["miso_output_dir"])
     # Load data-related parameters
     data = self.settings_info["data"]
     self.bam_files = data["bam_files"]
     if "insert_lens_dir" in data:
         self.insert_lens_dir = utils.pathify(data["insert_lens_dir"])
     # Sample labels
     self.sample_labels = data["sample_labels"]
     # Set output directories
     self.comparisons_dir = os.path.join(self.output_dir, "comparisons")
     self.comparison_groups = data["comparison_groups"]
     self.logs_outdir = os.path.join(self.output_dir, "misowrap_logs")
     # Create necessary directories
     utils.make_dir(self.miso_outdir)
     utils.make_dir(self.comparisons_dir)
     utils.make_dir(self.logs_outdir)
     if "cluster_type" in settings:
         self.use_cluster = True
         self.cluster_type = settings["cluster_type"]
         self.chunk_jobs = settings["chunk_jobs"]
     # NOTE(review): use_cluster is only assigned above when a cluster
     # type is configured; presumably a default is set elsewhere
     # (e.g. in __init__) -- confirm.
     if self.use_cluster:
         print("Loading cluster information.")
         # Load cluster object if given a cluster type
         self.load_cluster()
     # Create a logger object
     if self.logger_label is None:
         self.logger_label = "misowrap"
     else:
         # Prefix the instance's label; the bare name `logger_label`
         # used previously is undefined in this scope (NameError).
         self.logger_label = "misowrap_%s" % (self.logger_label)
     self.logger = utils.get_logger(self.logger_label, self.logs_outdir)
     # Whether to prefilter MISO events: by default, MISO events are
     # not prefiltered
     if "prefilter_miso" not in settings:
         settings["prefilter_miso"] = False
     self.prefilter_miso = settings["prefilter_miso"]
     # Load event types
     self.load_event_types()
     # Set path to MISO scripts
     self.run_miso_cmd = os.path.join(self.miso_bin_dir, "run_miso.py")
     self.run_events_cmd = os.path.join(self.miso_bin_dir,
                                        "run_events_analysis.py")
     self.pe_utils_cmd = os.path.join(self.miso_bin_dir, "pe_utils.py")
     # Files related to gene tables
     self.tables_dir = \
         os.path.join(self.settings_info["pipeline-files"]["init_dir"],
                      "ucsc")
     if not os.path.isdir(self.tables_dir):
         print("Error: %s directory does not exist."
               % (self.tables_dir))
         sys.exit(1)
     self.const_exons_gff = os.path.join(self.tables_dir,
                                         "exons",
                                         "const_exons",
                                         "ensGene.const_exons.gff")
     if not os.path.isfile(self.const_exons_gff):
         print("Error: Const. exons GFF %s does not exist."
               % (self.const_exons_gff))
         sys.exit(1)
Exemple #8
0
 def load_settings(self):
     """
     Load settings for misowrap.

     Parses the file named by self.settings_filename and copies the
     relevant values onto the instance. Creates the logs directory,
     loads the cluster object when self.use_cluster is set, creates
     the logger and builds the paths to the MISO commands.

     Prints an error and exits with status 1 if the "ucsc" tables
     directory or the constitutive exons GFF does not exist.
     """
     settings_info, _parsed_settings = \
         misowrap_settings.load_misowrap_settings(self.settings_filename)
     self.settings_info = settings_info
     settings = self.settings_info["settings"]
     # Load basic settings about data
     self.read_len = settings["readlen"]
     self.overhang_len = settings["overhanglen"]
     self.miso_bin_dir = utils.pathify(settings["miso_bin_dir"])
     self.miso_settings_filename = \
         utils.pathify(settings["miso_settings_filename"])
     self.miso_events_dir = utils.pathify(settings["miso_events_dir"])
     self.miso_outdir = utils.pathify(settings["miso_output_dir"])
     # Load data-related parameters
     data = self.settings_info["data"]
     self.bam_files = data["bam_files"]
     if "insert_lens_dir" in data:
         self.insert_lens_dir = utils.pathify(data["insert_lens_dir"])
     # Sample labels
     self.sample_labels = data["sample_labels"]
     # Set output directories
     self.comparisons_dir = os.path.join(self.output_dir, "comparisons")
     self.comparison_groups = data["comparison_groups"]
     self.logs_outdir = os.path.join(self.output_dir, "misowrap_logs")
     # Create necessary directories
     utils.make_dir(self.logs_outdir)
     if "cluster_type" in settings:
         self.use_cluster = True
         self.cluster_type = settings["cluster_type"]
         self.chunk_jobs = settings["chunk_jobs"]
     # NOTE(review): use_cluster is only assigned above when a cluster
     # type is configured; presumably a default is set elsewhere
     # (e.g. in __init__) -- confirm.
     if self.use_cluster:
         print("Loading cluster information.")
         # Load cluster object if given a cluster type
         self.load_cluster()
     # Create a logger object
     if self.logger_label is None:
         self.logger_label = "misowrap"
     else:
         # Prefix the instance's label; the bare name `logger_label`
         # used previously is undefined in this scope (NameError).
         self.logger_label = "misowrap_%s" % (self.logger_label)
     self.logger = utils.get_logger(self.logger_label, self.logs_outdir)
     # Whether to prefilter MISO events: by default, MISO events are
     # not prefiltered
     if "prefilter_miso" not in settings:
         settings["prefilter_miso"] = False
     self.prefilter_miso = settings["prefilter_miso"]
     # Load event types
     self.load_event_types()
     # Set path to MISO scripts
     self.compare_miso_cmd = os.path.join(self.miso_bin_dir, "compare_miso")
     self.summarize_miso_cmd = os.path.join(self.miso_bin_dir,
                                            "summarize_miso")
     self.run_events_cmd = os.path.join(self.miso_bin_dir, "miso")
     self.pe_utils_cmd = os.path.join(self.miso_bin_dir, "pe_utils")
     # Files related to gene tables
     self.tables_dir = \
         os.path.join(self.settings_info["pipeline-files"]["init_dir"],
                      "ucsc")
     if not os.path.isdir(self.tables_dir):
         print("Error: %s directory does not exist."
               % (self.tables_dir))
         sys.exit(1)
     self.const_exons_gff = os.path.join(self.tables_dir, "exons",
                                         "const_exons",
                                         "ensGene.const_exons.gff")
     if not os.path.isfile(self.const_exons_gff):
         print("Error: Const. exons GFF %s does not exist."
               % (self.const_exons_gff))
         sys.exit(1)