import argparse
import sys

# Pipeline is provided by the st_pipeline package
from stpipeline.core.pipeline import Pipeline


def main(argv):
    # Create the pipeline object
    pipeline = Pipeline()
    # Create a parser
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # Parse parameters, sanity check and run the pipeline
    try:
        parser = pipeline.createParameters(parser)
        # Parse arguments (argv is forwarded so callers can supply their own)
        options = parser.parse_args(argv)
        pipeline.load_parameters(options)
        sys.stdout.write("ST Pipeline, parameters loaded\n")
        pipeline.createLogger()
        sys.stdout.write("ST Pipeline, logger created\n")
        pipeline.sanityCheck()
        sys.stdout.write("ST Pipeline, sanity check passed. Starting the run.\n")
        pipeline.run()
        sys.stdout.write("ST Pipeline, run completed!\n")
    except Exception as e:
        sys.stderr.write("Error running the pipeline\n")
        sys.stderr.write(str(e) + "\n")
        sys.exit(1)
    finally:
        pipeline.clean_filenames()
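A minimal sketch of how this main function could be wired up as a script entry point. The guard below is an assumption about the surrounding module, not part of the original listing; it simply forwards the command-line arguments (minus the program name) into main.

# Hypothetical entry point; assumed, not from the original listing.
if __name__ == "__main__":
    main(sys.argv[1:])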
import multiprocessing
import os
import tempfile
import unittest
from shutil import copyfile
from subprocess import check_call

import numpy as np
import pandas as pd

from stpipeline.core.pipeline import Pipeline


class TestPipeline(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        # Obtain paths and files
        testdir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
        cls.infile_fw = os.path.join(testdir, 'input/arrayjet_1002/testdata_R1.fastq')
        cls.infile_rv = os.path.join(testdir, 'input/arrayjet_1002/testdata_R2.fastq')
        cls.annotfile = os.path.join(testdir, 'config/annotations/Homo_sapiens.GRCh38.79_chr19.gtf')
        cls.chipfile = os.path.join(testdir, 'config/idfiles/150204_arrayjet_1000L2_probes.txt')
        cls.expname = "test"
        # Create temporary working and output directories
        cls.tmpdir = tempfile.mkdtemp(prefix="st_pipeline_test_temp")
        print("ST Pipeline Test Temporary directory {}".format(cls.tmpdir))
        cls.outdir = tempfile.mkdtemp(prefix="st_pipeline_test_output")
        print("ST Pipeline Test Temporary output {}".format(cls.outdir))
        cls.error_file = os.path.join(cls.tmpdir, 'error.log')
        # mktemp only reserves a name; the pipeline creates the log file itself
        cls.logFile = tempfile.mktemp(prefix="st_pipeline_test_log")
        print("ST Pipeline Test Log file {}".format(cls.logFile))
        # Create genome index dirs
        cls.genomedir = os.path.join(cls.tmpdir, 'config/genomes/mouse_grcm38')
        os.makedirs(cls.genomedir)
        # STAR contaminant dir
        cls.contamdir = os.path.join(cls.tmpdir, 'config/contaminant_genomes/R45S5_R5S1')
        os.makedirs(cls.contamdir)
        genomefasta = os.path.join(cls.genomedir, "human_grcm38_chromosome19.fasta")
        genomefastagz = os.path.join(cls.genomedir, "human_grcm38_chromosome19.fasta.gz")
        # Change dir to the temp folder
        os.chdir(cls.tmpdir)
        # Copy the bundled genome fasta into the temp folder and unpack it
        try:
            print("ST Pipeline Test Obtaining genome files...")
            copyfile(os.path.join(testdir, "config/Homo_sapiens.GRCh38.dna.chromosome.19.fa.gz"),
                     genomefastagz)
            check_call(['gunzip', genomefastagz])
        except Exception as e:
            print(str(e))
            raise RuntimeError("Obtaining genome files failed")
        # Build the STAR genome and contaminant indexes
        try:
            print("ST Pipeline Test Creating genome index...")
            check_call(["STAR", "--runMode", "genomeGenerate",
                        "--runThreadN", str(multiprocessing.cpu_count() - 1),
                        "--genomeDir", cls.genomedir,
                        "--genomeFastaFiles", genomefasta])
            print("ST Pipeline Test Creating contaminant genome index...")
            contamfasta = os.path.join(testdir,
                                       "config/contaminant_genomes/R45S5_R5S1/Rn45s_Rn5s.fasta")
            check_call(["STAR", "--runMode", "genomeGenerate",
                        "--runThreadN", str(multiprocessing.cpu_count() - 1),
                        "--genomeDir", cls.contamdir,
                        "--genomeFastaFiles", contamfasta])
        except Exception as e:
            print(str(e))
            raise RuntimeError("Creating genome index failed")
        # Verify existence of input files
        assert os.path.exists(cls.infile_fw)
        assert os.path.exists(cls.infile_rv)
        assert os.path.isdir(cls.genomedir)
        assert os.path.isdir(cls.contamdir)
        assert os.path.exists(cls.annotfile)
        assert os.path.exists(cls.chipfile)
        assert os.path.isdir(cls.outdir)
        assert os.path.isdir(cls.tmpdir)
        # Create a Pipeline instance and initialize its parameters
        cls.pipeline = Pipeline()
        cls.pipeline.expName = cls.expname
        cls.pipeline.fastq_fw = cls.infile_fw
        cls.pipeline.fastq_rv = cls.infile_rv
        cls.pipeline.umi_allowed_mismatches = 1
        cls.pipeline.umi_start_position = 18
        cls.pipeline.umi_end_position = 27
        cls.pipeline.keep_discarded_files = True
        cls.pipeline.allowed_missed = 2
        cls.pipeline.allowed_kmer = 6
        cls.pipeline.min_length_trimming = 25
        cls.pipeline.trimming_rv = 1
        cls.pipeline.min_quality_trimming = 20
        cls.pipeline.clean = False
        cls.pipeline.barcode_start = 0
        cls.pipeline.threads = multiprocessing.cpu_count() - 1
        cls.pipeline.verbose = True
        cls.pipeline.ids = os.path.abspath(cls.chipfile)
        cls.pipeline.ref_map = os.path.abspath(cls.genomedir)
        cls.pipeline.ref_annotation = os.path.abspath(cls.annotfile)
        cls.pipeline.htseq_mode = "intersection-nonempty"
        cls.pipeline.htseq_no_ambiguous = False
        cls.pipeline.contaminant_index = os.path.abspath(cls.contamdir)
        cls.pipeline.output_folder = os.path.abspath(cls.outdir)
        cls.pipeline.temp_folder = os.path.abspath(cls.tmpdir)
        cls.pipeline.logfile = cls.logFile
        cls.pipeline.remove_polyA_distance = 15
        cls.pipeline.remove_polyT_distance = 15
        cls.pipeline.remove_polyG_distance = 15
        cls.pipeline.remove_polyC_distance = 15
        cls.pipeline.umi_cluster_algorithm = "hierarchical"
        cls.pipeline.umi_filter = True
        cls.pipeline.compute_saturation = True
        cls.pipeline.include_non_annotated = True
        cls.pipeline.inverse_trimming_rv = 1
        cls.pipeline.low_memory = True
        cls.pipeline.two_pass_mode = True
        cls.pipeline.saturation_points = [10, 100, 1000, 10000]

    @classmethod
    def tearDownClass(cls):
        print("ST Pipeline Test Remove temporary output {}".format(cls.outdir))
        for root, dirs, files in os.walk(cls.outdir, topdown=False):
            for name in files:
                os.remove(os.path.join(root, name))
            for name in dirs:
                os.rmdir(os.path.join(root, name))
        if os.path.exists(cls.outdir):
            os.rmdir(cls.outdir)
        print("ST Pipeline Test Remove temporary directory {}".format(cls.tmpdir))
        for root, dirs, files in os.walk(cls.tmpdir, topdown=False):
            for name in files:
                os.remove(os.path.join(root, name))
            for name in dirs:
                os.rmdir(os.path.join(root, name))
        if os.path.exists(cls.tmpdir):
            os.rmdir(cls.tmpdir)
        # Remove STAR log files written to the working directory
        for logfile in ("Log.std.out", "Log.out", "SJ.out.tab",
                        "Log.progress.out", "Log.final.out"):
            if os.path.isfile(logfile):
                os.remove(logfile)

    def validateOutputData(self, expName):
        # Verify that the output and temp folders are not empty
        self.assertNotEqual(os.listdir(self.outdir), [], "Output folder is empty")
        self.assertNotEqual(os.listdir(self.tmpdir), [], "Tmp folder is empty")
        datafile = os.path.join(self.outdir, str(self.pipeline.expName) + "_stdata.tsv")
        readsfile = os.path.join(self.outdir, str(self.pipeline.expName) + "_reads.bed")
        statsfile = os.path.join(self.outdir, str(self.pipeline.expName) + "_qa_stats.json")
        self.assertTrue(os.path.exists(datafile), "ST Data file does not exist")
        self.assertTrue(os.path.getsize(datafile) > 1024, "ST Data file is empty")
        self.assertTrue(os.path.exists(readsfile), "ST Data BED file does not exist")
        self.assertTrue(os.path.getsize(readsfile) > 1024, "ST Data BED file is empty")
        self.assertTrue(os.path.exists(statsfile), "Stats JSON file does not exist")
        # Verify that the stats are correct
        counts_table = pd.read_csv(datafile, sep="\t", header=0, index_col=0)
        self.assertEqual(np.sum(counts_table.values, dtype=np.int32), 8429,
                         "ST data incorrect stats")
        self.assertEqual(len(counts_table.columns), 638, "ST data incorrect stats")
        aggregated_spot_counts = counts_table.sum(axis=1).values
        aggregated_gene_counts = (counts_table != 0).sum(axis=1).values
        self.assertEqual(aggregated_gene_counts.max(), 78, "ST data incorrect stats")
        self.assertEqual(aggregated_gene_counts.min(), 1, "ST data incorrect stats")
        self.assertEqual(aggregated_spot_counts.max(), 192, "ST data incorrect stats")
        self.assertEqual(aggregated_spot_counts.min(), 1, "ST data incorrect stats")

    def test_normal_run(self):
        """Runs the st_pipeline on a small human chr19 data subset with normal fastq files."""
        # Start the pipeline
        try:
            self.pipeline.createLogger()
            self.pipeline.sanityCheck()
            self.pipeline.run()
            self.pipeline.clean_filenames()
        except Exception as e:
            print(str(e))
            self.fail("Running Pipeline Test failed")
        self.validateOutputData(self.expname)
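For completeness, a sketch of how this test module might be launched directly; the guard below is an assumption, since the original listing does not show how the suite is run. Note that setUpClass shells out to STAR, so the binary must be on PATH for the fixtures to build.

# Hypothetical test-runner guard; assumed, not from the original listing.
# Equivalent to running: python -m unittest <this_module>
if __name__ == "__main__":
    unittest.main()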