Esempio n. 1
0
    def test_3_generate_mixed_dataset(self):
        """ Generates a mixed synthetic dataset of eschColi with reads[0] reads
            and dm3 with reads[1] reads.

            For every organism the FASTA header lines are counted, then the
            output of the self.simlib command is piped into the self.simngs
            command, whose FASTQ output is written to a single destination
            file under self.synthetic_fastq (first eschColi, then dm3).
            The file is finally cut down to sum(reads) reads with
            helpers.trim_fastq.
        """
        orgs = [os.path.join(self.reference, "eschColi_K12/seq/eschColi_K12.fa"),
                os.path.join(self.reference, "dm3/seq/dm3.fa")]

        reads = [3000, 6000]

        dst = os.path.join(self.synthetic_fastq,
                           "simngs.mixed_{org1}_{org2}_{reads1}vs{reads2}.fastq".format(org1='eschColi_K12',
                                                                                        org2='dm3',
                                                                                        reads1=reads[0],
                                                                                        reads2=reads[1]))

        # Work out the "-n" value of every organism before touching dst, so a
        # missing or empty reference fails without clobbering an existing file.
        per_entry = []
        for org, read in izip(orgs, reads):
            # Determine how many FASTA "Description lines" (headers) there are
            # since simNGS will generate reads depending on that number
            with open(org, 'r') as cnt:
                fa_entries = sum(1 for line in cnt if line.startswith('>'))

            # int() keeps the value integer-formatted ("3000", not "3000.0")
            # on interpreters where ceil() returns a float.
            per_entry.append(str(int(ceil(read / float(fa_entries)))))

        # Open dst once in write mode: leftovers of a previous (possibly
        # interrupted) run are discarded, and both pipelines write one after
        # the other through the same file descriptor.
        with open(dst, 'w') as fh:
            for org, n in izip(orgs, per_entry):
                cl1 = [self.simlib, "--seed", self.sim_seed, "-n", n, org]
                cl2 = [self.simngs, "-s", self.sim_seed, "-o", "fastq", self.runfile]

                # http://docs.python.org/2/library/subprocess.html#replacing-shell-pipeline
                p1 = subprocess.Popen(cl1, stdout=subprocess.PIPE)
                p2 = subprocess.Popen(cl2, stdin=p1.stdout, stdout=fh)
                # Drop our copy of the pipe before waiting, so p1 can receive
                # SIGPIPE if p2 exits early instead of blocking forever.
                p1.stdout.close()
                p2.communicate()
                # Reap the producer so it does not linger as a zombie.
                p1.wait()

        #trim_fastq will trim the excess of dm3 reads, as they're the last ones,
        #leading to a file with exactly 3000 reads of E. coli and 6000 of dm3
        helpers.trim_fastq(dst, sum(reads))
Esempio n. 2
0
    def test_2_run_simNGS(self):
        """ Simulates an Illumina run with simNGS read simulator
            for each organism in references directory.

            For every FASTA file matching "*/seq/*.fa" under self.reference
            and every read count in self.sim_reads, a FASTQ file named
            "simngs_<organism>_<reads>.fastq" is written to
            self.synthetic_fastq and then trimmed to that read count with
            helpers.trim_fastq. Datasets that already exist are left alone.
        """
        # Generate N simulated reads of every organism present in "org"
        orgs = glob.glob(os.path.join(self.reference, "*/seq/*.fa"))

        for org in orgs:
            # Header count of this FASTA; computed lazily (only when a dataset
            # actually has to be generated) and reused for all read counts.
            fa_entries = None

            for reads in self.sim_reads:
                dst = os.path.join(self.synthetic_fastq,
                                   "simngs_{org}_{reads}.fastq".format(org=org.split(os.sep)[-3], reads=reads))

                #Do not regenerate datasets that are already present
                if os.path.exists(dst):
                    continue

                if fa_entries is None:
                    # Determine how many FASTA "Description lines" (headers) there are
                    # since simNGS will generate reads depending on that number
                    with open(org, 'r') as cnt:
                        fa_entries = sum(1 for line in cnt if line.startswith('>'))

                # Computed before dst is created: if this raises (e.g. a FASTA
                # without headers) no empty file is left behind to be mistaken
                # for an existing dataset on the next run. int() keeps the
                # value integer-formatted where ceil() returns a float.
                n = str(int(ceil(reads / float(fa_entries))))
                cl1 = [self.simlib, "--seed", self.sim_seed, "-n", n, org]
                cl2 = [self.simngs, "-s", self.sim_seed, "-o", "fastq", self.runfile]
                # XXX: To be parametrized in future benchmarks (for paired end reads)
                #cl2 = [simngs, "-o", "fastq", "-p", "paired", runfile]

                with open(dst, 'w') as fh:
                    # http://docs.python.org/2/library/subprocess.html#replacing-shell-pipeline
                    p1 = subprocess.Popen(cl1, stdout=subprocess.PIPE)
                    p2 = subprocess.Popen(cl2, stdin=p1.stdout, stdout=fh)
                    # Drop our copy of the pipe before waiting, so p1 can
                    # receive SIGPIPE if p2 exits early.
                    p1.stdout.close()
                    p2.communicate()
                    # Reap the producer so it does not linger as a zombie.
                    p1.wait()

                #Trim the FASTQ file to the actual number of reads
                helpers.trim_fastq(dst, reads)