Esempio n. 1
0
 def blast_db(self):
     """Return the BLAST database wrapping the combined genes file.

     The database object is created on `self.p.genes` with type 'prot'.
     When that file does not exist yet, the RefSeq bacterial and archaeal
     files and the missing marine and fresh files are decompressed with
     `zcat` into it and the sequence counts are verified. When `self.p.pin`
     does not exist yet, `makeblastdb` is run to build the index.
     """
     database = BLASTdb(self.p.genes, 'prot')
     # Step one: concatenate every source file into a single FASTA #
     if not self.p.genes.exists:
         print("Regrouping all fasta files together...")
         sources = self.refseq_bact_orig + self.refseq_arch_orig \
                 + self.missing_marine + self.missing_fresh
         command = "zcat %s > %s" % (' '.join(sources), self.p.genes)
         shell_output(command)
         self.timer.print_elapsed()
         # The combined count must equal the sum over all the inputs;    #
         # a mismatch hints at a file that did not end with a newline    #
         print("Checking that sequence counts match...")
         combined_count = len(database)
         groups = (self.refseq_bact_orig, self.refseq_arch_orig,
                   self.missing_marine, self.missing_fresh)
         expected_count = sum(sum(len(item) for item in group)
                              for group in groups)
         assert combined_count == expected_count
         self.timer.print_elapsed()
     # Step two: index the FASTA for BLAST #
     if not self.p.pin.exists:
         print("Building a BLAST database...")
         database.makeblastdb(logfile=self.p.log, out=self.p.out)
         self.timer.print_elapsed()
     return database
Esempio n. 2
0
 def blast_db(self):
     """Return the BLAST database wrapping the combined genes file.

     The database object is created on `self.p.genes` with type 'prot'.
     When that file does not exist yet, the RefSeq bacterial and archaeal
     files and the missing marine and fresh files are decompressed with
     `zcat` into it and the sequence counts are verified. When `self.p.pin`
     does not exist yet, `makeblastdb` is run to build the index.
     """
     database = BLASTdb(self.p.genes, 'prot')
     genes_present = self.p.genes.exists
     if not genes_present:
         # Concatenate every compressed source into one FASTA #
         print("Regrouping all fasta files together...")
         sources = self.refseq_bact_orig + self.refseq_arch_orig \
                 + self.missing_marine + self.missing_fresh
         shell_output("zcat %s > %s" % (' '.join(sources), self.p.genes))
         self.timer.print_elapsed()
         # Verify nothing was lost or merged while concatenating, e.g.  #
         # because an input file did not end with a newline             #
         print("Checking that sequence counts match...")
         combined_count = len(database)
         expected_count = 0
         for group in (self.refseq_bact_orig, self.refseq_arch_orig,
                       self.missing_marine, self.missing_fresh):
             expected_count += sum(len(item) for item in group)
         assert combined_count == expected_count
         self.timer.print_elapsed()
     index_present = self.p.pin.exists
     if not index_present:
         # Build the BLAST index files #
         print("Building a BLAST database...")
         database.makeblastdb(logfile=self.p.log, out=self.p.out)
         self.timer.print_elapsed()
     return database
Esempio n. 3
0
 def retrieve_from_nt(self):
     """Write the GI number of every sufficiently short NT entry.

     All entries of the local `nt` database are dumped by `blastdbcmd` as
     "GI length" pairs into a temporary file. The GI numbers whose length
     is strictly below `self.length_cutoff` are then passed to
     `self.writelines`, one per line, and the temporary file is removed.
     """
     scratch = new_temp_file()
     dump_command = "blastdbcmd -db nt -entry all -outfmt '%g %l' > "
     shell_output(dump_command + scratch)

     def short_entries():
         # Lazily parse each "GI length" row and keep the short ones #
         for row in scratch:
             gid, length = map(int, row.strip('\n').split())
             if length < self.length_cutoff:
                 yield str(gid) + '\n'

     self.writelines(short_entries())
     scratch.remove()
 def sqlite_by_shell(self, destination):
     """Convert to SQLite through the shell, using a temporary script.

     A dump script is generated at a temporary path by
     `sqlite_dump_shell`, then fed to the `sqlite3` command line tool as
     its init file so that it populates *destination*. The temporary
     script is removed afterwards.

     :param destination: path of the SQLite file to create.
     """
     from shell_command import shell_output
     script_path = new_temp_path()
     self.sqlite_dump_shell(script_path)
     # The original code referenced an undefined name `script` here and #
     # below, which raised a NameError before sqlite3 was ever started  #
     shell_output('sqlite3 -bail -init "%s" "%s" .quit' %
                  (script_path, destination))
     script_path.remove()
 def count(self):
     """Return the number of lines of the file divided by four.

     The lines are counted by `wc -l` in a shell pipeline. When
     `self.gzipped` is true the file is decompressed on the fly with
     `zcat` on Linux and `gzcat` elsewhere (e.g. macOS).
     NOTE(review): dividing by four presumably yields the number of FASTQ
     records -- confirm against the enclosing class.

     :returns: the line count floor-divided by four, as an int.
     """
     # Import module #
     from shell_command import shell_output
     if not self.gzipped:
         # Case when we are not compressed #
         command = "cat %s | wc -l" % self.path
     else:
         # `sys.platform` is 'linux2' on Python 2 and 'linux' on Python   #
         # 3.3+, so compare by prefix; an equality test against 'linux'   #
         # would pick `gzcat` on a Python 2 Linux host                    #
         on_linux = sys.platform.startswith('linux')
         program = 'zcat' if on_linux else 'gzcat'
         command = "%s %s | wc -l" % (program, self.path)
     # Floor division keeps the result an exact integer on any version #
     return int(shell_output(command)) // 4
Esempio n. 6
0
 def blast_db(self):
     """Return the BLAST database made of the genes of every genome.

     The database object wraps `self.p.all_fasta` with `self.seq_type`.
     Unless `self.p.all_nin` or `self.p.all_pin` is truthy, all the values
     of `genomes` are decompressed with `gunzip -c` into the database
     file, the sequence count is verified and `makeblastdb` is run.
     """
     database = BLASTdb(self.p.all_fasta, self.seq_type)
     # Skip the build as soon as one of the two index paths is truthy #
     if self.p.all_nin or self.p.all_pin:
         return database
     print("--> STEP 1: Building BLASTable database with all genes...")
     archives = ' '.join(genomes.values())
     shell_output('gunzip -c %s > %s' % (archives, database))
     # The merged file must hold as many sequences as all its inputs #
     merged_count = len(database)
     expected_count = sum(len(genome) for genome in genomes.values())
     assert merged_count == expected_count
     database.makeblastdb()
     self.timer.print_elapsed()
     return database
Esempio n. 7
0
 def fresh_fasta(self):
     """Return the FASTA file holding the genes of all fresh genomes.

     When the file at `self.p.fresh_fasta` does not exist yet, every value
     of `genomes` whose `fresh` attribute is truthy is decompressed with
     `gunzip -c` into it and the sequence count is verified.
     """
     combined = FASTA(self.p.fresh_fasta)
     # Already built: hand it back untouched #
     if combined.exists:
         return combined
     print("Building fasta file with all fresh genes...")
     fresh_genomes = [genome for genome in genomes.values() if genome.fresh]
     archives = ' '.join(fresh_genomes)
     shell_output('gunzip -c %s > %s' % (archives, combined))
     # The merged file must hold as many sequences as all its inputs #
     merged_count = len(combined)
     expected_count = sum(len(genome) for genome in fresh_genomes)
     assert merged_count == expected_count
     self.timer.print_elapsed()
     return combined
Esempio n. 8
0
 def fresh_fasta(self):
     """Return the FASTA file holding the genes of all fresh genomes.

     When the file at `self.p.fresh_fasta` does not exist yet, every value
     of `genomes` whose `fresh` attribute is truthy is decompressed with
     `gunzip -c` into it and the sequence count is verified.
     """
     target = FASTA(self.p.fresh_fasta)
     needs_build = not target.exists
     if needs_build:
         print("Building fasta file with all fresh genes...")
         # Keep only the genomes flagged as fresh #
         selected = []
         for genome in genomes.values():
             if genome.fresh:
                 selected.append(genome)
         command = 'gunzip -c %s > %s' % (' '.join(selected), target)
         shell_output(command)
         # Sequence counts before and after merging have to agree #
         merged_count = len(target)
         assert merged_count == sum(len(genome) for genome in selected)
         self.timer.print_elapsed()
     return target
Esempio n. 9
0
 def retrieve_from_nt(self):
     """Write the GI number of every sufficiently short NT entry.

     All entries of the local `nt` database are dumped by `blastdbcmd` as
     "GI length" pairs into a temporary file. The GI numbers whose length
     is strictly below `self.length_cutoff` are then passed to
     `self.writelines`, one per line, and the temporary file is removed.
     """
     listing = new_temp_file()
     shell_output(
         "blastdbcmd -db nt -entry all -outfmt '%g %l' > " + listing)

     def kept_lines():
         # Parse rows on demand so the whole listing is never in memory #
         for row in listing:
             gid, length = map(int, row.strip('\n').split())
             if length < self.length_cutoff:
                 yield str(gid) + '\n'

     self.writelines(kept_lines())
     listing.remove()
Esempio n. 10
0
 def run(self):
     """Cluster the pooled reads and produce the renamed centers file.

     The `renamed` file of every sample in `self.cluster` is concatenated
     into `self.reads`, the directory `self.p.clusters_dir` is deleted, the
     command referenced by `cdhit_script` is run on the reads, and finally
     the resulting centers are renamed with the "OTU-" prefix into
     `self.centers`.
     """
     # Pool the reads of all samples (fastq format this time) #
     pooled = " ".join(sample.renamed for sample in self.cluster)
     shell_output("cat %s > %s" % (pooled, self.reads))
     # Remove the previous clustering directory #
     shutil.rmtree(self.p.clusters_dir)
     # Launch the clustering command #
     launcher = sh.Command(cdhit_script)
     arguments = ["-i", self.reads,
                  "-o", self.p.clusters_dir,
                  "-p", TmpFile.from_string("[ACTG]")]
     launcher(*arguments)
     # Give the cluster centers proper names #
     self.cdhit_centers.rename_with_num("OTU-", self.centers)
Esempio n. 11
0
 def run(self):
     """Cluster the pooled reads and produce the renamed centers file.

     The `renamed` file of every sample in `self.cluster` is concatenated
     into `self.reads`, the directory `self.p.clusters_dir` is deleted, the
     command referenced by `cdhit_script` is run on the reads, and finally
     the resulting centers are renamed with the 'OTU_' prefix into
     `self.centers`.
     """
     # Pool the reads of all samples (fastq format this time) #
     pooled = ' '.join(sample.renamed for sample in self.cluster)
     shell_output('cat %s > %s' % (pooled, self.reads))
     # Remove the previous clustering directory #
     shutil.rmtree(self.p.clusters_dir)
     # Launch the clustering command #
     launcher = sh.Command(cdhit_script)
     arguments = ['-i', self.reads,
                  '-o', self.p.clusters_dir,
                  '-p', TmpFile.from_string('[ACTG]')]
     launcher(*arguments)
     # Give the cluster centers proper names #
     self.cdhit_centers.rename_with_num('OTU_', self.centers)
Esempio n. 12
0
 def sqlite_dump_shell(self, script_path):
     """Write an SQLite-compatible text dump to *script_path*.

     The dump is assembled with shell commands: `mdb-schema` appends the
     schema, then `mdb-export` appends the rows of every table listed in
     `self.tables`, enclosed in a single transaction.

     :param script_path: path object of the script to append to; it must
                         offer an `append` method.
     """
     export_template = 'mdb-export -I sqlite "%s" "%s" >> "%s"'
     schema_command = 'mdb-schema "%s" sqlite >> "%s"' % (self.path, script_path)
     # The schema goes first #
     shell_output(schema_command)
     # One big transaction makes the later import faster #
     script_path.append("\n\n\nBEGIN TRANSACTION;\n")
     # Append the content of each table in turn #
     for table_name in self.tables:
         shell_output(export_template % (self.path, table_name, script_path))
     # Close the transaction #
     script_path.append("\n\n\nEND TRANSACTION;\n")
 def sqlite_dump_shell(self, script_path):
     """Write an SQLite-compatible text dump to *script_path*.

     The dump is assembled with shell commands: `mdb-schema` appends the
     schema, then `mdb-export` appends the rows of every table listed in
     `self.tables`, enclosed in a single transaction.

     :param script_path: path object of the script to append to; it must
                         offer an `append` method.
     """
     from shell_command import shell_output
     export_template = 'mdb-export -I sqlite "%s" "%s" >> "%s"'
     # The schema goes first #
     schema_command = 'mdb-schema "%s" sqlite >> "%s"' % (self.path,
                                                          script_path)
     shell_output(schema_command)
     # One big transaction makes the later import faster #
     script_path.append("\n\n\nBEGIN TRANSACTION;\n")
     # Append the content of each table in turn #
     for table_name in self.tables:
         shell_output(export_template % (self.path, table_name, script_path))
     # Close the transaction #
     script_path.append("\n\n\nEND TRANSACTION;\n")
Esempio n. 14
0
 def run(self):
     """Run the command in `self.bash_str` inside a prepared environment.

     Steps: write the default config and parameter files, symlink the
     sample's clean FASTA to `self.fasta_path`, delete any existing PGDB
     directory, symlink "blastDB" in the current working directory to the
     one under `metapath_dir`, record the command in `self.p.out`, run it,
     remove both symlinks and finally scan `self.p.out` for 'Error!'.

     :raises Exception: when the text 'Error!' is found in `self.p.out`.
     """
     # Make preferences files #
     with open(self.p.config, 'w') as handle:
         handle.write(default_config)
     with open(self.p.param, 'w') as handle:
         handle.write(default_param)
     try:
         # Make symlink for the input (lexists also sees dangling links) #
         if os.path.lexists(self.fasta_path):
             os.remove(self.fasta_path)
         os.symlink(self.sample.p.clean_fasta, self.fasta_path)
         # Remove the PGDB if it exists #
         if os.path.exists(self.pgdb_path):
             shutil.rmtree(self.pgdb_path)
         # Link the BLAST databases into the current directory #
         if os.path.lexists("blastDB"):
             os.remove("blastDB")
         os.symlink(metapath_dir + "blastDB/", "blastDB")
         # The command #
         with open(self.p.out, 'w') as handle:
             handle.write(self.bash_str + '\n\n')
         shell_output(self.bash_str)
     finally:
         # Clean up, even when the command failed. `lexists` is used,    #
         # like in the setup above, because `exists` follows the link    #
         # and would leave a dangling symlink behind                     #
         if os.path.lexists("blastDB"):
             os.remove("blastDB")
         if os.path.lexists(self.fasta_path):
             os.remove(self.fasta_path)
     # Check for errors #
     with open(self.p.out, 'r') as handle:
         log = handle.read()
     if 'Error!' in log:
         raise Exception('Metapathway tools reported an error in file "%s".' % self.p.out)
Esempio n. 15
0
 def extract(self):
     """Extract the content of the raw SFF file and build a FASTQ from it.

     `sffinfo` is called three times on `self.p.raw_sff`, with the flags
     -s, -q and -m, writing to the raw FASTA, the raw quality file and the
     manifest respectively. The FASTA and quality files are then combined
     into `self.p.raw_fastq`.
     """
     # Each entry: command template and the `self.p` attribute it fills #
     extractions = (('sffinfo -s %s > %s', 'raw_fasta'),
                    ('sffinfo -q %s > %s', 'raw_qual'),
                    ('sffinfo -m %s > %s', 'manifest'))
     for template, attribute in extractions:
         shell_output(template % (self.p.raw_sff, getattr(self.p, attribute)))
     # Merge sequences and qualities into one FASTQ #
     sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)
Esempio n. 16
0
 def extract(self):
     """Extract the content of the raw SFF file and build a FASTQ from it.

     `sffinfo` is called three times on `self.p.raw_sff`, with the flags
     -s, -q and -m, writing to the raw FASTA, the raw quality file and the
     manifest respectively. The FASTA and quality files are then combined
     into `self.p.raw_fastq`.
     """
     # Pair every command template with the `self.p` attribute it fills #
     templates = ['sffinfo -s %s > %s',
                  'sffinfo -q %s > %s',
                  'sffinfo -m %s > %s']
     attributes = ['raw_fasta', 'raw_qual', 'manifest']
     for template, attribute in zip(templates, attributes):
         command = template % (self.p.raw_sff, getattr(self.p, attribute))
         shell_output(command)
     # Merge sequences and qualities into one FASTQ #
     sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)
Esempio n. 17
0
 def raw(self):
     """Extract the raw SFF file, convert it to FASTQ and run FastQC.

     `sffinfo` is called on `self.p.raw_sff` with the flags -s, -q and -m,
     writing to the raw FASTA, the raw quality file and the manifest. The
     FASTA and quality files are combined into `self.p.raw_fastq`, FastQC
     is run on it with `self.p.raw_dir` as output directory, and the file
     'raw_fastqc.zip' in that directory is deleted if present.
     """
     # Each entry: command template and the `self.p` attribute it fills #
     extractions = (('sffinfo -s %s > %s', 'raw_fasta'),
                    ('sffinfo -q %s > %s', 'raw_qual'),
                    ('sffinfo -m %s > %s', 'manifest'))
     for template, attribute in extractions:
         shell_output(template % (self.p.raw_sff, getattr(self.p, attribute)))
     # Merge sequences and qualities into one FASTQ #
     sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)
     # Quality report #
     sh.fastqc(self.p.raw_fastq, '-o', self.p.raw_dir, '-q')
     # Drop the zipped copy of the report when there is one #
     zipped_report = self.p.raw_dir + 'raw_fastqc.zip'
     if os.path.exists(zipped_report):
         os.remove(zipped_report)
Esempio n. 18
0
 def combine_reads(self):
     """Concatenate the FASTA file of every sample into `self.reads`."""
     # Iterating over `self` yields the samples #
     sources = ' '.join(sample.fasta.path for sample in self)
     shell_output('cat %s > %s' % (sources, self.reads))
Esempio n. 19
0
# NOTE(review): `pools`, `over`, `over_iterator`, `folder`, `samples`, `proj`
# and `cluster` are not defined in this fragment; they presumably come from
# earlier in the script -- confirm.
# Feed the quality filtered reads of every pool into `over`, then close it #
for pool in pools: over.add_iterator(over_iterator(pool.good_barcodes.assembled.good_primers.qual_filtered))
over.close()
# Plot the last graph attached to `over` #
over.graphs[-1].plot()
# Assign taxonomy with the CREST class and plot the composition graph #
crest = SimpleCrestTaxonomy(over, folder)
crest.assign()
crest.composition.graph.plot()
# Same thing with the RDP class #
rdp = SimpleRdpTaxonomy(over, folder)
rdp.assign()
rdp.composition.graph.plot()

# Check unassembled mate pairs #
unassembled = [p.good_barcodes.unassembled for p in pools]
paths = [u.flipped_reads.path for u in unassembled]
# `folder` is rebound here: from now on it is the "unassembled_taxonomy/"
# directory inside the base directory of the 'evaluation' project
folder = DirectoryPath(illumitag.projects['evaluation'].base_dir + "unassembled_taxonomy/")
all_unassembled = FASTA(folder + 'unassembled_reads.fasta')
# Concatenate the flipped reads of every pool into a single FASTA file #
shell_output('cat %s > %s' % (' '.join(paths), all_unassembled))
tax = SimpleRdpTaxonomy(all_unassembled, folder)
tax.assign()
tax.composition.graph.plot()

# Upload raw samples for ENA #
for pool in pools: pool.create_raw_samples()
# Print the short name of each sample before uploading it #
for s in samples:
    print s.short_name
    s.ena.upload_to_ena()

# Submit to ENA #
from illumitag.helper.ena import MakeAllXML
make_xml = MakeAllXML(proj, cluster)
make_xml.write_files()
Esempio n. 20
0
 def combine_reads(self):
     """Combine the reads of all the samples into one big FASTA file.

     This is the first function one should call. The FASTA file of every
     sample of this cluster is concatenated into `self.reads`, which is
     then returned.
     """
     # Iterating over `self` yields the samples #
     sources = ' '.join(sample.fasta.path for sample in self)
     command = 'cat %s > %s' % (sources, self.reads)
     shell_output(command)
     return self.reads
Esempio n. 21
0
 def regroup(self):
     """Merge the output of every sub query into `self.out_path`.

     When `self.format` is 'xml' the files are combined by
     `merge_blast_xml`; for any other format they are simply concatenated
     with `cat`.
     """
     if self.format != 'xml':
         sources = ' '.join(q.out_path for q in self.sub_queries)
         shell_output("cat %s > %s" % (sources, self.out_path))
         return
     with open(self.out_path, 'w') as handle:
         # The original `map(open, ...)` never closed the input files;  #
         # track them here so that all are closed, even on failure      #
         inputs = []
         try:
             for query in self.sub_queries:
                 inputs.append(open(query.out_path))
             # NOTE(review): a list is passed, as `map` did on Python 2 #
             merge_blast_xml(inputs, handle)
         finally:
             for source in inputs:
                 source.close()
Esempio n. 22
0
 def sqlite_by_shell(self, destination):
     """Convert to SQLite through the shell, using a temporary script.

     A dump script is generated at a temporary path by
     `sqlite_dump_shell`, then fed to the `sqlite3` command line tool as
     its init file so that it populates *destination*. The temporary
     script is removed afterwards.

     :param destination: path of the SQLite file to create.
     """
     script_path = new_temp_path()
     self.sqlite_dump_shell(script_path)
     # The original code referenced an undefined name `script` here and #
     # below, which raised a NameError before sqlite3 was ever started  #
     shell_output('sqlite3 -bail -init "%s" "%s" .quit' % (script_path, destination))
     script_path.remove()
Esempio n. 23
0
 def uncompress(self):
     """Decompress the forward and reverse files with `gunzip -c`.

     `self.fwd_path` is decompressed into `self.fwd`, then `self.rev_path`
     into `self.rev`.
     """
     for direction in ('fwd', 'rev'):
         archive = getattr(self, direction + '_path')
         target = getattr(self, direction)
         shell_output('gunzip -c %s > %s' % (archive, target))
Esempio n. 24
0
$ ./gen_test.py
"""

# Built-in modules #
import time, datetime

# Internal modules #
from gefes import projects

# Third party modules #
from shell_command import shell_output
import playdoh

# Timer: remember the start time to report the total run time at the end #
now = time.time()

###############################################################################
print "Making test files"

# Build the (source, destination) pairs: forward then reverse read files of
# the samples at index 0, 1 and 2, going from the 'humic' project to the
# 'test' project
pairs = []
pairs += [(projects['humic'][i].fwd_path, projects['test'][i].fwd_path) for i in range(3)]
pairs += [(projects['humic'][i].rev_path, projects['test'][i].rev_path) for i in range(3)]
# Keep only the first 4000 lines of a gzipped source and recompress them
# into the destination
process = lambda x : shell_output('zcat %s |head -n 4000| gzip > %s' % (x[0],x[1]))

# Run it in parallel, requesting as many CPUs as there are pairs #
playdoh.map(process, pairs, cpu=len(pairs))

# Report success: print the elapsed time, rounded to the second, in green #
run_time = datetime.timedelta(seconds=round(time.time()-now))
print "\033[0;32mRun time: '%s'\033[0m" % (run_time)
Esempio n. 25
0
 def no_barcode_split(self):
     """Translate the SFF file without looking at barcodes.

     Meant for exploring an SFF file: `sffinfo` writes the sequences (-s)
     to `self.raw_fasta_path` and the qualities (-q) to
     `self.raw_qual_path`, then both are combined into `self.fastq`.
     """
     # Each entry: command template and the attribute holding its target #
     extractions = (('sffinfo -s %s > %s', 'raw_fasta_path'),
                    ('sffinfo -q %s > %s', 'raw_qual_path'))
     for template, attribute in extractions:
         shell_output(template % (self.path, getattr(self, attribute)))
     sh.fasta_to_fastq(self.raw_fasta_path, self.raw_qual_path, self.fastq)
 def compress_fast(self, new_path):
     """Compress `self.path` into *new_path* with an external `gzip` call.

     The work is delegated to the shell so that Python never has to
     buffer the text itself.

     :param new_path: where the compressed copy is written.
     :returns: whatever `shell_output` returns for the command.
     """
     from shell_command import shell_output
     return shell_output('gzip --stdout %s > %s' % (self.path, new_path))
Esempio n. 27
0
 def join_outputs(self):
     """Concatenate the output file of every query into `self.out_path`."""
     parts = [query.out_path for query in self.queries]
     command = 'cat %s > %s' % (' '.join(parts), self.out_path)
     shell_output(command)
Esempio n. 28
0
 def no_barcode_split(self):
     """Translate the SFF file without looking at barcodes.

     Meant for exploring an SFF file: `sffinfo` writes the sequences (-s)
     to `self.raw_fasta_path` and the qualities (-q) to
     `self.raw_qual_path`, then both are combined into `self.fastq`.
     """
     # Sequences first, qualities second #
     fasta_command = 'sffinfo -s %s > %s' % (self.path, self.raw_fasta_path)
     shell_output(fasta_command)
     qual_command = 'sffinfo -q %s > %s' % (self.path, self.raw_qual_path)
     shell_output(qual_command)
     # Combine both files into one FASTQ #
     sh.fasta_to_fastq(self.raw_fasta_path, self.raw_qual_path, self.fastq)