def blast_db(self):
    """A blastable database of refseq + all marine organism genes.

    Builds the concatenated fasta file at *self.p.genes* (only if it does
    not exist yet) from the refseq bacteria/archaea originals plus the
    missing marine and fresh-water gene files, verifies the sequence
    counts, then (only if the index is missing) runs makeblastdb on it.
    Returns the BLASTdb object in every case."""
    blast_db = BLASTdb(self.p.genes, 'prot')
    if not self.p.genes.exists:
        # We are going to cat a whole lot of files together #
        print "Regrouping all fasta files together..."
        all_genes = self.refseq_bact_orig \
                  + self.refseq_arch_orig \
                  + self.missing_marine \
                  + self.missing_fresh
        shell_output("zcat %s > %s" % (' '.join(all_genes), self.p.genes))
        self.timer.print_elapsed()
        # Check that all files ended with a newline #
        # (a missing final newline would merge two records and skew the count) #
        print "Checking that sequence counts match..."
        assert len(blast_db) == sum(map(len,self.refseq_bact_orig)) \
                              + sum(map(len,self.refseq_arch_orig)) \
                              + sum(map(len,self.missing_marine)) \
                              + sum(map(len,self.missing_fresh))
        self.timer.print_elapsed()
    if not self.p.pin.exists:
        # Call make DB (only when the protein index file is absent) #
        print "Building a BLAST database..."
        blast_db.makeblastdb(logfile=self.p.log, out=self.p.out)
        self.timer.print_elapsed()
    return blast_db
def retrieve_from_nt(self):
    """Get all GI numbers with their length from the local NT database,
    then filter out the ones that are too long."""
    # Dump "<gi> <length>" pairs via blastdbcmd into a scratch file #
    listing = new_temp_file()
    shell_output("blastdbcmd -db nt -entry all -outfmt '%g %l' > " + listing)
    # Keep only identifiers whose sequence is below the cutoff #
    def short_gids():
        for entry in listing:
            gid, length = map(int, entry.strip('\n').split())
            if length < self.length_cutoff: yield str(gid) + '\n'
    self.writelines(short_gids())
    listing.remove()
def sqlite_by_shell(self, destination):
    """Convert this database to an SQLite file at *destination* by dumping
    an SQL script to a temporary path and feeding it to the `sqlite3`
    command line tool. This is hopefully fast.

    Bug fix: the original referenced an undefined name `script` instead of
    the `script_path` it had just created, raising NameError on every call."""
    script_path = new_temp_path()
    self.sqlite_dump_shell(script_path)
    from shell_command import shell_output
    # -bail stops at the first error; -init runs our dump, then .quit exits #
    shell_output('sqlite3 -bail -init "%s" "%s" .quit' % (script_path, destination))
    script_path.remove()
def count(self):
    """Number of reads in the file: total line count (via an external
    `wc -l`) divided by four, since fastq records span four lines."""
    # Import module #
    from shell_command import shell_output
    # Case when we are not compressed #
    if not self.gzipped:
        return int(int(shell_output("cat %s | wc -l" % self.path)) / 4)
    # If we are gzipped we can just use zcat, or gzcat on macOS #
    # Bug fix: on Python 2 `sys.platform` is 'linux2', so the strict
    # equality test (!= 'linux') wrongly chose gzcat on Linux boxes;
    # a prefix test works on both Python 2 and Python 3. #
    program = 'gzcat' if not sys.platform.startswith('linux') else 'zcat'
    command = "%s %s | wc -l" % (program, self.path)
    return int(int(shell_output(command)) / 4)
def blast_db(self): """A blastable database of all genes""" db = BLASTdb(self.p.all_fasta, self.seq_type) if not self.p.all_nin and not self.p.all_pin: print "--> STEP 1: Building BLASTable database with all genes..." shell_output('gunzip -c %s > %s' % (' '.join(genomes.values()), db)) assert len(db) == sum(map(len, genomes.values())) db.makeblastdb() self.timer.print_elapsed() return db
def fresh_fasta(self): """A file containing all the fresh water genes""" fasta = FASTA(self.p.fresh_fasta) if not fasta.exists: print "Building fasta file with all fresh genes..." fresh = [g for g in genomes.values() if g.fresh] shell_output('gunzip -c %s > %s' % (' '.join(fresh), fasta)) assert len(fasta) == sum(map(len, fresh)) self.timer.print_elapsed() return fasta
def retrieve_from_nt(self):
    """Get every GI number together with its length from the local NT
    database, then drop the entries whose length exceeds the cutoff."""
    scratch = new_temp_file()
    # One "<gi> <length>" pair per line, written by blastdbcmd #
    shell_output("blastdbcmd -db nt -entry all -outfmt '%g %l' > " + scratch)
    pairs = (map(int, row.strip('\n').split()) for row in scratch)
    kept = (str(gid) + '\n' for gid, length in pairs if length < self.length_cutoff)
    self.writelines(kept)
    scratch.remove()
def run(self):
    # Combine reads but in fastq format this time #
    inputs = " ".join(sample.renamed for sample in self.cluster)
    shell_output("cat %s > %s" % (inputs, self.reads))
    # Wipe any previous clustering output #
    shutil.rmtree(self.p.clusters_dir)
    # Launch cd-hit on the combined reads #
    command = sh.Command(cdhit_script)
    command("-i", self.reads,
            "-o", self.p.clusters_dir,
            "-p", TmpFile.from_string("[ACTG]"))
    # Create the centers file with good names #
    self.cdhit_centers.rename_with_num("OTU-", self.centers)
def run(self):
    # Combine reads but in fastq format this time #
    fastqs = [s.renamed for s in self.cluster]
    shell_output('cat %s > %s' % (' '.join(fastqs), self.reads))
    # Remove stale results from any previous run #
    shutil.rmtree(self.p.clusters_dir)
    # Cluster everything with cd-hit #
    run_cdhit = sh.Command(cdhit_script)
    run_cdhit('-i', self.reads,
              '-o', self.p.clusters_dir,
              '-p', TmpFile.from_string('[ACTG]'))
    # Rename the resulting centers with sequential OTU names #
    self.cdhit_centers.rename_with_num('OTU_', self.centers)
def sqlite_dump_shell(self, script_path):
    """Generate a text dump compatible with SQLite by using shell
    commands. Place this script at *script_path*."""
    # First the schema #
    shell_output('mdb-schema "%s" sqlite >> "%s"' % (self.path, script_path))
    # Start a transaction, speeds things up when importing #
    script_path.append("\n\n\nBEGIN TRANSACTION;\n")
    # Then export every table #
    export_cmd = 'mdb-export -I sqlite "%s" "%s" >> "%s"'
    for name in self.tables:
        shell_output(export_cmd % (self.path, name, script_path))
    # End the transaction #
    script_path.append("\n\n\nEND TRANSACTION;\n")
def sqlite_dump_shell(self, script_path):
    """Generate a text dump compatible with SQLite by using shell
    commands. Place this script at *script_path*."""
    from shell_command import shell_output
    # Dump the schema first #
    shell_output('mdb-schema "%s" sqlite >> "%s"' % (self.path, script_path))
    # Wrap all the inserts in one transaction: much faster on import #
    script_path.append("\n\n\nBEGIN TRANSACTION;\n")
    # Append an INSERT dump for each table #
    for current_table in self.tables:
        shell_output('mdb-export -I sqlite "%s" "%s" >> "%s"' % (self.path, current_table, script_path))
    # Close the transaction #
    script_path.append("\n\n\nEND TRANSACTION;\n")
def run(self):
    """Run the Metapathway tools pipeline on this sample.

    Writes the preference files, links the input fasta and the shared
    blastDB directory into place, executes the prepared bash command,
    cleans the links up again, and finally scans the output file for
    the tool's error marker."""
    # Make preferences files #
    with open(self.p.config, 'w') as handle: handle.write(default_config)
    with open(self.p.param, 'w') as handle: handle.write(default_param)
    # Make symlink for the input (lexists so a dangling link is removed too) #
    if os.path.lexists(self.fasta_path): os.remove(self.fasta_path)
    os.symlink(self.sample.p.clean_fasta, self.fasta_path)
    # Remove the PGDB if it exists #
    if os.path.exists(self.pgdb_path): shutil.rmtree(self.pgdb_path)
    # Make a stupid link (the tool expects blastDB in the working directory) #
    if os.path.lexists("blastDB"): os.remove("blastDB")
    os.symlink(metapath_dir + "blastDB/", "blastDB")
    # The command — record it first, then run it #
    with open(self.p.out, 'w') as handle: handle.write(self.bash_str + '\n\n')
    shell_output(self.bash_str)
    # Clean up #
    if os.path.exists("blastDB"): os.remove("blastDB")
    if os.path.exists(self.fasta_path): os.remove(self.fasta_path)
    # Check for errors #
    # NOTE(review): assumes self.p.out accumulates the tool's log after the
    # command runs (presumably bash_str redirects into it) — confirm.
    with open(self.p.out, 'r') as handle: log = handle.read()
    if 'Error!' in log: raise Exception('Metapathway tools reported an error in file "%s".' % self.p.out)
def extract(self):
    # Pull the sequences, qualities and manifest out of the SFF file #
    jobs = (('-s', self.p.raw_fasta),
            ('-q', self.p.raw_qual),
            ('-m', self.p.manifest))
    for flag, destination in jobs:
        shell_output('sffinfo %s %s > %s' % (flag, self.p.raw_sff, destination))
    # Pair sequences with their qualities into a single fastq #
    sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)
def raw(self):
    # Extract the three components of the raw SFF file #
    source = self.p.raw_sff
    shell_output('sffinfo -s %s > %s' % (source, self.p.raw_fasta))
    shell_output('sffinfo -q %s > %s' % (source, self.p.raw_qual))
    shell_output('sffinfo -m %s > %s' % (source, self.p.manifest))
    # Pair the fasta with its qualities into a fastq #
    sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)
    # Fast QC #
    sh.fastqc(self.p.raw_fastq, '-o', self.p.raw_dir, '-q')
    leftover = self.p.raw_dir + 'raw_fastqc.zip'
    if os.path.exists(leftover): os.remove(leftover)
def combine_reads(self):
    # Concatenate every sample's fasta into the cluster-wide reads file #
    all_paths = ' '.join(sample.fasta.path for sample in self)
    shell_output('cat %s > %s' % (all_paths, self.reads))
# Feed the quality-filtered reads of every pool into the overall collection #
for pool in pools: over.add_iterator(over_iterator(pool.good_barcodes.assembled.good_primers.qual_filtered))
over.close()
over.graphs[-1].plot()
# Assign taxonomy with both classifiers and plot the compositions #
crest = SimpleCrestTaxonomy(over, folder)
crest.assign()
crest.composition.graph.plot()
rdp = SimpleRdpTaxonomy(over, folder)
rdp.assign()
rdp.composition.graph.plot()
# Check unassembled mate pairs #
unassembled = [p.good_barcodes.unassembled for p in pools]
paths = [u.flipped_reads.path for u in unassembled]
folder = DirectoryPath(illumitag.projects['evaluation'].base_dir + "unassembled_taxonomy/")
all_unassembled = FASTA(folder + 'unassembled_reads.fasta')
shell_output('cat %s > %s' % (' '.join(paths), all_unassembled))
tax = SimpleRdpTaxonomy(all_unassembled, folder)
tax.assign()
tax.composition.graph.plot()
# Upload raw samples for ENA #
for pool in pools: pool.create_raw_samples()
for s in samples:
    print s.short_name
    s.ena.upload_to_ena()
# Submit to ENA #
from illumitag.helper.ena import MakeAllXML
make_xml = MakeAllXML(proj, cluster)
make_xml.write_files()
def combine_reads(self):
    """The first function one should call: concatenates all the reads of
    every sample of this cluster into one big FASTA file."""
    fasta_paths = [s.fasta.path for s in self]
    shell_output('cat %s > %s' % (' '.join(fasta_paths), self.reads))
    return self.reads
def regroup(self):
    # Gather the per-chunk output paths #
    partial_paths = [q.out_path for q in self.sub_queries]
    if self.format != 'xml':
        # Plain formats can simply be concatenated #
        shell_output("cat %s > %s" % (' '.join(partial_paths), self.out_path))
        return
    # XML needs a real merge to stay well-formed #
    with open(self.out_path, 'w') as handle:
        merge_blast_xml(map(open, partial_paths), handle)
def sqlite_by_shell(self, destination):
    """Convert this database to an SQLite file at *destination* by dumping
    an SQL script to a temporary path and feeding it to the `sqlite3`
    command line tool. This is hopefully fast.

    Bug fix: the original referenced an undefined name `script` instead of
    the `script_path` it had just created, raising NameError on every call."""
    script_path = new_temp_path()
    self.sqlite_dump_shell(script_path)
    # -bail stops at the first error; -init runs our dump, then .quit exits #
    shell_output('sqlite3 -bail -init "%s" "%s" .quit' % (script_path, destination))
    script_path.remove()
def uncompress(self):
    # Inflate both mates to their destination paths #
    for source, target in ((self.fwd_path, self.fwd), (self.rev_path, self.rev)):
        shell_output('gunzip -c %s > %s' % (source, target))
    $ ./gen_test.py
"""

# Built-in modules #
import time, datetime

# Internal modules #
from gefes import projects

# Third party modules #
from shell_command import shell_output
import playdoh

# Timer #
now = time.time()

###############################################################################
print "Making test files"

# Do it: pair each humic fwd/rev file with its test-project counterpart #
pairs = []
pairs += [(projects['humic'][i].fwd_path, projects['test'][i].fwd_path) for i in range(3)]
pairs += [(projects['humic'][i].rev_path, projects['test'][i].rev_path) for i in range(3)]
# Take the first 4000 lines (presumably 1000 fastq reads) and recompress #
process = lambda x : shell_output('zcat %s |head -n 4000| gzip > %s' % (x[0],x[1]))

# Run it in parallel, one worker per file pair #
playdoh.map(process, pairs, cpu=len(pairs))

# Report Success #
run_time = datetime.timedelta(seconds=round(time.time()-now))
print "\033[0;32mRun time: '%s'\033[0m" % (run_time)
def no_barcode_split(self):
    """Translate the SFF file without any barcode demultiplexing.
    Just to explore an SFF file."""
    # Extract sequences and qualities separately #
    shell_output('sffinfo -s %s > %s' % (self.path, self.raw_fasta_path))
    shell_output('sffinfo -q %s > %s' % (self.path, self.raw_qual_path))
    # Then merge them into a single fastq #
    sh.fasta_to_fastq(self.raw_fasta_path, self.raw_qual_path, self.fastq)
def compress_fast(self, new_path):
    """Do the compression with an external shell command call."""
    # Going through the shell keeps python from buffering the text, for speed #
    from shell_command import shell_output
    return shell_output('gzip --stdout %s > %s' % (self.path, new_path))
def join_outputs(self):
    """Join the outputs"""
    every_output = ' '.join(q.out_path for q in self.queries)
    shell_output('cat %s > %s' % (every_output, self.out_path))