def split_fasta(request, analysis_id):
    """Return a zip download holding one FASTA file per sample of an analysis.

    The data file and the separator file are read from
    ``constants.analyses_dir/<analysis_id>/``. Each record yielded by
    ``helper_functions.seqs`` is assigned to a sample named by the text that
    precedes the first separator in the record, with ``>`` stripped from both
    ends. All records of one sample are concatenated, in file order, into the
    archive member ``<sample>.fna``.

    The archive is assembled in memory instead of through scratch files in
    ``/tmp``: this removes the shared ``/tmp/all_samples.zip`` path that
    concurrent requests would overwrite, avoids building file system paths
    from FASTA header text, and leaves nothing behind when an error occurs.

    Args:
        request: the incoming request object; it is not inspected.
        analysis_id: name of the analysis directory under
            ``constants.analyses_dir``.

    Returns:
        An ``HttpResponse`` of type ``application/zip`` served as the
        attachment ``all_samples.zip``.
    """
    import io  # local import so the block does not rely on a file-level one

    analysis_dir = os.path.join(constants.analyses_dir, analysis_id)
    with open(os.path.join(analysis_dir, constants.seperator_file_name)) as sep_file:
        sep = sep_file.read()

    # Group the records by sample id, remembering first-seen order so the
    # archive layout is deterministic.
    sample_order = []
    records_by_sample = {}
    with open(os.path.join(analysis_dir, constants.data_file_name)) as fasta:
        for seq in helper_functions.seqs(fasta):
            sample_id = seq.split(sep)[0].strip('>')
            if sample_id not in records_by_sample:
                records_by_sample[sample_id] = []
                sample_order.append(sample_id)
            records_by_sample[sample_id].append(seq)

    # Write every sample straight into an in-memory zip archive.
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as archive:
        for sample_id in sample_order:
            archive.writestr(sample_id + '.fna',
                             ''.join(records_by_sample[sample_id]))

    response = HttpResponse(buf.getvalue(), content_type='application/zip')
    response['Content-Disposition'] = 'attachment; filename=all_samples.zip'
    return response
def low_confidence_seqs(fasta_file, rdp_outfile, threshold, separator):
    """Yield the sequences the RDP classifier scored below ``threshold``.

    The sequence IDs in the fasta file must be a superset of those in the RDP
    output. The expected case is that RDP was run against the fasta file to
    generate the output.

    Args:
        fasta_file: handed to ``helper_functions.seqs`` to iterate the records.
        rdp_outfile: handed to ``rdp_results`` together with ``separator``.
        threshold: a result is selected when ``float(confidences['genus'])``
            is strictly less than this value.
        separator: delimiter used to split the first whitespace-delimited
            token of each record; the second non-blank field is the sequence
            ID that is compared against the RDP results.

    Yields:
        Each record from ``fasta_file`` whose sequence ID belongs to a result
        below the threshold, in file order.

    Raises:
        IndexError: if a record's first token has fewer than two non-blank
            fields after splitting on ``separator``.
    """
    #first assume the rdp out and fasta line up exactly:
    results = rdp_results(rdp_outfile, separator)
    sequences = helper_functions.seqs(fasta_file)
    below_threshold = set(
        res.seq_id for res in results
        if float(res.confidences['genus']) < threshold
    )
    for seq in sequences:
        # Build a real list: filter() returns a non-indexable iterator on
        # Python 3, which would make the [1] lookup below fail.
        fields = [
            field for field in seq.split()[0].split(separator)
            if field.strip() != ''
        ]
        s_id = fields[1]
        if s_id in below_threshold:
            yield seq
def low_confidence_seqs(fasta_file, blast_outfile, thresholds, separator):
    """Yield the sequences that blastn scored below the given thresholds.

    The expected case is that blastn was run against the fasta file to
    generate ``blast_outfile``.

    A result counts as failing when ``thresholds`` is truthy and
    ``result_fails_threshold(result, thresholds)`` is true; the query ID of
    its best alignment is then matched against the first whitespace-delimited
    token of every record (with ``>`` stripped). With falsy ``thresholds``
    nothing is yielded. ``separator`` is accepted but not used here.
    """
    hits = blast_results(blast_outfile)
    records = helper_functions.seqs(fasta_file)

    # Collect the query IDs of every result that misses the thresholds.
    failing_ids = set()
    for hit in hits:
        if not thresholds:
            continue
        if result_fails_threshold(hit, thresholds):
            failing_ids.add(best_alignment(hit).qseqid)

    for record in records:
        header_token = record.split()[0]
        if header_token.strip(">") in failing_ids:
            yield record
def create_blast_db(self):
    """Build a BLAST database from the requested fasta file and report back.

    Reads ``data_file_path`` and ``db_name`` from ``self.request_dict``,
    replaces ``/``, ``..`` and spaces in the name with ``_``, and asks
    ``framework.tools.blast.make_blastdb`` to build the database inside
    ``c.blastdb_dir/<name>``. A legend dictionary (ranks, name, per-sequence
    description, sequence count) is pickled next to the database files.

    The outcome is sent through ``self.write_socket``: an ``OK`` message with
    the sequence count and database id when all expected files exist,
    otherwise an ``error`` message carrying the contents of the error log
    (when readable) after the output directory has been removed.
    """
    #maybe need a parallel DB_Meta object for dbs
    fasta_path = self.request_dict['data_file_path']
    name = self.request_dict['db_name']
    for escape_char in ['/', '..', ' ']:
        name = name.replace(escape_char, '_')
    out = os.path.join(c.blastdb_dir, name)  # every blastdb gets its own folder
    err = os.path.join(out, name + ".err")
    framework.tools.blast.make_blastdb(fasta_path, name, output_dir=out, error_log=err)

    num_seqs = 0
    # this is where we can store any taxonomic information we have about the db
    legend = {'ranks': ['species']}
    legend['name'] = name
    with open(fasta_path) as fasta:
        for s in helper_functions.seqs(fasta):
            head = s.split('\n')[0]
            num_seqs += 1
            #MAD DEPENDENCY WARNING:
            #THIS CODE IS DUPLICATED IN tools/blast.py, LINE 114
            if '|' in head and head[1].isdigit():
                fields = head.split('|')
                seq_id = fields[0].strip('>')
                #expects header to look like: '>5524211|gb|AAD44166.1| cytochrome b |Elephas maximus maximus'
                legend[seq_id] = [fields[3].strip()]  # field 3 is the description
    legend['length'] = num_seqs

    # Close the legend file explicitly so it is fully written before the
    # existence check below and before any client is told the db is ready.
    legend_path = os.path.join(out, name + c.blast_legend_file_extension)
    with open(legend_path, 'w') as legend_file:
        cPickle.dump(legend, legend_file)

    exts = [".nhr", ".nin", ".nsq", c.blast_legend_file_extension]
    for e in exts:
        if not os.path.exists(os.path.join(out, name + e)):
            msg = "BLAST DB creation failed: \n"
            try:
                with open(err) as err_log:
                    msg += err_log.read()
            except (IOError, OSError):
                # Best effort: the error log may be missing or unreadable.
                pass
            shutil.rmtree(out)
            self.write_socket({'response': 'error', 'exception': msg})
            return
    self.write_socket({'response': 'OK', 'length': num_seqs, "id": name})