Ejemplo n.º 1
0
def split_fasta(request,analysis_id):
    """Return a zip download holding one ``.fna`` file per sample of an analysis.

    The analysis' FASTA file is read record by record. The sample id of a
    record is the text before the first occurrence of the analysis' separator
    string, with any leading/trailing '>' characters stripped. Records sharing
    a sample id are concatenated, in input order, into ``<sample id>.fna``
    inside the archive.

    Parameters:
        request: the incoming request object; not used by this view.
        analysis_id: name of the analysis directory under
            ``constants.analyses_dir``.

    Returns:
        An ``HttpResponse`` of type ``application/zip`` sent as the attachment
        ``all_samples.zip``.

    All scratch files live in a private temporary directory and the archive in
    an anonymous temporary file, so concurrent requests cannot overwrite each
    other's data and nothing is left on disk if an error occurs.
    """
    # Local imports: only this view needs them.
    import shutil
    import tempfile

    # NOTE(review): analysis_id is joined into the path unchecked - confirm the
    # URL routing restricts it to a plain directory name.
    analysis_dir = os.path.join(constants.analyses_dir, analysis_id)
    with open(os.path.join(analysis_dir, constants.seperator_file_name)) as sep_file:
        sep = sep_file.read()

    work_dir = tempfile.mkdtemp()
    archive = tempfile.TemporaryFile()  # binary mode; zip data is not text
    try:
        # sample id -> scratch file path. Scratch files are named by index so
        # that characters in a sample id can never affect where we write.
        sample_paths = {}
        with open(os.path.join(analysis_dir, constants.data_file_name)) as fasta:
            for seq in helper_functions.seqs(fasta):
                sample_id = seq.split(sep)[0].strip('>')
                path = sample_paths.get(sample_id)
                if path is None:
                    path = os.path.join(work_dir, str(len(sample_paths)))
                    sample_paths[sample_id] = path
                with open(path, 'a') as sample_file:
                    sample_file.write(seq)

        z = zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED)
        try:
            for sample_id, path in sample_paths.items():
                z.write(path, sample_id + '.fna')
        finally:
            # close() writes the central directory; without it the zip is invalid
            z.close()
        archive.seek(0)
    except Exception:
        archive.close()
        raise
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)

    # The archive stays open here on purpose: the response reads from it.
    response = HttpResponse(FileWrapper(archive), content_type='application/zip')
    response['Content-Disposition'] = 'attachment; filename=all_samples.zip'
    return response
Ejemplo n.º 2
0
def low_confidence_seqs(fasta_file, rdp_outfile, threshold, separator):
    """Yield the FASTA records the RDP classifier scored below ``threshold``.

    The sequence IDs in the fasta file must be a superset of those in the RDP
    output; the expected case is that RDP was run against the fasta file to
    generate the output.

    Parameters:
        fasta_file: passed to ``helper_functions.seqs`` to obtain the records.
        rdp_outfile: passed, with ``separator``, to ``rdp_results``.
        threshold: a record is selected when ``float`` of its result's
            ``confidences['genus']`` is strictly less than this value.
        separator: string that splits the first whitespace-delimited token of
            a record into id fields.

    Yields:
        Each record whose second non-blank id field equals the ``seq_id`` of a
        below-threshold RDP result.

    Raises:
        IndexError: if a record has no whitespace-delimited token, or its
            first token has fewer than two non-blank id fields.
    """
    results = rdp_results(rdp_outfile,separator)
    sequences = helper_functions.seqs(fasta_file)
    below_threshold = set(res.seq_id for res in results if float(res.confidences['genus']) < threshold)
    for seq in sequences:
        # Built as a list rather than with filter() so that indexing works on
        # Python 3 too, where filter() returns a non-indexable iterator.
        ids = [i for i in seq.split()[0].split(separator) if i.strip() != '']
        s_id = ids[1]
        if s_id in below_threshold:
            yield seq
Ejemplo n.º 3
0
def low_confidence_seqs(fasta_file, blast_outfile, thresholds, separator):
    """Yield the FASTA records that blastn scored below the given thresholds.

    The expected case is that blastn was run against the fasta file to
    generate the output.

    Parameters:
        fasta_file: passed to ``helper_functions.seqs`` to obtain the records.
        blast_outfile: passed to ``blast_results`` to obtain the results.
        thresholds: handed to ``result_fails_threshold`` for every result;
            when falsy, no result is treated as failing and nothing is yielded.
        separator: accepted but not used by this function.

    Yields:
        Each record whose first whitespace-delimited token, stripped of '>',
        equals the ``qseqid`` of the best alignment of a failing result.
    """
    results = blast_results(blast_outfile)
    sequences = helper_functions.seqs(fasta_file)

    failing_ids = set()
    for hit in results:
        # The results are still walked when no thresholds are supplied.
        if not thresholds:
            continue
        if result_fails_threshold(hit, thresholds):
            failing_ids.add(best_alignment(hit).qseqid)

    for record in sequences:
        header_id = record.split()[0].strip(">")
        if header_id not in failing_ids:
            continue
        yield record
Ejemplo n.º 4
0
    def create_blast_db(self):
        """Build a BLAST database from an uploaded FASTA file and report the result.

        Reads ``data_file_path`` and ``db_name`` from ``self.request_dict``,
        runs ``framework.tools.blast.make_blastdb`` into a directory named
        after the database under ``c.blastdb_dir``, and pickles a "legend"
        dict next to the database files. The legend holds:

        * ``'ranks'``: ``['species']``
        * ``'name'``: the sanitised database name
        * ``'length'``: number of records in the FASTA file
        * one entry per record whose header contains '|' and has a digit as
          its second character: the id (first '|' field without '>') mapped
          to a one-element list holding the stripped fourth '|' field.

        On success ``{'response': 'OK', 'length': ..., 'id': ...}`` is written
        to the socket. If any of the ``.nhr``/``.nin``/``.nsq``/legend files
        is missing, the output directory is removed and an error response
        carrying the contents of the error log (when readable) is written
        instead.
        """
        #maybe need a parallel DB_Meta object for dbs
        fasta_path = self.request_dict['data_file_path']
        name = self.request_dict['db_name']
        # The name becomes a directory and file name below, so path separators,
        # '..' and spaces are replaced with underscores.
        for escape_char in ['/','..',' ']:
            name = name.replace(escape_char, '_')
        out = os.path.join(c.blastdb_dir,name)#and here, to put all blastdbs in their own folder
        err = os.path.join(out,name+".err")
        framework.tools.blast.make_blastdb(fasta_path, name,output_dir=out,error_log=err)

        num_seqs = 0
        legend = {'ranks':['species']}#this is where we can store any taxonomic information we have about the db
        legend['name'] = name
        with open(fasta_path) as fasta:
            for s in helper_functions.seqs(fasta):
                head = s.split('\n')[0]
                num_seqs += 1
                #MAD DEPENDENCY WARNING:
                #THIS CODE IS DUPLICATED IN tools/blast.py, LINE 114
                #(parsing deliberately left unchanged so the two copies stay in step)
                if '|' in head and head[1].isdigit():
                    fields = head.split('|')
                    seq_id = fields[0].strip('>')
                    #expects header to look like: '>5524211|gb|AAD44166.1| cytochrome b |Elephas maximus maximus'
                    legend[seq_id] = [fields[3].strip()]#aah magic number.

        legend['length'] = num_seqs
        legend_path = os.path.join(out, name+c.blast_legend_file_extension)
        # Closing the handle guarantees the pickle is flushed to disk before
        # the existence check below and before anyone else reads it.
        with open(legend_path,'w') as legend_file:
            cPickle.dump(legend,legend_file)

        exts = [".nhr",".nin",".nsq",c.blast_legend_file_extension]
        creation_failed = any(not os.path.exists(os.path.join(out,name+e)) for e in exts)

        if creation_failed:
            msg = "BLAST DB creation failed: \n"
            try:
                with open(err) as err_file:
                    msg += err_file.read()
            except (IOError, OSError):
                # The error log is best-effort; report the failure without it.
                pass
            shutil.rmtree(out)
            self.write_socket({'response':'error','exception':msg})
            return

        self.write_socket({'response': 'OK','length':num_seqs,"id":name})