Example #1
def prepareReport(ex, name, tot_counts, counts_primers, counts_primers_filtered,
                  tot_ambiguous=0, tot_discarded=0):
    """
    Example::
        Primer  Total_number_of_reads   nFiltered       nValidReads
        HoxD13  9406932 4296211 5110721
        HoxD4   3835395 415503  3419892
        HoxD9   6908054 594449  6313605
        Discarded   xxxx
        Ambiguous   xxx
        Unclassified    1304048
        Total   21454429
    """
    tot_counts_primers = sum(counts_primers.values())
    dataReport = unique_filename_in()
    out = open(dataReport,'w')
    out.write("Primer\tTotal_number_of_reads\tnFiltered\tnValidReads\n")
    for k,v in counts_primers.iteritems():
        if counts_primers_filtered[k]>0:
            nFiltered=v-counts_primers_filtered[k]
        else:
            nFiltered=0
            counts_primers_filtered[k]=v
        out.write("\t".join([str(x) for x in [k,v,nFiltered,counts_primers_filtered[k]]])+"\n")
    out.write("Discarded\t"+str(tot_discarded)+"\n")
    out.write("Ambiguous\t"+str(tot_ambiguous)+"\n")
    tot=tot_counts_primers+tot_discarded+tot_ambiguous
    out.write("Unclassified\t"+str(tot_counts-tot)+"\n")
    out.write("Total\t"+str(tot_counts)+"\n")
    out.close()
    return (tot_counts>tot_counts_primers,dataReport)
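A minimal usage sketch, assuming a bein execution `ex` and reusing the counts from the docstring (group name and values are illustrative):

counts = {"HoxD13": 9406932, "HoxD4": 3835395, "HoxD9": 6908054}
filtered = {"HoxD13": 5110721, "HoxD4": 3419892, "HoxD9": 6313605}
ok, report = prepareReport(ex, "myGroup", 21454429, counts, filtered)
# `ok` is True only when total_reads > sum(per-primer reads); the workflow
# below treats False as a probable ambiguous classification.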
Example #2
def filterSeq(ex, fastqFiles, seqToFilter, gid, grp_name, via='lsf'):
    indexSeqToFilter = {}
    indexFiles = {}
    global bcDelimiter
    for k, f in seqToFilter.iteritems():
        if os.path.getsize(f) == 0: continue
        ex.add(f,
               description=set_file_descr(grp_name + "_" +
                                          k.replace(bcDelimiter, "_") +
                                          "_seqToFilter.fa",
                                          groupId=gid,
                                          step="filtering",
                                          type="fa",
                                          view="admin"))
        if k in fastqFiles:
            indexFiles[k] = bowtie_build.nonblocking(ex, f, via=via)

    unalignedFiles = {}
    futures = []
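    # bowtie flags: -a report all alignments, -q fastq input, -n 2 allow up to
    # 2 seed mismatches, -l 20 use a 20bp seed, --un <file> write reads that do
    # not align to <file> (those unaligned reads are the filtered output we keep)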
    bwtarg = ["-a", "-q", "-n", "2", "-l", "20", "--un"]
    for k, f in indexFiles.iteritems():
        unalignedFiles[k] = unique_filename_in()
        touch(ex, unalignedFiles[k])
        futures.append(
            bowtie.nonblocking(ex,
                               f.wait(),
                               fastqFiles[k],
                               bwtarg + [unalignedFiles[k]],
                               via=via))

    for f in futures:
        f.wait()
    return unalignedFiles
Example #3
def fastqToFasta(fqFile, n=1, x=22, output=None):
    if output is None: output = unique_filename_in()
    return {
        'arguments': [
            "fastqToFasta.py", "-i", fqFile, "-o", output, "-n",
            str(n), "-x",
            str(x)
        ],
        'return_value':
        output
    }
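The arguments/return_value dictionary matches the binding convention of bein's @program decorator; that this is the decorator in play is an assumption inferred from the fastqToFasta.nonblocking(...) calls in the later examples. A sketch of how such a binding is typically declared and used:

from bein import program, unique_filename_in

@program
def fastqToFasta(fqFile, n=1, x=22, output=None):
    if output is None: output = unique_filename_in()
    return {'arguments': ["fastqToFasta.py", "-i", fqFile, "-o", output,
                          "-n", str(n), "-x", str(x)],
            'return_value': output}

# fa = fastqToFasta(ex, "reads.fastq")                # blocking: runs, returns `output`
# future = fastqToFasta.nonblocking(ex, "reads.fastq", via='lsf')
# fa = future.wait()                                  # future resolves to `output`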
Example #4
def getSeqToFilter(ex,primersFile):
    filenames = {}
    with open(primersFile,"r") as f:
        for s in f:
            if s[0] != '>': continue
            s_split = s.split('|')
            key = s_split[0].replace(">","")
            filenames[key] = unique_filename_in()
            with open(filenames[key],"w") as fout:
                for i in range(4,len(s_split)):
                    if 'Exclude' not in s_split[i]:
                        fout.write(">seq"+str(i)+"\n"+s_split[i].replace(".","")+"\n")
    return filenames
Example #5
def getSeqToFilter(ex, primersFile):
    filenames = {}
    with open(primersFile, "r") as f:
        for s in f:
            if s[0] != '>': continue
            s_split = s.split('|')
            key = s_split[0].replace(">", "")
            filenames[key] = unique_filename_in()
            with open(filenames[key], "w") as fout:
                for i in range(4, len(s_split)):
                    if 'Exclude' not in s_split[i]:
                        fout.write(">seq" + str(i) + "\n" +
                                   s_split[i].replace(".", "") + "\n")
    return filenames
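For reference, a hypothetical primersFile header illustrating the layout this parser expects (all field values here are made up): the key is field 0 without '>', and every '|'-separated field from index 4 onwards that does not contain 'Exclude' is written out as a ">seqN" FASTA record, with any '.' characters stripped from the sequence:

>HoxD13|GGGACCAAGTCCGAGAGG|chr2|74667374|GGGACCAAGTC.CGAG|AAGTCCGAGAGGTT|Exclude=chr2:74667000-74668000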
Example #6
def prepareReport(ex,
                  name,
                  tot_counts,
                  counts_primers,
                  counts_primers_filtered,
                  tot_ambiguous=0,
                  tot_discarded=0):
    """
    Example::
        Primer  Total_number_of_reads   nFiltered       nValidReads
        HoxD13  9406932 4296211 5110721
        HoxD4   3835395 415503  3419892
        HoxD9   6908054 594449  6313605
        Discarded   xxxx
        Ambiguous   xxx
        Unclassified    1304048
        Total   21454429
    """
    tot_counts_primers = sum(counts_primers.values())
    dataReport = unique_filename_in()
    out = open(dataReport, 'w')
    out.write("Primer\tTotal_number_of_reads\tnFiltered\tnValidReads\n")
    for k, v in counts_primers.iteritems():
        if counts_primers_filtered[k] > 0:
            nFiltered = v - counts_primers_filtered[k]
        else:
            nFiltered = 0
            counts_primers_filtered[k] = v
        out.write("\t".join(
            [str(x)
             for x in [k, v, nFiltered, counts_primers_filtered[k]]]) + "\n")
    out.write("Discarded\t" + str(tot_discarded) + "\n")
    out.write("Ambiguous\t" + str(tot_ambiguous) + "\n")
    tot = tot_counts_primers + tot_discarded + tot_ambiguous
    out.write("Unclassified\t" + str(tot_counts - tot) + "\n")
    out.write("Total\t" + str(tot_counts) + "\n")
    out.close()
    return (tot_counts > tot_counts_primers, dataReport)
Example #7
def filterSeq(ex,fastqFiles,seqToFilter,gid,grp_name,via='lsf'):
#seqToFilter=`awk -v primer=${curPrimer} '{if($0 ~ /^>/){n=split($0,a,"|");curPrimer=a[1];gsub(">","",curPrimer); if(curPrimer == primer){seqToFilter="";for(i=5;i<n;i++){if(a[i] !~ /Exclude=/){seqToFilter=seqToFilter""a[i]","}} if(a[n] !~ /Exclude=/){seqToFilter=seqToFilter""a[n]}else{gsub(/,$/,"",seqToFilter)};print seqToFilter}}}' ${primersFile}`
    indexSeqToFilter = {}
    indexFiles = {}
    for k,f in seqToFilter.iteritems():
        if os.path.getsize(f) == 0: continue
        ex.add(f,description=set_file_descr(grp_name+"_"+k+"_seqToFilter.fa",
                                            groupId=gid,step="filtering",
                                            type="fa",view="admin"))
        if k in fastqFiles:
            indexFiles[k] = bowtie_build.nonblocking(ex,f,via=via)

    unalignedFiles = {}
    futures = []
    bwtarg = ["-a","-q","-n","2","-l","20","--un"]
    for k,f in indexFiles.iteritems():
        unalignedFiles[k] = unique_filename_in()
        touch(ex,unalignedFiles[k])
        futures.append(bowtie.nonblocking( ex, f.wait(), fastqFiles[k],
                                           bwtarg+[unalignedFiles[k]], via=via))

    for f in futures: f.wait()
    return unalignedFiles
Example #8
def filterSeq(ex,fastqFiles,seqToFilter,gid,grp_name,via='lsf'):
    indexSeqToFilter = {}
    indexFiles = {}
    global bcDelimiter
    for k,f in seqToFilter.iteritems():
        if os.path.getsize(f) == 0: continue
        ex.add(f,description=set_file_descr(grp_name+"_"+k.replace(bcDelimiter,"_")+"_seqToFilter.fa",
                                            groupId=gid,step="filtering",
                                            type="fa",view="admin"))
        if k in fastqFiles:
            indexFiles[k] = bowtie_build.nonblocking(ex,f,via=via)

    unalignedFiles = {}
    futures = []
    bwtarg = ["-a","-q","-n","2","-l","20","--un"]
    for k,f in indexFiles.iteritems():
        unalignedFiles[k] = unique_filename_in()
        touch(ex,unalignedFiles[k])
        futures.append(bowtie.nonblocking( ex, f.wait(), fastqFiles[k],
                                           bwtarg+[unalignedFiles[k]], via=via))

    for f in futures: f.wait()
    return unalignedFiles
Example #9
def demultiplex_workflow(ex, job, gl, file_path="../", via='lsf',
                         logfile=sys.stdout, debugfile=sys.stderr):
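    # Overview: for each group, (1) register the primer and parameter files,
    # (2) split large fastq runs and classify reads per primer with exonerate
    # (parallel_exonerate), (3) realign each primer's reads against its
    # "seqToFilter" sequences with bowtie and keep the unaligned ones
    # (filterSeq), (4) archive the fastq outputs, and (5) write a per-group
    # demultiplexing report.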
    script_path=gl['script_path']
    file_names = {}
    job_groups=job.groups
    resFiles={}
    for gid, group in job_groups.iteritems():
        file_names[gid] = {}

        primersFilename = 'group_' + group['name'] + "_barcode_file.fa"
        primersFile = os.path.join(file_path,primersFilename)
        ex.add(primersFile,description=set_file_descr(primersFilename,groupId=gid,step="init",type="fa"))

        paramsFilename = 'group_' + group['name'] + "_param_file.txt"
        paramsFile = os.path.join(file_path,paramsFilename)
        ex.add(paramsFile,description=set_file_descr(paramsFilename,groupId=gid,step="init",type="txt"))
        params = load_paramsFile(paramsFile)

        infiles = []
        tot_counts = 0
        allSubFiles = []
        for rid,run in group['runs'].iteritems():
            infiles.append(run)
            n=count_lines(ex,run)
            tot_counts += n/4
            if n>10000000:
                allSubFiles.extend(split_file(ex,run,n_lines=8000000))
            else:
                allSubFiles.append(run)
        (resExonerate, tot_ambiguous, tot_discarded) = parallel_exonerate( ex, allSubFiles, primersFile,
                                                                           (gid, group['name']), via=via, **params )
        fullFastq = cat(infiles)  # concatenate all runs of the group before archiving
        gzipfile(ex,fullFastq)
        ex.add(fullFastq+".gz",description=set_file_descr(group['name']+"_full_fastq.gz",
                                                          groupId=gid,step='exonerate',view='debug',type="fastq"))
        logfile.write("Will get sequences to filter\n");logfile.flush()
        seqToFilter = getSeqToFilter(ex,primersFile)

        logfile.write("Will filter the sequences\n")
        filteredFastq = filterSeq(ex,resExonerate,seqToFilter,gid,group['name'],via=via)

        logfile.write("After filterSeq, filteredFastq=%s\n" %filteredFastq);logfile.flush()

        counts_primers = {}
        counts_primers_filtered = {}
        if len(filteredFastq):
            archive = unique_filename_in()
            tgz = tarfile.open(archive, "w:gz")
        for k,f in resExonerate.iteritems():
            counts_primers[k] = count_lines(ex,f)/4
            if k in filteredFastq:
                file_names[gid][k] = group['name']+"_"+k+"_filtered"
                ex.add(filteredFastq[k],description=set_file_descr(file_names[gid][k]+".fastq",
                                                                   groupId=gid,step="final",
                                                                   type="fastq"))
                counts_primers_filtered[k] = count_lines(ex,filteredFastq[k])/4
                tgz.add( f, arcname=group['name']+"_"+k+".fastq" )
            else:
                file_names[gid][k] = group['name']+"_"+k
                ex.add(f,description=set_file_descr(file_names[gid][k]+".fastq",
                                                    groupId=gid,step="final",
                                                    type="fastq"))
                counts_primers_filtered[k] = 0
        if len(filteredFastq):
            tgz.close()
            ex.add(archive,description=set_file_descr(group['name']+"_unfiltered_fastq.tgz",
                                                      groupId=gid,step="exonerate",
                                                      type="tar"))

        # Prepare report per group of runs
        report_ok,reportFile = prepareReport(ex,group['name'],
                                             tot_counts, counts_primers,counts_primers_filtered,
                                             tot_ambiguous, tot_discarded)
        ex.add(reportFile,description = set_file_descr(
                group['name']+"_report_demultiplexing.txt",
                groupId=gid,step="final",type="txt",view="admin"))
        if report_ok:
            reportFile_pdf = unique_filename_in()
            createReport(ex,reportFile,reportFile_pdf,script_path)
            ex.add(reportFile_pdf,description=set_file_descr(
                    group['name']+"_report_demultiplexing.pdf",
                    groupId=gid,step="final",type="pdf"))
        else:
            logfile.write("*** Probable ambiguous classification: total_reads < sum(reads_by_primers) ***\n");logfile.flush()
    add_pickle( ex, file_names,
                set_file_descr('file_names',step="final",type='py',view='admin') )
    return resFiles
Example #10
def split_exonerate(filename,minScore,l=30,n=1):
    correction = {}
    files = {}
    filenames = {}
    alignments = {}
    prev_idLine = ''
    # A read is "ambiguous" when it has several equally good exonerate alignments
    # (i.e. the same score >= minScore). This file is often empty, since it only
    # holds good alignments (score >= minScore); ambiguous but "bad" alignments
    # end up in "unaligned".
    alignments["ambiguous"] = unique_filename_in()
    files["ambiguous"] = open(alignments["ambiguous"],"w")
    alignments["ambiguous_fastq"] = unique_filename_in()
    files["ambiguous_fastq"] = open(alignments["ambiguous_fastq"],"w")
    # A read is "unaligned" when its score is < minScore (i.e. between my_minScore
    # and minScore). my_minScore is the score passed to exonerate, roughly equal
    # to half the length of the primer sequence.
    alignments["unaligned"] = unique_filename_in()
    files["unaligned"] = open(alignments["unaligned"],"w")
    # A read is "discarded" when its score is ok but len(seq) < l (the remaining
    # sequence is too short).
    alignments["discarded"] = unique_filename_in()
    files["discarded"] = open(alignments["discarded"],"w")
    line_buffer = []

    def _process_line(line_buffer):
        if len(line_buffer) == 1:
            files[line_buffer[0][3]].write(line_buffer[0][1])
        elif len(line_buffer)>1:
            for buf in sorted(line_buffer):
                files["ambiguous"].write(" ".join(buf[2])+"\n")
            # add the corresponding ambiguous read to the ambiguous_fastq file (only once)
            files["ambiguous_fastq"].write(" ".buf[1]+"\n")

    with open(filename,"r") as f:
        for s in f:
            if not s[:7] == 'vulgar:': continue
            s = s.strip().split(' ')
            s_split = s[5].split('|')
            key = s_split[0]
            info = s[1].split('_')
            idLine = info[0]
            if idLine != prev_idLine:
                _process_line(line_buffer)
                line_buffer = []
            prev_idLine = idLine
            if key not in correction:
                if len(s_split) > 3:
                    correction[key] = len(s_split[1])-len(s_split[3])+n-1
                else:
                    correction[key] = len(s_split[1])+n-1
                filenames[key] = unique_filename_in()
                files[key] = open(filenames[key],"w")
            k = int(s[3])-int(s[7])+correction[key]
            l_linename = len(info[0])
            l_seq = len(info[1])
            full_qual = s[1][int(l_linename)+int(l_seq)+2:int(l_linename)+2*int(l_seq)+2]
            seq = info[1][k:l+k]
            qual = full_qual[k:l+k]
            if int(s[9]) < minScore:
                files["unaligned"].write(" ".join(s)+"\n")
            elif len(seq) >= l:
                line_buffer.append((s[9],"@"+info[0]+"\n"+seq+"\n+\n"+qual+"\n",s,key))
            else:
                files["discarded"].write("@"+info[0]+"\n"+seq+"\n+\n"+qual+"\n")

    _process_line(line_buffer)
    for f in files.itervalues():
        f.close()
    return (filenames,alignments)
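The positional fields s[1], s[3], s[5], s[7] and s[9] above follow exonerate's "vulgar:" output line; the packing of the query id is an assumption based on how fastqToFasta.py is used in these examples:

# vulgar: <query_id> <q_start> <q_end> <q_strand> <target_id> <t_start> <t_end> <t_strand> <score> <triples...>
#  s[0]      s[1]      s[2]      s[3]     s[4]       s[5]       s[6]      s[7]      s[8]      s[9]
# The query id (s[1]) is assumed to pack "<readName>_<sequence><quality>", which
# is why info = s[1].split('_') recovers the read name and sequence, and why
# full_qual is sliced out of s[1] by character offsets.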
Example #11
def fastqToFasta(fqFile,n=1,x=22,output=None):
    if output is None: output = unique_filename_in()
    return {'arguments': ["fastqToFasta.py","-i",fqFile,"-o",output,"-n",str(n),"-x",str(x)],
            'return_value': output}
Example #12
def parallel_exonerate(ex, subfiles, dbFile, grp_descr,
                       minScore=77, n=1, x=22, l=30, q=True, via="local"):
    futures = [fastqToFasta.nonblocking(ex,sf,n=n,x=x,via=via) for sf in subfiles]

    futures2 = []
    res = []
    resExonerate = []
    faSubFiles = []
    all_ambiguous = []
    all_ambiguous_fastq = []
    all_unaligned = []
    all_discarded = []
    gid, grp_name = grp_descr
    my_minscore = _get_minscore(dbFile)
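    # exonerate itself runs with the permissive _get_minscore(dbFile) threshold
    # (roughly half the primer length, per the split_exonerate comments); the
    # stricter user-level minScore is only applied afterwards in split_exonerate.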
    for sf in futures:
        subResFile = unique_filename_in()
        faSubFiles.append(sf.wait())
        futures2.append(exonerate.nonblocking(ex, faSubFiles[-1], dbFile, minScore=my_minscore,
                                              via=via, stdout=subResFile, memory=6))
        resExonerate.append(subResFile)
    for nf,f in enumerate(resExonerate):
        futures2[nf].wait()
        (resSplitExonerate,alignments) = split_exonerate(f,minScore,l=l,n=n)
        all_unaligned.append(alignments["unaligned"])
        all_ambiguous.append(alignments["ambiguous"])
        all_ambiguous_fastq.append(alignments["ambiguous_fastq"])
        all_discarded.append(alignments["discarded"])
        res.append(resSplitExonerate)

    gzipfile(ex,cat(all_unaligned[1:],out=all_unaligned[0]))
    ex.add(all_unaligned[0]+".gz",
           description=set_file_descr(grp_name+"_unaligned.txt.gz",
                                      groupId=gid,step="exonerate",type="txt",
                                      view="admin", comment="scores between %i and %i"%(my_minscore,minScore)) )

    # add ambiguous file only if it is not empty
    n = count_lines(ex,all_ambiguous[0])
    if n > 1:
        gzipfile(ex,cat(all_ambiguous[1:],out=all_ambiguous[0]))
        ex.add(all_ambiguous[0]+".gz",
               description=set_file_descr(grp_name+"_ambiguous.txt.gz",
                                      groupId=gid,step="exonerate",type="txt",
                                      view="admin", comment="multiple equally good classifications") )

    # add ambiguous fastq file only if it is not empty
    tot_ambiguous = count_lines(ex,all_ambiguous_fastq[0])/4
    if tot_ambiguous > 1:
        gzipfile(ex,cat(all_ambiguous_fastq[1:],out=all_ambiguous_fastq[0]))
        ex.add(all_ambiguous_fastq[0]+".gz",
               description=set_file_descr(grp_name+"_ambiguous.fastq.gz",
                                      groupId=gid,step="exonerate",type="fastq", comment="multiple equally good classifications") )

    # add discarded file only if it is not empty
    tot_discarded = count_lines(ex,all_discarded[0])/4
    if tot_discarded > 1:
        gzipfile(ex,cat(all_discarded[1:],out=all_discarded[0]))
        ex.add(all_discarded[0]+".gz",
               description=set_file_descr(grp_name+"_discarded.fastq.gz",
                                          groupId=gid, step="exonerate", type="fastq",
                                          view="admin", comment="< %i bps" %l ) )
    gzipfile(ex,faSubFiles[0])
    ex.add(faSubFiles[0]+".gz",
           description=set_file_descr(grp_name+"_input_part.fa.gz",
                                      groupId=gid, step="init", type="fa",
                                      view="admin", comment="part") )
    gzipfile(ex,resExonerate[0])
    ex.add(resExonerate[0]+".gz",
           description=set_file_descr(grp_name+"_exonerate_part.txt.gz",
                                      groupId=gid, step="exonerate", type="txt",
                                      view="admin", comment="part") )

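    # Merge per-chunk outputs: for each primer key, concatenate all chunk files
    # into the first one and keep it as that primer's final fastq.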
    resFiles = dict((k,'') for d in res for k in d.keys())
    for k in resFiles.keys():
        v = [d[k] for d in res if k in d]
        resFiles[k] = cat(v[1:],out=v[0])

    return (resFiles, tot_ambiguous, tot_discarded)
Example #13
def demultiplex_workflow(ex,
                         job,
                         gl,
                         file_path="../",
                         via='lsf',
                         logfile=sys.stdout,
                         debugfile=sys.stderr):
    script_path = gl['script_path']
    file_names = {}
    job_groups = job.groups
    resFiles = {}
    for gid, group in job_groups.iteritems():
        file_names[gid] = {}

        primersFilename = 'group_' + group['name'] + "_barcode_file.fa"
        primersFile = group.get("primersfile",
                                os.path.join(file_path, primersFilename))
        ex.add(primersFile,
               description=set_file_descr(primersFilename,
                                          groupId=gid,
                                          step="init",
                                          type="fa"))

        paramsFilename = 'group_' + group['name'] + "_param_file.txt"
        paramsFile = group.get("paramsfile",
                               os.path.join(file_path, paramsFilename))
        ex.add(paramsFile,
               description=set_file_descr(paramsFilename,
                                          groupId=gid,
                                          step="init",
                                          type="txt"))
        params = load_paramsFile(paramsFile)

        infiles = []
        tot_counts = 0
        allSubFiles = []
        for rid, run in group['runs'].iteritems():
            infiles.append(run)
            n = count_lines(ex, run)
            tot_counts += n / 4
            if n > 10000000:
                allSubFiles.extend(split_file(ex, run, n_lines=8000000))
            else:
                allSubFiles.append(run)
        (resExonerate, tot_ambiguous,
         tot_discarded) = parallel_exonerate(ex,
                                             allSubFiles,
                                             primersFile, (gid, group['name']),
                                             via=via,
                                             **params)

        fullFastq = cat(infiles)  # concatenate all runs of the group before archiving
        gzipfile(ex, fullFastq)
        ex.add(fullFastq + ".gz",
               description=set_file_descr(group['name'] + "_full_fastq.gz",
                                          groupId=gid,
                                          step='exonerate',
                                          view='debug',
                                          type="fastq"))
        logfile.write("Will get sequences to filter\n")
        logfile.flush()
        seqToFilter = getSeqToFilter(ex, primersFile)

        logfile.write("Will filter the sequences\n")
        filteredFastq = filterSeq(ex,
                                  resExonerate,
                                  seqToFilter,
                                  gid,
                                  group['name'],
                                  via=via)

        logfile.write("After filterSeq, filteredFastq=%s\n" % filteredFastq)
        logfile.flush()

        counts_primers = {}
        counts_primers_filtered = {}
        global bcDelimiter
        if len(filteredFastq):
            archive = unique_filename_in()
            tgz = tarfile.open(archive, "w:gz")
        for k, f in resExonerate.iteritems():
            counts_primers[k] = count_lines(ex, f) / 4
            if k in filteredFastq:
                k2 = k.replace(bcDelimiter, "_")
                file_names[gid][k2] = group['name'] + "_" + k2 + "_filtered"
                ex.add(filteredFastq[k],
                       description=set_file_descr(file_names[gid][k2] +
                                                  ".fastq",
                                                  groupId=gid,
                                                  step="final",
                                                  type="fastq"))
                counts_primers_filtered[k] = count_lines(ex,
                                                         filteredFastq[k]) / 4
                tgz.add(f,
                        arcname=group['name'] + "_" +
                        k.replace(bcDelimiter, "_") + ".fastq")
            else:
                k2 = k.replace(bcDelimiter, "_")
                file_names[gid][k2] = group['name'] + "_" + k2
                ex.add(f,
                       description=set_file_descr(file_names[gid][k2] +
                                                  ".fastq",
                                                  groupId=gid,
                                                  step="final",
                                                  type="fastq"))
                counts_primers_filtered[k] = 0
        if len(filteredFastq):
            tgz.close()
            ex.add(archive,
                   description=set_file_descr(group['name'] +
                                              "_unfiltered_fastq.tgz",
                                              groupId=gid,
                                              step="exonerate",
                                              type="tar"))

        # Prepare report per group of runs
        report_ok, reportFile = prepareReport(ex, group['name'], tot_counts,
                                              counts_primers,
                                              counts_primers_filtered,
                                              tot_ambiguous, tot_discarded)
        ex.add(reportFile,
               description=set_file_descr(group['name'] +
                                          "_report_demultiplexing.txt",
                                          groupId=gid,
                                          step="final",
                                          type="txt",
                                          view="admin"))
        if report_ok:
            reportFile_pdf = unique_filename_in()
            createReport(ex, reportFile, reportFile_pdf, script_path)
            ex.add(reportFile_pdf,
                   description=set_file_descr(group['name'] +
                                              "_report_demultiplexing.pdf",
                                              groupId=gid,
                                              step="final",
                                              type="pdf"))
        else:
            logfile.write(
                "*** Probable ambiguous classification: total_reads < sum(reads_by_primers) ***\n"
            )
            logfile.flush()
    add_pickle(
        ex, file_names,
        set_file_descr('file_names', step="final", type='py', view='admin'))
    return resFiles
Example #14
def split_exonerate(filename, primersDict, minScore, l=30, n=1, trim=True):

    correction = {}
    files = {}
    filenames = {}
    alignments = {}
    prev_idLine = ''
    # A read is "ambiguous" when it has several equally good exonerate alignments
    # (i.e. the same score >= minScore). This file is often empty, since it only
    # holds good alignments (score >= minScore); ambiguous but "bad" alignments
    # end up in "unaligned".
    alignments["ambiguous"] = unique_filename_in()
    files["ambiguous"] = open(alignments["ambiguous"], "w")
    alignments["ambiguous_fastq"] = unique_filename_in()
    files["ambiguous_fastq"] = open(alignments["ambiguous_fastq"], "w")
    # A read is "unaligned" when its score is < minScore (i.e. between my_minScore
    # and minScore). my_minScore is the score passed to exonerate, roughly equal
    # to half the length of the primer sequence.
    alignments["unaligned"] = unique_filename_in()
    files["unaligned"] = open(alignments["unaligned"], "w")
    # A read is "discarded" when its score is ok but the remaining sequence is
    # too short (here len(seq) < l/2, see below).
    alignments["discarded"] = unique_filename_in()
    files["discarded"] = open(alignments["discarded"], "w")
    line_buffer = []

    def _process_line(line_buffer):
        if len(line_buffer) == 1:
            files[line_buffer[0][3]].write(line_buffer[0][1])
        elif len(line_buffer) > 1:
            for buf in sorted(line_buffer):
                files["ambiguous"].write(" ".join(buf[2]) + "\n")
            # add the corresponding ambiguous read to the ambiguous_fastq file (only once)
            #files["ambiguous_fastq"].write(" ".join(buf[1])+"\n")
            files["ambiguous_fastq"].write(buf[1])

    with open(filename, "r") as f:
        for s in f:
            if not s[:7] == 'vulgar:': continue
            s = s.strip().split(' ')
            s_split = s[5].split('|')
            key = s_split[0]
            info = s[1].split('_')
            idLine = info[0]
            if idLine != prev_idLine:
                _process_line(line_buffer)
                line_buffer = []
            prev_idLine = idLine

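            # Barcode-aware step: when the primer part of the key has barcoded
            # variants in primersDict, re-derive the barcode closest to the read
            # sequence; getClosestBarcode is assumed to return "amb" when several
            # barcodes tie (cf. the acceptance test on closestBarcode below).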
            closestBarcode = key
            global bcDelimiter
            if key.split(bcDelimiter)[0] in primersDict:  ## has barcode
                pr = key.split(bcDelimiter)[0]
                closestBarcode = getClosestBarcode(
                    info[1],
                    primersDict[pr])  ## replace key with the closest barcode

            if key not in correction:
                if len(s_split) > 3:
                    correction[key] = len(s_split[1]) - len(s_split[3]) + n - 1
                else:
                    correction[key] = len(s_split[1]) + n - 1

                if len(key.split(bcDelimiter)) > 1:
                    correction[key] = correction[key] - 1  ## remove the bcDelimiter from the correction

                filenames[key] = unique_filename_in()
                files[key] = open(filenames[key], "w")
            k = int(s[3]) - int(s[7]) + correction[key]
            if len(key.split(bcDelimiter)) > 1:
                k = k - len(primersDict[key.split(bcDelimiter)[0]][key])

            l_linename = len(info[0])
            l_seq = len(info[1])
            full_qual = s[1][int(l_linename) + int(l_seq) + 2:int(l_linename) +
                             2 * int(l_seq) + 2]
            if trim:
                seq = info[1][k:l + k] if l + k < l_seq else info[1][k:]
                ## otherwise keep whatever remains of the read
                qual = full_qual[k:l + k] if l + k < l_seq else full_qual[k:]
            else:
                seq = info[1]
                qual = full_qual
            if int(s[9]) < minScore:
                files["unaligned"].write(" ".join(s) + "\n")
            elif len(seq) >= l / 2:
                if key == closestBarcode or closestBarcode == "amb":  # otherwise the read does not come from this primer_barcode; also keep reads whose barcode is ambiguous
                    #line_buffer.append((s[9],"@"+info[0]+"\n"+seq+"***"+info[1]+"\n+\n"+qual+"\n",s,key))
                    line_buffer.append((s[9], "@" + info[0] + "\n" + seq +
                                        "\n+\n" + qual + "\n", s, key))
            else:  ## discarded when the remaining read length is too short, arbitrarily set to < l/2. It used to be < l, but that no longer triggers since we now accept l close to the read length.
                files["discarded"].write("@" + info[0] + "\n" + seq + "\n+\n" +
                                         qual + "\n")

    _process_line(line_buffer)
    for f in files.itervalues():
        f.close()
    return (filenames, alignments)
Example #15
def parallel_exonerate(ex,
                       subfiles,
                       dbFile,
                       grp_descr,
                       minScore=77,
                       n=1,
                       x=22,
                       l=30,
                       trim=True,
                       via="local"):
    futures = [
        fastqToFasta.nonblocking(ex, sf, n=n, x=x, via=via) for sf in subfiles
    ]

    futures2 = []
    res = []
    resExonerate = []
    faSubFiles = []
    all_ambiguous = []
    all_ambiguous_fastq = []
    all_unaligned = []
    all_discarded = []
    gid, grp_name = grp_descr
    my_minscore = _get_minscore(dbFile)
    primersDict = get_primersList(dbFile)

    for sf in futures:
        subResFile = unique_filename_in()
        faSubFiles.append(sf.wait())
        futures2.append(
            exonerate.nonblocking(ex,
                                  faSubFiles[-1],
                                  dbFile,
                                  minScore=my_minscore,
                                  via=via,
                                  stdout=subResFile,
                                  memory=6))
        resExonerate.append(subResFile)
    for nf, f in enumerate(resExonerate):
        futures2[nf].wait()
        (resSplitExonerate, alignments) = split_exonerate(f,
                                                          primersDict,
                                                          minScore,
                                                          l=l,
                                                          n=n,
                                                          trim=trim)
        all_unaligned.append(alignments["unaligned"])
        all_ambiguous.append(alignments["ambiguous"])
        all_ambiguous_fastq.append(alignments["ambiguous_fastq"])
        all_discarded.append(alignments["discarded"])
        res.append(resSplitExonerate)

    # add unaligned file only if it is not empty
    n = count_lines(ex, all_unaligned[0])
    if n > 1:
        catfile = cat(all_unaligned)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_unaligned.txt.gz",
                                          groupId=gid,
                                          step="exonerate",
                                          type="txt",
                                          view="admin",
                                          comment="scores between %i and %i" %
                                          (my_minscore, minScore)))

    # add ambiguous file only if it is not empty
    n = count_lines(ex, all_ambiguous[0])
    if n > 1:
        catfile = cat(all_ambiguous)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(
                   grp_name + "_ambiguous.txt.gz",
                   groupId=gid,
                   step="exonerate",
                   type="txt",
                   view="admin",
                   comment="multiple equally good classifications"))

    # add ambiguous fastq file only if it is not empty
    tot_ambiguous = count_lines(ex, all_ambiguous_fastq[0]) / 4
    if tot_ambiguous > 1:
        catfile = cat(all_ambiguous_fastq)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(
                   grp_name + "_ambiguous.fastq.gz",
                   groupId=gid,
                   step="exonerate",
                   type="fastq",
                   comment="multiple equally good classifications"))

    # add discarded file only if it is not empty
    tot_discarded = count_lines(ex, all_discarded[0]) / 4
    if tot_discarded > 1:
        catfile = cat(all_discarded)
        gzipfile(ex, catfile)
        ex.add(catfile + ".gz",
               description=set_file_descr(grp_name + "_discarded.fastq.gz",
                                          groupId=gid,
                                          step="exonerate",
                                          type="fastq",
                                          view="admin",
                                          comment="remaining seq too short"))

    ## add part input fasta file only if it is not empty
    n = count_lines(ex, faSubFiles[0])
    if n > 1:
        gzipfile(ex, faSubFiles[0])
        ex.add(faSubFiles[0] + ".gz",
               description=set_file_descr(grp_name + "_input_part.fa.gz",
                                          groupId=gid,
                                          step="init",
                                          type="fa",
                                          view="admin",
                                          comment="part"))

    ## add part res exonerate only if it is not empty
    n = count_lines(ex, resExonerate[0])
    if n > 1:
        gzipfile(ex, resExonerate[0])
        ex.add(resExonerate[0] + ".gz",
               description=set_file_descr(grp_name + "_exonerate_part.txt.gz",
                                          groupId=gid,
                                          step="exonerate",
                                          type="txt",
                                          view="admin",
                                          comment="part"))

    resFiles = dict((k, '') for d in res for k in d.keys())
    for k in resFiles.keys():
        v = [d[k] for d in res if k in d]
        resFiles[k] = cat(v[1:], out=v[0])

    return (resFiles, tot_ambiguous, tot_discarded)
Example #16
def split_exonerate(filename,primersDict,minScore,l=30,n=1,trim=True):

    correction = {}
    files = {}
    filenames = {}
    alignments = {}
    prev_idLine = ''
    # A read is "ambiguous" when it has several equally good exonerate alignments
    # (i.e. the same score >= minScore). This file is often empty, since it only
    # holds good alignments (score >= minScore); ambiguous but "bad" alignments
    # end up in "unaligned".
    alignments["ambiguous"] = unique_filename_in()
    files["ambiguous"] = open(alignments["ambiguous"],"w")
    alignments["ambiguous_fastq"] = unique_filename_in()
    files["ambiguous_fastq"] = open(alignments["ambiguous_fastq"],"w")
    # A read is "unaligned" when its score is < minScore (i.e. between my_minScore
    # and minScore). my_minScore is the score passed to exonerate, roughly equal
    # to half the length of the primer sequence.
    alignments["unaligned"] = unique_filename_in()
    files["unaligned"] = open(alignments["unaligned"],"w")
    # A read is "discarded" when its score is ok but the remaining sequence is
    # too short (here len(seq) < l/2, see below).
    alignments["discarded"] = unique_filename_in()
    files["discarded"] = open(alignments["discarded"],"w")
    line_buffer = []

    def _process_line(line_buffer):
        if len(line_buffer) == 1:
            files[line_buffer[0][3]].write(line_buffer[0][1])
        elif len(line_buffer)>1:
            for buf in sorted(line_buffer):
                files["ambiguous"].write(" ".join(buf[2])+"\n")
            # add the corresponding ambiguous read to the ambiguous_fastq file (only once)
            #files["ambiguous_fastq"].write(" ".join(buf[1])+"\n")
            files["ambiguous_fastq"].write(buf[1])

    with open(filename,"r") as f:
        for s in f:
            if not s[:7] == 'vulgar:': continue
            s = s.strip().split(' ')
            s_split = s[5].split('|')
            key = s_split[0]
            info = s[1].split('_')
            idLine = info[0]
            if idLine != prev_idLine:
                _process_line(line_buffer)
                line_buffer = []
            prev_idLine = idLine

            closestBarcode = key
            global bcDelimiter
            if key.split(bcDelimiter)[0] in primersDict: ## has barcode
                pr = key.split(bcDelimiter)[0]
                closestBarcode = getClosestBarcode(info[1],primersDict[pr]) ## replace key with the closest barcode

            if key not in correction:
                if len(s_split) > 3:
                    correction[key] = len(s_split[1])-len(s_split[3])+n-1
                else:
                    correction[key] = len(s_split[1])+n-1

                if len(key.split(bcDelimiter)) > 1:
                    correction[key] = correction[key] - 1 ## remove the bcDelimiter from the correction

                filenames[key] = unique_filename_in()
                files[key] = open(filenames[key],"w")
            k = int(s[3])-int(s[7])+correction[key]
            if len(key.split(bcDelimiter)) > 1:
                k = k - len(primersDict[key.split(bcDelimiter)[0]][key])

            l_linename = len(info[0])
            l_seq = len(info[1])
            full_qual = s[1][int(l_linename)+int(l_seq)+2:int(l_linename)+2*int(l_seq)+2]
            if trim:
                seq = info[1][k:l+k] if l+k < l_seq else info[1][k:] ## otherwise keep whatever remains of the read
                qual = full_qual[k:l+k] if l+k < l_seq else full_qual[k:]
            else:
                seq = info[1]
                qual = full_qual
            if int(s[9]) < minScore:
                files["unaligned"].write(" ".join(s)+"\n")
            elif len(seq) >= l/2:
                if key == closestBarcode or closestBarcode == "amb": # otherwise the read does not come from this primer_barcode; also keep reads whose barcode is ambiguous
                    #line_buffer.append((s[9],"@"+info[0]+"\n"+seq+"***"+info[1]+"\n+\n"+qual+"\n",s,key))
                    line_buffer.append((s[9],"@"+info[0]+"\n"+seq+"\n+\n"+qual+"\n",s,key))
            else: ## discarded when the remaining read length is too short, arbitrarily set to < l/2. It used to be < l, but that no longer triggers since we now accept l close to the read length.
                files["discarded"].write("@"+info[0]+"\n"+seq+"\n+\n"+qual+"\n")

    _process_line(line_buffer)
    for f in files.itervalues():
        f.close()
    return (filenames,alignments)