Example #1
def split_sff(sff_file_handles, map_file_handle, outdir="/tmp/"):
    """Splits a sff.txt file on barcode/mapping file."""

    try:
        (flowgrams, header) = cat_sff_files(sff_file_handles)
    except ValueError:
        # reading in the binary sff usually shows up as ValueError
        raise FileFormatError(
            'Wrong flowgram file format. Make sure you pass the sff.txt format '
            + 'produced by sffinfo. The binary .sff will not work here.')

    (inverse_map,
     map_count) = build_inverse_barcode_map(parse_fasta(map_file_handle))

    filenames = []
    # we might have many barcodes and reach the python open file limit,
    # therefore we go the slow way and open and close files each time
    # First set up all files with the headers only
    for barcode_id in map_count.keys():
        fh = open(outdir + barcode_id, "w")
        write_sff_header(header, fh, map_count[barcode_id])
        fh.close()
        filenames.append(outdir + barcode_id)
    # Then direct each flowgram into its barcode file
    for f in flowgrams:
        if f.Name in inverse_map:
            barcode_id = inverse_map[f.Name]
            fh = open(outdir + barcode_id, "a")
            fh.write(f.createFlowHeader() + "\n")
            fh.close()
    return filenames
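A minimal usage sketch (all file names are hypothetical): the sff.txt inputs must be the text format produced by sffinfo, the mapping file is the barcode FASTA consumed by build_inverse_barcode_map, and the output directory must already exist, since the function simply concatenates outdir and the barcode id.

# Hypothetical inputs: two sffinfo-generated text files and a barcode mapping FASTA.
sff_handles = [open("run1.sff.txt"), open("run2.sff.txt")]
map_handle = open("barcode_map.fasta")

per_barcode_files = split_sff(sff_handles, map_handle, outdir="/tmp/split/")
print("Wrote %d per-barcode sff.txt files" % len(per_barcode_files))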
Example #2
def filter_sff_file(flowgrams, header, filter_list, out_fh):
    """Filters all flowgrams in handle with filter.

    flowgrams: a list of flowgrams (or something similar)

    header: the header for the flowgrams

    filter_list: list of filters to be applied on sff.txt file

    out_fh: output file handle

    returns: number of flowgrams written to the filtered output file
    """

    write_sff_header(header, out_fh)

    l = 0
    for f in flowgrams:
        passed = True
        for filter in filter_list:
            passed = passed and filter(f)
            if not passed:
                # bail out
                break
        if (passed):
            out_fh.write(f.createFlowHeader() + "\n")
            l += 1
    return l
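Each entry of filter_list is just a callable that takes one flowgram and returns True to keep it, so arbitrary predicates can be combined. A sketch of a possible invocation, reusing lazy_parse_sff_handle from the later examples; the file names and read ids are made up:

# Hypothetical predicate: keep only reads whose ids appear in a whitelist.
wanted_ids = set(["FIQU8OX05GCVRO", "FIQU8OX05F8ILF"])  # made-up read ids
filters = [lambda f: f.Name in wanted_ids]

(flowgrams, header) = lazy_parse_sff_handle(open("reads.sff.txt"))
out_fh = open("filtered.sff.txt", "w")
num_kept = filter_sff_file(flowgrams, header, filters, out_fh)
out_fh.close()
print("%d flowgrams passed all filters" % num_kept)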
Example #3
def filter_sff_file(flowgrams, header, filter_list, out_fh):
    """Filters all flowgrams in handle with filter.

    flowgrams: a list of flowgrams (or something similar)
    
    header: the header for the flowgrams
    
    filter_list: list of filters to be applied on sff.txt file
    
    out_fh: output file handle

    returns: number of flowgrams written to the filtered output file
    """
   
    write_sff_header(header, out_fh)

    l = 0
    for f in flowgrams:
        passed = True
        for filter in filter_list:
            passed = passed and filter(f)
            if not passed:
                # bail out
                break
        if (passed):
            out_fh.write(f.createFlowHeader() + "\n")
            l += 1
    return l
Example #4
def split_sff(sff_file_handles, map_file_handle, outdir="/tmp/"):
    """Splits a sff.txt file on barcode/mapping file."""
    
    try:
        (flowgrams, header) = cat_sff_files(sff_file_handles)
    except ValueError:
        # reading in the binary sff usually shows up as ValueError
        raise FileFormatError('Wrong flowgram file format. Make sure you pass the sff.txt format '
                              'produced by sffinfo. The binary .sff will not work here.')
  
    (inverse_map, map_count) = build_inverse_barcode_map(MinimalFastaParser(map_file_handle))
    
    filenames = []
    # we might have many barcodes and reach the python open file limit,
    # therefore we go the slow way and open and close files each time
    # First set up all files with the headers only
    for barcode_id in map_count.keys():
        fh = open(outdir+barcode_id, "w")
        write_sff_header(header, fh, map_count[barcode_id])
        fh.close()
        filenames.append(outdir+barcode_id)
    #Then direct each flowgram into its barcode file
    for f in flowgrams:
        if f.Name in inverse_map:
            barcode_id = inverse_map[f.Name]
            fh = open(outdir+barcode_id, "a")
            fh.write(f.createFlowHeader()+"\n")
            fh.close()
    return filenames
Example #5
def truncate_flowgrams_in_SFF(flowgrams,
                              header,
                              outhandle=None,
                              outdir="/tmp/",
                              barcode_mapping=None,
                              primer=None,
                              allow_num_ambigous=4):
    """Truncate flowgrams at low quality 3' end and strip key+primers.

    flowgrams: a list of flowgrams (or something similar)

    header: the header for the flowgrams

    outhandle: output file handle, can be None

    outdir: directory where random file will be created if outhandle is None

    barcode_mapping: dictionary mapping of read ids to barcode seqs.
                     The barcode seq will be truncated off the 5' end of the read

    primer: primer sequence that will be truncated off the 5' end of the read

    allow_num_ambigous: number of 'N's allowed in the flowgram
    """
    out_filename = ""
    if not outhandle:
        fd, out_filename = mkstemp(dir=outdir,
                                   prefix="trunc_sff",
                                   suffix=".sff.txt")
        close(fd)
        outhandle = open(out_filename, "w")

    write_sff_header(header, outhandle)

    l = 0
    for f in flowgrams:
        qual_trimmed_flowgram = f.getQualityTrimmedFlowgram()

        if barcode_mapping:
            if f.Name in barcode_mapping:
                trunc_flowgram = qual_trimmed_flowgram.getPrimerTrimmedFlowgram(
                    primerseq=DEFAULT_KEYSEQ + barcode_mapping[f.Name] +
                    primer)
            else:
                continue
        else:
            prim = DEFAULT_KEYSEQ
            if primer:
                prim += primer
            trunc_flowgram = qual_trimmed_flowgram.getPrimerTrimmedFlowgram(
                primerseq=prim)

        if (trunc_flowgram is not None):
            outhandle.write(trunc_flowgram.createFlowHeader() + "\n")
            l += 1
    return (out_filename, l)
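A sketch of the simplest calling mode, with no barcode mapping, so only the key sequence plus the optional primer is stripped from the 5' end; the file name and primer sequence are placeholders:

(flowgrams, header) = lazy_parse_sff_handle(open("reads.sff.txt"))

# outhandle is None, so a temporary trunc_sff*.sff.txt file is created in outdir.
out_fp, num_written = truncate_flowgrams_in_SFF(
    flowgrams, header, outhandle=None, outdir="/tmp/",
    primer="CATGCTGCCTCCCGTAGGAGT")  # placeholder primer sequence
print("Wrote %d truncated flowgrams to %s" % (num_written, out_fp))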
Example #6
def truncate_flowgrams_in_SFF(
        flowgrams, header, outhandle=None, outdir="/tmp/",
        barcode_mapping=None, primer=None,
        allow_num_ambigous=4):
    """Truncate flowgrams at low quality 3' end and strip key+primers.

    flowgrams: a list of flowgrams (or something similar)

    header: the header for the flowgrams

    outhandle: output file handle, can be None

    outdir: directory where random file will be created if outhandle is None

    barcode_mapping: dictionary mapping of read ids to barcode seqs.
                     The barcode seq will be truncated off the 5' end of the read

    primer: primer sequence that will be truncated off the 5' end of the read

    allow_num_ambigous: number of 'N's allowed in the flowgram
    """
    out_filename = ""
    if not outhandle:
        fd, out_filename = mkstemp(dir=outdir, prefix="trunc_sff",
                                  suffix=".sff.txt")
        close(fd)
        outhandle = open(out_filename, "w")

    write_sff_header(header, outhandle)

    l = 0
    for f in flowgrams:
        qual_trimmed_flowgram = f.getQualityTrimmedFlowgram()

        if barcode_mapping:
            if f.Name in barcode_mapping:
                trunc_flowgram = qual_trimmed_flowgram.getPrimerTrimmedFlowgram(
                    primerseq=DEFAULT_KEYSEQ + barcode_mapping[f.Name] + primer)
            else:
                continue
        else:
            prim = DEFAULT_KEYSEQ
            if primer:
                prim += primer
            trunc_flowgram = qual_trimmed_flowgram.getPrimerTrimmedFlowgram(
                primerseq=prim)

        if(trunc_flowgram is not None):
            outhandle.write(trunc_flowgram.createFlowHeader() + "\n")
            l += 1
    return (out_filename, l)
Example #7
def build_averaged_flowgrams(mapping, sff_fp,
                             min_coverage=50, out_fp=None):
    """Build averaged flowgrams for each cluster in mapping.

    mapping: a cluster mapping as dictionary of lists

    sff_fp: path to the sff.txt file, must be consistent with the mapping

    min_coverage: number of flowgrams to average over for each cluster

    out_fp: output file name

    NOTE: This function has no test code, since it is mostly IO around tested functions
    """

    l = len(mapping)
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    # update some values in the sff header
    header["# of Reads"] = l
    header["Index Length"] = "NA"

    if (out_fp):
        out_filename = out_fp
    else:
        fd, out_filename = mkstemp(dir="/tmp/",
                                  prefix="prefix_dereplicated",
                                  suffix=".sff.txt")
        close(fd)
    outhandle = open(out_filename, "w")

    # write out reduced flowgram set
    write_sff_header(header, outhandle)

    seqs = {}
    # get a random sample for each cluster
    sample_keys = sample_mapped_keys(mapping, min_coverage)
    for ave_f, id in _average_flowgrams(mapping, flowgrams, sample_keys):
        outhandle.write(ave_f.createFlowHeader() + "\n")
        ave_f.Bases = ave_f.toSeq()
        seqs[id] = ave_f.Bases

    outhandle.close()
    return(out_filename, seqs)
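A sketch of a call after a prefix-dereplication step; the cluster mapping and file names below are invented for illustration:

# Hypothetical cluster mapping: cluster representative id -> list of member read ids.
mapping = {
    "FIQU8OX05GCVRO": ["FIQU8OX05F8ILF", "FIQU8OX05GQ2RR"],
}

out_fp, seqs = build_averaged_flowgrams(mapping, "prefix_dereplicated.sff.txt",
                                        min_coverage=50,
                                        out_fp="/tmp/averaged.sff.txt")
# seqs maps each cluster id to the base-called sequence of its averaged flowgram.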
Example #8
def build_averaged_flowgrams(mapping, sff_fp,
                             min_coverage=50, out_fp=None):
    """Build averaged flowgrams for each cluster in mapping.

    mapping: a cluster mapping as dictionary of lists

    sff_fp: path to the sff.txt file, must be consistent with the mapping

    min_coverage: number of flowgrams to average over for each cluster

    out_fp: output file name

    NOTE: This function has no test code, since it is mostly IO around tested functions
    """

    l = len(mapping)
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    # update some values in the sff header
    header["# of Reads"] = l
    header["Index Length"] = "NA"

    if (out_fp):
        out_filename = out_fp
    else:
        fd, out_filename = mkstemp(dir="/tmp/",
                                  prefix="prefix_dereplicated",
                                  suffix=".sff.txt")
        close(fd)
    outhandle = open(out_filename, "w")

    # write out reduced flowgram set
    write_sff_header(header, outhandle)

    seqs = {}
    # get a random sample for each cluster
    sample_keys = sample_mapped_keys(mapping, min_coverage)
    for ave_f, id in _average_flowgrams(mapping, flowgrams, sample_keys):
        outhandle.write(ave_f.createFlowHeader() + "\n")
        ave_f.Bases = ave_f.toSeq()
        seqs[id] = ave_f.Bases

    outhandle.close()
    return(out_filename, seqs)
Example #9
def greedy_clustering(sff_fp, seqs, cluster_mapping, outdir, num_flows,
                      log_fh, num_cpus=1, on_cluster=False,
                      bail_out=1, pair_id_thresh=0.97, verbose=False,
                      threshold=3.75, fast_method=True,
                      error_profile=DENOISER_DATA_DIR +
                      'FLX_error_profile.dat',
                      max_num_rounds=None, checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to know before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus: number of cpus to use if on_cluster == True
    on_cluster: run in parallel if True
    bail_out: stop clustering with first cluster having bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies a seq
                     identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    """

    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores, sorted_keys) = \
            read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from  the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)

        sorted_keys = sort_mapping_by_size(cluster_mapping)

        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if(key not in cluster_mapping):
            # this guy already has been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write("Max number of rounds reached. " +
                             "Aborting clustering phase II and continuing with phase III.\n")
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if(prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break the
        # code
        if('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write checkpoint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams, newl) = filter_with_flowgram(key, ideal_flow, flowgrams, header, ids,
                                                     l, bestscores, log_fh, outdir,
                                                     on_cluster=on_cluster,
                                                     num_cpus=num_cpus,
                                                     fast_method=fast_method,
                                                     mapping=cluster_mapping,
                                                     verbose=verbose,
                                                     threshold=threshold,
                                                     pair_id_thresh=pair_id_thresh,
                                                     client_sockets=client_sockets,
                                                     error_profile=error_profile, spread=spread)
        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if(newl == 0):
            # all flowgrams clustered
            break
        # JR: I think this is too much info for the regular user, I leave it in, so
        # we can simply turn it on for debugging
#        if log_fh:
#            log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir, prefix="ff",
                                        suffix=".sff.txt")
    close(fd)
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if (f.Name in ids):
            non_clustered_fh.write(f.createFlowHeader() + "\n")

    return(non_clustered_filename, bestscores, cluster_mapping)
Example #10
def greedy_clustering(sff_fp,
                      seqs,
                      cluster_mapping,
                      outdir,
                      num_flows,
                      log_fh,
                      num_cpus=1,
                      on_cluster=False,
                      bail_out=1,
                      pair_id_thresh=0.97,
                      verbose=False,
                      threshold=3.75,
                      fast_method=True,
                      error_profile=DENOISER_DATA_DIR +
                      'FLX_error_profile.dat',
                      max_num_rounds=None,
                      checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to know before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus: number of cpus to use if on_cluster == True
    on_cluster: run in parallel if True
    bail_out: stop clustering with first cluster having bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies a seq
                     identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    """

    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores, sorted_keys) = \
            read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from  the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)

        sorted_keys = sort_mapping_by_size(cluster_mapping)

        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if (key not in cluster_mapping):
            # this guy already has been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write(
                    "Max number of rounds reached. " +
                    "Aborting clustering phase II and continuing with phase III.\n"
                )
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if (prefix_clustersize < bail_out):
            break

        # Do not take bad sequences as cluster seeds, as this will break the
        # code
        if ('N' in seqs[key]):
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write checkpoint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams,
         newl) = filter_with_flowgram(key,
                                      ideal_flow,
                                      flowgrams,
                                      header,
                                      ids,
                                      l,
                                      bestscores,
                                      log_fh,
                                      outdir,
                                      on_cluster=on_cluster,
                                      num_cpus=num_cpus,
                                      fast_method=fast_method,
                                      mapping=cluster_mapping,
                                      verbose=verbose,
                                      threshold=threshold,
                                      pair_id_thresh=pair_id_thresh,
                                      client_sockets=client_sockets,
                                      error_profile=error_profile,
                                      spread=spread)
        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if (newl == 0):
            # all flowgrams clustered
            break
        # JR: I think this is too much info for the regular user, I leave it in, so
        # we can simply turn it on for debugging
#        if log_fh:
#            log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir,
                                         prefix="ff",
                                         suffix=".sff.txt")
    close(fd)
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if (f.Name in ids):
            non_clustered_fh.write(f.createFlowHeader() + "\n")

    return (non_clustered_filename, bestscores, cluster_mapping)
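A sketch of how phase II might be driven from phase-I outputs; every name and value below is a placeholder standing in for what the denoiser pipeline would normally supply:

# Hypothetical phase-I outputs (tiny, made-up values for illustration only).
seqs = {"FIQU8OX05GCVRO": "TCAGACAGGT", "FIQU8OX05F8ILF": "TCAGACAGGT"}
cluster_mapping = {"FIQU8OX05GCVRO": ["FIQU8OX05F8ILF"]}

log_fh = open("/tmp/denoiser/denoiser.log", "w")
(non_clustered_fp, bestscores, mapping) = greedy_clustering(
    "phase1_prefix.sff.txt",   # placeholder sff.txt path; must correspond to seqs
    seqs,
    cluster_mapping,
    "/tmp/denoiser",           # output directory (must exist)
    num_flows=len(seqs),       # must be the number of flowgrams in the sff.txt
    log_fh=log_fh,
    num_cpus=1,
    on_cluster=False,
    verbose=True)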