Example 1
   def test_write_sff_header(self):
      """write_sff_header writes a correct sff header"""      
      expected = """Common Header:
  Magic Number:\t0x2E736666
  Version:\t0001
  Index Offset:\t7773224
  Index Length:\t93365
  # of Reads:\t114
  Header Length:\t440
  Key Length:\t4
  # of Flows:\t400
  Flowgram Code:\t1
  Flow Chars:\tTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
  Key Sequence:\tTCAG
""".split('\n')
      header = {'Version':"0001",
                'Magic Number': '0x2E736666',
                'Index Offset':  '7773224',
                'Index Length':  '93365',
                '# of Reads':    '114',
                'Header Length': '440',
                'Key Length':    '4',
                '# of Flows':    '400',
                'Flowgram Code': '1',
                'Flow Chars':    'TACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG',
                'Key Sequence':  'TCAG'}

      tmp_name = get_tmp_filename(prefix="test_write_sff_header")
      fh = open(tmp_name,"w")
      write_sff_header(header, fh, num=400)
      fh.close()
      fh = open(tmp_name,"U")
      lines = list(fh)
      remove(tmp_name)
      self.assertEqualItems(lines, map(lambda a: a +"\n", expected))
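For context, here is a minimal sketch of what an implementation of write_sff_header consistent with the expected output above might look like. This is an illustration only, not the library's actual function; in particular, the assumption that the num keyword overrides the "# of Flows" field is inferred from the test call write_sff_header(header, fh, num=400).

def write_sff_header(header, fh, num=None):
    """Sketch: write a header dict to fh in the 'Common Header:' text format."""
    out_lines = ["Common Header:"]
    if num is not None:
        # assumption: num overrides the '# of Flows' entry
        header["# of Flows"] = num
    for key, value in header.items():
        # each entry: two spaces, key, colon, tab, value
        out_lines.append("  %s:\t%s" % (key, value))
    # the expected output ends with a blank line after the last entry;
    # dict iteration order is unspecified, which the order-insensitive
    # assertEqualItems check above tolerates
    fh.write("\n".join(out_lines) + "\n\n")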
Example 2
    def __init__(self, header, outdir="/tmp/"):
        # set up output file
        self.filename = get_tmp_filename(tmp_dir=outdir, prefix="fc", suffix=".sff.txt")
        self.fh = open(self.filename, "w")
        write_sff_header(header, self.fh)

        self.write_mode = True
Example 3
    def __init__(self, header, outdir="/tmp/"):
        #set up output file
        self.filename = get_tmp_filename(tmp_dir=outdir,
                                         prefix="fc",
                                         suffix=".sff.txt")
        self.fh = open(self.filename, "w")
        write_sff_header(header, self.fh)

        self.write_mode = True
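Examples 2 and 3 show the same constructor of a file-backed flowgram container, where write_sff_header is called once at construction time. A hypothetical usage sketch follows; the class name FlowgramContainerFile is an assumption, and lazy_parse_sff_handle is borrowed from the later examples.

# Hypothetical usage sketch: build the container from the header of an
# existing sff.txt file, so the common header is written on construction.
(flowgrams, header) = lazy_parse_sff_handle(open("reads.sff.txt"))
container = FlowgramContainerFile(header, outdir="/tmp/")
# container.fh now starts with the common header text and is ready for
# flowgram records, e.g. container.fh.write(f.createFlowHeader() + "\n")
container.fh.close()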
Example 4
def build_averaged_flowgrams(mapping, sff_fp,
                             min_coverage=50, out_fp=None):
    """Build averaged flowgrams for each cluster in mapping.
    
    mapping: a cluster mapping as dictionary of lists
    
    sff_fp: path to sff.txt file, must be consistent with mapping
    
    min_coverage: number of flowgrams to average over for each cluster

    out_fp: output file name

    NOTE: This function has no test code, since it is mostly IO around tested functions
    """

    l = len(mapping)
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    #update some values in the sff header
    header["# of Reads"] = l
    header["Index Length"] = "NA"

    if out_fp:
        out_filename = out_fp
    else:
        out_filename = get_tmp_filename(tmp_dir="/tmp/",
                                        prefix="prefix_dereplicated",
                                        suffix=".sff.txt")
    outhandle = open(out_filename, "w")
    
    # write out reduced flowgram set
    write_sff_header(header, outhandle)

    seqs = {}
    # get a random sample for each cluster
    sample_keys = sample_mapped_keys(mapping, min_coverage)
    for ave_f,id in _average_flowgrams(mapping, flowgrams, sample_keys):
        outhandle.write(ave_f.createFlowHeader()+"\n")
        ave_f.Bases = ave_f.toSeq()        
        seqs[id] = ave_f.Bases
    
    outhandle.close()
    return(out_filename, seqs)
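A hedged usage sketch for build_averaged_flowgrams; the file paths and read ids below are placeholders.

# Hypothetical call: average the flowgrams of each cluster in a cluster
# mapping (centroid id -> list of member ids).
mapping = {"read_0": ["read_1", "read_2"],
           "read_3": []}
(out_file, centroid_seqs) = build_averaged_flowgrams(
    mapping, "denoiser_in.sff.txt",
    min_coverage=50, out_fp="/tmp/averaged.sff.txt")
# centroid_seqs maps each cluster id to the bases called from its averaged
# flowgram; out_file is the sff.txt holding the averaged flowgrams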
Example 5
    def test_write_sff_header(self):
        """write_sff_header writes a correct sff header"""
        expected = """Common Header:
  Magic Number:\t0x2E736666
  Version:\t0001
  Index Offset:\t7773224
  Index Length:\t93365
  # of Reads:\t114
  Header Length:\t440
  Key Length:\t4
  # of Flows:\t400
  Flowgram Code:\t1
  Flow Chars:\tTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG
  Key Sequence:\tTCAG
""".split(
            "\n"
        )
        header = {
            "Version": "0001",
            "Magic Number": "0x2E736666",
            "Index Offset": "7773224",
            "Index Length": "93365",
            "# of Reads": "114",
            "Header Length": "440",
            "Key Length": "4",
            "# of Flows": "400",
            "Flowgram Code": "1",
            "Flow Chars": "TACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG",
            "Key Sequence": "TCAG",
        }

        tmp_name = get_tmp_filename(prefix="test_write_sff_header")
        fh = open(tmp_name, "w")
        write_sff_header(header, fh, num=400)
        fh.close()
        fh = open(tmp_name, "U")
        lines = list(fh)
        remove(tmp_name)
        self.assertEqualItems(lines, map(lambda a: a + "\n", expected))
Example 7
def greedy_clustering(sff_fp, seqs, cluster_mapping, outdir, num_flows,
                      log_fh, num_cpus=1, on_cluster=False,
                      bail_out=1, pair_id_thresh=0.97, verbose=False,
                      threshold=3.75, fast_method=True,
                      error_profile=DENOISER_DATA_DIR+'FLX_error_profile.dat',
                      max_num_rounds=None, checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to know before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus: number of cpus to use if on_cluster == True
    on_cluster: run in parallel if True
    bail_out: stop clustering at the first cluster with fewer than bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies a seq
                     identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    """
  
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores, sorted_keys) = \
            read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from  the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)

        sorted_keys = sort_mapping_by_size(cluster_mapping)

        bestscores = {}
        round_ctr = 1

    #this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        #skip until we reach the checkpoint
        if checkpoint_fp:
            if (checkpoint_key == key):
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if (skipping):
                continue

        if key not in cluster_mapping:
            # this key has already been clustered
            continue

        if (max_num_rounds and round_ctr > max_num_rounds):
            if log_fh:
                log_fh.write("Max number of rounds reached. "+
                             "Aborting clustering phase II and continuing with phase III.\n")
            break
        
        prefix_clustersize = len(cluster_mapping[key])
        # abort the greedy phase once clusters fall below bail_out members
        if prefix_clustersize < bail_out:
            break

        # Do not take bad sequences as cluster seeds, as this will break the code
        if('N' in seqs[key]):
            continue

        #check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            #check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, we write a checkpoint every 50 rounds; this could easily
        # be changed here or exposed on the command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams, newl) = filter_with_flowgram(key, ideal_flow, flowgrams, header, ids,
                                                   l, bestscores, log_fh, outdir,
                                                   on_cluster=on_cluster,
                                                   num_cpus=num_cpus,
                                                   fast_method=fast_method, 
                                                   mapping=cluster_mapping,
                                                   verbose=verbose,
                                                   threshold=threshold,
                                                   pair_id_thresh=pair_id_thresh,
                                                   client_sockets=client_sockets,
                                                   error_profile=error_profile, spread=spread)
        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if newl == 0:
            # all flowgrams clustered
            break
        # JR: I think this is too much info for the regular user; I leave it in
        # so we can simply turn it on for debugging
#        if log_fh:
#            log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    #write all remaining flowgrams into file for next step
    #TODO: might use abstract FlowgramContainer here as well
    non_clustered_filename = get_tmp_filename(tmp_dir=outdir, prefix="ff",
                                              suffix=".sff.txt")
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if f.Name in ids:
            non_clustered_fh.write(f.createFlowHeader() + "\n")
    # make sure the remaining flowgrams are flushed to disk before returning
    non_clustered_fh.close()

    return (non_clustered_filename, bestscores, cluster_mapping)
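A hedged invocation sketch for the phase-II function above; the paths, the seqs dict, and the phase-I cluster mapping are placeholders.

# Hypothetical call: denoiser phase II on the output of phase I.
log_fh = open("/tmp/denoiser.log", "w")
(non_clustered_sff, bestscores, mapping) = greedy_clustering(
    "/tmp/phase1.sff.txt",      # flowgram file from phase I
    seqs,                       # dict: read id -> sequence
    cluster_mapping,            # preliminary cluster mapping from phase I
    "/tmp/denoiser_out",        # output directory
    num_flows=1000, log_fh=log_fh,
    num_cpus=4, on_cluster=True, verbose=True)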