def reorder_mat(A, thr_list, min_cc_len, VERB):
    """
    Reorder similarity matrix A with the spectral ordering algorithm.

    Iterative scheme: at each pass, every pending set of nodes is split into
    connected components, and each component is ordered by sorting the
    entries of the vector returned by get_fiedler. An ordering is accepted
    when the bandwidth of the reordered sub-matrix is below 80; otherwise the
    component is queued for the next pass, where the entries of its
    sub-matrix that are not above the next threshold are removed (followed by
    remove_bridge_reads) before trying again.

    Parameters
    ----------
    A : scipy.sparse matrix (similarity matrix, converted to CSR if needed)
    thr_list : sequence of real (thr_list[k] is the threshold used at pass k.
        thr_list[0] is never applied since the first pass uses A as is)
    min_cc_len : int (connected components with at most min_cc_len nodes
        are skipped)
    VERB : int (verbosity level, progress messages are printed if >= 2)

    Returns
    -------
    list (of index arrays, one per accepted connected component, each
        containing indices into A sorted with the spectral ordering)
    """
    max_bandwidth = 80

    if not isspmatrix_csr(A):
        A = A.tocsr()

    # Initialization.
    ccs_ord = []
    # The first "component" to process is the whole matrix.
    todo_ccs = [np.arange(A.shape[0])]
    n_loop = 0

    while len(todo_ccs) > 0:
        if n_loop >= len(thr_list):
            # No threshold left to try: stop instead of raising IndexError
            # and keep the components that were successfully ordered.
            oprint("No threshold left in thr_list, %d connected component(s) "
                   "left unordered." % (len(todo_ccs)))
            break
        thr_sub = thr_list[n_loop]
        todo_next = []

        for cc in todo_ccs:
            # The first pass uses A as is, in order not to make the
            # preprocessing twice (it is already done in the pipeline).
            if n_loop > 0:
                A_sub = A[cc, :][:, cc]
                A_sub = remove_bridge_reads(A_sub.multiply(A_sub > thr_sub))
            else:
                A_sub = A

            # Compute connected components
            (n_cc, labels) = connected_components(A_sub, directed=False,
                                                  return_labels=True)

            # Reorder each cc with spectral and keep the ordering if it
            # looks OK
            for i_cc in range(n_cc):
                cc_sub = np.argwhere(labels == i_cc)[:, 0]
                if len(cc_sub) <= min_cc_len:
                    continue
                msg = " Running spectral algorithm in connected "\
                      "component of size %d..." % (len(cc_sub))
                oprint(msg, cond=(VERB >= 2))

                (_, fidvec) = get_fiedler(A_sub[cc_sub, :][:, cc_sub])
                permu = np.argsort(fidvec)
                (ii, jj, _) = find(A_sub[cc_sub[permu], :][:, cc_sub[permu]])
                bw = max(abs(ii - jj))
                if bw >= max_bandwidth:
                    oprint("Bandwidth larger than 80 in reordered matrix.",
                           cond=(VERB >= 2))
                    # Indices are mapped back to A before being queued.
                    todo_next.append(cc[cc_sub])
                else:
                    ccs_ord.append(cc[cc_sub[permu]])

        todo_ccs = todo_next
        n_loop += 1

    return ccs_ord
def add_next_window(temp_fn, w_idx, cc_idx, whole_cons, opts, trim_margin):
    """
    Add the consensus from the current window to the current consensus.

    The tail of the consensus built so far (as long as the new window plus
    opts['MERGE_MARGIN']) and the consensus of window w_idx are written to
    temp_fn and merged with run_spoa_and_consensus. The result replaces that
    tail. If the consensus file of the window is missing or empty, whole_cons
    is returned unchanged.

    Parameters
    ----------
    temp_fn : str (temporary file to write sequences to align with spoa)
    w_idx : int (index of current window)
    cc_idx : int (index of the connected component)
    whole_cons : str (consensus extracted so far by joining the consensus
        sequences from windows 0 to w_idx - 1)
    opts : dict (keywords arguments for global parameters; the keys read here
        are 'READS_FMT', 'ROOT_DIR', 'MERGE_MARGIN', 'VERB' and 'SPOA_PATH')
    trim_margin : int (number of bp to trim on each end of the consensus, as
        the consensus sequence is more likely to be erroneous on the ends)

    Returns
    -------
    str (consensus extracted by joining the consensus sequences from windows
        0 to w_idx)
    """
    DATATYPE = opts['READS_FMT'][-1]
    ROOT_DIR = opts['ROOT_DIR']
    MERGE_MARGIN = opts['MERGE_MARGIN']
    VERB = opts['VERB']

    fn = "%s/cc_%d/poa_in_cc_%d_win_%d.fast%s.cnsns" % (
        ROOT_DIR, cc_idx, cc_idx, w_idx, DATATYPE)

    if not os.path.exists(fn) or os.path.getsize(fn) == 0:
        msg = "file %s does not exist or is empty" % (fn)
        oprint(msg, cond=(VERB >= 2))
        return whole_cons

    next_win_seq = get_consensus(fn, trim_margin)

    # Only the end of the current consensus can overlap with the new window:
    # keep the beginning (cons0) untouched and realign the end (cons1).
    kept_len = max(0, len(whole_cons) - len(next_win_seq) - MERGE_MARGIN)
    cons0 = whole_cons[:kept_len]
    cons1 = whole_cons[kept_len:]

    # Write end of current consensus long sequence and next consensus window
    # sequence in poa_in file. Text mode, since str is written; the context
    # manager closes the file even if a write fails.
    with open(temp_fn, "w") as poa_in_fh:
        poa_in_fh.write(">end_of_current_cons\n%s\n" % (cons1))
        poa_in_fh.write(">cons_in_window_%d\n%s\n" % (w_idx, next_win_seq))

    # Run poa to include next
    out_fn = "%s/cc_%d/poa_out_cons_cc%d_win_%d" % (ROOT_DIR, cc_idx, cc_idx,
                                                    w_idx)
    cons1b = run_spoa_and_consensus(temp_fn, out_fn, opts['SPOA_PATH'])

    return cons0 + cons1b
def reord_submat(in_tuple, A, opts):
    """
    Threshold and reorder the sub-matrix of A restricted to a set of nodes.

    Entries of the sub-matrix that are not above the threshold are removed,
    remove_bridge_reads is applied, and each connected component of the
    result is ordered with the spectral algorithm. An ordering is kept when
    the bandwidth of the reordered component is below 80; otherwise the
    component is returned as still to be processed.

    Parameters
    ----------
    in_tuple : tuple (thr_sub, cc) where thr_sub is the threshold to apply
        and cc is the array of indices (into A) of the nodes to reorder
    A : scipy.sparse matrix (similarity matrix)
    opts : dict (keywords argument containing global parameters and options;
        the keys read here are 'MIN_CC_LEN', 'VERB', 'JULIA_PATH' and
        'JULIA_SCRIPT')

    Returns
    -------
    sub_ccs_ord : list (of index arrays into A, one per connected component
        whose ordering was accepted, sorted with the spectral ordering)
    sub_todo_next : list (of index arrays into A, one per connected component
        whose ordering was rejected, in no particular order)
    """
    (thr_sub, cc) = in_tuple
    min_len = int(opts['MIN_CC_LEN'])
    verb = int(opts['VERB'])
    JULIA_PATH = opts['JULIA_PATH']
    JULIA_SCRIPT = opts['JULIA_SCRIPT']
    max_bandwidth = 80

    sub_todo_next = []
    sub_ccs_ord = []

    A_sub = A[cc, :][:, cc]
    A_sub = remove_bridge_reads(A_sub.multiply(A_sub > thr_sub))

    # Compute connected components
    (n_cc, labels) = connected_components(A_sub, directed=False,
                                          return_labels=True)

    # Reorder each cc with spectral and keep the ordering if it looks OK
    for i_cc in range(n_cc):
        cc_sub = np.argwhere(labels == i_cc)[:, 0]
        if len(cc_sub) <= min_len:
            continue
        msg = " Running spectral algorithm in connected "\
              "component of size %d..." % (len(cc_sub))
        oprint(msg, cond=(verb >= 2))

        # Start of the timing reported in the "Done in" message below.
        t1 = time()
        # Use Julia if possible to reorder relatively large matrices
        if JULIA_PATH and (len(cc_sub) > 3000):
            permu = get_fiedler_julia(A_sub[cc_sub, :][:, cc_sub],
                                      JULIA_PATH, JULIA_SCRIPT)
        else:
            (_, fidvec) = get_fiedler(A_sub[cc_sub, :][:, cc_sub])
            permu = np.argsort(fidvec)
        oprint("Done in %3.6fs" % (time() - t1), cond=(verb >= 2))

        (ii, jj, _) = find(A_sub[cc_sub[permu], :][:, cc_sub[permu]])
        bw = max(abs(ii - jj))
        if bw >= max_bandwidth:
            oprint("Bandwidth larger than 80 in reordered matrix.",
                   cond=(verb >= 2))
            # Indices are mapped back to A before being returned.
            sub_todo_next.append(cc[cc_sub])
        else:
            sub_ccs_ord.append(cc[cc_sub[permu]])

    return sub_ccs_ord, sub_todo_next
def merge_windows_in_cc(cc_idx, opts):
    """
    Merge the consensus sequences from all windows into one sequence (contig).

    The consensus of window 0 initializes the contig, then the windows are
    added one by one with add_next_window. The first three and last three
    windows are added with a trim margin of 0, the others with
    opts['TRIM_MARGIN']. The result is also written to
    <ROOT_DIR>/consensus_cc_<cc_idx>.fasta.

    Parameters
    ----------
    cc_idx : int (index of the connected component)
    opts : dict (keywords arguments for global parameters; the keys read here
        are 'TRIM_MARGIN', 'READS_FMT', 'ROOT_DIR' and 'VERB', plus those
        read by add_next_window)

    Returns
    -------
    str (consensus sequence of the whole connected component)
    """
    import glob  # local import: only needed to count the window files

    # Parse arguments
    TRIM_MARGIN = opts['TRIM_MARGIN']
    DATATYPE = opts['READS_FMT'][-1]
    ROOT_DIR = opts['ROOT_DIR']
    VERB = opts['VERB']
    n_edge_win = 3  # number of windows on each end added with trim margin 0

    # Count number of windows (one consensus file per window)
    pattern = "%s/cc_%d/poa_in_cc_%d_win_*.fast*.cnsns" % (
        ROOT_DIR, cc_idx, cc_idx)
    n_win = len(glob.glob(pattern))

    # Initialize
    fn = "%s/cc_%d/poa_in_cc_%d_win_%d.fast%s.cnsns" % (ROOT_DIR, cc_idx,
                                                        cc_idx, 0, DATATYPE)
    whole_cons = get_consensus(fn, TRIM_MARGIN)
    oprint(len(whole_cons))

    # Incrementally add consensus between window k and window k+1.
    # A single loop guarantees that each window is merged exactly once, even
    # when there are fewer than six windows.
    for w_idx in range(n_win):
        is_edge_win = (w_idx < n_edge_win) or (w_idx >= n_win - n_edge_win)
        trim_margin = 0 if is_edge_win else TRIM_MARGIN
        poa_in_fn = "%s/poa_in_cons_cc_%d_win_%d.fasta" % (ROOT_DIR, cc_idx,
                                                           w_idx)
        whole_cons = add_next_window(poa_in_fn, w_idx, cc_idx, whole_cons,
                                     opts, trim_margin)
        if not is_edge_win:
            msg = "Consensus generation... %dbp extracted so far (window %d)" % (
                len(whole_cons), w_idx)
            condition = (VERB >= 2) and (w_idx % 500 == 0)
            oprint(msg, cond=condition)

    msg = ("extracted and merged sequences in windows for contig %d. "
           "Consensus length %dbp" % (cc_idx, len(whole_cons)))
    oprint(msg, cond=(VERB >= 2))

    # Print consensus to backup file. Text mode, since str is written; the
    # context manager closes the file even if the write fails.
    consensus_fn = "%s/consensus_cc_%d.fasta" % (ROOT_DIR, cc_idx)
    with open(consensus_fn, "w") as consensus_fh:
        consensus_fh.write(">consensus_from_windows_contig_%d\n%s\n" % (
            cc_idx, whole_cons))

    return whole_cons
"less good)") parser.add_argument("--julia", default=None, help="path to Julia (optional,"\ "though eigenvector computations are clearly faster in Julia than in Python)") args = parser.parse_args() opts = fill_args_opts(args) ROOT_DIR = opts['ROOT_DIR'] VERB = opts['VERB'] # Load reads reads_fh = open(args.READS_FN, "rU") record_list = list(SeqIO.parse(reads_fh, opts['READS_FMT'])) reads_fh.close() oprint("Reads loaded. Compute overlaps from files...", dt=(time() - t0), cond=(VERB >= 2)) # Compute overlaps from the files (read_nb2id, ovl_list, I, J, K, num_match, ovl_len, n_reads) = compute_overlaps(args.minimapfn, record_list) # Threshold based on overlaps value (number of matches) and length THR = mquantiles(num_match, args.sim_qtile) oprint("THR = %1.1f " % THR) cond1 = (num_match > THR) cond2 = (ovl_len > opts['LEN_THR']) idxok = np.argwhere(cond1 * cond2)[:, 0] num_match_l = num_match I = I[idxok] J = J[idxok]
def reorder_submat(A, cc, num_match_l, qtile, ccs_ord, opts):
    """
    Reorder matrix A with spectral ordering algorithm.

    Recursive function: each connected component of A is reordered, and when
    the bandwidth of the reordered component reaches 80 (empirical criterion,
    specific to assembly of genomes with a limited number of repeats), the
    threshold is raised inside that component and the function calls itself
    on the thresholded sub-matrix.

    Parameters
    ----------
    A : scipy.sparse matrix (similarity matrix)
    cc : list (cc[i] is the index of the read corresponding to row i of A)
    num_match_l : list (of number of matches (int) such that A[i,j] = number
        of matches between i and j, *before* preprocessing and not restricted
        to the reads in cc. It is used to compute the threshold with qtile)
    qtile : real (quantile of num_match_l used as threshold so far; a larger
        quantile is used when the threshold has to be raised)
    ccs_ord : list (of lists of read indices sorted by position inside a
        given connected component; filled in place)
    opts : dict (keywords argument containing global parameters and options)

    Returns
    -------
    None but ccs_ord is modified "passed by reference"
    """
    verbose = opts['VERB'] >= 2
    min_size = opts['MIN_CC_LEN']
    julia_path = opts['JULIA_PATH']
    julia_script = opts['JULIA_SCRIPT']

    if not isspmatrix_csr(A):
        A = A.tocsr()

    (n_comp, comp_labels) = connected_components(A, directed=False,
                                                 return_labels=True)

    for comp_idx in xrange(n_comp):
        members = np.argwhere(comp_labels == comp_idx)[:, 0]
        comp_size = len(members)
        # Components that are too small are ignored.
        if comp_size <= min_size:
            continue

        oprint(" Running spectral algorithm in connected component of size %d..."
               % (comp_size), cond=verbose)

        A_comp = A[members, :][:, members]

        # Use Julia if possible to reorder relatively large matrices
        if julia_path and (comp_size > 4000):
            order = get_fiedler_julia(A_comp, julia_path, julia_script)
        else:
            (fiedler_val, fiedler_vec) = get_fiedler(A_comp)
            if fiedler_val < 1e-12:
                oprint("\n\nWARNING ! Non connected submatrix of size %d!\n\n"
                       % (comp_size))
            order = np.argsort(fiedler_vec)

        # Bandwidth of the reordered component: largest distance to the
        # diagonal among its non-zero entries.
        (rows, cols, _) = find(A_comp[order, :][:, order])
        bandwidth = max(abs(rows - cols))

        if bandwidth < 80:
            # Ordering accepted: store it in terms of read indices.
            sorted_members = [members[pos] for pos in order]
            ccs_ord.append([cc[pos] for pos in sorted_members])
            continue

        oprint("Bandwidth larger than 80 in reordered matrix. Threshold in submatrix increased before reordering.",
               cond=verbose)
        # Raise the quantile by 0.1, or by half of the gap to 1 if smaller.
        raised_qtile = qtile + min(0.1, 0.5 * (1. - qtile))
        raised_thr = mquantiles(num_match_l, raised_qtile)
        A_comp = remove_bridge_reads(A_comp.multiply(A_comp > raised_thr))
        reads_in_comp = [cc[pos] for pos in members]
        reorder_submat(A_comp, reads_in_comp, num_match_l, raised_qtile,
                       ccs_ord, opts)

    return