def kmers_for_component(k1mer_dictionary, kmer_directory, reads, reads_files,
                        directory_name, contig_file_extension,
                        get_partition_k1mers, double_stranded=True,
                        paired_end=False, repartition=False,
                        partition_size=500, overload=1.5, K=24,
                        gpmetis_path='gpmetis', penalty=5, only_reads=False,
                        inMem=False, nJobs=1):
    """Partition large components with gpmetis and bin k1mers/reads per partition.

    This function runs gpmetis on the components above a threshold size.
    It then creates a dictionary called k1mers2component {k1mer : component}.
    It then sends any reads that share a kmer with a component to that
    component.  It then outputs the k1mers along with their weights to a file
    for each component (unless only_reads or inMem is set).

    Inputs:
        k1mer_dictionary: {k1mer: weight} used to look up k1mer weights
        kmer_directory: used instead of k1mer_dictionary, currently unused
        reads: currently unused (reads are streamed from reads_files)
        reads_files: where reads are; [single file] or [mate1, mate2]
        directory_name: where files should be stored
        contig_file_extension: the contig files are in this extension
        get_partition_k1mers: when falsy, nothing is done and None is returned
        inMem: keep partitioned reads/weights in memory instead of on disk
        nJobs: parallelism passed to the reverse-complement helpers

    Returns:
        [components_broken, new_comps, contig_weights, rps] when
        get_partition_k1mers is truthy, otherwise None.
    """
    # Append to the partitioning log if it already exists.
    if os.path.exists(directory_name + "/before_sp_log.txt"):
        f_log = open(directory_name + "/before_sp_log.txt", 'a')
    else:
        f_log = open(directory_name + "/before_sp_log.txt", 'w')

    def write_log(s):
        # Mirror every log line to the log file and stdout.
        f_log.write(s + "\n")
        print(s)

    # Counts number of components above size threshold (componentNcontigs.txt).
    Num_Components = 0
    i = 1
    while os.path.exists(directory_name + "/component" + str(i) + "contigs.txt"):
        i += 1
        Num_Components += 1

    # Counts number of components below size threshold (remaining_contigsN.txt).
    Num_Remaining_Components = 0
    i = 1
    while os.path.exists(directory_name + "/remaining_contigs" + str(i) + ".txt"):
        i += 1
        Num_Remaining_Components += 1

    def get_rmers(read, R):
        # Non-overlapping R-mers covering the read; the last one is taken
        # from the tail so the read end is always represented.
        i = 0
        k1mers = []
        while i < len(read) - R:
            k1mers.append(read[i:i + R])
            i += R
        k1mers.append(read[-R:])
        return k1mers

    def get_comps(read, k1mers2component):
        # Union of all components that share a (K+1)-mer with this read.
        k1mers = get_rmers(read, K + 1)
        comps = [k1mers2component.get(k1mer, [-1, -1]) for k1mer in k1mers]
        comp_list = [set(comp[0]) for comp in comps if comp[0] != -1]
        if comp_list:
            assigned_comp = set.union(*comp_list)
        else:
            assigned_comp = set()
        return assigned_comp

    def get_comps_paired(read1, read2, k1mers2component):
        # Components hit by either mate of a read pair.
        return set.union(get_comps(read1, k1mers2component),
                         get_comps(read2, k1mers2component))

    if not get_partition_k1mers:
        # Nothing to do; the original implicitly returned None here.
        f_log.close()  # FIX: log handle was never closed
        return None

    # gpmetis load-imbalance tolerance, derived from the overload factor.
    ufactor = int(1000.0 * overload - 1000.0)
    components_broken = {}
    temp_string = ""
    # Runs gpmetis on each large component's graph file.
    for i in range(Num_Components):
        with open(directory_name + "/component" + str(i + 1) +
                  contig_file_extension, 'r') as f:
            lines = f.readlines()
        num_contigs = len(lines)
        # At most 100 partitions per component.
        Partitions = min(
            int(math.ceil(float(num_contigs) / float(partition_size))), 100)
        components_broken[i] = Partitions
        temp_string += "Component " + str(i) + ": " + str(Partitions) + " partitions, "
        if len(lines) >= 2:
            run_cmd(gpmetis_path + " -ufactor=" + str(ufactor) + " " +
                    directory_name + "/component" + str(i + 1) + ".txt" +
                    " " + str(Partitions))
            if repartition:
                # Create GPMETIS file for repartition.
                partition_file = "/component" + str(i + 1) + ".txt" + ".part." + str(components_broken[i])
                og_graph_file = "/component" + str(i + 1) + ".txt"
                new_graph_file = "/component" + str(i + 1) + "r2.txt"
                contig_file = "/component" + str(i + 1) + contig_file_extension
                new_contig_file = contig_file
                # Currently unused option.
                randomize = False
                write_log(str(time.asctime()) + ": " + "Creating graph for repartition ")
                weight_updated_graph(directory_name, partition_file,
                                     og_graph_file, new_graph_file,
                                     contig_file, new_contig_file, penalty,
                                     randomize)
                write_log(str(time.asctime()) + ": " + "Created graph for repartition ")
                # Rerun GPMETIS on the penalty-weighted graph.
                run_cmd(gpmetis_path + " -ufactor=" + str(ufactor) + " " +
                        directory_name + "/component" + str(i + 1) +
                        "r2.txt " + str(Partitions))
    write_log(str(time.asctime()) + ": " +
              "gpmetis for partitioning is complete \n " + temp_string)

    new_components = {}    # {partition name: [contigs in that partition]}
    k1mers2component = {}  # {k1mer: [set(partition names), weight]}

    def _index_contig(comp, contig):
        # Register contig under partition comp and map each of its (K+1)-mers
        # to comp.  (This logic was triplicated verbatim in the original.)
        if comp not in new_components:
            new_components[comp] = [contig]
        else:
            new_components[comp].append(contig)
        for each in range(len(contig) - (K + 1) + 1):
            k1mer = contig[each:each + (K + 1)]
            if k1mer not in k1mers2component:
                k1mers2component[k1mer] = [set([comp]),
                                           k1mer_dictionary.get(k1mer, 0)]
            else:
                k1mers2component[k1mer][0].add(comp)

    # Builds the k1mers2component dictionary from the gpmetis .part files.
    for i in components_broken:
        with open(directory_name + "/component" + str(i + 1) +
                  contig_file_extension, 'r') as f_contigs:
            contig_lines = f_contigs.readlines()
        with open(directory_name + "/component" + str(i + 1) + ".txt.part." +
                  str(components_broken[i]), 'r') as f_component:
            # Line j of the .part file gives the partition of contig j.
            j = 0
            for line in f_component:
                tokens = line.split()
                _index_contig('c' + str(i + 1) + "_" + tokens[0],
                              contig_lines[j].split()[0])
                j += 1
        if repartition:
            with open(directory_name + "/component" + str(i + 1) +
                      "r2.txt.part." + str(components_broken[i]),
                      'r') as f_component:
                j = 0
                for line in f_component:
                    tokens = line.split()
                    _index_contig('r2_c' + str(i + 1) + "_" + tokens[0],
                                  contig_lines[j].split()[0])
                    j += 1

    # Adds remaining (small) components to k1mers2component, one partition each.
    for i in range(Num_Remaining_Components):
        with open(directory_name + "/remaining_contigs" + str(i + 1) + ".txt",
                  'r') as remaining_contigs_file:
            comp = "cremaining" + str(i + 1)
            for line in remaining_contigs_file:
                _index_contig(comp, line.split()[0])
    write_log(str(time.asctime()) + ": " + "k1mers2component dictionary created ")

    NR = 10000000  # reads per batch before flushing to disk (when not inMem)
    last_read = ''
    if not paired_end:
        # Assigns reads to components in the non paired end case.
        read_ctr = 0
        offset = {}         # per-partition running read count, for FASTA ids
        read_part_seq = {}  # per-partition buffered read sequences
        for comp in new_components:
            read_part_seq[comp] = []
            offset[comp] = 0
        f = open(reads_files[0], 'r')
        while 1:
            reads = []
            while 1:
                readname = f.readline()[:-1]
                if not readname:
                    last_read = readname  # empty string signals EOF
                    break
                read = f.readline()[:-1]
                if read.strip('ACTG'):
                    continue  # contains characters other than ACTG
                read_ctr += 1
                reads.append(read)
                last_read = read
                if (read_ctr % NR) == 0 or (not read):
                    break
            if double_stranded:
                # Augment the batch with reverse complements (parallel helper).
                reads = par_SE_rc(reads, nJobs)
            for read in reads:
                assigned_comp = get_comps(read, k1mers2component)
                for each_comp in assigned_comp:
                    read_part_seq[each_comp].append(read)
            if not inMem:
                # Flush this batch to per-partition FASTA files.
                for comp in new_components:
                    read_part_file = open(
                        directory_name + "/reads" + str(comp) + ".fasta", 'a')
                    read_part_file.write("".join([
                        '>' + str(e + offset.get(comp, 0)) + '\n' + read + '\n'
                        for (e, read) in enumerate(read_part_seq[comp])
                    ]))
                    read_part_file.close()
                    offset[comp] = offset.get(comp, 0) + len(read_part_seq[comp])
                    read_part_seq[comp][:] = []
            if not last_read:
                break
        f.close()  # FIX: reads file handle was never closed

    # Assigns reads to components in the paired end case.
    elif paired_end == True:
        read_ctr = 0
        offset = {}
        read1_part_seq = {}
        read2_part_seq = {}
        for comp in new_components:
            read1_part_seq[comp] = []
            read2_part_seq[comp] = []
            offset[comp] = 0
        f1 = open(reads_files[0], 'r')
        f2 = open(reads_files[1], 'r')
        while 1:
            reads_1 = []
            reads_2 = []
            while 1:
                readname_1 = f1.readline()[:-1]
                readname_2 = f2.readline()[:-1]
                if not readname_1:
                    last_read = readname_1  # empty string signals EOF
                    break
                read_1 = f1.readline()[:-1]
                read_2 = f2.readline()[:-1]
                if read_1.strip('ACTG') or read_2.strip('ACTG'):
                    continue  # skip pairs with non-ACTG characters
                read_ctr += 1
                reads_1.append(read_1)
                reads_2.append(read_2)
                last_read = read_1
                if (read_ctr % NR) == 0 or (not read_1):
                    break
            if double_stranded:
                reads = par_PE_rc(reads_1, reads_2, double_stranded, nJobs)
            if not double_stranded:
                reads = [[reads_1[i], reads_2[i]] for i in range(len(reads_1))]
            del reads_1, reads_2
            for read in reads:
                assigned_comp = get_comps_paired(read[0], read[1],
                                                 k1mers2component)
                for each_comp in assigned_comp:
                    read1_part_seq[each_comp].append(read[0])
                    read2_part_seq[each_comp].append(read[1])
            if not inMem:
                # Flush this batch to per-partition mate FASTA files.
                for comp in new_components:
                    read1_part_file = open(
                        directory_name + "/reads" + str(comp) + "_1.fasta", 'a')
                    read2_part_file = open(
                        directory_name + "/reads" + str(comp) + "_2.fasta", 'a')
                    read1_part_file.write("".join([
                        '>' + str(e + offset.get(comp, 0)) + '_1\n' + read + '\n'
                        for (e, read) in enumerate(read1_part_seq[comp])
                    ]))
                    read2_part_file.write("".join([
                        '>' + str(e + offset.get(comp, 0)) + '_2\n' + read + '\n'
                        for (e, read) in enumerate(read2_part_seq[comp])
                    ]))
                    read1_part_file.close()
                    read2_part_file.close()
                    offset[comp] = offset.get(comp, 0) + len(read1_part_seq[comp])
                    read1_part_seq[comp][:] = []
                    read2_part_seq[comp][:] = []
            if not last_read:
                break
        f1.close()  # FIX: reads file handles were never closed
        f2.close()
        if not inMem:
            # NOTE(review): redundant final flush kept from the original — the
            # buffers are already empty here, so this writes nothing.
            for comp in new_components:
                read1_part_file = open(
                    directory_name + "/reads" + str(comp) + "_1.fasta", 'a')
                read2_part_file = open(
                    directory_name + "/reads" + str(comp) + "_2.fasta", 'a')
                read1_part_file.write("".join([
                    '>' + str(e + offset.get(comp, 0)) + '_1\n' + read + '\n'
                    for (e, read) in enumerate(read1_part_seq[comp])
                ]))
                read2_part_file.write("".join([
                    '>' + str(e + offset.get(comp, 0)) + '_2\n' + read + '\n'
                    for (e, read) in enumerate(read2_part_seq[comp])
                ]))
                read1_part_file.close()
                read2_part_file.close()
                offset[comp] = offset.get(comp, 0) + len(read1_part_seq[comp])
        elif inMem:
            for comp in new_components:
                # Keep only the reads, not the names.
                rps1 = [read1_part_seq[comp]]
                rps2 = [read2_part_seq[comp]]
                read1_part_seq[comp] = rps1
                read2_part_seq[comp] = rps2
    write_log(str(time.asctime()) + ": " + "reads partititoned ")

    contig_weights = {}
    if not only_reads:
        # If only_reads, no need to write k1mers.
        write_log(str(time.asctime()) + ": Writing k1mers to file")
        # Writes out k1mers with weights for each partition.
        for comp in new_components:
            with open(directory_name + "/component" + comp +
                      "k1mers_allowed.dict", 'w') as k1mer_file:
                k1mer_file_data = []
                contig_weights[comp] = []
                for contig in new_components[comp]:
                    weight_list = []
                    for i in range(len(contig) - (K + 1) + 1):
                        k1mer = contig[i:i + (K + 1)]
                        k1mer_wt = k1mers2component[k1mer][1]
                        weight_list.append(k1mer_wt)
                        if not inMem:
                            k1mer_file_data.append(k1mer + "\t" + str(k1mer_wt) + "\n")
                    if inMem:
                        contig_weights[comp].append(weight_list)
                k1mer_file.writelines(k1mer_file_data)
        write_log(str(time.asctime()) + ": " + "k1mers written to file ")
        write_log(str(time.asctime()) + ": " + "kmers written to file " + "\n")

    if inMem:
        new_comps = new_components
    else:
        # Reseat new_comps and contig_weights.
        new_comps = [c for c in new_components]
        contig_weights = []
    rps = {}
    if inMem:
        if paired_end:
            for comp in new_components:
                rps[comp] = [read1_part_seq[comp], read2_part_seq[comp]]
        else:
            for comp in new_components:
                rps[comp] = [read_part_seq[comp]]
    f_log.close()  # FIX: log handle was never closed
    return [components_broken, new_comps, contig_weights, rps]
def kmers_for_component(k1mer_dictionary, kmer_directory, reads_files,
                        directory_name, contig_file_extension,
                        get_partition_k1mers, double_stranded=True,
                        paired_end=False, repartition=False,
                        partition_size=500, overload=1.5, K=24,
                        gpmetis_path='gpmetis', penalty=5, only_reads=False):
    """Partition large components with gpmetis and bin k1mers/reads per partition.

    NOTE(review): this redefines the kmers_for_component name used by the
    earlier variant in this file; the later definition wins at import time.

    This function runs gpmetis on the components above a threshold size.
    It then creates a dictionary called k1mers2component {k1mer : component}.
    It then sends any reads that share a kmer with a component to that
    component, writing per-partition FASTA files, and (unless only_reads)
    writes the k1mers with their weights to a file per partition.

    Inputs:
        k1mer_dictionary: {k1mer: weight} used to look up k1mer weights
        kmer_directory: used instead of k1mer_dictionary, currently unused
        reads_files: where reads are; [single file] or [mate1, mate2]
        directory_name: where files should be stored
        contig_file_extension: the contig files are in this extension
        get_partition_k1mers: when falsy, nothing is done and None is returned

    Returns:
        [components_broken, newcomps] when get_partition_k1mers is truthy,
        otherwise None.
    """
    # Append to the partitioning log if it already exists.
    if os.path.exists(directory_name + "/before_sp_log.txt"):
        f_log = open(directory_name + "/before_sp_log.txt", 'a')
    else:
        f_log = open(directory_name + "/before_sp_log.txt", 'w')

    def write_log(s):
        # Mirror every log line to the log file and stdout.
        f_log.write(s + "\n")
        print(s)

    # Counts number of components above size threshold (componentNcontigs.txt).
    Num_Components = 0
    i = 1
    while os.path.exists(directory_name + "/component" + str(i) + "contigs.txt"):
        i += 1
        Num_Components += 1

    # Counts number of components below size threshold (remaining_contigsN.txt).
    Num_Remaining_Components = 0
    i = 1
    while os.path.exists(directory_name + "/remaining_contigs" + str(i) + ".txt"):
        i += 1
        Num_Remaining_Components += 1

    def get_rmers(read, R):
        # Non-overlapping R-mers covering the read; the last one is taken
        # from the tail so the read end is always represented.
        i = 0
        k1mers = []
        while i < len(read) - R:
            k1mers.append(read[i:i + R])
            i += R
        k1mers.append(read[-R:])
        return k1mers

    def get_comps(read, k1mers2component):
        # Union of all components that share a (K+1)-mer with this read.
        k1mers = get_rmers(read, K + 1)
        comps = [k1mers2component.get(k1mer, [-1, -1]) for k1mer in k1mers]
        comp_list = [set(comp[0]) for comp in comps if comp[0] != -1]
        if comp_list:
            assigned_comp = set.union(*comp_list)
        else:
            assigned_comp = set()
        return assigned_comp

    def get_comps_paired(read1, read2, k1mers2component):
        # Components hit by either mate of a read pair.
        return set.union(get_comps(read1, k1mers2component),
                         get_comps(read2, k1mers2component))

    if not get_partition_k1mers:
        # Nothing to do; the original implicitly returned None here.
        f_log.close()  # FIX: log handle was never closed
        return None

    # gpmetis load-imbalance tolerance, derived from the overload factor.
    ufactor = int(1000.0 * overload - 1000.0)
    components_broken = {}
    temp_string = ""
    # Runs gpmetis on each large component's graph file.
    for i in range(Num_Components):
        with open(directory_name + "/component" + str(i + 1) +
                  contig_file_extension, 'r') as f:
            lines = f.readlines()
        num_contigs = len(lines)
        # At most 100 partitions per component.
        Partitions = min(
            int(math.ceil(float(num_contigs) / float(partition_size))), 100)
        components_broken[i] = Partitions
        temp_string += "Component " + str(i) + ": " + str(Partitions) + " partitions, "
        if len(lines) >= 2:
            run_cmd(gpmetis_path + " -ufactor=" + str(ufactor) + " " +
                    directory_name + "/component" + str(i + 1) + ".txt" +
                    " " + str(Partitions))
            if repartition:
                # Create GPMETIS file for repartition.
                partition_file = "/component" + str(i + 1) + ".txt" + ".part." + str(components_broken[i])
                og_graph_file = "/component" + str(i + 1) + ".txt"
                new_graph_file = "/component" + str(i + 1) + "r2.txt"
                contig_file = "/component" + str(i + 1) + contig_file_extension
                new_contig_file = contig_file
                # Currently unused option.
                randomize = False
                write_log(str(time.asctime()) + ": " + "Creating graph for repartition ")
                weight_updated_graph(directory_name, partition_file,
                                     og_graph_file, new_graph_file,
                                     contig_file, new_contig_file, penalty,
                                     randomize)
                write_log(str(time.asctime()) + ": " + "Created graph for repartition ")
                # Rerun GPMETIS on the penalty-weighted graph.
                run_cmd(gpmetis_path + " -ufactor=" + str(ufactor) + " " +
                        directory_name + "/component" + str(i + 1) +
                        "r2.txt " + str(Partitions))
    write_log(str(time.asctime()) + ": " +
              "gpmetis for partitioning is complete \n " + temp_string)

    new_components = {}    # {partition name: [contigs in that partition]}
    k1mers2component = {}  # {k1mer: [set(partition names), weight]}

    def _index_contig(comp, contig):
        # Register contig under partition comp and map each of its (K+1)-mers
        # to comp.  (This logic was triplicated verbatim in the original.)
        if comp not in new_components:
            new_components[comp] = [contig]
        else:
            new_components[comp].append(contig)
        for each in range(len(contig) - (K + 1) + 1):
            k1mer = contig[each:each + (K + 1)]
            if k1mer not in k1mers2component:
                k1mers2component[k1mer] = [set([comp]),
                                           k1mer_dictionary.get(k1mer, 0)]
            else:
                k1mers2component[k1mer][0].add(comp)

    # Builds the k1mers2component dictionary from the gpmetis .part files.
    for i in components_broken:
        with open(directory_name + "/component" + str(i + 1) +
                  contig_file_extension, 'r') as f_contigs:
            contig_lines = f_contigs.readlines()
        with open(directory_name + "/component" + str(i + 1) + ".txt.part." +
                  str(components_broken[i]), 'r') as f_component:
            # Line j of the .part file gives the partition of contig j.
            j = 0
            for line in f_component:
                tokens = line.split()
                _index_contig('c' + str(i + 1) + "_" + tokens[0],
                              contig_lines[j].split()[0])
                j += 1
        if repartition:
            with open(directory_name + "/component" + str(i + 1) +
                      "r2.txt.part." + str(components_broken[i]),
                      'r') as f_component:
                j = 0
                for line in f_component:
                    tokens = line.split()
                    _index_contig('r2_c' + str(i + 1) + "_" + tokens[0],
                                  contig_lines[j].split()[0])
                    j += 1

    # Adds remaining (small) components to k1mers2component, one partition each.
    for i in range(Num_Remaining_Components):
        with open(directory_name + "/remaining_contigs" + str(i + 1) + ".txt",
                  'r') as remaining_contigs_file:
            comp = "cremaining" + str(i + 1)
            for line in remaining_contigs_file:
                _index_contig(comp, line.split()[0])
    write_log(str(time.asctime()) + ": " + "k1mers2component dictionary created ")

    read_line = ''
    # Assigns reads to components in the non paired end case.
    if paired_end == False:
        read_part_seq = {}  # per-partition buffered FASTA lines
        for comp in new_components:
            read_part_seq[comp] = []
        with open(reads_files[0]) as readfile:
            for line in readfile:
                if line.split()[0][0] == ">":
                    read_line = line  # remember the FASTA header of this read
                else:
                    read = line.split()[0]
                    if read.strip('ACTG'):
                        continue  # contains characters other than ACTG
                    assigned_comp = get_comps(read, k1mers2component)
                    for each_comp in assigned_comp:
                        read_part_seq[each_comp].append(read_line)
                        read_part_seq[each_comp].append(line)
                    if double_stranded:
                        # Also place the reverse complement, with a tagged name.
                        rc_read = reverse_complement(read)
                        assigned_comp = get_comps(rc_read, k1mers2component)
                        for each_comp in assigned_comp:
                            reversed_read_name = read_line.split()[0] + '_reversed' + '\t' + '\t'.join(read_line.split()[1:])
                            read_part_seq[each_comp].append(reversed_read_name + '\n')
                            read_part_seq[each_comp].append(rc_read + '\n')
        for comp in new_components:
            read_part_file = open(directory_name + "/reads" + str(comp) + ".fasta", 'w')
            read_part_file.write("".join(read_part_seq[comp]))
            read_part_file.close()

    # Assigns reads to components in the paired end case.
    elif paired_end == True:
        read1_part_seq = {}
        read2_part_seq = {}
        for comp in new_components:
            read1_part_seq[comp] = []
            read2_part_seq[comp] = []
        # FIX: the original called open() on both reads files here and then
        # immediately shadowed the handles with the with-statement below,
        # leaking two file descriptors.  The redundant opens are removed.
        read_line1 = ''
        read_line2 = ''
        with open(reads_files[0]) as readfile1, open(reads_files[1]) as readfile2:
            for line1, line2 in zip(readfile1, readfile2):
                if line1.split()[0][0] == ">":
                    assert line2.split()[0][0] == ">"
                    read_line1 = line1
                    read_line2 = line2
                else:
                    assert line2.split()[0][0] != ">"
                    read1 = line1.split()[0]
                    read2 = line2.split()[0]
                    if read1.strip('ACTG') or read2.strip('ACTG'):
                        continue  # Dont write 'N' reads
                    read1_reversed = reverse_complement(read1)
                    read2_reversed = reverse_complement(read2)
                    # First process (read1, read2_reversed).
                    assigned_comp = get_comps_paired(read1, read2_reversed,
                                                     k1mers2component)
                    for each_comp in assigned_comp:
                        read1_part_seq[each_comp].append(read_line1)
                        read1_part_seq[each_comp].append(line1)
                        read2_part_seq[each_comp].append(read_line2)
                        read2_part_seq[each_comp].append(line2)
                    if double_stranded:
                        # Now process (read1_reversed, read2).
                        assigned_comp = get_comps_paired(read1_reversed, read2,
                                                         k1mers2component)
                        for each_comp in assigned_comp:
                            reversed_read1_name = read_line1.split()[0] + '_reversed' + '\t' + '\t'.join(read_line1.split()[1:])
                            reversed_read2_name = read_line2.split()[0] + '_reversed' + '\t' + '\t'.join(read_line2.split()[1:])
                            read1_part_seq[each_comp].append(reversed_read1_name + '\n')
                            read1_part_seq[each_comp].append(read1_reversed + '\n')
                            read2_part_seq[each_comp].append(reversed_read2_name + '\n')
                            read2_part_seq[each_comp].append(read2_reversed + '\n')
        for comp in new_components:
            read1_part_file = open(directory_name + "/reads" + str(comp) + "_1.fasta", 'w')
            read2_part_file = open(directory_name + "/reads" + str(comp) + "_2.fasta", 'w')
            read1_part_file.write("".join(read1_part_seq[comp]))
            read2_part_file.write("".join(read2_part_seq[comp]))
            read1_part_file.close()
            read2_part_file.close()
    write_log(str(time.asctime()) + ": " + "reads partititoned ")

    if not only_reads:
        # If only_reads, no need to write k1mers.
        write_log(str(time.asctime()) + ": Writing k1mers to file")
        # Writes out k1mers with weights for each partition.
        for comp in new_components:
            with open(directory_name + "/component" + comp +
                      "k1mers_allowed.dict", 'w') as k1mer_file:
                k1mer_file_data = []
                for contig in new_components[comp]:
                    for i in range(len(contig) - (K + 1) + 1):
                        k1mer = contig[i:i + (K + 1)]
                        k1mer_file_data.append(k1mer + "\t" + str(k1mers2component[k1mer][1]) + "\n")
                k1mer_file.writelines(k1mer_file_data)
        write_log(str(time.asctime()) + ": " + "k1mers written to file ")
        write_log(str(time.asctime()) + ": " + "kmers written to file " + "\n")

    newcomps = [c for c in new_components]
    f_log.close()  # FIX: log handle was never closed
    return [components_broken, newcomps]
# If "use_second_partition", rerun gpmetis with a penalization for contig edges broken in old partitioning # This to give a new partitioning for each component of size above "partition_size" # Gets k1mers, kmers, and reads for each partition if use_second_iteration: r2_graph_file_extension = "r2.txt" r2_new_kmer_tag = "r2" r2_contig_file_extension = "r2.txt" for i in components_broken: partition_file = "/component" + str(i+1) + r1_graph_file_extension + ".part." + str(components_broken[i]) og_graph_file = "/component" + str(i+1) + r1_graph_file_extension new_graph_file = "/component" + str(i+1) + r2_graph_file_extension contig_file = "/component" + str(i+1) + r1_contig_file_extension new_contig_file = "/component" + str(i+1) + r2_contig_file_extension weight_updated_graph(base_directory_name, partition_file, og_graph_file, new_graph_file, contig_file, new_contig_file, penalty, randomize) get_og_comp_kmers = 0 get_partition_kmers = 1 [r2_components_broken, r2_new_components] = kmers_for_component(kmer_directory, reads_files, base_directory_name, r1_contig_file_extension, r2_new_kmer_tag, r2_graph_file_extension, get_og_comp_kmers, get_partition_kmers, double_stranded, paired_end, True, partition_size, overload, K, gpmetis_path) # This counts remaining and non-remaining partitions for log. for part in r2_new_components: num_non_remaining += 1 # This code updates the log if os.path.exists(comp_directory_name+"/before_sp_log.txt"): f_log = open(comp_directory_name+"/before_sp_log.txt", 'a') else: f_log = open(comp_directory_name+"/before_sp_log.txt", 'w') f_log.write(str(time.asctime()) + ": " +"Number of simple Partitions: " + str(num_remaining) + "\n")
# Gets k1mers, kmers, and reads for each partition if use_second_iteration: r2_graph_file_extension = "r2.txt" r2_new_kmer_tag = "r2" r2_contig_file_extension = "r2.txt" for i in components_broken: partition_file = "/component" + str( i + 1) + r1_graph_file_extension + ".part." + str( components_broken[i]) og_graph_file = "/component" + str(i + 1) + r1_graph_file_extension new_graph_file = "/component" + str(i + 1) + r2_graph_file_extension contig_file = "/component" + str(i + 1) + r1_contig_file_extension new_contig_file = "/component" + str(i + 1) + r2_contig_file_extension weight_updated_graph(base_directory_name, partition_file, og_graph_file, new_graph_file, contig_file, new_contig_file, penalty, randomize) get_og_comp_kmers = 0 get_partition_kmers = 1 [r2_components_broken, r2_new_components ] = kmers_for_component(kmer_directory, reads_files, base_directory_name, r1_contig_file_extension, r2_new_kmer_tag, r2_graph_file_extension, get_og_comp_kmers, get_partition_kmers, double_stranded, paired_end, True, partition_size, overload, K) # This counts remaining and non-remaining partitions for log. for part in r2_new_components: num_non_remaining += 1