def write_cluster(file_name, sequence_names, sequence_dict):
    """
    Writes one cluster to '<file_name>.fasta', with any spaces in
    file_name replaced by underscores.

    Each name in sequence_names is paired with the sequence stored under
    that name in sequence_dict; a name missing from sequence_dict raises
    KeyError before anything is written.
    """
    output_base = file_name.replace(' ', '_')

    # Materialise the names once so names and sequences share one ordering.
    cluster_names = list(sequence_names)
    cluster_sequences = [sequence_dict[name] for name in cluster_names]

    oligo.write_fastas(cluster_names, cluster_sequences,
                       output_base + ".fasta")
def main():
    """
    Reads the fasta named by the first command-line argument, prints the
    total number of entries and the number of distinct sequences, then
    writes the result of oligo.get_unique_sequences to '<input>_unique'.
    """
    fasta_path = sys.argv[1]
    all_names, all_sequences = oligo.read_fasta_lists(fasta_path)

    print("Number of oligos in original design: %d" % len(all_names))
    print("Number of unique oligos: %d" % len(set(all_sequences)))

    unique_names, unique_sequences = oligo.get_unique_sequences(
        all_names, all_sequences)
    oligo.write_fastas(unique_names, unique_sequences,
                       fasta_path + "_unique")
def write_large_cluster(names_list, sequence_list, file_name):
    """
    Writes a cluster to '<file_name>_too_large.fasta' inside the
    'large_clusters' directory (relative to the current working
    directory), creating that directory if it does not exist.

    Spaces in file_name are replaced by underscores. The process working
    directory is temporarily changed to 'large_clusters' for the write
    and is always restored afterwards, even if the write fails.
    """
    file_name = file_name.replace(' ', '_')
    dir_for_clusters = "large_clusters"

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(dir_for_clusters, exist_ok=True)

    # Remember where we started: restoring with ".." would land in the
    # wrong place if 'large_clusters' is a symlink, and would never run
    # at all if the write raised.
    starting_dir = os.getcwd()
    os.chdir(dir_for_clusters)
    try:
        oligo.write_fastas(names_list, sequence_list,
                           str(file_name) + "_too_large.fasta")
    finally:
        os.chdir(starting_dir)
def write_outputs(dict_to_write, filename):
    """
    Writes all name:sequence pairings stored in dict_to_write to the
    filename specified.

    The keys of dict_to_write are used as the names and the matching
    values as the sequences, in the dictionary's iteration order.
    """
    # keys() and values() of an unmodified dict iterate in the same
    # order, so the two lists stay aligned.
    names = list(dict_to_write.keys())
    sequences = list(dict_to_write.values())

    # write_fastas takes (names, sequences, output name), as at the
    # other call sites in this file.
    oligo.write_fastas(names, sequences, filename)
def main():
    """
    Command-line entry point: generates args.generate sequences with
    generate_aa_sequence(args.length), names them '<prefix>_<index>'
    (index starting at 0), and writes them with oligo.write_fastas to
    the file given by --filename.
    """
    parser = argparse.ArgumentParser(
        description="Randomly generate a fasta "
                    "containing AA sequences."
    )
    parser.add_argument('-f', '--filename')
    parser.add_argument('-l', '--length', type=int)
    parser.add_argument('-g', '--generate', type=int)
    parser.add_argument(
        '-p', '--prefix',
        help="The prefix for each "
             "generated sequence. ",
        default="sequence",
    )
    options = parser.parse_args()

    how_many = range(options.generate)
    names = [f'{options.prefix}_{number}' for number in how_many]
    # One generator call per name, in the same order as the names.
    sequences = [generate_aa_sequence(options.length) for _ in how_many]

    oligo.write_fastas(names, sequences, output_name=options.filename)
def main():
    """
    Parse a representative map and split its entries into a sprot fasta
    and a trembl fasta.

    For every group returned by parse_map(args.map_file), the first
    member whose (whitespace-stripped) name is present in the sprot
    fasta is copied to the sprot output. If no member of the group is in
    sprot, the group's first member is looked up in the trembl fasta and
    copied to the trembl output.

    Outputs are written to '<map_file>_sprot' and '<map_file>_trembl'.

    Raises:
        KeyError: if a group has no sprot member and its first member is
            not present in the trembl fasta.
    """
    arg_parser = argparse.ArgumentParser(
        description=
        "Parse representative output map to produce a sprot/trembl file")
    arg_parser.add_argument('-s', '--sprot',
                            help="Input sprot fasta to parse")
    arg_parser.add_argument('-t', '--trembl',
                            help="Input trembl file to parse")
    arg_parser.add_argument('-m', '--map_file',
                            help="Input map file to parse.")
    args = arg_parser.parse_args()

    out_sprot_name = args.map_file + "_sprot"
    out_trembl_name = args.map_file + "_trembl"

    sprot_names, sprot_seqs = oligo.read_fasta_lists(args.sprot)
    trembl_names, trembl_seqs = oligo.read_fasta_lists(args.trembl)

    # name -> sequence lookups for both inputs (later duplicates win).
    in_sprot_seqs = dict(zip(sprot_names, sprot_seqs))
    in_trembl_seqs = dict(zip(trembl_names, trembl_seqs))

    out_sprot_seqs = {}
    out_trembl_seqs = {}

    for current in parse_map(args.map_file):
        found_in_sprot = False
        for inner in current:
            # Strip once and use the same key for the membership test,
            # the lookup and the output. Testing the raw name but
            # indexing with the stripped one could miss sprot hits or
            # raise KeyError.
            member = inner.strip()
            if member in in_sprot_seqs:
                out_sprot_seqs[member] = in_sprot_seqs[member]
                found_in_sprot = True
                break

        if not found_in_sprot:
            representative = current[0].strip()
            out_trembl_seqs[representative] = in_trembl_seqs[representative]

    oligo.write_fastas(list(out_sprot_seqs.keys()),
                       list(out_sprot_seqs.values()),
                       out_sprot_name)
    oligo.write_fastas(list(out_trembl_seqs.keys()),
                       list(out_trembl_seqs.values()),
                       out_trembl_name)
def main():
    """
    Greedy, randomised selection of Ymers from the input fasta.

    Reads names/sequences from options.query, then repeats the following
    options.iterations times:

      1. Build xmer_seq_dict: every valid window of size
         options.XmerWindowSize maps to [options.redundancy, window name].
      2. Build ymer_seq_dict: every valid window of size
         options.YmerWindowSize maps to its window name.
      3. Repeatedly score every remaining Ymer with calculate_score, pick
         one of the top-scoring Ymers at random, decrement the counter of
         each Xmer returned for it (never below zero), and move the Ymer
         into array_design.
      4. Stop when no Ymers remain or the best score is <= 0, print
         summary statistics, and keep this run if it used fewer Ymers
         than any previous run.

    Finally the best run's Xmer dictionary is passed to write_outputs and
    its Ymers are written to
    '<options.outPut>_R<options.redundancy>.fasta'.

    NOTE(review): best_xmer_seq_dict / best_array_design are only bound
    inside the loop; if options.iterations is 0 the code after the loop
    would raise NameError - confirm callers always pass >= 1.
    """
    usage = "usage: %prog [options]"
    option_parser = optparse.OptionParser(usage)
    add_program_options(option_parser)
    # 'arguments' (positional args) is not used below.
    options, arguments = option_parser.parse_args()
    names, sequences = oligo.read_fasta_lists(options.query)

    # Sentinel larger than any realistic design size, so the first
    # completed run is always accepted as the best so far.
    min_ymers = 999999999999999999999999999999999

    # 'i' is only a repeat counter; each pass rebuilds both dictionaries.
    for i in range(options.iterations):
        xmer_seq_dict = {}
        # create list of Xmer sequences
        # NOTE: the inner loop reuses the name 'index'. The outer loop is
        # unaffected because its values come from its own range iterator,
        # and names[index]/sequences[index] are read before the rebind.
        for index in range(len(sequences)):
            name, sequence = oligo.subset_lists_iter(
                names[index], sequences[index],
                options.XmerWindowSize, options.stepSize)
            for index in range(len(sequence)):
                if oligo.is_valid_sequence(sequence[index],
                                           options.minLength,
                                           options.percentValid):
                    # value[0] is the remaining-redundancy counter that
                    # is decremented as Ymers are chosen below.
                    value = [options.redundancy, name[index]]
                    xmer_seq_dict[sequence[index]] = value

        # create dict of Ymer sequences
        ymer_seq_dict = {}
        # Break each ymer up into subsets of xmer size
        for index in range(len(sequences)):
            name, sequence = oligo.subset_lists_iter(
                names[index], sequences[index],
                options.YmerWindowSize, options.stepSize)
            for index in range(len(sequence)):
                if oligo.is_valid_sequence(sequence[index],
                                           options.minLength,
                                           options.percentValid):
                    ymer_seq_dict[sequence[index]] = name[index]

        # Denominator for the "% of total" figure printed at the end.
        total_ymers = len(ymer_seq_dict)

        array_design = {}   # chosen Ymer sequence -> name
        array_xmers = {}    # Xmer -> number of chosen Ymers it came from
        # NOTE(review): to_add / ymer_xmers are initialised once here and
        # only reset when a score strictly exceeds max_score, so a pass in
        # which every score is 0 appends to the previous pass's lists -
        # confirm this is intended.
        to_add = []
        ymer_xmers = []
        iter_count = 0

        while True:
            #reset max score at the beginning of each iteration
            max_score = 0
            for current_ymer in ymer_seq_dict.keys():
                # calculate the score of this ymer
                # presumably subset_ymer is the collection of Xmers that
                # contributed to the score - verify against calculate_score
                score, subset_ymer = calculate_score(
                    current_ymer, xmer_seq_dict,
                    options.XmerWindowSize, 1)
                if score > max_score:
                    # New best score: restart the tie lists with this Ymer.
                    to_add = list()
                    max_score = score
                    to_add.append(current_ymer)
                    ymer_xmers = [subset_ymer]
                elif score == max_score:
                    # Tie with the current best: keep as a candidate.
                    to_add.append(current_ymer)
                    ymer_xmers.append(subset_ymer)

            # Break ties at random; the same index selects the Ymer and
            # its matching Xmer collection.
            random_index = random.choice(range(len(to_add)))
            oligo_to_remove = to_add[random_index]
            chosen_xmers = ymer_xmers[random_index]

            # array_xmers.update(chosen_xmers)
            for each in chosen_xmers:
                array_xmers[each] = array_xmers.get(each, 0) + 1

            # subtract from the score of each xmer within the chosen ymer
            for item in chosen_xmers:
                if item in xmer_seq_dict:
                    # We don't want negative scores
                    if xmer_seq_dict[item][0] > 0:
                        xmer_seq_dict[item][0] -= 1
                else:
                    print("%s - not found in xmer dict!!!" % (item))

            iter_count += 1

            # Termination: nothing left to pick, or no remaining Ymer
            # scored above zero in this pass.
            if len(ymer_seq_dict) == 0 or max_score <= 0:
                print("Final design includes %d %d-mers (%.1f%% of total) "
                      % (len(array_design), options.YmerWindowSize,
                         (len(array_design) / float(total_ymers)) * 100))
                # average_redundancy = sum( xmer_seq_dict[ item ][ 0 ] for item in xmer_seq_dict ) / len( xmer_seq_dict )
                print("%d unique %d-mers in final %d-mers (%.2f%% of total)"
                      % (len(array_xmers), options.XmerWindowSize,
                         options.YmerWindowSize,
                         (float(len(array_xmers)) / len(xmer_seq_dict)) * 100))
                print("Average redundancy of %d-mers in %d-mers: %.2f"
                      % (options.XmerWindowSize, options.YmerWindowSize,
                         sum(array_xmers.values()) / float(len(array_xmers))))

                # Keep this run only if it is strictly smaller than the
                # best design seen so far.
                if len(array_design) < min_ymers:
                    min_ymers = len(array_design)
                    best_xmer_seq_dict = xmer_seq_dict
                    del (xmer_seq_dict)
                    best_array_design = array_design
                    del (array_design)
                break

            # Move the chosen Ymer from the candidate pool to the design.
            try:
                array_design[oligo_to_remove] = ymer_seq_dict[oligo_to_remove]
                del ymer_seq_dict[oligo_to_remove]
            except KeyError:
                # The chosen Ymer is no longer in the pool; skip the
                # progress report and start the next pass.
                continue

            # Progress report every 250 selections.
            if not iter_count % 250:
                print("Current Iteration: " + str(iter_count))
                # print( "Number of output ymers: " + str( len( array_design ) ) )
                print("Current xmer dictionary score: "
                      + str(sum(item[0] for item in xmer_seq_dict.values())))

    write_outputs(best_xmer_seq_dict, options.outPut)

    names = []
    sequences = []
    # Write resulting oligos to file
    # best_array_design maps sequence -> name, so unpack in that order.
    for sequence, name in best_array_design.items():
        names.append(name)
        sequences.append(sequence)
    oligo.write_fastas(names, sequences,
                       output_name=options.outPut + "_R"
                       + str(options.redundancy) + ".fasta")