def remove_similar_sequences(sequences, ratio, folder):
    """Greedily filter out near-duplicate sequences and save what is kept.

    Entries are visited in order. An entry is kept only when
    ``compare_string`` scores it strictly below ``float(ratio)`` against
    every entry kept so far, so the first entry is always kept. The first
    element of each kept entry is then handed to
    ``util.write_strains_to_csv`` with the target path
    ``folder + os.sep + 'unique_strains_' + <text after the '.' in ratio>``.

    NOTE(review): another definition of ``remove_similar_sequences``
    appears later in this file and will replace this one when the module
    is loaded -- confirm which copy is meant to stay.

    Args:
        sequences: Iterable of indexable entries. ``entry[1]`` is passed
            to ``compare_string``; ``entry[0]`` is what gets written out
            (presumably the strain name -- confirm against the caller).
            May be empty, in which case an empty list is written.
        ratio: Similarity threshold as a string containing a decimal
            point, e.g. ``'0.999'`` (file suffix ``'999'``).
        folder: Directory path (string) the output name is appended to.

    Raises:
        ValueError: If ``ratio`` cannot be parsed by ``float``.
        IndexError: If ``ratio`` contains no ``'.'``.
    """
    # Parse the threshold once instead of on every pairwise comparison.
    threshold = float(ratio)
    # convert 0.999 to 999 for appending to filename; done up front so a
    # malformed ratio fails before the quadratic comparison pass, not after.
    file_append_ratio = ratio.split('.')[1]

    unique_sequences = []
    for sequence in sequences:
        # any() stops at the first sufficiently similar kept entry, just
        # like the explicit flag-and-break loop. With nothing kept yet it
        # is False, so the first entry is always retained and an empty
        # input no longer raises IndexError.
        similar = any(
            compare_string(sequence[1], unique_sequence[1]) >= threshold
            for unique_sequence in unique_sequences
        )
        if not similar:
            unique_sequences.append(sequence)

    strains_ratio = [unique_sequence[0] for unique_sequence in unique_sequences]
    print('number of ', ratio, ' strains ', len(strains_ratio))
    util.write_strains_to_csv(
        strains_ratio,
        folder + os.sep + 'unique_strains_' + file_append_ratio)
def run_dup_remover():
    """Build the exact-duplicate-free strain list for one data folder.

    The folder is taken from the first command-line argument. Ten
    ``.afasta`` files inside it are passed, in a fixed order, to
    ``concat_sequences``; the result goes through ``sequence_cleaner``
    and the values of the returned mapping are written with
    ``util.write_strains_to_csv`` to ``<folder><os.sep>unique_strains_100``.
    Progress counts, and the mapping's values, are printed along the way.

    NOTE(review): ``run_dup_remover`` is defined again later in this
    file; that later definition is the one bound after the module
    loads -- confirm which copy is meant to stay.

    Raises:
        IndexError: If no folder argument was supplied on the command line.
    """
    data_dir = sys.argv[1]
    prefix = data_dir + os.sep

    # Order matters: the files are passed positionally to concat_sequences.
    base_names = ('ha', 'na', 'm1', 'm2', 'np', 'pb1', 'pb2', 'pa', 'ns1', 'ns2')
    alignment_paths = [prefix + base + '.afasta' for base in base_names]

    combined = concat_sequences(*alignment_paths)
    print('number of complete sequences ', len(combined))

    deduplicated = sequence_cleaner(combined)
    print(deduplicated.values())

    kept_strains = list(deduplicated.values())
    util.write_strains_to_csv(kept_strains, prefix + 'unique_strains_100')
    print('number of sequences_100 ', len(kept_strains))
def run_dup_remover():
    """Build the exact-duplicate-free strain list for one data folder.

    The folder is read from ``sys.argv[1]``. Ten ``.afasta`` files in that
    folder are passed, in a fixed order, to ``concat_sequences``. The
    combined entries are passed to ``sequence_cleaner``, and the values of
    the mapping it returns are written with ``util.write_strains_to_csv``
    to ``<folder><os.sep>unique_strains_100``. Counts are printed as
    progress information.

    Raises:
        IndexError: If no folder argument was supplied on the command line.
    """
    folder = sys.argv[1]

    # One table instead of ten hand-numbered variables. The order is kept
    # because the paths are passed positionally to concat_sequences.
    file_stems = ('ha', 'na', 'm1', 'm2', 'np', 'pb1', 'pb2', 'pa', 'ns1', 'ns2')
    files = [folder + os.sep + stem + '.afasta' for stem in file_stems]

    complete_sequences = concat_sequences(*files)
    print('number of complete sequences ', len(complete_sequences))
    # The length report is purely diagnostic. Indexing [0] on an empty
    # result used to raise IndexError and abort the run before anything
    # was written, so it is skipped when there is nothing to measure.
    if len(complete_sequences) > 0:
        print('length of complete sequence', len(complete_sequences[0][1]))

    unique_sequences_dict = sequence_cleaner(complete_sequences)
    strains_100 = list(unique_sequences_dict.values())
    util.write_strains_to_csv(strains_100, folder + os.sep + 'unique_strains_100')
    print('number of sequences_100 ', len(strains_100))
def remove_similar_sequences(sequences, ratio, folder):
    """Greedily filter out near-duplicate sequences and save what is kept.

    Entries are visited in order. An entry is kept only when
    ``compare_string`` scores it strictly below ``float(ratio)`` against
    every entry kept so far, so the first entry is always kept. The first
    element of each kept entry is then handed to
    ``util.write_strains_to_csv`` with the target path
    ``folder + os.sep + 'unique_strains_' + <text after the '.' in ratio>``.

    Args:
        sequences: Iterable of indexable entries. ``entry[1]`` is passed
            to ``compare_string``; ``entry[0]`` is what gets written out
            (presumably the strain name -- confirm against the caller).
            May be empty, in which case an empty list is written.
        ratio: Similarity threshold as a string containing a decimal
            point, e.g. ``'0.999'`` (file suffix ``'999'``).
        folder: Directory path (string) the output name is appended to.

    Raises:
        ValueError: If ``ratio`` cannot be parsed by ``float``.
        IndexError: If ``ratio`` contains no ``'.'``.
    """
    # Parse the threshold once instead of on every pairwise comparison.
    threshold = float(ratio)
    # convert 0.999 to 999 for appending to filename; done up front so a
    # malformed ratio fails before the quadratic comparison pass, not after.
    file_append_ratio = ratio.split('.')[1]

    unique_sequences = []
    for sequence in sequences:
        # any() short-circuits on the first sufficiently similar kept
        # entry. With nothing kept yet it is False, so the first entry is
        # always retained and an empty input no longer raises IndexError.
        similar = any(
            compare_string(sequence[1], unique_sequence[1]) >= threshold
            for unique_sequence in unique_sequences
        )
        if not similar:
            unique_sequences.append(sequence)

    strains_ratio = [
        unique_sequence[0] for unique_sequence in unique_sequences
    ]
    print('number of ', ratio, ' strains ', len(strains_ratio))
    util.write_strains_to_csv(
        strains_ratio,
        folder + os.sep + 'unique_strains_' + file_append_ratio)