Example #1
0
def remove_similar_sequences(sequences, ratio, folder):
    """Keep only sequences that are not too similar to an earlier kept one.

    Sequences are visited in order. A sequence is kept when
    ``compare_string`` reports a similarity below ``ratio`` against every
    sequence kept so far; the first sequence is therefore always kept.
    Element ``[0]`` of each kept sequence is collected, the count is
    printed, and the collected values are handed to
    ``util.write_strains_to_csv``.

    Args:
        sequences: Iterable of indexable records. Element ``[1]`` is what
            gets compared; element ``[0]`` is what gets written out
            (presumably a strain identifier -- confirm against the caller).
            May be empty, in which case an empty list is written.
        ratio: Similarity threshold as a string such as ``'0.999'``.
        folder: Directory prefix for the output file
            ``unique_strains_<suffix>``.

    Raises:
        ValueError: If ``ratio`` cannot be parsed as a float. This is raised
            before any comparison work is done.
    """
    # Parse the threshold once up front instead of on every comparison, so a
    # malformed ratio fails before the quadratic loop rather than inside it.
    threshold = float(ratio)

    # convert 0.999 to 999 for appending to filename; a ratio written without
    # a decimal point (e.g. '1') is used as the suffix unchanged.
    ratio_parts = ratio.split('.')
    file_append_ratio = ratio_parts[1] if len(ratio_parts) > 1 else ratio

    unique_sequences = []
    for sequence in sequences:
        # With nothing kept yet any() is False, so the first sequence is
        # always retained and empty input simply yields an empty result.
        too_similar = any(
            compare_string(sequence[1], kept[1]) >= threshold
            for kept in unique_sequences
        )
        if not too_similar:
            unique_sequences.append(sequence)

    strains_ratio = [kept[0] for kept in unique_sequences]
    print('number of ', ratio, ' strains ', len(strains_ratio))
    util.write_strains_to_csv(strains_ratio, folder + os.sep + 'unique_strains_' + file_append_ratio)
Example #2
0
def run_dup_remover():
    """Build the ``unique_strains_100`` file for the folder in ``sys.argv[1]``.

    The ten ``.afasta`` files in that folder are passed, in a fixed order,
    to ``concat_sequences``. The result goes through ``sequence_cleaner``,
    and the values of the mapping it returns are printed and written with
    ``util.write_strains_to_csv``.
    """
    folder = sys.argv[1]

    # Order matters: the paths are passed positionally to concat_sequences.
    file_stems = ('ha', 'na', 'm1', 'm2', 'np', 'pb1', 'pb2', 'pa', 'ns1', 'ns2')
    afasta_paths = [folder + os.sep + stem + '.afasta' for stem in file_stems]

    complete_sequences = concat_sequences(*afasta_paths)
    print('number of complete sequences ', len(complete_sequences))

    unique_sequences_dict = sequence_cleaner(complete_sequences)
    print(unique_sequences_dict.values())

    strains_100 = list(unique_sequences_dict.values())
    output_path = folder + os.sep + 'unique_strains_100'
    util.write_strains_to_csv(strains_100, output_path)
    print('number of sequences_100 ', len(strains_100))
Example #3
0
def run_dup_remover():
    """Write ``unique_strains_100`` for the folder named by ``sys.argv[1]``.

    Passes the folder's ten ``.afasta`` paths to ``concat_sequences``,
    reports how many records came back and the length of element ``[1]`` of
    the first one, then feeds the records to ``sequence_cleaner`` and writes
    the values of the returned mapping via ``util.write_strains_to_csv``.
    """
    folder = sys.argv[1]
    base = folder + os.sep

    # Positional order of the input files is preserved exactly.
    stems = ['ha', 'na', 'm1', 'm2', 'np', 'pb1', 'pb2', 'pa', 'ns1', 'ns2']
    complete_sequences = concat_sequences(
        *(base + stem + '.afasta' for stem in stems)
    )

    total = len(complete_sequences)
    print('number of complete sequences ', total)
    first_record = complete_sequences[0]
    print('length of complete sequence', len(first_record[1]))

    unique_sequences_dict = sequence_cleaner(complete_sequences)
    strains_100 = list(unique_sequences_dict.values())
    util.write_strains_to_csv(strains_100, base + 'unique_strains_100')
    print('number of sequences_100 ', len(strains_100))
Example #4
0
def remove_similar_sequences(sequences, ratio, folder):
    """Filter out sequences that closely match one that was already kept.

    Each sequence is compared, via ``compare_string`` on element ``[1]``,
    against the sequences kept so far and is dropped as soon as one
    comparison reaches ``ratio``. Element ``[0]`` of every kept sequence is
    then printed as a count and written with ``util.write_strains_to_csv``.

    Args:
        sequences: Iterable of indexable records; ``[1]`` is compared and
            ``[0]`` is written out (presumably the strain name -- verify
            against the caller). An empty iterable produces an empty file
            list instead of raising ``IndexError``.
        ratio: Threshold as a string, e.g. ``'0.999'``.
        folder: Directory prefix under which ``unique_strains_<suffix>`` is
            written.

    Raises:
        ValueError: If ``ratio`` is not parseable as a float; raised before
            any sequence is compared.
    """
    # Convert the threshold a single time; the original re-parsed it for
    # every pairwise comparison.
    threshold = float(ratio)

    # convert 0.999 to 999 for appending to filename. When the ratio has no
    # decimal point there is no fractional part, so fall back to the ratio
    # text itself rather than failing after all the comparisons are done.
    pieces = ratio.split('.')
    if len(pieces) > 1:
        file_append_ratio = pieces[1]
    else:
        file_append_ratio = ratio

    unique_sequences = []
    for candidate in sequences:
        for kept in unique_sequences:
            if compare_string(candidate[1], kept[1]) >= threshold:
                break
        else:
            # No kept sequence reached the threshold (trivially true for
            # the very first candidate), so this one is unique.
            unique_sequences.append(candidate)

    strains_ratio = [kept[0] for kept in unique_sequences]
    print('number of ', ratio, ' strains ', len(strains_ratio))
    util.write_strains_to_csv(
        strains_ratio, folder + os.sep + 'unique_strains_' + file_append_ratio)