Beispiel #1
0
def test_filtered_profiles(args):
    """Print the full and the filtered profile arrays side by side.

    The full profiles are loaded with ``IO.unpack_profiles_file`` from
    ``args.profiles_full_file``; the filtered profiles are decoded with
    ``IO.decompress_profiles_indices`` from the raw bytes of
    ``args.profiles_filtered_file``. For each array the shape, the contents
    and a sum are printed. Nothing is asserted and nothing is returned.
    """
    full_profiles = IO.unpack_profiles_file(
        args.profiles_full_file, do_print=True)

    with open(args.profiles_filtered_file, 'rb') as filtered_handle:
        filtered_bytes = filtered_handle.read()
    filtered_profiles = IO.decompress_profiles_indices(filtered_bytes)

    # Full array: shape, contents, then the sum over rows 6..15 only.
    print(full_profiles.shape)
    print(full_profiles)
    rows_6_to_15 = full_profiles[6:16, ]
    print(rows_6_to_15.sum())

    # Filtered array: shape, contents, then the sum over everything.
    print(filtered_profiles.shape)
    print(filtered_profiles)
    print(filtered_profiles.sum())
def run_test_bins_fasta(args):
    """Assert that the binary sequence file reproduces the FASTA file.

    The bytes of ``args.rna_bin_file`` are decoded with
    ``IO.decompress_named_sequences`` and rendered back to FASTA text with
    ``IO.write_named_seq_to_fasta``. The text of ``args.rna_fastafile`` is
    converted by replacing every ``T`` with ``U`` and is then compared with
    the rendered text, first by length and then by content.

    Raises:
        AssertionError: if the two texts differ in length or content.
    """
    with open(args.rna_bin_file, 'rb') as bin_handle:
        packed = bin_handle.read()
        sequences_by_name, sequence_names = IO.decompress_named_sequences(
            packed)
        regenerated_fasta = IO.write_named_seq_to_fasta(
            sequences_by_name, sequence_names)

    with open(args.rna_fastafile, 'r') as fasta_handle:
        reference_fasta = fasta_handle.read()

    # The blanket T -> U replacement also turns 'ENST' into 'ENSU'
    # (presumably inside sequence identifiers), so that substring is restored.
    reference_as_rna = reference_fasta.replace('T', 'U')
    reference_as_rna = reference_as_rna.replace('ENSU', 'ENST')

    assert len(regenerated_fasta) == len(reference_as_rna)
    assert regenerated_fasta == reference_as_rna
Beispiel #3
0
def time_calculate_MI_profiles(
        calculate_with_numba,
        test_batch_folder='/Users/student/Documents/hani/programs/pyteiser/data/test_1_batch_snrnpa1',
        exp_mask_filename="/Users/student/Documents/hani/programs/pyteiser/data/mask_files/SNRNPA1_PSI_mask.bin",
        nbins=15,
        min_occurences=5,
        profiles_basename='snrnpa_profiles_4-7_4-9_4-6_14-20_30k_1.bin'):
    """Run the MI calculation over one batch of profiles.

    Unpacks the profiles and the expression mask, discretizes the expression
    profile, prints how often each distinct discretized value occurs, and
    then calls ``calculate_MI_profiles.calculate_MI_for_seeds``.

    The locations and tuning constants used to be hard-coded inside the
    function. They are now keyword parameters whose defaults are exactly the
    previous values, so existing one-argument calls behave as before while
    the function can also be pointed at other data.

    Args:
        calculate_with_numba: forwarded unchanged to
            ``calculate_MI_for_seeds``.
        test_batch_folder: directory containing the profiles file.
        exp_mask_filename: path of the expression mask file.
        nbins: number of bins passed to ``MI.discretize_exp_profile``.
        min_occurences: forwarded unchanged to ``calculate_MI_for_seeds``.
        profiles_basename: file name of the profiles file inside
            ``test_batch_folder``.
    """
    profiles_filename = os.path.join(test_batch_folder, profiles_basename)

    decompressed_profiles_array, index_array, values_array = IO.unpack_profiles_and_mask(
        profiles_filename, exp_mask_filename, do_print=True)

    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array,
                                                  nbins)

    # Only the occurrence counts are reported; the unique values themselves
    # are not needed.
    _, counts = np.unique(discr_exp_profile, return_counts=True, axis=0)
    print(counts)

    # The result is intentionally discarded: this function exists to exercise
    # (and time) the calculation, and it returned nothing before either.
    calculate_MI_profiles.calculate_MI_for_seeds(
        decompressed_profiles_array,
        index_array,
        discr_exp_profile,
        min_occurences,
        calculate_with_numba,
        do_print=True)
def profiles_wrapper(args):
    """Unpack the full profiles file and hand it to the indices test.

    The array returned by ``IO.unpack_profiles_file`` (called with
    ``args.profiles_full_file`` and ``args.indices_mode``) is passed, together
    with ``args``, to ``run_test_compressing_decompressing_indices``.
    """
    unpacked_profiles = IO.unpack_profiles_file(
        args.profiles_full_file,
        args.indices_mode,
        do_print=True,
    )
    run_test_compressing_decompressing_indices(args, unpacked_profiles)
Beispiel #5
0
def main():
    """Read the motif file and run the representation and plotting checks.

    Arguments come from ``handler()``. The motif at index 2 is passed to
    ``test_representation`` together with ``args.temp_folder``; the first ten
    motifs are passed to ``test_RNAstructure_plotting``.
    """
    args = handler()
    motifs = IO.read_motif_file(args.seeds_bin_file)

    third_motif = motifs[2]
    test_representation(third_motif, args.temp_folder)

    leading_motifs = motifs[:10]
    test_RNAstructure_plotting(leading_motifs)
def main():
    """Select the masked-in sequences and run the elongated-seed test.

    Arguments come from ``handler()``. Sequences are loaded with
    ``read_sequences``, the mask is unpacked with ``IO.unpack_mask_file``, and
    the expression profile is discretized into ``args.nbins`` bins. Only the
    sequences whose position has a truthy entry in the mask's index array are
    passed on to ``test_elongated_seed``.
    """
    args = handler()

    all_sequences = read_sequences(args.rna_bin_file)
    index_array, values_array = IO.unpack_mask_file(args.exp_mask_file)
    discretized_profile = MI.discretize_exp_profile(
        index_array, values_array, nbins=args.nbins)

    # Keep the sequence at each position where the index array is truthy.
    selected_sequences = [
        all_sequences[position]
        for position, is_selected in enumerate(index_array)
        if is_selected
    ]

    test_elongated_seed(
        selected_sequences,
        discretized_profile,
        args.nbins,
        args.number_example_matches_to_print,
    )
def test_bins_fasta(args):
    """Assert that the binary sequence file reproduces the FASTA file.

    The bytes of ``args.rna_bin_file`` are decoded with
    ``IO.decompress_named_sequences`` and rendered to FASTA text with
    ``IO.write_named_seq_to_fasta``. That text is also written to a fixed
    dump file so it can be inspected by hand. The text of
    ``args.rna_fastafile`` is converted by replacing every ``T`` with ``U``
    and compared with the rendered text, first by length, then by content.

    Raises:
        AssertionError: if the two texts differ in length or content.
    """
    with open(args.rna_bin_file, 'rb') as bin_handle:
        packed = bin_handle.read()
        sequences_by_name, sequence_names = IO.decompress_named_sequences(
            packed)
        regenerated_fasta = IO.write_named_seq_to_fasta(
            sequences_by_name, sequence_names)

    with open(args.rna_fastafile, 'r') as fasta_handle:
        reference_fasta = fasta_handle.read()

    # Dump the regenerated text to a fixed location for manual inspection.
    dump_path = "/Users/student/Documents/hani/temp/temp_fasta/1.txt"
    with open(dump_path, 'w') as dump_handle:
        dump_handle.write(regenerated_fasta)

    # The blanket T -> U replacement also turns 'ENST' into 'ENSU'
    # (presumably inside sequence identifiers), so that substring is restored.
    reference_as_rna = reference_fasta.replace('T', 'U')
    reference_as_rna = reference_as_rna.replace('ENSU', 'ENST')

    assert len(regenerated_fasta) == len(reference_as_rna)
    assert regenerated_fasta == reference_as_rna
def run_test_profiles_compression_decompression(args, do_shorten_test=True):
    """Check that written profiles decompress to the calculated array.

    ``args.seedfile`` is set from ``args.seeds_bin_file`` before
    ``calculate_seed_profiles.prepare_lists_for_calculations`` is called.
    Profiles are calculated and written to ``args.profiles_bin_file``, the
    file is read back and decoded with ``IO.decompress_profiles``, and the
    two arrays are compared with ``np.array_equal``.

    Args:
        args: argument namespace; note that ``args.seedfile`` is overwritten.
        do_shorten_test: when truthy, only the first three motifs are used.

    Raises:
        AssertionError: if the decoded array differs from the calculated one.
    """
    args.seedfile = args.seeds_bin_file
    motifs, sequences = calculate_seed_profiles.prepare_lists_for_calculations(args)
    if do_shorten_test:
        motifs = motifs[:3]

    expected_profiles = calculate_seed_profiles.calculate_write_profiles(
        motifs,
        sequences,
        args.profiles_bin_file,
        do_print=True,
        do_return=True,
    )

    with open(args.profiles_bin_file, 'rb') as profiles_handle:
        raw_profiles = profiles_handle.read()
    restored_profiles = IO.decompress_profiles(raw_profiles)

    assert np.array_equal(expected_profiles, restored_profiles)
def prepare_known_seeds(args):
    """Build the motif and sequence lists used by the known-seed tests.

    The test constants come from ``define_constants(args)``. One
    ``structures.w_motif`` is created per test string and filled with
    ``from_string``; the sequences are read from the binary file in their
    stored order. Both lists are converted with ``type_conversions`` before
    being returned.

    Returns:
        A ``(n_motifs_list, n_seqs_list)`` tuple.
    """
    # The fourth constant is not needed here.
    seqs_shape, seqs_to_test, bin_file_to_test, _ = define_constants(args)

    motifs = []
    for motif_string in seqs_to_test:
        motif = structures.w_motif(seqs_shape[0], seqs_shape[1])
        motif.from_string(motif_string)
        motifs.append(motif)

    seqs_dict, seqs_order = IO.read_rna_bin_file(bin_file_to_test)
    ordered_sequences = []
    for seq_name in seqs_order:
        ordered_sequences.append(seqs_dict[seq_name])

    converted_motifs = type_conversions.w_to_n_motifs_list(motifs)
    converted_sequences = type_conversions.w_to_n_sequences_list(
        ordered_sequences)
    return converted_motifs, converted_sequences
def test_compressing_decompressing_indices(args, decompressed_profiles_array):
    """Round-trip the profiles through index compression and compare.

    Each row of ``decompressed_profiles_array`` is wrapped in a
    ``structures.w_profile``, compressed with ``compress_indices`` and
    appended to ``args.compressed_profiles_file``. The file is then read back
    and decoded with ``IO.decompress_profiles_indices``.

    Raises:
        AssertionError: if any element differs after the round trip.
    """
    with open(args.compressed_profiles_file, 'wb') as out_handle:
        row_length = decompressed_profiles_array.shape[1]
        for row_values in decompressed_profiles_array:
            profile = structures.w_profile(row_length)
            profile.values = row_values
            profile.compress_indices()
            out_handle.write(profile.bytestring_indices)

    with open(args.compressed_profiles_file, 'rb') as in_handle:
        stored_bytes = in_handle.read()
    restored_profiles = IO.decompress_profiles_indices(stored_bytes)

    elementwise_equal = restored_profiles == decompressed_profiles_array
    assert elementwise_equal.all(), "decompression has changed the data!"
Beispiel #11
0
def time_reading_fasta(fasta_file):
    """Time the sequence-object operations for every record of a FASTA file.

    The file is split on ``>``. For each non-empty record the first line is
    taken as the annotation and the remaining lines, joined, as the sequence.
    Four operations are each timed over 100 runs with ``timeit`` and the
    timings are printed: creating a ``structures.w_sequence``, filling it,
    compressing it, and compressing it as a named sequence.

    Returns:
        A ``(dict, list)`` tuple. Both are always empty, because the timed
        records are not stored.
    """
    tr_dict_loc = {}
    seqs_order = []

    with open(fasta_file, 'r') as fasta_handle:
        records = fasta_handle.read().split('>')

    for record in records:
        if not record:
            continue

        header_end = record.find('\n')
        header = record[:header_end]
        body = record[header_end + 1:].replace('\n', '')

        seq_object = structures.w_sequence(len(body))
        seq_object.from_sequence(body)

        # All four timings are collected before anything is printed.
        jobs = (
            lambda: structures.w_sequence(len(body)),
            lambda: seq_object.from_sequence(body),
            lambda: seq_object.compress(),
            lambda: IO.compress_named_sequences({header: seq_object},
                                                [header]),
        )
        elapsed = [timeit.timeit(job, number=100) for job in jobs]

        print("Create object: %.5f" % elapsed[0])
        print("Fill object: %.5f" % elapsed[1])
        print("Compress object: %.5f" % elapsed[2])
        print("Compress named object: %.5f" % elapsed[3])
        print()

    return tr_dict_loc, seqs_order
def read_sequences(rna_bin_filename):
    """Load sequences from a binary file and convert them.

    Sequences are read with ``IO.read_rna_bin_file``, arranged in the order
    given by the file, and converted with
    ``type_conversions.w_to_n_sequences_list``.

    Returns:
        The converted list of sequences.
    """
    seqs_dict, seqs_order = IO.read_rna_bin_file(rna_bin_filename)

    ordered_sequences = []
    for seq_name in seqs_order:
        ordered_sequences.append(seqs_dict[seq_name])

    return type_conversions.w_to_n_sequences_list(ordered_sequences)
Beispiel #13
0
def time_compressing_sequences(fasta_file):
    sequences_dict, seqs_order = IO.read_fasta(fasta_file)

    for i in range(len(seqs_order)):
        print(seqs_order[i])