Esempio n. 1
0
def write_array_of_profiles(profiles_passed_array, combined_profiles_filename):
    """Serialize every row of a profiles array into a single binary file.

    Each row is wrapped in a ``structures.w_profile`` sized to the row's
    length, compressed with ``compress()``, and the resulting ``bytestring``
    is appended to the output file in row order.

    Parameters
    ----------
    profiles_passed_array
        Object exposing ``shape`` and integer row indexing
        (presumably a 2-D numpy array -- confirm against callers).
    combined_profiles_filename
        Path of the output file; it is opened in binary write mode, so any
        existing content is replaced.
    """
    with open(combined_profiles_filename, 'wb') as out_file:
        n_profiles = profiles_passed_array.shape[0]
        for row_idx in range(n_profiles):
            row = profiles_passed_array[row_idx]
            packed_profile = structures.w_profile(row.shape[0])
            packed_profile.values = row
            packed_profile.compress()
            out_file.write(packed_profile.bytestring)
Esempio n. 2
0
def calculate_profile_one_motif(motif, n_seqs_list, is_degenerate=False):
    """Build the occurrence profile of one motif and time how long it took.

    A ``structures.w_profile`` with one slot per sequence is created, and the
    slot of every sequence for which ``is_there_motif_instance`` returns a
    truthy value is set to ``True``.

    Parameters
    ----------
    motif
        Motif object forwarded unchanged to ``is_there_motif_instance``.
    n_seqs_list
        Sequence of sequences to scan; its length defines the profile length.
    is_degenerate : bool, optional
        Forwarded unchanged to ``is_there_motif_instance``.

    Returns
    -------
    tuple
        ``(profile, seconds)`` where ``seconds`` is the wall-clock duration
        measured with ``time.time()``.
    """
    started_at = time.time()

    profile = structures.w_profile(len(n_seqs_list))
    for seq_idx, sequence in enumerate(n_seqs_list):
        if is_there_motif_instance(motif, sequence, is_degenerate):
            profile.values[seq_idx] = True

    elapsed = time.time() - started_at
    return profile, elapsed
def filter_profiles_by_folding(w_motifs_list,
                               w_seqs_list,
                               n_motifs_list,
                               n_seqs_list,
                               profiles_array,
                               output_filename,
                               window_size,
                               MFE_ratio_thresh,
                               is_degenerate,
                               do_print=False,
                               do_print_subs_matches=False,
                               do_print_progress=True,
                               how_often_print=100,
                               seed_start=6,
                               seed_stop=16):
    """Filter seed profiles via ``process_one_profile_one_seed`` and save them.

    For every processed seed ``i`` the row ``profiles_array[i, :]`` is passed,
    together with the matching entries of ``w_motifs_list`` and
    ``n_motifs_list``, to ``process_one_profile_one_seed``. The returned
    profile is wrapped in a ``structures.w_profile`` of length
    ``len(n_seqs_list)``, compressed with ``compress_indices()`` and its
    ``bytestring_indices`` is appended to ``output_filename``.

    Parameters
    ----------
    w_motifs_list, n_motifs_list
        Parallel collections of motifs; entry ``i`` of each describes seed
        ``i``. ``n_motifs_list`` must support integer indexing.
    w_seqs_list, n_seqs_list
        Sequence collections forwarded to ``process_one_profile_one_seed``.
    profiles_array
        2-D array indexed as ``profiles_array[i, :]`` for seed ``i``.
    output_filename
        Path of the binary output file (opened with mode ``'wb'``).
    window_size, MFE_ratio_thresh, is_degenerate
        Forwarded unchanged to ``process_one_profile_one_seed``.
    do_print : bool, optional
        Forwarded to the helper and, when true, also prints how many entries
        were set in the unfiltered profile but not in the filtered one.
    do_print_subs_matches, do_print_progress, how_often_print
        Forwarded unchanged to ``process_one_profile_one_seed``.
    seed_start : int, optional
        Index of the first seed to process; seeds with a smaller index are
        skipped and nothing is written for them.
    seed_stop : int or None, optional
        Index at which processing stops (exclusive). ``None`` means "process
        until the end of ``w_motifs_list``".

    Notes
    -----
    NOTE(review): the defaults ``seed_start=6`` / ``seed_stop=16`` reproduce
    the window that used to be hard-coded in the loop. It looks like a
    debugging leftover -- pass ``seed_start=0, seed_stop=None`` to process
    every seed, and confirm whether the defaults should change.
    """
    N_seq = len(n_seqs_list)
    with open(output_filename, 'wb') as wf:
        # iterate over seeds
        for i, w_motif in enumerate(w_motifs_list):

            # restrict processing to the half-open window
            # [seed_start, seed_stop)
            if i < seed_start:
                continue
            if seed_stop is not None and i >= seed_stop:
                break

            n_motif = n_motifs_list[i]
            current_profile = profiles_array[i, :]
            filtered_profile = process_one_profile_one_seed(
                w_motif,
                n_motif,
                w_seqs_list,
                n_seqs_list,
                current_profile,
                window_size,
                is_degenerate,
                MFE_ratio_thresh,
                do_print=do_print,
                do_print_subs_matches=do_print_subs_matches,
                do_print_progress=do_print_progress,
                how_often_print=how_often_print)

            # store the filtered profile in its index-compressed form
            filtered_profile_w = structures.w_profile(N_seq)
            filtered_profile_w.values = filtered_profile
            filtered_profile_w.compress_indices()
            wf.write(filtered_profile_w.bytestring_indices)

            if do_print:
                # entries present before filtering but absent afterwards
                difference_u_f = np.logical_and(current_profile,
                                                np.invert(filtered_profile))
                print("%d out of %d transcripts were filtered out: " %
                      (difference_u_f.sum(), current_profile.sum()))
Esempio n. 4
0
def decompress_profiles(bitstring,
                        do_print=False, how_often_print=10000):
    """Decode a concatenation of bit-packed profile records.

    Each record is read as:

    * 4 bytes  -- profile length, parsed as ``np.uint32``;
    * ``ceil(length / 8)`` bytes -- the profile values packed into bits;
    * 16 bytes -- a digest that must equal the ``md5`` attribute of a
      ``structures.w_profile`` rebuilt from the decoded values.

    Parameters
    ----------
    bitstring : bytes-like
        Concatenated records as described above.
    do_print : bool, optional
        When true, print a progress line every ``how_often_print`` records.
    how_often_print : int, optional
        Progress reporting interval (only evaluated when ``do_print`` is
        true).

    Returns
    -------
    numpy.ndarray
        Boolean array built from the list of decoded profiles (one entry per
        record); empty when ``bitstring`` is empty.

    Raises
    ------
    AssertionError
        If a record's stored digest differs from the recomputed one.
    """
    profiles_list = []
    total_length = len(bitstring)
    current_spot = 0
    counter = 0

    while current_spot < total_length:
        # get the length of the profile; convert to a Python int so that the
        # offset arithmetic below cannot wrap around in uint32
        length_bitstring = bitstring[current_spot : current_spot + 4]
        profile_length_np = np.frombuffer(length_bitstring, dtype=np.uint32)
        profile_length = int(profile_length_np[0])

        # number of bytes holding the packed bits: one extra byte is needed
        # whenever the length is not a multiple of 8
        length_packed = (profile_length + 7) // 8

        values_start = current_spot + 4
        md5_start = values_start + length_packed
        record_end = md5_start + 16

        values_bitstring = bitstring[values_start : md5_start]
        md5_bitstring = bitstring[md5_start : record_end]

        current_spot = record_end

        # unpack the bits and drop the padding added to fill the last byte
        values_packed_bits = np.frombuffer(values_bitstring, dtype=np.uint8)
        values = np.unpackbits(values_packed_bits)
        values = values[0 : profile_length]

        # rebuild the profile and recompute its digest for verification
        current_profile = structures.w_profile(profile_length)
        current_profile.values = values
        current_profile.compress()

        assert (md5_bitstring == current_profile.md5)

        profiles_list.append(current_profile.values)

        counter += 1
        if do_print and counter % how_often_print == 0:
            print("Decompressed profile number ", counter)

    # the `np.bool` alias is missing from several NumPy releases; the builtin
    # `bool` is the portable spelling of the same dtype
    profiles_array = np.array(profiles_list, dtype=bool)

    return profiles_array
def write_profiles_passed(last_positive_seed, MI_values_array, profiles_array,
                          passed_profiles_filename):
    """Write the top-ranked profiles to a binary file, prefixed by a count.

    Profiles are ranked by descending ``MI_values_array`` and the first
    ``last_positive_seed + 1`` of them are kept. The file starts with the
    number of kept profiles as an ``np.uint32``, followed by the
    ``bytestring`` of each kept profile (after ``compress()``), in rank order.
    When ``last_positive_seed`` is negative only a zero count is written.

    Parameters
    ----------
    last_positive_seed : int
        Rank (0-based) of the last profile to keep; negative keeps none.
    MI_values_array
        Scores used for ranking, one per row of ``profiles_array``.
    profiles_array
        Array of profiles, indexed by an array of row indices.
    passed_profiles_filename
        Path of the output file (opened with mode ``'wb'``).
    """
    def _serialize(row):
        # compress a single profile row into its byte representation
        profile = structures.w_profile(row.shape[0])
        profile.values = row
        profile.compress()
        return profile.bytestring

    if last_positive_seed < 0:
        # nothing passed: the file will hold just the zero counter
        chunks = []
    else:
        ranking = np.argsort(MI_values_array)[::-1]
        kept_rows = profiles_array[ranking[:last_positive_seed + 1]]
        chunks = [_serialize(kept_rows[k]) for k in range(kept_rows.shape[0])]

    header = np.uint32(len(chunks)).tobytes()
    payload = header + b''.join(chunks)

    with open(passed_profiles_filename, 'wb') as out_file:
        out_file.write(payload)
Esempio n. 6
0
def decompress_profiles_indices(bitstring,
                        do_print=False, how_often_print=10000):
    """Decode a concatenation of index-compressed profile records.

    Each record is read as:

    * 4 bytes  -- profile length (``np.uint32``);
    * 4 bytes  -- number of stored indices, i.e. positions set to ``True``
      (``np.uint32``);
    * 4 bytes  -- number of bits used per stored index (``np.uint32``);
    * ``ceil(N_indices * width / 8)`` bytes -- the indices, bit-packed;
    * 16 bytes -- a digest that must equal the ``md5_indices`` attribute of a
      ``structures.w_profile`` rebuilt from the decoded indices.

    Parameters
    ----------
    bitstring : bytes-like
        Concatenated records as described above.
    do_print : bool, optional
        When true, print a progress line every ``how_often_print`` records.
    how_often_print : int, optional
        Progress reporting interval (only evaluated when ``do_print`` is
        true).

    Returns
    -------
    numpy.ndarray
        Boolean array built from the list of decoded profiles (one entry per
        record); empty when ``bitstring`` is empty.

    Raises
    ------
    AssertionError
        If a record's stored digest differs from the recomputed one.
    """
    profiles_list = []
    total_length = len(bitstring)
    current_spot = 0
    counter = 0

    while current_spot < total_length:
        # Header fields are converted to Python ints so that the products and
        # offsets computed from them cannot wrap around in uint32.

        # get the length of the profile
        length_bitstring = bitstring[current_spot : current_spot + 4]
        profile_length_np = np.frombuffer(length_bitstring, dtype=np.uint32)
        length = int(profile_length_np[0])

        # get the number of indices (of True) of the profile
        N_indices_bitstring = bitstring[current_spot + 4 : current_spot + 8]
        N_indices_np = np.frombuffer(N_indices_bitstring, dtype=np.uint32)
        N_indices = int(N_indices_np[0])

        # get the number of bits used per index (compression width)
        width_bitstring = bitstring[current_spot + 8 : current_spot + 12]
        width_np = np.frombuffer(width_bitstring, dtype=np.uint32)
        width = int(width_np[0])

        # figure out how many bytes we need to read out: the total number of
        # bits rounded up to a whole number of bytes
        total_bits = N_indices * width
        length_packed = (total_bits + 7) // 8

        # read out bitstring of the proper size
        values_start = current_spot + 12
        md5_start = values_start + length_packed
        record_end = md5_start + 16
        values_bitstring = bitstring[values_start : md5_start]
        md5_bitstring = bitstring[md5_start : record_end]
        current_spot = record_end

        # convert bitstring to 32-bit arrays representing indices
        indices_packed_uint8 = np.frombuffer(values_bitstring, dtype=np.uint8)
        binary_bytes_array = np.unpackbits(indices_packed_uint8)
        binary_bytes_array = binary_bytes_array[0 : total_bits]
        reshaped_binary_array = binary_bytes_array.reshape(N_indices, width)
        # builtin `bool` instead of the `np.bool` alias, which is missing
        # from several NumPy releases
        full_binary_array = np.zeros((N_indices, 32), dtype=bool)
        full_binary_array[:, 0:width] = reshaped_binary_array

        # convert 32-bit arrays into uint32 indices
        reshaped_full_binary_array = full_binary_array.flatten()
        reshaped_full_binary_string = np.packbits(reshaped_full_binary_array)
        true_indices = np.frombuffer(reshaped_full_binary_string, dtype=np.uint32)

        # create a new profile and verify it against the stored digest
        curr_profile = structures.w_profile(length)
        curr_profile.values[true_indices] = True
        curr_profile.compress_indices()
        assert (md5_bitstring == curr_profile.md5_indices)
        profiles_list.append(curr_profile.values)

        counter += 1
        if do_print and counter % how_often_print == 0:
            print("Decompressed profile number ", counter)

    profiles_array = np.array(profiles_list, dtype=bool)

    return profiles_array