# Example 1
def compare_mea_speeds():
    """Compare the speeds of the three mea implementations.

    Times mea_slow, mea_slower and maximum_expected_accuracy_alignment on
    random square probability matrices of increasing size, then plots
    runtime (left panel) and runtime per nonzero point (right panel)
    against the number of nonzero matrix entries. Shows the figure;
    returns None.
    """
    plt.figure(figsize=(14, 4))
    panel1 = plt.axes([0.07, 0.11, .4, .8])
    panel2 = plt.axes([0.55, 0.11, .4, .8])

    fast_times = []
    slow_times = []
    slower_times = []
    num_points = []
    for size in range(10, 200, 10):
        prob_matrix, shortest_future_col_per_row = create_random_prob_matrix(col=size, row=size)
        points = np.count_nonzero(prob_matrix)
        num_points.append(points)
        # Print order (slow, slower, fast) matches the original output.
        # `elapsed` avoids shadowing the stdlib `time` module name.
        _, elapsed = time_it(mea_slow, prob_matrix, shortest_future_col_per_row)
        slow_times.append(elapsed)
        print(elapsed, elapsed / points)
        _, elapsed = time_it(mea_slower, prob_matrix, shortest_future_col_per_row)
        slower_times.append(elapsed)
        print(elapsed, elapsed / points)
        _, elapsed = time_it(maximum_expected_accuracy_alignment, prob_matrix, shortest_future_col_per_row)
        fast_times.append(elapsed)
        print(elapsed, elapsed / points)

    # plot number of points vs time
    handle1, = panel1.plot(num_points, fast_times, color='black')
    handle2, = panel1.plot(num_points, slow_times, color='blue')
    handle3, = panel1.plot(num_points, slower_times, color='red')

    panel1.set_xlim(0, max(num_points))
    panel1.set_ylim(0, max(slower_times + slow_times))

    panel1.set_xlabel('Number of Points')
    panel1.set_ylabel('Time (s)')
    panel1.legend([handle1, handle2, handle3], ["Fast", "Slow", "Slower"], loc='upper left')
    panel1.set_title('Time (s) vs Number of points ')

    # time-per-point makes the per-element cost of each implementation visible
    fast_ratio = [t / n for t, n in zip(fast_times, num_points)]
    slow_ratio = [t / n for t, n in zip(slow_times, num_points)]
    slower_ratio = [t / n for t, n in zip(slower_times, num_points)]

    handle1, = panel2.plot(num_points, fast_ratio, color='black')
    handle2, = panel2.plot(num_points, slow_ratio, color='blue')
    handle3, = panel2.plot(num_points, slower_ratio, color='red')

    panel2.set_xlim(0, max(num_points))
    panel2.set_ylim(0, max(slower_ratio))

    panel2.set_xlabel('Number of Points')
    panel2.set_ylabel('Time/ number of points (s/point)')
    panel2.legend([handle1, handle2, handle3], ["Fast", "Slow", "Slower"], loc='upper left')
    panel2.set_title('Time(s)/Data Points vs Number of points ')

    plt.show()
# Example 2
    def test_time_it(self):
        """time_it runs a callable with args and rejects non-callables."""
        with captured_output() as (_, _):
            def add(first, second):
                return first + second

            # a callable is accepted and executed
            time_it(add, 1, 2)
            # a non-callable first argument triggers the assertion guard
            with self.assertRaises(AssertionError):
                time_it(1, 1, 2)
# Example 3
    def test_generate_buildAlignments4(self):
        """generate_top_n_kmers_from_sa_output writes one row per kmer
        assignment and is faster than multiprocess_make_kmer_assignment_tables.
        """
        kmers = get_kmers(6, alphabet="ATGC")
        data_files = [self.alignments_path]

        # `table_time`/`generate_time` avoid shadowing the stdlib `time` module
        data, table_time = time_it(multiprocess_make_kmer_assignment_tables,
                                   data_files, kmers, {"t", "c"}, 0.0, False, True,
                                   10, 8)

        with tempfile.TemporaryDirectory() as temdir:
            output_file = os.path.join(temdir, "built_alignment.tsv")
            data2, generate_time = time_it(generate_top_n_kmers_from_sa_output,
                                           data_files, temdir, output_file, 10, "ACGT",
                                           6, 0.0, 8, False, True, False, True)

            # count output rows; close the handle promptly instead of
            # leaking it via len(list(open(...)))
            with open(output_file) as out_fh:
                num_lines = sum(1 for _ in out_fh)
        print(generate_time, table_time)
        self.assertEqual(len(data.index), num_lines)
        self.assertLess(generate_time, table_time)
# Example 4
 def test_binary_search_exact_match(self):
     """binary_search returns the index of values known to be in the list.

     Dead `time_list` accumulator removed: it was appended to but never read.
     """
     with captured_output() as (_, _):
         for size in range(100, 1000, 10):
             test_list = list(range(size))
             for _ in range(100):
                 find_number = np.random.randint(0, size)
                 index, _ = time_it(binary_search, test_list, find_number)
                 self.assertEqual(test_list[index], find_number)
# Example 5
 def test_binary_search_no_match(self):
     """When the target is absent, binary_search returns a neighbouring index.

     Targets are offset by 0.5 so they never match; the returned index must
     bracket the target between adjacent list values (with one-sided checks
     at either end of the list).
     """
     with captured_output() as (_, _):
         for size in range(10, 100, 10):
             test_list = list(range(size))
             time_list = []
             for _ in range(100):
                 target = np.random.randint(0, size) + 0.5
                 index, elapsed = time_it(binary_search, test_list, target, False)
                 time_list.append(elapsed)
                 if index == size - 1:
                     # last slot: only the left neighbour exists
                     self.assertTrue(test_list[index] < target)
                 elif index == 0:
                     # first slot: only the right neighbour exists
                     self.assertTrue(target < test_list[index + 1])
                 else:
                     self.assertTrue(test_list[index] < target < test_list[index + 1])
        args.positions_file)
    step_size = 10000  # NOTE(review): unused — the chunk size below is the hard-coded 10000
    step_number = 0
    with open(args.positions_file, "r") as fh:
        # Process the positions file in chunks: for each chunk, extract the
        # matching reads with `samtools view`, deduplicate with
        # `samtools rmdup`, then delete the intermediate BAM.
        while True:
            out_name = args.output_file + str(step_number) + "tmp"
            execute = "samtools view -b -o {} {}".format(out_name, args.bam)
            positions = " "
            counter = 0
            # This inner loop resumes where the previous chunk stopped:
            # `fh` keeps its read position across outer-loop iterations.
            for line in fh:
                split_line = line.split()
                chromosome = split_line[0]
                position = split_line[1]
                # single-base region "chrom:pos-pos" appended as a CLI arg
                positions += chromosome + ":" + position + "-" + position + " "
                counter += 1
                # NOTE(review): `> 10000` admits 10001 lines per chunk — confirm intended
                if counter > 10000:
                    break
            # Stop when the file is exhausted (no regions gathered) or after
            # chunk index 10 — i.e. at most 11 chunks are processed.
            if positions == " " or step_number > 10:
                break
            execute += positions
            check_call(execute.split())
            check_call(
                f"samtools rmdup -s {out_name} {args.output_file+str(step_number)}"
                .split())
            os.remove(out_name)
            step_number += 1


if __name__ == '__main__':
    # time_it returns (result, seconds); report only the elapsed seconds
    elapsed = time_it(main)[1]
    print(elapsed)
# Example 7
                # Per-(chromosome, strand) confusion matrix for this iteration
                print("Per-genomic-site confusion matrix", file=log_file)
                print(cmh.confusion_matrix(), file=log_file)

                # collect this chromosome/strand slice for the global summary below
                all_data.append(chr_strand_data)

        # Aggregate report over every chromosome and both strands
        print("All Chromosomes both strands:", file=log_file)
        print("Per-call confusion matrix", file=log_file)
        print(print_confusion_matrix(tps, fps, fns, tns),
              file=log_file)
        plot_confusion_matrix(tps, fps, fns, tns,
                              normalize=True,
                              output_path=os.path.join(output_dir,
                                                       "all_calls_confusion_matrix.png"),
                              title="All calls CpG "
                                    "Normalized Confusion Matrix")
        # Rebuild labels/predictions from the concatenated per-chromosome data;
        # rename *_label columns so label and prediction frames share columns
        all_data = pd.concat(all_data)
        label_data = all_data.loc[:, ['C_label', "E_label"]]
        prediction_data = all_data.loc[:, ['C', "E"]]
        label_data.rename(columns={'C_label': 'C', "E_label": "E"}, inplace=True)
        cmh = ClassificationMetrics(label_data, prediction_data)
        cmh.plot_roc("E", os.path.join(output_dir, "per_genomic_site_all_chromosomes_roc.png"))
        # NOTE(review): adjacent literals join to
        # ".../per_genomic_site_all_chromosomesprecision_recall.png" —
        # missing "_" separator; confirm the intended filename
        cmh.plot_precision_recall("E", os.path.join(output_dir,
                                                    "per_genomic_site_all_chromosomes"
                                                    "precision_recall.png"))
        print("Per-genomic-site confusion matrix", file=log_file)
        print(cmh.confusion_matrix(), file=log_file)


if __name__ == '__main__':
    # time_it returns (result, seconds); the whole tuple is printed here
    outcome = time_it(main)
    print(outcome)
def main():
    """Count kmers in a reference, restricted by a (optionally filtered)
    methyl bed, and pickle the resulting counts to `<output>/kmer_counts.pkl`.
    """
    args = parse_args()
    bed_filter = FilterBed()
    active_filters = []

    # Optional percentage filter: CLI values arrive as strings
    if args.filter_by_percentage is not None:
        percentage_args = [float(value) for value in args.filter_by_percentage]
        bed_filter.set_filter_by_percentage(*percentage_args)
        active_filters.append(bed_filter.filter_by_percentage_min_min_max_max)

    # Optional coverage filter
    if args.filter_by_coverage is not None:
        coverage_args = [float(value) for value in args.filter_by_coverage]
        bed_filter.set_filter_by_coverage(*coverage_args)
        active_filters.append(bed_filter.filter_by_coverage_min_max)

    # Combine whichever filters were enabled into one callable
    bed_filter.chain_logic(*active_filters)

    kmers = get_kmer_counts_from_reference_given_bed(
        args.reference,
        args.methyl_bed,
        k=args.kmer_length,
        param_filter=bed_filter.function,
        check_base=args.check_base)
    print(kmers)

    out_path = os.path.join(args.output, "kmer_counts.pkl")
    with open(out_path, 'wb') as fh:
        pickle.dump(kmers, fh)


if __name__ == '__main__':
    # time_it returns (result, seconds); report only the runtime
    run_seconds = time_it(main)[1]
    print(run_seconds, "seconds")