Example #1
0
    def test_named_indices(self):
        """Check that as_bins_of keys its result by string index labels.

        Every case is binned at resolution 5 against two parents labelled
        ``foo`` (chr1:100-160) and ``bar`` (chr10:200-220).
        """
        parent_frame = pd.DataFrame({'chromosome': ['chr1', 'chr10'], 'start': [100, 200], 'end': [160, 220]},
                                    index=['foo', 'bar'])
        parent_regions = Regions(parent_frame)

        # Regions touching the first / last bins of their parents
        edge_frame = pd.DataFrame({'chromosome': ['chr1', 'chr10'], 'start': [100, 215], 'end': [110, 220]},
                                  index=['foo', 'bar'])
        edge_regions = Regions(edge_frame)
        expected_edges = {'foo': np.array([0, 1]), 'bar': np.array([3])}
        edge_bins = edge_regions.as_bins_of(parent_regions, resolution=5)
        self.assert_numpy_dicts_equal(expected_edges, edge_bins)

        # Regions lying in the interior of their parents
        mid_frame = pd.DataFrame({'chromosome': ['chr1', 'chr10'], 'start': [140, 208], 'end': [145, 212]},
                                 index=['foo', 'bar'])
        mid_regions = Regions(mid_frame)
        expected_mid = {'foo': np.array([8]), 'bar': np.array([1, 2])}
        mid_bins = mid_regions.as_bins_of(parent_regions, resolution=5)
        self.assert_numpy_dicts_equal(expected_mid, mid_bins)

        # Interior region again, but only one of the two labels is present
        bar_only_frame = pd.DataFrame({'chromosome': ['chr10'], 'start': [208], 'end': [212]}, index=['bar'])
        mid_regions2 = Regions(bar_only_frame)
        expected_bar_only = {'bar': np.array([1, 2])}
        bar_only_bins = mid_regions2.as_bins_of(parent_regions, resolution=5)
        self.assert_numpy_dicts_equal(expected_bar_only, bar_only_bins)

        # A region that is a single base pair long
        single_bp_frame = pd.DataFrame({'chromosome': ['chr1'], 'start': [140], 'end': [141]}, index=['foo'])
        length_one_regions = Regions(single_bp_frame)
        expected_single_bp = {'foo': np.array([8])}
        single_bp_bins = length_one_regions.as_bins_of(parent_regions, resolution=5)
        self.assert_numpy_dicts_equal(expected_single_bp, single_bp_bins)

        # 'baz' has no counterpart among the parent labels and is expected to be left out
        stray_frame = pd.DataFrame({'chromosome': ['chr1', 'chr2'], 'start': [140, 200], 'end': [141, 300]},
                                   index=['foo', 'baz'])
        regions_not_in_parent = Regions(stray_frame)
        expected_without_stray = {'foo': np.array([8])}
        stray_bins = regions_not_in_parent.as_bins_of(parent_regions, resolution=5)
        self.assert_numpy_dicts_equal(expected_without_stray, stray_bins)
Example #2
0
    def test_raises_value_error_on_mismatch(self):
        """Check that as_bins_of raises ValueError for POIs that do not fall within the parent."""
        parent_regions = Regions(pd.DataFrame({'chromosome': ['chr1'], 'start': [100], 'end': [200]}))

        mismatching_pois = [
            # Wrong chromosome
            {'chromosome': ['chr2'], 'start': [120], 'end': [180]},
            # No overlap 1: ends before the parent starts
            {'chromosome': ['chr1'], 'start': [20], 'end': [80]},
            # No overlap 2: starts after the parent ends
            {'chromosome': ['chr1'], 'start': [220], 'end': [230]},
        ]

        for poi_columns in mismatching_pois:
            poi_regions = Regions(pd.DataFrame(poi_columns))
            self.assertRaises(ValueError, poi_regions.as_bins_of, parent_regions)
Example #3
0
    def test_clipping_always_greater_or_equal_than_0(self):
        """Check that clipping a region near coordinate zero never yields a negative start."""
        regions = Regions(pd.DataFrame({'chromosome': ['chr1'], 'start': [5], 'end': [7]}))

        clipped_df = regions.clip_to_resolution(20)

        # chr1:5-7 is expected to become chr1:0-20 at a resolution of 20
        first_row = clipped_df.ix[0]
        self.assertEqual('chr1', first_row['chromosome'])
        self.assertEqual(0, first_row['start'])
        self.assertEqual(20, first_row['end'])
Example #4
0
 def test_slicing_works(self):
     """Check that head() and slicing return Regions objects."""
     regions = Regions(
         pd.DataFrame({
             'chromosome': ['chr1', 'chr10'],
             'start': [100, 200],
             'end': [117, 220]
         }))
     # assertIsInstance reports the offending type on failure
     self.assertIsInstance(regions.head(1), Regions)
     self.assertIsInstance(regions[:1], Regions)
Example #5
0
    def test_clipping_keeps_the_same_class(self):
        """Check that clip_to_resolution returns a Regions object."""
        regions = Regions(
            pd.DataFrame({
                'chromosome': ['chr1', 'chr10'],
                'start': [100, 200],
                'end': [117, 220]
            }))
        clipped_regions = regions.clip_to_resolution(5)
        # assertIsInstance reports the offending type on failure
        self.assertIsInstance(clipped_regions, Regions)
Example #6
0
    def test_clipping_when_only_one_bin_present(self):
        """Check the coordinates produced by clipping two regions to a resolution of 20."""
        regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'], 'start': [100, 200], 'end': [117, 220]}))

        clipped_df = regions.clip_to_resolution(20)

        # chr1:100-117 spans 17 bp and is expected to come back as 99-119
        first_row = clipped_df.ix[0]
        self.assertEqual('chr1', first_row['chromosome'])
        self.assertEqual(99, first_row['start'])
        self.assertEqual(119, first_row['end'])

        # chr10:200-220 already spans exactly 20 bp and is expected to be unchanged
        second_row = clipped_df.ix[1]
        self.assertEqual('chr10', second_row['chromosome'])
        self.assertEqual(200, second_row['start'])
        self.assertEqual(220, second_row['end'])
Example #7
0
    def test_clipping_res_1(self):
        """Check that clipping to a resolution of 1 leaves the coordinates unchanged."""
        regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'], 'start': [100, 200], 'end': [117, 220]}))

        clipped_df = regions.clip_to_resolution(1)

        first_row = clipped_df.ix[0]
        self.assertEqual('chr1', first_row['chromosome'])
        self.assertEqual(100, first_row['start'])
        self.assertEqual(117, first_row['end'])

        second_row = clipped_df.ix[1]
        self.assertEqual('chr10', second_row['chromosome'])
        self.assertEqual(200, second_row['start'])
        self.assertEqual(220, second_row['end'])
Example #8
0
    def test_clipping_always_greater_or_equal_than_0(self):
        """Check that clipping a region near coordinate zero never yields a negative start."""
        regions = Regions(
            pd.DataFrame({
                'chromosome': ['chr1'],
                'start': [5],
                'end': [7]
            }))

        clipped_df = regions.clip_to_resolution(20)

        # chr1:5-7 is expected to become chr1:0-20 at a resolution of 20
        first_row = clipped_df.ix[0]
        self.assertEqual('chr1', first_row['chromosome'])
        self.assertEqual(0, first_row['start'])
        self.assertEqual(20, first_row['end'])
Example #9
0
    def test_clipping_when_only_one_bin_present(self):
        """Check the coordinates produced by clipping two regions to a resolution of 20."""
        regions = Regions(
            pd.DataFrame({
                'chromosome': ['chr1', 'chr10'],
                'start': [100, 200],
                'end': [117, 220]
            }))

        clipped_df = regions.clip_to_resolution(20)

        # chr1:100-117 spans 17 bp and is expected to come back as 99-119
        first_row = clipped_df.ix[0]
        self.assertEqual('chr1', first_row['chromosome'])
        self.assertEqual(99, first_row['start'])
        self.assertEqual(119, first_row['end'])

        # chr10:200-220 already spans exactly 20 bp and is expected to be unchanged
        second_row = clipped_df.ix[1]
        self.assertEqual('chr10', second_row['chromosome'])
        self.assertEqual(200, second_row['start'])
        self.assertEqual(220, second_row['end'])
Example #10
0
def main():
    """Write, for each input region, the points of interest contained within it.

    Both the input regions and the points of interest are read as BED files
    via ``Regions.from_bed``.  One ``<index>:<pois>`` line is written to
    ``args.output`` per input region that contains at least one point of
    interest; regions without any are skipped.
    """
    parser = argument_parser()
    args = parser.parse_args()

    output_file = args.output
    try:
        input_regions = Regions.from_bed(args.input_filename)
        poi_regions = Regions.from_bed(args.poi_filename)

        for ix, region in input_regions.iterrows():
            pois_in_region = poi_regions.contained_within(region)
            # Explicit length check kept on purpose: pois_in_region is presumably
            # a Regions (DataFrame-like) object whose truthiness is not usable.
            if len(pois_in_region) == 0:
                continue
            output_file.write('{0}:{1}\n'.format(ix, pois_in_region.as_printable_list_of_pois()))
    finally:
        # Close the output even when reading or writing fails part-way through
        output_file.close()
Example #11
0
    def test_clipping_res_1(self):
        """Check that clipping to a resolution of 1 leaves the coordinates unchanged."""
        regions = Regions(
            pd.DataFrame({
                'chromosome': ['chr1', 'chr10'],
                'start': [100, 200],
                'end': [117, 220]
            }))

        clipped_df = regions.clip_to_resolution(1)

        first_row = clipped_df.ix[0]
        self.assertEqual('chr1', first_row['chromosome'])
        self.assertEqual(100, first_row['start'])
        self.assertEqual(117, first_row['end'])

        second_row = clipped_df.ix[1]
        self.assertEqual('chr10', second_row['chromosome'])
        self.assertEqual(200, second_row['start'])
        self.assertEqual(220, second_row['end'])
Example #12
0
def read_regions(regions_filename, random_sample, resolution):
    """Read regions from a BED file, clip them and optionally subsample them.

    :param regions_filename: file name handed to ``Regions.from_bed``
    :param random_sample: number of regions to keep, picked at random;
        a falsy value keeps every region
    :param resolution: resolution passed to ``clip_to_resolution``
    :return: tuple ``(regions, total_len, used_len)`` where ``total_len`` is the
        number of regions read and ``used_len`` the number requested for use
    """
    regions = Regions.from_bed(regions_filename)
    total_len = len(regions)
    # Single-argument print(...) behaves the same as the print statement on Python 2
    print('> {0} regions of interest read'.format(total_len))

    regions = regions.clip_to_resolution(resolution)

    used_len = total_len
    if random_sample:
        print('> Using only a random sample of {0} regions from {1!r}'.format(random_sample, regions_filename))
        used_len = random_sample
        # Sample index labels, then select the matching rows
        regions = regions.ix[random.sample(regions.index, random_sample)]

    return regions, total_len, used_len
Example #13
0
def read_regions(regions_filename, random_sample, resolution):
    """Read regions from a BED file, clip them and optionally subsample them.

    :param regions_filename: file name handed to ``Regions.from_bed``
    :param random_sample: number of regions to keep, picked at random;
        a falsy value keeps every region
    :param resolution: resolution passed to ``clip_to_resolution``
    :return: tuple ``(regions, total_len, used_len)`` where ``total_len`` is the
        number of regions read and ``used_len`` the number requested for use
    """
    regions = Regions.from_bed(regions_filename)
    total_len = len(regions)
    # Single-argument print(...) behaves the same as the print statement on Python 2
    print('> {0} regions of interest read'.format(total_len))

    regions = regions.clip_to_resolution(resolution)

    used_len = total_len
    if random_sample:
        print('> Using only a random sample of {0} regions from {1!r}'.format(
            random_sample, regions_filename))
        used_len = random_sample
        # Sample index labels, then select the matching rows
        regions = regions.ix[random.sample(regions.index, random_sample)]

    return regions, total_len, used_len
Example #14
0
    def test_bins_calculated_correctly(self):
        """Check the bins returned by as_bins_of at resolution 5 for integer-indexed regions.

        The parents are chr1:100-160 (index 0) and chr10:200-220 (index 1).
        """
        parent_frame = pd.DataFrame({'chromosome': ['chr1', 'chr10'], 'start': [100, 200], 'end': [160, 220]})
        parent_regions = Regions(parent_frame)

        # Regions touching the first / last bins of their parents
        edge_frame = pd.DataFrame({'chromosome': ['chr1', 'chr10'], 'start': [100, 215], 'end': [110, 220]})
        edge_regions = Regions(edge_frame)
        expected_edges = {0: np.array([0, 1]), 1: np.array([3])}
        edge_bins = edge_regions.as_bins_of(parent_regions, resolution=5)
        self.assert_numpy_dicts_equal(expected_edges, edge_bins)

        # Regions lying in the interior of their parents
        mid_frame = pd.DataFrame({'chromosome': ['chr1', 'chr10'], 'start': [140, 208], 'end': [145, 212]})
        mid_regions = Regions(mid_frame)
        expected_mid = {0: np.array([8]), 1: np.array([1, 2])}
        mid_bins = mid_regions.as_bins_of(parent_regions, resolution=5)
        self.assert_numpy_dicts_equal(expected_mid, mid_bins)

        # Interior region again, but only index 1 is present
        second_only_frame = pd.DataFrame({'chromosome': ['chr10'], 'start': [208], 'end': [212]}, index=[1])
        mid_regions2 = Regions(second_only_frame)
        expected_second_only = {1: np.array([1, 2])}
        second_only_bins = mid_regions2.as_bins_of(parent_regions, resolution=5)
        self.assert_numpy_dicts_equal(expected_second_only, second_only_bins)

        # A region that is a single base pair long
        single_bp_frame = pd.DataFrame({'chromosome': ['chr1'], 'start': [140], 'end': [141]})
        length_one_regions = Regions(single_bp_frame)
        expected_single_bp = {0: np.array([8])}
        single_bp_bins = length_one_regions.as_bins_of(parent_regions, resolution=5)
        self.assert_numpy_dicts_equal(expected_single_bp, single_bp_bins)
Example #15
0
def main():
    """Command-line entry point of the DGW processing pipeline.

    Steps, in order:

    1. Parse and cross-validate the command-line arguments and fill in the
       defaults that depend on other options (metric, prototyping method).
    2. Read the regions (if given), setting aside those that are too short or
       too long, and read the optional points of interest.
    3. Read the dataset, report and save missing / filtered regions, and
       convert a raw dataset to log scale (optionally normalising it).
    4. Serialise the regions and the dataset.
    5. Unless ``--blank`` is set: compute pairwise distances, the linkage
       matrix, the prototypes and the warping paths, saving each of them.
    6. Save the configuration as JSON.

    Progress is reported on standard output.
    """
    # --- Argument parsing -----------------------
    parser = argument_parser()

    args = parser.parse_args()
    if args.datasets and args.processed_dataset:
        parser.error('Must specify either --dataset or --processed_dataset only.')
    elif not args.processed_dataset:
        if not args.regions or not args.datasets:
            parser.error('Must specify both --regions and --dataset')

    if args.metric is None:
        if args.processed_dataset:
            parser.error('Must provide a metric if using processed dataset')
        elif len(args.datasets) >= 2:
            print("> Defaulting to cosine distance as more than 2 dataset given")
            args.metric = 'cosine'
        else:
            print("> Defaulting to sqeuclidean distance as only one dataset given")
            args.metric = 'sqeuclidean'
    elif args.metric == 'cosine':
        if args.datasets and len(args.datasets) < 2:
            parser.error('Cannot use cosine distance with just one dataset. Choose sqeuclidean or euclidean instead.')

    if args.no_dtw:
        # That's what no-dtw actually does
        args.slanted_band = 0
        args.scale = True
        if args.prototyping_method is None:
            args.prototyping_method = 'mean'
    else:
        if args.prototyping_method is None:
            args.prototyping_method = 'standard'

    if args.verbose:
        logging.root.setLevel(logging.DEBUG)

    # Disable trying to reverse regions if strand information given
    if args.use_strand_information:
        args.no_reverse = True

    configuration = Configuration(args)

    # --- pre-processing ------------------------
    if args.regions:
        print('> Reading regions from {0!r} ....'.format(args.regions))
        regions, total_regions, used_regions = read_regions(args.regions, args.random_sample, args.resolution)
        if args.use_strand_information and not regions.has_strand_data():
            logging.debug('Parsed columns: {0}'.format(regions.columns))
            parser.error('--use-strand-information is set but the input BED file has no strand information.')

        # Boolean mask of the regions spanning fewer than args.min_bins bins ...
        too_short_regions = (regions.lengths / args.resolution) < args.min_bins
        # ... turned into the regions themselves
        too_short_regions = regions.ix[too_short_regions[too_short_regions].index]
        if len(too_short_regions) > 0:
            print('> {0} regions have their length shorter than {1} bins. Saving them to {2!r} as they won\'t be processed'
                  .format(len(too_short_regions), args.min_bins, configuration.too_short_regions_filename))
            too_short_regions.to_bed(configuration.too_short_regions_filename)

            regions = regions.ix[regions.index - too_short_regions.index]

        if args.max_bins:
            # Same mask-then-select approach for regions of args.max_bins bins or more
            too_long_regions = (regions.lengths / args.resolution) >= args.max_bins
            too_long_regions = regions.ix[too_long_regions[too_long_regions].index]

            if len(too_long_regions) > 0:
                print('> {0} regions have their length longer than {1} bins. '
                      'Saving them to {2!r} as they won\'t be processed due to --max-bins constraint'
                      .format(len(too_long_regions), args.max_bins, configuration.too_long_regions_filename))
                too_long_regions.to_bed(configuration.too_long_regions_filename)

                regions = regions.ix[regions.index - too_long_regions.index]

        print('> {0} regions remain'.format(len(regions)))
    else:
        regions = None

    if args.points_of_interest:
        print('> Reading points of interest')

        poi_file = args.points_of_interest
        try:
            # NOTE(review): unlike the as_bins_of fallback below, this call does not
            # pass account_for_strand_information -- confirm that is intended.
            poi = from_simple(poi_file, regions, resolution=configuration.resolution)
        except ValueError:
            # Fall back to reading the file as BED and binning it against the regions
            poi = Regions.from_bed(poi_file)
            poi = poi.as_bins_of(regions, resolution=configuration.resolution,
                                 ignore_non_overlaps=args.ignore_poi_non_overlaps,
                                 account_for_strand_information=configuration.use_strand_information)

        if not poi:
            raise Exception(
                'POI file provided, but no POIs were parsed from {}. Try using dgw-overlaps2poi'.format(
                    poi_file))
    else:
        poi = None

    print('> Reading dataset ...')
    dataset, missing_regions, filtered_regions = read_datasets(args, regions)

    if args.datasets:

        if poi:
            dataset.add_points_of_interest(poi, name=args.points_of_interest)

            if args.ignore_no_poi_regions:
                poi_dataset = dataset.drop_no_pois()

                if len(poi_dataset) != len(dataset):
                    dropped_regions = regions.ix[dataset.items - poi_dataset.items]
                    print('> {0} regions were removed as they have no POI data with them '
                          'and --ignore-no-poi-regions was set'.format(len(dropped_regions)))
                    print('> Saving them to {0!r}'.format(configuration.no_poi_regions_filename))
                    dropped_regions.to_bed(configuration.no_poi_regions_filename)
                    dataset = poi_dataset
                    del dropped_regions
                del poi_dataset

        if len(missing_regions) > 0:
            print("> {0} regions were not found in the dataset, they were saved to {1}".format(
                len(missing_regions), configuration.missing_regions_filename))
            regions.ix[missing_regions].to_bed(configuration.missing_regions_filename, track_title='DGWMissingRegions',
                                               track_description='Regions that are in input, but missing from the dataset')

        if len(filtered_regions) > 0:
            print("> {0} regions were filtered out from dataset due to --min-pileup constraint, they were saved to {1}".format(
                len(filtered_regions), configuration.filtered_regions_filename))
            regions.ix[filtered_regions].to_bed(configuration.filtered_regions_filename, track_title='DGWFilteredRegions',
                                                track_description='Regions that were filtered out from the dataset')

        # Get remaining regions
        regions = regions.ix[dataset.items]
        if len(missing_regions) > 0 or len(filtered_regions) > 0:
            print('> {0} regions remaining and will be processed'.format(len(regions)))

        if args.output_raw_dataset:
            print('> Serialising raw dataset to {0}'.format(configuration.raw_dataset_filename))
            serialise(dataset, configuration.raw_dataset_filename)

        dataset = dataset.to_log_scale()

        if args.normalise_pileups:
            print('> Dividing the number of reads in each bin by the maximum number of reads per region as --normalise-pileups is set')
            dataset = dataset.normalise_bin_heights()

        # Re-check for regions that are no longer in the dataset after the transformations above
        missing_regions = regions.index - dataset.items

        if len(missing_regions) > 0:
            print("> {0} regions were not found in the dataset, they were saved to {1}".format(
                len(missing_regions), configuration.missing_regions_filename))
            regions.ix[missing_regions].to_bed(configuration.missing_regions_filename, track_title='DGWMissingRegions',
                                               track_description='Regions that are in input, but missing from the dataset')
    else:
        print("> Not converting dataset to log scale as processed dataset already provided")

    # --- Serialise the regions as they will be needed in explorer ----------
    if regions is not None:
        print('> Serialising regions to {0}'.format(configuration.parsed_regions_filename))
        serialise(regions, configuration.parsed_regions_filename)

    # --- Saving of dataset -------------------
    print('> Saving dataset to {0}'.format(configuration.dataset_filename))
    serialise(dataset, configuration.dataset_filename)

    if not args.blank:
        # --- actual work ---------------------------
        print('> Calculating pairwise distances (this might take a while) ...')
        if args.n_processes is not None:
            print('> Using {0} processes'.format(args.n_processes))
        else:
            args.n_processes = cpu_count()
            print('> Using all available cpu cores ({0})'.format(args.n_processes))

        if args.no_dtw:
            print('> Not using DTW as --no-dtw option is set')

        logging.debug('Running DTW with the following kwargs: {0!r}'.format(configuration.dtw_kwargs))
        start = datetime.now()
        dm = parallel_pdist(dataset, args.n_processes, **configuration.dtw_kwargs)
        end = datetime.now()

        delta = end - start
        print('> Pairwise distances calculation took {0} s'.format(delta.total_seconds()))

        if args.random_sample:
            # Scale the measured time by the ratio of pair counts (n choose 2)
            multiplier = binomial_coefficent(total_regions, 2) / float(binomial_coefficent(args.random_sample, 2))
            print('> Expected calculation duration if random-sample was not used: {0} s'
                  .format(delta.total_seconds() * multiplier))

        # --- Saving of the work --------------
        if configuration.pairwise_distances_filename:
            print('> Saving the pairwise distance matrix to {0!r}'.format(configuration.pairwise_distances_filename))
            np.save(configuration.pairwise_distances_filename, dm)

        # Linkage matrix
        print('> Computing linkage matrix')
        linkage = fastcluster.complete(dm)

        print('> Saving linkage matrix to {0!r}'.format(configuration.linkage_filename))
        np.save(configuration.linkage_filename, linkage)

        print('> Computing prototypes')
        # Hierarchical clustering object to compute the prototypes
        hc = HierarchicalClustering(dataset, regions, linkage, dtw_function=configuration.dtw_function,
                                    prototyping_method=configuration.prototyping_method)
        prototypes = hc.extract_prototypes()
        print('> Saving prototypes to {0!r}'.format(configuration.prototypes_filename))
        serialise(prototypes, configuration.prototypes_filename)

        print('> Computing warping paths')
        nodes = hc.tree_nodes_list
        paths = compute_paths(dataset, nodes, hc.num_obs, n_processes=args.n_processes,
                              **configuration.dtw_kwargs)
        print('> Saving warping paths to {0!r}'.format(configuration.warping_paths_filename))
        serialise(paths, configuration.warping_paths_filename)
    else:
        print('> Skipping pairwise distances step because of --blank option set')

    print('> Saving configuration to {0!r}'.format(configuration.configuration_filename))
    # The context manager closes the file even if to_json raises
    with open(configuration.configuration_filename, 'w') as f:
        configuration.to_json(f)

    print('> Done')
Example #16
0
 def test_slicing_works(self):
     """Check that head() and slicing return Regions objects."""
     regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'], 'start': [100, 200], 'end': [117, 220]}))
     # assertIsInstance reports the offending type on failure
     self.assertIsInstance(regions.head(1), Regions)
     self.assertIsInstance(regions[:1], Regions)
Example #17
0
    def test_clipping_keeps_the_same_class(self):
        """Check that clip_to_resolution returns a Regions object."""
        regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'], 'start': [100, 200], 'end': [117, 220]}))
        clipped_regions = regions.clip_to_resolution(5)
        # assertIsInstance reports the offending type on failure
        self.assertIsInstance(clipped_regions, Regions)
Example #18
0
def main():
    """Command-line entry point of the clustering explorer.

    Loads the configuration, regions and dataset of a previous run, attaches
    points of interest (from the command line, or those already stored in the
    dataset), builds the hierarchical clustering viewer and then either shows
    it interactively or, with ``--save-only``, writes the clustering figure,
    the clusters and their summaries to the output directory.

    :raises Exception: if a POI file yields no points of interest, if more POI
        files are given than there are highlight colours, or if ``--save-only``
        is used without a cut / number of clusters or without an output directory
    """
    parser = argument_parser()
    args = parser.parse_args()

    if args.verbose:
        import logging
        logging.root.setLevel(logging.DEBUG)

    configuration = load_configuration_from_file(args.configuration_file)
    args.configuration_file.close()

    if configuration.blank:
        parser.error('Cannot explore a --blank run of DGW')

    regions = configuration.load_regions()
    dataset = configuration.load_dataset()

    # Borrowed from colorbrewer's Dark2 color palette
    standard_highlight_colours = ["#d95f02", "#e7298a"]
    # Colours are popped off the palette as they are handed out, so remember its
    # original size for the error message (len() of the emptied list would be 0).
    max_poi_files = len(standard_highlight_colours)
    highlight_colours = {}
    if args.points_of_interest:
        dataset.reset_poi()
        for poi_file in args.points_of_interest:
            print('> Reading points of interest from {0!r}'.format(poi_file))

            try:
                poi = from_simple(poi_file, regions, resolution=configuration.resolution,
                                  account_for_strand_information=configuration.use_strand_information)
            except ValueError:
                # Fall back to reading the file as BED and binning it against the regions
                poi = Regions.from_bed(poi_file)
                poi = poi.as_bins_of(regions, resolution=configuration.resolution,
                                     ignore_non_overlaps=args.ignore_poi_non_overlaps,
                                     account_for_strand_information=configuration.use_strand_information)

            if not poi:
                raise Exception('POI file provided, but no POIs were parsed from {}. Try using dgw-overlaps2poi'.format(poi_file))

            poi_filename = os.path.basename(poi_file)
            dataset.add_points_of_interest(poi, name=poi_filename)
            try:
                highlight_colours[poi_filename] = standard_highlight_colours.pop()
            except IndexError:
                raise Exception("Sorry, only up to {0} POI regions are supported".format(max_poi_files))
    else:
        if args.no_poi:
            dataset.reset_poi()

        if dataset.points_of_interest:
            # Highlight the POI set named by the first key of the first stored entry
            highlight_colours[dataset.points_of_interest.values()[0].keys()[0]] = standard_highlight_colours.pop()

    hc = configuration.create_hierarchical_clustering_object(regions=regions, dataset=dataset)
    configuration_basename = os.path.basename(args.configuration_file.name)

    # An explicit cut distance takes precedence over a requested number of clusters
    cut_xdata = 0
    have_cut = False
    if args.cut:
        cut_xdata = args.cut
        have_cut = True
    elif args.n_clusters:
        cut_xdata = hc.distance_threshold_for_n_clusters(args.n_clusters)
        have_cut = True

    hcv = dgw.cluster.visualisation.HierarchicalClusteringViewer(hc, output_directory=args.output,
                                                                 configuration_file=configuration_basename,
                                                                 highlight_colours=highlight_colours,
                                                                 cut_xdata=cut_xdata)
    if not args.save_only:
        print("> Displaying explorer")
        hcv.show()
    else:
        if not have_cut:
            raise Exception('Please use specify either the cut distance, or number of clusters when using --save-only')
        output_directory = args.output
        if not output_directory:
            raise Exception('Please specify output directory where the files should be saved')

        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)

        print('> Saving to {}'.format(output_directory))
        hcv.savefig(os.path.join(output_directory, 'clustering.pdf'))

        cluster_previewer = hcv.cluster_previewer()
        cluster_previewer.save_clusters(output_directory)

        print('> Saving summaries')
        cluster_previewer.save_previewer_windows(os.path.join(output_directory, 'summaries'))
Example #19
0
    def test_named_indices(self):
        """Check that as_bins_of keys its result by string index labels.

        Every case is binned at resolution 5 against two parents labelled
        ``foo`` (chr1:100-160) and ``bar`` (chr10:200-220).
        """
        parent_regions = Regions(
            pd.DataFrame(
                {'chromosome': ['chr1', 'chr10'], 'start': [100, 200], 'end': [160, 220]},
                index=['foo', 'bar']))

        # Each case: (columns, index labels, expected bins per label)
        cases = [
            # Edges
            ({'chromosome': ['chr1', 'chr10'], 'start': [100, 215], 'end': [110, 220]},
             ['foo', 'bar'],
             {'foo': np.array([0, 1]), 'bar': np.array([3])}),
            # Middle
            ({'chromosome': ['chr1', 'chr10'], 'start': [140, 208], 'end': [145, 212]},
             ['foo', 'bar'],
             {'foo': np.array([8]), 'bar': np.array([1, 2])}),
            # Middle, without one index
            ({'chromosome': ['chr10'], 'start': [208], 'end': [212]},
             ['bar'],
             {'bar': np.array([1, 2])}),
            # A region that is a single base pair long
            ({'chromosome': ['chr1'], 'start': [140], 'end': [141]},
             ['foo'],
             {'foo': np.array([8])}),
            # 'baz' has no counterpart among the parent labels and is expected to be left out
            ({'chromosome': ['chr1', 'chr2'], 'start': [140, 200], 'end': [141, 300]},
             ['foo', 'baz'],
             {'foo': np.array([8])}),
        ]

        for columns, labels, correct_ans in cases:
            poi_regions = Regions(pd.DataFrame(columns, index=labels))
            self.assert_numpy_dicts_equal(
                correct_ans, poi_regions.as_bins_of(parent_regions, resolution=5))
Example #20
0
    def test_bins_calculated_correctly(self):
        """Check the bins returned by as_bins_of at resolution 5 for integer-indexed regions.

        The parents are chr1:100-160 (index 0) and chr10:200-220 (index 1).
        """
        parent_regions = Regions(
            pd.DataFrame({'chromosome': ['chr1', 'chr10'], 'start': [100, 200], 'end': [160, 220]}))

        # Each case: (columns, explicit index or None for the default one, expected bins)
        cases = [
            # Edges
            ({'chromosome': ['chr1', 'chr10'], 'start': [100, 215], 'end': [110, 220]},
             None,
             {0: np.array([0, 1]), 1: np.array([3])}),
            # Middle
            ({'chromosome': ['chr1', 'chr10'], 'start': [140, 208], 'end': [145, 212]},
             None,
             {0: np.array([8]), 1: np.array([1, 2])}),
            # Middle, without one index
            ({'chromosome': ['chr10'], 'start': [208], 'end': [212]},
             [1],
             {1: np.array([1, 2])}),
            # A region that is a single base pair long
            ({'chromosome': ['chr1'], 'start': [140], 'end': [141]},
             None,
             {0: np.array([8])}),
        ]

        for columns, index, correct_ans in cases:
            poi_regions = Regions(pd.DataFrame(columns, index=index))
            self.assert_numpy_dicts_equal(
                correct_ans, poi_regions.as_bins_of(parent_regions, resolution=5))
Example #21
0
def main():
    """Command-line entry point of the DGW worker.

    The function runs the following stages in order:

    1. Parse the arguments, reject inconsistent combinations through
       ``parser.error`` and fill in defaults for the metric and the
       prototyping method.
    2. If ``--regions`` was given, read the regions and set aside (saving
       them to BED files) those shorter than ``args.min_bins`` bins or,
       when ``args.max_bins`` is set, at least that many bins long.
    3. If points of interest were given, parse them (simple format first,
       BED as a fallback).
    4. Read the dataset; when raw datasets were given, attach the POIs,
       report missing / filtered regions, convert to log scale and
       optionally normalise.
    5. Serialise the regions and the dataset.
    6. Unless ``--blank`` is set, compute and save the pairwise distances,
       the linkage matrix, the prototypes and the warping paths.
    7. Save the configuration as JSON.

    Raises:
        Exception: if a POI file was provided but no POIs were parsed
            from it.
    """
    # --- Argument parsing -----------------------
    parser = argument_parser()

    args = parser.parse_args()
    if args.datasets and args.processed_dataset:
        parser.error(
            'Must specify either --dataset or --processed_dataset only.')
    elif not args.processed_dataset:
        if not args.regions or not args.datasets:
            parser.error('Must specify both --regions and --dataset')

    if args.metric is None:
        if args.processed_dataset:
            parser.error('Must provide a metric if using processed dataset')
        elif len(args.datasets) >= 2:
            # The condition is "two or more", so word the message to match.
            print("> Defaulting to cosine distance as more than one dataset given")
            args.metric = 'cosine'
        else:
            print("> Defaulting to sqeuclidean distance as only one dataset given")
            args.metric = 'sqeuclidean'
    elif args.metric == 'cosine':
        if args.datasets and len(args.datasets) < 2:
            parser.error(
                'Cannot use cosine distance with just one dataset. Choose sqeuclidean or euclidean instead.'
            )

    if args.no_dtw:
        # That's what no-dtw actually does
        args.slanted_band = 0
        args.scale = True
        if args.prototyping_method is None:
            args.prototyping_method = 'mean'
    else:
        if args.prototyping_method is None:
            args.prototyping_method = 'standard'

    if args.verbose:
        logging.root.setLevel(logging.DEBUG)

    # Disable trying to reverse regions if strand information given
    if args.use_strand_information:
        args.no_reverse = True

    configuration = Configuration(args)

    # --- pre-processing ------------------------
    if args.regions:
        print('> Reading regions from {0!r} ....'.format(args.regions))
        regions, total_regions, used_regions = read_regions(
            args.regions, args.random_sample, args.resolution)
        if args.use_strand_information and not regions.has_strand_data():
            logging.debug('Parsed columns: {0}'.format(regions.columns))
            parser.error(
                '--use-strand-information is set but the input BED file has no strand information.'
            )

        # Boolean mask of regions spanning fewer than args.min_bins bins
        too_short_regions = (regions.lengths / args.resolution
                             ) < args.min_bins
        too_short_regions = regions.ix[
            too_short_regions[too_short_regions].index]
        if len(too_short_regions) > 0:
            print(
                '> {0} regions have their length shorter than {1} bins. Saving them to {2!r} as they won\'t be processed'
                .format(len(too_short_regions), args.min_bins,
                        configuration.too_short_regions_filename))
            too_short_regions.to_bed(configuration.too_short_regions_filename)

            regions = regions.ix[regions.index - too_short_regions.index]

        if args.max_bins:
            too_long_regions = (regions.lengths /
                                args.resolution) >= args.max_bins
            too_long_regions = regions.ix[
                too_long_regions[too_long_regions].index]

            if len(too_long_regions) > 0:
                print(
                    '> {0} regions have their length longer than {1} bins. '
                    'Saving them to {2!r} as they won\'t be processed due to --max-bins constraint'
                    .format(len(too_long_regions), args.max_bins,
                            configuration.too_long_regions_filename))
                too_long_regions.to_bed(
                    configuration.too_long_regions_filename)

                regions = regions.ix[regions.index - too_long_regions.index]

        print('> {0} regions remain'.format(len(regions)))
    else:
        regions = None
        # Keep the name bound so the --random-sample estimate below can
        # detect that no region count is available.
        total_regions = None

    if args.points_of_interest:
        print('> Reading points of interest')

        poi_file = args.points_of_interest
        try:
            # Pass the strand setting here as well, so that both POI input
            # formats are handled the same way as in the BED fallback below.
            poi = from_simple(poi_file,
                              regions,
                              resolution=configuration.resolution,
                              account_for_strand_information=configuration.
                              use_strand_information)
        except ValueError:
            # Not in the simple format: fall back to treating it as BED.
            poi = Regions.from_bed(poi_file)
            poi = poi.as_bins_of(
                regions,
                resolution=configuration.resolution,
                ignore_non_overlaps=args.ignore_poi_non_overlaps,
                account_for_strand_information=configuration.
                use_strand_information)

        if not poi:
            raise Exception(
                'POI file provided, but no POIs were parsed from {}. Try using dgw-overlaps2poi'
                .format(poi_file))
    else:
        poi = None

    print('> Reading dataset ...')
    dataset, missing_regions, filtered_regions = read_datasets(args, regions)

    if args.datasets:

        if poi:
            dataset.add_points_of_interest(poi, name=args.points_of_interest)

            if args.ignore_no_poi_regions:
                poi_dataset = dataset.drop_no_pois()

                if len(poi_dataset) != len(dataset):
                    dropped_regions = regions.ix[dataset.items -
                                                 poi_dataset.items]
                    print(
                        '> {0} regions were removed as they have no POI data with them '
                        'and --ignore-no-poi-regions was set'.format(
                            len(dropped_regions)))
                    print('> Saving them to {0!r}'.format(
                        configuration.no_poi_regions_filename))
                    dropped_regions.to_bed(
                        configuration.no_poi_regions_filename)
                    dataset = poi_dataset
                    del dropped_regions
                del poi_dataset

        if len(missing_regions) > 0:
            print(
                "> {0} regions were not found in the dataset, they were saved to {1}"
                .format(len(missing_regions),
                        configuration.missing_regions_filename))
            regions.ix[missing_regions].to_bed(
                configuration.missing_regions_filename,
                track_title='DGWMissingRegions',
                track_description=
                'Regions that are in input, but missing from the dataset')

        if len(filtered_regions) > 0:
            print(
                "> {0} regions were filtered out from dataset due to --min-pileup constraint, they were saved to {1}"
                .format(len(filtered_regions),
                        configuration.filtered_regions_filename))
            regions.ix[filtered_regions].to_bed(
                configuration.filtered_regions_filename,
                track_title='DGWFilteredRegions',
                track_description=
                'Regions that were filtered out from the dataset')

        # Get remaining regions
        regions = regions.ix[dataset.items]
        if len(missing_regions) > 0 or len(filtered_regions) > 0:
            print('> {0} regions remaining and will be processed'.format(
                len(regions)))

        if args.output_raw_dataset:
            print('> Serialising raw dataset to {0}'.format(
                configuration.raw_dataset_filename))
            serialise(dataset, configuration.raw_dataset_filename)

        dataset = dataset.to_log_scale()

        if args.normalise_pileups:
            print('> Dividing the number of reads in each bin by the maximum number of reads per region as --normalise-pileups is set')
            dataset = dataset.normalise_bin_heights()

        # Re-check after the transformations above
        missing_regions = regions.index - dataset.items

        if len(missing_regions) > 0:
            print(
                "> {0} regions were not found in the dataset, they were saved to {1}"
                .format(len(missing_regions),
                        configuration.missing_regions_filename))
            regions.ix[missing_regions].to_bed(
                configuration.missing_regions_filename,
                track_title='DGWMissingRegions',
                track_description=
                'Regions that are in input, but missing from the dataset')
    else:
        print("> Not converting dataset to log scale as processed dataset already provided")

    # --- Serialise the regions as they will be needed in explorer ----------
    if regions is not None:
        print('> Serialising regions to {0}'.format(
            configuration.parsed_regions_filename))
        serialise(regions, configuration.parsed_regions_filename)

    # --- Saving of dataset -------------------
    print('> Saving dataset to {0}'.format(configuration.dataset_filename))
    serialise(dataset, configuration.dataset_filename)

    if not args.blank:
        # --- actual work ---------------------------
        print('> Calculating pairwise distances (this might take a while) ...')
        if args.n_processes is not None:
            print('> Using {0} processes'.format(args.n_processes))
        else:
            args.n_processes = cpu_count()
            print('> Using all available cpu cores ({0})'.format(
                args.n_processes))

        if args.no_dtw:
            print('> Not using DTW as --no-dtw option is set')

        logging.debug('Running DTW with the following kwargs: {0!r}'.format(
            configuration.dtw_kwargs))
        start = datetime.now()
        dm = parallel_pdist(dataset, args.n_processes,
                            **configuration.dtw_kwargs)
        end = datetime.now()

        delta = end - start
        print('> Pairwise distances calculation took {0} s'.format(
            delta.total_seconds()))

        # The estimate needs the total region count, which only exists when
        # regions were read; skip it otherwise instead of failing before
        # the results are saved.
        if args.random_sample and total_regions is not None:
            # Ratio of the number of pairs with and without sampling
            multiplier = binomial_coefficent(total_regions, 2) / float(
                binomial_coefficent(args.random_sample, 2))
            print(
                '> Expected calculation duration if random-sample was not used: {0} s'
                .format(delta.total_seconds() * multiplier))

        # --- Saving of the work --------------
        if configuration.pairwise_distances_filename:
            print('> Saving the pairwise distance matrix to {0!r}'.format(
                configuration.pairwise_distances_filename))
            np.save(configuration.pairwise_distances_filename, dm)

        # Linkage matrix
        print('> Computing linkage matrix')
        linkage = fastcluster.complete(dm)

        print('> Saving linkage matrix to {0!r}'.format(
            configuration.linkage_filename))
        np.save(configuration.linkage_filename, linkage)

        print('> Computing prototypes')
        # Hierarchical clustering object to compute the prototypes
        hc = HierarchicalClustering(
            dataset,
            regions,
            linkage,
            dtw_function=configuration.dtw_function,
            prototyping_method=configuration.prototyping_method)
        prototypes = hc.extract_prototypes()
        print('> Saving prototypes to {0!r}'.format(
            configuration.prototypes_filename))
        serialise(prototypes, configuration.prototypes_filename)

        print('> Computing warping paths')
        nodes = hc.tree_nodes_list
        paths = compute_paths(dataset,
                              nodes,
                              hc.num_obs,
                              n_processes=args.n_processes,
                              **configuration.dtw_kwargs)
        print('> Saving warping paths to {0!r}'.format(
            configuration.warping_paths_filename))
        serialise(paths, configuration.warping_paths_filename)
    else:
        print('> Skipping pairwise distances step because of --blank option set')

    print('> Saving configuration to {0!r}'.format(
        configuration.configuration_filename))
    # The context manager closes the file even if serialisation fails.
    with open(configuration.configuration_filename, 'w') as f:
        configuration.to_json(f)

    print('> Done')
Example #22
0
def main():
    """Command-line entry point of the DGW explorer.

    Loads a saved configuration together with its regions and dataset,
    optionally replaces or resets the dataset's points of interest, builds
    the hierarchical clustering viewer and then either shows it or, with
    ``--save-only``, writes the clustering figure, the clusters and the
    summaries into the output directory.

    Raises:
        Exception: if a POI file yields no POIs, if more POI files are
            given than there are highlight colours, or if ``--save-only``
            is used without a cut (``args.cut`` / ``args.n_clusters``) or
            without an output directory.
    """
    parser = argument_parser()
    args = parser.parse_args()

    if args.verbose:
        import logging
        logging.root.setLevel(logging.DEBUG)

    configuration = load_configuration_from_file(args.configuration_file)
    args.configuration_file.close()

    if configuration.blank:
        parser.error('Cannot explore a --blank run of DGW')

    regions = configuration.load_regions()
    dataset = configuration.load_dataset()

    # Borrowed from colorbrewer's Dark2 color palette
    standard_highlight_colours = ["#d95f02", "#e7298a"]
    # Remember the limit now: the list is consumed by pop() below, so its
    # length at error time would always read 0.
    max_poi_sets = len(standard_highlight_colours)
    highlight_colours = {}
    if args.points_of_interest:
        dataset.reset_poi()
        for i, poi_file in enumerate(args.points_of_interest):
            print('> Reading points of interest from {0!r}'.format(poi_file))

            try:
                poi = from_simple(poi_file,
                                  regions,
                                  resolution=configuration.resolution,
                                  account_for_strand_information=configuration.
                                  use_strand_information)
            except ValueError:
                # Not in the simple format: fall back to treating it as BED.
                poi = Regions.from_bed(poi_file)
                poi = poi.as_bins_of(
                    regions,
                    resolution=configuration.resolution,
                    ignore_non_overlaps=args.ignore_poi_non_overlaps,
                    account_for_strand_information=configuration.
                    use_strand_information)

            if not poi:
                raise Exception(
                    'POI file provided, but no POIs were parsed from {}. Try using dgw-overlaps2poi'
                    .format(poi_file))

            poi_filename = os.path.basename(poi_file)
            dataset.add_points_of_interest(poi, name=poi_filename)
            try:
                highlight_colours[
                    poi_filename] = standard_highlight_colours.pop()
            except IndexError:
                raise Exception(
                    "Sorry, only up to {0} POI regions are supported".format(
                        max_poi_sets))
    else:
        if args.no_poi:
            dataset.reset_poi()

        if dataset.points_of_interest:
            # Use the first key of the first POI entry as the highlight
            # name; list() keeps this working when values()/keys() return
            # views rather than lists.
            first_entry = list(dataset.points_of_interest.values())[0]
            highlight_colours[list(first_entry.keys())
                              [0]] = standard_highlight_colours.pop()

    hc = configuration.create_hierarchical_clustering_object(regions=regions,
                                                             dataset=dataset)
    configuration_basename = os.path.basename(args.configuration_file.name)

    # Work out where to cut the dendrogram, if the user asked for a cut.
    cut_xdata = 0
    have_cut = False
    if args.cut:
        cut_xdata = args.cut
        have_cut = True
    elif args.n_clusters:
        cut_xdata = hc.distance_threshold_for_n_clusters(args.n_clusters)
        have_cut = True

    hcv = dgw.cluster.visualisation.HierarchicalClusteringViewer(
        hc,
        output_directory=args.output,
        configuration_file=configuration_basename,
        highlight_colours=highlight_colours,
        cut_xdata=cut_xdata)
    if not args.save_only:
        print("> Displaying explorer")
        hcv.show()
    else:
        if not have_cut:
            raise Exception(
                'Please use specify either the cut distance, or number of clusters when using --save-only'
            )
        output_directory = args.output
        if not output_directory:
            raise Exception(
                'Please specify output directory where the files should be saved'
            )

        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)

        print('> Saving to {}'.format(output_directory))
        hcv.savefig(os.path.join(output_directory, 'clustering.pdf'))

        cluster_previewer = hcv.cluster_previewer()
        cluster_previewer.save_clusters(output_directory)

        print('> Saving summaries')
        cluster_previewer.save_previewer_windows(
            os.path.join(output_directory, 'summaries'))