def test_named_indices(self):
    parent_regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'],
                                           'start': [100, 200],
                                           'end': [160, 220]},
                                          index=['foo', 'bar']))

    # Edges
    edge_regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'],
                                         'start': [100, 215],
                                         'end': [110, 220]},
                                        index=['foo', 'bar']))
    correct_ans = {'foo': np.array([0, 1]), 'bar': np.array([3])}
    self.assert_numpy_dicts_equal(correct_ans,
                                  edge_regions.as_bins_of(parent_regions, resolution=5))

    # Middle
    mid_regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'],
                                        'start': [140, 208],
                                        'end': [145, 212]},
                                       index=['foo', 'bar']))
    correct_ans = {'foo': np.array([8]), 'bar': np.array([1, 2])}
    self.assert_numpy_dicts_equal(correct_ans,
                                  mid_regions.as_bins_of(parent_regions, resolution=5))

    # Middle, without one index
    mid_regions2 = Regions(pd.DataFrame({'chromosome': ['chr10'],
                                         'start': [208],
                                         'end': [212]},
                                        index=['bar']))
    correct_ans = {'bar': np.array([1, 2])}
    self.assert_numpy_dicts_equal(correct_ans,
                                  mid_regions2.as_bins_of(parent_regions, resolution=5))

    # A single-base-pair region maps to exactly one bin
    length_one_regions = Regions(pd.DataFrame({'chromosome': ['chr1'],
                                               'start': [140],
                                               'end': [141]},
                                              index=['foo']))
    correct_ans = {'foo': np.array([8])}
    self.assert_numpy_dicts_equal(correct_ans,
                                  length_one_regions.as_bins_of(parent_regions, resolution=5))

    # A region on a chromosome absent from the parent is omitted from the result
    regions_not_in_parent = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr2'],
                                                  'start': [140, 200],
                                                  'end': [141, 300]},
                                                 index=['foo', 'baz']))
    correct_ans = {'foo': np.array([8])}
    self.assert_numpy_dicts_equal(correct_ans,
                                  regions_not_in_parent.as_bins_of(parent_regions, resolution=5))
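# The tests above and below rely on assert_numpy_dicts_equal(), which is not
# shown in this section. A minimal sketch of the comparison it would have to
# perform (an assumption, not necessarily the suite's actual helper): same keys
# on both sides, and element-wise equal numpy arrays under each key.
def assert_numpy_dicts_equal(self, expected, actual):
    self.assertEqual(set(expected.keys()), set(actual.keys()))
    for key in expected:
        np.testing.assert_array_equal(expected[key], actual[key])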
def test_raises_value_error_on_mismatch(self):
    parent_regions = Regions(pd.DataFrame({'chromosome': ['chr1'],
                                           'start': [100],
                                           'end': [200]}))

    # Wrong chromosome
    poi_regions = Regions(pd.DataFrame({'chromosome': ['chr2'],
                                        'start': [120],
                                        'end': [180]}))
    self.assertRaises(ValueError, poi_regions.as_bins_of, parent_regions)

    # No overlap: entirely before the parent region
    poi_regions2 = Regions(pd.DataFrame({'chromosome': ['chr1'],
                                         'start': [20],
                                         'end': [80]}))
    self.assertRaises(ValueError, poi_regions2.as_bins_of, parent_regions)

    # No overlap: entirely after the parent region
    poi_regions3 = Regions(pd.DataFrame({'chromosome': ['chr1'],
                                         'start': [220],
                                         'end': [230]}))
    self.assertRaises(ValueError, poi_regions3.as_bins_of, parent_regions)
def test_clipping_always_greater_or_equal_than_0(self):
    regions = Regions(pd.DataFrame({'chromosome': ['chr1'],
                                    'start': [5],
                                    'end': [7]}))
    clipped_df = regions.clip_to_resolution(20)
    # The widened region would start below zero; it is clamped to [0, 20]
    self.assertEqual('chr1', clipped_df.ix[0]['chromosome'])
    self.assertEqual(0, clipped_df.ix[0]['start'])
    self.assertEqual(20, clipped_df.ix[0]['end'])
def test_slicing_works(self):
    regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'],
                                    'start': [100, 200],
                                    'end': [117, 220]}))
    self.assertTrue(isinstance(regions.head(1), Regions))
    self.assertTrue(isinstance(regions[:1], Regions))
def test_clipping_keeps_the_same_class(self):
    regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'],
                                    'start': [100, 200],
                                    'end': [117, 220]}))
    clipped_regions = regions.clip_to_resolution(5)
    self.assertTrue(isinstance(clipped_regions, Regions))
def test_clipping_when_only_one_bin_present(self):
    regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'],
                                    'start': [100, 200],
                                    'end': [117, 220]}))
    clipped_df = regions.clip_to_resolution(20)
    # [100, 117) is shorter than one 20 bp bin, so it is widened to [99, 119)
    self.assertEqual('chr1', clipped_df.ix[0]['chromosome'])
    self.assertEqual(99, clipped_df.ix[0]['start'])
    self.assertEqual(119, clipped_df.ix[0]['end'])
    # [200, 220) is already exactly one bin wide and is left untouched
    self.assertEqual('chr10', clipped_df.ix[1]['chromosome'])
    self.assertEqual(200, clipped_df.ix[1]['start'])
    self.assertEqual(220, clipped_df.ix[1]['end'])
def test_clipping_res_1(self):
    regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'],
                                    'start': [100, 200],
                                    'end': [117, 220]}))
    clipped_df = regions.clip_to_resolution(1)
    # At resolution 1 every region is already bin-aligned, so nothing changes
    self.assertEqual('chr1', clipped_df.ix[0]['chromosome'])
    self.assertEqual(100, clipped_df.ix[0]['start'])
    self.assertEqual(117, clipped_df.ix[0]['end'])
    self.assertEqual('chr10', clipped_df.ix[1]['chromosome'])
    self.assertEqual(200, clipped_df.ix[1]['start'])
    self.assertEqual(220, clipped_df.ix[1]['end'])
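# clip_to_resolution() itself is not shown in this section. The following is a
# sketch of per-region arithmetic that reproduces the expectations above; it is
# a reconstruction from the test data, not necessarily the real implementation:
def clip_bounds(start, end, resolution):
    # Widen the region to a whole number of bins, splitting the extension
    # between both ends (the odd base pair goes to the end)
    extension = -(end - start) % resolution
    start -= extension // 2
    end += extension - extension // 2
    # Genomic coordinates cannot be negative: clamp the start at 0 and
    # push the overshoot onto the end instead
    if start < 0:
        end -= start
        start = 0
    return start, end

# clip_bounds(100, 117, 20) == (99, 119); clip_bounds(5, 7, 20) == (0, 20)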
def main():
    parser = argument_parser()
    args = parser.parse_args()

    input_filename = args.input_filename
    poi_filename = args.poi_filename

    input_regions = Regions.from_bed(input_filename)
    poi_regions = Regions.from_bed(poi_filename)

    output_file = args.output
    for ix, region in input_regions.iterrows():
        pois_in_region = poi_regions.contained_within(region)
        if len(pois_in_region) == 0:
            continue
        output_file.write('{0}:{1}\n'.format(ix, pois_in_region.as_printable_list_of_pois()))

    output_file.close()
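# argument_parser() is defined elsewhere in this script. A sketch of what
# main() above appears to assume, with argument names inferred from the
# attribute accesses (the real parser may differ):
import argparse

def argument_parser():
    parser = argparse.ArgumentParser(description='Map points of interest (POIs) onto regions')
    parser.add_argument('input_filename', help='BED file listing the regions of interest')
    parser.add_argument('poi_filename', help='BED file listing the points of interest')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'), default='-',
                        help='Output file for the {region_index}:{POI list} lines '
                             '(defaults to standard output)')
    return parser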
def read_regions(regions_filename, random_sample, resolution):
    regions = Regions.from_bed(regions_filename)
    total_len = len(regions)
    print '> {0} regions of interest read'.format(total_len)
    regions = regions.clip_to_resolution(resolution)

    used_len = total_len
    if random_sample:
        print '> Using only a random sample of {0} regions from {1!r}'.format(random_sample,
                                                                              regions_filename)
        used_len = random_sample
        regions = regions.ix[random.sample(regions.index, random_sample)]

    return regions, total_len, used_len
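# A hypothetical call, assuming a BED file 'regions.bed' exists:
#
#     regions, total_len, used_len = read_regions('regions.bed',
#                                                 random_sample=1000, resolution=50)
#
# total_len is the number of regions parsed from the file; used_len equals
# total_len unless a random subsample of the requested size was taken.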
def test_bins_calculated_correctly(self):
    parent_regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'],
                                           'start': [100, 200],
                                           'end': [160, 220]}))

    # Edges
    edge_regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'],
                                         'start': [100, 215],
                                         'end': [110, 220]}))
    correct_ans = {0: np.array([0, 1]), 1: np.array([3])}
    self.assert_numpy_dicts_equal(correct_ans,
                                  edge_regions.as_bins_of(parent_regions, resolution=5))

    # Middle
    mid_regions = Regions(pd.DataFrame({'chromosome': ['chr1', 'chr10'],
                                        'start': [140, 208],
                                        'end': [145, 212]}))
    correct_ans = {0: np.array([8]), 1: np.array([1, 2])}
    self.assert_numpy_dicts_equal(correct_ans,
                                  mid_regions.as_bins_of(parent_regions, resolution=5))

    # Middle, without one index
    mid_regions2 = Regions(pd.DataFrame({'chromosome': ['chr10'],
                                         'start': [208],
                                         'end': [212]},
                                        index=[1]))
    correct_ans = {1: np.array([1, 2])}
    self.assert_numpy_dicts_equal(correct_ans,
                                  mid_regions2.as_bins_of(parent_regions, resolution=5))

    # A single-base-pair region maps to exactly one bin
    length_one_regions = Regions(pd.DataFrame({'chromosome': ['chr1'],
                                               'start': [140],
                                               'end': [141]}))
    correct_ans = {0: np.array([8])}
    self.assert_numpy_dicts_equal(correct_ans,
                                  length_one_regions.as_bins_of(parent_regions, resolution=5))
def main():
    # --- Argument parsing -----------------------
    parser = argument_parser()
    args = parser.parse_args()

    if args.datasets and args.processed_dataset:
        parser.error('Must specify either --dataset or --processed_dataset, but not both.')
    elif not args.processed_dataset:
        if not args.regions or not args.datasets:
            parser.error('Must specify both --regions and --dataset')

    if args.metric is None:
        if args.processed_dataset:
            parser.error('Must provide a metric if using processed dataset')
        elif len(args.datasets) >= 2:
            print "> Defaulting to cosine distance as more than one dataset was given"
            args.metric = 'cosine'
        else:
            print "> Defaulting to sqeuclidean distance as only one dataset was given"
            args.metric = 'sqeuclidean'
    elif args.metric == 'cosine':
        if args.datasets and len(args.datasets) < 2:
            parser.error('Cannot use cosine distance with just one dataset. '
                         'Choose sqeuclidean or euclidean instead.')

    if args.no_dtw:
        # That is what --no-dtw actually does
        args.slanted_band = 0
        args.scale = True
        if args.prototyping_method is None:
            args.prototyping_method = 'mean'
    else:
        if args.prototyping_method is None:
            args.prototyping_method = 'standard'

    if args.verbose:
        logging.root.setLevel(logging.DEBUG)

    # Disable trying to reverse regions if strand information is given
    if args.use_strand_information:
        args.no_reverse = True

    configuration = Configuration(args)

    # --- Pre-processing ------------------------
    if args.regions:
        print '> Reading regions from {0!r} ...'.format(args.regions)
        regions, total_regions, used_regions = read_regions(args.regions, args.random_sample,
                                                            args.resolution)
        if args.use_strand_information and not regions.has_strand_data():
            logging.debug('Parsed columns: {0}'.format(regions.columns))
            parser.error('--use-strand-information is set but the input BED file '
                         'has no strand information.')

        # Regions shorter than --min-bins bins are set aside and not processed
        too_short_regions = (regions.lengths / args.resolution) < args.min_bins
        too_short_regions = regions.ix[too_short_regions[too_short_regions].index]
        if len(too_short_regions) > 0:
            print '> {0} regions have their length shorter than {1} bins. ' \
                  'Saving them to {2!r} as they won\'t be processed'\
                .format(len(too_short_regions), args.min_bins,
                        configuration.too_short_regions_filename)
            too_short_regions.to_bed(configuration.too_short_regions_filename)
            regions = regions.ix[regions.index - too_short_regions.index]

        if args.max_bins:
            too_long_regions = (regions.lengths / args.resolution) >= args.max_bins
            too_long_regions = regions.ix[too_long_regions[too_long_regions].index]
            if len(too_long_regions) > 0:
                print '> {0} regions have their length longer than {1} bins. ' \
                      'Saving them to {2!r} as they won\'t be processed due to --max-bins constraint'\
                    .format(len(too_long_regions), args.max_bins,
                            configuration.too_long_regions_filename)
                too_long_regions.to_bed(configuration.too_long_regions_filename)
                regions = regions.ix[regions.index - too_long_regions.index]

        print '> {0} regions remain'.format(len(regions))
    else:
        regions = None

    if args.points_of_interest:
        print '> Reading points of interest'
        poi_file = args.points_of_interest
        try:
            poi = from_simple(poi_file, regions, resolution=configuration.resolution)
        except ValueError:
            poi = Regions.from_bed(poi_file)
            poi = poi.as_bins_of(regions, resolution=configuration.resolution,
                                 ignore_non_overlaps=args.ignore_poi_non_overlaps,
                                 account_for_strand_information=configuration.use_strand_information)

        if not poi:
            raise Exception('POI file provided, but no POIs were parsed from {}. '
                            'Try using dgw-overlaps2poi'.format(poi_file))
    else:
        poi = None

    print '> Reading dataset ...'
    dataset, missing_regions, filtered_regions = read_datasets(args, regions)

    if args.datasets:
        if poi:
            dataset.add_points_of_interest(poi, name=args.points_of_interest)
            if args.ignore_no_poi_regions:
                poi_dataset = dataset.drop_no_pois()
                if len(poi_dataset) != len(dataset):
                    dropped_regions = regions.ix[dataset.items - poi_dataset.items]
                    print '> {0} regions were removed as they have no POI data associated with them ' \
                          'and --ignore-no-poi-regions was set'.format(len(dropped_regions))
                    print '> Saving them to {0!r}'.format(configuration.no_poi_regions_filename)
                    dropped_regions.to_bed(configuration.no_poi_regions_filename)
                    dataset = poi_dataset
                    del dropped_regions
                    del poi_dataset

        if len(missing_regions) > 0:
            print "> {0} regions were not found in the dataset, they were saved to {1}"\
                .format(len(missing_regions), configuration.missing_regions_filename)
            regions.ix[missing_regions].to_bed(configuration.missing_regions_filename,
                                               track_title='DGWMissingRegions',
                                               track_description='Regions that are in input, '
                                                                 'but missing from the dataset')

        if len(filtered_regions) > 0:
            print "> {0} regions were filtered out from the dataset due to --min-pileup constraint, " \
                  "they were saved to {1}".format(len(filtered_regions),
                                                  configuration.filtered_regions_filename)
            regions.ix[filtered_regions].to_bed(configuration.filtered_regions_filename,
                                                track_title='DGWFilteredRegions',
                                                track_description='Regions that were filtered out '
                                                                  'from the dataset')

        # Keep only the regions that survived into the dataset
        regions = regions.ix[dataset.items]
        if len(missing_regions) > 0 or len(filtered_regions) > 0:
            print '> {0} regions remaining and will be processed'.format(len(regions))

        if args.output_raw_dataset:
            print '> Serialising raw dataset to {0}'.format(configuration.raw_dataset_filename)
            serialise(dataset, configuration.raw_dataset_filename)

        dataset = dataset.to_log_scale()
        if args.normalise_pileups:
            print '> Dividing the number of reads in each bin by the maximum number of reads ' \
                  'per region as --normalise-pileups is set'
            dataset = dataset.normalise_bin_heights()

        missing_regions = regions.index - dataset.items
        if len(missing_regions) > 0:
            print "> {0} regions were not found in the dataset, they were saved to {1}"\
                .format(len(missing_regions), configuration.missing_regions_filename)
            regions.ix[missing_regions].to_bed(configuration.missing_regions_filename,
                                               track_title='DGWMissingRegions',
                                               track_description='Regions that are in input, '
                                                                 'but missing from the dataset')
    else:
        print "> Not converting dataset to log scale as processed dataset already provided"

    # --- Serialise the regions as they will be needed in the explorer ------
    if regions is not None:
        print '> Serialising regions to {0}'.format(configuration.parsed_regions_filename)
        serialise(regions, configuration.parsed_regions_filename)

    # --- Saving of dataset -------------------
    print '> Saving dataset to {0}'.format(configuration.dataset_filename)
    serialise(dataset, configuration.dataset_filename)

    if not args.blank:
        # --- Actual work ---------------------------
        print '> Calculating pairwise distances (this might take a while) ...'
        if args.n_processes is not None:
            print '> Using {0} processes'.format(args.n_processes)
        else:
            args.n_processes = cpu_count()
            print '> Using all available cpu cores ({0})'.format(args.n_processes)

        if args.no_dtw:
            print '> Not using DTW as --no-dtw option is set'

        logging.debug('Running DTW with the following kwargs: {0!r}'
                      .format(configuration.dtw_kwargs))
        start = datetime.now()
        dm = parallel_pdist(dataset, args.n_processes, **configuration.dtw_kwargs)
        end = datetime.now()

        delta = end - start
        print '> Pairwise distances calculation took {0} s'.format(delta.total_seconds())

        if args.random_sample:
            multiplier = binomial_coefficent(total_regions, 2) \
                / float(binomial_coefficent(args.random_sample, 2))
            print '> Expected calculation duration if --random-sample was not used: {0} s'\
                .format(delta.total_seconds() * multiplier)

        # --- Saving of the work --------------
        if configuration.pairwise_distances_filename:
            print '> Saving the pairwise distance matrix to {0!r}'\
                .format(configuration.pairwise_distances_filename)
            np.save(configuration.pairwise_distances_filename, dm)

        # Linkage matrix
        print '> Computing linkage matrix'
        linkage = fastcluster.complete(dm)
        print '> Saving linkage matrix to {0!r}'.format(configuration.linkage_filename)
        np.save(configuration.linkage_filename, linkage)

        print '> Computing prototypes'
        # Hierarchical clustering object to compute the prototypes
        hc = HierarchicalClustering(dataset, regions, linkage,
                                    dtw_function=configuration.dtw_function,
                                    prototyping_method=configuration.prototyping_method)
        prototypes = hc.extract_prototypes()
        print '> Saving prototypes to {0!r}'.format(configuration.prototypes_filename)
        serialise(prototypes, configuration.prototypes_filename)

        print '> Computing warping paths'
        nodes = hc.tree_nodes_list
        paths = compute_paths(dataset, nodes, hc.num_obs,
                              n_processes=args.n_processes, **configuration.dtw_kwargs)
        print '> Saving warping paths to {0!r}'.format(configuration.warping_paths_filename)
        serialise(paths, configuration.warping_paths_filename)
    else:
        print '> Skipping pairwise distances step because of --blank option set'

    print '> Saving configuration to {0!r}'.format(configuration.configuration_filename)
    f = open(configuration.configuration_filename, 'w')
    try:
        configuration.to_json(f)
    finally:
        f.close()

    print '> Done'
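# binomial_coefficent (spelling as called above) is imported from elsewhere in
# DGW. A minimal sketch of the n-choose-k count the runtime extrapolation
# relies on: with random sampling, pairwise DTW work scales as C(n, 2).
from math import factorial

def binomial_coefficent(n, k):
    # C(n, k) = n! / (k! * (n - k)!)
    return factorial(n) // (factorial(k) * factorial(n - k))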
def main():
    parser = argument_parser()
    args = parser.parse_args()

    if args.verbose:
        import logging
        logging.root.setLevel(logging.DEBUG)

    configuration = load_configuration_from_file(args.configuration_file)
    args.configuration_file.close()

    if configuration.blank:
        parser.error('Cannot explore a --blank run of DGW')

    regions = configuration.load_regions()
    dataset = configuration.load_dataset()

    # Borrowed from colorbrewer's Dark2 color palette
    standard_highlight_colours = ["#d95f02", "#e7298a"]
    max_poi_files = len(standard_highlight_colours)  # remember the size before pop() shrinks it
    highlight_colours = {}

    if args.points_of_interest:
        dataset.reset_poi()
        for i, poi_file in enumerate(args.points_of_interest):
            print '> Reading points of interest from {0!r}'.format(poi_file)
            try:
                poi = from_simple(poi_file, regions, resolution=configuration.resolution,
                                  account_for_strand_information=configuration.use_strand_information)
            except ValueError:
                poi = Regions.from_bed(poi_file)
                poi = poi.as_bins_of(regions, resolution=configuration.resolution,
                                     ignore_non_overlaps=args.ignore_poi_non_overlaps,
                                     account_for_strand_information=configuration.use_strand_information)

            if not poi:
                raise Exception('POI file provided, but no POIs were parsed from {}. '
                                'Try using dgw-overlaps2poi'.format(poi_file))

            poi_filename = os.path.basename(poi_file)
            dataset.add_points_of_interest(poi, name=poi_filename)
            try:
                highlight_colours[poi_filename] = standard_highlight_colours.pop()
            except IndexError:
                raise Exception("Sorry, only up to {0} POI files are supported".format(max_poi_files))
    else:
        if args.no_poi:
            dataset.reset_poi()
        if dataset.points_of_interest:
            highlight_colours[dataset.points_of_interest.values()[0].keys()[0]] = \
                standard_highlight_colours.pop()

    hc = configuration.create_hierarchical_clustering_object(regions=regions, dataset=dataset)
    configuration_basename = os.path.basename(args.configuration_file.name)

    cut_xdata = 0
    have_cut = False
    if args.cut:
        cut_xdata = args.cut
        have_cut = True
    elif args.n_clusters:
        cut_xdata = hc.distance_threshold_for_n_clusters(args.n_clusters)
        have_cut = True

    hcv = dgw.cluster.visualisation.HierarchicalClusteringViewer(
        hc, output_directory=args.output, configuration_file=configuration_basename,
        highlight_colours=highlight_colours, cut_xdata=cut_xdata)

    if not args.save_only:
        print "> Displaying explorer"
        hcv.show()
    else:
        if not have_cut:
            raise Exception('Please specify either the cut distance or the number of clusters '
                            'when using --save-only')
        output_directory = args.output
        if not output_directory:
            raise Exception('Please specify the output directory where the files should be saved')
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
        print('> Saving to {}'.format(output_directory))
        hcv.savefig(os.path.join(output_directory, 'clustering.pdf'))
        cluster_previewer = hcv.cluster_previewer()
        cluster_previewer.save_clusters(output_directory)
        print('> Saving summaries')
        cluster_previewer.save_previewer_windows(os.path.join(output_directory, 'summaries'))