def run_scenario(size: int, scale: float, genome_size: int, alpha: float, ultrametric: bool) -> Result:
    """Simulate one scenario: grow a Yule tree, fill leaf genomes, and count
    gene-block occurrences via a suffix tree over all leaf genomes.

    :param size: number of leaves the Yule tree must end up with.
    :param scale: branch-length scale passed to the tree generator (also the
        expected branch length logged against the observed statistics).
    :param genome_size: number of genes per genome.
    :param alpha: jump parameter forwarded to the GenomeMaker.
    :param ultrametric: whether the generated tree should be ultrametric.
    :return: a Result bundling the model tree, run parameters, jump totals,
        the RNG seed and the occurrence counts.
    """
    with time_func("Seeding numpy random"):
        # Wall-clock seed is recorded in the Result so the run is reproducible.
        random_seed = int(time.time())
        numpy.random.seed(random_seed)
        genome_maker = GenomeMaker(random_seed, alpha)

    with time_func("Constructing the Yule tree"):
        tree = YuleTreeGenerator(
            size=size, scale=scale, seed=random_seed).construct(ultrametric)

    with time_func("Get branch statistics"):
        branch_stats = tree.root.branch_len_stats()
    logging.info(
        "Branch count: %s avg: %s median: %s expected: %s",
        branch_stats.count, branch_stats.average, branch_stats.median, scale)

    jump_counts = []
    with time_func(f"Filling genome, size: {genome_size}"):
        fill_genome(
            tree.root, genome_size=genome_size, maker=genome_maker,
            total_jumped=jump_counts)
    assert len(tree.leaves) == size

    newick = tree.root.to_newick()
    # Internal branch count: one per ')' in the newick string, minus the root.
    internal_branches = newick.count(')') - 1
    model_tree = TreeDesc(newick, internal_branches, branch_stats)

    leaf_genomes = [leaf.genome.genes for leaf in tree.leaves]
    suffix_tree = STree(leaf_genomes)
    with time_func("Counting occurrences"):
        occurrences = suffix_tree.occurrences()

    mean_jumps = statistics.mean(jump_counts) if jump_counts else 0
    return Result(
        model_tree, genome_size, scale, size, sum(jump_counts), mean_jumps,
        alpha, random_seed, occurrences)
def _read_real_data(
        data_dir: Path, name_key: str = "Cog",
        field_names: Tuple[str, ...] = (
            "Taxid", "Gene name", "Contig", "Srnd", "Start", "Stop",
            "Length", "Cog")) -> Occurrences:
    """Parse every CSV genome file in *data_dir* and count gene occurrences.

    Each file is read with csv.DictReader using *field_names* (the file's own
    header row is skipped). Gene names (column *name_key*) are interned into
    consecutive integer ids shared across all genomes, and a suffix tree over
    the integer genomes produces the occurrence counts.

    :param data_dir: directory whose files are each one genome CSV.
    :param name_key: column holding the gene name to intern.
    :param field_names: column names for the DictReader.
    :return: occurrence counts from the suffix tree.
    """
    names = {}
    genomes = []
    sizes = []
    for file_ in data_dir.iterdir():
        genome = []
        with file_.open("r") as csvfile:
            reader = csv.DictReader(csvfile, fieldnames=field_names)
            next(reader)  # Skip header
            for line in reader:
                name = line[name_key]
                if name not in names:
                    # Assign the next consecutive integer id to a new name.
                    names[name] = len(names)
                gene_id = names[name]
                genome += [gene_id]
        # BUG FIX: was `if genome is not None:`, which is always true since
        # `genome` is a list — empty genome files were appended, skewing the
        # size statistics (min would be 0) and the suffix tree input.
        if genome:
            logging.info(
                "Done parsing genome: %s genome size is: %d", file_,
                len(genome))
            sizes.append(len(genome))
            genomes.append(genome)
    with time_func(
            f"Constructing the suffix tree for {len(genomes)} genomes!"):
        suffix_tree = STree(genomes)
    with time_func(f"Counting occurrences for {len(genomes)} genomes!"):
        logging.info(
            "Smallest geome is: %d longest geome is: %d average genome is: %d median genome is: %d",
            min(sizes), max(sizes), statistics.mean(sizes),
            statistics.median(sizes))
        return suffix_tree.occurrences()
def main(data_path: str, output_path: str, edge_lengths: int):
    """Load jump distributions from *data_path* and plot their histogram
    into *output_path*.

    :param data_path: directory containing the distribution files (must exist).
    :param output_path: output directory for plots (created if missing).
    :param edge_lengths: number of lambda values, forwarded to PlotData.
    """
    data_path = Path(data_path).expanduser()
    output_path = Path(output_path).expanduser()
    sns.set()
    assert data_path.exists() and data_path.is_dir()
    output_path.mkdir(exist_ok=True)

    with time_func(f"Reading distributions from {data_path}"):
        distributions, _jumps = read_distributions(data_path)

    plot_data = PlotData(
        distributions=distributions, out_dir=output_path,
        lambdas=edge_lengths)
    with time_func("Plotting histogram"):
        # Island sizes 1..1023 inclusive.
        plot_distribution(plot_data, list(range(1, 1024)))
def run_single_job(
        pattern: str, leaf_count: int, scale: float, base_path: Path,
        alpha: float, genome_size: int, idx: int, ultrametric: bool):
    """Run one simulation scenario and persist its result as gzipped JSON.

    The output lands in ``base_path/<scale>/<uuid>_<pattern>.json.gz`` so
    concurrent jobs never collide on file names.

    :param pattern: non-empty suffix for the output file name.
    :param leaf_count: number of leaves for the simulated tree.
    :param scale: branch-length scale; also names the output subdirectory.
    :param base_path: root directory for results.
    :param alpha: jump parameter of the genome model.
    :param genome_size: genes per genome.
    :param idx: ordinal of this tree within the scenario (logging only).
    :param ultrametric: whether the generated tree is ultrametric.
    """
    assert pattern
    with time_func(f"Running tree: {idx} of scenario with {leaf_count} leaves, alpha: {alpha} and scale: {scale}"):
        scenario_result = run_scenario(
            leaf_count, scale, genome_size=genome_size, alpha=alpha,
            ultrametric=ultrametric)

    scale_dir = base_path / str(scale)
    scale_dir.mkdir(exist_ok=True)
    target = scale_dir / f"{uuid.uuid4()}_{pattern}"
    payload = scenario_result.to_json().encode()
    with gzip.open(str(target.with_suffix(".json.gz")), "w") as f_gz:
        f_gz.write(payload)
def merge_files(directory: Path, file_pattern: str, output: Path):
    """Merge the numbered image frames matching *file_pattern* in *directory*
    into a single animated GIF at *output*, then optimize it.

    Frame order is the numeric key embedded in each file name (exactly one
    integer per name, extracted with NUMBER_MATCHER).

    :param directory: existing directory to search for frames.
    :param file_pattern: glob pattern selecting the frame files.
    :param output: path of the GIF to write.
    """
    # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html#gif
    assert directory.is_dir()
    relevant = list(directory.glob(file_pattern))
    by_key = {}
    with time_func(f"Going over {len(relevant)} files"):
        for file in relevant:
            integers = NUMBER_MATCHER.findall(file.name)
            assert len(integers) == 1
            key = int(integers[0])
            by_key[key] = file
    # BUG FIX: was `len(by_key) / 10` (float). With a non-integral float step
    # `index % step == 0` only held at index 0, and an empty `by_key` raised
    # ZeroDivisionError. Use integer division with a floor of 1 so progress
    # is logged roughly every 10%.
    step = max(1, len(by_key) // 10)
    with time_func("Creating the GIFF"):
        with imageio.get_writer(output, mode='I') as writer:
            # Frames are appended in ascending numeric-key order.
            for index, (_, filename) in enumerate(sorted(by_key.items())):
                if index % step == 0:
                    logging.info(
                        "Progress %s percent done", (index // step) * 10)
                image = imageio.imread(filename)
                writer.append_data(image)
    with time_func(f"Optimizing giff: {output}"):
        optimize_giff(str(output.absolute()))
def plot_island_distribution(data: PlotData, island_size_: int, tmp_dir: Path):
    """Plot KDE distributions of island occurrences for one island size.

    Writes an intermediate CSV into *tmp_dir*, then renders two PNGs into
    ``data.out_dir``: one on the raw occurrence scale and one on the
    log-normalized scale.

    :param data: plot configuration (distributions, output dir, lambda count).
    :param island_size_: the island size to tabulate and plot.
    :param tmp_dir: scratch directory for the intermediate CSV.
    """
    csv_path = Path(tmp_dir, f"out_{island_size_}.csv")
    with time_func(f"Populating the CSV at {csv_path}"):
        populate_csv(csv_path, data.distributions, [island_size_])
    with time_func("Reading the CSV"):
        frame = pd.read_csv(csv_path)

    for normalize in (True, False):
        x_column = "ln_avg_occurr" if normalize else "avg_occurr"
        with time_func("Displaying the dataset:"):
            sns.displot(
                frame,
                x=x_column,
                hue="edge_length",
                kind="kde",
                palette=sns.color_palette("Paired", data.lambdas))
        title = f"island_size_{island_size_}"
        if normalize:
            title = "normalized_" + title
        plot_path = Path(data.out_dir, f"{title}.png")
        plt.title(title)
        plt.savefig(str(plot_path))
def main(config: str):
    """Process all data files named by the configuration and write the
    tabulated results out as CSVs.

    :param config: path (``~`` allowed) to the configuration file.
    """
    configuration = parse_configuration(Path(config).expanduser())
    configuration.validate()
    configuration.output_folder.mkdir(exist_ok=True)

    data_files = list(
        configuration.data_folder.glob(configuration.file_pattern))
    logging.info("Going over %s data files!", len(data_files))

    # Each file's results are accumulated into one shared table.
    tabulated = {}
    for data_file in data_files:
        process_file(data_file, tabulated)

    with time_func("Writing CSVs"):
        write_csvs(configuration, tabulated)
    logging.info("DONE :)")
def run_scenario(size: int, scale: float, neighborhood_size: int, genome_size: int, genome_maker: GenomeMaker) -> Result:
    """Simulate a Yule tree with genomes, reconstruct the tree from pairwise
    synteny distances via Phylip neighbor joining, and score the
    reconstruction against the model tree.

    :param size: number of leaves in the simulated tree.
    :param scale: branch-length scale for the tree generator.
    :param neighborhood_size: window size for the synteny distance.
    :param genome_size: genes per genome.
    :param genome_maker: genome model (also supplies the RNG seed).
    :return: a Result with both trees, the run parameters, the topological
        distance and the false-positive/negative estimators.
    """
    with time_func("Constructing the Yule tree"):
        res = YuleTreeGenerator(
            size=size, scale=scale, seed=genome_maker.seed).construct()
    with time_func("Get branch statistics"):
        branch_stats = res.root.branch_len_stats()
    logging.info(
        "Branch count: %s avg: %s median: %s expected: %s",
        branch_stats.count, branch_stats.average, branch_stats.median, scale)
    total_jumped = []
    with time_func(f"Filling genome, size: {genome_size}"):
        fill_genome(
            res.root, genome_size=genome_size, maker=genome_maker,
            total_jumped=total_jumped)
    assert len(res.leaves) == size

    leaves_matrix = {}

    def fill_leaves_matrix():
        # Full cartesian product of leaves, keyed by row index.
        for row, l1 in enumerate(res.leaves):
            for l2 in res.leaves:
                leaves_matrix.setdefault(row, []).append((l1, l2))

    with time_func("Filling leaves matrix"):
        fill_leaves_matrix()

    distance_matrix = {}

    def fill_distance_matrix():
        # Cache distances per unordered name pair: the matrix is symmetric.
        calculated = {}
        durations = []
        for leave_vector in leaves_matrix.values():
            for l1, l2 in leave_vector:
                key = tuple(sorted([l1.name, l2.name]))
                if key in calculated:
                    distance = calculated[key]
                else:
                    if l1.name == l2.name:
                        # Small optimization
                        distance = 0
                    else:
                        time_before_call = time.monotonic()
                        distance = calculate_synteny_distance(
                            l1.genome, l2.genome, neighborhood_size)
                        durations.append(time.monotonic() - time_before_call)
                    calculated[key] = distance
                distance_matrix.setdefault(l1.name, []).append(distance)
        # BUG FIX: guard the stats — with no timed calculations (e.g. a
        # single leaf) the original raised ValueError from max([]) and
        # ZeroDivisionError from total / size. Also renamed the local
        # `size` (shadowed the function parameter) to `count`.
        if durations:
            total = sum(durations)
            count = len(durations)
            logging.info(
                "Number of calculations: %s avg duration: %s max duration: %s total duration: %s",
                count, total / count, max(durations), total)

    with time_func("Filling distance matrix"):
        fill_distance_matrix()

    constructor = PhylipNeighborConstructor()
    with time_func("Runing Phylip Neighbor constructor"):
        orig, constructed = constructor.construct(res.root, distance_matrix)
    distance_calc = PhylipTreeDistCalculator()
    with time_func("Runing Phylip TreeDist"):
        distance_res = distance_calc.calc(orig, constructed)
    logging.debug("Original tree: ")
    logging.debug(orig)
    logging.debug("Constructed tree:")
    logging.debug(constructed)
    logging.debug("TreeDist result:")
    logging.debug(distance_res)

    # One internal branch per ')' in the newick string, minus the root.
    internal_branches_orig = len([c for c in orig if c == ')']) - 1
    internal_branches_constructed = len(
        [c for c in constructed if c == ')']) - 1
    # Topology-only distance must be a whole number.
    distance_without_len = distance_calc.calc(orig, constructed, False)
    assert distance_without_len // 1 == distance_without_len
    distance_without_len = int(distance_without_len)
    logging.debug("Distance without len: %s", distance_without_len)

    # Robinson-Foulds style accounting: shared internal edges between trees.
    common_edges = (
        (internal_branches_orig + internal_branches_constructed) -
        distance_without_len) / 2
    if internal_branches_orig == 0:
        fp = 1
    else:
        fp = (internal_branches_orig - common_edges) / internal_branches_orig
    if internal_branches_constructed == 0:
        fn = 1
    else:
        fn = (internal_branches_constructed - common_edges) / internal_branches_constructed
    logging.debug("False positive estimator: %s", fp)
    logging.debug("False negative estimator: %s", fn)

    model_tree = TreeDesc(orig, internal_branches_orig, branch_stats)
    constructed_res = NewickParser(constructed).parse()
    constructed_tree = TreeDesc(
        constructed, internal_branches_constructed,
        constructed_res.root.branch_len_stats())
    return Result(
        model_tree, constructed_tree, genome_size, neighborhood_size, scale,
        distance_without_len, fp, fn, distance_res)