Ejemplo n.º 1
0
def run_scenario(size: int, scale: float, genome_size: int, alpha: float, ultrametric: bool) -> Result:
    """Run one simulation scenario and return its measured Result.

    Seeds the RNG from the wall clock, grows a Yule tree, fills genomes along
    it, and counts gene occurrences over a suffix tree of the leaf genomes.
    """
    with time_func("Seeding numpy random"):
        random_seed = int(time.time())
        numpy.random.seed(random_seed)
        genome_maker = GenomeMaker(random_seed, alpha)

    with time_func("Constructing the Yule tree"):
        tree = YuleTreeGenerator(size=size, scale=scale, seed=random_seed).construct(ultrametric)
    with time_func("Get branch statistics"):
        branch_stats = tree.root.branch_len_stats()
    logging.info(
        "Branch count: %s avg: %s median: %s expected: %s", branch_stats.count,
        branch_stats.average, branch_stats.median, scale)
    jumps = []
    with time_func(f"Filling genome, size: {genome_size}"):
        fill_genome(tree.root, genome_size=genome_size, maker=genome_maker, total_jumped=jumps)

    assert len(tree.leaves) == size

    newick = tree.root.to_newick()
    # Internal branches = closing parens minus the one wrapping the root.
    internal_branches_orig = newick.count(')') - 1
    model_tree = TreeDesc(newick, internal_branches_orig, branch_stats)
    leaf_genomes = [leaf.genome.genes for leaf in tree.leaves]
    suffix_tree = STree(leaf_genomes)
    with time_func("Counting occurrences"):
        occurrences = suffix_tree.occurrences()
    mean_jumped = statistics.mean(jumps) if jumps else 0
    return Result(
        model_tree, genome_size, scale, size, sum(jumps), mean_jumped,
        alpha, random_seed, occurrences
    )
Ejemplo n.º 2
0
def _read_real_data(
    data_dir: Path,
    name_key: str = "Cog",
    field_names: Tuple[str, ...] = ("Taxid", "Gene name", "Contig", "Srnd", "Start",
                                    "Stop", "Length", "Cog")
) -> Occurrences:
    """Parse every CSV genome file in *data_dir* and count gene occurrences.

    Each file contributes one genome: the *name_key* column of every row is
    mapped into a shared dense integer-id space, and the occurrence counts
    are computed over a suffix tree built from all parsed genomes.

    :param data_dir: directory whose files are CSVs with *field_names* columns.
    :param name_key: column used as the gene identifier.
    :param field_names: CSV column names (the first row is assumed to be a
        header and is skipped).
    :return: the suffix tree's occurrence counts.
    """
    names = {}  # gene name -> dense integer id, shared across all files
    genomes = []
    sizes = []
    for file_ in data_dir.iterdir():
        genome = []
        with file_.open("r") as csvfile:
            reader = csv.DictReader(csvfile, fieldnames=field_names)
            next(reader, None)  # Skip header; default tolerates an empty file
            for line in reader:
                name = line[name_key]
                if name not in names:
                    names[name] = len(names)
                genome.append(names[name])
        # BUG FIX: was `if genome is not None`, which is always true for a
        # list — empty files used to be appended and skew the size stats.
        if genome:
            logging.info("Done parsing genome: %s genome size is: %d", file_,
                         len(genome))
            sizes.append(len(genome))
            genomes.append(genome)
    with time_func(
            f"Constructing the suffix tree for {len(genomes)} genomes!"):
        suffix_tree = STree(genomes)
    with time_func(f"Counting occurrences for {len(genomes)} genomes!"):
        logging.info(
            "Smallest geome is: %d longest geome is: %d average genome is: %d median genome is: %d",
            min(sizes), max(sizes), statistics.mean(sizes),
            statistics.median(sizes))
        return suffix_tree.occurrences()
Ejemplo n.º 3
0
def main(data_path: str, output_path: str, edge_lengths: int):
    """Read distributions from *data_path* and plot them into *output_path*."""
    source_dir = Path(data_path).expanduser()
    target_dir = Path(output_path).expanduser()
    sns.set()
    assert source_dir.exists() and source_dir.is_dir()
    target_dir.mkdir(exist_ok=True)
    with time_func(f"Reading distributions from {source_dir}"):
        dists, jumps = read_distributions(source_dir)
    plot_data = PlotData(distributions=dists,
                         out_dir=target_dir,
                         lambdas=edge_lengths)
    with time_func("Plotting histogram"):
        plot_distribution(plot_data, list(range(1, 1024)))
Ejemplo n.º 4
0
def run_single_job(
        pattern: str, leaf_count: int, scale: float, base_path: Path, alpha: float, genome_size: int, idx: int,
        ultrametric: bool):
    """Run one scenario and write its result as gzipped JSON under base_path/<scale>."""
    assert pattern
    with time_func(f"Running tree: {idx} of scenario with {leaf_count} leaves, alpha: {alpha} and scale: {scale}"):
        result = run_scenario(leaf_count, scale, genome_size=genome_size, alpha=alpha, ultrametric=ultrametric)
    target_dir = base_path / str(scale)
    target_dir.mkdir(exist_ok=True)
    out_file = target_dir / f"{uuid.uuid4()}_{pattern}"
    gz_path = str(out_file.with_suffix(".json.gz"))
    with gzip.open(gz_path, "w") as gz_handle:
        gz_handle.write(result.to_json().encode())
Ejemplo n.º 5
0
def merge_files(directory: Path, file_pattern: str, output: Path):
    """Merge numbered image files matching *file_pattern* into an animated GIF.

    Frames are ordered by the single integer embedded in each filename
    (extracted with NUMBER_MATCHER), then written with imageio and optimized.
    https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html#gif
    """
    assert directory.is_dir()
    relevant = list(directory.glob(file_pattern))
    by_key = {}
    with time_func(f"Going over {len(relevant)} files"):
        for file in relevant:
            integers = NUMBER_MATCHER.findall(file.name)
            assert len(integers) == 1
            key = int(integers[0])
            by_key[key] = file
    # BUG FIX: the original used float division (`len(by_key) / 10`), making
    # `index % step` a float-modulo progress check that almost never hits 0
    # (and < 10 files gave a fractional step). Use an integer step of >= 1.
    step = max(1, len(by_key) // 10)
    with time_func("Creating the GIFF"):
        with imageio.get_writer(output, mode='I') as writer:
            for index, (_, filename) in enumerate(sorted(by_key.items())):
                if index % step == 0:
                    logging.info("Progress %s percent done", (index // step) * 10)
                image = imageio.imread(filename)
                writer.append_data(image)
    with time_func(f"Optimizing giff: {output}"):
        optimize_giff(str(output.absolute()))
Ejemplo n.º 6
0
def plot_island_distribution(data: PlotData, island_size_: int, tmp_dir: Path):
    """Plot raw and log-normalized KDE distributions for one island size.

    Dumps the distributions to a temporary CSV, reloads it with pandas, and
    saves one PNG per variant into data.out_dir.
    """
    csv_out = Path(tmp_dir, f"out_{island_size_}.csv")
    with time_func(f"Populating the CSV at {csv_out}"):
        populate_csv(csv_out, data.distributions, [island_size_])
    with time_func("Reading the CSV"):
        frame = pd.read_csv(csv_out)
    for normalize in (True, False):
        x_column = "ln_avg_occurr" if normalize else "avg_occurr"
        with time_func("Displaying the dataset:"):
            sns.displot(
                frame,
                x=x_column,
                hue="edge_length",
                kind="kde",
                palette=sns.color_palette("Paired", data.lambdas))
        title = f"island_size_{island_size_}"
        if normalize:
            title = "normalized_" + title
        png_path = Path(data.out_dir, f"{title}.png")
        plt.title(title)
        plt.savefig(str(png_path))
Ejemplo n.º 7
0
def main(config: str):
    """Tabulate every configured data file and write the results as CSVs."""
    configuration = parse_configuration(Path(config).expanduser())
    configuration.validate()
    configuration.output_folder.mkdir(exist_ok=True)
    data_files = list(
        configuration.data_folder.glob(configuration.file_pattern))
    logging.info("Going over %s data files!", len(data_files))
    table = {}
    for data_file in data_files:
        process_file(data_file, table)
    with time_func("Writing CSVs"):
        write_csvs(configuration, table)
    logging.info("DONE :)")
Ejemplo n.º 8
0
def run_scenario(size: int, scale: float, neighborhood_size: int,
                 genome_size: int, genome_maker: GenomeMaker) -> Result:
    """Simulate a tree, reconstruct it from synteny distances, and compare.

    Grows a Yule tree of *size* leaves, fills genomes along it, computes the
    pairwise synteny distance matrix over the leaves, reconstructs a tree with
    Phylip's neighbor-joining, and returns a Result comparing the original and
    reconstructed trees (including false positive/negative edge estimators).
    """
    with time_func("Constructing the Yule tree"):
        res = YuleTreeGenerator(size=size, scale=scale,
                                seed=genome_maker.seed).construct()
    with time_func("Get branch statistics"):
        branch_stats = res.root.branch_len_stats()
    logging.info("Branch count: %s avg: %s median: %s expected: %s",
                 branch_stats.count, branch_stats.average, branch_stats.median,
                 scale)
    total_jumped = []
    with time_func(f"Filling genome, size: {genome_size}"):
        fill_genome(res.root,
                    genome_size=genome_size,
                    maker=genome_maker,
                    total_jumped=total_jumped)
    assert len(res.leaves) == size

    leaves_matrix = {}

    def fill_leaves_matrix():
        # Cartesian product of leaves, grouped by row index.
        for row, l1 in enumerate(res.leaves):
            for l2 in res.leaves:
                leaves_matrix.setdefault(row, []).append((l1, l2))

    with time_func("Filling leaves matrix"):
        fill_leaves_matrix()
    distance_matrix = {}

    def fill_distance_matrix():
        # Memoize by unordered name pair so each distance is computed once.
        calculated = {}
        durations = []
        for leave_vector in leaves_matrix.values():
            for l1, l2 in leave_vector:
                key = tuple(sorted([l1.name, l2.name]))
                if key in calculated:
                    distance = calculated[key]
                else:
                    if l1.name == l2.name:  # Small optimization
                        distance = 0
                    else:
                        time_before_call = time.monotonic()
                        distance = calculate_synteny_distance(
                            l1.genome, l2.genome, neighborhood_size)
                        durations.append(time.monotonic() - time_before_call)
                    calculated[key] = distance
                distance_matrix.setdefault(l1.name, []).append(distance)
        # BUG FIX: guard the stats log — with no off-diagonal pairs (e.g. a
        # single-leaf tree) `durations` is empty and the original raised
        # ValueError on max() / ZeroDivisionError on the average. Also avoid
        # shadowing the `size` parameter.
        if durations:
            total = sum(durations)
            count = len(durations)
            logging.info(
                "Number of calculations: %s avg duration: %s max duration: %s total duration: %s",
                count, total / count, max(durations), total)

    with time_func("Filling distance matrix"):
        fill_distance_matrix()

    constructor = PhylipNeighborConstructor()
    with time_func("Runing Phylip Neighbor constructor"):
        orig, constructed = constructor.construct(res.root, distance_matrix)
    distance_calc = PhylipTreeDistCalculator()
    with time_func("Runing Phylip TreeDist"):
        distance_res = distance_calc.calc(orig, constructed)
    logging.debug("Original tree: ")
    logging.debug(orig)
    logging.debug("Constructed tree:")
    logging.debug(constructed)
    logging.debug("TreeDist result:")
    logging.debug(distance_res)
    # Internal branch counts from the Newick strings (root parenthesis excluded).
    internal_branches_orig = orig.count(')') - 1
    internal_branches_constructed = constructed.count(')') - 1
    distance_without_len = distance_calc.calc(orig, constructed, False)
    # Topological distance must be a whole number; coerce to int for Result.
    assert distance_without_len // 1 == distance_without_len
    distance_without_len = int(distance_without_len)
    logging.debug("Distance without len: %s", distance_without_len)
    common_edges = ((internal_branches_orig + internal_branches_constructed) -
                    distance_without_len) / 2
    if internal_branches_orig == 0:
        fp = 1
    else:
        fp = (internal_branches_orig - common_edges) / internal_branches_orig
    if internal_branches_constructed == 0:
        fn = 1
    else:
        fn = (internal_branches_constructed -
              common_edges) / internal_branches_constructed
    logging.debug("False positive estimator: %s", fp)
    logging.debug("False negative estimator: %s", fn)
    model_tree = TreeDesc(orig, internal_branches_orig, branch_stats)
    constructed_res = NewickParser(constructed).parse()
    constructed_tree = TreeDesc(constructed, internal_branches_constructed,
                                constructed_res.root.branch_len_stats())
    return Result(model_tree, constructed_tree, genome_size, neighborhood_size,
                  scale, distance_without_len, fp, fn, distance_res)