Example #1
    def from_variants(cls,
                      variants,
                      n_individuals,
                      n_variants,
                      n_threads=10,
                      chunk_size=10000):
        # Allocate the full matrix up front; worker processes fill their
        # variant chunks in place via shared memory
        matrix = np.zeros((n_individuals, n_variants), dtype=np.uint8)
        matrix = cls(matrix)
        logging.info("Putting genotype matrix in shared memory")
        to_shared_memory(matrix, "genotype_matrix")

        logging.info("Getting variant chunks")
        variant_chunks = variants.get_chunks(chunk_size=chunk_size)

        pool = Pool(n_threads)

        i = 0
        for result in pool.imap(
                GenotypeMatrix.fill_shared_memory_matrix_with_variants,
                variant_chunks):
            i += 1
            logging.info("Done with %d variant chunks" % i)

        logging.info("Done with all variant chunks")
        matrix = from_shared_memory(GenotypeMatrix, "genotype_matrix")
        return cls(matrix.matrix)
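
The worker passed to imap is not shown in this excerpt. Below is a minimal sketch of what it could look like, assuming each variant exposes an index into the matrix and a per-individual genotype vector (vcf_line_number and genotypes are hypothetical attribute names, and the shared_mem import path is taken from the last example on this page):

from graph_kmer_index.shared_mem import from_shared_memory

def fill_shared_memory_matrix_with_variants_sketch(variant_chunk):
    # Attach to the matrix published by from_variants and fill one
    # column per variant in place
    matrix = from_shared_memory(GenotypeMatrix, "genotype_matrix")
    for variant in variant_chunk:
        # vcf_line_number and genotypes are assumed attributes
        matrix.matrix[:, variant.vcf_line_number] = variant.genotypes
    return len(variant_chunk)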
Example #2
    def analyse(self, n_threads=10):
        n_variants = self.matrix.matrix.shape[1]
        n_individuals = self.matrix.matrix.shape[0]

        most_similar_lookup = np.zeros(n_variants, dtype=np.uint32)
        prob_same_genotype = np.zeros(n_variants, dtype=float)

        lookup = MostSimilarVariantLookup(most_similar_lookup,
                                          prob_same_genotype)
        to_shared_memory(self.matrix, "genotype_matrix")
        to_shared_memory(lookup, "most_similar_variant_lookup")

        intervals = [int(i) for i in np.linspace(0, n_variants, n_threads + 1)]
        variant_intervals = [
            (from_id, to_id)
            for from_id, to_id in zip(intervals[0:-1], intervals[1:])
        ]
        logging.info("Will analyse intervals: %s" % variant_intervals)

        pool = Pool(n_threads)

        for result in pool.imap(
                GenotypeMatrixAnalyser.analyse_variants_on_shared_memory,
                variant_intervals):
            logging.info("Done with one job")

        lookup = from_shared_memory(MostSimilarVariantLookup,
                                    "most_similar_variant_lookup")

        return lookup
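
A note on the interval splitting above: np.linspace must be given n_threads + 1 boundary points to produce n_threads intervals; with only n_threads points, one worker gets no job. A small self-contained helper (a sketch, not part of the library) makes the pattern explicit:

import numpy as np

def split_intervals(n_items, n_chunks):
    # n_chunks + 1 boundaries yield n_chunks contiguous intervals
    boundaries = [int(b) for b in np.linspace(0, n_items, n_chunks + 1)]
    return list(zip(boundaries[:-1], boundaries[1:]))

# split_intervals(10, 3) -> [(0, 3), (3, 6), (6, 10)]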
Example #3
    def set_numeric_node_sequences(args):
        graph = Graph.from_file(args.graph)
        to_shared_memory(graph, "graph_shared")
        pool = Pool(args.n_threads)

        numeric_node_sequences = SingleSharedArray(
            np.zeros(len(graph.node_sequences), dtype=np.uint8))
        to_shared_memory(numeric_node_sequences, "numeric_node_sequences")

        intervals = [
            int(i)
            for i in np.linspace(0, len(graph.node_sequences),
                                 args.n_threads + 1)
        ]
        intervals = [
            (from_pos, to_pos)
            for from_pos, to_pos in zip(intervals[0:-1], intervals[1:])
        ]
        logging.info("Intervals: %s" % intervals)

        for from_pos, to_pos in pool.imap(
                get_numeric_node_sequence_single_thread, intervals):
            logging.info(
                "Done processing interval %d-%d. Inserting into full array" %
                (from_pos, to_pos))

        logging.info("Done with all intervals. Saving new graph")
        numeric_node_sequences = from_shared_memory(SingleSharedArray,
                                                    "numeric_node_sequences")
        graph.numeric_node_sequences = numeric_node_sequences.array
        graph.to_file(args.graph)
        logging.info("Saved to the same file %s" % args.graph)
Example #4
    def from_genotype_matrix(cls, genotype_matrix, n_threads=10):
        to_shared_memory(genotype_matrix,
                         "genotype_matrix_shared_for_frequencies")

        n_variants = genotype_matrix.matrix.shape[1]
        n_individuals = len(
            np.where(genotype_matrix.matrix[:, 0] != 0)[0]
        )  # rows can be zero for non-individuals, so every non-zero entry is an individual
        logging.info("Assumes there are %d individuals and %d variants" %
                     (n_individuals, n_variants))
        data = {
            1: np.zeros(n_variants, dtype=float),
            2: np.zeros(n_variants, dtype=float),
            3: np.zeros(n_variants, dtype=float)
        }
        genotype_frequencies = cls(data[1], data[2], data[3])
        to_shared_memory(genotype_frequencies, "genotype_frequencies_shared")

        intervals = [int(i) for i in np.linspace(0, n_variants, n_threads + 1)]
        variant_intervals = [
            (from_id, to_id)
            for from_id, to_id in zip(intervals[0:-1], intervals[1:])
        ]
        logging.info("Will analyse intervals: %s" % variant_intervals)

        pool = Pool(n_threads)

        for result in pool.imap(GenotypeFrequencies.create_using_shared_memory,
                                variant_intervals):
            logging.info("Done with one job")
        """
        for numeric_genotype, array in data.items():
            logging.info("Finding for genotype %d" % numeric_genotype)
            # the second index from np where gives the columns that have a hit, every column 1 time for each hit
            column_hits = np.where(genotype_matrix.matrix == numeric_genotype)[1]
            logging.info("Making frequencies")
            unique_columns, n_hits_per_column = np.unique(column_hits, return_counts=True)
            data[numeric_genotype][unique_columns] = n_hits_per_column / n_individuals
        """
        """
        # Less memory hungry, but slower
        for numeric_genotype, array in data.items():
            logging.info("Finding for genotype %d" % numeric_genotype)
            for variant_id in range(n_variants):
                if variant_id % 10000 == 0:
                    logging.info("%d variants processed" % variant_id)

                array[variant_id] = len(np.where(genotype_matrix.matrix[:,variant_id] == numeric_genotype)[0]) / n_individuals
        """
        return from_shared_memory(GenotypeFrequencies,
                                  "genotype_frequencies_shared")
Example #5
    def from_graph_and_variants(cls,
                                graph,
                                variants,
                                limit_to_n_haplotypes=10,
                                n_threads=10):
        # Flat structures used to make the index later
        flat_nodes = []
        flat_haplotypes = []

        pool = Pool(n_threads)
        shared_memory_graph_name = "graph_shared"
        to_shared_memory(graph, shared_memory_graph_name)

        for haplotypes, nodes in pool.starmap(
                HaplotypeToNodes._multiprocess_wrapper,
                zip(repeat(shared_memory_graph_name),
                    variants.get_chunks(chunk_size=1000),
                    repeat(limit_to_n_haplotypes))):
            logging.info("Done with 1 iteration")
            flat_haplotypes.extend(haplotypes)
            flat_nodes.extend(nodes)
            logging.info("Added nodes and haplotypes")

        logging.info("Done processing all variants")

        return cls.from_flat_haplotypes_and_nodes(flat_haplotypes, flat_nodes)
Example #6
def create_index(args):
    if args.graph_file_name is not None:
        graph = Graph.from_file(args.graph_file_name)
        to_shared_memory(graph, "graph_shared")

    if args.threads == 1:
        kmers = create_index_single_thread(args)
        kmers.to_file(args.out_file_name)
    else:
        logging.info("Making pool with %d workers" % args.threads)
        pool = Pool(args.threads)
        genome_size = args.genome_size
        n_total_start_positions = genome_size // args.spacing
        n_positions_each_process = n_total_start_positions // args.threads
        logging.info(
            "Using genome size %d. Will process %d genome positions in each process."
            % (genome_size, n_positions_each_process))
        intervals = []
        for i in range(args.threads):
            start_position = n_positions_each_process * i * args.spacing
            end_position = n_positions_each_process * (i + 1) * args.spacing
            intervals.append((start_position, end_position))
            logging.info("Creating interval for genome segment %d-%d" %
                         (start_position, end_position))

        all_hashes = []
        all_nodes = []
        all_ref_offsets = []
        all_allele_frequencies = []
        for flat_kmers in pool.starmap(create_index_single_thread,
                                       zip(repeat(args), intervals)):
            all_hashes.append(flat_kmers._hashes)
            all_nodes.append(flat_kmers._nodes)
            all_ref_offsets.append(flat_kmers._ref_offsets)
            all_allele_frequencies.append(flat_kmers._allele_frequencies)

        logging.info("Making full index from all indexes")
        full_index = FlatKmers(np.concatenate(all_hashes),
                               np.concatenate(all_nodes),
                               np.concatenate(all_ref_offsets),
                               np.concatenate(all_allele_frequencies))

        logging.info("Saving full index")
        full_index.to_file(args.out_file_name)
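
One caveat with the interval arithmetic above: the two floor divisions can leave a tail of the genome uncovered by the last interval. A defensive standalone variant (a sketch, not the tool's actual behaviour) pins the final end position to the genome size:

def genome_intervals(genome_size, spacing, threads):
    n_positions_each = (genome_size // spacing) // threads
    intervals = []
    for i in range(threads):
        start = n_positions_each * i * spacing
        end = n_positions_each * (i + 1) * spacing
        if i == threads - 1:
            end = genome_size  # cover the tail lost to integer division
        intervals.append((start, end))
    return intervals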
Example #7
    def make_unique_variant_kmers(args):
        logging.info("Reading kmer index")
        kmer_index = CollisionFreeKmerIndex.from_file(args.kmer_index)
        to_shared_memory(kmer_index, "kmer_index_shared")
        logging.info("Reading variant to nodes")
        variant_to_nodes = VariantToNodes.from_file(args.variant_to_nodes)
        to_shared_memory(variant_to_nodes, "variant_to_nodes_shared")
        logging.info("REading graph")
        graph = Graph.from_file(args.graph)
        to_shared_memory(graph, "graph_shared")
        logging.info("Reading all variants")
        variants = VcfVariants.from_vcf(args.vcf,
                                        skip_index=True,
                                        make_generator=True)
        variants = variants.get_chunks(chunk_size=args.chunk_size)
        pool = Pool(args.n_threads)

        all_flat_kmers = []
        for flat_kmers in pool.starmap(make_unique_variant_kmers_single_thread,
                                       zip(variants, repeat(args))):
            all_flat_kmers.append(flat_kmers)

        logging.info("Merge all flat kmers")
        merged_flat = FlatKmers.from_multiple_flat_kmers(all_flat_kmers)
        merged_flat.to_file(args.out_file_name)
        logging.info("Wrote to file %s" % args.out_file_name)
Example #8
from graph_kmer_index.shared_mem import to_shared_memory, from_shared_memory
from graph_kmer_index import KmerIndex

# Load an index from file and look up an example kmer hash
index = KmerIndex.from_file("testdata2_index.npz")
print(index.get(852840309094508953))

# Round-trip the index through shared memory; the copy read back
# should give the same lookup result as the original
to_shared_memory(index, "testindex")

new_index = from_shared_memory(KmerIndex, "testindex")
print(new_index.get(852840309094508953))
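
For reference, helpers like to_shared_memory / from_shared_memory can be built on Python's multiprocessing.shared_memory. The following is a minimal sketch under that assumption, not the actual graph_kmer_index.shared_mem implementation, which also has to handle object metadata such as shapes and dtypes:

import numpy as np
from multiprocessing import shared_memory

_handles = {}  # keep SharedMemory objects alive while arrays are in use

def to_shared_memory_sketch(array, name):
    shm = shared_memory.SharedMemory(name=name, create=True, size=array.nbytes)
    view = np.ndarray(array.shape, dtype=array.dtype, buffer=shm.buf)
    view[:] = array  # copy the data into the shared buffer
    _handles[name] = shm

def from_shared_memory_sketch(name, shape, dtype):
    shm = shared_memory.SharedMemory(name=name)
    _handles[name] = shm
    return np.ndarray(shape, dtype=dtype, buffer=shm.buf)

# close()/unlink() cleanup is omitted for brevity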