コード例 #1
0
ファイル: genotype_matrix.py プロジェクト: ivargr/obgraph
    def create_using_shared_memory(variant_interval):
        """Fill in genotype frequencies for one interval of variants.

        Reads the genotype matrix stored in shared memory under
        "genotype_matrix_shared_for_frequencies" and writes into the frequency
        arrays stored under "genotype_frequencies_shared". Nothing is returned;
        the result is the side effect on the shared arrays.

        Args:
            variant_interval: ``(from_variant, to_variant)`` pair. Variant ids
                in ``range(from_variant, to_variant)`` are processed.
        """
        from_variant, to_variant = variant_interval
        logging.info("Creating on interval %d-%d" % (from_variant, to_variant))

        genotype_matrix = from_shared_memory(
            GenotypeMatrix, "genotype_matrix_shared_for_frequencies")
        genotype_frequencies = from_shared_memory(
            GenotypeFrequencies, "genotype_frequencies_shared")

        matrix = genotype_matrix.matrix
        # Rows can be zero for non-individuals, so every non-zero entry in the
        # first column is counted as one individual.
        n_individuals = int(np.count_nonzero(matrix[:, 0]))

        # Numeric genotypes 1, 2 and 3 are stored in homo_ref, homo_alt and
        # hetero respectively.
        targets = (
            (1, genotype_frequencies.homo_ref),
            (2, genotype_frequencies.homo_alt),
            (3, genotype_frequencies.hetero),
        )

        # One column at a time: less memory hungry, but slower, than comparing
        # the whole matrix in a single expression.
        for numeric_genotype, array in targets:
            logging.info("Finding for genotype %d" % numeric_genotype)
            prev_time = time.time()
            for variant_id in range(from_variant, to_variant):
                if variant_id % 100000 == 0:
                    logging.info(
                        "%d/%d variants processed (genotype now is %d). Prev 100k processed in %.3f s"
                        %
                        (variant_id - from_variant, to_variant - from_variant,
                         numeric_genotype, time.time() - prev_time))
                    prev_time = time.time()

                n_with_genotype = np.count_nonzero(
                    matrix[:, variant_id] == numeric_genotype)
                array[variant_id] = n_with_genotype / n_individuals
コード例 #2
0
def get_numeric_node_sequence_single_thread(interval):
    """Convert one slice of the shared graph's node sequences to numeric form.

    The graph is read from shared memory under "graph_shared" and the converted
    slice is written into the shared array "numeric_node_sequences" at the same
    positions.

    Args:
        interval: ``(from_pos, to_pos)`` pair delimiting the slice.

    Returns:
        The same ``(from_pos, to_pos)`` pair, so the caller can tell which
        interval finished.
    """
    from_pos, to_pos = interval
    started = time.time()

    shared_graph = from_shared_memory(Graph, "graph_shared")
    target = from_shared_memory(SingleSharedArray, "numeric_node_sequences")

    window = slice(from_pos, to_pos)
    target.array[window] = np_letter_sequence_to_numeric(
        shared_graph.node_sequences[window])

    elapsed = time.time() - started
    logging.info("Spent %.3f s on interval" % elapsed)
    return from_pos, to_pos
コード例 #3
0
ファイル: genotype_matrix.py プロジェクト: ivargr/obgraph
    def from_variants(cls,
                      variants,
                      n_individuals,
                      n_variants,
                      n_threads=10,
                      chunk_size=10000):
        """Build a genotype matrix from variants using a pool of workers.

        A zero-filled ``uint8`` matrix of shape ``(n_individuals, n_variants)``
        is placed in shared memory under "genotype_matrix". Each chunk of
        variants is then handed to
        ``GenotypeMatrix.fill_shared_memory_matrix_with_variants`` in a worker.

        Args:
            variants: Object providing ``get_chunks(chunk_size=...)``.
            n_individuals: Number of rows in the matrix.
            n_variants: Number of columns in the matrix.
            n_threads: Number of pool workers.
            chunk_size: Passed on to ``variants.get_chunks``.

        Returns:
            A new ``cls`` instance wrapping the matrix read back from shared
            memory after all chunks are processed.
        """
        shared_matrix = cls(
            np.zeros((n_individuals, n_variants), dtype=np.uint8))
        logging.info("Putting genotype matrix in shared memory")
        to_shared_memory(shared_matrix, "genotype_matrix")

        logging.info("Getting variant chunks")
        variant_chunks = variants.get_chunks(chunk_size=chunk_size)

        # The context manager shuts the workers down when the block exits, also
        # when a chunk raises; previously the pool was never closed.
        with Pool(n_threads) as pool:
            finished_chunks = pool.imap(
                GenotypeMatrix.fill_shared_memory_matrix_with_variants,
                variant_chunks)
            for n_done, _ in enumerate(finished_chunks, start=1):
                logging.info("Done with %d variant chunks" % n_done)

        logging.info("Done with all variant chunks")
        matrix = from_shared_memory(GenotypeMatrix, "genotype_matrix")
        return cls(matrix.matrix)
コード例 #4
0
ファイル: genotype_matrix.py プロジェクト: ivargr/obgraph
    def analyse(self, n_threads=10):
        """Compute the most-similar-variant lookup with a pool of workers.

        The genotype matrix (``self.matrix``) and an empty
        ``MostSimilarVariantLookup`` are placed in shared memory. The variant
        range is split into ``n_threads`` intervals, and each interval is
        handed to ``GenotypeMatrixAnalyser.analyse_variants_on_shared_memody``.

        Args:
            n_threads: Number of pool workers and number of intervals.

        Returns:
            The ``MostSimilarVariantLookup`` read back from shared memory after
            all intervals are processed.
        """
        n_variants = self.matrix.matrix.shape[1]

        most_similar_lookup = np.zeros(n_variants, dtype=np.uint32)
        # The builtin float replaces the np.float alias, which newer NumPy
        # releases no longer provide.
        prob_same_genotype = np.zeros(n_variants, dtype=float)

        lookup = MostSimilarVariantLookup(most_similar_lookup,
                                          prob_same_genotype)
        to_shared_memory(self.matrix, "genotype_matrix")
        to_shared_memory(lookup, "most_similar_variant_lookup")

        # n_threads + 1 boundaries give exactly n_threads intervals. With only
        # n_threads boundaries, n_threads=1 would produce no interval at all.
        boundaries = [
            int(i) for i in np.linspace(0, n_variants, n_threads + 1)
        ]
        variant_intervals = list(zip(boundaries[:-1], boundaries[1:]))
        logging.info("Will analyse intervals: %s" % variant_intervals)

        # The context manager shuts the workers down when the block exits.
        with Pool(n_threads) as pool:
            for _ in pool.imap(
                    GenotypeMatrixAnalyser.analyse_variants_on_shared_memody,
                    variant_intervals):
                logging.info("Done with one job")

        return from_shared_memory(MostSimilarVariantLookup,
                                  "most_similar_variant_lookup")
コード例 #5
0
    def set_numeric_node_sequences(args):
        """Compute numeric node sequences for a graph and save the graph.

        The graph is loaded from ``args.graph`` and shared under
        "graph_shared". A zero-filled ``uint8`` array of the same length as
        ``graph.node_sequences`` is shared under "numeric_node_sequences".
        ``args.n_threads`` workers each fill one interval of it via
        ``get_numeric_node_sequence_single_thread``. The result is attached to
        the graph as ``numeric_node_sequences`` and the graph is written back
        to ``args.graph``.

        Args:
            args: Object with ``graph`` (file name) and ``n_threads``.
        """
        graph = Graph.from_file(args.graph)
        to_shared_memory(graph, "graph_shared")

        sequence_length = len(graph.node_sequences)
        numeric_node_sequences = SingleSharedArray(
            np.zeros(sequence_length, dtype=np.uint8))
        to_shared_memory(numeric_node_sequences, "numeric_node_sequences")

        # n_threads + 1 boundaries give n_threads consecutive intervals.
        boundaries = [
            int(i)
            for i in np.linspace(0, sequence_length, args.n_threads + 1)
        ]
        intervals = list(zip(boundaries[:-1], boundaries[1:]))
        logging.info("Intervals: %s" % intervals)

        # The context manager shuts the workers down when the block exits;
        # previously the pool was never closed.
        with Pool(args.n_threads) as pool:
            for from_pos, to_pos in pool.imap(
                    get_numeric_node_sequence_single_thread, intervals):
                logging.info(
                    "Done processing interval %d-%d. Inserting into full array" %
                    (from_pos, to_pos))

        logging.info("Done with all intervals. Saving new graph")
        numeric_node_sequences = from_shared_memory(SingleSharedArray,
                                                    "numeric_node_sequences")
        graph.numeric_node_sequences = numeric_node_sequences.array
        graph.to_file(args.graph)
        logging.info("Saved to the same file %s" % args.graph)
コード例 #6
0
 def make_unique_variant_kmers_single_thread(variants, args):
     """Find unique kmers for the given variants using shared-memory inputs.

     The variant-to-nodes mapping, the kmer index and the graph are read from
     shared memory under "variant_to_nodes_shared", "kmer_index_shared" and
     "graph_shared".

     Args:
         variants: Variants passed on to ``UniqueVariantKmersFinder``.
         args: Object with ``kmer_size`` and ``max_variant_nodes``.

     Returns:
         Whatever ``UniqueVariantKmersFinder.find_unique_kmers()`` returns.
     """
     node_lookup = from_shared_memory(VariantToNodes,
                                      "variant_to_nodes_shared")
     frequency_index = from_shared_memory(CollisionFreeKmerIndex,
                                          "kmer_index_shared")
     shared_graph = from_shared_memory(Graph, "graph_shared")

     logging.info("Reading all variants")
     return UniqueVariantKmersFinder(
         shared_graph,
         node_lookup,
         variants,
         args.kmer_size,
         args.max_variant_nodes,
         kmer_index_with_frequencies=frequency_index,
     ).find_unique_kmers()
コード例 #7
0
ファイル: genotype_matrix.py プロジェクト: ivargr/obgraph
    def fill_shared_memory_matrix_with_variants(variants):
        """Write the genotypes of the given variants into the shared matrix.

        The matrix is read from shared memory under "genotype_matrix". For
        every variant, the column given by its ``vcf_line_number`` is filled
        with the numeric genotype of each individual the variant reports.

        Args:
            variants: Iterable of variants providing ``vcf_line_number`` and
                ``get_individuals_and_numeric_genotypes()``.
        """
        shared = from_shared_memory(GenotypeMatrix, "genotype_matrix")

        for variant in variants:
            column = variant.vcf_line_number
            if column % 10000 == 0:
                logging.info("%d variants processeed" % column)

            genotypes = variant.get_individuals_and_numeric_genotypes()
            for row, numeric_genotype in genotypes:
                shared.matrix[row, column] = numeric_genotype
コード例 #8
0
ファイル: genotype_matrix.py プロジェクト: ivargr/obgraph
    def from_genotype_matrix(cls, genotype_matrix, n_threads=10):
        """Compute genotype frequencies from a genotype matrix in parallel.

        The matrix is shared under "genotype_matrix_shared_for_frequencies",
        and a ``cls`` instance holding three zero-filled float arrays (one
        entry per variant) is shared under "genotype_frequencies_shared". The
        variant range is split into ``n_threads`` intervals, each handed to
        ``GenotypeFrequencies.create_using_shared_memory``.

        Args:
            genotype_matrix: Object whose ``matrix`` attribute is a 2-D array
                with one column per variant.
            n_threads: Number of pool workers and number of intervals.

        Returns:
            The ``GenotypeFrequencies`` read back from shared memory after all
            intervals are processed.
        """
        to_shared_memory(genotype_matrix,
                         "genotype_matrix_shared_for_frequencies")

        n_variants = genotype_matrix.matrix.shape[1]
        # Rows can be zero for non-individuals, so every non-zero entry in the
        # first column is counted as one individual.
        n_individuals = int(np.count_nonzero(genotype_matrix.matrix[:, 0]))
        logging.info("Assumes there are %d individuals and %d variants" %
                     (n_individuals, n_variants))

        genotype_frequencies = cls(
            np.zeros(n_variants, dtype=float),
            np.zeros(n_variants, dtype=float),
            np.zeros(n_variants, dtype=float))
        to_shared_memory(genotype_frequencies, "genotype_frequencies_shared")

        # n_threads + 1 boundaries give exactly n_threads intervals. With only
        # n_threads boundaries, n_threads=1 would produce no interval at all.
        boundaries = [
            int(i) for i in np.linspace(0, n_variants, n_threads + 1)
        ]
        variant_intervals = list(zip(boundaries[:-1], boundaries[1:]))
        logging.info("Will analyse intervals: %s" % variant_intervals)

        # The context manager shuts the workers down when the block exits;
        # previously the pool was never closed.
        with Pool(n_threads) as pool:
            for _ in pool.imap(GenotypeFrequencies.create_using_shared_memory,
                               variant_intervals):
                logging.info("Done with one job")

        return from_shared_memory(GenotypeFrequencies,
                                  "genotype_frequencies_shared")
コード例 #9
0
ファイル: genotype_matrix.py プロジェクト: ivargr/obgraph
    def analyse_variants_on_shared_memody(variant_interval):
        """Fill the shared most-similar-variant lookup for one interval.

        The genotype matrix and the lookup are read from shared memory under
        "genotype_matrix" and "most_similar_variant_lookup". For every variant
        id in the interval, the most similar previous variant is stored in
        ``lookup_array``. Its score divided by the number of matrix rows is
        stored in ``prob_same_genotype``.

        Args:
            variant_interval: ``(from_id, to_id)`` pair. An interval starting
                at 0 is processed from 1 instead.
        """
        from_id, to_id = variant_interval
        first_id = 1 if from_id == 0 else from_id
        logging.info("Analysing variant %d to %d in one job" %
                     (first_id, to_id))

        genotype_matrix = from_shared_memory(GenotypeMatrix, "genotype_matrix")
        similarity = from_shared_memory(MostSimilarVariantLookup,
                                        "most_similar_variant_lookup")
        n_individuals = genotype_matrix.matrix.shape[0]

        last_report = time.time()
        for variant_id in range(first_id, to_id):
            n_done = variant_id - first_id
            # Progress report every 5000 variants, except before the first one
            if n_done > 0 and n_done % 5000 == 0:
                logging.info(
                    "%d/%d variants analysed (last 5k analyse in %.3f s)" %
                    (n_done, to_id - first_id, time.time() - last_report))
                last_report = time.time()

            best_match, score = genotype_matrix.get_most_similar_previous_variant(
                variant_id)
            similarity.lookup_array[variant_id] = best_match
            similarity.prob_same_genotype[variant_id] = score / n_individuals
コード例 #10
0
def create_index_single_thread(args, interval=None):
    """Run ``SnpKmerFinder`` on a graph or a reference and return its kmers.

    When ``args.graph_file_name`` is set, the graph is read from shared memory
    under "graph_shared" and no reference is used. Otherwise the sequence named
    ``args.reference_name`` is read from the fasta file
    ``args.reference_fasta`` and no graph is used.

    Args:
        args: Object carrying the finder options read below, plus optional
            ``whitelist`` and ``skip_kmers_with_nodes`` file names.
        interval: Optional sequence whose first two items are passed to the
            finder as start and end position; both are ``None`` when omitted.

    Returns:
        Whatever ``SnpKmerFinder.find_kmers()`` returns.
    """
    if interval is None:
        start_position = end_position = None
    else:
        start_position, end_position = interval[0], interval[1]

    logging.info("Loading data")
    if args.graph_file_name is None:
        graph = None
        assert args.reference_fasta is not None
        assert args.reference_name is not None, "Reference name must be specified"
        reference = Fasta(args.reference_fasta)[args.reference_name]
    else:
        graph = from_shared_memory(Graph, "graph_shared")
        reference = None

    logging.info("Running kmerfinder")
    if args.whitelist is None:
        whitelist = None
    else:
        whitelist = set(FlatKmers.from_file(args.whitelist)._hashes)

    if args.skip_kmers_with_nodes is None:
        skip_kmers_with_nodes = None
    else:
        skip_kmers_with_nodes = set(
            FlatKmers.from_file(args.skip_kmers_with_nodes)._nodes)

    finder_options = {
        "k": args.kmer_size,
        "spacing": args.spacing,
        "include_reverse_complements": args.include_reverse_complement,
        "pruning": args.pruning,
        "max_kmers_same_position": args.max_kmers_same_position,
        "max_frequency": args.max_frequency,
        "max_variant_nodes": args.max_variant_nodes,
        "only_add_variant_kmers": args.only_add_variant_kmers,
        "whitelist": whitelist,
        "only_save_variant_nodes": args.only_save_variant_nodes,
        "start_position": start_position,
        "end_position": end_position,
        "skip_kmers_with_nodes": skip_kmers_with_nodes,
        "only_save_one_node_per_kmer": args.only_save_one_node_per_kmer,
        "reference": reference,
    }
    return SnpKmerFinder(graph, **finder_options).find_kmers()
コード例 #11
0
ファイル: haplotype_nodes.py プロジェクト: ivargr/obgraph
 def _multiprocess_wrapper(shared_memory_graph_name, variants, limit_to_n_haplotypes=10):
     """Load the graph from shared memory and compute flat haplotypes and nodes.

     Args:
         shared_memory_graph_name: Shared-memory name the graph is stored under.
         variants: Variants passed on to ``HaplotypeToNodes``.
         limit_to_n_haplotypes: Passed on unchanged.

     Returns:
         Whatever
         ``HaplotypeToNodes.get_flat_haplotypes_and_nodes_from_graph_and_variants``
         returns.
     """
     shared_graph = from_shared_memory(Graph, shared_memory_graph_name)
     build = HaplotypeToNodes.get_flat_haplotypes_and_nodes_from_graph_and_variants
     return build(shared_graph, variants, limit_to_n_haplotypes)
コード例 #12
0
from graph_kmer_index.shared_mem import to_shared_memory, from_shared_memory
from graph_kmer_index import KmerIndex

# Round-trip a KmerIndex through shared memory and print the same lookup
# before and after, so the two printed values can be compared by eye.
QUERY_KEY = 852840309094508953

index = KmerIndex.from_file("testdata2_index.npz")
print(index.get(QUERY_KEY))

to_shared_memory(index, "testindex")

new_index = from_shared_memory(KmerIndex, "testindex")
print(new_index.get(QUERY_KEY))