def from_variants(cls, variants, n_individuals, n_variants, n_threads=10, chunk_size=10000):
    """Create a genotype matrix from variants using a pool of worker processes.

    A zero-initialised ``(n_individuals, n_variants)`` uint8 matrix is wrapped
    in ``cls`` and published to shared memory under the name
    ``"genotype_matrix"``. Each chunk returned by ``variants.get_chunks`` is
    then handed to ``GenotypeMatrix.fill_shared_memory_matrix_with_variants``
    in a worker process (presumably the workers write into the shared matrix;
    their return values are ignored here).

    Args:
        variants: Object exposing ``get_chunks(chunk_size=...)``.
        n_individuals: Number of rows in the matrix.
        n_variants: Number of columns in the matrix.
        n_threads: Number of worker processes in the pool.
        chunk_size: Forwarded to ``variants.get_chunks``.

    Returns:
        A new ``cls`` instance wrapping the matrix read back from shared
        memory after all chunks have been processed.
    """
    matrix = cls(np.zeros((n_individuals, n_variants), dtype=np.uint8))

    logging.info("Putting genotype matrix in shared memory")
    to_shared_memory(matrix, "genotype_matrix")

    logging.info("Getting variant chunks")
    variant_chunks = variants.get_chunks(chunk_size=chunk_size)

    # The context manager guarantees the worker processes are cleaned up,
    # also when a worker raises. All results are consumed inside the block,
    # so no job is still running when the pool is torn down.
    with Pool(n_threads) as pool:
        results = pool.imap(
            GenotypeMatrix.fill_shared_memory_matrix_with_variants,
            variant_chunks)
        for n_done, _ in enumerate(results, start=1):
            logging.info("Done with %d variant chunks", n_done)

    logging.info("Done with all variant chunks")
    matrix = from_shared_memory(GenotypeMatrix, "genotype_matrix")
    return cls(matrix.matrix)
def analyse(self, n_threads=10):
    """Build a most-similar-variant lookup using a pool of worker processes.

    The genotype matrix (``self.matrix``) and a zero-initialised
    ``MostSimilarVariantLookup`` are published to shared memory. The variant
    range ``[0, n_variants)`` is split into ``n_threads`` contiguous
    intervals, and each interval is passed to
    ``GenotypeMatrixAnalyser.analyse_variants_on_shared_memody`` in a worker
    process (presumably the workers fill in the shared lookup; their return
    values are ignored here).

    Args:
        n_threads: Number of worker processes and number of intervals.

    Returns:
        The ``MostSimilarVariantLookup`` read back from shared memory.
    """
    n_variants = self.matrix.matrix.shape[1]

    # ``np.float`` was removed from NumPy; the builtin ``float`` is the same
    # dtype (float64) it used to alias.
    lookup = MostSimilarVariantLookup(
        np.zeros(n_variants, dtype=np.uint32),
        np.zeros(n_variants, dtype=float),
    )
    to_shared_memory(self.matrix, "genotype_matrix")
    to_shared_memory(lookup, "most_similar_variant_lookup")

    # n_threads intervals need n_threads + 1 boundaries. With only n_threads
    # boundaries, n_threads=1 would yield no interval at all and no variant
    # would be analysed.
    boundaries = [int(i) for i in np.linspace(0, n_variants, n_threads + 1)]
    variant_intervals = list(zip(boundaries[:-1], boundaries[1:]))
    logging.info("Will analyse intervals: %s", variant_intervals)

    # The context manager cleans up the workers even if a job raises; all
    # results are consumed before the block exits.
    with Pool(n_threads) as pool:
        for _ in pool.imap(
                GenotypeMatrixAnalyser.analyse_variants_on_shared_memody,
                variant_intervals):
            logging.info("Done with one job")

    return from_shared_memory(MostSimilarVariantLookup,
                              "most_similar_variant_lookup")
def set_numeric_node_sequences(args):
    """Compute ``numeric_node_sequences`` for a graph and save it in place.

    The graph is read from ``args.graph`` and published to shared memory,
    together with a zero-initialised uint8 array with one entry per element
    of ``graph.node_sequences``. The array range is split into
    ``args.n_threads`` contiguous intervals and each is passed to
    ``get_numeric_node_sequence_single_thread`` in a worker process
    (presumably the workers write into the shared array). The array is then
    read back, attached to the graph as ``numeric_node_sequences`` and the
    graph is written back to the same file.

    Args:
        args: Namespace with ``graph`` (file name) and ``n_threads``.
    """
    graph = Graph.from_file(args.graph)
    to_shared_memory(graph, "graph_shared")

    sequence_length = len(graph.node_sequences)
    numeric_node_sequences = SingleSharedArray(
        np.zeros(sequence_length, dtype=np.uint8))
    to_shared_memory(numeric_node_sequences, "numeric_node_sequences")

    # n_threads intervals are delimited by n_threads + 1 boundaries.
    boundaries = [
        int(i) for i in np.linspace(0, sequence_length, args.n_threads + 1)
    ]
    intervals = list(zip(boundaries[:-1], boundaries[1:]))
    logging.info("Intervals: %s", intervals)

    # The context manager cleans up the workers even if a job raises; all
    # results are consumed before the block exits.
    with Pool(args.n_threads) as pool:
        for from_pos, to_pos in pool.imap(
                get_numeric_node_sequence_single_thread, intervals):
            logging.info(
                "Done processing interval %d-%d. Inserting into full array",
                from_pos, to_pos)

    logging.info("Done with all intervals. Saving new graph")
    numeric_node_sequences = from_shared_memory(SingleSharedArray,
                                                "numeric_node_sequences")
    graph.numeric_node_sequences = numeric_node_sequences.array
    graph.to_file(args.graph)
    logging.info("Saved to the same file %s", args.graph)
def from_genotype_matrix(cls, genotype_matrix, n_threads=10):
    """Compute genotype frequencies from a genotype matrix in parallel.

    The genotype matrix and a ``cls`` instance holding three zero-initialised
    float arrays of length ``n_variants`` are published to shared memory. The
    variant range ``[0, n_variants)`` is split into ``n_threads`` contiguous
    intervals and each is passed to
    ``GenotypeFrequencies.create_using_shared_memory`` in a worker process
    (presumably the workers fill in the shared frequency arrays; their return
    values are ignored here).

    Args:
        genotype_matrix: Object whose ``matrix`` attribute is a 2-D array
            indexed as ``[individual, variant]``.
        n_threads: Number of worker processes and number of intervals.

    Returns:
        The ``GenotypeFrequencies`` object read back from shared memory.
    """
    to_shared_memory(genotype_matrix, "genotype_matrix_shared_for_frequencies")

    n_variants = genotype_matrix.matrix.shape[1]
    # Rows can be all-zero for non-individuals, so every non-zero entry in
    # the first column is counted as an individual. Only used for logging.
    n_individuals = int(np.count_nonzero(genotype_matrix.matrix[:, 0]))
    logging.info("Assumes there are %d individuals and %d variants",
                 n_individuals, n_variants)

    genotype_frequences = cls(
        np.zeros(n_variants, dtype=float),
        np.zeros(n_variants, dtype=float),
        np.zeros(n_variants, dtype=float),
    )
    to_shared_memory(genotype_frequences, "genotype_frequencies_shared")

    # n_threads intervals need n_threads + 1 boundaries. With only n_threads
    # boundaries, n_threads=1 would yield no interval at all and no variant
    # would be processed.
    boundaries = [int(i) for i in np.linspace(0, n_variants, n_threads + 1)]
    variant_intervals = list(zip(boundaries[:-1], boundaries[1:]))
    logging.info("Will analyse intervals: %s", variant_intervals)

    # The context manager cleans up the workers even if a job raises; all
    # results are consumed before the block exits.
    with Pool(n_threads) as pool:
        for _ in pool.imap(GenotypeFrequencies.create_using_shared_memory,
                           variant_intervals):
            logging.info("Done with one job")

    return from_shared_memory(GenotypeFrequencies,
                              "genotype_frequencies_shared")
def from_graph_and_variants(cls, graph, variants, limit_to_n_haplotypes=10, n_threads=10):
    """Build an instance from a graph and variants using worker processes.

    The graph is published to shared memory under the name
    ``"graph_shared"``. Variants are processed in chunks of 1000 by
    ``HaplotypeToNodes._multiprocess_wrapper``, which is called with the
    shared-memory name, one chunk, and ``limit_to_n_haplotypes``. Each call
    returns a ``(haplotypes, nodes)`` pair; the pairs are concatenated in
    chunk order and handed to ``cls.from_flat_haplotypes_and_nodes``.

    Args:
        graph: Graph object to publish to shared memory.
        variants: Object exposing ``get_chunks(chunk_size=...)``.
        limit_to_n_haplotypes: Forwarded unchanged to every worker call.
        n_threads: Number of worker processes in the pool.

    Returns:
        Whatever ``cls.from_flat_haplotypes_and_nodes`` returns.
    """
    # Flat structures used to make the index later
    flat_nodes = []
    flat_haplotypes = []

    shared_memory_graph_name = "graph_shared"
    to_shared_memory(graph, shared_memory_graph_name)

    jobs = zip(repeat(shared_memory_graph_name),
               variants.get_chunks(chunk_size=1000),
               repeat(limit_to_n_haplotypes))

    # starmap blocks until every job is done, so the pool can be torn down
    # as soon as it returns. The context manager also cleans up the workers
    # if a job raises.
    with Pool(n_threads) as pool:
        results = pool.starmap(HaplotypeToNodes._multiprocess_wrapper, jobs)

    for haplotypes, nodes in results:
        logging.info("Done with 1 iteration")
        flat_haplotypes.extend(haplotypes)
        flat_nodes.extend(nodes)
        logging.info("Added nodes and haplotypes")

    logging.info("Done processing all variants")
    return cls.from_flat_haplotypes_and_nodes(flat_haplotypes, flat_nodes)
def create_index(args):
    """Create a flat kmer index and write it to ``args.out_file_name``.

    If ``args.graph_file_name`` is given, the graph is loaded and published
    to shared memory as ``"graph_shared"``. With ``args.threads == 1`` the
    index is built by a single call to ``create_index_single_thread(args)``.
    Otherwise the genome is split into ``args.threads`` equally sized
    intervals, ``create_index_single_thread(args, interval)`` is run for each
    interval in a worker process, and the partial results are concatenated in
    interval order into one ``FlatKmers`` object.

    Args:
        args: Namespace with ``graph_file_name``, ``threads``,
            ``genome_size``, ``spacing`` and ``out_file_name``.
    """
    if args.graph_file_name is not None:
        graph = Graph.from_file(args.graph_file_name)
        to_shared_memory(graph, "graph_shared")

    if args.threads == 1:
        kmers = create_index_single_thread(args)
        kmers.to_file(args.out_file_name)
        return

    logging.info("Making pool with %d workers", args.threads)
    # The context manager cleans up the workers even if a job raises;
    # starmap blocks until every job has finished.
    with Pool(args.threads) as pool:
        genome_size = args.genome_size
        n_total_start_positions = genome_size // args.spacing
        n_positions_each_process = n_total_start_positions // args.threads
        logging.info(
            "Using genome size %d. Will process %d genome positions in each process.",
            genome_size, n_positions_each_process)

        # NOTE(review): both divisions above round down, so the last
        # interval can end before genome_size and the remaining tail is not
        # covered by any interval. Confirm whether that is intended.
        interval_length = n_positions_each_process * args.spacing
        intervals = []
        for i in range(args.threads):
            start_position = interval_length * i
            end_position = interval_length * (i + 1)
            intervals.append((start_position, end_position))
            logging.info("Creating interval for genome segment %d-%d",
                         start_position, end_position)

        partial_indexes = pool.starmap(create_index_single_thread,
                                       zip(repeat(args), intervals))

    logging.info("Making full index from all indexes")
    full_index = FlatKmers(
        np.concatenate([part._hashes for part in partial_indexes]),
        np.concatenate([part._nodes for part in partial_indexes]),
        np.concatenate([part._ref_offsets for part in partial_indexes]),
        np.concatenate([part._allele_frequencies for part in partial_indexes]),
    )

    logging.info("Saving full index")
    full_index.to_file(args.out_file_name)
def make_unique_variant_kmers(args):
    """Run ``make_unique_variant_kmers_single_thread`` over all variant chunks.

    The kmer index, the variant-to-nodes mapping and the graph are loaded
    from the files named in ``args`` and published to shared memory. Variants
    are read from ``args.vcf`` in chunks of ``args.chunk_size``, and
    ``make_unique_variant_kmers_single_thread(chunk, args)`` is run for every
    chunk in a worker process. The per-chunk results are merged with
    ``FlatKmers.from_multiple_flat_kmers`` and written to
    ``args.out_file_name``.

    Args:
        args: Namespace with ``kmer_index``, ``variant_to_nodes``, ``graph``,
            ``vcf``, ``chunk_size``, ``n_threads`` and ``out_file_name``.
    """
    logging.info("Reading kmer index")
    kmer_index = CollisionFreeKmerIndex.from_file(args.kmer_index)
    to_shared_memory(kmer_index, "kmer_index_shared")

    logging.info("Reading variant to nodes")
    variant_to_nodes = VariantToNodes.from_file(args.variant_to_nodes)
    to_shared_memory(variant_to_nodes, "variant_to_nodes_shared")

    logging.info("REading graph")
    graph = Graph.from_file(args.graph)
    to_shared_memory(graph, "graph_shared")

    logging.info("Reading all variants")
    variants = VcfVariants.from_vcf(args.vcf, skip_index=True,
                                    make_generator=True)
    variant_chunks = variants.get_chunks(chunk_size=args.chunk_size)

    # starmap already returns the results as a list in chunk order. The
    # context manager cleans up the workers even if a job raises.
    with Pool(args.n_threads) as pool:
        all_flat_kmers = pool.starmap(make_unique_variant_kmers_single_thread,
                                      zip(variant_chunks, repeat(args)))

    logging.info("Merge all flat kmers")
    merged_flat = FlatKmers.from_multiple_flat_kmers(all_flat_kmers)
    merged_flat.to_file(args.out_file_name)
    logging.info("Wrote to file %s", args.out_file_name)
from graph_kmer_index.shared_mem import (
    to_shared_memory,
    from_shared_memory,
)
from graph_kmer_index import KmerIndex

# Round-trip check: look up the same kmer hash in an index loaded from file
# and in the copy of that index read back from shared memory, printing both
# results so they can be compared.
INDEX_FILE_NAME = "testdata2_index.npz"
SHARED_MEMORY_NAME = "testindex"
KMER_HASH = 852840309094508953

index = KmerIndex.from_file(INDEX_FILE_NAME)
print(index.get(KMER_HASH))

to_shared_memory(index, SHARED_MEMORY_NAME)
new_index = from_shared_memory(KmerIndex, SHARED_MEMORY_NAME)
print(new_index.get(KMER_HASH))