def merge(self, config: hug.types.text, merge_config: hug.types.text):
    config = get_config_from_file(config)
    merge_config = get_config_from_file(merge_config)
    index1 = BIGSI(config)
    index2 = BIGSI(merge_config)
    merge(index1, index2)
    return {"result": "merged %s into %s." % (merge_config, config)}
def insert(self, db: hug.types.text, bloomfilter, sample):
    """Inserts a bloom filter into the graph
    e.g. bigsi insert ERR1010211.bloom ERR1010211
    """
    graph = BIGSI(db)
    return insert(graph=graph, bloomfilter=bloomfilter, sample=sample)
def bulk_search(
    self,
    fasta: hug.types.text,
    threshold: hug.types.float_number = 1.0,
    config: hug.types.text = None,
    score: hug.types.smart_boolean = False,
    format: hug.types.one_of(["json", "csv"]) = "json",
    stream: hug.types.smart_boolean = False,
):
    config = get_config_from_file(config)
    fasta = Fasta(fasta)
    if not stream:
        # Each worker gets a single-process copy of the config; the parent
        # uses the configured nproc to size the pool.
        _config = copy.copy(config)
        _config["nproc"] = 1
        nproc = config.get("nproc", 1)
        with multiprocessing.Pool(processes=nproc) as pool:
            args = [(_config, str(seq), threshold, score) for seq in fasta.values()]
            dd = pool.map_async(
                search_bigsi_parallel,
                chunks(args, math.ceil(len(args) / nproc)),
            ).get()
        dd = [item for sublist in dd for item in sublist]
        if format == "csv":
            return "\n".join([d_to_csv(d, False, False) for d in dd])
        else:
            return json.dumps(dd, indent=4)
    else:
        bigsi = BIGSI(config)
        csv_combined = ""
        for i, seq in enumerate(fasta.values()):
            seq = str(seq)
            d = {
                "query": seq,
                "threshold": threshold,
                "results": bigsi.search(seq, threshold, score),
                "citation": "http://dx.doi.org/10.1038/s41587-018-0010-1",
            }
            if format == "csv":
                # Header only on the first row; trailing newline on the last.
                if i == 0:
                    with_header = True
                    carriage_return = False
                elif i == len(fasta) - 1:
                    with_header = False
                    carriage_return = True
                else:
                    with_header = False
                    carriage_return = False
                csv_result = d_to_csv(d, with_header, carriage_return)
                csv_combined += csv_result
                if stream:
                    print(csv_result)
            else:
                if stream:
                    print(json.dumps(d))
def bulk_search(
    self,
    fasta: hug.types.text,
    threshold: hug.types.float_number = 1.0,
    config: hug.types.text = None,
    score: hug.types.smart_boolean = False,
    format: hug.types.one_of(["json", "csv"]) = "json",
    stream: hug.types.smart_boolean = False,
):
    config = get_config_from_file(config)
    bigsi = BIGSI(config)
    fasta = Fasta(fasta)
    if not stream:
        nproc = config.get("nproc", 1)
        with ThreadPool(processes=nproc) as pool:
            args = [(bigsi, str(seq), threshold, score) for seq in fasta.values()]
            dd = pool.starmap(search_bigsi, args)
        if format == "csv":
            return "\n".join([d_to_csv(d, False, False) for d in dd])
        else:
            return json.dumps(dd, indent=4)
    else:
        dd = []
        csv_combined = ""
        for i, seq in enumerate(fasta.values()):
            seq = str(seq)
            d = {
                "query": seq,
                "threshold": threshold,
                "results": bigsi.search(seq, threshold, score),
                "citation": "http://dx.doi.org/10.1038/s41587-018-0010-1",
            }
            dd.append(d)
            if format == "csv":
                # Header only on the first row; trailing newline on the last.
                if i == 0:
                    with_header = True
                    carriage_return = False
                elif i == len(fasta) - 1:
                    with_header = False
                    carriage_return = True
                else:
                    with_header = False
                    carriage_return = False
                csv_result = d_to_csv(d, with_header, carriage_return)
                csv_combined += csv_result
                if stream:
                    print(csv_result)
            else:
                if stream:
                    print(json.dumps(d))
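# A hedged usage sketch for the bulk_search variants above. The handle `api`
# (an instance of the CLI class these methods live on) and the paths
# "bigsi.yaml" / "queries.fasta" are illustrative assumptions, not names
# shipped with BIGSI.
out = api.bulk_search(
    fasta="queries.fasta",  # multi-sequence query file
    threshold=0.9,          # min fraction of query k-mers that must match
    config="bigsi.yaml",    # index config, parsed by get_config_from_file
    format="csv",           # one CSV row per query
)
print(out)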
def delete(self, db: hug.types.text = None):
    try:
        bigsi = BIGSI(db)
    except ValueError:
        pass
    else:
        return delete(bigsi)
def variant_search(
    self,
    reference: hug.types.text,
    ref: hug.types.text,
    pos: hug.types.number,
    alt: hug.types.text,
    gene: hug.types.text = None,
    genbank: hug.types.text = None,
    config: hug.types.text = None,
    format: hug.types.one_of(["json", "csv"]) = "json",
):
    config = get_config_from_file(config)
    bigsi = BIGSI(config)
    if genbank and gene:
        d = BIGSIAminoAcidMutationSearch(bigsi, reference, genbank).search(
            gene, ref, pos, alt
        )
    elif genbank or gene:
        raise ValueError("genbank and gene must be supplied together")
    else:
        d = BIGSIVariantSearch(bigsi, reference).search(ref, pos, alt)
    d["citation"] = "http://dx.doi.org/10.1038/s41587-018-0010-1"
    if format == "csv":
        return d_to_csv(d)
    else:
        return json.dumps(d, indent=4)
def insert(self, config: hug.types.text, bloomfilter, sample):
    """Inserts a bloom filter into the graph
    e.g. bigsi insert ERR1010211.bloom ERR1010211
    """
    config = get_config_from_file(config)
    index = BIGSI(config)
    return insert(index=index, bloomfilter=bloomfilter, sample=sample)
def bloom(config, outfile, kmers):
    outfile = os.path.realpath(outfile)
    bloomfilter = BIGSI.bloom(config, kmers)
    off = bloom_file_name(outfile)
    directory = os.path.dirname(off)
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(off, "wb") as of:
        bloomfilter.tofile(of)
def search(
    self,
    seq: hug.types.text,
    threshold: hug.types.float_number = 1.0,
    config: hug.types.text = None,
    score: hug.types.smart_boolean = False,
    format: hug.types.one_of(["json", "csv"]) = "json",
):
    config = get_config_from_file(config)
    bigsi = BIGSI(config)
    d = {
        "query": seq,
        "threshold": threshold,
        "results": bigsi.search(seq, threshold, score),
        "citation": "http://dx.doi.org/10.1038/s41587-018-0010-1",
    }
    if format == "csv":
        return d_to_csv(d)
    else:
        return json.dumps(d, indent=4)
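# Hedged example of consuming search's JSON output; `api` is the same
# hypothetical handle as above, and the 33-base query string is arbitrary
# (it only needs to be at least one k-mer long).
hits = json.loads(api.search(seq="ATC" * 11, threshold=1.0, config="bigsi.yaml"))
print(hits["query"], len(hits["results"]))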
def build(
    self,
    db: hug.types.text,
    bloomfilters: hug.types.multiple,
    samples: hug.types.multiple = [],
):
    if samples:
        assert len(samples) == len(bloomfilters)
    else:
        samples = bloomfilters
    return build(graph=BIGSI(db), bloomfilter_filepaths=bloomfilters, samples=samples)
def search(
    self,
    seq: hug.types.text,
    threshold: hug.types.float_number = 1.0,
    config: hug.types.text = None,
    score: hug.types.smart_boolean = False,
    format: hug.types.one_of(["json", "csv"]) = "json",
):
    config = get_config_from_file(config)
    bigsi = BIGSI(config)
    d = search_bigsi(bigsi, seq, threshold, score)
    if format == "csv":
        return d_to_csv(d)
    else:
        return json.dumps(d, indent=4)
def search(
    self,
    db: hug.types.text = None,
    seq: hug.types.text = None,
    seqfile: hug.types.text = None,
    threshold: hug.types.float_number = 1.0,
    output_format: hug.types.one_of(("json", "tsv", "fasta")) = "json",
    pipe_out: hug.types.smart_boolean = False,
    pipe_in: hug.types.smart_boolean = False,
    cachesize: hug.types.number = 4,
    score: hug.types.smart_boolean = False,
    nproc: hug.types.number = 4,
):
    """Returns samples that contain the searched sequence.
    Use -f to search for sequence from fasta"""
    if db is None:
        db = BDB_DB_FILENAME
    bigsi = BIGSI(db, cachesize=cachesize, nproc=nproc, mode="r")
    if output_format in ["tsv", "fasta"]:
        pipe_out = True
    if not pipe_in and (not seq and not seqfile):
        return "-s or -f must be provided"
    if seq == "-" or pipe_in:
        # Read the query sequence(s) from stdin into a temporary file.
        _, fp = tempfile.mkstemp(text=True)
        with open(fp, "w") as openfile:
            for line in sys.stdin:
                openfile.write(line)
        result = search(
            seq=None,
            fasta_file=fp,
            threshold=threshold,
            graph=bigsi,
            output_format=output_format,
            pipe=pipe_out,
            score=score,
        )
    else:
        result = search(
            seq=seq,
            fasta_file=seqfile,
            threshold=threshold,
            graph=bigsi,
            output_format=output_format,
            pipe=pipe_out,
            score=score,
        )
    if not pipe_out:
        return result
def run_insert(kmers, bloom_filter, sample_name=None):
    graph = Graph(
        storage=storage,
        bloom_filter_size=bloom_filter_size,
        num_hashes=num_hashes,
    )
    if sample_name is None:
        sample_name = os.path.basename(bloom_filter).split(".")[0]
    logger.debug("Starting insert.")
    graph.insert(bloom_filter, sample_name)
    graph.sync()
    return {
        "message": "success",
        "colour": graph.get_colour_from_sample(sample_name),
    }
def build(
    self,
    db: hug.types.text,
    bloomfilters: hug.types.multiple,
    samples: hug.types.multiple = [],
    max_memory: hug.types.text = "",
    lowmem: hug.types.smart_boolean = False,
):
    if samples:
        assert len(samples) == len(bloomfilters)
    else:
        samples = bloomfilters
    if max_memory:
        max_memory_bytes = humanfriendly.parse_size(max_memory)
    else:
        max_memory_bytes = None
    return build(
        index=BIGSI(db),
        bloomfilter_filepaths=bloomfilters,
        samples=samples,
        max_memory=max_memory_bytes,
        lowmem=lowmem,
    )
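# max_memory is parsed with humanfriendly.parse_size, so human-readable
# values work on the command line. Note parse_size is decimal by default,
# with explicit binary units supported:
import humanfriendly

assert humanfriendly.parse_size("4GB") == 4 * 10**9   # decimal gigabytes
assert humanfriendly.parse_size("4GiB") == 4 * 2**30  # binary gibibytes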
def bloom(self, outfile, db=DEFUALT_DB_DIRECTORY, kmers=None, seqfile=None, ctx=None):
    """Creates a bloom filter from a sequence file or cortex graph. (fastq,fasta,bam,ctx)
    e.g. bigsi insert ERR1010211.ctx
    """
    bigsi = BIGSI(db, mode="r")
    if ctx:
        kmers = extract_kmers_from_ctx(ctx, bigsi.kmer_size)
    if not kmers and not seqfile:
        return "--kmers or --seqfile must be provided"
    bf = bloom(outfile=outfile, kmers=kmers, kmer_file=seqfile, graph=bigsi)
def search_bigsi_parallel(l):
    # Every tuple in the chunk shares the same config, so open the index once.
    bigsi = BIGSI(l[0][0])
    results = []
    for _, seq, threshold, score in l:
        results.append(search_bigsi(bigsi, seq, threshold, score))
    return results
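# search_bigsi_parallel receives one chunk of (config, seq, threshold, score)
# tuples so each worker process opens the index exactly once. A minimal
# sketch of the chunking done by the multiprocessing bulk_search above,
# assuming `chunks` is the usual fixed-size splitter (its real definition
# lives elsewhere in this repo):
import math

def _chunks_sketch(l, n):
    # assumed behaviour: yield successive n-sized slices of l
    for i in range(0, len(l), n):
        yield l[i : i + n]

args = list(range(10))  # stand-in for the args tuples built in bulk_search
nproc = 4
# ceil(10 / 4) == 3, so the pool receives chunks of size 3, 3, 3, 1
print([len(c) for c in _chunks_sketch(args, math.ceil(len(args) / nproc))])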
def merge(self, db1: hug.types.text, db2: hug.types.text):
    BIGSI(db1).merge(BIGSI(db2))
    return {"result": "merged %s into %s." % (db2, db1)}
def build_tmp(bloomfilter_filepaths, samples, indext, i, lowmem=False):
    index_dir = indext.db + "%i.tmp" % i
    index = BIGSI.create(
        db=index_dir,
        k=indext.kmer_size,
        m=indext.bloom_filter_size,
        h=indext.num_hashes,
        force=True,
    )
    build_main(bloomfilter_filepaths, samples, index, lowmem=lowmem)
    return BIGSI(index_dir)
def init(self, db, k=31, m=25 * 10**6, h=3, force=False):
    bigsi = BIGSI.create(db=db, k=k, m=m, h=h, force=force)
    return {"k": k, "m": m, "h": h, "db": db}
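# The defaults above (m = 25e6 bits, h = 3 hashes, k = 31) fix the per-k-mer
# false positive rate via the standard Bloom filter estimate
# p ~ (1 - exp(-h * n / m)) ** h, where n is the number of distinct k-mers
# in a sample. A quick check under an assumed, illustrative n:
import math

m = 25 * 10**6  # bits per filter (default above)
h = 3           # hash functions
n = 4 * 10**6   # assumed distinct k-mers per sample (illustrative)
p = (1 - math.exp(-h * n / m)) ** h
print("estimated per-k-mer false positive rate: %.4f" % p)  # ~0.055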
def build_main(config, bloomfilter_filepaths, samples):
    bloomfilters = []
    for f in bloomfilter_filepaths:
        bloomfilters.append(load_bloomfilter(f))
    return BIGSI.build(config, bloomfilters, samples)
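# Hedged end-to-end sketch tying the pieces together: write one bloom filter
# per sample, fold them all into an index with build_main, then query it.
# Every path and sample name below is a placeholder, and the returned index
# is assumed to expose the search(seq, threshold, score) method used above.
config = get_config_from_file("bigsi.yaml")
samples = ["ERR1010211", "ERR1010212"]
bloomfilter_filepaths = ["blooms/%s.bloom" % s for s in samples]
index = build_main(config, bloomfilter_filepaths, samples)
print(index.search("ATC" * 11, 1.0, False))  # seq, threshold, score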