Exemple #1
0
 def merge(self, config: hug.types.text, merge_config: hug.types.text):
     """Merge the index built from ``merge_config`` into the one built from ``config``.

     Both arguments are handed to ``get_config_from_file``; the values it
     returns are used to construct the two ``BIGSI`` objects and are also
     what gets interpolated into the returned message.

     Returns:
         A dict with a single ``"result"`` key holding the status message.
     """
     config, merge_config = (
         get_config_from_file(config),
         get_config_from_file(merge_config),
     )
     target = BIGSI(config)
     source = BIGSI(merge_config)
     merge(target, source)
     message = "merged %s into %s." % (merge_config, config)
     return {"result": message}
Exemple #2
0
    def insert(self, db: hug.types.text, bloomfilter, sample):
        """Inserts a bloom filter into the graph

        e.g. bigsi insert ERR1010211.bloom ERR1010211

        Args:
            db: Value passed to ``BIGSI`` to open the graph.
            bloomfilter: Forwarded unchanged to the module-level ``insert``.
            sample: Forwarded unchanged to the module-level ``insert``.

        Returns:
            Whatever the module-level ``insert`` returns.
        """
        # Open the graph once and reuse it; the previous code constructed a
        # second BIGSI(db) for the call and left the first one unused.
        graph = BIGSI(db)
        return insert(graph=graph, bloomfilter=bloomfilter, sample=sample)
Exemple #3
0
    def bulk_search(
        self,
        fasta: hug.types.text,
        threshold: hug.types.float_number = 1.0,
        config: hug.types.text = None,
        score: hug.types.smart_boolean = False,
        format: hug.types.one_of(["json", "csv"]) = "json",
        stream: hug.types.smart_boolean = False,
    ):
        """Search the index for every sequence found in a FASTA file.

        Args:
            fasta: Value handed to ``Fasta``; each of its values is searched.
            threshold: Forwarded unchanged to the search.
            config: Handed to ``get_config_from_file`` to obtain the config.
            score: Forwarded unchanged to the search.
            format: ``"json"`` or ``"csv"``.
            stream: When false, the queries are spread over a process pool
                and the combined output is returned as one string.  When
                true, each result is printed as soon as it is computed and
                nothing is returned.

        Returns:
            The CSV or JSON text when ``stream`` is false, otherwise ``None``.
        """
        config = get_config_from_file(config)
        fasta = Fasta(fasta)

        if not stream:
            # The workers receive a config (not an index object) whose
            # "nproc" is pinned to 1; the pool itself uses the original value.
            worker_config = copy.copy(config)
            worker_config["nproc"] = 1
            nproc = config.get("nproc", 1)
            args = [(worker_config, str(seq), threshold, score)
                    for seq in fasta.values()]
            # Never request zero-sized chunks (happens when the FASTA is empty).
            chunk_size = max(1, math.ceil(len(args) / nproc))
            with multiprocessing.Pool(processes=nproc) as pool:
                nested = pool.map_async(search_bigsi_parallel,
                                        chunks(args, chunk_size)).get()
            dd = [item for sublist in nested for item in sublist]
            if format == "csv":
                return "\n".join([d_to_csv(d, False, False) for d in dd])
            return json.dumps(dd, indent=4)

        bigsi = BIGSI(config)
        for i, seq in enumerate(fasta.values()):
            seq = str(seq)
            d = {
                "query": seq,
                "threshold": threshold,
                "results": bigsi.search(seq, threshold, score),
                "citation": "http://dx.doi.org/10.1038/s41587-018-0010-1",
            }
            if format == "csv":
                # Header only on the first record.  Previously the "last
                # record" branch left with_header at its prior value, so a
                # two-record FASTA emitted the header twice.
                with_header = i == 0
                # As before, the first record never gets the trailing
                # carriage return, even when it is also the last one.
                carriage_return = i != 0 and i == len(fasta) - 1
                print(d_to_csv(d, with_header, carriage_return))
            else:
                print(json.dumps(d))
        return None
Exemple #4
0
 def bulk_search(
     self,
     fasta: hug.types.text,
     threshold: hug.types.float_number = 1.0,
     config: hug.types.text = None,
     score: hug.types.smart_boolean = False,
     format: hug.types.one_of(["json", "csv"]) = "json",
     stream: hug.types.smart_boolean = False,
 ):
     """Search the index for every sequence found in a FASTA file.

     Args:
         fasta: Value handed to ``Fasta``; each of its values is searched.
         threshold: Forwarded unchanged to the search.
         config: Handed to ``get_config_from_file`` to obtain the config.
         score: Forwarded unchanged to the search.
         format: ``"json"`` or ``"csv"``.
         stream: When false, the queries run on a thread pool that shares one
             ``BIGSI`` object and the combined output is returned as one
             string.  When true, each result is printed as soon as it is
             computed and nothing is returned.

     Returns:
         The CSV or JSON text when ``stream`` is false, otherwise ``None``.
     """
     config = get_config_from_file(config)
     bigsi = BIGSI(config)
     fasta = Fasta(fasta)

     if not stream:
         nproc = config.get("nproc", 1)
         args = [(bigsi, str(seq), threshold, score)
                 for seq in fasta.values()]
         with ThreadPool(processes=nproc) as pool:
             dd = pool.starmap(search_bigsi, args)
         if format == "csv":
             return "\n".join([d_to_csv(d, False, False) for d in dd])
         return json.dumps(dd, indent=4)

     for i, seq in enumerate(fasta.values()):
         seq = str(seq)
         d = {
             "query": seq,
             "threshold": threshold,
             "results": bigsi.search(seq, threshold, score),
             "citation": "http://dx.doi.org/10.1038/s41587-018-0010-1",
         }
         if format == "csv":
             # Header only on the first record.  Previously the "last record"
             # branch left with_header at its prior value, so a two-record
             # FASTA emitted the header twice.
             with_header = i == 0
             # As before, the first record never gets the trailing carriage
             # return, even when it is also the last one.
             carriage_return = i != 0 and i == len(fasta) - 1
             print(d_to_csv(d, with_header, carriage_return))
         else:
             print(json.dumps(d))
     return None
Exemple #5
0
 def delete(self, db: hug.types.text = None):
     """Delete the index opened from ``db``.

     If constructing ``BIGSI(db)`` raises ``ValueError`` the call is a
     no-op and ``None`` is returned; otherwise the result of the
     module-level ``delete`` is returned.
     """
     try:
         index = BIGSI(db)
     except ValueError:
         # Best effort: nothing could be opened, so there is nothing to do.
         return None
     return delete(index)
Exemple #6
0
 def variant_search(
     self,
     reference: hug.types.text,
     ref: hug.types.text,
     pos: hug.types.number,
     alt: hug.types.text,
     gene: hug.types.text = None,
     genbank: hug.types.text = None,
     config: hug.types.text = None,
     format: hug.types.one_of(["json", "csv"]) = "json",
 ):
     """Search the index for a variant and return the result as text.

     When both ``genbank`` and ``gene`` are given the query goes through
     ``BIGSIAminoAcidMutationSearch``; when neither is given it goes through
     ``BIGSIVariantSearch``.

     Returns:
         CSV text when ``format`` is ``"csv"``, otherwise indented JSON.

     Raises:
         ValueError: If only one of ``genbank`` / ``gene`` is supplied.
     """
     config = get_config_from_file(config)
     bigsi = BIGSI(config)

     # Exactly one of the pair supplied -> reject.
     if bool(genbank) != bool(gene):
         raise ValueError("genbank and gene must be supplied together")

     if genbank:
         searcher = BIGSIAminoAcidMutationSearch(bigsi, reference, genbank)
         d = searcher.search(gene, ref, pos, alt)
     else:
         searcher = BIGSIVariantSearch(bigsi, reference)
         d = searcher.search(ref, pos, alt)

     d["citation"] = "http://dx.doi.org/10.1038/s41587-018-0010-1"
     return d_to_csv(d) if format == "csv" else json.dumps(d, indent=4)
Exemple #7
0
    def insert(self, config: hug.types.text, bloomfilter, sample):
        """Insert a bloom filter into the index.

        e.g. bigsi insert ERR1010211.bloom ERR1010211

        ``config`` is resolved through ``get_config_from_file`` and used to
        build the ``BIGSI`` index that is passed, together with
        ``bloomfilter`` and ``sample``, to the module-level ``insert``.
        """
        loaded_config = get_config_from_file(config)
        return insert(
            index=BIGSI(loaded_config),
            bloomfilter=bloomfilter,
            sample=sample,
        )
Exemple #8
0
def bloom(config, outfile, kmers):
    """Build a bloom filter from ``kmers`` and write it to disk.

    Args:
        config: Forwarded to ``BIGSI.bloom``.
        outfile: Output location; it is resolved with ``os.path.realpath``
            and then mapped to the final file name by ``bloom_file_name``.
        kmers: Forwarded to ``BIGSI.bloom``.

    The parent directory of the final file is created when missing.
    """
    outfile = os.path.realpath(outfile)
    bloomfilter = BIGSI.bloom(config, kmers)
    off = bloom_file_name(outfile)
    # exist_ok avoids the check-then-create race of the previous
    # "if not exists: makedirs" sequence when several writers start at once.
    os.makedirs(os.path.dirname(off), exist_ok=True)
    with open(off, "wb") as of:
        bloomfilter.tofile(of)
Exemple #9
0
 def search(self,
            seq: hug.types.text,
            threshold: hug.types.float_number = 1.0,
            config: hug.types.text = None,
            score: hug.types.smart_boolean = False,
            format: hug.types.one_of(["json", "csv"]) = "json"):
     """Search the index for ``seq`` and return the result as text.

     The config is loaded with ``get_config_from_file`` and used to build
     the ``BIGSI`` index that is queried.

     Returns:
         CSV text when ``format`` is ``"csv"``, otherwise indented JSON.
     """
     index = BIGSI(get_config_from_file(config))
     hits = index.search(seq, threshold, score)
     d = {
         "query": seq,
         "threshold": threshold,
         "results": hits,
         "citation": "http://dx.doi.org/10.1038/s41587-018-0010-1"
     }
     return d_to_csv(d) if format == "csv" else json.dumps(d, indent=4)
Exemple #10
0
 def build(self,
           db: hug.types.text,
           bloomfilters: hug.types.multiple,
           samples: hug.types.multiple = []):
     """Build the graph opened from ``db`` out of the given bloom filters.

     When ``samples`` is empty the bloom filter paths themselves are used as
     the sample names; otherwise the two sequences must have equal length
     (checked with an ``assert``).

     Returns:
         Whatever the module-level ``build`` returns.
     """
     if not samples:
         samples = bloomfilters
     else:
         assert len(samples) == len(bloomfilters)
     graph = BIGSI(db)
     return build(
         graph=graph,
         bloomfilter_filepaths=bloomfilters,
         samples=samples,
     )
Exemple #11
0
 def search(
     self,
     seq: hug.types.text,
     threshold: hug.types.float_number = 1.0,
     config: hug.types.text = None,
     score: hug.types.smart_boolean = False,
     format: hug.types.one_of(["json", "csv"]) = "json",
 ):
     """Search the index for ``seq`` via ``search_bigsi``.

     Returns:
         CSV text when ``format`` is ``"csv"``, otherwise indented JSON.
     """
     index = BIGSI(get_config_from_file(config))
     outcome = search_bigsi(index, seq, threshold, score)
     return d_to_csv(outcome) if format == "csv" else json.dumps(outcome, indent=4)
Exemple #12
0
    def search(self,
               db: hug.types.text = None,
               seq: hug.types.text = None,
               seqfile: hug.types.text = None,
               threshold: hug.types.float_number = 1.0,
               output_format: hug.types.one_of(
                   ("json", "tsv", "fasta")) = 'json',
               pipe_out: hug.types.smart_boolean = False,
               pipe_in: hug.types.smart_boolean = False,
               cachesize: hug.types.number = 4,
               score: hug.types.smart_boolean = False,
               nproc: hug.types.number = 4):
        """Returns samples that contain the searched sequence.
        Use -f to search for sequence from fasta

        When ``seq`` is ``"-"`` or ``pipe_in`` is set, the query is read from
        standard input and spooled to a temporary file that is then searched
        as a sequence file.  ``tsv`` and ``fasta`` output always force
        ``pipe_out``.

        Returns:
            An error string when neither a sequence nor a sequence file is
            available, the search result when ``pipe_out`` is false, and
            ``None`` otherwise.
        """
        if db is None:
            db = BDB_DB_FILENAME
        bigsi = BIGSI(db, cachesize=cachesize, nproc=nproc, mode="r")
        if output_format in ["tsv", "fasta"]:
            pipe_out = True

        if not pipe_in and (not seq and not seqfile):
            return "-s or -f must be provided"

        if seq == "-" or pipe_in:
            # NamedTemporaryFile hands back an open file object, so no raw
            # descriptor is leaked (mkstemp's fd was previously never
            # closed).  delete=False keeps the file on disk for the search.
            with tempfile.NamedTemporaryFile(mode="w", delete=False) as openfile:
                for line in sys.stdin:
                    openfile.write(line)
                seqfile = openfile.name
            seq = None

        result = search(seq=seq,
                        fasta_file=seqfile,
                        threshold=threshold,
                        graph=bigsi,
                        output_format=output_format,
                        pipe=pipe_out,
                        score=score)

        if not pipe_out:
            return result
        return None
Exemple #13
0
def run_insert(kmers, bloom_filter, sample_name=None):
    """Insert ``bloom_filter`` into a freshly constructed ``Graph``.

    The graph is configured from the module-level ``storage``,
    ``bloom_filter_size`` and ``num_hashes``.  When ``sample_name`` is not
    given it defaults to the bloom filter's base file name up to the first
    dot.  ``kmers`` is accepted but not used here.

    Returns:
        A dict with ``"message"`` and the ``"colour"`` the graph reports for
        the sample.
    """
    graph = Graph(
        storage=storage,
        bloom_filter_size=bloom_filter_size,
        num_hashes=num_hashes,
    )
    if sample_name is None:
        file_name = os.path.basename(bloom_filter)
        sample_name = file_name.partition('.')[0]
    logger.debug("Starting insert. ")
    graph.insert(bloom_filter, sample_name)
    graph.sync()
    colour = graph.get_colour_from_sample(sample_name)
    return {"message": "success", "colour": colour}
Exemple #14
0
 def build(self,
           db: hug.types.text,
           bloomfilters: hug.types.multiple,
           samples: hug.types.multiple = [],
           max_memory: hug.types.text = '',
           lowmem: hug.types.smart_boolean = False):
     """Build the index opened from ``db`` out of the given bloom filters.

     When ``samples`` is empty the bloom filter paths themselves are used as
     the sample names; otherwise the two sequences must have equal length
     (checked with an ``assert``).  A non-empty ``max_memory`` is converted
     with ``humanfriendly.parse_size``; an empty one becomes ``None``.

     Returns:
         Whatever the module-level ``build`` returns.
     """
     if not samples:
         samples = bloomfilters
     else:
         assert len(samples) == len(bloomfilters)
     max_memory_bytes = (
         humanfriendly.parse_size(max_memory) if max_memory else None
     )
     index = BIGSI(db)
     return build(
         index=index,
         bloomfilter_filepaths=bloomfilters,
         samples=samples,
         max_memory=max_memory_bytes,
         lowmem=lowmem,
     )
Exemple #15
0
    def bloom(self,
              outfile,
              db=DEFUALT_DB_DIRECTORY,
              kmers=None,
              seqfile=None,
              ctx=None):
        """Creates a bloom filter from a sequence file or cortex graph. (fastq,fasta,bam,ctx)

        e.g. bigsi insert ERR1010211.ctx

        When ``ctx`` is given, the kmers are extracted from it using the
        index's ``kmer_size`` and replace any ``kmers`` argument.

        Returns:
            An error string when neither kmers nor a sequence file is
            available, otherwise ``None``.
        """
        # The docstring used to sit below this statement, where Python
        # treats it as a plain expression rather than as documentation.
        bigsi = BIGSI(db, mode="r")
        if ctx:
            kmers = extract_kmers_from_ctx(ctx, bigsi.kmer_size)
        if not kmers and not seqfile:
            return "--kmers or --seqfile must be provided"
        # The result was previously bound to an unused local; the method
        # still returns None on success.
        bloom(outfile=outfile,
              kmers=kmers,
              kmer_file=seqfile,
              graph=bigsi)
Exemple #16
0
def search_bigsi_parallel(l):
    """Run ``search_bigsi`` for every query in one chunk of work.

    Args:
        l: Sequence of ``(config, seq, threshold, score)`` tuples.  The
            config of the first tuple is used to build the single ``BIGSI``
            object shared by all queries in the chunk; the configs of the
            remaining tuples are ignored.

    Returns:
        A list with one ``search_bigsi`` result per tuple, in input order.
        An empty chunk yields an empty list.
    """
    # An empty chunk has no config to build an index from; previously this
    # raised IndexError on l[0][0].
    if not l:
        return []
    bigsi = BIGSI(l[0][0])
    return [
        search_bigsi(bigsi, seq, threshold, score)
        for _, seq, threshold, score in l
    ]
Exemple #17
0
 def merge(self, db1: hug.types.text, db2: hug.types.text):
     """Merge the index opened from ``db2`` into the one opened from ``db1``.

     Returns:
         A dict with a single ``"result"`` key holding the status message.
     """
     target = BIGSI(db1)
     target.merge(BIGSI(db2))
     message = "merged %s into %s." % (db2, db1)
     return {"result": message}
Exemple #18
0
def build_tmp(bloomfilter_filepaths, samples, indext, i, lowmem=False):
    """Build a temporary index that mirrors the parameters of ``indext``.

    The temporary index lives at ``indext.db`` suffixed with ``"<i>.tmp"``
    and is created (with ``force=True``) using the kmer size, bloom filter
    size and hash count of ``indext``.  It is then populated through
    ``build_main``.

    Returns:
        A ``BIGSI`` object newly opened on the temporary index location.
    """
    tmp_db = indext.db + ("%i.tmp" % i)
    tmp_index = BIGSI.create(
        db=tmp_db,
        k=indext.kmer_size,
        m=indext.bloom_filter_size,
        h=indext.num_hashes,
        force=True,
    )
    build_main(bloomfilter_filepaths, samples, tmp_index, lowmem=lowmem)
    return BIGSI(tmp_db)
Exemple #19
0
 def init(self, db, k=31, m=25 * 10**6, h=3, force=False):
     """Create a new index at ``db`` and echo back the parameters used.

     All arguments are forwarded as keywords to ``BIGSI.create``.

     Returns:
         A dict with the keys ``k``, ``m``, ``h`` and ``db``.
     """
     BIGSI.create(db=db, k=k, m=m, h=h, force=force)
     return dict(k=k, m=m, h=h, db=db)
Exemple #20
0
def build_main(config, bloomfilter_filepaths, samples):
    """Load every bloom filter file and build the index from them.

    Each path in ``bloomfilter_filepaths`` is read with
    ``load_bloomfilter``; the loaded filters are passed, in the same order,
    to ``BIGSI.build`` together with ``config`` and ``samples``.

    Returns:
        Whatever ``BIGSI.build`` returns.
    """
    loaded = [load_bloomfilter(path) for path in bloomfilter_filepaths]
    return BIGSI.build(config, loaded, samples)