Esempio n. 1
0
def run_mms_for(fastx: str, k: int, epsilon: float, args):
    """Build and check one single MM sketch.

    The k-mers of ``fastx`` are counted with kmc (only when the two database
    files are not already on disk), the sketch is built by ``run_fress_mms``
    and checked by ``run_fress_mmschk``. The sketch file is then compressed
    into a temporary archive whose size is measured before it is deleted.

    Args:
        fastx: the fasta/fastq file to be sketched.
        k: the k-mer length.
        epsilon: the approximation factor, a number in [0, 1].
        args: options object. This function reads ``args.c``, ``args.w`` and
            ``args.m`` (forwarded to kmc), ``args.f`` (path under which the
            sketch files live) and ``args.g`` (forwarded to the fress
            wrappers).

    Returns:
        One tab-separated row with the following columns:
        dataset name | epsilon | k | number of true collisions | threshold
        (round(L1 * epsilon)) | sum of deltas | average delta | max delta |
        theoretical uncompressed size | compressed size | construction time |
        average query time

    Raises:
        ValueError: if epsilon is not a number between 0 and 1 (NaN included).
    """
    from decimal import Decimal

    # The chained comparison also rejects NaN, which "< 0 or > 1" lets through.
    if not 0 <= epsilon <= 1:
        raise ValueError("epsilon must be a number between 0 and 1")
    filename, _, _, _, kmcdb = kmc.getKMCPaths(k, fastx, args.c)
    pre_file = kmcdb + ".kmc_pre"
    suf_file = kmcdb + ".kmc_suf"
    if not (os.path.exists(pre_file) and os.path.exists(suf_file)):
        kmc.count(k, fastx, args.c, args.w, args.m, True)

    # Fractional digits of epsilon, used as a tag in the file names.
    # str(epsilon) has no '.' for integers and for small floats printed in
    # scientific notation (e.g. 1e-05), so expand it to plain fixed-point
    # notation first. Values such as 0.01 still yield the same tag ("01").
    eps_digits = format(Decimal(str(epsilon)), "f").partition(".")[2] or "0"
    sketch_name = "{}k{}e{}".format(filename, k, eps_digits)
    bin_name = sketch_name + ".mms"
    arch_name = sketch_name + ".gz"
    sketch_path = os.path.join(args.f, sketch_name)
    arch_path = os.path.join(args.f, arch_name)

    L1, dim, max_val, construction_time = run_fress_mms(
        kmcdb, sketch_path, epsilon, args.g)
    L1 = int(L1)
    dim = int(dim)
    max_val = int(max_val)
    construction_time = int(construction_time)
    ncolls, ntrue_colls, sod, avgd, maxd, avg_qtime = run_fress_mmschk(
        kmcdb, sketch_path, args.g)
    ncolls = int(ncolls)
    ntrue_colls = int(ntrue_colls)
    avgd = float(avgd)
    maxd = int(maxd)
    avg_qtime = int(avg_qtime)

    sys.stderr.write("number of cells = {}, max freq = {}\n".format(
        dim, max_val))
    # math.log2 is exact on powers of two, whereas math.log(x, 2) can land
    # slightly above the true value and make ceil() count one bit too many.
    bits_per_cell = math.ceil(math.log2(max_val))
    theoretical_udim = round(dim * bits_per_cell / 8)

    # The archive only exists to measure the compressed size.
    compress(args.f, [bin_name], arch_path)
    cdim = os.stat(arch_path).st_size
    os.remove(arch_path)

    row = (filename, epsilon, k, ntrue_colls, round(L1 * epsilon), sod, avgd,
           maxd, theoretical_udim, cdim, construction_time, avg_qtime)
    return "\t".join(str(field) for field in row)
Esempio n. 2
0
def run_bbhash_for(fastx: str, k: int, _, args):
    """Build and check a BBHash MPHF together with its payload.

    NOTE(review): the original summary read "Build and check BBHash MPHF
    with = 1"; a parameter name appears to be missing there — confirm which
    setting was meant.

    The k-mers of ``fastx`` are counted with kmc (only when the two database
    files are not already on disk) and the structure is built by
    ``run_fress_bbhash``. The MPHF and payload files are then compressed into
    a temporary archive whose size is measured before it is deleted.

    Args:
        fastx: the fasta/fastq file to be processed.
        k: the k-mer length.
        _: ignored; it occupies the slot used for epsilon by the other
            ``run_*_for`` functions.
        args: options object. This function reads ``args.c``, ``args.w`` and
            ``args.m`` (forwarded to kmc) and ``args.f`` (path under which
            the output files live).

    Returns:
        One tab-separated row with the following columns:
        dataset name | k-value | mphf uncompressed size | total uncompressed
        size | total compressed size | construction time | average query time
    """
    filename, _, _, _, kmcdb = kmc.getKMCPaths(k, fastx, args.c)
    pre_file = kmcdb + ".kmc_pre"
    suf_file = kmcdb + ".kmc_suf"
    if not (os.path.exists(pre_file) and os.path.exists(suf_file)):
        kmc.count(k, fastx, args.c, args.w, args.m, True)

    sketch_name = "{}k{}".format(filename, k)
    mphf_name = sketch_name + ".bbh"
    payload_name = sketch_name + ".pld"
    arch_name = sketch_name + "_BBH.gz"
    sketch_path = os.path.join(args.f, sketch_name)
    mphf_path = os.path.join(args.f, mphf_name)
    arch_path = os.path.join(args.f, arch_name)

    max_val, L0, construction_time, avg_qtime = run_fress_bbhash(
        kmcdb, sketch_path)
    max_val = int(max_val)
    L0 = int(L0)
    construction_time = int(construction_time)
    avg_qtime = int(avg_qtime)

    mphf_size = os.stat(mphf_path).st_size
    # math.log2 is exact on powers of two, whereas math.log(x, 2) can land
    # slightly above the true value and make ceil() count one bit too many.
    bits_per_value = math.ceil(math.log2(max_val))
    theoretical_udim = round(L0 * bits_per_value / 8) + mphf_size

    # The archive only exists to measure the compressed size.
    compress(args.f, [mphf_name, payload_name], arch_path)
    cdim = os.stat(arch_path).st_size
    os.remove(arch_path)

    row = (filename, k, mphf_size, theoretical_udim, cdim, construction_time,
           avg_qtime)
    return "\t".join(str(field) for field in row)
Esempio n. 3
0
def run_merged_sms_for(fastx: str, k: int, epsilon: float, args):
    """Build and check one single SM sketch with column merging.

    Similar to run_sms_for, but this function merges the first args.g columns
    of the histogram and assigns the weighted average of the removed elements
    to all k-mers involved in the operation.

    Args:
        fastx: the fasta/fastq file to be sketched.
        k: the k-mer length.
        epsilon: the approximation factor, a number in [0, 1].
        args: options object. This function reads ``args.c``, ``args.w`` and
            ``args.m`` (forwarded to kmc), ``args.f`` (path under which the
            sketch files live) and ``args.g`` (number of histogram columns to
            merge).

    Returns:
        One tab-separated row with the following columns:
        dataset name | epsilon | k-value | R | B | number of true collisions |
        threshold (round(L1 * epsilon)) | L1 sum of deltas | average delta |
        sum of deltas / number of true collisions | max delta |
        uncompressed size | compressed size | args.g | freq | unerr

        ``freq`` and ``unerr`` are passed through as returned by
        ``opt_dim_main`` (presumably the frequency assigned to the merged
        columns and the resulting error — confirm against opt_dim_main).

    Raises:
        ValueError: if epsilon is not a number between 0 and 1 (NaN included).
    """
    import time
    from decimal import Decimal

    # The chained comparison also rejects NaN, which "< 0 or > 1" lets through.
    if not 0 <= epsilon <= 1:
        raise ValueError("epsilon must be a number between 0 and 1")
    filename, _, _, _, kmcdb = kmc.getKMCPaths(k, fastx, args.c)
    pre_file = kmcdb + ".kmc_pre"
    suf_file = kmcdb + ".kmc_suf"
    if not (os.path.exists(pre_file) and os.path.exists(suf_file)):
        kmc.count(k, fastx, args.c, args.w, args.m, True)

    # Fractional digits of epsilon, used as a tag in the file names.
    # str(epsilon) has no '.' for integers and for small floats printed in
    # scientific notation (e.g. 1e-05), so expand it to plain fixed-point
    # notation first. Values such as 0.01 still yield the same tag ("01").
    eps_digits = format(Decimal(str(epsilon)), "f").partition(".")[2] or "0"
    sketch_name = "{}k{}e{}".format(filename, k, eps_digits)
    histo_name = sketch_name + ".shist.txt"
    cmb_name = sketch_name + ".cmb.txt"
    bin_name = sketch_name + ".bin"
    arch_name = sketch_name + ".gz"
    sketch_path = os.path.join(args.f, sketch_name)
    histo_path = os.path.join(args.f, histo_name)
    cmb_path = os.path.join(args.f, cmb_name)
    arch_path = os.path.join(args.f, arch_name)

    # The histogram is only needed to choose the sketch dimensions; make sure
    # the scratch file is removed even when one of the two steps fails.
    tmp_name = str(time.time()) + ".hist.txt"
    try:
        run_fress_histogram(kmcdb, tmp_name)
        r, b, _, _, freq, unerr = opt_dim_main(tmp_name, epsilon, None, None,
                                               args.g)
    finally:
        if os.path.exists(tmp_name):
            os.remove(tmp_name)

    L1, dim = run_fress_sense(kmcdb, sketch_path, epsilon, r, b)
    L1 = int(L1)
    dim = int(dim)
    ncolls, ntrue_colls, sod, avgd, maxd = run_fress_check(
        kmcdb, sketch_path, args.g, freq)
    ncolls = int(ncolls)
    ntrue_colls = int(ntrue_colls)
    sod = float(sod)
    avgd = float(avgd)
    maxd = float(maxd)
    # A sketch without true collisions must not abort the whole run with a
    # ZeroDivisionError; report a zero average instead.
    tavg = sod / ntrue_colls if ntrue_colls else 0.0

    with open(cmb_path, "r") as hc:
        ncombinations = sum(1 for _ in hc)
    # math.log2 is exact on powers of two, whereas math.log(x, 2) can land
    # slightly above the true value and make ceil() count one bit too many.
    bits_per_cell = math.ceil(math.log2(ncombinations))
    theoretical_udim = (round(dim * bits_per_cell / 8) +
                        os.stat(histo_path).st_size +
                        os.stat(cmb_path).st_size)

    # The archive only exists to measure the compressed size.
    compress(args.f, [histo_name, cmb_name, bin_name], arch_path)
    cdim = os.stat(arch_path).st_size
    os.remove(arch_path)

    row = (filename, epsilon, k, r, b, ntrue_colls, round(L1 * epsilon), sod,
           avgd, tavg, maxd, theoretical_udim, cdim, args.g, freq, unerr)
    return "\t".join(str(field) for field in row)
Esempio n. 4
0
def run_sms_for(fastx: str, k: int, epsilon: float, args):
    """Build and check one single SM sketch.

    Computations:
    - apply kmc to the dataset for the given k (skipped when the two kmc
      database files are already on disk)
    - sketch the resulting kmc database with fress sense
    - run fress check to get the collision and error statistics
    - get the (theoretical) uncompressed size of the fress sketch
    - get the compressed size of the fress sketch

    Args:
        fastx: one fasta/fastq to be sketched.
        k: the k-mer length.
        epsilon: the approximation factor, a number in [0, 1].
        args: options object. This function reads ``args.c``, ``args.w`` and
            ``args.m`` (forwarded to kmc) and ``args.f`` (path under which
            the sketch files live).

    Returns:
        One tab-separated row with the following columns:
        dataset name | epsilon | k | number of collisions | threshold
        (round(L1 * epsilon)) | L1 sum of deltas | average delta | max delta |
        uncompressed size | compressed size | construction time |
        average query time

        ATTENTION: average delta is not (L1 sum of deltas / number of
        collisions) but an average computed using collisions counted as
        intersections of size different than one (instead of the wrong
        frequency).

    Raises:
        ValueError: if epsilon is not a number between 0 and 1 (NaN included).
    """
    from decimal import Decimal

    # The chained comparison also rejects NaN, which "< 0 or > 1" lets through.
    if not 0 <= epsilon <= 1:
        raise ValueError("epsilon must be a number between 0 and 1")
    filename, _, _, _, kmcdb = kmc.getKMCPaths(k, fastx, args.c)
    pre_file = kmcdb + ".kmc_pre"
    suf_file = kmcdb + ".kmc_suf"
    if not (os.path.exists(pre_file) and os.path.exists(suf_file)):
        kmc.count(k, fastx, args.c, args.w, args.m, True)

    # Fractional digits of epsilon, used as a tag in the file names.
    # str(epsilon) has no '.' for integers and for small floats printed in
    # scientific notation (e.g. 1e-05), so expand it to plain fixed-point
    # notation first. Values such as 0.01 still yield the same tag ("01").
    eps_digits = format(Decimal(str(epsilon)), "f").partition(".")[2] or "0"
    sketch_name = "{}k{}e{}".format(filename, k, eps_digits)
    histo_name = sketch_name + ".shist.txt"
    cmb_name = sketch_name + ".cmb.txt"
    bin_name = sketch_name + ".bin"
    arch_name = sketch_name + ".gz"
    sketch_path = os.path.join(args.f, sketch_name)
    histo_path = os.path.join(args.f, histo_name)
    cmb_path = os.path.join(args.f, cmb_name)
    arch_path = os.path.join(args.f, arch_name)

    L1, dim, construction_time = run_fress_sense(kmcdb, sketch_path, epsilon)
    L1 = int(L1)
    dim = int(dim)
    construction_time = int(construction_time)
    ncolls, ntrue_colls, sod, avgd, maxd, avg_qtime = run_fress_check(
        kmcdb, sketch_path)
    ncolls = int(ncolls)
    ntrue_colls = int(ntrue_colls)
    avgd = float(avgd)
    maxd = int(maxd)
    avg_qtime = int(avg_qtime)

    with open(cmb_path, "r") as hc:
        ncombinations = sum(1 for _ in hc)
    # math.log2 is exact on powers of two, whereas math.log(x, 2) can land
    # slightly above the true value and make ceil() count one bit too many.
    bits_per_cell = math.ceil(math.log2(ncombinations))
    theoretical_udim = (round(dim * bits_per_cell / 8) +
                        os.stat(histo_path).st_size +
                        os.stat(cmb_path).st_size)

    # The archive only exists to measure the compressed size.
    compress(args.f, [histo_name, cmb_name, bin_name], arch_path)
    cdim = os.stat(arch_path).st_size
    os.remove(arch_path)

    row = (filename, epsilon, k, ntrue_colls, round(L1 * epsilon), sod, avgd,
           maxd, theoretical_udim, cdim, construction_time, avg_qtime)
    return "\t".join(str(field) for field in row)