Beispiel #1
0
def get_count_project(project, list_kmers):
    """Fill every k-mer of ``list_kmers`` with its count in each sample.

    Each k-mer first gets its count storage initialised through
    ``kmer.init_count(project.num_samples)``.  Then, for every sample yielded
    by ``project.it_samples`` (indexed from 0 in iteration order), the
    sample's Jellyfish file is opened and ``kmer.add_count`` is called with
    the value returned by ``kmer.get_count_jf`` and the sample index.

    :param project: object exposing ``it_samples`` (iterable of samples with
        ``name`` and ``jf_file`` attributes) and ``num_samples``.
    :param list_kmers: re-iterable collection of k-mer objects providing
        ``init_count``, ``add_count`` and ``get_count_jf``; it is traversed
        once for initialisation and once per sample.
    :return: None; the k-mer objects are updated in place.
    """
    # Local import: the original block only relied on ``os`` for this step.
    import subprocess

    for kmer in list_kmers:
        kmer.init_count(project.num_samples)

    # enumerate() replaces the manual ``.next()`` call (Python 2 only) and the
    # duplicated first-sample code path.
    for cpt_sample, sample in enumerate(project.it_samples):
        print(time.strftime('%X') + " " + sample.name)
        # Trick: read the whole file once so it is loaded in cache before
        # Jellyfish queries it.  An argument list (no shell) keeps paths with
        # spaces or shell metacharacters working.
        try:
            subprocess.call(["wc", "-l", sample.jf_file])
        except OSError:
            # The warm-up is best effort only; a missing ``wc`` must not
            # abort the counting itself.
            pass
        jf = jellyfish.QueryMerFile(sample.jf_file)
        for kmer in list_kmers:
            kmer.add_count(kmer.get_count_jf(jf), cpt_sample)
        # Drop the handle before the next sample's file is opened.
        del jf

    gc.collect()
    print(time.strftime('%X') + ": Get count done!")
def export_sparse_features(sigmers, sample, indir, outfile):
    """Write one sparse feature line per sample to ``outfile``.

    For every ``(name, label)`` pair of ``sample`` the Jellyfish file
    ``indir + name + "_count.jf"`` is opened and queried with the canonical
    form of each mer of ``sigmers``.  The output line starts with the label
    and is followed by ``index:count`` entries (1-based position of the mer
    within ``sigmers``) for the mers whose count is positive.

    :param sigmers: re-iterable collection of mer strings; traversed once per
        sample.
    :param sample: iterable of ``(name, label)`` pairs; ``len(sample)`` is
        needed for the progress message emitted every 50 samples.
    :param indir: prefix prepended verbatim to each file name (no path
        separator is inserted).
    :param outfile: path of the text file to create or overwrite.
    :return: None
    """
    # The context manager closes the output even when a query file cannot be
    # opened half-way through.
    with open(outfile, 'w') as outfh:
        for i, (s, l) in enumerate(sample, 1):
            if i % 50 == 0:
                echo("\t\t ... Completed %f" % (float(i) / float(len(sample))))

            qf = jellyfish.QueryMerFile(indir + s + "_count.jf")

            outfh.write("%s " % (l))
            for j, mer in enumerate(sigmers, 1):
                jmer = jellyfish.MerDNA(mer)
                jmer.canonicalize()

                # Look the mer up once and reuse the value for the test and
                # for the output.
                count = qf[jmer]
                if count > 0:
                    outfh.write("%d:%d " % (j, count))

            outfh.write("\n")
Beispiel #3
0
 def test_query(self):
     """Check each (mer, count) pair of ``self.mf`` against a QueryMerFile lookup."""
     query_file = jellyfish.QueryMerFile(os.path.join(data, "sequence.jf"))
     # all() stops at the first mismatch, like the explicit break did.
     counts_agree = all(count == query_file[mer] for mer, count in self.mf)
     self.assertTrue(counts_agree)
Beispiel #4
0
 def __init__(self, filename, cutoff=0.30, n_cutoff=500, canonical=True):
     """Open the Jellyfish file ``filename`` and remember the settings.

     :param filename: path handed to ``jellyfish.QueryMerFile``; also kept
         as ``self.filename``.
     :param cutoff: stored unchanged as ``self.cutoff``.
     :param n_cutoff: stored unchanged as ``self.n_cutoff``.
     :param canonical: stored unchanged as ``self.canonical``.
     """
     # Plain settings.
     self.filename, self.canonical = filename, canonical
     self.cutoff, self.n_cutoff = cutoff, n_cutoff

     # The file is opened before MerDNA.k() is read.
     # NOTE(review): presumably opening the file is what sets k - confirm.
     self.jf = jellyfish.QueryMerFile(filename)
     self.k = jellyfish.MerDNA.k()
Beispiel #5
0
    def set_jf_file(self, path):
        """
        Open ``path`` both as a query file and as a read file.

        The handles are stored on ``self.qf_filtered`` (``QueryMerFile``) and
        ``self.rf`` (``ReadMerFile``), in that order.

        :param path: location of the Jellyfish file to open
        :return: None
        """
        query_handle = jellyfish.QueryMerFile(path)
        self.qf_filtered = query_handle

        reader_handle = jellyfish.ReadMerFile(path)
        self.rf = reader_handle
Beispiel #6
0
def write_kmer(project, dict_seqs, prefix, args, path_dir):
    """Export per-sample k-mer counts for every sequence of ``dict_seqs``.

    Two kinds of files are written inside ``path_dir`` (created if missing):

    * ``<prefix>_count.tab``: one tab-separated table with, for each k-mer of
      each sequence, its identifier columns followed by one count column per
      sample.  The header line is written only with the first sequence.
    * ``<name>_count_ggplot.csv``: one CSV per sequence with one row per
      (k-mer, sample) pair.

    :param project: object whose ``samples`` attribute yields samples with
        ``name``, ``group``, ``jf_file`` and ``num_kmer`` attributes.
    :param dict_seqs: mapping from sequence name to a sliceable sequence
        object whose slices expose ``seq``, ``entropy`` and ``switch``.
    :param prefix: prefix of the tab file name.
    :param args: object providing ``LKMER``, ``LOG_F`` and ``LOG_C``.
    :param path_dir: output directory.
    :return: None
    """
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)

    header_written = False
    tab_path = os.path.join(path_dir, str(prefix) + "_count.tab")

    with open(tab_path, 'w') as w_tab:
        for name, seq in list(dict_seqs.items()):
            # NOTE(review): the range stops at len(seq) - LKMER, so the
            # window ending on the last position is not produced - confirm
            # this is intended.
            n_windows = len(seq) - args.LKMER
            kmers = [seq[start:start + args.LKMER]
                     for start in range(n_windows)]

            # rows[0] is the header; rows[i] describes the i-th k-mer.
            rows = ["ID_ASSEMBLY\tKMER\tENTROPY\tSWITCH"]
            for kmer in kmers:
                rows.append(str(name) + "\t" + kmer.seq + "\t" +
                            str(kmer.entropy) + "\t" + str(kmer.switch))

            csv_path = os.path.join(path_dir,
                                    str(name) + "_count_ggplot.csv")
            with open(csv_path, 'w') as w_ggplot:
                w_ggplot.write("POS,KMER,SAMPLE,GROUP,LOG_COUNT,COUNT\n")

                for sample in project.samples:
                    # Every sample contributes one extra column to the table.
                    rows[0] += "\t" + sample.name
                    jf = jellyfish.QueryMerFile(sample.jf_file)

                    for pos, kmer in enumerate(kmers, 1):
                        count = get_count(kmer, jf)
                        scaled = np.log10(count * args.LOG_F + args.LOG_C)
                        reference = np.log10(sample.num_kmer + args.LOG_C)
                        log_count = scaled / reference

                        w_ggplot.write(
                            str(pos) + "," + kmer.seq + "," +
                            sample.name + "," + sample.group + "," +
                            str(log_count) + "," + str(count) + "\n")

                        rows[pos] += "\t" + str(count)

            # Only the first sequence carries the header into the tab file.
            if header_written:
                rows_out = rows[1:]
            else:
                rows_out = rows
                header_written = True

            w_tab.writelines(row + "\n" for row in rows_out)
def prepare_jellyfish(indir, label_file, read_info, k):
    """Open the Jellyfish files of all labelled samples, split by label.

    Samples whose label equals the string ``"-1"`` go to the negative group,
    every other sample to the positive group.  For each sample the file
    ``<indir>/<sample>_count.jf`` is opened as a ``QueryMerFile`` and the
    sample's entry of ``load_read_info(read_info, k)`` is recorded at the
    same position of the matching factor list.

    :param indir: directory holding the ``*_count.jf`` files.
    :param label_file: passed to ``parse_labels``, which must yield
        ``(sample, label)`` pairs.
    :param read_info: passed to ``load_read_info`` together with ``k``.
    :param k: passed to ``load_read_info``.
    :return: tuple ``(positive, negative, positive_factor, negative_factor)``.
    """
    norm_factors = load_read_info(read_info, k)
    labels = parse_labels(label_file)

    positive, positive_factor = [], []
    negative, negative_factor = [], []

    for (p, l) in labels:
        filename = os.path.join(indir, p + "_count.jf")
        is_negative = (l == "-1")
        handles = negative if is_negative else positive
        factors = negative_factor if is_negative else positive_factor

        handles.append(jellyfish.QueryMerFile(filename))
        factors.append(norm_factors[p])

    return (positive, negative, positive_factor, negative_factor)
Beispiel #8
0
    def __init__(self, name, path, json_dump=None):
        """
        Initialize a StrainObject, either from a k-mer count file or from a
        previously exported dictionary.

        Without ``json_dump`` the attributes are computed or reset to their
        defaults and ``path`` is opened as a ``jellyfish.QueryMerFile``.
        With ``json_dump`` the attributes are copied from the dictionary and
        the ``name`` and ``path`` arguments are ignored.

        :param name: strain name (used only when ``json_dump`` is None)
        :param path: path of the k-mer count file (used only when
            ``json_dump`` is None)
        :param json_dump: optional dict with the keys read below
            ("strain_name", "path_count_file", "filtered_kmer_set",
            "kmer_count_histogram", "coverage", "kmer_cutoff",
            "has_suitable_coverage", "kmer_archive", "unique_kmers",
            "distinct_kmers", "max_count", "warnings", "ard_results")
        :return: None
        """
        if json_dump is None:
            self.jellyfish_path = "jellyfish"
            self.name = name
            self.path = path
            self.rapid_mode = False
            self.do_not_filter = False
            # NOTE(review): these helpers are defined outside this view;
            # presumably they rely on the attributes assigned above, so the
            # statement order matters - confirm before reordering.
            self.histo = self.get_histo()
            self.coverage = self.get_estimate_coverage()
            self.__check_resources()
            self.kmer_cutoff = None
            self.has_suitable_coverage = False
            self.kmer_set = set([])
            self.kmer_archive = set([])
            # Per-instance output path: strain name plus 8 random uppercase
            # letters.
            # NOTE(review): predictable location under /tmp built with
            # ``random`` - consider ``tempfile`` if this must be safe on
            # shared machines.
            self.filtered_jf_file = "/tmp/tmp_filtered_{0}_{1}.jf".\
                format(self.name, ''.join(random.choice(string.ascii_uppercase) for i in range(8)))
            self.ard = {}
            self.unique_kmers = None
            self.distinct_kmers = None
            self.total_kmers = None
            self.max_count = None
            self.qf = jellyfish.QueryMerFile(self.path)
            self.qf_filtered = None
            self.rf = None
            self.warnings = []
        else:
            # NOTE(review): this branch does not set jellyfish_path,
            # rapid_mode, filtered_jf_file, ard, total_kmers, qf, qf_filtered
            # or rf, while ard_result exists only here - verify that code
            # using restored objects never reads the missing attributes.
            self.name = json_dump["strain_name"]
            self.path = json_dump["path_count_file"]
            # NOTE(review): do_not_filter is filled from the
            # "filtered_kmer_set" key - confirm this mapping is intended.
            self.do_not_filter = json_dump["filtered_kmer_set"]
            # Histogram keys and values are converted back to integers.
            self.histo = {
                int(k): int(v)
                for k, v in json_dump["kmer_count_histogram"].items()
            }
            self.coverage = json_dump["coverage"]
            self.kmer_cutoff = json_dump["kmer_cutoff"]
            self.has_suitable_coverage = json_dump["has_suitable_coverage"]
            # NOTE(review): kmer_set and kmer_archive are both rebuilt from
            # the "kmer_archive" key - confirm this is not a copy/paste slip.
            self.kmer_set = set(json_dump["kmer_archive"])
            self.kmer_archive = set(json_dump["kmer_archive"])
            self.unique_kmers = json_dump["unique_kmers"]
            self.distinct_kmers = json_dump["distinct_kmers"]
            self.max_count = json_dump["max_count"]
            self.warnings = json_dump["warnings"]
            self.ard_result = json_dump["ard_results"]
Beispiel #9
0
    def __filter_jf_file(self):
        """
        Pick the k-mer file used downstream, open it for querying and
        reading, then build the k-mer set.

        When ``self.do_not_filter`` is set the raw count file ``self.path``
        is used as is.  Otherwise ``jellyfish merge`` is run with ``-L``
        equal to ``kmer_cutoff + 1`` on the raw file and the packaged dummy
        file, writing to ``self.filtered_jf_file``.
        :return: None
        """
        if self.do_not_filter:
            # No filtering requested: the raw file doubles as the filtered one.
            self.filtered_jf_file = self.path
        else:
            dummy_jf_file = pkg_resources.resource_filename(
                'straintypemer', 'data/dummy_A.jf')
            lower_count = str(int(self.kmer_cutoff) + 1)
            merge_command = [
                "jellyfish", "merge", "-L", lower_count,
                "-o", self.filtered_jf_file,
                self.path, dummy_jf_file,
            ]
            subprocess.check_call(merge_command)

        # Both branches end the same way: open the chosen file twice and
        # build the set.
        chosen_file = self.filtered_jf_file
        self.qf_filtered = jellyfish.QueryMerFile(chosen_file)
        self.rf = jellyfish.ReadMerFile(chosen_file)
        self.__create_set()
Beispiel #10
0
def kmercount(k, fname):
    """Return a vector of doubled, pseudo-counted k-mer counts from ``fname``.

    The vector has ``4 ** k`` entries of dtype ``uint16``, filled in the
    order produced by ``allkmers(k)``.  Each entry holds
    ``1 + 2 * count``: counts are stored in double quantity so that the 0.5
    smoothing pseudo count fits in an integer vector.

    :param k: k-mer length.
    :param fname: path of the Jellyfish file to query.
    :return: one-dimensional ``numpy`` array of dtype ``uint16``.
    """
    # Errors raised while opening the file propagate to the caller.
    qf = jellyfish.QueryMerFile(fname)

    # Start from the pseudo count (1 == 0.5 in doubled units).
    # NOTE(review): uint16 cannot hold doubled counts above 65534 - confirm
    # the inputs stay below that.
    c = np.ones(1 << (2 * k), dtype=np.uint16)
    for slot, letters in enumerate(allkmers(k)):
        c[slot] += 2 * qf[jellyfish.MerDNA(''.join(letters))]
    return c
Beispiel #11
0
def get_kmer_freq_v(jfdb='../data/GRCh38.p2.ch21/GRCh38.p2.ch21.5010000.jf',
                    k=5):
    """Return the relative frequencies of all k-mers stored in ``jfdb``.

    The k-mers are enumerated with ``itertools.product`` over the alphabet
    ``A, C, G, T`` and their counts are divided by the total count.

    :param jfdb: path of the Jellyfish file to query.
    :param k: k-mer length.
    :return: ``numpy`` array of shape ``(1, 4 ** k)`` and dtype ``float64``.
    """
    # Errors raised while opening the file propagate to the caller.
    qf = jellyfish.QueryMerFile(jfdb)

    alphabet = ('A', 'C', 'G', 'T')
    counts = [
        qf[jellyfish.MerDNA(''.join(letters))]
        for letters in itertools.product(alphabet, repeat=k)
    ]

    # Wrapping the list once more yields a single-row matrix.
    freq = np.array([counts], dtype=np.float64)
    freq /= np.sum(freq)
    return freq
Beispiel #12
0
def test07(jfdb='../data/GRCh38.p2.ch21/GRCh38.p2.ch21.5010000.jf', k=5):
    """Print the relative frequencies of all k-mers stored in ``jfdb``.

    The k-mers are enumerated with ``itertools.product`` over the alphabet
    ``A, C, G, T``; their counts are divided by the total count and the
    resulting single-row ``float64`` matrix is printed.

    :param jfdb: path of the Jellyfish file to query.
    :param k: k-mer length.
    :return: None
    :raises RuntimeError: re-raised, after a message is printed, when raised
        while opening ``jfdb``.
    """
    try:
        qf = jellyfish.QueryMerFile(jfdb)
    except RuntimeError:
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3.
        print('jellyfish runtime error')
        raise

    alph = ('A', 'C', 'G', 'T')
    freq_l = [
        qf[jellyfish.MerDNA(''.join(km))]
        for km in itertools.product(alph, repeat=k)
    ]
    a = np.array([freq_l], dtype=np.float64)
    a /= np.sum(a)
    print(a)
    return
Beispiel #13
0
def kmercount(k, pos, chr=21,
              fname_head='/data/yt/GRCh38.p2.ch21/GRCh38.p2'):
    """Return a column of doubled, pseudo-counted k-mer counts for one file.

    The queried file is ``<fname_head>.ch<chr>.<pos>.fasta.<k>.jf``.  The
    result has one row per k-mer, in the order produced by ``allkmers(k)``,
    each holding ``1 + 2 * count``: counts are stored in double quantity so
    that the 0.5 smoothing pseudo count fits in an integer vector.

    :param k: k-mer length.
    :param pos: position tag used in the file name.
    :param chr: chromosome tag used in the file name.
    :param fname_head: common prefix of the file names.
    :return: ``numpy`` array of shape ``(4 ** k, 1)`` and dtype ``uint16``.
    """
    fname = '{head}.ch{chr}.{pos}.fasta.{k}.jf'.format(
        head=fname_head, chr=chr, pos=pos, k=k)
    # Errors raised while opening the file propagate to the caller.
    qf = jellyfish.QueryMerFile(fname)

    # Start from the pseudo count (1 == 0.5 in doubled units).
    c = np.ones((1 << (2 * k), 1), dtype=np.uint16)
    for row, letters in enumerate(allkmers(k)):
        c[row, 0] += 2 * qf[jellyfish.MerDNA(''.join(letters))]
    return c
Beispiel #14
0
#! /usr/bin/env python

import jellyfish
import sys

# Usage: <script> FILE.jf MER [MER ...]
# Prints "<mer> <count>" for every mer given on the command line.
qf = jellyfish.QueryMerFile(sys.argv[1])
# The loop variable is called ``mer`` so the builtin ``str`` stays usable.
for mer in sys.argv[2:]:
    print("%s %d" % (mer, qf[jellyfish.MerDNA(mer)]))

Beispiel #15
0
    sys.exit()
# sys.argv[1]: directory holding the Jellyfish files; sys.argv[2]: k-mer size.
k = int(sys.argv[2])
# The log file names are built by prefixing sys.argv[1] verbatim.
cosineFile = open("%scosinek%d.log" % (sys.argv[1], k), 'w')
jaccardFile = open("%sjaccardk%d.log" % (sys.argv[1], k), 'w')

# Build the list of files / genomes to compare: every regular file of the
# directory whose name ends with "k<k>.jf".  join() is used for the stored
# path as well as for the isfile() test, so both refer to the same file even
# when sys.argv[1] has no trailing separator.
files = [
    join(sys.argv[1], f) for f in listdir(sys.argv[1])
    if isfile(join(sys.argv[1], f)) and f.endswith('k%d.jf' % (k))
]

#print files
for idx, jfi_1 in enumerate(files[:-1]):
    for jfi_2 in files[idx + 1:]:
        jfi1_RFile = jellyfish.ReadMerFile(jfi_1)
        jfi1_QFile = jellyfish.QueryMerFile(jfi_1)
        jfi2_RFile = jellyfish.ReadMerFile(jfi_2)
        jfi2_QFile = jellyfish.QueryMerFile(jfi_2)
        t1 = []
        t2 = []
        notUnion = 0
        for mer, count1 in jfi1_RFile:
            #print count1
            count2 = jfi2_QFile[mer]
            if count2 == 0:
                notUnion += 1
            t1.append(int(count1))
            t2.append(int(count2))
        for mer, count2 in jfi2_RFile:
            if jfi1_QFile[mer] == 0:
                t1.append(0)
Beispiel #16
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the k-mer based completeness / chromosome-assignment analysis.

    Steps performed by this function:

    1. Compile and import the Jellyfish Python bindings and set the k-mer
       length to ``KMERS_LEN``; on any failure a warning is logged and the
       function returns.
    2. For assemblies accepted by ``check_jf_successful_check`` the values
       stored in ``<label>.stat`` are loaded into the report and the assembly
       is not analysed again.
    3. The Jellyfish binary counts k-mers of the reference and of each
       remaining assembly.
    4. For every reference k-mer the assemblies containing it are counted
       (``KMER_COMPLETENESS``); every 100th k-mer found in all assemblies is
       kept as a marker.
    5. Markers are mapped to the reference sequence they occur in, and each
       contig is classified by the reference sequences of its markers.

    :param output_dir: directory for the ``.jf`` and ``.stat`` files.
    :param ref_fpath: path of the reference FASTA file.
    :param contigs_fpaths: list of assembly FASTA paths.
    :param logger: logger providing ``print_timestamp``, ``main_info``,
        ``info`` and ``warning``.
    :return: None
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique 101-mers...')
    addsitedir(jellyfish_python_dirpath)
    try:
        compile_jellyfish(logger)
        import jellyfish
        # Reload the module: ``imp.reload`` is tried first, the builtin
        # ``reload`` is the fallback.
        try:
            import imp
            imp.reload(jellyfish)
        except:
            reload(jellyfish)
        jellyfish.MerDNA.k(KMERS_LEN)
    except:
        # NOTE(review): the bare except hides the actual failure reason
        # (including KeyboardInterrupt) - consider narrowing it.
        logger.warning('Failed unique 101-mers analysis.')
        return

    # Reuse results of earlier runs when a valid stats file exists.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_jf_successful_check(output_dir, contigs_fpath, contigs_fpaths,
                                     ref_fpath):
            jf_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(jf_stats_fpath).read().split('\n')
            # Fewer than four lines: the file is incomplete, analyse again.
            if len(stats_content) < 4:
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            # Each stats line is parsed as "<text>: <value>".
            report.add_field(
                reporting.Fields.KMER_COMPLETENESS,
                '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                '%.2f' % float(stats_content[1].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                '%.2f' % float(stats_content[2].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                '%.2f' % float(stats_content[3].strip().split(': ')[-1]))
            checked_assemblies.append(contigs_fpath)

    # Only assemblies without reusable results are processed below.
    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    logger.info('Running Jellyfish on reference...')
    jf_out_fpath = join(output_dir, basename(ref_fpath) + '.jf')
    # NOTE(review): the k-mer size is hard-coded to 101 here while the
    # bindings were configured with KMERS_LEN - presumably both are 101,
    # confirm.
    qutils.call_subprocess([
        jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
        str(getsize(ref_fpath)), '-o', jf_out_fpath, '-t',
        str(qconfig.max_threads), ref_fpath
    ])
    ref_kmers = jellyfish.ReadMerFile(jf_out_fpath)
    # NOTE(review): the file is removed while ref_kmers is still iterated
    # further down; this presumably relies on the handle keeping the data
    # accessible after unlinking - confirm on all supported platforms.
    os.remove(jf_out_fpath)

    logger.info('Running Jellyfish on assemblies...')
    contigs_kmers = []
    for contigs_fpath in contigs_fpaths:
        jf_out_fpath = join(output_dir, basename(contigs_fpath) + '.jf')
        qutils.call_subprocess([
            jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
            str(getsize(contigs_fpath)), '-o', jf_out_fpath, '-t',
            str(qconfig.max_threads), contigs_fpath
        ])
        contigs_kmers.append(jellyfish.QueryMerFile(jf_out_fpath))
        os.remove(jf_out_fpath)

    logger.info('Analyzing completeness and accuracy of assemblies...')
    unique_kmers = 0
    # matched_kmers[idx]: number of reference k-mers found in assembly idx.
    matched_kmers = defaultdict(int)
    shared_kmers = set()
    kmer_i = 0
    for kmer, count in ref_kmers:
        unique_kmers += 1
        matches = 0
        for idx in range(len(contigs_fpaths)):
            if contigs_kmers[idx][kmer]:
                matched_kmers[idx] += 1
                matches += 1
        # Of the k-mers present in every assembly, every 100th one is kept.
        if matches == len(contigs_fpaths):
            if kmer_i % 100 == 0:
                shared_kmers.add(str(kmer))
            kmer_i += 1

    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        # NOTE(review): raises ZeroDivisionError when ref_kmers yields no
        # k-mer at all - confirm this cannot happen.
        completeness = matched_kmers[idx] * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS,
                         '%.2f' % completeness)

    # Map each kept k-mer to the name of a reference sequence containing it
    # (the last one visited wins when several contain it).
    shared_kmers_by_chrom = dict()
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    for name, seq in ref_contigs.items():
        seq_kmers = jellyfish.string_mers(seq)
        for kmer in seq_kmers:
            if str(kmer) in shared_kmers:
                shared_kmers_by_chrom[str(kmer)] = name

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = 0
        len_map_to_multi_chrom = 0
        total_len = 0

        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            seq_kmers = jellyfish.string_mers(seq)
            # Reference sequence name of every kept k-mer found in this
            # contig.
            chrom_markers = []
            for kmer in seq_kmers:
                kmer_str = str(kmer)
                if kmer_str in shared_kmers_by_chrom:
                    chrom = shared_kmers_by_chrom[kmer_str]
                    chrom_markers.append(chrom)
            # Contigs with fewer than MIN_MARKERS hits are not classified;
            # their length ends up in the "none" share computed below.
            if len(chrom_markers) < MIN_MARKERS:
                continue
            if len(set(chrom_markers)) == 1:
                len_map_to_one_chrom += len(seq)
            else:
                len_map_to_multi_chrom += len(seq)

        len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
        # NOTE(review): total_len is 0 for an assembly without sequences,
        # which makes the percentages below raise ZeroDivisionError.
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                         '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                         '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                         '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_jf_stats_file(
            output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS),
            len_map_to_one_chrom, len_map_to_multi_chrom,
            len_map_to_none_chrom)

    logger.info('Done.')
Beispiel #17
0
def genquery(genomeFile, jellyFile, totedits, medindel, insprob, delprob,
             queryfreq, querycount, outputFile):
    """Write a random stream of edits and k-mer count queries to ``outputFile``.

    The edit types are drawn up front (``int(prob * totedits)`` of each kind,
    shuffled).  One line is written per edit:

    * ``I <pos> <seq>`` for an insertion,
    * ``D <start> <end>`` for a deletion,
    * ``S <pos> <seq>`` for a SNP.

    After every ``queryfreq`` edits, ``querycount`` lines of the form
    ``Q <kmer> <flag> <count>`` are written, where ``count`` is looked up in
    the Jellyfish file.  The flag is ``I`` when the k-mer was sampled from
    the k-mers collected around earlier edits and ``N`` when it was taken
    from a random genome position.

    :param genomeFile: open file whose first line is the genome; it is
        closed by this function.
    :param jellyFile: path of the Jellyfish file used for the counts.
    :param totedits: total number of edits to make.
    :param medindel: size parameter handed to ``random_insertion`` and
        ``random_deletion``.
    :param insprob: probability of an insertion.
    :param delprob: probability of a deletion.
    :param queryfreq: number of edits between two batches of queries.
    :param querycount: number of queries per batch.
    :param outputFile: open, writable file; it is closed by this function.
    :return: None
    :raises ValueError: if ``delprob + insprob`` exceeds 1.0.
    """
    if delprob + insprob > 1.0:
        # A real exception type is required: raising a plain string is not
        # valid.
        raise ValueError("Error, delprob = {} and insprob = {}. "
                         "The sum is {} > 1.0".format(
                             delprob, insprob, delprob + insprob))

    genome = genomeFile.readline()
    genomeFile.close()
    qf = jellyfish.QueryMerFile(jellyFile)
    # Drops the last character of the line.
    # NOTE(review): assumes the line ends with a newline - confirm.
    numbases = len(genome) - 1
    genome = genome[0:numbases]

    snpProb = 1.0 - (insprob + delprob)
    SNPrange = int(snpProb * totedits)
    insrange = int(insprob * totedits)
    delrange = int(delprob * totedits)

    editTypes = (['S'] * SNPrange) +\
                (['D'] * delrange) +\
                (['I'] * insrange)

    random.shuffle(editTypes)
    qcount = 0
    effectedkmers = set()
    for val in editTypes:
        qcount += 1
        if val == 'I':
            p, s, seq = random_insertion(numbases, medindel)
            numbases += s
            outputFile.write('I %d %s\n' % (p, seq))
            add_kmers_in_seq(effectedkmers, seq)
            add_kmers_in_seq(effectedkmers, genome[p - K + 1:p + K])

        elif val == 'D':
            p, s = random_deletion(numbases, medindel)
            numbases -= s
            outputFile.write('D %d %d\n' % (p, p + s - 1))

        else:
            p, seq = random_snp(numbases)
            outputFile.write('S %d %s\n' % (p, seq))
            add_kmers_in_seq(effectedkmers, genome[p - K + 1:p + K - 1])

        # if it's time to output some queries
        if qcount == queryfreq:
            qcount = 0
            for _ in range(querycount):
                dart = random.random()
                # Deletions do not add k-mers, so the set can still be empty
                # here; fall back to a genome query in that case.
                if dart <= EDIT_QUERY_PROB and effectedkmers:
                    # random.sample() needs a sequence on Python 3.11+.
                    kmer = random.sample(list(effectedkmers), 1)[0]
                    editflag = 'I'
                else:
                    p = random.randrange(K * 2, numbases - K * 2)
                    kmer = genome[p:p + K].upper()
                    editflag = 'N'

                kcount = int(qf[jellyfish.MerDNA(kmer)])
                outputFile.write('Q %s %s %d\n' % (kmer, editflag, kcount))

    outputFile.close()
 def import_counts(self):
     """Open the Jellyfish table at ``self.path`` and keep it as ``self.qf``.

     A progress message is printed before and after the table is opened.
     """
     print('importing jellyfish table', self.path)
     self.qf = jellyfish.QueryMerFile(self.path)
     print('table loaded')