Example #1
0
def test_kmc_call_paired():
    """
    Checks the command string reported by kmc.kmc when it is given tests/dummy_fastq/test_R1.fastq.
    The expected command reads its input from '@tmp/filelist.txt'.
    NOTE(review): only the forward read is passed here, so presumably kmc.kmc locates the matching
    reverse read on its own - confirm against the kmc module.
    The two KMC database files are removed afterwards.
    """
    database = 'tests/kmc_db'
    expected_cmd = 'kmc -k31 -ci1  @tmp/filelist.txt tests/kmc_db tmp'
    _stdout, _stderr, reported_cmd = kmc.kmc(forward_in='tests/dummy_fastq/test_R1.fastq',
                                             database_name=database,
                                             returncmd=True)
    assert reported_cmd == expected_cmd
    # Clean up the database files so they don't linger between tests.
    for suffix in ('.kmc_pre', '.kmc_suf'):
        os.remove(database + suffix)
Example #2
0
def test_kmc_call_single():
    """
    Checks the command string reported by kmc.kmc when it is given tests/dummy_fastq/single.fastq.
    The expected command names the fastq file directly as its input.
    The two KMC database files are removed afterwards.
    """
    database = 'tests/kmc_db'
    expected_cmd = 'kmc -k31 -ci1  tests/dummy_fastq/single.fastq tests/kmc_db tmp'
    _stdout, _stderr, reported_cmd = kmc.kmc(forward_in='tests/dummy_fastq/single.fastq',
                                             database_name=database,
                                             returncmd=True)
    assert reported_cmd == expected_cmd
    # Clean up the database files so they don't linger between tests.
    for suffix in ('.kmc_pre', '.kmc_suf'):
        os.remove(database + suffix)
def kmerize_individual_fastas(potential_plasmid_list,
                              fasta_dir,
                              output_dir,
                              threads=1,
                              logfile=None):
    """
    Builds one KMC database per potential plasmid, reading each sequence from fasta_dir and writing the
    database (named after the plasmid) into output_dir.
    :param potential_plasmid_list: List of potential plasmids; each entry is used as a file name within fasta_dir.
    :param fasta_dir: Directory where FASTA files for each potential plasmid are located.
    :param output_dir: Directory to store KMC databases in. Created if it doesn't exist.
    :param threads: Number of threads to run KMC with.
    :param logfile: File to write output to. Nothing is logged when this is falsy.
    """
    # KMC needs somewhere to write, so create the destination first if it is missing.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Every KMC run shares the same scratch directory inside output_dir.
    kmc_tmpdir = os.path.join(output_dir, 'tmp')

    for plasmid_name in potential_plasmid_list:
        fasta_path = os.path.join(fasta_dir, plasmid_name)
        database_path = os.path.join(output_dir, plasmid_name)
        # fm='' is forwarded to KMC as an extra option (presumably the FASTA-input flag - confirm in kmc.kmc).
        stdout, stderr = kmc.kmc(forward_in=fasta_path,
                                 database_name=database_path,
                                 tmpdir=kmc_tmpdir,
                                 fm='',
                                 t=threads)
        if not logfile:
            continue
        accessoryFunctions.write_to_logfile(stdout, stderr, logfile)
def find_plasmid_kmer_scores(reads_kmerized,
                             kmc_database_dir,
                             output_dir,
                             threads=1,
                             cutoff=0.95):
    """
    Computes kmer overlaps (how many kmers are in both plasmid and reads) for a set of reads and however many
    kmer databases are in the kmc_database_dir folder.
    The plasmid reads found in output_dir are kmerized first (as paired reads when an R2 file is present),
    then find_score is run on every (reads_kmerized, plasmid database) pair in a process pool.
    :param reads_kmerized: Name of kmerized read database. Passed as the first argument to find_score.
    NOTE(review): the read database built here is written to <output_dir>/read_kmers; presumably callers
    pass that same name - confirm.
    :param kmc_database_dir: Folder containing kmc databases; every '*.kmc_pre' file in it is scored.
    :param output_dir: Where to store intermediate files. Must contain plasmid_reads_R1.fastq.gz.
    :param threads: Number of worker processes to use.
    :param cutoff: Only results whose score (second element) is strictly greater than this are kept.
    :return: Dictionary mapping the first element of each find_score result to its score, for results above cutoff.
    """
    kmerized_plasmids = glob.glob(os.path.join(kmc_database_dir, '*.kmc_pre'))

    forward_reads = os.path.join(output_dir, 'plasmid_reads_R1.fastq.gz')
    reverse_reads = os.path.join(output_dir, 'plasmid_reads_R2.fastq.gz')
    kmc_tmpdir = os.path.join(output_dir, 'tmp')
    read_database = os.path.join(output_dir, 'read_kmers')
    # Kmerize as paired reads only when the reverse read file actually exists.
    if os.path.isfile(reverse_reads):
        kmc.kmc(forward_in=forward_reads,
                reverse_in=reverse_reads,
                tmpdir=kmc_tmpdir,
                database_name=read_database)
    else:
        kmc.kmc(forward_in=forward_reads,
                tmpdir=kmc_tmpdir,
                database_name=read_database)

    score_jobs = [(reads_kmerized, plasmid) for plasmid in kmerized_plasmids]
    pool = multiprocessing.Pool(processes=threads)
    try:
        results = pool.starmap(find_score, score_jobs)
    finally:
        # Always shut the workers down, even if a scoring job raised, so no processes are left behind.
        pool.close()
        pool.join()

    return {result[0]: result[1] for result in results if result[1] > cutoff}
Example #5
0
def make_inclusion_kmerdb(inclusion_folder, output_db, forward_id='_R1', reverse_id='_R2', tmpdir='tmpinclusion',
                          maxmem='12', threads='2', logfile=None, k=31):
    """
    Given a folder containing some genomes, finds kmers that are common to all genomes, and writes them to output_db.
    Genomes can be in fasta (uncompressed only? check this) or fastq (gzip compressed or uncompressed) formats.
    One KMC database is built per genome inside tmpdir, then the databases are combined with 'kmc_tools complex'.
    Kmer size is set by k (31 by default).
    :param inclusion_folder: Path to folder containing your genomes.
    :param output_db: Base name for the kmc database that will be created.
    :param forward_id: Forward read identifier.
    :param reverse_id: Reverse read identifier.
    :param tmpdir: Directory where temporary databases and whatnot will be stored. Created if missing, and deleted
    when the function exits, including when an error interrupts it.
    :param maxmem: Maximum amount of memory to use when kmerizing, in GB.
    :param threads: Number of threads to use. Counterintuitively, should be a string.
    :param logfile: Text file you want commands used, as well as stdout and stderr from called programs, to be logged to
    :param k: kmer size to use for kmc kmer generation
    """
    # Make the tmpdir, if it doesn't exist already.
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)

    def database_path(number):
        # Location of the intermediate database with the given 1-based number.
        return os.path.join(tmpdir, 'database{}'.format(str(number)))

    def scratch_dir():
        # KMC scratch directory named after the current whole-second timestamp.
        return os.path.join(tmpdir, str(time.time()).split('.')[0])

    def log_call(out, err, cmd):
        # Record the command and its output, but only when a logfile was requested.
        if logfile:
            write_to_logfile(logfile, out, err, cmd)

    try:
        # Get lists of everything - fasta, paired fastq, unpaired fastq.
        fastas = glob.glob(os.path.join(inclusion_folder, '*.f*a'))
        paired_fastqs = find_paired_reads(inclusion_folder, forward_id=forward_id, reverse_id=reverse_id)
        unpaired_fastqs = find_unpaired_reads(inclusion_folder, forward_id=forward_id, reverse_id=reverse_id)

        # Make a database for each item in each list, and place it into the tmpdir.
        database_count = 0
        for fasta in fastas:
            database_count += 1
            out, err, cmd = kmc.kmc(fasta, database_path(database_count), fm='', m=maxmem, t=threads,
                                    tmpdir=scratch_dir(), returncmd=True,
                                    k=k)
            log_call(out, err, cmd)
        for pair in paired_fastqs:
            database_count += 1
            out, err, cmd = kmc.kmc(forward_in=pair[0], reverse_in=pair[1],
                                    database_name=database_path(database_count),
                                    min_occurrences=2,  # For fastqs, require two occurrences to hopefully filter out sequencing errors.
                                    m=maxmem, t=threads, tmpdir=scratch_dir(),
                                    returncmd=True,
                                    k=k)
            log_call(out, err, cmd)
        for fastq in unpaired_fastqs:
            database_count += 1
            out, err, cmd = kmc.kmc(forward_in=fastq, database_name=database_path(database_count),
                                    min_occurrences=2,  # For fastqs, require two occurrences to hopefully filter out sequencing errors.
                                    m=maxmem, t=threads, tmpdir=scratch_dir(),
                                    returncmd=True,
                                    k=k)
            log_call(out, err, cmd)

        # Create a command file to allow kmc to get an intersection of all the inclusion databases created and
        # write to our final inclusion database.
        command_file = os.path.join(tmpdir, 'command_file')
        set_numbers = range(1, database_count + 1)
        with open(command_file, 'w') as f:
            f.write('INPUT:\n')
            for number in set_numbers:
                f.write('set{} = {}\n'.format(str(number), database_path(number)))
            f.write('OUTPUT:\n{} = '.format(output_db))
            if database_count:
                # NOTE(review): no whitespace is emitted between 'sum' and the following set name
                # (e.g. 'set1*sumset2') - confirm kmc_tools parses this as intended.
                f.write('*sum'.join('set{}'.format(str(number)) for number in set_numbers) + '\n')

        cmd = 'kmc_tools complex {}'.format(command_file)
        if logfile:
            with open(logfile, 'a+') as f:
                f.write('Command: {}\n'.format(cmd))
                # Flush before handing the descriptor to the child process; otherwise the buffered command
                # line lands in the log after kmc_tools' own output.
                f.flush()
                subprocess.call(cmd, shell=True, stderr=f, stdout=f)
        else:
            # No logfile requested: send the tool's output to a throwaway file inside tmpdir.
            with open(os.path.join(tmpdir, 'asdf.txt'), 'w') as f:
                subprocess.call(cmd, shell=True, stderr=f, stdout=f)
    finally:
        # Remove the intermediate databases whether or not the steps above succeeded.
        shutil.rmtree(tmpdir)