def test_kmc_call_paired():
    """
    Run the kmc wrapper on the test_R1 dummy FASTQ and check the command string it reports.
    The expected command reads its input from a file list (@tmp/filelist.txt) rather than from a single file.
    NOTE(review): presumably the wrapper locates the matching reverse read itself - confirm in kmc.kmc.
    """
    expected_cmd = 'kmc -k31 -ci1 @tmp/filelist.txt tests/kmc_db tmp'
    _, _, actual_cmd = kmc.kmc(forward_in='tests/dummy_fastq/test_R1.fastq',
                               database_name='tests/kmc_db',
                               returncmd=True)
    assert actual_cmd == expected_cmd
    # Remove the database files left behind by the call so later tests start clean.
    for leftover in ('tests/kmc_db.kmc_pre', 'tests/kmc_db.kmc_suf'):
        os.remove(leftover)
def test_kmc_call_single():
    """
    Run the kmc wrapper on a single FASTQ file and check the command string it reports.
    The expected command passes the FASTQ path directly on the command line.
    """
    expected_cmd = 'kmc -k31 -ci1 tests/dummy_fastq/single.fastq tests/kmc_db tmp'
    _, _, actual_cmd = kmc.kmc(forward_in='tests/dummy_fastq/single.fastq',
                               database_name='tests/kmc_db',
                               returncmd=True)
    assert actual_cmd == expected_cmd
    # Remove the database files left behind by the call so later tests start clean.
    for leftover in ('tests/kmc_db.kmc_pre', 'tests/kmc_db.kmc_suf'):
        os.remove(leftover)
def kmerize_individual_fastas(potential_plasmid_list, fasta_dir, output_dir, threads=1, logfile=None):
    """
    Builds one KMC database per potential plasmid. Each plasmid name is used both as the name of its
    FASTA file inside fasta_dir and as the name of the database written to output_dir.
    :param potential_plasmid_list: List of potential plasmids (file names relative to fasta_dir).
    :param fasta_dir: Directory where the FASTA file for each potential plasmid is located.
    :param output_dir: Directory to store KMC databases in. Created if it doesn't exist.
    :param threads: Number of threads to run KMC with.
    :param logfile: File to write KMC stdout/stderr to. Nothing is logged if not specified.
    """
    # Make output dir if necessary.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    # Every KMC call shares the same scratch directory underneath the output directory.
    scratch_dir = os.path.join(output_dir, 'tmp')
    for plasmid_name in potential_plasmid_list:
        # Call KMC in FASTA mode (fm) on each individual FASTA.
        fasta_path = os.path.join(fasta_dir, plasmid_name)
        database_path = os.path.join(output_dir, plasmid_name)
        out, err = kmc.kmc(forward_in=fasta_path,
                           database_name=database_path,
                           tmpdir=scratch_dir,
                           fm='',
                           t=threads)
        if logfile:
            accessoryFunctions.write_to_logfile(out, err, logfile)
def find_plasmid_kmer_scores(reads_kmerized, kmc_database_dir, output_dir, threads=1, cutoff=0.95):
    """
    Computes kmer overlaps (how many kmers are in both plasmid and reads) for a set of reads and however many
    kmer databases are in the kmc_database_dir folder.
    The plasmid reads in output_dir (plasmid_reads_R1.fastq.gz, plus plasmid_reads_R2.fastq.gz when present)
    are kmerized into output_dir/read_kmers before scoring.
    NOTE(review): scoring is done against reads_kmerized, not the read_kmers path built here - callers are
    presumably expected to pass that same database name; confirm.
    :param reads_kmerized: Name of kmerized read database. Passed to find_score for every plasmid database.
    :param kmc_database_dir: Folder containing the plasmid KMC databases (found via their .kmc_pre files).
    :param output_dir: Where the plasmid reads are located and where intermediate files are stored.
    :param threads: Number of worker processes to score plasmids with.
    :param cutoff: Only plasmids with a score strictly greater than this value are reported.
    :return: Dictionary built from the find_score results, mapping the first element of each result
    to its score (the second element) for every result whose score exceeds cutoff.
    """
    # Collect the plasmid databases before kmerizing the reads, so a read database written into the same
    # folder is never picked up as a plasmid.
    kmerized_plasmids = glob.glob(os.path.join(kmc_database_dir, '*.kmc_pre'))

    # Kmerize the plasmid reads, using the reverse reads only if they exist.
    kmc_arguments = {'forward_in': os.path.join(output_dir, 'plasmid_reads_R1.fastq.gz'),
                     'tmpdir': os.path.join(output_dir, 'tmp'),
                     'database_name': os.path.join(output_dir, 'read_kmers')}
    reverse_reads = os.path.join(output_dir, 'plasmid_reads_R2.fastq.gz')
    if os.path.isfile(reverse_reads):
        kmc_arguments['reverse_in'] = reverse_reads
    kmc.kmc(**kmc_arguments)

    # Score every plasmid database against the read database in parallel.
    score_arguments = [(reads_kmerized, plasmid_database) for plasmid_database in kmerized_plasmids]
    pool = multiprocessing.Pool(processes=threads)
    try:
        results = pool.starmap(find_score, score_arguments)
    finally:
        # Always shut the pool down, even if scoring raised, so worker processes are not leaked.
        pool.close()
        pool.join()

    return {result[0]: result[1] for result in results if result[1] > cutoff}
def _write_intersection_command_file(command_file, databases, output_db):
    """
    Writes a kmc_tools 'complex' command file that combines all of the given databases into output_db.
    :param command_file: Path of the command file to create.
    :param databases: Paths of the input KMC databases, in order. They are named set1, set2, ... in the file.
    :param output_db: Base name of the KMC database kmc_tools should write.
    """
    set_names = ['set{}'.format(number) for number in range(1, len(databases) + 1)]
    with open(command_file, 'w') as f:
        f.write('INPUT:\n')
        for set_name, database in zip(set_names, databases):
            f.write('{} = {}\n'.format(set_name, database))
        f.write('OUTPUT:\n{} = '.format(output_db))
        # With no databases the expression is left empty, exactly as before.
        if set_names:
            f.write('*sum'.join(set_names) + '\n')


def make_inclusion_kmerdb(inclusion_folder, output_db, forward_id='_R1', reverse_id='_R2', tmpdir='tmpinclusion',
                          maxmem='12', threads='2', logfile=None, k=31):
    """
    Given a folder containing some genomes, finds kmers that are common to all genomes, and writes them to output_db.
    Genomes can be in fasta (uncompressed only? check this) or fastq (gzip compressed or uncompressed) formats.
    Kmers found are of size k (31 by default).
    :param inclusion_folder: Path to folder containing your genomes.
    :param output_db: Base name for the kmc database that will be created.
    :param forward_id: Forward read identifier.
    :param reverse_id: Reverse read identifier.
    :param tmpdir: Directory where temporary databases and whatnot will be stored. Deleted upon method completion,
    whether or not the method succeeded.
    :param maxmem: Maximum amount of memory to use when kmerizing, in GB.
    :param threads: Number of threads to use. Counterintuitively, should be a string.
    :param logfile: Text file you want commands used, as well as stdout and stderr from called programs, to be logged to
    :param k: kmer size to use for kmc kmer generation
    """
    # Make the tmpdir, if it doesn't exist already.
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)
    try:
        # Get lists of everything - fasta, paired fastq, unpaired fastq.
        fastas = glob.glob(os.path.join(inclusion_folder, '*.f*a'))
        paired_fastqs = find_paired_reads(inclusion_folder, forward_id=forward_id, reverse_id=reverse_id)
        unpaired_fastqs = find_unpaired_reads(inclusion_folder, forward_id=forward_id, reverse_id=reverse_id)

        # Make a database for each item in each list, and place it into the tmpdir.
        # Databases are numbered from 1 in creation order: fastas, then paired fastqs, then unpaired fastqs.
        databases = []
        for fasta in fastas:
            database = os.path.join(tmpdir, 'database{}'.format(len(databases) + 1))
            out, err, cmd = kmc.kmc(fasta, database, fm='', m=maxmem, t=threads,
                                    tmpdir=os.path.join(tmpdir, str(time.time()).split('.')[0]),
                                    returncmd=True, k=k)
            if logfile:
                write_to_logfile(logfile, out, err, cmd)
            databases.append(database)
        for pair in paired_fastqs:
            database = os.path.join(tmpdir, 'database{}'.format(len(databases) + 1))
            out, err, cmd = kmc.kmc(forward_in=pair[0], reverse_in=pair[1], database_name=database,
                                    # For fastqs, make min_occurrence two to hopefully filter out sequencing errors.
                                    min_occurrences=2,
                                    m=maxmem, t=threads,
                                    tmpdir=os.path.join(tmpdir, str(time.time()).split('.')[0]),
                                    returncmd=True, k=k)
            if logfile:
                write_to_logfile(logfile, out, err, cmd)
            databases.append(database)
        for fastq in unpaired_fastqs:
            database = os.path.join(tmpdir, 'database{}'.format(len(databases) + 1))
            out, err, cmd = kmc.kmc(forward_in=fastq, database_name=database,
                                    # For fastqs, make min_occurrence two to hopefully filter out sequencing errors.
                                    min_occurrences=2,
                                    m=maxmem, t=threads,
                                    tmpdir=os.path.join(tmpdir, str(time.time()).split('.')[0]),
                                    returncmd=True, k=k)
            if logfile:
                write_to_logfile(logfile, out, err, cmd)
            databases.append(database)

        # Create a command file to allow kmc to get an intersection of all the inclusion databases created and
        # write to our final inclusion database.
        command_file = os.path.join(tmpdir, 'command_file')
        _write_intersection_command_file(command_file, databases, output_db)
        cmd = 'kmc_tools complex {}'.format(command_file)
        if logfile:
            with open(logfile, 'a+') as f:
                f.write('Command: {}'.format(cmd))
                # Flush before handing the file to the child process; otherwise the buffered command line
                # would only reach the log after kmc_tools has already written its output.
                f.flush()
                subprocess.call(cmd, shell=True, stderr=f, stdout=f)
        else:
            # No logfile requested, so the kmc_tools output is simply discarded.
            subprocess.call(cmd, shell=True, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
    finally:
        # Clean up the temporary databases even if one of the steps above failed.
        shutil.rmtree(tmpdir)