def test_invert_filter(self):
    '''Test that inverting filtering works'''
    infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
    outfile = 'tmp.ids_file_filter.fa'
    try:
        tasks.filter(infile, outfile, ids_file=infile + '.ids', invert=True)
        # shallow=False compares file contents rather than trusting matching
        # os.stat() signatures.
        self.assertTrue(filecmp.cmp(infile + '.filtered.invert', outfile, shallow=False))
    finally:
        # Remove the temporary output even when the filter or assertion fails;
        # the existence check avoids masking the original error.
        if os.path.exists(outfile):
            os.unlink(outfile)
def test_ids_from_file_filter(self):
    '''Test that can extract reads from a file of read names'''
    infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
    outfile = 'tmp.ids_file_filter.fa'
    try:
        tasks.filter(infile, outfile, ids_file=infile + '.ids')
        # shallow=False compares file contents rather than trusting matching
        # os.stat() signatures.
        self.assertTrue(filecmp.cmp(infile + '.filtered', outfile, shallow=False))
    finally:
        # Remove the temporary output even when the filter or assertion fails;
        # the existence check avoids masking the original error.
        if os.path.exists(outfile):
            os.unlink(outfile)
def test_ids_with_comments_from_file_filter(self):
    '''Test that can extract reads from a file of read names where the read names have extra data after space'''
    infile = os.path.join(data_dir, 'readnames_with_comments.fastq')
    outfile = 'tmp.ids_file_filter.fastq'
    try:
        tasks.filter(infile, outfile, ids_file=infile + '.ids')
        # shallow=False compares file contents rather than trusting matching
        # os.stat() signatures.
        self.assertTrue(filecmp.cmp(infile + '.filtered', outfile, shallow=False))
    finally:
        # Remove the temporary output even when the filter or assertion fails;
        # the existence check avoids masking the original error.
        if os.path.exists(outfile):
            os.unlink(outfile)
def run(description):
    '''Parse the command line of "fastaq filter" and run tasks.filter.

    Arguments are read from the process command line via argparse. The
    "description" parameter is accepted but not used by this function.
    '''
    cli = argparse.ArgumentParser(
        description='Filters a sequence file by sequence length and/or by name matching a regular expression',
        usage='fastaq filter [options] <infile> <outfile>',
    )

    # (flags, keyword settings) pairs, registered in this order so that the
    # generated help text lists the options in the same sequence.
    general_options = [
        (('--min_length',), dict(type=int, help='Minimum length of sequence to keep [%(default)s]', default=0, metavar='INT')),
        (('--max_length',), dict(type=float, help='Maximum length of sequence to keep [%(default)s]', default=float('inf'), metavar='INT')),
        (('--regex',), dict(help='If given, only reads with a name matching the regular expression will be kept')),
        (('--ids_file',), dict(help='If given, only reads whose ID is in th given file will be used. One ID per line of file.', metavar='FILENAME')),
        (('-v', '--invert'), dict(action='store_true', help='Only keep sequences that do not match the filters')),
    ]
    for flags, settings in general_options:
        cli.add_argument(*flags, **settings)

    mates = cli.add_argument_group('Mate file for read pairs options')
    mate_options = [
        ('--mate_in', dict(help='Name of mates input file. If used, must also provide --mate_out', metavar='FILENAME')),
        ('--mate_out', dict(help='Name of mates output file', metavar='FILENAME')),
        ('--both_mates_pass', dict(action='store_true', help='By default, if either mate passes filter, then both reads output. Use this flag to require that both reads of a pair pass the filter')),
    ]
    for flag, settings in mate_options:
        mates.add_argument(flag, **settings)

    positionals = (
        ('infile', 'Name of input file to be filtered'),
        ('outfile', 'Name of output file'),
    )
    for name, help_text in positionals:
        cli.add_argument(name, help=help_text)

    chosen = cli.parse_args()
    filter_settings = {
        'minlength': chosen.min_length,
        'maxlength': chosen.max_length,
        'regex': chosen.regex,
        'ids_file': chosen.ids_file,
        'invert': chosen.invert,
        'mate_in': chosen.mate_in,
        'mate_out': chosen.mate_out,
        'both_mates_pass': chosen.both_mates_pass,
    }
    tasks.filter(chosen.infile, chosen.outfile, **filter_settings)
def test_paired_one_pass(self):
    '''Test filter with paired file one pass'''
    infile1 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.in_1.fa')
    infile2 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.in_2.fa')
    outfile1 = 'tmp.filter_one_pass_1.fa'
    outfile2 = 'tmp.filter_one_pass_2.fa'
    expected1 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.out_1.fa')
    expected2 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.out_2.fa')
    try:
        tasks.filter(infile1, outfile1, mate_in=infile2, mate_out=outfile2, both_mates_pass=False, minlength=3)
        self.assertTrue(filecmp.cmp(outfile1, expected1, shallow=False))
        self.assertTrue(filecmp.cmp(outfile2, expected2, shallow=False))
    finally:
        # Remove whichever temporary outputs were created, even when the
        # filter or an assertion fails.
        for tmp_file in (outfile1, outfile2):
            if os.path.exists(tmp_file):
                os.unlink(tmp_file)
def test_regex_filter(self):
    '''Check that filtering by name regex works as expected'''
    infile = os.path.join(data_dir, 'sequences_test_filter_by_regex.fa')
    correct_files = [os.path.join(data_dir, 'sequences_test_filter_by_regex.numeric.fa'),
                     os.path.join(data_dir, 'sequences_test_filter_by_regex.first-of-pair.fa'),
                     os.path.join(data_dir, 'sequences_test_filter_by_regex.first-char-a.fa')]
    regexes = ['^[0-9]+$', '/1$', '^a']
    outfile = 'tmp.regex_filter.fa'

    # Each regex is paired with the expected output file at the same position.
    for regex, correct_file in zip(regexes, correct_files):
        try:
            tasks.filter(infile, outfile, regex=regex)
            # shallow=False compares file contents rather than trusting
            # matching os.stat() signatures.
            self.assertTrue(filecmp.cmp(correct_file, outfile, shallow=False))
        finally:
            # Remove the temporary output even when this case fails.
            if os.path.exists(outfile):
                os.unlink(outfile)
def test_length_filter(self):
    '''Check that filtering by length works as expected'''
    infile = os.path.join(data_dir, 'sequences_test_length_filter.fa')
    correct_files = [os.path.join(data_dir, 'sequences_test_length_filter.min-0.max-1.fa'),
                     os.path.join(data_dir, 'sequences_test_length_filter.min-0.max-inf.fa'),
                     os.path.join(data_dir, 'sequences_test_length_filter.min-4.max-4.fa')]
    cutoffs = [(0, 1), (0, float('inf')), (4, 4)]
    outfile = 'tmp.length_filter.fa'

    # Each (min, max) cutoff pair is matched with the expected output file at
    # the same position.
    for (min_len, max_len), correct_file in zip(cutoffs, correct_files):
        try:
            tasks.filter(infile, outfile, minlength=min_len, maxlength=max_len)
            # shallow=False compares file contents rather than trusting
            # matching os.stat() signatures.
            self.assertTrue(filecmp.cmp(correct_file, outfile, shallow=False))
        finally:
            # Remove the temporary output even when this case fails.
            if os.path.exists(outfile):
                os.unlink(outfile)
def test_regex_check_comments_filter(self):
    '''When check_comments is true, and the regex is in the comment'''
    # Raw string: "\s" is not a valid escape sequence in an ordinary string
    # literal. The pattern text handed to the filter is unchanged.
    regex = r'\sfoo=bar'
    expected = ">read1 foo=bar\nAGCT\n"

    # Context managers make sure both temporary files are closed (and so
    # removed) even if the filter call or the read raises.
    with tempfile.NamedTemporaryFile(suffix=".fa", mode="w+") as infile, \
            tempfile.NamedTemporaryFile(suffix=".fa", mode="w+") as outfile:
        infile.write(
            ">read1 foo=bar\nAGCT\n>read2 bar=foo\nGGG\n>read3\nGGGG\n>read4 foo=ba\n"
            "GCA\n>read5foo=bar\nGCAT"
        )
        # Push buffered text to disk, since the filter is given the file name
        # rather than this handle.
        infile.flush()

        tasks.filter(infile.name, outfile.name, regex=regex, check_comments=True)
        with open(outfile.name) as handle:
            actual = handle.read()

    self.assertEqual(actual, expected)
def run(self):
    '''Produce a filtered fasta file.

    Work is done inside self.working_directory; the previous working
    directory is restored on exit, including when an error is raised.

    When there are more contigs than IDs to skip, the fasta file is aligned
    against itself with nucmer and two sets of contig IDs are collected
    (IDs listed in self.ids_to_skip are never added to either set):

    * small contigs: length below self.cutoff_contig_length;
    * contained contigs: not small, and the query of a non-self alignment
      to a different contig that is not already marked as contained, where
      hit_length_qry / qry_length * 100 >= self.percent_match.

    The union of both sets is written to an IDs file and removed from the
    output with tasks.filter(..., invert=True). Unless self.debug is set,
    the IDs file and the nucmer output file are then deleted.

    Otherwise every contig is written to self.output_file unchanged.

    In both cases self._write_summary is called with the two sets.
    '''
    original_dir = os.getcwd()
    os.chdir(self.working_directory)
    try:
        small_contigs = set()
        contained_contigs = set()

        if len(self.contigs) > len(self.ids_to_skip):
            alignments = utils.run_nucmer(self.fasta_file, self.fasta_file, self._build_nucmer_filename(), min_percent_id=self.percent_match, run_promer=False)

            # Iteration order matters: a contig is only "contained" in a
            # reference that has not itself been marked as contained earlier.
            for contig_id in self.contigs:
                if contig_id in self.ids_to_skip:
                    continue

                if len(self.contigs[contig_id]) < self.cutoff_contig_length:
                    small_contigs.add(contig_id)
                    continue

                # NOTE(review): the whole alignment collection is scanned for
                # every contig, with no early exit, exactly as before; confirm
                # run_nucmer returns a re-iterable sequence before optimising.
                for algn in alignments:
                    if (not algn.is_self_hit()
                            and algn.qry_name == contig_id
                            and algn.ref_name != algn.qry_name
                            and algn.ref_name not in contained_contigs
                            and (algn.hit_length_qry / algn.qry_length) * 100 >= self.percent_match):
                        contained_contigs.add(contig_id)

            discard = small_contigs.union(contained_contigs)
            ids_file = utils.write_ids_to_file(discard, "contig.ids.discard")
            tasks.filter(self.fasta_file, self.output_file, ids_file=ids_file, invert=True)
            if not self.debug:
                utils.delete(ids_file)
                utils.delete(self._build_nucmer_filename())
        else:
            output_fw = fastaqutils.open_file_write(self.output_file)
            try:
                for contig_id in self.contigs:
                    print(sequences.Fasta(contig_id, self.contigs[contig_id]), file=output_fw)
            finally:
                # Close the output handle even if writing a record fails.
                fastaqutils.close(output_fw)

        self._write_summary(small_contigs, contained_contigs)
    finally:
        # Always return to the caller's working directory.
        os.chdir(original_dir)
def run(description):
    '''Parse the command line of "fastaq filter" and run tasks.filter.

    Arguments are read from the process command line via argparse. The
    "description" parameter is accepted but not used by this function.
    '''
    parser = argparse.ArgumentParser(
        description='Filters a sequence file by sequence length and/or by name matching a regular expression',
        usage='fastaq filter [options] <infile> <outfile>')
    parser.add_argument(
        '--min_length',
        type=int,
        help='Minimum length of sequence to keep [%(default)s]',
        default=0,
        metavar='INT')
    # type=float so that the default of infinity can be represented.
    parser.add_argument(
        '--max_length',
        type=float,
        help='Maximum length of sequence to keep [%(default)s]',
        default=float('inf'),
        metavar='INT')
    parser.add_argument(
        '--regex',
        help='If given, only reads with a name matching the regular expression will be kept')
    parser.add_argument(
        '--ids_file',
        help='If given, only reads whose ID is in th given file will be used. One ID per line of file.',
        metavar='FILENAME')
    parser.add_argument(
        '-v',
        '--invert',
        action='store_true',
        help='Only keep sequences that do not match the filters')
    parser.add_argument(
        '--check_comments',
        action='store_true',
        help='Search the header comments also for the given regex. Can only be specified with --regex')

    mate_group = parser.add_argument_group('Mate file for read pairs options')
    mate_group.add_argument(
        '--mate_in',
        help='Name of mates input file. If used, must also provide --mate_out',
        metavar='FILENAME')
    mate_group.add_argument(
        '--mate_out',
        help='Name of mates output file',
        metavar='FILENAME')
    # This help text must stay on a single physical line: a single-quoted
    # literal cannot contain a raw line break.
    mate_group.add_argument(
        '--both_mates_pass',
        action='store_true',
        help='By default, if either mate passes filter, then both reads output. Use this flag to require that both reads of a pair pass the filter')

    parser.add_argument('infile', help='Name of input file to be filtered')
    parser.add_argument('outfile', help='Name of output file')
    options = parser.parse_args()

    tasks.filter(
        options.infile,
        options.outfile,
        minlength=options.min_length,
        maxlength=options.max_length,
        regex=options.regex,
        ids_file=options.ids_file,
        invert=options.invert,
        mate_in=options.mate_in,
        mate_out=options.mate_out,
        both_mates_pass=options.both_mates_pass,
        check_comments=options.check_comments,
    )