Beispiel #1
0
 def test_invert_filter(self):
     '''Test that inverting filtering works.

     Runs tasks.filter with an IDs file and invert=True, then compares the
     result against the stored '.filtered.invert' file.
     '''
     infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
     expected = infile + '.filtered.invert'
     outfile = 'tmp.ids_file_filter.fa'
     try:
         tasks.filter(infile, outfile, ids_file=infile + '.ids', invert=True)
         # shallow=False compares file contents rather than accepting
         # matching os.stat() signatures as proof of equality.
         self.assertTrue(filecmp.cmp(expected, outfile, shallow=False))
     finally:
         # Clean up even when the filter call or the assertion fails.
         if os.path.exists(outfile):
             os.unlink(outfile)
Beispiel #2
0
 def test_ids_from_file_filter(self):
     '''Test that can extract reads from a file of read names.

     Runs tasks.filter with an IDs file and compares the result against
     the stored '.filtered' file.
     '''
     infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
     expected = infile + '.filtered'
     outfile = 'tmp.ids_file_filter.fa'
     try:
         tasks.filter(infile, outfile, ids_file=infile + '.ids')
         # shallow=False compares file contents rather than accepting
         # matching os.stat() signatures as proof of equality.
         self.assertTrue(filecmp.cmp(expected, outfile, shallow=False))
     finally:
         # Clean up even when the filter call or the assertion fails.
         if os.path.exists(outfile):
             os.unlink(outfile)
Beispiel #3
0
 def test_ids_with_comments_from_file_filter(self):
     '''Test that can extract reads from a file of read names where the read names have extra data after space.

     Runs tasks.filter on a FASTQ input with an IDs file and compares the
     result against the stored '.filtered' file.
     '''
     infile = os.path.join(data_dir, 'readnames_with_comments.fastq')
     expected = infile + '.filtered'
     outfile = 'tmp.ids_file_filter.fastq'
     try:
         tasks.filter(infile, outfile, ids_file=infile + '.ids')
         # shallow=False compares file contents rather than accepting
         # matching os.stat() signatures as proof of equality.
         self.assertTrue(filecmp.cmp(expected, outfile, shallow=False))
     finally:
         # Clean up even when the filter call or the assertion fails.
         if os.path.exists(outfile):
             os.unlink(outfile)
Beispiel #4
0
 def test_ids_with_comments_from_file_filter(self):
     '''Test that can extract reads from a file of read names where the read names have extra data after space.

     Runs tasks.filter on a FASTQ input with an IDs file and compares the
     result against the stored '.filtered' file.
     '''
     infile = os.path.join(data_dir, 'readnames_with_comments.fastq')
     expected = infile + '.filtered'
     outfile = 'tmp.ids_file_filter.fastq'
     try:
         tasks.filter(infile, outfile, ids_file=infile + '.ids')
         # shallow=False compares file contents rather than accepting
         # matching os.stat() signatures as proof of equality.
         self.assertTrue(filecmp.cmp(expected, outfile, shallow=False))
     finally:
         # Clean up even when the filter call or the assertion fails.
         if os.path.exists(outfile):
             os.unlink(outfile)
Beispiel #5
0
 def test_invert_filter(self):
     '''Test that inverting filtering works.

     Runs tasks.filter with an IDs file and invert=True, then compares the
     result against the stored '.filtered.invert' file.
     '''
     infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
     expected = infile + '.filtered.invert'
     outfile = 'tmp.ids_file_filter.fa'
     try:
         tasks.filter(infile, outfile, ids_file=infile + '.ids', invert=True)
         # shallow=False compares file contents rather than accepting
         # matching os.stat() signatures as proof of equality.
         self.assertTrue(filecmp.cmp(expected, outfile, shallow=False))
     finally:
         # Clean up even when the filter call or the assertion fails.
         if os.path.exists(outfile):
             os.unlink(outfile)
Beispiel #6
0
 def test_ids_from_file_filter(self):
     '''Test that can extract reads from a file of read names.

     Runs tasks.filter with an IDs file and compares the result against
     the stored '.filtered' file.
     '''
     infile = os.path.join(data_dir, 'sequences_test_filter_by_ids_file.fa')
     expected = infile + '.filtered'
     outfile = 'tmp.ids_file_filter.fa'
     try:
         tasks.filter(infile, outfile, ids_file=infile + '.ids')
         # shallow=False compares file contents rather than accepting
         # matching os.stat() signatures as proof of equality.
         self.assertTrue(filecmp.cmp(expected, outfile, shallow=False))
     finally:
         # Clean up even when the filter call or the assertion fails.
         if os.path.exists(outfile):
             os.unlink(outfile)
Beispiel #7
0
def run(description):
    '''Parse the 'fastaq filter' command line and call tasks.filter.

    Options are read from sys.argv by argparse and forwarded to
    tasks.filter as keyword arguments.

    description -- accepted but not used by this function; the parser's
                   description text is fixed below.
    '''
    parser = argparse.ArgumentParser(
        description = 'Filters a sequence file by sequence length and/or by name matching a regular expression',
        usage = 'fastaq filter [options] <infile> <outfile>')
    parser.add_argument('--min_length', type=int, help='Minimum length of sequence to keep [%(default)s]', default=0, metavar='INT')
    # type=float (not int) so that the infinite default is representable.
    parser.add_argument('--max_length', type=float, help='Maximum length of sequence to keep [%(default)s]', default=float('inf'), metavar='INT')
    parser.add_argument('--regex', help='If given, only reads with a name matching the regular expression will be kept')
    parser.add_argument('--ids_file', help='If given, only reads whose ID is in the given file will be used. One ID per line of file.', metavar='FILENAME')
    parser.add_argument('-v', '--invert', action='store_true', help='Only keep sequences that do not match the filters')

    mate_group = parser.add_argument_group('Mate file for read pairs options')
    mate_group.add_argument('--mate_in', help='Name of mates input file. If used, must also provide --mate_out', metavar='FILENAME')
    mate_group.add_argument('--mate_out', help='Name of mates output file', metavar='FILENAME')
    mate_group.add_argument('--both_mates_pass', action='store_true', help='By default, if either mate passes filter, then both reads output. Use this flag to require that both reads of a pair pass the filter')

    parser.add_argument('infile', help='Name of input file to be filtered')
    parser.add_argument('outfile', help='Name of output file')
    options = parser.parse_args()
    tasks.filter(options.infile,
                 options.outfile,
                 minlength=options.min_length,
                 maxlength=options.max_length,
                 regex=options.regex,
                 ids_file=options.ids_file,
                 invert=options.invert,
                 mate_in=options.mate_in,
                 mate_out=options.mate_out,
                 both_mates_pass=options.both_mates_pass,
    )
Beispiel #8
0
 def test_paired_one_pass(self):
     '''Test filter with paired file one pass.

     Filters a pair of mate files with both_mates_pass=False and
     minlength=3, then compares both outputs with the expected files.
     '''
     infile1 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.in_1.fa')
     infile2 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.in_2.fa')
     outfile1 = 'tmp.filter_one_pass_1.fa'
     outfile2 = 'tmp.filter_one_pass_2.fa'
     expected1 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.out_1.fa')
     expected2 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.out_2.fa')
     try:
         tasks.filter(infile1, outfile1, mate_in=infile2, mate_out=outfile2, both_mates_pass=False, minlength=3)
         self.assertTrue(filecmp.cmp(outfile1, expected1, shallow=False))
         self.assertTrue(filecmp.cmp(outfile2, expected2, shallow=False))
     finally:
         # Remove whichever outputs were created, even when the filter
         # call or one of the assertions fails.
         for tmp_file in (outfile1, outfile2):
             if os.path.exists(tmp_file):
                 os.unlink(tmp_file)
Beispiel #9
0
    def test_regex_filter(self):
        '''Check that filtering by name regex works as expected.

        Each regex is run through tasks.filter on the same input and the
        output is compared with the expected file for that regex.
        '''
        infile = os.path.join(data_dir, 'sequences_test_filter_by_regex.fa')
        outfile = 'tmp.regex_filter.fa'
        # (regex, file holding the expected output for that regex)
        cases = [
            ('^[0-9]+$', os.path.join(data_dir, 'sequences_test_filter_by_regex.numeric.fa')),
            ('/1$', os.path.join(data_dir, 'sequences_test_filter_by_regex.first-of-pair.fa')),
            ('^a', os.path.join(data_dir, 'sequences_test_filter_by_regex.first-char-a.fa')),
        ]

        for regex, correct_file in cases:
            try:
                tasks.filter(infile, outfile, regex=regex)
                # shallow=False compares file contents rather than
                # accepting matching os.stat() signatures.
                self.assertTrue(filecmp.cmp(correct_file, outfile, shallow=False))
            finally:
                # Clean up even when the filter call or assertion fails.
                if os.path.exists(outfile):
                    os.unlink(outfile)
Beispiel #10
0
    def test_length_filter(self):
        '''Check that filtering by length works as expected.

        Each (minlength, maxlength) pair is run through tasks.filter on
        the same input and the output is compared with the expected file
        for that pair.
        '''
        infile = os.path.join(data_dir, 'sequences_test_length_filter.fa')
        outfile = 'tmp.length_filter.fa'
        # (minlength, maxlength, file holding the expected output)
        cases = [
            (0, 1, os.path.join(data_dir, 'sequences_test_length_filter.min-0.max-1.fa')),
            (0, float('inf'), os.path.join(data_dir, 'sequences_test_length_filter.min-0.max-inf.fa')),
            (4, 4, os.path.join(data_dir, 'sequences_test_length_filter.min-4.max-4.fa')),
        ]

        for min_len, max_len, correct_file in cases:
            try:
                tasks.filter(infile, outfile, minlength=min_len, maxlength=max_len)
                # shallow=False compares file contents rather than
                # accepting matching os.stat() signatures.
                self.assertTrue(filecmp.cmp(correct_file, outfile, shallow=False))
            finally:
                # Clean up even when the filter call or assertion fails.
                if os.path.exists(outfile):
                    os.unlink(outfile)
Beispiel #11
0
 def test_paired_one_pass(self):
     '''Test filter with paired file one pass.

     Filters a pair of mate files with both_mates_pass=False and
     minlength=3, then compares both outputs with the expected files.
     '''
     infile1 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.in_1.fa')
     infile2 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.in_2.fa')
     outfile1 = 'tmp.filter_one_pass_1.fa'
     outfile2 = 'tmp.filter_one_pass_2.fa'
     expected1 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.out_1.fa')
     expected2 = os.path.join(data_dir, 'tasks_test_filter_paired_one_pass.out_2.fa')
     try:
         tasks.filter(infile1, outfile1, mate_in=infile2, mate_out=outfile2, both_mates_pass=False, minlength=3)
         self.assertTrue(filecmp.cmp(outfile1, expected1, shallow=False))
         self.assertTrue(filecmp.cmp(outfile2, expected2, shallow=False))
     finally:
         # Remove whichever outputs were created, even when the filter
         # call or one of the assertions fails.
         for tmp_file in (outfile1, outfile2):
             if os.path.exists(tmp_file):
                 os.unlink(tmp_file)
Beispiel #12
0
    def test_regex_filter(self):
        '''Check that filtering by name regex works as expected.

        Each regex is run through tasks.filter on the same input and the
        output is compared with the expected file for that regex.
        '''
        infile = os.path.join(data_dir, 'sequences_test_filter_by_regex.fa')
        outfile = 'tmp.regex_filter.fa'
        # (regex, file holding the expected output for that regex)
        cases = [
            ('^[0-9]+$', os.path.join(data_dir, 'sequences_test_filter_by_regex.numeric.fa')),
            ('/1$', os.path.join(data_dir, 'sequences_test_filter_by_regex.first-of-pair.fa')),
            ('^a', os.path.join(data_dir, 'sequences_test_filter_by_regex.first-char-a.fa')),
        ]

        for regex, correct_file in cases:
            try:
                tasks.filter(infile, outfile, regex=regex)
                # shallow=False compares file contents rather than
                # accepting matching os.stat() signatures.
                self.assertTrue(filecmp.cmp(correct_file, outfile, shallow=False))
            finally:
                # Clean up even when the filter call or assertion fails.
                if os.path.exists(outfile):
                    os.unlink(outfile)
Beispiel #13
0
    def test_length_filter(self):
        '''Check that filtering by length works as expected.

        Each (minlength, maxlength) pair is run through tasks.filter on
        the same input and the output is compared with the expected file
        for that pair.
        '''
        infile = os.path.join(data_dir, 'sequences_test_length_filter.fa')
        outfile = 'tmp.length_filter.fa'
        # (minlength, maxlength, file holding the expected output)
        cases = [
            (0, 1, os.path.join(data_dir, 'sequences_test_length_filter.min-0.max-1.fa')),
            (0, float('inf'), os.path.join(data_dir, 'sequences_test_length_filter.min-0.max-inf.fa')),
            (4, 4, os.path.join(data_dir, 'sequences_test_length_filter.min-4.max-4.fa')),
        ]

        for min_len, max_len, correct_file in cases:
            try:
                tasks.filter(infile, outfile, minlength=min_len, maxlength=max_len)
                # shallow=False compares file contents rather than
                # accepting matching os.stat() signatures.
                self.assertTrue(filecmp.cmp(correct_file, outfile, shallow=False))
            finally:
                # Clean up even when the filter call or assertion fails.
                if os.path.exists(outfile):
                    os.unlink(outfile)
Beispiel #14
0
    def test_regex_check_comments_filter(self):
        '''When check_comments is true, and the regex is in the comment.

        Writes a small FASTA file, filters it with a regex that requires
        whitespace before 'foo=bar', and expects only read1 in the output.
        '''
        # Raw string: '\s' in a normal string literal is an invalid escape.
        regex = r'\sfoo=bar'
        expected = ">read1 foo=bar\nAGCT\n"

        # The context managers close (and thereby remove) both temporary
        # files even when the filter call raises.
        with tempfile.NamedTemporaryFile(suffix=".fa", mode="w+") as infile, \
                tempfile.NamedTemporaryFile(suffix=".fa", mode="w+") as outfile:
            infile.write(
                ">read1 foo=bar\nAGCT\n>read2 bar=foo\nGGG\n>read3\nGGGG\n>read4 foo=ba\n"
                "GCA\n>read5foo=bar\nGCAT"
            )
            # tasks.filter is handed the path, not this handle, so the
            # buffered text must be on disk before it runs.
            infile.flush()

            tasks.filter(infile.name, outfile.name, regex=regex, check_comments=True)
            with open(outfile.name) as handle:
                actual = handle.read()

        self.assertEqual(actual, expected)
	def run(self):
		'''Produce a filtered fasta file.

		Works inside self.working_directory and always returns to the
		original directory afterwards, including when a step raises.

		If there are more contigs than ids to skip, self.fasta_file is
		aligned against itself with utils.run_nucmer. Contigs not in
		self.ids_to_skip are then discarded when they are either
		  * shorter than self.cutoff_contig_length ("small"), or
		  * the query of a hit to a different contig that is not itself
		    already marked as contained, where the hit covers at least
		    self.percent_match percent of the query length ("contained").
		The discarded ids are written to a file and passed to tasks.filter
		with invert=True to write self.output_file.

		Otherwise every contig is written to self.output_file unchanged.

		In both cases self._write_summary is called with the two id sets.
		'''
		original_dir = os.getcwd()
		os.chdir(self.working_directory)
		try:
			small_contigs = set()
			contained_contigs = set()
			if len(self.contigs) > len(self.ids_to_skip):
				alignments = utils.run_nucmer(self.fasta_file, self.fasta_file, self._build_nucmer_filename(), min_percent_id=self.percent_match, run_promer=False)
				for contig_id in self.contigs:
					if contig_id in self.ids_to_skip:
						continue
					if len(self.contigs[contig_id]) < self.cutoff_contig_length:
						small_contigs.add(contig_id)
						continue
					for algn in alignments:
						# The ref must not already be marked as contained; this
						# check depends on the order in which contigs are visited.
						if (not algn.is_self_hit()
								and algn.qry_name == contig_id
								and algn.ref_name != algn.qry_name
								and algn.ref_name not in contained_contigs
								and (algn.hit_length_qry/algn.qry_length) * 100 >= self.percent_match):
							contained_contigs.add(contig_id)

				discard = small_contigs.union(contained_contigs)
				ids_file = utils.write_ids_to_file(discard, "contig.ids.discard")
				tasks.filter(self.fasta_file, self.output_file, ids_file=ids_file, invert=True)

				# Intermediate files are kept only in debug mode.
				if not self.debug:
					utils.delete(ids_file)
					utils.delete(self._build_nucmer_filename())
			else:
				output_fw = fastaqutils.open_file_write(self.output_file)
				try:
					for contig_id in self.contigs:
						print(sequences.Fasta(contig_id, self.contigs[contig_id]), file=output_fw)
				finally:
					# Close the handle even if writing a record fails.
					fastaqutils.close(output_fw)

			self._write_summary(small_contigs, contained_contigs)
		finally:
			# Restore the caller's working directory no matter what happened.
			os.chdir(original_dir)
Beispiel #16
0
def run(description):
    '''Parse the 'fastaq filter' command line and call tasks.filter.

    Options are read from sys.argv by argparse and forwarded to
    tasks.filter as keyword arguments.

    description -- accepted but not used by this function; the parser's
                   description text is fixed below.
    '''
    parser = argparse.ArgumentParser(
        description=
        'Filters a sequence file by sequence length and/or by name matching a regular expression',
        usage='fastaq filter [options] <infile> <outfile>')
    parser.add_argument(
        '--min_length',
        type=int,
        help='Minimum length of sequence to keep [%(default)s]',
        default=0,
        metavar='INT')
    # type=float (not int) so that the infinite default is representable.
    parser.add_argument(
        '--max_length',
        type=float,
        help='Maximum length of sequence to keep [%(default)s]',
        default=float('inf'),
        metavar='INT')
    parser.add_argument(
        '--regex',
        help=
        'If given, only reads with a name matching the regular expression will be kept'
    )
    parser.add_argument(
        '--ids_file',
        help=
        'If given, only reads whose ID is in the given file will be used. One ID per line of file.',
        metavar='FILENAME')
    parser.add_argument(
        '-v',
        '--invert',
        action='store_true',
        help='Only keep sequences that do not match the filters')
    parser.add_argument(
        '--check_comments',
        action='store_true',
        help=
        'Search the header comments also for the given regex. Can only be specified with --regex'
    )

    mate_group = parser.add_argument_group('Mate file for read pairs options')
    mate_group.add_argument(
        '--mate_in',
        help='Name of mates input file. If used, must also provide --mate_out',
        metavar='FILENAME')
    mate_group.add_argument('--mate_out',
                            help='Name of mates output file',
                            metavar='FILENAME')
    mate_group.add_argument(
        '--both_mates_pass',
        action='store_true',
        help=
        'By default, if either mate passes filter, then both reads output. Use this flag to require that both reads of a pair pass the filter'
    )

    parser.add_argument('infile', help='Name of input file to be filtered')
    parser.add_argument('outfile', help='Name of output file')
    options = parser.parse_args()
    tasks.filter(
        options.infile,
        options.outfile,
        minlength=options.min_length,
        maxlength=options.max_length,
        regex=options.regex,
        ids_file=options.ids_file,
        invert=options.invert,
        mate_in=options.mate_in,
        mate_out=options.mate_out,
        both_mates_pass=options.both_mates_pass,
        check_comments=options.check_comments,
    )