def test_remove_outliers(self):
        """Check which sequences remove_outliers keeps at two thresholds.

        The fixture is a flat list of alternating FASTA header and sequence
        lines for eight aligned sequences. With threshold 2 only 'ACT011'
        is expected to be dropped; with threshold 0 'ACT009' is expected
        to be dropped as well.
        """
        aln = [
            '>ACT009', 'ACAT-',
            '>ACT019', 'GACT-',
            '>ACT_02', 'GACT-',
            '>ACT_03', 'AACT-',
            '>ACT_04', 'AACT-',
            '>ACT_05', 'AACT-',
            '>ACT011', 'CTGGC',
            '>hello', 'AACTG',
        ]
        # mean errors is 10/9 (original author's note).
        # Header lines carry the ids; strip the leading '>' marker.
        seqnames = [elem[1:] for elem in aln if elem.startswith('>')]

        # threshold 2: only ACT011 should be removed
        res = remove_outliers(aln, 2)
        self.assertEqual(res.sequence_count(), 7)
        for seqname_left in res.ids():
            self.assertIn(seqname_left, seqnames)
        self.assertNotIn('ACT011', res.ids())

        # threshold 0: remove everything with more than the mean number of
        # substitutions (i.e. 2 or more), so ACT009 goes as well
        res = remove_outliers(aln, 0)
        self.assertEqual(res.sequence_count(), 6)
        for seqname_left in res.ids():
            self.assertIn(seqname_left, seqnames)
        self.assertNotIn('ACT011', res.ids())
        self.assertNotIn('ACT009', res.ids())
Example #2
0
    def test_remove_outliers(self):
        """Check which sequences remove_outliers keeps at two thresholds.

        The fixture is a flat list of alternating FASTA header and sequence
        lines for eight aligned sequences. With threshold 2 only 'ACT011'
        is expected to be dropped; with threshold 0 'ACT009' is expected
        to be dropped as well.
        """
        aln = [
            '>ACT009', 'ACAT-',
            '>ACT019', 'GACT-',
            '>ACT_02', 'GACT-',
            '>ACT_03', 'AACT-',
            '>ACT_04', 'AACT-',
            '>ACT_05', 'AACT-',
            '>ACT011', 'CTGGC',
            '>hello', 'AACTG',
        ]
        # mean errors is 10/9 (original author's note).
        # Header lines carry the ids; strip the leading '>' marker.
        seqnames = [elem[1:] for elem in aln if elem.startswith('>')]

        # threshold 2: only ACT011 should be removed
        res = remove_outliers(aln, 2)
        self.assertEqual(res.sequence_count(), 7)
        for seqname_left in res.ids():
            self.assertIn(seqname_left, seqnames)
        self.assertNotIn('ACT011', res.ids())

        # threshold 0: remove everything with more than the mean number of
        # substitutions (i.e. 2 or more), so ACT009 goes as well
        res = remove_outliers(aln, 0)
        self.assertEqual(res.sequence_count(), 6)
        for seqname_left in res.ids():
            self.assertIn(seqname_left, seqnames)
        self.assertNotIn('ACT011', res.ids())
        self.assertNotIn('ACT009', res.ids())
Example #3
0
def main():
    """Filter an aligned fasta file and write '<basename>_pfiltered.fasta'.

    Command-line options are parsed from ``script_info``. The input
    alignment is passed through ``apply_lane_mask_and_gap_filter`` and,
    when ``opts.remove_outliers`` is set, additionally through
    ``remove_outliers`` before being written to ``opts.output_dir``.

    Raises:
        ValueError: if the input fasta file is empty.
        IOError: if the output file cannot be opened for writing.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # build the output filepath and open it so any problems can be caught
    # before starting the work
    try:
        mkdir(opts.output_dir)
    except OSError:
        # Deliberate best effort (e.g. the directory may already exist); an
        # unusable directory is reported when the output file is opened.
        pass
    input_dir, input_filename = split(opts.input_fasta_file)
    input_basename, ext = splitext(input_filename)

    if getsize(opts.input_fasta_file) == 0:
        raise ValueError("An empty fasta file was provided. "
                         "Did the alignment complete sucessfully? "
                         "Did PyNAST discard all sequences due to too-stringent minimum length "
                         "or minimum percent ID settings?")

    output_fp = '%s/%s_pfiltered.fasta' % (opts.output_dir, input_basename)

    try:
        outfile = open(output_fp, 'w')
    except IOError:
        # The message must use output_fp: the previously referenced name
        # 'output_filepath' does not exist and raised NameError instead.
        raise IOError("Can't open output_filepath for writing: %s"
                      % output_fp)

    # From here on the output file is open; make sure it is closed even if
    # reading the mask or filtering fails.
    try:
        if not opts.suppress_lane_mask_filter and not opts.entropy_threshold:
            if opts.lane_mask_fp is not None:
                with open(opts.lane_mask_fp, 'U') as lane_mask_file:
                    lane_mask = lane_mask_file.read().strip()
            else:
                lane_mask = get_template_alignment_column_mask()
        else:
            lane_mask = None

        with open(opts.input_fasta_file, 'U') as infile:
            # the lanemask/gap removal is applied in both modes
            filtered_seqs = apply_lane_mask_and_gap_filter(
                infile, lane_mask, opts.allowed_gap_frac,
                entropy_threshold=opts.entropy_threshold)

            if opts.remove_outliers:
                # additionally remove outliers from the filtered alignment
                filtered_aln = remove_outliers(filtered_seqs, opts.threshold)
                for seq in filtered_aln:
                    outfile.write(seq.to_fasta())
                    outfile.write('\n')
            else:
                for result in filtered_seqs:
                    outfile.write(result)
    finally:
        outfile.close()
Example #4
0
def main():
    """Filter an aligned fasta file and write '<basename>_pfiltered.fasta'.

    Command-line options are parsed from ``script_info``. The input
    alignment is passed through ``apply_lane_mask_and_gap_filter`` and,
    when ``opts.remove_outliers`` is set, additionally through
    ``remove_outliers`` before being written to ``opts.output_dir``.

    Raises:
        ValueError: if the input fasta file is empty.
        IOError: if the output file cannot be opened for writing.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # build the output filepath and open it so any problems can be caught
    # before starting the work
    try:
        mkdir(opts.output_dir)
    except OSError:
        # Deliberate best effort (e.g. the directory may already exist); an
        # unusable directory is reported when the output file is opened.
        pass
    input_dir, input_filename = split(opts.input_fasta_file)
    input_basename, ext = splitext(input_filename)

    if getsize(opts.input_fasta_file) == 0:
        raise ValueError("An empty fasta file was provided. "
                         "Did the alignment complete sucessfully? "
                         "Did PyNAST discard all sequences due to too-stringent minimum length "
                         "or minimum percent ID settings?")

    output_fp = '%s/%s_pfiltered.fasta' % (opts.output_dir, input_basename)

    try:
        outfile = open(output_fp, 'w')
    except IOError:
        # The message must use output_fp: the previously referenced name
        # 'output_filepath' does not exist and raised NameError instead.
        raise IOError("Can't open output_filepath for writing: %s"
                      % output_fp)

    # From here on the output file is open; make sure it is closed even if
    # reading the mask or filtering fails.
    try:
        if opts.lane_mask_fp and not opts.suppress_lane_mask_filter and not\
                opts.entropy_threshold:
            # read the lane_mask, if one was provided
            if opts.verbose:
                # single-argument parenthesised form prints the same text
                # whether print is a statement or a function
                print("Reading lane mask...")
            with open(opts.lane_mask_fp) as lane_mask_file:
                lane_mask = lane_mask_file.read().strip()
        else:
            lane_mask = None

        with open(opts.input_fasta_file, 'U') as infile:
            # the lanemask/gap removal is applied in both modes
            filtered_seqs = apply_lane_mask_and_gap_filter(
                infile, lane_mask, opts.allowed_gap_frac,
                verbose=opts.verbose,
                entropy_threshold=opts.entropy_threshold)

            if opts.remove_outliers:
                # additionally remove outliers from the filtered alignment
                filtered_aln = remove_outliers(filtered_seqs, opts.threshold)
                for seq in filtered_aln.Seqs:
                    outfile.write(seq.toFasta())
                    outfile.write('\n')
            else:
                for result in filtered_seqs:
                    outfile.write(result)
    finally:
        outfile.close()
Example #5
0
    def test_remove_outliers(self):
        """remove outliers returns only seqs similar to consensus

        The fixture is a flat list of alternating FASTA header and sequence
        lines for eight aligned sequences. With threshold 2 only 'ACT011'
        is expected to be dropped; with threshold 0 'ACT009' is expected
        to be dropped as well.
        """
        aln = [
            '>ACT009', 'ACAT-',
            '>ACT019', 'GACT-',
            '>ACT_02', 'GACT-',
            '>ACT_03', 'AACT-',
            '>ACT_04', 'AACT-',
            '>ACT_05', 'AACT-',
            '>ACT011', 'CTGGC',
            '>hello', 'AACTG',
        ]
        # mean errors is 10/9 (original author's note).
        # Header lines carry the ids; strip the leading '>' marker.
        seqnames = [elem[1:] for elem in aln if elem.startswith('>')]

        # threshold 2: only ACT011 should be removed
        res = remove_outliers(aln, 2)
        self.assertEqual(len(res.getSeqNames()), 7)
        for seqname_left in res.getSeqNames():
            self.assertIn(seqname_left, seqnames)
        self.assertNotIn('ACT011', res.getSeqNames())

        # threshold 0: remove everything with more than the mean number of
        # substitutions (i.e. 2 or more), so ACT009 goes as well
        res = remove_outliers(aln, 0)
        self.assertEqual(len(res.getSeqNames()), 6)
        for seqname_left in res.getSeqNames():
            self.assertIn(seqname_left, seqnames)
        self.assertNotIn('ACT011', res.getSeqNames())
        self.assertNotIn('ACT009', res.getSeqNames())
Example #6
0
    def test_remove_outliers(self):
        """remove outliers returns only seqs similar to consensus

        The fixture is a flat list of alternating FASTA header and sequence
        lines for eight aligned sequences. With threshold 2 only 'ACT011'
        is expected to be dropped; with threshold 0 'ACT009' is expected
        to be dropped as well.
        """
        aln = [
            '>ACT009', 'ACAT-',
            '>ACT019', 'GACT-',
            '>ACT_02', 'GACT-',
            '>ACT_03', 'AACT-',
            '>ACT_04', 'AACT-',
            '>ACT_05', 'AACT-',
            '>ACT011', 'CTGGC',
            '>hello', 'AACTG',
        ]
        # mean errors is 10/9 (original author's note).
        # Header lines carry the ids; strip the leading '>' marker.
        seqnames = [elem[1:] for elem in aln if elem.startswith('>')]

        # threshold 2: only ACT011 should be removed
        res = remove_outliers(aln, 2)
        self.assertEqual(len(res.getSeqNames()), 7)
        for seqname_left in res.getSeqNames():
            self.assertContains(seqnames, seqname_left)
        self.assertNotContains(res.getSeqNames(), 'ACT011')

        # threshold 0: remove everything with more than the mean number of
        # substitutions (i.e. 2 or more), so ACT009 goes as well
        res = remove_outliers(aln, 0)
        self.assertEqual(len(res.getSeqNames()), 6)
        for seqname_left in res.getSeqNames():
            self.assertContains(seqnames, seqname_left)
        self.assertNotContains(res.getSeqNames(), 'ACT011')
        self.assertNotContains(res.getSeqNames(), 'ACT009')