def test_apply_lane_mask_only(self):
        """Lane mask applied with allowed_gap_frac=1 keeps the masked-in columns.
        """
        def check(lane_mask, expected):
            # Compare the complete output with the complete expectation so
            # that missing or extra records fail the test; zipping the two
            # streams element by element would pass on truncated output.
            observed = list(
                apply_lane_mask_and_gap_filter(self.aln1, lane_mask, 1))
            self.assertEqual(observed, [line + '\n' for line in expected])

        # an all-ones mask leaves the alignment unchanged
        check('111111', self.aln1)

        # filtering all positions results in a ValueError
        with self.assertRaises(ValueError):
            list(apply_lane_mask_and_gap_filter(self.aln1, '000000', 1))

        check('101010', [
            '>s1', 'AC-',
            '>s2', 'A--',
            '>s3', 'TT-',
            '>s4', 'AG-',
            '>s5', '---',
        ])

        check('000111', [
            '>s1', '--T',
            '>s2', '--T',
            '>s3', '--T',
            '>s4', '--T',
            '>s5', 'A--',
        ])
    def test_apply_lane_mask_only(self):
        """Lane mask applied with allowed_gap_frac=1 keeps the masked-in columns.
        """
        def filtered(lane_mask):
            # materialise the whole result so record counts are checked too
            return list(
                apply_lane_mask_and_gap_filter(self.aln1, lane_mask, 1))

        def with_newlines(lines):
            return [line + '\n' for line in lines]

        # an all-ones mask leaves the alignment unchanged
        self.assertEqual(filtered('111111'), with_newlines(self.aln1))

        # filtering all positions results in a ValueError
        with self.assertRaises(ValueError):
            filtered('000000')

        expected = [
            '>s1', 'AC-', '>s2', 'A--', '>s3', 'TT-', '>s4', 'AG-', '>s5',
            '---'
        ]
        self.assertEqual(filtered('101010'), with_newlines(expected))

        expected = [
            '>s1', '--T', '>s2', '--T', '>s3', '--T', '>s4', '--T', '>s5',
            'A--'
        ]
        self.assertEqual(filtered('000111'), with_newlines(expected))
Example #3
0
def main():
    """Filter the input alignment as directed by the command-line options.

    Applies the lane mask / gap (or entropy) filter to
    ``opts.input_fasta_file`` and writes the result to
    ``<output_dir>/<input_basename>_pfiltered.fasta``.  When
    ``opts.remove_outliers`` is set, the filtered sequences are passed
    through ``remove_outliers`` before being written.

    Raises:
        ValueError: if the input fasta file is empty.
        IOError: if the output file cannot be opened for writing.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # build the output filepath and open it so any problems can be caught
    # before starting the work
    try:
        mkdir(opts.output_dir)
    except OSError:
        # Best effort (presumably the directory already exists); an
        # unusable directory is reported when the output file is opened.
        pass
    input_dir, input_filename = split(opts.input_fasta_file)
    input_basename, ext = splitext(input_filename)

    if getsize(opts.input_fasta_file) == 0:
        raise ValueError("An empty fasta file was provided. "
                         "Did the alignment complete sucessfully? "
                         "Did PyNAST discard all sequences due to too-stringent minimum length "
                         "or minimum percent ID settings?")

    output_fp = '%s/%s_pfiltered.fasta' % (opts.output_dir, input_basename)

    try:
        outfile = open(output_fp, 'w')
    except IOError:
        # The message must use output_fp: the name previously referenced
        # here was undefined and turned this IOError into a NameError.
        raise IOError("Can't open output_filepath for writing: %s"
                      % output_fp)

    # Context managers release every handle even if filtering fails.
    with outfile:
        if opts.lane_mask_fp and not opts.suppress_lane_mask_filter and not\
                opts.entropy_threshold:
            # read the lane_mask, if one was provided
            if opts.verbose:
                print("Reading lane mask...")
            with open(opts.lane_mask_fp) as lane_mask_f:
                lane_mask = lane_mask_f.read().strip()
        else:
            lane_mask = None

        with open(opts.input_fasta_file, 'U') as infile:
            if opts.remove_outliers:
                # apply the lanemask/gap removal, then remove outliers
                seq_gen = apply_lane_mask_and_gap_filter(
                    infile, lane_mask, opts.allowed_gap_frac,
                    verbose=opts.verbose,
                    entropy_threshold=opts.entropy_threshold)

                filtered_aln = remove_outliers(seq_gen, opts.threshold)
                for seq in filtered_aln.Seqs:
                    outfile.write(seq.toFasta())
                    outfile.write('\n')
            else:
                # just apply the lanemask/gap removal
                for result in apply_lane_mask_and_gap_filter(
                        infile, lane_mask, opts.allowed_gap_frac,
                        verbose=opts.verbose,
                        entropy_threshold=opts.entropy_threshold):
                    outfile.write(result)
Example #4
0
def main():
    """Filter the input alignment as directed by the command-line options.

    Unless lane-mask filtering is suppressed or an entropy threshold is
    given, the lane mask is read from ``opts.lane_mask_fp`` when provided
    and otherwise obtained from ``get_template_alignment_column_mask()``.
    The filtered alignment is written to
    ``<output_dir>/<input_basename>_pfiltered.fasta``.  When
    ``opts.remove_outliers`` is set, the filtered sequences are passed
    through ``remove_outliers`` before being written.

    Raises:
        ValueError: if the input fasta file is empty.
        IOError: if the output file cannot be opened for writing.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # build the output filepath and open it so any problems can be caught
    # before starting the work
    try:
        mkdir(opts.output_dir)
    except OSError:
        # Best effort (presumably the directory already exists); an
        # unusable directory is reported when the output file is opened.
        pass
    input_dir, input_filename = split(opts.input_fasta_file)
    input_basename, ext = splitext(input_filename)

    if getsize(opts.input_fasta_file) == 0:
        raise ValueError("An empty fasta file was provided. "
                         "Did the alignment complete sucessfully? "
                         "Did PyNAST discard all sequences due to too-stringent minimum length "
                         "or minimum percent ID settings?")

    output_fp = '%s/%s_pfiltered.fasta' % (opts.output_dir, input_basename)

    try:
        outfile = open(output_fp, 'w')
    except IOError:
        # The message must use output_fp: the name previously referenced
        # here was undefined and turned this IOError into a NameError.
        raise IOError("Can't open output_filepath for writing: %s"
                      % output_fp)

    # Context managers release every handle even if filtering fails.
    with outfile:
        if not opts.suppress_lane_mask_filter and not opts.entropy_threshold:
            if opts.lane_mask_fp is not None:
                with open(opts.lane_mask_fp, 'U') as lane_mask_f:
                    lane_mask = lane_mask_f.read().strip()
            else:
                lane_mask = get_template_alignment_column_mask()
        else:
            lane_mask = None

        with open(opts.input_fasta_file, 'U') as infile:
            if opts.remove_outliers:
                # apply the lanemask/gap removal, then remove outliers
                seq_gen = apply_lane_mask_and_gap_filter(
                    infile, lane_mask, opts.allowed_gap_frac,
                    entropy_threshold=opts.entropy_threshold)

                filtered_aln = remove_outliers(seq_gen, opts.threshold)
                for seq in filtered_aln:
                    outfile.write(seq.to_fasta())
                    outfile.write('\n')
            else:
                # just apply the lanemask/gap removal
                for result in apply_lane_mask_and_gap_filter(
                        infile, lane_mask, opts.allowed_gap_frac,
                        entropy_threshold=opts.entropy_threshold):
                    outfile.write(result)
    def test_apply_lane_mask_and_gap_filter_w_entropy_threshold(self):
        """Entropy threshold 0.0 keeps the alignment; 1.0 raises ValueError.
        """
        # Compare full lists: iterating the result against an expected
        # iterator would pass silently if fewer records were produced.
        observed = list(apply_lane_mask_and_gap_filter(
            self.aln1, None, 1.0, entropy_threshold=0.0))
        self.assertEqual(observed, [line + '\n' for line in self.aln1])

        # filtering all positions results in a ValueError
        with self.assertRaises(ValueError):
            list(apply_lane_mask_and_gap_filter(self.aln1, None, 1.0,
                                                entropy_threshold=1.0))
Example #6
0
    def test_apply_lane_mask_and_gap_filter_w_entropy_threshold(self):
        """Entropy threshold 0.0 keeps the alignment; 1.0 raises ValueError.
        """
        def run(threshold):
            # materialise the result so the record count is verified as well
            return list(apply_lane_mask_and_gap_filter(
                self.aln1, None, 1.0, entropy_threshold=threshold))

        self.assertEqual(run(0.0), [line + '\n' for line in self.aln1])

        # filtering all positions results in a ValueError
        with self.assertRaises(ValueError):
            run(1.0)
    def test_apply_lane_mask_and_gap_filter_alternate_alignment(self):
        """Filtering behaves as expected on a second, smaller alignment.
        """
        aln = ['>ACT009', 'AACT-', '>ACT019', 'AACT-', '>ACT011', '-TCT-']

        # Full-list comparisons also catch missing or extra records, which
        # an element-by-element loop over the result would not.
        observed = list(apply_lane_mask_and_gap_filter(aln, None, 1.0))
        self.assertEqual(observed, [line + '\n' for line in aln])

        lm = '00111'
        expected = ['>ACT009', 'CT', '>ACT019', 'CT', '>ACT011', 'CT']
        observed = list(apply_lane_mask_and_gap_filter(aln, lm))
        self.assertEqual(observed, [line + '\n' for line in expected])
    def test_apply_lane_mask_and_gap_filter(self):
        """apply_lane_mask_and_gap_filter: functions as expected
        """
        def check(expected, *filter_args):
            # Compare the complete output with the complete expectation so
            # truncated or over-long output fails instead of passing.
            observed = list(
                apply_lane_mask_and_gap_filter(self.aln1, *filter_args))
            self.assertEqual(observed, [line + '\n' for line in expected])

        # neither an all-ones mask nor no mask changes the alignment when
        # allowed_gap_frac is 1.0
        check(self.aln1, '111111', 1.0)
        check(self.aln1, None, 1.0)

        # gap filter only
        check([
            '>s1', 'ACC-T',
            '>s2', 'AC--T',
            '>s3', 'TCT-T',
            '>s4', 'ACG-T',
            '>s5', '---A-',
        ], '111111')

        # lm filter only
        check([
            '>s1', 'CC--T',
            '>s2', 'C---T',
            '>s3', 'CT--T',
            '>s4', 'CG--T',
            '>s5', '--A--',
        ], '011111', 1.0)

        # gap and lm filter
        check([
            '>s1', 'CC-T',
            '>s2', 'C--T',
            '>s3', 'CT-T',
            '>s4', 'CG-T',
            '>s5', '--A-',
        ], '011111')
Example #9
0
    def test_apply_lane_mask_and_gap_filter(self):
        """apply_lane_mask_and_gap_filter: functions as expected
        """
        def filtered(*filter_args):
            # materialise the result so the number of records is checked too
            return list(
                apply_lane_mask_and_gap_filter(self.aln1, *filter_args))

        def with_newlines(lines):
            return [line + '\n' for line in lines]

        # neither an all-ones mask nor no mask changes the alignment when
        # allowed_gap_frac is 1.0
        self.assertEqual(filtered('111111', 1.0), with_newlines(self.aln1))
        self.assertEqual(filtered(None, 1.0), with_newlines(self.aln1))

        # gap filter only
        expected = [
            '>s1', 'ACC-T',
            '>s2', 'AC--T',
            '>s3', 'TCT-T',
            '>s4', 'ACG-T',
            '>s5', '---A-',
        ]
        self.assertEqual(filtered('111111'), with_newlines(expected))

        # lm filter only
        expected = [
            '>s1', 'CC--T',
            '>s2', 'C---T',
            '>s3', 'CT--T',
            '>s4', 'CG--T',
            '>s5', '--A--',
        ]
        self.assertEqual(filtered('011111', 1.0), with_newlines(expected))

        # gap and lm filter
        expected = [
            '>s1', 'CC-T',
            '>s2', 'C--T',
            '>s3', 'CT-T',
            '>s4', 'CG-T',
            '>s5', '--A-',
        ]
        self.assertEqual(filtered('011111'), with_newlines(expected))
Example #10
0
    def test_apply_lane_mask_and_gap_filter_w_precomputed_mask(self):
        """Lane mask and gap filter, alone and combined, on self.aln1.
        """
        def check(expected, *filter_args):
            # Compare the complete output with the complete expectation so
            # truncated or over-long output fails instead of passing.
            observed = list(
                apply_lane_mask_and_gap_filter(self.aln1, *filter_args))
            self.assertEqual(observed, [line + '\n' for line in expected])

        # neither an all-ones mask nor no mask changes the alignment when
        # allowed_gap_frac is 1.0
        check(self.aln1, '111111', 1.0)
        check(self.aln1, None, 1.0)

        # gap filter only
        check([
            '>s1', 'ACC-T',
            '>s2', 'AC--T',
            '>s3', 'TCT-T',
            '>s4', 'ACG-T',
            '>s5', '---A-',
        ], '111111')

        # lm filter only
        check([
            '>s1', 'CC--T',
            '>s2', 'C---T',
            '>s3', 'CT--T',
            '>s4', 'CG--T',
            '>s5', '--A--',
        ], '011111', 1.0)

        # gap and lm filter
        check([
            '>s1', 'CC-T',
            '>s2', 'C--T',
            '>s3', 'CT-T',
            '>s4', 'CG-T',
            '>s5', '--A-',
        ], '011111')
    def test_apply_lane_mask_and_gap_filter_w_precomputed_mask(self):
        """Lane mask and gap filter, alone and combined, on self.aln1.
        """
        def filtered(*filter_args):
            # materialise the result so the number of records is checked too
            return list(
                apply_lane_mask_and_gap_filter(self.aln1, *filter_args))

        def with_newlines(lines):
            return [line + '\n' for line in lines]

        # neither an all-ones mask nor no mask changes the alignment when
        # allowed_gap_frac is 1.0
        self.assertEqual(filtered('111111', 1.0), with_newlines(self.aln1))
        self.assertEqual(filtered(None, 1.0), with_newlines(self.aln1))

        # gap filter only
        expected = [
            '>s1', 'ACC-T',
            '>s2', 'AC--T',
            '>s3', 'TCT-T',
            '>s4', 'ACG-T',
            '>s5', '---A-',
        ]
        self.assertEqual(filtered('111111'), with_newlines(expected))

        # lm filter only
        expected = [
            '>s1', 'CC--T',
            '>s2', 'C---T',
            '>s3', 'CT--T',
            '>s4', 'CG--T',
            '>s5', '--A--',
        ]
        self.assertEqual(filtered('011111', 1.0), with_newlines(expected))

        # gap and lm filter
        expected = [
            '>s1', 'CC-T',
            '>s2', 'C--T',
            '>s3', 'CT-T',
            '>s4', 'CG-T',
            '>s5', '--A-',
        ]
        self.assertEqual(filtered('011111'), with_newlines(expected))
 def test_apply_lane_mask_and_gap_filter_invalid(self):
     """A lane mask combined with an entropy threshold raises ValueError.
     """
     def consume_filtered():
         # the call and its consumption both happen inside assertRaises
         return list(apply_lane_mask_and_gap_filter(
             self.aln1, '111111', entropy_threshold=0.0))

     # passing both a mask and an entropy threshold results in a ValueError
     self.assertRaises(ValueError, consume_filtered)
    def test_apply_lane_mask_and_gap_filter_alternate_alignment(self):
        """Filtering behaves as expected on a second, smaller alignment.
        """
        aln = [
            '>ACT009', 'AACT-',
            '>ACT019', 'AACT-',
            '>ACT011', '-TCT-'
        ]

        def check(expected, *filter_args):
            # Compare whole lists so that missing or extra records are
            # detected rather than silently skipped.
            observed = list(apply_lane_mask_and_gap_filter(aln, *filter_args))
            self.assertEqual(observed, [line + '\n' for line in expected])

        check(aln, None, 1.0)

        check([
            '>ACT009', 'CT',
            '>ACT019', 'CT',
            '>ACT011', 'CT'
        ], '00111')
Example #14
0
 def test_apply_lane_mask_and_gap_filter_alternate_alignment(self):
     """apply_lane_mask_and_gap_filter: functions as expected with alt aln
     """
     aln = [
         '>ACT009', 'AACT-',
         '>ACT019', 'AACT-',
         '>ACT011', '-TCT-',
     ]

     # Full-list comparisons also catch missing or extra records, which an
     # element-by-element loop over the result would not.
     observed = list(apply_lane_mask_and_gap_filter(aln, None, 1.0))
     self.assertEqual(observed, [line + '\n' for line in aln])

     lm = '00111'
     expected = [
         '>ACT009', 'CT',
         '>ACT019', 'CT',
         '>ACT011', 'CT',
     ]
     observed = list(apply_lane_mask_and_gap_filter(aln, lm))
     self.assertEqual(observed, [line + '\n' for line in expected])
Example #15
0
    def test_apply_lane_mask_and_gap_filter_alternate_alignment(self):
        """apply_lane_mask_and_gap_filter: functions as expected with alt aln
        """
        aln = [
            '>ACT009', 'AACT-',
            '>ACT019', 'AACT-',
            '>ACT011', '-TCT-',
        ]

        def with_newlines(lines):
            return [line + '\n' for line in lines]

        # Materialise each result so the record count is verified too; a
        # loop over the result alone would accept truncated output.
        self.assertEqual(
            list(apply_lane_mask_and_gap_filter(aln, None, 1.0)),
            with_newlines(aln))

        lm = '00111'
        expected = [
            '>ACT009', 'CT',
            '>ACT019', 'CT',
            '>ACT011', 'CT',
        ]
        self.assertEqual(
            list(apply_lane_mask_and_gap_filter(aln, lm)),
            with_newlines(expected))
    def test_apply_gap_filter_only(self):
        """Gap filtering without a lane mask at several allowed_gap_frac values.
        """
        def check(aln, expected, *gap_frac):
            # Compare the complete output with the complete expectation so
            # truncated or over-long output fails instead of passing.
            observed = list(
                apply_lane_mask_and_gap_filter(aln, None, *gap_frac))
            self.assertEqual(observed, [line + '\n' for line in expected])

        check(self.aln1, self.aln1, 1.0)

        check(self.aln1, [
            '>s1', 'ACC-T', '>s2', 'AC--T', '>s3', 'TCT-T', '>s4', 'ACG-T',
            '>s5', '---A-'
        ])

        check(self.aln1, [
            '>s1', 'ACCT', '>s2', 'AC-T', '>s3', 'TCTT', '>s4', 'ACGT', '>s5',
            '----'
        ], 0.75)

        check(self.aln1, [
            '>s1', 'ACCT', '>s2', 'AC-T', '>s3', 'TCTT', '>s4', 'ACGT', '>s5',
            '----'
        ], 0.40)

        check(self.aln1, [
            '>s1', 'ACT', '>s2', 'ACT', '>s3', 'TCT', '>s4', 'ACT', '>s5',
            '---'
        ], 0.30)

        # filtering all positions results in a ValueError
        with self.assertRaises(ValueError):
            list(apply_lane_mask_and_gap_filter(self.aln1, None, 0.10))

        # the following tests were adapted from test_alignment.py in PyCogent

        aln = ['>a', '--A-BC-', '>b', '-CB-A--', '>c', '--D-EF-']

        # default should strip out cols that are 100% gaps
        check(aln, ['>a', '-ABC', '>b', 'CBA-', '>c', '-DEF'])

        # if allowed_gap_frac is 1, shouldn't delete anything
        check(aln, ['>a', '--A-BC-', '>b', '-CB-A--', '>c', '--D-EF-'], 1)

        # if allowed_gap_frac is 0, should strip out any cols containing gaps
        check(aln, ['>a', 'AB', '>b', 'BA', '>c', 'DE'], 0)

        # intermediate numbers should work as expected
        check(aln, ['>a', 'ABC', '>b', 'BA-', '>c', 'DEF'], 0.4)
        check(aln, ['>a', '-ABC', '>b', 'CBA-', '>c', '-DEF'], 0.7)
Example #17
0
 def test_apply_lane_mask_and_gap_filter_real(self):
     """apply_lane_mask_and_gap_filter: no error on full length seqs
     """
     # No error when applying to full-length sequence.
     # The result is consumed explicitly: sibling tests wrap this call in
     # list() to surface its ValueError, which suggests the filtering runs
     # lazily, so an unconsumed call would exercise nothing.
     # NOTE(review): assumes self.aln2 and self.aln2_lm are compatible so
     # that full evaluation succeeds -- confirm against the fixtures.
     list(apply_lane_mask_and_gap_filter(self.aln2, self.aln2_lm))
    def test_apply_gap_filter_only(self):
        """Gap filtering without a lane mask at several allowed_gap_frac values.
        """
        def filtered(aln, *gap_frac):
            # materialise the result so the number of records is checked too
            return list(apply_lane_mask_and_gap_filter(aln, None, *gap_frac))

        def with_newlines(lines):
            return [line + '\n' for line in lines]

        self.assertEqual(filtered(self.aln1, 1.0), with_newlines(self.aln1))

        expected = [
            '>s1', 'ACC-T',
            '>s2', 'AC--T',
            '>s3', 'TCT-T',
            '>s4', 'ACG-T',
            '>s5', '---A-'
        ]
        self.assertEqual(filtered(self.aln1), with_newlines(expected))

        expected = [
            '>s1', 'ACCT',
            '>s2', 'AC-T',
            '>s3', 'TCTT',
            '>s4', 'ACGT',
            '>s5', '----'
        ]
        self.assertEqual(filtered(self.aln1, 0.75), with_newlines(expected))

        # 0.40 yields the same columns as 0.75 for this alignment
        self.assertEqual(filtered(self.aln1, 0.40), with_newlines(expected))

        expected = [
            '>s1', 'ACT',
            '>s2', 'ACT',
            '>s3', 'TCT',
            '>s4', 'ACT',
            '>s5', '---'
        ]
        self.assertEqual(filtered(self.aln1, 0.30), with_newlines(expected))

        # filtering all positions results in a ValueError
        with self.assertRaises(ValueError):
            filtered(self.aln1, 0.10)

        # the following tests were adapted from test_alignment.py in PyCogent

        aln = [
            '>a', '--A-BC-',
            '>b', '-CB-A--',
            '>c', '--D-EF-'
        ]

        # default should strip out cols that are 100% gaps
        expected = [
            '>a', '-ABC',
            '>b', 'CBA-',
            '>c', '-DEF'
        ]
        self.assertEqual(filtered(aln), with_newlines(expected))

        # if allowed_gap_frac is 1, shouldn't delete anything
        self.assertEqual(filtered(aln, 1), with_newlines(aln))

        # if allowed_gap_frac is 0, should strip out any cols containing gaps
        expected = [
            '>a', 'AB',
            '>b', 'BA',
            '>c', 'DE'
        ]
        self.assertEqual(filtered(aln, 0), with_newlines(expected))

        # intermediate numbers should work as expected
        expected = [
            '>a', 'ABC',
            '>b', 'BA-',
            '>c', 'DEF'
        ]
        self.assertEqual(filtered(aln, 0.4), with_newlines(expected))
        expected = [
            '>a', '-ABC',
            '>b', 'CBA-',
            '>c', '-DEF'
        ]
        self.assertEqual(filtered(aln, 0.7), with_newlines(expected))
 def test_apply_lane_mask_and_gap_filter_real(self):
     """Filtering the full-length alignment completes without error.
     """
     # No error when applying to full-length sequence.
     # The result is consumed explicitly: sibling tests wrap this call in
     # list() to surface its ValueError, which suggests the filtering runs
     # lazily, so an unconsumed call would exercise nothing.
     # NOTE(review): assumes self.aln2 and self.aln2_lm are compatible so
     # that full evaluation succeeds -- confirm against the fixtures.
     list(apply_lane_mask_and_gap_filter(self.aln2, self.aln2_lm))
 def test_apply_lane_mask_and_gap_filter_invalid(self):
     """A lane mask combined with an entropy threshold raises ValueError.
     """
     lane_mask = '111111'
     # passing both a mask and an entropy threshold results in a ValueError
     with self.assertRaises(ValueError):
         filtered = apply_lane_mask_and_gap_filter(
             self.aln1, lane_mask, entropy_threshold=0.0)
         list(filtered)