def test_remove_outliers(self):
    """ remove_outliers drops ACT011 at threshold 2, and ACT009 too at 0"""
    # Fasta-style lines: each '>' label line is followed by its sequence.
    aln = [
        '>ACT009', 'ACAT-',
        '>ACT019', 'GACT-',
        '>ACT_02', 'GACT-',
        '>ACT_03', 'AACT-',
        '>ACT_04', 'AACT-',
        '>ACT_05', 'AACT-',
        '>ACT011', 'CTGGC',
        '>hello', 'AACTG',
    ]
    # mean errors is 10/9 .
    # Labels are the header lines with the leading '>' stripped.
    seqnames = [elem[1:] for elem in aln if elem.startswith('>')]

    # just remove ACT011
    res = remove_outliers(aln, 2)
    self.assertEqual(res.sequence_count(), 7)
    remaining = res.ids()
    for seqname_left in remaining:
        # every surviving label must come from the input alignment
        self.assertIn(seqname_left, seqnames)
    self.assertNotIn('ACT011', remaining)

    # now remove all that have > 10/9 (2 or more) substitutions:
    res = remove_outliers(aln, 0)
    self.assertEqual(res.sequence_count(), 6)
    remaining = res.ids()
    for seqname_left in remaining:
        self.assertIn(seqname_left, seqnames)
    self.assertNotIn('ACT011', remaining)
    self.assertNotIn('ACT009', remaining)
def main():
    """ Filter the input alignment and write <basename>_pfiltered.fasta

    Parses the command line options described by script_info, applies the
    lane mask / gap filter to opts.input_fasta_file (optionally followed by
    outlier removal) and writes the result into opts.output_dir.

    Raises ValueError if the input fasta file is empty, and IOError if the
    output file cannot be opened for writing.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # build the output filepath and open it, so any problems can be caught
    # before starting the work
    try:
        mkdir(opts.output_dir)
    except OSError:
        # best-effort: the directory may already exist
        pass
    input_dir, input_filename = split(opts.input_fasta_file)
    input_basename, ext = splitext(input_filename)

    if getsize(opts.input_fasta_file) == 0:
        raise ValueError("An empty fasta file was provided. "
                         "Did the alignment complete sucessfully? "
                         "Did PyNAST discard all sequences due to too-stringent minimum length "
                         "or minimum percent ID settings?")

    output_fp = '%s/%s_pfiltered.fasta' % (opts.output_dir, input_basename)

    try:
        outfile = open(output_fp, 'w')
    except IOError:
        # report the path that actually failed (the previous code referenced
        # an undefined name here, which raised NameError instead)
        raise IOError("Can't open output_filepath for writing: %s"
                      % output_fp)

    # 'with' guarantees the handles are closed even if filtering fails
    with outfile:
        if not opts.suppress_lane_mask_filter and not opts.entropy_threshold:
            if opts.lane_mask_fp is not None:
                with open(opts.lane_mask_fp, 'U') as lane_mask_f:
                    lane_mask = lane_mask_f.read().strip()
            else:
                lane_mask = get_template_alignment_column_mask()
        else:
            # the lane mask is skipped when suppressed or when entropy
            # filtering was requested
            lane_mask = None

        with open(opts.input_fasta_file, 'U') as infile:
            if opts.remove_outliers:
                # apply the lanemask/gap removal, then remove outliers
                seq_gen = apply_lane_mask_and_gap_filter(
                    infile, lane_mask, opts.allowed_gap_frac,
                    entropy_threshold=opts.entropy_threshold)
                filtered_aln = remove_outliers(seq_gen, opts.threshold)
                for seq in filtered_aln:
                    outfile.write(seq.to_fasta())
                    outfile.write('\n')
            else:
                # just apply the lanemask/gap removal
                for result in apply_lane_mask_and_gap_filter(
                        infile, lane_mask, opts.allowed_gap_frac,
                        entropy_threshold=opts.entropy_threshold):
                    outfile.write(result)
def main():
    """ Filter the input alignment and write <basename>_pfiltered.fasta

    Parses the command line options described by script_info, applies the
    lane mask / gap filter to opts.input_fasta_file (optionally followed by
    outlier removal) and writes the result into opts.output_dir.

    Raises ValueError if the input fasta file is empty, and IOError if the
    output file cannot be opened for writing.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # build the output filepath and open it, so any problems can be caught
    # before starting the work
    try:
        mkdir(opts.output_dir)
    except OSError:
        # best-effort: the directory may already exist
        pass
    input_dir, input_filename = split(opts.input_fasta_file)
    input_basename, ext = splitext(input_filename)

    if getsize(opts.input_fasta_file) == 0:
        raise ValueError("An empty fasta file was provided. "
                         "Did the alignment complete sucessfully? "
                         "Did PyNAST discard all sequences due to too-stringent minimum length "
                         "or minimum percent ID settings?")

    output_fp = '%s/%s_pfiltered.fasta' % (opts.output_dir, input_basename)

    try:
        outfile = open(output_fp, 'w')
    except IOError:
        # report the path that actually failed (the previous code referenced
        # an undefined name here, which raised NameError instead)
        raise IOError("Can't open output_filepath for writing: %s"
                      % output_fp)

    # 'with' guarantees the handles are closed even if filtering fails
    with outfile:
        use_lane_mask = (opts.lane_mask_fp and
                         not opts.suppress_lane_mask_filter and
                         not opts.entropy_threshold)
        if use_lane_mask:
            # read the lane_mask, if one was provided
            if opts.verbose:
                # parenthesised form prints the same text on Python 2
                print("Reading lane mask...")
            with open(opts.lane_mask_fp) as lane_mask_f:
                lane_mask = lane_mask_f.read().strip()
        else:
            lane_mask = None

        with open(opts.input_fasta_file, 'U') as infile:
            if opts.remove_outliers:
                # apply the lanemask/gap removal, then remove outliers
                seq_gen = apply_lane_mask_and_gap_filter(
                    infile, lane_mask, opts.allowed_gap_frac,
                    verbose=opts.verbose,
                    entropy_threshold=opts.entropy_threshold)
                filtered_aln = remove_outliers(seq_gen, opts.threshold)
                for seq in filtered_aln.Seqs:
                    outfile.write(seq.toFasta())
                    outfile.write('\n')
            else:
                # just apply the lanemask/gap removal
                for result in apply_lane_mask_and_gap_filter(
                        infile, lane_mask, opts.allowed_gap_frac,
                        verbose=opts.verbose,
                        entropy_threshold=opts.entropy_threshold):
                    outfile.write(result)
def test_remove_outliers(self):
    """ remove outliers returns only seqs similar to consensus"""
    # Fasta-style lines: each '>' label line is followed by its sequence.
    aln = [
        '>ACT009', 'ACAT-',
        '>ACT019', 'GACT-',
        '>ACT_02', 'GACT-',
        '>ACT_03', 'AACT-',
        '>ACT_04', 'AACT-',
        '>ACT_05', 'AACT-',
        '>ACT011', 'CTGGC',
        '>hello', 'AACTG',
    ]
    # mean errors is 10/9 .
    # Labels are the header lines with the leading '>' stripped.
    seqnames = [elem[1:] for elem in aln if elem.startswith('>')]

    # just remove ACT011
    res = remove_outliers(aln, 2)
    names_left = res.getSeqNames()
    self.assertEqual(len(names_left), 7)
    for seqname_left in names_left:
        # every surviving label must come from the input alignment
        self.assertIn(seqname_left, seqnames)
    self.assertNotIn('ACT011', names_left)

    # now remove all that have > 10/9 (2 or more) substitutions:
    res = remove_outliers(aln, 0)
    names_left = res.getSeqNames()
    self.assertEqual(len(names_left), 6)
    for seqname_left in names_left:
        self.assertIn(seqname_left, seqnames)
    self.assertNotIn('ACT011', names_left)
    self.assertNotIn('ACT009', names_left)
def test_remove_outliers(self):
    """ remove outliers returns only seqs similar to consensus"""
    # Fasta-style lines: each '>' label line is followed by its sequence.
    aln = [
        '>ACT009', 'ACAT-',
        '>ACT019', 'GACT-',
        '>ACT_02', 'GACT-',
        '>ACT_03', 'AACT-',
        '>ACT_04', 'AACT-',
        '>ACT_05', 'AACT-',
        '>ACT011', 'CTGGC',
        '>hello', 'AACTG',
    ]
    # mean errors is 10/9 .
    # Labels are the header lines with the leading '>' stripped.
    seqnames = [elem[1:] for elem in aln if elem.startswith('>')]

    # just remove ACT011
    res = remove_outliers(aln, 2)
    names_left = res.getSeqNames()
    self.assertEqual(len(names_left), 7)
    for seqname_left in names_left:
        # every surviving label must come from the input alignment
        self.assertContains(seqnames, seqname_left)
    self.assertNotContains(names_left, 'ACT011')

    # now remove all that have > 10/9 (2 or more) substitutions:
    res = remove_outliers(aln, 0)
    names_left = res.getSeqNames()
    self.assertEqual(len(names_left), 6)
    for seqname_left in names_left:
        self.assertContains(seqnames, seqname_left)
    self.assertNotContains(names_left, 'ACT011')
    self.assertNotContains(names_left, 'ACT009')