def test_build_seqdict_multi_sequence(self, mock_non_blank):
    """Check that build_seqdict stores every record of a multi-sequence input.

    ``mock_non_blank`` is a mock handed to the test (presumably by a patch
    decorator outside this view -- TODO confirm); its return value is the
    list of lines used for this case: two ``>``-prefixed headers, each
    followed by a value, with an empty string between the records.
    """
    # Local import: this block cannot see whether the module imports ``os``.
    import os

    mock_non_blank.return_value = [">blah", "some value", "", ">foo", "bar"]

    # Only the path is needed, so release the handle immediately instead of
    # leaving it open for the rest of the test run.
    test_file = tempfile.NamedTemporaryFile(delete=False)
    test_file.close()
    try:
        input_dict = {}
        files.build_seqdict(test_file.name, input_dict)
        self.assertEqual(input_dict, {"blah": "some value", "foo": "bar"})
    finally:
        # delete=False means nothing else will clean this file up.
        os.remove(test_file.name)
def test_build_seqdict_multi_sequence(self, mock_non_blank):
    """Check that build_seqdict stores every record of a multi-sequence input.

    ``mock_non_blank`` is a mock handed to the test (presumably by a patch
    decorator outside this view -- TODO confirm); its return value is the
    list of lines used for this case: two ``>``-prefixed headers, each
    followed by a value, with an empty string between the records.
    """
    # Local import: this block cannot see whether the module imports ``os``.
    import os

    mock_non_blank.return_value = [
        ">blah",
        "some value",
        "",
        ">foo",
        "bar",
    ]

    # Only the path is needed, so release the handle immediately instead of
    # leaving it open for the rest of the test run.
    test_file = tempfile.NamedTemporaryFile(delete=False)
    test_file.close()
    try:
        input_dict = {}
        files.build_seqdict(test_file.name, input_dict)
        self.assertEqual(input_dict, {"blah": "some value", "foo": "bar"})
    finally:
        # delete=False means nothing else will clean this file up.
        os.remove(test_file.name)
def test_build_seqdict_1_sequence(self):
    """Check that build_seqdict reads a single-record file from disk.

    Writes one ``>``-prefixed header line followed by one value line to a
    temporary file, then asserts that the dictionary passed to
    ``files.build_seqdict`` ends up mapping the header text to the value.
    """
    # Local import: this block cannot see whether the module imports ``os``.
    import os

    # Text mode is required: the default mode is binary ("w+b"), which
    # rejects the str payloads written below on Python 3.
    test_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
    try:
        # Leaving the ``with`` block flushes and closes the file, so the
        # contents are fully on disk before they are read back.
        with test_file:
            test_file.write(">blah\n")
            test_file.write("some value")

        input_dict = {}
        files.build_seqdict(test_file.name, input_dict)
        self.assertEqual(input_dict, {"blah": "some value"})
    finally:
        # delete=False means nothing else will clean this file up.
        os.remove(test_file.name)
# Appends if specified file already exists if os.path.isfile(m_out): m_o = open(m_out,'a') else: # The first time the file is opened, write header lines m_o = open(m_out,'w') m_o.write("gene,num AA changes,num identical before,num identical after," "num similar before,num similar after,avgerage edit score diff") m_o.write("\n" * 2) # Each independent file is also used to create its own file b_out = name + "_aminoacid_changes.csv" # Load sequence data into a data structure for internal use seqdict = {} files.build_seqdict(args.infile,seqdict) rna_string = str(args.RNA) gen_string = str(args.genomic) # Sequences must be in upper-case for k in seqdict.keys(): if re.search(rna_string,k): rna_seq = seqdict.get(k).upper() elif re.search(gen_string,k): gen_seq = seqdict.get(k).upper() else: ref_seq = seqdict.get(k).upper() # Need to find beginning and end of aligned region i = 0 j = 0
bases = 'AGTC' # Create a "master" outfile to collate data from multiple files m_out = args.outfile # Appends if specified file already exists if os.path.isfile(m_out): m_o = open(m_out,'a') else: # The first time the file is opened, write header lines m_o = open(m_out,'w') m_o.write("name,length,number edits,frequency of significant edits") m_o.write("\n" * 2) # Load sequence data into a data structure for internal use seqdict = {} files.build_seqdict(args.infile,seqdict) rna_string = str(args.RNA) gen_string = str(args.genomic) # Sequences must be in upper-case for k in seqdict.keys(): if re.search(rna_string,k): rna_seq = seqdict.get(k).upper() elif re.search(gen_string,k): gen_seq = seqdict.get(k).upper() # Find the beginning and end of aligned region i = 0 j = 0 try: # Compare genomic and RNA sequences to find local regions of good
parser.add_argument('infiles', nargs='+', help='list of infiles') args = parser.parse_args() # Unlike other programs in this package, this one is written to be used # without a wrapper script, but could be easily adapted to do so for infile in args.infiles: # Gets the basename for the file basename = infile.rsplit('.',1)[0] # We actually provide aligned and sequence-only # versions of the output out_align = basename + "_trimmed.afa" out_seq = basename + "_trimmed.fa" # Load sequence data into a data structure for internal use seqdict = {} files.build_seqdict(infile,seqdict) rna_string = str(args.RNA) gen_string = str(args.genomic) # Sequences must be in upper case for k in seqdict.keys(): if re.search(rna_string,k): # Since we are writing these data back out again # we want to keep track of sequence headers rna_header = k rna_seq = seqdict.get(k).upper() elif re.search(gen_string,k): gen_header = k gen_seq = seqdict.get(k).upper() else: ref_header = k
'average sim edit score', 'frequency of significant editing' ] m_o = files.get_variable_file_handle(m_out, 'w', ',', mlist) if os.path.isfile(s_out): s_o = files.get_variable_file_handle(s_out, 'a') else: slist = [ 'gene', 'num 1st pos', 'num 2nd pos', 'num 3rd pos', 'A to T', 'A to G', 'A to C', 'T to A', 'T to G', 'T to C', 'G to A', 'G to T', 'G to C', 'C to A', 'C to T', 'C to G' ] s_o = files.get_variable_file_handle(s_out, 'w', ',', slist) # Load sequence data into a data structure for internal use seqdict = {} files.build_seqdict(args.infile, seqdict) rna_string = str(args.RNA) gen_string = str(args.genomic) # Sequences must be in upper-case for k in seqdict.keys(): if re.search(rna_string, k): rna_seq = seqdict.get(k).upper() elif re.search(gen_string, k): gen_seq = seqdict.get(k).upper() else: ref_seq = seqdict.get(k).upper() # We directly compare aligned sequences, but class implementation uses # unaligned sequences (i.e. no gap characters '-') san_rna_seq = strings.sanitize(rna_seq)
else: # The first time the file is opened, write header lines mlist = ['gene','number nucleotide edits','number AA edits','average number sim AA edits', 'average edit score','average sim edit score','frequency of significant editing'] m_o = files.get_variable_file_handle(m_out,'w',',',mlist) if os.path.isfile(s_out): s_o = files.get_variable_file_handle(s_out,'a') else: slist = ['gene','num 1st pos','num 2nd pos','num 3rd pos','A to T','A to G', 'A to C','T to A','T to G','T to C','G to A','G to T','G to C', 'C to A','C to T','C to G'] s_o = files.get_variable_file_handle(s_out,'w',',',slist) # Load sequence data into a data structure for internal use seqdict = {} files.build_seqdict(args.infile,seqdict) rna_string = str(args.RNA) gen_string = str(args.genomic) # Sequences must be in upper-case for k in seqdict.keys(): if re.search(rna_string,k): rna_seq = seqdict.get(k).upper() elif re.search(gen_string,k): gen_seq = seqdict.get(k).upper() else: ref_seq = seqdict.get(k).upper() # We directly compare aligned sequences, but class implementation uses # unaligned sequences (i.e. no gap characters '-') san_rna_seq = strings.sanitize(rna_seq)