Exemple #1
0
    def test_build_seqdict_multi_sequence(self, mock_non_blank):
        """Check that build_seqdict stores one entry per '>' header line.

        ``mock_non_blank`` supplies the lines to parse (presumably injected
        by a patch decorator that is outside this view -- confirm). The
        expected dict shows that the leading '>' is dropped from each key
        and that the empty string in the mocked lines produces no entry.
        """
        import os

        mock_non_blank.return_value = [">blah", "some value", "", ">foo", "bar"]

        # delete=False keeps the path valid after the handle is closed, so
        # close it right away instead of leaking the open descriptor.
        test_file = tempfile.NamedTemporaryFile(delete=False)
        test_file.close()
        try:
            input_dict = {}
            files.build_seqdict(test_file.name, input_dict)
            self.assertEqual(input_dict, {"blah": "some value", "foo": "bar"})
        finally:
            # delete=False means nobody else removes the file for us.
            os.remove(test_file.name)
Exemple #2
0
    def test_build_seqdict_multi_sequence(self, mock_non_blank):
        """Check that build_seqdict stores one entry per '>' header line.

        ``mock_non_blank`` supplies the lines to parse (presumably injected
        by a patch decorator that is outside this view -- confirm). The
        expected dict shows that the leading '>' is dropped from each key
        and that the empty string in the mocked lines produces no entry.
        """
        import os

        mock_non_blank.return_value = [
            ">blah", "some value", "", ">foo", "bar"
        ]

        # delete=False keeps the path valid after the handle is closed, so
        # close it right away instead of leaking the open descriptor.
        test_file = tempfile.NamedTemporaryFile(delete=False)
        test_file.close()
        try:
            input_dict = {}
            files.build_seqdict(test_file.name, input_dict)
            self.assertEqual(input_dict, {"blah": "some value", "foo": "bar"})
        finally:
            # delete=False means nobody else removes the file for us.
            os.remove(test_file.name)
Exemple #3
0
    def test_build_seqdict_1_sequence(self):
        """Check build_seqdict on a real file holding a single sequence.

        The file contains one '>' header line followed by one sequence
        line; the expected dict maps the header (without '>') to that line.
        """
        import os

        # Open in text mode: the default mode is "w+b", which rejects the
        # str payloads written below on Python 3.
        test_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
        try:
            # Leaving the with-block closes (and flushes) the file; because
            # of delete=False the path stays on disk for build_seqdict.
            with test_file:
                test_file.write(">blah\n")
                test_file.write("some value")

            input_dict = {}
            files.build_seqdict(test_file.name, input_dict)
            self.assertEqual(input_dict, {"blah": "some value"})
        finally:
            # delete=False means nobody else removes the file for us.
            os.remove(test_file.name)
Exemple #4
0
    def test_build_seqdict_1_sequence(self):
        """Check build_seqdict on a real file holding a single sequence.

        The file contains one '>' header line followed by one sequence
        line; the expected dict maps the header (without '>') to that line.
        """
        import os

        # Open in text mode: the default mode is "w+b", which rejects the
        # str payloads written below on Python 3.
        test_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
        try:
            # Leaving the with-block closes (and flushes) the file; because
            # of delete=False the path stays on disk for build_seqdict.
            with test_file:
                test_file.write(">blah\n")
                test_file.write("some value")

            input_dict = {}
            files.build_seqdict(test_file.name, input_dict)
            self.assertEqual(input_dict, {"blah": "some value"})
        finally:
            # delete=False means nobody else removes the file for us.
            os.remove(test_file.name)
# A master outfile that is not on disk yet is created and given its header;
# one that already exists is reopened for appending.
if not os.path.isfile(m_out):
    m_o = open(m_out, 'w')
    m_o.write(
        "gene,num AA changes,num identical before,num identical after,"
        "num similar before,num similar after,avgerage edit score diff"
    )
    # End the header line and leave one empty line after it
    m_o.write("\n\n")
else:
    m_o = open(m_out, 'a')

# Every input additionally gets an output file of its own
b_out = name + "_aminoacid_changes.csv"

# build_seqdict fills the dictionary handed to it; it is read back below
seqdict = dict()
files.build_seqdict(args.infile, seqdict)

# The RNA / genomic arguments serve as regex patterns for the headers
rna_string, gen_string = str(args.RNA), str(args.genomic)

# Sort each sequence into its role by header and store it upper-cased;
# a header matching neither pattern is taken as the reference sequence
for k in seqdict:
    if re.search(rna_string, k):
        rna_seq = seqdict[k].upper()
        continue
    if re.search(gen_string, k):
        gen_seq = seqdict[k].upper()
        continue
    ref_seq = seqdict[k].upper()

# Counters for locating the beginning and end of the aligned region
i = j = 0
Exemple #6
0
bases = 'AGTC'

# The "master" outfile collects data from multiple input files
m_out = args.outfile
# A master outfile that is not on disk yet is created and given its header;
# one that already exists is reopened for appending.
if not os.path.isfile(m_out):
    m_o = open(m_out, 'w')
    m_o.write("name,length,number edits,frequency of significant edits")
    # End the header line and leave one empty line after it
    m_o.write("\n\n")
else:
    m_o = open(m_out, 'a')

# build_seqdict fills the dictionary handed to it; it is read back below
seqdict = dict()
files.build_seqdict(args.infile, seqdict)

# The RNA / genomic arguments serve as regex patterns for the headers
rna_string, gen_string = str(args.RNA), str(args.genomic)

# Pick out the RNA and genomic sequences by header and store them
# upper-cased; headers matching neither pattern are ignored here
for k in seqdict:
    if re.search(rna_string, k):
        rna_seq = seqdict[k].upper()
        continue
    if re.search(gen_string, k):
        gen_seq = seqdict[k].upper()

# Counters for locating the beginning and end of the aligned region
i = j = 0
try:
    # Compare genomic and RNA sequences to find local regions of good
# nargs='+' means at least one input path has to be given
parser.add_argument('infiles', nargs='+', help='list of infiles')
args = parser.parse_args()

# Unlike other programs in this package, this one is written to be used
# without a wrapper script, but could be easily adapted to do so
for infile in args.infiles:
    # Gets the basename for the file: everything before the last '.'.
    # A path without any '.' is used unchanged.
    # NOTE(review): the split is on the whole path, so a '.' in a directory
    # name is hit when the file itself has no extension.
    basename = infile.rsplit('.',1)[0]
    # We actually provide aligned and sequence-only
    # versions of the output
    out_align = basename + "_trimmed.afa"
    out_seq = basename + "_trimmed.fa"

    # Load sequence data into a data structure for internal use
    # (build_seqdict fills the dict passed in; it is read back below)
    seqdict = {}
    files.build_seqdict(infile,seqdict)

    # These are used as regular-expression patterns against each header.
    # NOTE(review): they do not depend on infile and could be computed
    # once before the loop.
    rna_string = str(args.RNA)
    gen_string = str(args.genomic)
    # Sequences must be in upper case
    # If several headers match the same pattern, the last one iterated wins.
    for k in seqdict.keys():
        if re.search(rna_string,k):
            # Since we are writing these data back out again
            # we want to keep track of sequence headers
            rna_header = k
            rna_seq = seqdict.get(k).upper()
        elif re.search(gen_string,k):
            gen_header = k
            gen_seq = seqdict.get(k).upper()
        else:
            # A header matching neither pattern is recorded as the reference
            ref_header = k
        'average sim edit score', 'frequency of significant editing'
    ]
    m_o = files.get_variable_file_handle(m_out, 'w', ',', mlist)
if not os.path.isfile(s_out):
    # New file: hand the column names and the ',' separator to the helper
    # (presumably written out as a header row -- confirm in files module)
    slist = [
        'gene',
        'num 1st pos', 'num 2nd pos', 'num 3rd pos',
        'A to T', 'A to G', 'A to C',
        'T to A', 'T to G', 'T to C',
        'G to A', 'G to T', 'G to C',
        'C to A', 'C to T', 'C to G',
    ]
    s_o = files.get_variable_file_handle(s_out, 'w', ',', slist)
else:
    # File already present: reopen it in append mode
    s_o = files.get_variable_file_handle(s_out, 'a')

# build_seqdict fills the dictionary handed to it; it is read back below
seqdict = dict()
files.build_seqdict(args.infile, seqdict)

# The RNA / genomic arguments serve as regex patterns for the headers
rna_string, gen_string = str(args.RNA), str(args.genomic)

# Sort each sequence into its role by header and store it upper-cased;
# a header matching neither pattern is taken as the reference sequence
for k in seqdict:
    if re.search(rna_string, k):
        rna_seq = seqdict[k].upper()
        continue
    if re.search(gen_string, k):
        gen_seq = seqdict[k].upper()
        continue
    ref_seq = seqdict[k].upper()

# The comparison works on aligned sequences, whereas the class
# implementation takes unaligned ones (i.e. no gap characters '-')
san_rna_seq = strings.sanitize(rna_seq)
else:
    # The first time the file is opened, write header lines
    mlist = ['gene','number nucleotide edits','number AA edits','average number sim AA edits',
            'average edit score','average sim edit score','frequency of significant editing']
    m_o = files.get_variable_file_handle(m_out,'w',',',mlist)
if not os.path.isfile(s_out):
    # New file: hand the column names and the ',' separator to the helper
    # (presumably written out as a header row -- confirm in files module)
    slist = [
        'gene',
        'num 1st pos', 'num 2nd pos', 'num 3rd pos',
        'A to T', 'A to G', 'A to C',
        'T to A', 'T to G', 'T to C',
        'G to A', 'G to T', 'G to C',
        'C to A', 'C to T', 'C to G',
    ]
    s_o = files.get_variable_file_handle(s_out, 'w', ',', slist)
else:
    # File already present: reopen it in append mode
    s_o = files.get_variable_file_handle(s_out, 'a')

# build_seqdict fills the dictionary handed to it; it is read back below
seqdict = dict()
files.build_seqdict(args.infile, seqdict)

# The RNA / genomic arguments serve as regex patterns for the headers
rna_string, gen_string = str(args.RNA), str(args.genomic)

# Sort each sequence into its role by header and store it upper-cased;
# a header matching neither pattern is taken as the reference sequence
for k in seqdict:
    if re.search(rna_string, k):
        rna_seq = seqdict[k].upper()
        continue
    if re.search(gen_string, k):
        gen_seq = seqdict[k].upper()
        continue
    ref_seq = seqdict[k].upper()

# The comparison works on aligned sequences, whereas the class
# implementation takes unaligned ones (i.e. no gap characters '-')
san_rna_seq = strings.sanitize(rna_seq)
# nargs='+' means at least one input path has to be given
parser.add_argument('infiles', nargs='+', help='list of infiles')
args = parser.parse_args()

# Unlike other programs in this package, this one is written to be used
# without a wrapper script, but could be easily adapted to do so
for infile in args.infiles:
    # Gets the basename for the file: everything before the last '.'.
    # A path without any '.' is used unchanged.
    # NOTE(review): the split is on the whole path, so a '.' in a directory
    # name is hit when the file itself has no extension.
    basename = infile.rsplit('.',1)[0]
    # We actually provide aligned and sequence-only
    # versions of the output
    out_align = basename + "_trimmed.afa"
    out_seq = basename + "_trimmed.fa"

    # Load sequence data into a data structure for internal use
    # (build_seqdict fills the dict passed in; it is read back below)
    seqdict = {}
    files.build_seqdict(infile,seqdict)

    # These are used as regular-expression patterns against each header.
    # NOTE(review): they do not depend on infile and could be computed
    # once before the loop.
    rna_string = str(args.RNA)
    gen_string = str(args.genomic)
    # Sequences must be in upper case
    # If several headers match the same pattern, the last one iterated wins.
    for k in seqdict.keys():
        if re.search(rna_string,k):
            # Since we are writing these data back out again
            # we want to keep track of sequence headers
            rna_header = k
            rna_seq = seqdict.get(k).upper()
        elif re.search(gen_string,k):
            gen_header = k
            gen_seq = seqdict.get(k).upper()
        else:
            # A header matching neither pattern is recorded as the reference
            ref_header = k