Example #1
def checkBIN(a, b):
    global _dir
    e = os.system("{} 1>{}/tmp/binaries 2>{}/tmp/binaries2".format(
        a, _dir, _dir))

    IN = open_or_die("{}/tmp/binaries".format(_dir), 'rb',
                     'can not open {}/tmp/binaries'.format(_dir))
    found = 1

    while True:
        line = IN.readline()
        if not line:
            break

        if re.search(b, line):
            found = 0

    IN.close()

    if found:
        IN = open_or_die('{}/tmp/binaries2'.format(_dir), 'rb',
                         'can not open {}/tmp/binaries2'.format(_dir))
        while True:
            line = IN.readline()
            if not line:
                break

            if re.search(b, line):
                found = 0

    IN.close()
    return found
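# Usage sketch (hypothetical command and pattern; assumes die() from the port
# helpers). checkBIN keeps the Perl-style convention of returning 0 when the
# pattern is found in the command output and 1 otherwise, so callers treat a
# non-zero result as "binary missing":
#
#     if checkBIN('bowtie --version', 'version'):
#         die('bowtie binary not found\n')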
Example #2
def cat_to(file_1, file_2):
    OUT = open_or_die(file_2, 'a', 'cannot print to {}'.format(file_2))
    IN = open_or_die(file_1, 'rb', 'cannot read from {}'.format(file_1))

    while True:
        line = IN.readline()
        if not line:
            break
        OUT.write(line)

    IN.close()
    OUT.close()
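# Design note: cat_to mirrors the Perl idiom of appending one file onto another
# line by line. A minimal standard-library alternative (a sketch, not part of
# the original code) would stream the copy with shutil.copyfileobj:
#
#     import shutil
#
#     def cat_to_alt(file_1, file_2):
#         # append file_1 onto file_2 in one buffered pass
#         with open(file_1, 'rb') as src, open(file_2, 'ab') as dst:
#             shutil.copyfileobj(src, dst)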
def make_files(file_output, coord_file, thres):
    TMP1 = open_or_die('{}_all'.format(file_output), 'rb',
                       'can not open {}'.format(file_output))

    PRES = open_or_die(file_output, 'w+',
                       'can not create {}'.format(file_output))
    counter = 0

    while True:
        l = TMP1.readline()
        if not l:
            break

        l = l.strip()
        if re.match('^>', l):
            line = l.split('_xyz123_')
            if int(line[3]) == thres:  # last value
                counter += 1
                seq = TMP1.readline()
                PRES.write('{}_{}\n{}'.format(line[0], counter, seq))

    TMP1.close()
    PRES.close()

    COORD = open_or_die(coord_file, 'w+',
                        'can not create {}'.format(coord_file))
    TMP2 = open_or_die('{}_all'.format(coord_file), 'rb',
                       'can not open {}'.format(coord_file))
    counter = 0

    while True:
        l = TMP2.readline()
        if not l:
            break

        l = l.strip()

        if re.match('^>', l):
            # split() => split by space, \t, \n
            line = esplit(l)
            line2 = line[0].split('_xyz123_')
            if int(line2[3]) == thres:
                counter += 1
                COORD.write('{}_{}\t{}\t{}\t{}\n'.format(
                    line2[0], counter, line[1], line[2], line[3]))

    TMP2.close()
    COORD.close()
def parse_file_solexa(options, file_solexa):
    global running

    FILE = open_or_die(file_solexa, 'rb',
                       'can not open file: {}'.format(file_solexa))

    while True:
        l = FILE.readline()
        if not l:
            break

        if re.search(r'^\s*$', l):
            continue

        fields = re.split(r'\s+', l)
        seq = ''

        if options.get('-a') == '':
            seq = fields[8]
        else:
            seq = fields[4]

        seq = seq.strip()
        seq = re.sub(r'\.', 'N', seq)
        print('>seq_{}'.format(running))
        print(seq)

        running += 1

    FILE.close()
Example #5
def parse_file_mrd(file_out, _hash):
    global hash_sig
    score = None
    refs = []
    FILE = open_or_die(file_out, 'rb', 'can not open {}\n'.format(file_out))
    while True:
        line = FILE.readline()
        if not line:
            break

        m = re.match(r'^score\s+total\s+(\S+)', line)
        if m:
            m = m.groups()
            score = float(m[0])
        else:
            m2 = re.match(r'^(\S+)', line)

            defined = False
            if m2:
                try:
                    hash_sig[m2.groups()[0]]
                    defined = True
                except KeyError:
                    pass

            if m2 and defined:
                refs.append(m2.groups()[0])
            elif re.search(r'^>', line) and score is not None:
                resolve_entry_file_mrd(score, refs, _hash)
                score = None  # ATTR: ($$score,@$refs)=();

    resolve_entry_file_mrd(score, refs, _hash)
Example #6
def printUsedParameters(options):
    global _dir, ltime, command_line, file_reads, file_genome, file_reads_vs_genome
    fname = '{}/run_{}_parameters'.format(_dir, ltime)
    OUT = open_or_die(fname, 'w+', 'can not open {}\n'.format(fname))
    OUT.write("Start: {}\n".format(ltime))
    OUT.write("Script\t{}\n".format(sys.argv[0]))
    OUT.write("args {}\n".format(command_line))
    OUT.write("dir_with_tmp_files\tdir_moR_{}\n".format(ltime))

    d = cwd()

    OUT.write("dir\t{}\n".format(d))
    OUT.write("file_reads\t{}\n".format(file_reads))
    OUT.write("file_genome\t{}\n".format(file_genome))
    OUT.write("file_reads_vs_genome\t{}\n".format(file_reads_vs_genome))
    OUT.write("file_mature_ref_this_species\t{}\n".format(
        file_mature_ref_this_species))
    OUT.write("file_mature_ref_other_species\t{}\n".format(
        file_mature_ref_other_species))

    if options.get('-a'):
        OUT.write("option -a =\t{}\n".format(options.get('-a')))
    if options.get('-b'):
        OUT.write("option -b =\t{}\n".format(options.get('-b')))
    if options.get('-c') == '':
        OUT.write("option -c =\t{}\n".format(options.get('-c')))
    if options.get('-t'):
        OUT.write("option -t =\t{}\n".format(options.get('-t')))
    if options.get('-v') == '':
        OUT.write("option -v =\tused\n")


#    if($options{'q'}){print OUT "option{q} =\t$options{'q'}\n";}

    OUT.close()
def parse_genome_and_excise(TMP1, TMP2, file_fasta):
    FASTA = open_or_die(file_fasta, 'rb', 'can not open {}'.format(file_fasta))

    while True:
        line = FASTA.readline()
        if not line:
            break

        line = line.strip()

        m = re.match(r'^>(\S+)(.*)', line)
        if m:
            _id = m.groups()[0]
            desc = m.groups()[1]
            sequence = ''
            while True:
                ll = FASTA.readline()
                if not ll:
                    break
                ll = ll.strip()

                mm = re.match(r'^>(\S+)(.*)', ll)
                if mm:
                    excise(TMP1, TMP2, _id, sequence)
                    _id = mm.groups()[0]
                    desc = mm.groups()[1]
                    sequence = ''
                    continue
                sequence += ll

    excise(TMP1, TMP2, _id, sequence)
    FASTA.close()
Example #8
def parse_fasta(file_fasta, _hash):
    FASTA = open_or_die(file_fasta, 'rb',
                        'can not open file {}'.format(file_fasta))
    while True:
        l = FASTA.readline()
        if not l:
            break

        l = l.strip()

        m = re.match(r'^>(\S+)(.*)', l)
        if m:
            m = m.groups()
            _id = m[0]
            desc = m[1]
            sequence = ''

            while True:
                ll = FASTA.readline()
                if not ll:
                    break

                ll = ll.strip()

                mm = re.match(r'^>(\S+)(.*)', ll)
                if mm:
                    mm = mm.groups()
                    defined = False
                    try:
                        _hash[_id]
                        defined = True
                    except KeyError:
                        pass

                    if (defined and options.get('-a') is None) or (
                            not defined and options.get('-a') == ''):
                        print('>{}{}\n{}'.format(_id, desc, sequence))

                    _id = mm[0]
                    desc = mm[1]
                    sequence = ''
                    continue

                sequence += ll

    defined = False
    try:
        _hash[_id]
        defined = True
    except KeyError:
        pass
    if (defined
            and options.get('-a') is None) or (not defined
                                               and options.get('-a') == ''):
        print('>{}{}\n{}'.format(_id, desc, sequence))

    FASTA.close()
Example #9
def cat_files(file_1, file_2, file_out):
    OUT = open_or_die(file_out, 'w+', 'cannot print to {}\n'.format(file_out))
    IN_1 = open_or_die(file_1, 'rb', 'cannot read from {}\n'.format(file_1))
    while True:
        line = IN_1.readline()
        if not line:
            break
        OUT.write(line)
    IN_1.close()

    IN_2 = open_or_die(file_2, 'rb', 'cannot read from {}\n'.format(file_2))
    while True:
        line = IN_2.readline()
        if not line:
            break
        OUT.write(line)

    IN_2.close()
    OUT.close()
Example #10
def presort(_file):
    IK = open_or_die(_file, 'rb', 'no arf file given\n')
    IKT = open_or_die('{}.tmp'.format(_file), 'w+',
                      'tmp file could not be created\n')
    index = {}
    count = 0
    l = []

    while True:
        line = IK.readline()
        if not line:
            break
        l = esplit(line)
        if l[5] not in index.keys():
            count += 1
            index[l[5]] = count

        IKT.write('{}\t{}'.format(index[l[5]], line))

    IK.close()
    IKT.close()
Example #11
def get_longest_id(f):
    l = 0
    IN = open_or_die(f, 'rb', 'No file given for checking\n')
    while True:
        line = IN.readline()
        if not line:
            break
        m = re.findall(r'>(\S+)', line)
        if m:
            if len(m[0]) > l:
                l = len(m[0])

    IN.close()
    return l
Example #12
def parse_file_ids(file_ids, _hash):

    FILE = open_or_die(file_ids, 'rb', 'can not open {}'.format(file_ids))
    while True:
        line = FILE.readline()
        if not line:
            break

        m = re.match(r'^(\S+)', line)
        if m:
            _id = m.groups()[0]
            _hash[_id] = 1

    FILE.close()
def parse_file_ids(_file, _hash):
    # read id file into hash
    if options.get('-k') == '':
        print_stderr('reading id file into memory\n')

    FILE = open_or_die(_file, 'rb', 'can not open {}\n'.format(_file))
    while True:
        line = FILE.readline()
        if not line:
            break

        m = re.match(r'^(\S+)', line)
        if m:
            _id = m.groups()[0]
            _hash[_id] = 1

    FILE.close()
Example #14
def parse_fasta(file_fasta):
    FASTA = open_or_die(file_fasta, 'rb',
                        'can not open {}\n'.format(file_fasta))
    while True:
        line = FASTA.readline()
        if not line:
            break

        m = re.match(r'^(>\S+)', line)
        if m:
            pprint('{}\n'.format(m.groups()[0]))
        else:
            pprint('{}'.format(re.sub('U', 'T', line).upper()))

    FASTA.close()
    return
def parse_file_command_line(file_command_line, file_structure, _dir):
    FILE = open_or_die(file_command_line, 'rb',
                       'can not open {}'.format(file_command_line))
    while True:
        line = FILE.readline()
        if not line:
            break

        if re.search(r'(\S+)', line):
            line = line.strip()
            line = re.sub(file_structure,
                          '{}/precursors_permuted.str'.format(_dir),
                          line,
                          count=1)
            line = re.sub(r'>.+', '', line, count=1)
            return line

    die('{} is empty\n'.format(file_command_line))
def parse_file_arf(file_arf):
    global count_lines, hash_pos

    lines = int(os.popen('cat {} | wc -l'.format(file_arf)).read().strip())

    if options.get('-b') == '':
        print_stderr(
            'reading the mapping file into memory, total lines={}\n'.format(
                lines))

    FILENAME = open_or_die(file_arf, 'rb',
                           'Could not open file {}'.format(file_arf))

    while True:
        line = FILENAME.readline()
        if not line:
            break

        m = re.match(
            r'^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)',
            line)
        if m:
            m = m.groups()
            query = m[0]
            query_map_lng = int(m[1])
            query_beg = int(m[2])
            query_end = int(m[3])
            query_seq = m[4]
            db = m[5]
            db_map_lng = int(m[6])
            db_beg = int(m[7])
            db_end = int(m[8])
            db_seq = m[9]
            strand = m[10]
            edits = int(m[11])
            edit_string = m[12]

            freq = find_freq(query)
            # read into position hash
            insertfeature(db, strand, db_beg, db_end, freq)

            count_lines += 1

    FILENAME.close()
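# Column layout assumed by the 13-group regex above (one ARF mapping per line,
# whitespace-separated); the sample values below are illustrative only:
#
#     query  query_map_lng  query_beg  query_end  query_seq
#     db     db_map_lng     db_beg     db_end     db_seq
#     strand edits          edit_string
#
#     e.g.  seq_1_x5  22  1  22  ACGT...  chr1  22  1000  1021  ACGT...  +  0  mmmm...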
Example #17
def handle_config_file(file_reads, MAP, options):
    FILE = open_or_die(
        file_reads, 'rb', 'can not open {}\n'.format(file_reads))
    while True:
        l = FILE.readline()
        if not l:
            break

        m = re.match(r'(^\S+)\s+(\S+)\s*.*$', l)
        if m:
            m = m.groups()
            file_reads = m[0]
            prefix = m[1]

            if (len(file_reads) < len(prefix)):
                file_reads = m[1]
                prefix = m[0]

            test_prefix(prefix)

            MAP.write("\nhandling file '{}' with prefix '{}'\n".format(
                file_reads, prefix))

            # check if files in config file are in accordance with option
            # specified
            if options.get('-a') == '':
                check_file_format_and_option(file_reads, 'a')
            if options.get('-b') == '':
                check_file_format_and_option(file_reads, 'b')
            if options.get('-c') == '':
                check_file_format_and_option(file_reads, 'c')
            if options.get('-e') == '':
                check_file_format_and_option(file_reads, 'e')

            if options.get('-v') == '':
                print_stderr("\nhandling file '{}' with prefix '{}'\n".format(
                    file_reads, prefix))

            handle_one_file(file_reads, prefix, MAP, options)

    FILE.close()
Example #18
def parse_file_sig(file_sig, hash_sig):
    global hash_ref
    FILE = open_or_die(file_sig, 'rb', 'can not open {}\n'.format(file_sig))
    while True:
        line = FILE.readline()
        if not line:
            break

        m = re.match(r'^(\S+)', line)

        # Test isdefined
        defined = False
        if m:
            try:
                hash_ref[m.groups()[0]]
                defined = True
            except KeyError:
                pass

        if m and defined:
            # dummy value
            hash_sig[m.groups()[0]] = -50
Example #19
def parse_file_mrd_permuted(infile, _hash, options):
    global hash_sig
    score, permutation, refs = None, None, []
    FILE = open_or_die(infile, 'rb', 'can not open {}\n'.format(infile))
    while True:
        line = FILE.readline()
        if not line:
            break

        m = re.match(r'^permutation\s+(\d+)', line)
        if m:
            permutation = m.groups()[0]
        else:
            mm = re.match(r'^score\s+total\s+(\S+)', line)
            if mm:
                score = float(mm.groups()[0])
            else:
                m3 = re.match(r'^(\S+)', line)

                defined = False
                if m3:
                    try:
                        hash_sig[m3.groups()[0]]
                        defined = True
                    except KeyError:
                        pass

                if m3 and defined:
                    refs.append(m3.groups()[0])
                elif re.search(r'^>', line) and score is not None:
                    resolve_entry_file_mrd_permuted(
                        score, permutation, refs, _hash, options)
                    # ATTR: ($$i, @$refs) = () in
                    # resolve_entry_file_mrd_permuted
                    score = None

    resolve_entry_file_mrd_permuted(score, permutation, refs, _hash, options)
    score = None
Example #20
def parse_file_ref(filename, hash_ref):
    _id, desc, seq = None, None, None

    FASTA = open_or_die(filename, 'rb', 'can not open {}\n'.format(filename))
    while True:
        l = FASTA.readline()
        if not l:
            break

        l = l.strip()

        m = re.match(r'^>(\S+)(.*)', l)
        if m:
            m = m.groups()
            _id = m[0]
            desc = m[1]
            seq = ''
            while True:
                ll = FASTA.readline()
                if not ll:
                    break

                ll = ll.strip()

                mm = re.match(r'^>(\S+)(.*)', ll)
                if mm:
                    mm = mm.groups()
                    hash_ref[_id] = seq
                    _id = mm[0]
                    desc = mm[1]
                    seq = ''
                    continue
                seq += ll

    hash_ref[_id] = seq
    FASTA.close()
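# Usage sketch (the file name, id, and sequence shown are illustrative only):
# parse_file_ref fills the supplied dict in place, mapping each FASTA id to its
# concatenated sequence.
#
#     hash_ref = {}
#     parse_file_ref('mature_ref.fa', hash_ref)
#     # hash_ref now maps e.g. 'seq_id_1' -> 'ACGUACGU...'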
Example #21
def parse_fasta(options, file_fasta):
    _id = ''
    seq = ''

    FASTA = open_or_die(file_fasta, 'rb',
                        'can not open file {}\n'.format(file_fasta))

    while True:
        l = FASTA.readline()
        if not l:
            break

        l = l.strip()

        m = re.match(r'^>(\S+)', l)
        if m:
            _id = m.groups()[0]
            seq = ''

            while True:
                ll = FASTA.readline()
                if not ll:
                    break

                ll = ll.strip()

                mm = re.match(r'^>(\S+)', ll)
                if mm:
                    resolve(options, _id, seq)
                    _id = mm.groups()[0]
                    seq = ''
                    continue
                seq += ll

    resolve(options, _id, seq)
    FASTA.close()
def parse_file_struct(file_struct):
    global db_old
    FILE_STRUCT = open_or_die(file_struct, 'rb',
                              'can not open file {}\n'.format(file_struct))
    while True:
        line = FILE_STRUCT.readline()
        if not line:
            break

        line = line.strip()

        m = re.match(r'^>(\S+)\s*(.*)', line)
        if m:
            m = m.groups()
            _id = m[0]
            desc = m[1]
            seq = ""
            struct = ""
            mfe = ""

            while True:
                line2 = FILE_STRUCT.readline()
                if not line2:
                    break

                line2 = line2.strip()
                mm = re.match(r'^>(\S+)\s*(.*)', line2)
                if mm:
                    hash_desc[_id] = desc
                    hash_seq[_id] = seq
                    hash_struct[_id] = struct
                    hash_mfe[_id] = mfe
                    _id = mm.groups()[0]
                    desc = mm.groups()[1]
                    seq = ""
                    struct = ""
                    mfe = ""
                    continue

                m3 = re.match(r'^\w', line2)
                if m3:
                    line2 = tr(line2, 'uU', 'tT')
                    seq += line2

                m3 = re.search(r'((\.|\(|\))+)', line2)
                if m3:
                    struct += m3.groups()[0]

                m3 = re.search(r'\((\s*-\d+\.\d+)\)', line2)
                if m3:
                    mfe = m3.groups()[0]

    hash_desc[_id] = desc
    hash_seq[_id] = seq
    hash_struct[_id] = struct
    hash_mfe[_id] = mfe

    # print('\n'.join(sorted(hash_struct.values())))
    # print('\n'.join(sorted(hash_desc.keys())))

    FILE_STRUCT.close()
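# Input sketch for parse_file_struct (illustrative, RNAfold-style output): each
# record is an id line, a sequence line (u/U translated to t/T), and a
# dot-bracket structure line whose trailing free energy "(-NN.NN)" is stored in
# hash_mfe.
#
#     >precursor_1 sample
#     uaaacaguauacagaaagccaucaaagc
#     .((((((...))))))............ (-18.40)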
Example #23
#!/usr/bin/env python
from __future__ import print_function

import re
import sys

from port import die, esplit, open_or_die

if __name__ == '__main__':
    if len(sys.argv) < 2:
        die('No csv file given for bed conversion\n')

    known, novel, _not, line, thres, score, strand, label, end = (
        None, None, None, None, None, None, None, None, None)

    IN = open_or_die(sys.argv[1], 'r', 'cannot open {}\n'.format(sys.argv[1]))
    while True:
        line = IN.readline()
        if not line:
            break

        if re.search(r'novel miRNAs predicted by moRNA Finder', line):
            novel = 1
            known = 0
            _not = 0
        elif re.search(r'mature miRBase miRNAs detected', line):
            novel = 0
            known = 1
            _not = 0
        else:
            l = esplit(line)
Example #24
def read_stats(options):
    _hash = {}
    count = 0
    k2 = {}
    IN = open_or_die(options.get('-s'), 'rb',
                     'No reads file in fasta format given\n')
    while True:
        line = IN.readline()
        if not line:
            break

        m = re.match(r'^>*((\S\S\S)\S+_x(\d+))', line)
        if m:
            m = m.groups()

            try:
                if _hash[m[0]]:
                    continue
            except KeyError:
                pass

            # ATTR: Performance issue below, use logic above
            # if m[0] in _hash.keys() and _hash[m[0]]:
            #     continue

            _hash[m[0]] = 1
            count += int(m[2])

            if m[1] not in k2.keys():
                k2[m[1]] = 0
            k2[m[1]] += int(m[2])
    IN.close()

    _hash2 = {}
    count2 = 0
    k22 = {}

    print_stderr('Mapping statistics\n')
    IN = open_or_die(options.get('-t'), 'rb', 'No mapping file given\n')
    while True:
        line = IN.readline()
        if not line:
            break
        m = re.match(r'^>*((\S\S\S)\S+_x(\d+))', line)
        if m:
            m = m.groups()
            if m[0] in _hash2.keys() and _hash2[m[0]]:
                continue
            _hash2[m[0]] = 1
            count2 += int(m[2])

            if m[1] not in k22.keys():
                k22[m[1]] = 0

            k22[m[1]] += int(m[2])
    IN.close()

    print_stderr('\n#desc\ttotal\tmapped\tunmapped\t%mapped\t%unmapped\n')
    print_stderr("total: {}\t{}\t{}\t".format(count, count2, count - count2))
    print_stderr("{0:.3f}\t{1:.3f}\n".format(
        count2 / float(count), 1 - (count2 / float(count))))

    for k in k2.keys():
        print_stderr('{}: {}\t{}\t{}\t'.format(
            k, k2[k], k22[k], k2[k] - k22[k]))
        print_stderr('{0:.3f}\t{1:.3f}\n'.format(
            float(k22[k]) / k2[k], 1 - (float(k22[k]) / k2[k])))
Example #25
def check_file_format_and_option(file_reads, aFormat):
    print_stderr('\n')
    warning = '''\n\n***** Please check if the option you used (option -{}) designates the correct format of the supplied reads file {} *****\n\n
[options]
-a              input file is seq.txt format
-b              input file is qseq.txt format
-c              input file is fasta format
-e              input file is fastq format
-d              input file is a config file (see moRNA Finder documentation).
                options -a, -b, -c or -e must be given with option -d.
'''.format(aFormat, file_reads)
    line = None
    if aFormat == 'a':
        i = 0
        IN = open_or_die(
            file_reads, 'rb', 'Cannot open file {} supplied by option -a\n'.format(file_reads))
        while True:
            l = IN.readline().strip()
            if not l:
                break
            i += 1
            line = esplit(l)
            # $#line != 4
            if len(line) != 5:
                die('The seq.txt file does not contain 5 columns. Please make sure to follow the _seq.txt file format conventions\n{}'.format(warning))

            if i == 4:
                break
        IN.close()
    elif aFormat == 'b':
        IN = open_or_die(
            file_reads, 'rb', 'Cannot open qseq.txt file {} supplied by option -b\n'.format(file_reads))
        i = 0
        mes = 'Please make sure your file is in accordance with the qseq.txt format specifications\n'
        while True:
            l = IN.readline().strip()
            if not l:
                break
            i += 1
            line = esplit(l)

            if len(line) != 11:
                die('The qseq.txt file does not contain 11 columns but {}. Please make sure to follow the qseq.txt file format conventions\n{}'.format(
                    len(line), warning))

            if not re.search(r'^\S+', line[9]):
                die('The sequence field in the qseq.txt file is invalid. Please make sure to follow the qseq.txt file format conventions\n{}'.format(warning))

            if i == 4:
                break
        IN.close()
    elif aFormat == 'c':
        IN = open_or_die(file_reads, 'rb',
                         'Cannot open FASTA file supplied by option -c\n')
        i = 0
        mes = 'Please make sure your file is in accordance with the fasta format specifications and does not contain whitespace in IDs or sequences'
        while True:
            l = IN.readline().strip()
            if not l:
                break
            i += 1
            if i == 1:
                if not re.search(r'^>\S+$', l):
                    die("First line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                        mes, warning))
            if i == 2:
                if not re.search(r'^\S+$', l):
                    die("Second line of FASTA reads file contains whitespace in sequence\n{}\n".format(
                        mes))
            if i == 3:
                if not re.search(r'^>\S+$', l):
                    die("Second ID line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                        mes, warning))
            if i == 4:
                if not re.search(r'^\S+$', l):
                    die("Secdond sequence line of FASTA reads file contains whitespace in sequence\n{}\n{}".format(
                        mes, warning))

            if i == 4:
                break
        IN.close()
    elif aFormat == 'e':
        IN = open_or_die(file_reads, 'rb',
                         'Cannot open FASTQ file supplied by option -e\n')
        i = 0
        mes = 'Please make sure your file is in accordance with the FASTQ format specifications'
        while True:
            l = IN.readline().strip()
            if not l:
                break
            i += 1
            if i == 1:
                if not re.search(r'^@\S+', l):
                    die("First line of FASTQ reads file is not in accordance with the fastq format specifications\n{}\n{}".format(
                        mes, warning))
            if i == 2:
                if not re.search(r'^\S+$', l):
                    die("Second line of FASTQ reads file contains whitespace in sequence\n{}\n{}".format(
                        mes, warning))
            if i == 3:
                if not re.search(r'^\+', l):
                    die("Third line of FASTQ reads file does not start with a '+' character.\n{}\n{}".format(mes, warning))
            if i == 4:
                if not re.search(r'^\S+$', l):
                    die("Fourth line of FASTQ reads file contains whitespace\n{}\n{}".format(
                        mes, warning))

            if i == 4:
                break
        IN.close()
def parse_file_arf(file_arf, options):
    global running, gscan, hash_edits
    FILE_ARF = open_or_die(file_arf, 'rb',
                           'can not open {}\n'.format(file_arf))
    while True:
        line = FILE_ARF.readline()
        if not line:
            break

        m = re.match(
            r'^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)',
            line)
        if m:
            m = m.groups()
            query = m[0]
            query_map_lng = int(m[1])
            query_beg = m[2]
            query_end = int(m[3])
            query_seq = m[4]
            db = m[5]
            db_map_lng = int(m[6])
            db_beg = m[7]
            db_end = int(m[8])
            db_seq = m[9]
            strand = m[10]
            edits = int(m[11])
            edit_string = m[12]

            running += 1
            if options.get('-j') == '':
                (query_map_lng, query_end, query_seq, db_map_lng, db_end,
                 db_seq, edits, edit_string) = remove_trailing_nts(
                     query_map_lng, query_end, query_seq, db_map_lng, db_end,
                     db_seq, edits, edit_string)

            if '-a' in options.keys() and int(options.get('-a')) < edits:
                continue

            if options.get('-b') and query_map_lng < int(options.get('-b')):
                continue

            if options.get('-c') and int(options.get('-c')) < query_map_lng:
                continue

            if options.get('-d') and query not in hash_queries_incl.keys():
                continue

            if options.get('-e') and query in hash_queries_excl.keys():
                continue

            if options.get('-f') and db not in hash_dbs_incl.keys():
                continue

            if options.get('-g') and db in hash_dbs_excl.keys():
                continue

            if not (options.get('-h') == '' or options.get('-i')):
                pprint('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.
                       format(query, query_map_lng, query_beg, query_end,
                              query_seq, db, db_map_lng, db_beg, db_end,
                              db_seq, strand, edits, edit_string))
                continue

            if gscan:
                create_hash_key_chain(hash_edits, 0, query, edits)
                hash_edits[query][edits] += 1
            else:
                evaluation = evaluate_query(query, edits, options)
                if evaluation:
                    pprint(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".
                        format(query, query_map_lng, query_beg, query_end,
                               query_seq, db, db_map_lng, db_beg, db_end,
                               db_seq, strand, edits, edit_string))
    file_output = args.file_output
    coord_file = args.coord_file
    pres_max = args.pres_max

    opts, argss = getopt.getopt(sys.argv[6:], 'b')
    options = dict(opts)

    if not re.search(r'^[-]*\d+', pres_max):
        print_stderr('{} is not an integer number\n'.format(pres_max))
        sys.exit(-1)

    for z in range(1, upper_bound):
        dblimit[z] = 0
        thres_counts[z] = 0

    TMP1 = open_or_die('{}_all'.format(file_output), 'w+',
                       'cannot create file {}'.format(file_output))

    TMP2 = open_or_die('{}_all'.format(coord_file), 'w+',
                       'cannot create file {}'.format(coord_file))

    if options.get('-b') == '':
        print_stderr('finding lengths of genome contigs\n')

    parse_file_arf(file_arf)

    if options.get('-b') == '':
        print_stderr(
            'reading the genome into memory and excising potential precursors\n'
        )

    parse_genome_and_excise(TMP1, TMP2, file_fasta)
Example #28
if __name__ == '__main__':
    # create a log file for the mapper.py
    # the latest run of mapper will be on top of the log file
    #
    #
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(-1)

    if os.path.exists('mapper.log'):
        os.system('mv mapper.log mapper.log_bak')
    else:
        os.system('touch mapper.log_bak')

    MAP = open_or_die('mapper.log_tmp', 'w+',
                      'could not create mapper.log_tmp\n')
    # cdir = os.path.dirname(os.path.realpath(__file__))
    cdir = os.path.abspath('.')

    MAP.write('current dir:\t{}\n'.format(cdir))
    MAP.write('mapper command:\t{} {}\n'.format(
        sys.argv[0], ' '.join(sys.argv[1:])))

    parser = argparse.ArgumentParser(usage=usage)
    parser.add_argument('input_file_reads', help='input file')
    args = parser.parse_args(sys.argv[1:2])

    file_reads = args.input_file_reads

    if not os.path.exists(file_reads):
        print('No config or reads file could be found')
                print(i)
            except KeyError:
                pass

            while defined and rhash[rn]:
                rn = (2 * int(random.randrange(lines / 2)))
                defined = False
                try:
                    rhash[rn]
                    defined = True
                except KeyError:
                    pass

            rhash[rn] = 1

        IN = open_or_die(in_file, 'rb', 'can not open {}'.format(in_file))
        while True:
            l = IN.readline().strip()
            if not l:
                break

            m = re.match(r'^\>(.+)$', l)
            if m:
                m = m.groups()
                counter += 1
                _id = m[0]
                if re.search(r'\s+', _id):
                    die('Error in line {}: The identifier\n {}\n contains white spaces\n\nPlease make sure that none of the identifiers contain whitepaces.\nYou could run remove_white_space_in_id.py {} > newfile\nThis will remove everything from the id line after the first whitespace'.format(
                        Nicenumber(counter),
                        l,
                        in_file,
def parse_file_arf(file_arf):
    '''
    read through the signature blastparsed file, fills up a hash with information on queries
    (deep sequences) mapping to the current db (potential precursor) and resolve each
    potential precursor in turn
    '''
    global db_old, hash_query
    FILENAME = open_or_die(file_arf, 'rb',
                           'could not open file {}\n'.format(file_arf))
    while True:
        line = FILENAME.readline()
        if not line:
            break

        m = re.match(
            r'^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)',
            line)
        if m:
            m = m.groups()

            query = m[0]
            query_lng = int(m[1])
            query_beg = int(m[2])
            query_end = int(m[3])
            query_seq = m[4]
            db = m[5]
            db_lng = int(m[6])
            db_beg = int(m[7])
            db_end = int(m[8])
            db_seq = m[9]
            strand = m[10]
            edits = int(m[11])
            edit_string = m[12]

            # only reads that map sense to the potential precursor are
            # considered
            if strand == "-":
                continue

            # if the new line concerns a new db (potential precursor) then the
            # old db must be resolved
            if db_old and db_old != db:
                # print(db, db_old)
                resolve_potential_precursor()

            # resolve the number of reads that the deep sequence represents
            freq = find_freq(query)

            # read information of the query (deep sequence) into hash
            create_hash_key_chain(hash_query, db_beg, query, 'db_beg')
            create_hash_key_chain(hash_query, db_end, query, 'db_end')
            create_hash_key_chain(hash_query, strand, query, 'strand')
            create_hash_key_chain(hash_query, freq, query, 'freq')

            hash_query[query]["db_beg"] = db_beg
            hash_query[query]["db_end"] = db_end
            hash_query[query]["strand"] = strand
            hash_query[query]["freq"] = freq

            db_old = db

    resolve_potential_precursor()