Example #1
0
def run_mapper(
    reads_file,
    clip_seq,
    fa_prefix,
    collapsed_file,
    mapping_file,
):
    """Run ``mapper.py`` through the shell and report a failure via ``die``.

    The five arguments are substituted, in order, after the script name and
    after the ``-k``, ``-p``, ``-s`` and ``-t`` switches. The assembled
    command is echoed with ``print_stderr`` before it is executed.
    """
    template = 'mapper.py {} -c -j -k {} -l 18 -m -p {} -s {} -t {} -v'
    command = template.format(
        reads_file, clip_seq, fa_prefix, collapsed_file, mapping_file)
    print_stderr(command, '\n')

    # os.system returns a non-zero status when the command did not succeed
    if os.system(command) != 0:
        die('Run mapper.py failed.\n')
Example #2
0
def compute_randfold():
    """Run the three shell steps that produce ``precursors_for_randfold.rand``.

    Returns immediately when ``options['-c']`` is the empty string. Otherwise
    it runs ``select_for_randfold.py``, ``fastaselect.py`` and ``randfold`` in
    that order on files under ``dir_tmp``, echoing each command to stderr and
    bracketing each execution with ``start()`` / ``end()``. Exit statuses are
    not inspected.
    """
    global options, dir_tmp
    if options.get('-c') == '':
        return

    # compute randfold p-values for the subset of precursors which are
    # plausible Dicer substrates
    pprint("#computing randfold p-values\n")
    print_stderr("#computing randfold p-values\n")

    three_dirs = (dir_tmp,) * 3
    two_dirs = (dir_tmp,) * 2

    # step 1: pick the precursor ids to evaluate
    select_cmd = "select_for_randfold.py {}/signature.arf {}/precursors.str > {}/precursors_for_randfold.ids\n\n".format(
        *three_dirs)
    print_stderr(select_cmd)
    start()
    os.system(select_cmd)
    end()

    # step 2: extract the matching fasta records
    start()
    fasta_cmd = "fastaselect.py {}/precursors.fa {}/precursors_for_randfold.ids > {}/precursors_for_randfold.fa\n\n".format(
        *three_dirs)
    print_stderr(fasta_cmd)
    os.system(fasta_cmd)
    end()

    # step 3: run randfold itself
    start()
    randfold_cmd = "randfold -s {}/precursors_for_randfold.fa 99 > {}/precursors_for_randfold.rand\n\n".format(
        *two_dirs)
    print_stderr(randfold_cmd)
    os.system(randfold_cmd)
    end()
Example #3
0
def make_dir_tmp():
    """Create ``moR_runs/run_<ltime>`` plus its ``tmp`` subdirectory.

    The global ``_dir`` is set to the run directory and ``dir_tmp`` to the
    ``tmp`` directory inside it. ``os.mkdir`` raises if either of the two
    run-specific directories already exists.
    """
    global _dir, ltime, dir_tmp

    # the shared parent directory is created only on first use
    runs_root = 'moR_runs'
    if not os.path.isdir(runs_root):
        os.mkdir(runs_root)

    _dir = "moR_runs/run_{}".format(ltime)
    print_stderr("mkdir {}\n\n".format(_dir))
    os.mkdir(_dir)

    dir_tmp = "{}/tmp".format(_dir)
    os.mkdir(dir_tmp)
def parse_file_arf(file_arf):
    """Read an ARF mapping file line by line and register each mapping.

    Every line that consists of at least 13 whitespace-separated columns is
    treated as one mapping: the read frequency is looked up from the query
    id and the mapping is passed to ``insertfeature`` together with the
    subject id, strand and subject begin/end (converted to ``int``). The
    global ``count_lines`` is incremented once per accepted line. Lines that
    do not match are skipped.

    When ``options['-b']`` is the empty string the total line count is
    reported on stderr first.

    Exits the process with status -1 if the file cannot be opened.
    """
    global count_lines

    if options.get('-b') == '':
        # the line count is only needed for the progress message
        lines = int(
            os.popen('cat {} | wc -l'.format(file_arf)).read().strip())
        print_stderr(
            'reading the mapping file into memory, total lines={}\n'.format(
                lines))

    try:
        FILENAME = open(file_arf, 'rb')
    except IOError:
        print('Could not open file {}'.format(file_arf))
        sys.exit(-1)

    arf_line = re.compile(
        r'^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)')

    try:
        # iterate line by line; reading the whole file in one call would
        # let the anchored pattern see only the first record
        for line in FILENAME:
            m = arf_line.match(line)
            if not m:
                continue

            (query, _query_map_lng, _query_beg, _query_end, _query_seq,
             db, _db_map_lng, db_beg, db_end, _db_seq,
             strand, _edits, _edit_string) = m.groups()

            # NOTE(review): a sibling implementation calls `find_freq`; the
            # name `find_req` is kept because its definition is not visible
            # here -- confirm which one the module actually defines.
            freq = find_req(query)

            insertfeature(db, strand, int(db_beg), int(db_end), freq)
            count_lines += 1
    finally:
        FILENAME.close()
Example #5
0
def map_reads(file_reads_latest, MAP, options):
    """Map reads with bowtie, convert to ARF and trim the mappings.

    Three shell commands are run in sequence (bowtie,
    ``convert_bowtie_output.py``, ``parse_mappings.py -j``); each command
    line is written to ``MAP`` before it is executed. Progress messages are
    also sent to stderr when ``options['-v']`` is the empty string, and in
    that case the final file is additionally handed to ``cat_to`` together
    with ``options['-t']``.

    Returns the path of the trimmed ARF file inside ``_dir``.
    """
    global mismatches_seed, threads, orig_file_reads

    def log_and_run(command):
        # record the command, then execute it without the trailing newlines
        MAP.write(command)
        return os.system(command.strip())

    # map reads to genome
    MAP.write('mapping reads to genome index\n')
    verbose = options.get('-v') == ''
    if verbose:
        print_stderr('mapping reads to genome index\n')

    genome_index = options.get('-p')
    # -m limit handed to bowtie; defaults to 5 unless -r is supplied
    mapping_loc = options.get('-r') if '-r' in options.keys() else 5

    log_and_run(
        'bowtie -p {} -f -n {} -e 80 -l 18 -a -m {} --best --strata {}  --al {}/{}_mapped --un {}/{}_not_mapped  {} {}/mappings.bwt 2>bowtie.log\n\n'.format(
            threads,
            mismatches_seed,
            mapping_loc,
            genome_index,
            _dir,
            orig_file_reads,
            _dir,
            orig_file_reads,
            file_reads_latest,
            _dir
        ))
    bowtie_output = '{}/mappings.bwt'.format(_dir)

    log_and_run('convert_bowtie_output.py {} > {}/mappings.arf\n'.format(
        bowtie_output, _dir))
    arf_output = '{}/mappings.arf'.format(_dir)

    # trim unmapped nts in the 3' end
    MAP.write("trimming unmapped nts in the 3' ends\n")
    if verbose:
        print_stderr("trimming unmapped nts in the 3' ends\n")

    log_and_run('parse_mappings.py {} -j > {}/mappings_trim.arf\n\n'.format(
        arf_output, _dir))
    trimmed_output = '{}/mappings_trim.arf'.format(_dir)

    if verbose:
        cat_to(trimmed_output, options.get('-t'))

    return trimmed_output
def parse_file_ids(_file, _hash):
    """Load the ids listed in ``_file`` into ``_hash``.

    The first whitespace-delimited token of every line that starts with a
    non-whitespace character becomes a key of ``_hash`` with value ``1``.
    A progress message is written to stderr when ``options['-k']`` is the
    empty string. The file is opened through ``open_or_die`` and is always
    closed again, even if processing a line raises.
    """
    # read id file into hash
    if options.get('-k') == '':
        print_stderr('reading id file into memory\n')

    FILE = open_or_die(_file, 'rb', 'can not open {}\n'.format(_file))
    try:
        for line in FILE:
            m = re.match(r'^(\S+)', line)
            if m:
                _hash[m.group(1)] = 1
    finally:
        FILE.close()
def scan(file_arf, options):
    """Run the scanning pass over ``file_arf`` and resolve best mappings.

    ``gscan`` is set to 1 while ``parse_file_arf`` runs and back to 0
    afterwards; ``fill_hash`` is called next and finally ``running`` is
    reset to 0. Progress is reported on stderr when ``options['-k']`` is the
    empty string.
    """
    global gscan, running

    def verbose():
        return options.get('-k') == ''

    if verbose():
        total = os.popen('cat {} | wc -l'.format(file_arf)).read().strip()
        print_stderr('scanning mappings, total={}\n'.format(total))

    gscan = 1
    parse_file_arf(file_arf, options)
    gscan = 0

    if verbose():
        print_stderr('resolving best mappings for each read\n')
    fill_hash()

    running = 0
Example #8
0
def core_algorithm():
    '''
    Run ``core_algorithm.py`` and verify that it produced output.

    The command is assembled from the signature and structure files in
    ``dir_tmp``; ``-s`` is added when an other-species reference is
    configured and ``-y`` unless ``options['-c']`` is the empty string.
    Output goes to ``<_dir>/output.mrd``. If ``file_s`` reports that file as
    empty, the command is rerun with ``-t`` into a debug file and ``die`` is
    called.
    '''
    global _dir, dir_tmp, file_mature_ref_other_species, ltime
    pprint("#running moRNA Finder core algorithm\n")
    print_stderr("#running moRNA Finder core algorithm\n")

    # 40 is used for -l unless a this-species reference file is configured
    no_this_species_ref = re.search(
        'none', file_mature_ref_this_species, re.IGNORECASE)
    if no_this_species_ref:
        longest_id = 40
    else:
        longest_id = get_longest_id("{}/{}".format(
            dir_tmp, file_mature_ref_this_species))

    start()

    if re.search('none', file_mature_ref_other_species, re.IGNORECASE):
        base_cmd = "core_algorithm.py {}/signature.arf {}/precursors.str -v -50 -l {}".format(
            dir_tmp, dir_tmp, longest_id)
    else:
        base_cmd = "core_algorithm.py {}/signature.arf {}/precursors.str -s {}/{} -v -50 -l {}".format(
            dir_tmp, dir_tmp, dir_tmp, file_mature_ref_other_species,
            longest_id)

    if options.get('-c') != '':
        base_cmd += " -y {}/precursors_for_randfold.rand".format(dir_tmp)

    full_cmd = "{} > {}/output.mrd\n".format(base_cmd, _dir)
    print_stderr(full_cmd)
    os.system(full_cmd)
    if options.get('-E'):
        os.system('{} -t > {}/error.output.mrd'.format(base_cmd, _dir))

    end()

    # check if file is empty
    fname = "{}/output.mrd".format(_dir)
    if file_s(fname):
        return

    print_stderr("Error:\n\tFile {} is empty\n\n".format(fname))
    print_stderr(
        "Now running core_algorithm.py with option -t to see why all precursors were discarded\n"
    )
    os.system('{} -t > error.output.mrd_{}'.format(base_cmd, ltime))
    print_stderr(
        "The debug file is called error.output.mrd_{}\n".format(ltime))
    die("\nExiting now\n\n")
Example #9
0
def parse_mappings():
    """Filter the genome mappings with ``parse_mappings.py``.

    Runs the script with ``-a 0 -b 18 -c 25 -i 5`` on
    ``file_reads_vs_genome`` and redirects its output to
    ``<dir_tmp>/<parsed_arf>_parsed.arf``. Always returns 0.
    """
    global file_reads_vs_genome, parsed_arf, dir_tmp
    # parse mappings to retain only perfect mappings of reads 18 nt <= length
    # <= 25 nt that map perfectly to five loci or less
    for emit in (pprint, print_stderr):
        emit("#parsing genome mappings\n")

    command = "parse_mappings.py {} -a 0 -b 18 -c 25 -i 5 > {}/{}_parsed.arf\n\n".format(
        file_reads_vs_genome, dir_tmp, parsed_arf)
    print_stderr(command)

    start()
    # reading the pipe to the end waits for the command to finish
    os.popen(command).read()
    end()

    return 0
def parse_file_arf(file_arf):
    """Read an ARF mapping file and register every mapping as a feature.

    Every line that consists of at least 13 whitespace-separated columns is
    treated as one mapping: the read frequency is obtained from
    ``find_freq`` for the query id and the mapping is passed to
    ``insertfeature`` with the subject id, strand and subject begin/end
    (converted to ``int``). The global ``count_lines`` is incremented once
    per accepted line; non-matching lines are skipped.

    When ``options['-b']`` is the empty string the total number of lines is
    reported on stderr before parsing. The file is opened with
    ``open_or_die`` and always closed again.
    """
    global count_lines, hash_pos

    if options.get('-b') == '':
        # the line count is only needed for the progress message
        lines = int(
            os.popen('cat {} | wc -l'.format(file_arf)).read().strip())
        print_stderr(
            'reading the mapping file into memory, total lines={}\n'.
            format(lines))

    FILENAME = open_or_die(file_arf, 'rb',
                           'Could not open file {}'.format(file_arf))

    arf_line = re.compile(
        r'^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)')

    try:
        for line in FILENAME:
            m = arf_line.match(line)
            if not m:
                continue

            (query, _query_map_lng, _query_beg, _query_end, _query_seq,
             db, _db_map_lng, db_beg, db_end, _db_seq,
             strand, _edits, _edit_string) = m.groups()

            freq = find_freq(query)
            # read into position hash
            insertfeature(db, strand, int(db_beg), int(db_end), freq)

            count_lines += 1
    finally:
        FILENAME.close()
def excise_struct(struct, beg, end, strand):
    """Cut the 1-indexed, inclusive range ``beg..end`` out of ``struct``.

    Exits the process (status 0) after an stderr message when ``beg`` is
    greater than ``end``. Returns ``0`` when ``beg`` lies beyond the end of
    ``struct``; otherwise returns the result of ``substr``. ``strand`` is
    accepted but not used.
    """
    global db_old

    # begin can be equal to end if only one nucleotide is excised
    if beg > end:
        print_stderr(
            'begin can not be greater than end for {}\n'.format(db_old))
        sys.exit(0)

    # rarely, permuted combinations of signature and structure cause out of
    # bound excision errors (appr. once every two thousand combinations)
    if beg > len(struct):
        return 0

    # the blast parsed format is 1-indexed, substr is 0-indexed
    offset = beg - 1
    length = end - beg + 1
    return substr(struct, offset, length)
Example #12
0
def resolve_entry_file_mrd_permuted(score, permutation, refs, _hash, options):
    """Count one permuted-control entry in the nested counter ``_hash``.

    The score is floored to an integer bucket. The counter
    ``_hash['total'][permutation][bucket]`` is incremented, followed by the
    same slot under ``'known'`` when ``refs`` is non-empty or ``'novel'``
    otherwise. ``refs`` is emptied in place before returning.

    Exits the process (status 0) after an stderr message when
    ``permutation`` is ``None``.
    """
    if permutation is None:
        message = 'The {} file is not properly formatted.\nMaybe it does not contain the lines with \"permutation int\"?\n'.format(
            options.get('-a')
        )
        print_stderr(message)
        sys.exit(0)

    bucket = int(math.floor(score))
    second_category = 'known' if refs else 'novel'

    for category in ('total', second_category):
        create_hash_key_chain(_hash, 0, category, permutation, bucket)
        _hash[category][permutation][bucket] += 1

    # drop every collected reference so the caller's list starts empty again
    for _ in range(len(refs)):
        refs.pop()
Example #13
0
def handle_config_file(file_reads, MAP, options):
    """Process every ``<reads file> <prefix>`` entry of a config file.

    Each line with at least two whitespace-separated tokens yields a reads
    file and a prefix; the two tokens are swapped when the first one is
    shorter than the second. The prefix is checked with ``test_prefix``,
    the file is checked against each of the format options ``-a``, ``-b``,
    ``-c``, ``-e`` that is set to the empty string, and the pair is handed
    to ``handle_one_file``. Lines that do not match are ignored.
    """
    FILE = open_or_die(
        file_reads, 'rb', 'can not open {}\n'.format(file_reads))

    for entry in FILE:
        parsed = re.match(r'(^\S+)\s+(\S+)\s*.*$', entry)
        if parsed is None:
            continue

        file_reads, prefix = parsed.groups()
        if len(file_reads) < len(prefix):
            # the shorter token is taken to be the prefix
            file_reads, prefix = prefix, file_reads

        test_prefix(prefix)

        banner = "\nhandling file '{}' with prefix '{}'\n".format(
            file_reads, prefix)
        MAP.write(banner)

        # check if files in config file are in accordance with option
        # specified
        for flag in ('a', 'b', 'c', 'e'):
            if options.get('-' + flag) == '':
                check_file_format_and_option(file_reads, flag)

        if options.get('-v') == '':
            print_stderr(banner)

        handle_one_file(file_reads, prefix, MAP, options)

    FILE.close()
Example #14
0
def perform_controls():
    """Write the core-algorithm command line to a file and run the controls.

    The ``core_algorithm.py`` invocation (with ``-s`` when an other-species
    reference is configured and ``-y`` unless ``options['-c']`` is the empty
    string) is echoed into ``<dir_tmp>/command_line``. Then
    ``perform_controls.py`` is run on it with 100 as its third argument,
    writing ``<dir_tmp>/output_permuted.mrd``. The whole sequence is
    bracketed by ``start()`` / ``end()``.
    """
    global dir_tmp, _dir, file_mature_ref_other_species, ltime
    # run permuted controls:
    pprint("#running permuted controls\n")
    print_stderr("#running permuted controls\n")
    start()

    if re.search('none', file_mature_ref_other_species, re.IGNORECASE):
        core_cmd = "core_algorithm.py {}/signature.arf {}/precursors.str -v -50".format(
            dir_tmp, dir_tmp)
    else:
        core_cmd = "core_algorithm.py {}/signature.arf {}/precursors.str -s {}/{} -v -50".format(
            dir_tmp, dir_tmp, dir_tmp, file_mature_ref_other_species)

    if options.get('-c') != '':
        core_cmd += " -y {}/precursors_for_randfold.rand".format(dir_tmp)

    # store the command line in a file for perform_controls.py to read
    echo_cmd = "echo '{} > {}/output.mrd' > {}/command_line\n\n".format(
        core_cmd,
        _dir,
        dir_tmp,
    )
    print_stderr(echo_cmd)
    os.system(echo_cmd)

    controls_cmd = "perform_controls.py {}/command_line {}/precursors.str 100 -a > {}/output_permuted.mrd 2>>error_{}.log\n\n".format(
        dir_tmp, dir_tmp, dir_tmp, ltime)
    print_stderr(controls_cmd)
    os.system(controls_cmd)
    end()
Example #15
0
def excise_precursors():
    """Excise potential precursors from the genome into ``dir_tmp``.

    With a truthy ``options['-a']`` the fixed-threshold script
    ``excise_precursors.py`` is run with ``stack_height_min``. Otherwise
    ``excise_precursors_iterative_final.py`` is run and the global
    ``stack_height_min`` is replaced by the first line of
    ``<dir_tmp>/precursors.fa_stack``. Calls ``die`` when
    ``<dir_tmp>/precursors.fa`` is empty or not a regular file. Returns 0.
    """
    global file_genome, parsed_arf, dir_tmp, stack_height_min, max_pres
    # excise precursors from the genome
    pprint("#excising precursors\n")
    print_stderr("#excising precursors\n")

    start()

    if not options.get('-a'):
        command = "excise_precursors_iterative_final.py {} {}/{}_parsed.arf {}/precursors.fa {}/precursors.coords {}\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, dir_tmp, max_pres)
        print_stderr(command)
        os.popen(command).read()

        # pick up the stack height recorded in the *_stack side file
        stack_file = open_or_die2(
            '{}/precursors.fa_stack'.format(dir_tmp), 'rb')
        stack_height_min = stack_file.readline().strip()
        stack_file.close()
    else:
        command = "excise_precursors.py {} {}/{}_parsed.arf {}/precursors.coords -a {} > {}/precursors.fa\n\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, stack_height_min,
            dir_tmp)
        print_stderr(command)
        os.popen(command).read()

    end()

    precursors_fa = '{}/precursors.fa'.format(dir_tmp)
    # empty or not a regular plain file
    if not (file_s(precursors_fa) and os.path.isfile(precursors_fa)):
        die("No precursors excised\n")

    return 0
Example #16
0
def fold_precursors():
    '''
    Fold ``<dir_tmp>/precursors.fa`` into ``<dir_tmp>/precursors.str``.

    RNAfold is first invoked with ``-noPS``; if that returns a non-zero
    status it is invoked again with ``--noPS``. ``die`` is called when the
    second attempt fails as well.
    '''
    global dir_tmp, ltime
    pprint("#folding precursors\n")
    print_stderr("#folding precursors\n")

    shown_cmd = "RNAfold < {}/precursors.fa -noPS > {}/precursors.str\n\n".format(
        dir_tmp, dir_tmp)
    print_stderr(shown_cmd)

    start()

    single_dash_cmd = "RNAfold < {}/precursors.fa -noPS > {}/precursors.str 2>>error_{}.log".format(
        dir_tmp, dir_tmp, ltime)
    if os.system(single_dash_cmd) != 0:
        # retry with the double-dash spelling of the option
        double_dash_cmd = "RNAfold < {}/precursors.fa --noPS > {}/precursors.str".format(
            dir_tmp, dir_tmp)
        retry_status = os.system(double_dash_cmd)
        if retry_status != 0:
            die("Some RNAfold error occurred. Error {}\n".format(
                retry_status))

    end()
def parse_file_fasta_seqkey(file_fasta, hsh, options):
    """Accumulate per-sequence counts from a FASTA file into ``hsh``.

    For every record the sequence lines are concatenated, passed through
    ``tr`` and used as the key of ``hsh``; the value returned by
    ``find_cnt`` for the record's header is added to that key's entry.
    Progress counters are written to stderr when ``options['-a']`` is the
    empty string.

    Both read loops stop at the first line that is empty after stripping,
    so a blank line ends the current loop just like end-of-file does.
    The last record is flushed unconditionally after the loops, which also
    happens when no header was seen (``_id`` and ``seq`` are then empty).
    """
    if options.get('-a') == '':
        print_stderr('reading file into hash\n')

    _id = ''
    seq = ''
    running_1 = 0  # number of records flushed into hsh so far

    FASTA = open_or_die2(file_fasta, 'rb')

    while True:
        l = FASTA.readline().strip()
        if not l:
            break

        m = re.match(r'^>(\S+)', l)
        if m:
            # group() without arguments is the whole match, so the stored
            # id still carries the leading '>'
            _id = m.group()
            seq = ''

            # consume the lines following the header from the same handle
            while True:
                ll = FASTA.readline().strip()
                if not ll:
                    break

                mm = re.match(r'^>(\S+)', ll)
                if mm:
                    # next header reached: flush the record collected so far
                    cnt = find_cnt(_id)
                    # presumably a Perl-style character translation that
                    # upper-cases the sequence -- confirm against tr()
                    seq = tr(seq, '[acgtun.]', '[ACGTTNN]')
                    # ATTR: Performance issue below:
                    # create_hash_key_chain(hsh, 0, seq)
                    try:
                        hsh[seq] = (hsh[seq]) + cnt
                    except KeyError:
                        hsh[seq] = cnt

                    running_1 += 1

                    if options.get('-a') == '':
                        print_stderr('{}\r'.format(running_1))

                    # start collecting the record that begins at this header
                    _id = mm.group()
                    seq = ''
                    continue

                seq += ll

    # flush the final record, which no following header has triggered
    cnt = find_cnt(_id)
    seq = tr(seq, '[acgtun.]', '[ACGTTNN]')
    # presumably makes sure hsh[seq] exists (initialised with 0) -- confirm
    create_hash_key_chain(hsh, 0, seq)
    hsh[seq] += cnt
    running_1 += 1

    if options.get('-a') == '':
        print_stderr('{}\r'.format(running_1))

    FASTA.close()
Example #18
0
def prepare_signature():
    '''
    Build ``<dir_tmp>/signature.arf`` by running ``prepare_signature.py``.

    ``-a <dir_tmp>/<file_mature_ref_this_species>`` is added to the command
    unless the reference file name contains "none" (case-insensitive). The
    command line is echoed to stderr in both cases before it is executed
    between ``start()`` and ``end()``. Always returns 0.
    '''
    global file_reads, dir_tmp, read_align_mismatches, file_mature_ref_this_species, ltime
    pprint("#preparing signature\n")
    print_stderr("#preparing signature\n")

    if not re.search('none', file_mature_ref_this_species, re.IGNORECASE):
        cmd = "prepare_signature.py {} {}/precursors.fa {} -a {}/{} -o {}/signature.arf 2>>error_{}.log\n\n".format(
            file_reads, dir_tmp, read_align_mismatches, dir_tmp,
            file_mature_ref_this_species, dir_tmp, ltime)
    else:
        cmd = "prepare_signature.py {} {}/precursors.fa {} -o {}/signature.arf 2>>error_{}.log\n\n".format(
            file_reads, dir_tmp, read_align_mismatches, dir_tmp, ltime)

    # echo the command in both cases so the log always shows what was run
    print_stderr(cmd)
    start()
    # reading the pipe to the end waits for the command to finish
    os.popen(cmd).read()
    end()

    return 0
def excise_seq(seq, beg, end, strand):
    '''
    Cut the 1-indexed, inclusive range ``beg..end`` out of ``seq``.

    Exits the process (status 0) after an stderr message when ``beg`` is
    greater than ``end``. Returns ``0`` when ``beg`` lies beyond the end of
    ``seq``. Otherwise returns the excised piece, passed through ``revcom``
    when ``strand`` is "-".
    '''
    global db_old

    # begin can be equal to end if only one nucleotide is excised
    if beg > end:
        print_stderr('begin can not greater than end for {}\n'.format(db_old))
        sys.exit(0)

    # rarely, permuted combinations of signature and structure cause out of
    # bound excision errors (appr. once every two thousand combinations)
    if beg > len(seq):
        return 0

    # the blast parsed format is 1-indexed, substr is 0-indexed
    piece = substr(seq, beg - 1, end - beg + 1)

    # if on the minus strand, the reverse complement should be returned
    return revcom(piece) if strand == "-" else piece
Example #20
0
def rna2dna():
    """Convert the configured reference files with ``rna2dna.py``.

    Each of ``file_mature_ref_this_species``,
    ``file_mature_ref_other_species`` and ``file_precursors`` whose name
    does not contain "none" (case-insensitive) is converted into
    ``dir_tmp``; the corresponding global is then replaced by the name
    rebuilt from the first and third values returned by ``fileparse``.
    Always returns 0.

    NOTE(review): ``start()`` is only called for the first file while
    ``end()`` is called for the second and third -- the calls are kept
    exactly as they were; confirm whether the imbalance is intended.
    """
    global dir_tmp, file_mature_ref_other_species, file_mature_ref_this_species, file_precursors

    def is_set(name):
        return not re.search('none', name, re.IGNORECASE)

    def convert(source):
        # run rna2dna.py on `source` and return the converted file's name
        stem, _path, suffix = fileparse(source, r'\..*')
        cmd = "rna2dna.py {} > {}/{}{}\n\n".format(
            source, dir_tmp, stem, suffix)
        print_stderr(cmd)
        os.popen(cmd).read()
        return '{}{}'.format(stem, suffix)

    # process_input mirna files
    if is_set(file_mature_ref_this_species):
        start()
        file_mature_ref_this_species = convert(file_mature_ref_this_species)

    if is_set(file_mature_ref_other_species):
        file_mature_ref_other_species = convert(
            file_mature_ref_other_species)
        end()

    if is_set(file_precursors):
        file_precursors = convert(file_precursors)
        end()

    return 0
def print_hash_seqkey(hsh):
    """Print every sequence in ``hsh`` as a FASTA record on stdout.

    Keys are taken in the order returned by ``hash_sort_key`` with a sort
    key of (negated count, sequence). Each header has the form
    ``><prefix>_<running>_x<count>``, where the running number starts at 0
    and grows by the record's count after each record. Progress is written
    to stderr when ``options['-a']`` is the empty string.
    """
    verbose = options.get('-a') == ''

    if verbose:
        print_stderr('sorting hash\n')

    running_2 = 0
    if verbose:
        print_stderr('printing hash\n')

    for sequence in hash_sort_key(hsh, lambda x: (x[1] * -1, x[0])):
        count = hsh[sequence]
        print('>{}_{}_x{}\n{}'.format(prefix, running_2, count, sequence))
        # NOTE(review): the counter advances by the read count rather than
        # by one -- confirm this numbering is intended
        running_2 += count

        if verbose:
            print_stderr('{}\r'.format(running_2))
Example #22
0
def resolve(options, _id, seq):
    """Write one FASTA record to stdout, or to stderr if it is rejected.

    A record is rejected when ``options['-a']`` is truthy and the sequence
    is shorter than that value, or when ``options['-b']`` is the empty
    string and the sequence contains letters other than a/c/g/t/u/n (case
    ignored). The global ``running`` counter is incremented on every call
    and echoed to stderr when ``options['-s']`` is the empty string.
    """
    global running

    running += 1
    if options.get('-s') == '':
        print_stderr('{}\r'.format(running))

    lng = len(seq)

    too_short = options.get('-a') and lng < int(options.get('-a'))
    if too_short or (
            options.get('-b') == ''
            and not re.match(r'^(a|c|g|t|u|n)+$', seq, re.IGNORECASE)):
        # rejected records are diverted to stderr
        print_stderr('>{}\n{}\n'.format(_id, seq))
        return

    print('>{}'.format(_id))
    print(seq)
Example #23
0
def make_survey():
    """Run ``survey.py`` on the core and permuted outputs.

    The result is redirected to ``<_dir>/survey.csv``. The ``-b`` and ``-c``
    arguments are only passed when ``file_mature_ref_this_species`` does not
    contain "none" (case-insensitive).
    """
    # get overview of the output:
    global _dir, dir_tmp, file_mature_ref_this_species, stack_height_min
    pprint("#doing survey of accuracy\n")
    print_stderr("#doing survey of accuracy\n")

    if re.search('none', file_mature_ref_this_species, re.IGNORECASE):
        survey_cmd = "survey.py {}/output.mrd -a {}/output_permuted.mrd -d {} > {}/survey.csv\n\n".format(
            _dir, dir_tmp, stack_height_min, _dir)
    else:
        survey_cmd = "survey.py {}/output.mrd -a {}/output_permuted.mrd -b {}/{} -c {}/signature.arf -d {} > {}/survey.csv\n\n".format(
            _dir, dir_tmp, dir_tmp, file_mature_ref_this_species, dir_tmp,
            stack_height_min, _dir)

    print_stderr(survey_cmd)
    start()
    os.system(survey_cmd)
    end()
    parser.add_argument('file_output', help=usage)
    parser.add_argument('coord_file', help=usage)
    parser.add_argument('pres_max', help=usage)

    args = parser.parse_args(sys.argv[1:6])
    file_fasta = args.file_fasta
    file_arf = args.file_arf
    file_output = args.file_output
    coord_file = args.coord_file
    pres_max = args.pres_max

    opts, argss = getopt.getopt(sys.argv[6:], 'b')
    options = dict(opts)

    if not re.search(r'^[-]*\d+', pres_max):
        print_stderr('{} is not an integer number\n'.format(pres_max))
        sys.exit(-1)

    for z in range(1, upper_bound):
        dblimit[z] = 0
        thres_counts[z] = 0

    TMP1 = open_or_die('{}_all'.format(file_output), 'w+',
                       'cannot create file {}'.format(file_output))

    TMP2 = open_or_die('{}_all'.format(coord_file), 'w+',
                       'cannot create file {}'.format(coord_file))

    if options.get('-b') == '':
        print_stderr('finding lengths of genome contigs\n')
    coord_file = args.coord_file

    opts, argss = getopt.getopt(sys.argv[4:], 'a:b')
    options = dict(opts)

    try:
        PF = open(coord_file, 'w+')
    except:
        print('cannot create file {}'.format(coord_file))
        sys.exit(-1)

    if options.get('-a'):
        freq_min = int(options.get('-a'))

    if options.get('-b') == '':
        print_stderr('finding lengths of genome contigs\n')

    parse_file_arf(file_arf)

    if options.get('-b') == '':
        print_stderr(
            'reading the genome into memory and excising potential precursors\n'
        )

    parse_genome_and_excise(PF, file_fasta)

    if options.get('-b') == '':
        print_stderr('potential precursors excised\n')

    close(PF)
Example #26
0
def _head_lines(handle, limit=4):
    """Yield ``(line number, stripped line)`` for up to ``limit`` leading lines.

    Stops early at end-of-file or at the first line that is empty after
    stripping.
    """
    for number in range(1, limit + 1):
        text = handle.readline().strip()
        if not text:
            return
        yield number, text


def check_file_format_and_option(file_reads, aFormat):
    """Check that the first lines of ``file_reads`` fit the given format.

    ``aFormat`` is the option letter of the declared input format: ``a``
    (seq.txt), ``b`` (qseq.txt), ``c`` (FASTA) or ``e`` (FASTQ). The letter
    is accepted with or without a leading dash. At most the first four
    lines are inspected; the first violation is reported through ``die``.
    Any other value of ``aFormat`` performs no check.

    The file is opened with ``open_or_die`` and closed again in all cases.
    """
    print_stderr('\n')
    warning = '''\n\n***** Please check if the option you used (options {}) designates the correct format of the supplied reads file {} *****\n\n
[options]
-a              input file is seq.txt format
-b              input file is qseq.txt format
-c              input file is fasta format
-e              input file is fastq format
-d              input file is a config file (see moRNA Finder documentation).
                options -a, -b, -c or -e must be given with option -d.
'''.format(aFormat, file_reads)

    # callers pass the bare letter; tolerate the dashed spelling as well
    fmt = aFormat.lstrip('-')

    if fmt == 'a':
        IN = open_or_die(
            file_reads, 'rb', 'Cannot open file {} supplied by option -a\n'.format(file_reads))
        try:
            for _number, l in _head_lines(IN):
                if len(esplit(l)) != 5:
                    die('The seq.txt file does not contain 5 columns. Please make sure to follow the _seq.txt file format conventions\n{}'.format(warning))
        finally:
            IN.close()

    elif fmt == 'b':
        IN = open_or_die(
            file_reads, 'rb', 'Cannot open qseq.txt file {} supplied by option -b\n'.format(file_reads))
        try:
            for _number, l in _head_lines(IN):
                line = esplit(l)

                if len(line) != 11:
                    die('The qseq.txt file does not contain 11 columns but {}. Please make sure to follow the qseq.txt file format conventions\n{}'.format(
                        len(line), warning))

                if not re.search(r'^\S+', line[9]):
                    die('The sequence field in the qseq.txt file is invalid. Please make sure to follow the qseq.txt file format conventions\n{}'.format(warning))
        finally:
            IN.close()

    elif fmt == 'c':
        IN = open_or_die(file_reads, 'rb',
                         'Cannot open FASTA file supplied by option -c\n')
        mes = 'Please make sure your file is in accordance with the fasta format specifications and does not contain whitespace in IDs or sequences'
        # line number -> (pattern the line must match, message otherwise)
        fasta_rules = {
            1: (r'^>\S+$',
                "First line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                    mes, warning)),
            2: (r'^\S+$',
                "Second line of FASTA reads file contains whitespace in sequence\n{}\n".format(
                    mes)),
            3: (r'^>\S+$',
                "Second ID line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                    mes, warning)),
            4: (r'^\S+$',
                "Secdond sequence line of FASTA reads file contains whitespace in sequence\n{}\n{}".format(
                    mes, warning)),
        }
        try:
            for number, l in _head_lines(IN):
                pattern, message = fasta_rules[number]
                if not re.search(pattern, l):
                    die(message)
        finally:
            IN.close()

    elif fmt == 'e':
        IN = open_or_die(file_reads, 'rb',
                         'Cannot open FASTQ file supplied by option -e\n')
        mes = 'Please make sure your file is in accordance with the FASTQ format specifications'
        # line number -> (pattern the line must match, message otherwise);
        # a line is rejected when it does NOT match its pattern
        fastq_rules = {
            1: (r'^@\S+',
                "First line of FASTQ reads file is not in accordance with the fastq format specifications\n{}\n{}".format(
                    mes, warning)),
            2: (r'^\S+$',
                "Second line of FASTQ reads file contains whitespace in sequence\n{}\n{}".format(
                    mes, warning)),
            3: (r'^\+',
                "Third line of FASTQ reads file does not start with a '+' character.\n{}\n{}".format(mes, warning)),
            4: (r'^\S+$',
                "Fourth line of FASTQ reads file contains whitespace\n{}\n{}".format(
                    mes, warning)),
        }
        try:
            for number, l in _head_lines(IN):
                pattern, message = fastq_rules[number]
                if not re.search(pattern, l):
                    die(message)
        finally:
            IN.close()
Example #27
0
def _tally_read_counts(path, error_message):
    """Sum the ``_x<count>`` read counts found in ``path``.

    Every line whose id matches ``<3 chars>..._x<digits>`` (optionally
    preceded by ``>``) contributes its count once; repeated ids are ignored.
    Returns ``(total, per_prefix)`` where ``per_prefix`` maps the first
    three characters of the id to the summed counts.
    """
    seen = set()
    total = 0
    per_prefix = {}

    handle = open_or_die(path, 'rb', error_message)
    try:
        for line in handle:
            m = re.match(r'^>*((\S\S\S)\S+_x(\d+))', line)
            if not m:
                continue

            read_id, prefix, reads = m.groups()
            if read_id in seen:
                continue
            seen.add(read_id)

            reads = int(reads)
            total += reads
            per_prefix[prefix] = per_prefix.get(prefix, 0) + reads
    finally:
        handle.close()

    return total, per_prefix


def _mapped_fractions(mapped, total):
    """Return ``(mapped/total, 1 - mapped/total)``; both 0.0 if ``total`` is 0."""
    if not total:
        return 0.0, 0.0
    fraction = mapped / float(total)
    return fraction, 1 - fraction


def read_stats(options):
    """Print mapping statistics for the reads and mapping files to stderr.

    Read counts are collected from the file given by ``options['-s']`` and
    from the file given by ``options['-t']``. A summary row for all reads
    and one row per three-letter id prefix of the reads file are written as
    tab-separated total, mapped, unmapped, mapped fraction and unmapped
    fraction. A prefix without any mapped read is reported with 0 mapped
    reads, and a zero total yields fractions of 0.000.
    """
    count, k2 = _tally_read_counts(
        options.get('-s'), 'No reads file in fasta format given\n')

    print_stderr('Mapping statistics\n')
    count2, k22 = _tally_read_counts(
        options.get('-t'), 'No mapping file given\n')

    print_stderr('\n#desc\ttotal\tmapped\tunmapped\t%mapped\t%unmapped\n')
    print_stderr("total: {}\t{}\t{}\t".format(count, count2, count - count2))
    print_stderr("{0:.3f}\t{1:.3f}\n".format(
        *_mapped_fractions(count2, count)))

    for k in k2.keys():
        # a prefix may have no mapped reads at all
        mapped = k22.get(k, 0)
        print_stderr('{}: {}\t{}\t{}\t'.format(
            k, k2[k], mapped, k2[k] - mapped))
        print_stderr('{0:.3f}\t{1:.3f}\n'.format(
            *_mapped_fractions(mapped, k2[k])))
Example #28
0
def process_reads(file_reads_latest, prefix, MAP):
    """Run the enabled read pre-processing steps and return the last output.

    Each step is driven by an entry of the module-level ``options`` mapping
    and, when it runs, shells out via ``os.system`` and writes its result
    into the run directory returned by ``make_dir_tmp``:

    * ``-h`` == '': convert the input to FASTA (``fastq2fasta.py`` when
      ``-e`` == '', otherwise ``illumina_to_fasta.py``, with ``-a`` appended
      when ``-b`` == '').
    * ``-i`` == '': ``rna2dna.py``.
    * ``-j`` == '': ``fastaparse.py -b``.
    * ``-k`` set:  ``clip_adapters.py`` with the ``-k`` value.
    * ``-l`` set:  ``fastaparse.py -a`` with the ``-l`` value.
    * ``-m`` == '': ``collapse_reads_md.py`` with *prefix*.
    * ``-s`` set:  pass the latest file and the ``-s`` value to ``cat_to``.

    Progress messages are written to *MAP* and, when ``-v`` == '', also to
    stderr.  The globals ``_dir`` and ``orig_file_reads`` are updated.
    Exit statuses of the external commands are not inspected.

    Returns the path of the file produced by the last step that ran, or
    *file_reads_latest* unchanged when no step ran.
    """
    global _dir, orig_file_reads

    def flag_on(name):
        # A switch counts as enabled when its stored value equals ''.
        return options.get(name) == ''

    def announce(message):
        # Record the step in MAP; mirror it on stderr in verbose mode.
        MAP.write(message)
        if flag_on('-v'):
            print_stderr(message)

    def log_and_run(command, strip=True):
        # Write the command line to MAP, then execute it.  ``strip``
        # selects whether surrounding whitespace is removed before running.
        MAP.write(command)
        os.system(command.strip() if strip else command)

    # Use the trailing run of [_-.a-zA-Z0-9] characters of the input path
    # as the name; fall back to the full argument when nothing matches.
    tail = re.search(r'([_\-.a-zA-Z0-9]+)$', file_reads_latest)
    orig_file_reads = tail.group(1) if tail else file_reads_latest

    _dir = make_dir_tmp("_{}_{}".format(prefix, orig_file_reads), MAP)

    current = file_reads_latest

    # parse solexa / fastq to fasta
    if flag_on('-h'):
        if flag_on('-e'):
            announce('parsing fastq to fasta format\n')
            log_and_run(
                'fastq2fasta.py {} > {}/reads.fa\n'.format(current, _dir),
                strip=False)
        else:
            announce('parsing Solexa / Illumina output to fasta format\n')
            converter = 'illumina_to_fasta.py {}'.format(current)
            if flag_on('-b'):
                converter += ' -a'
            log_and_run(
                '{} > {}/reads.fa\n'.format(converter, _dir),
                strip=False)
        current = '{}/reads.fa'.format(_dir)

    # RNA to DNA (this command is executed without being written to MAP)
    if flag_on('-i'):
        announce('converting rna to dna alphabet\n')
        os.system('rna2dna.py {} > {}/reads_dna.fa'.format(current, _dir))
        current = '{}/reads_dna.fa'.format(_dir)

    # discard entries that contain non-canonical letters
    if flag_on('-j'):
        announce('discarding sequences with non-canonical letters\n')
        log_and_run(
            'fastaparse.py {} -b > {}/reads_letters.fa 2>{}/reads_discarded.fa\n'.format(
                current, _dir, _dir))
        current = '{}/reads_letters.fa'.format(_dir)

    # clip 3' adapters
    adapter = options.get('-k')
    if adapter:
        announce("clipping 3' adapters\n")
        log_and_run(
            'clip_adapters.py {} {} > {}/reads_clip.fa\n'.format(
                current, adapter, _dir))
        current = '{}/reads_clip.fa'.format(_dir)

    # discard short reads
    min_length = options.get('-l')
    if min_length:
        announce('discarding short reads\n')
        log_and_run(
            'fastaparse.py {} -a {} > {}/reads_no_short.fa 2>{}/reads_too_short.fa\n'.format(
                current, min_length, _dir, _dir))
        current = '{}/reads_no_short.fa'.format(_dir)

    # collapse reads
    if flag_on('-m'):
        announce('collapsing reads\n')
        log_and_run(
            'collapse_reads_md.py {} {} > {}/reads_nr.fa\n'.format(
                current, prefix, _dir),
            strip=False)
        current = '{}/reads_nr.fa'.format(_dir)

    # printing reads
    destination = options.get('-s')
    if destination:
        cat_to(current, destination)

    return current
Example #29
0
    check_options(options, file_reads)

    if '-o' in options.keys():
        threads = options.get('-o')

    cores = os.popen('grep -ic ^processor /proc/cpuinfo').read()
    if not re.search(r'^\d+$', cores):
        cores = os.popen('sysctl -n hw.physicalcpu').read()
        if not re.search(r'^\d+$', cores):
            cores = os.popen('sysctl -n hw.logicalcpu').read()

    if not re.search(r'^\d+$', cores):
        cores = 1

    if threads > cores:
        print_stderr(
            'More threads specified than cores on the system. Reducing the number of threads to {}\n'.format(cores))
        threads = cores

    if options.get('-q') == '':
        mismatches_seed = 1

    prefix_global = 'seq'

    if options.get('-g'):
        prefix_global = options.get('-g')

    if options.get('-d'):
        handle_config_file(file_reads)
    else:
        handle_one_file(file_reads, prefix_global, MAP, options)
Example #30
0
    file_precursors = args.file_precursors
    read_align_edit_distance = args.read_align_edit_distance

    opts, argss = getopt.getopt(sys.argv[4:], "a:bo:")
    options = dict(opts)

    ltime = long(time.time())
    _dir = 'dir_prepare_signature{}'.format(ltime)

    if '-o' not in options.keys() or options.get('-o') == '':
        die('no outfile specified with option -o\n')

    outfile = options.get('-o')

    if options.get('-b') == '':
        print_stderr('preparing signature file\n')

    os.mkdir(_dir)
    shutil.copy(file_precursors, _dir)

    if options.get('-b') == '':
        print_stderr('constructing index of precursors\n')
    os.system('bowtie-build {} {}/precursors.ebwt > /dev/null'.format(
        file_precursors, _dir))

    if options.get('-b') == '':
        print_stderr('mapping reads to precursors\n')
    cmd = 'bowtie -f -v {} -a --best --strata --norc {}/precursors.ebwt {} {}/reads_vs_precursors.bwt 2> /dev/null\n'.format(
        read_align_edit_distance, _dir, file_reads, _dir)
    print_stderr(cmd)
    os.system(cmd)