Beispiel #1
0
def excise_precursors():
    """Excise potential precursor sequences from the genome.

    One of two external scripts is run, selected by the ``-a`` entry of the
    global ``options`` mapping:

    * ``-a`` set: ``excise_precursors.py`` is called with the current
      ``stack_height_min`` and its stdout is redirected to
      ``<dir_tmp>/precursors.fa``.
    * otherwise: ``excise_precursors_iterative_final.py`` is called with
      ``max_pres``; afterwards ``stack_height_min`` is overwritten with the
      first line of ``<dir_tmp>/precursors.fa_stack``.

    The run is aborted through ``die`` when ``<dir_tmp>/precursors.fa`` is
    empty or is not a regular file afterwards.

    Returns:
        Always 0.
    """
    global file_genome, parsed_arf, dir_tmp, stack_height_min, max_pres

    banner = "#excising precursors\n"
    pprint(banner)
    print_stderr(banner)

    start()

    fixed_stack_height = options.get('-a')
    if fixed_stack_height:
        cmd = "excise_precursors.py {} {}/{}_parsed.arf {}/precursors.coords -a {} > {}/precursors.fa\n\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, stack_height_min,
            dir_tmp)
    else:
        cmd = "excise_precursors_iterative_final.py {} {}/{}_parsed.arf {}/precursors.fa {}/precursors.coords {}\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, dir_tmp, max_pres)

    print_stderr(cmd)
    # read() drains the command's stdout up to EOF; the text itself is unused.
    os.popen(cmd).read()

    if not fixed_stack_height:
        # The iterative script leaves its chosen stack height in a side file.
        stack_file = open_or_die2('{}/precursors.fa_stack'.format(dir_tmp),
                                  'rb')
        stack_height_min = stack_file.readline().strip()
        stack_file.close()

    end()

    precursors_path = '{}/precursors.fa'.format(dir_tmp)
    # Both conditions must hold: the file has content and is a plain file.
    if not (file_s(precursors_path) and os.path.isfile(precursors_path)):
        die("No precursors excised\n")

    return 0
def parse_file_fasta_seqkey(file_fasta, hsh, options):
    """Add the read counts of a FASTA file to ``hsh``, keyed by sequence.

    Every record consists of a ``>`` header line followed by one or more
    sequence lines.  For each record the count returned by ``find_cnt`` for
    the header is added to ``hsh[<sequence>]``, where the sequence is the
    concatenation of the record's stripped lines after being passed through
    ``tr(seq, '[acgtun.]', '[ACGTTNN]')`` (presumably upper-casing and mapping
    u->T and .->N -- confirm against ``tr``).

    Lines that precede the first header are ignored.  As in the original
    implementation, one final record is always flushed after the loop, even
    for an empty file (with an empty id and an empty sequence).

    Args:
        file_fasta: path handed to ``open_or_die2``.
        hsh: mapping that is updated in place.
        options: option mapping; when ``options.get('-a') == ''`` progress
            is written with ``print_stderr``.

    Returns:
        None.
    """
    show_progress = options.get('-a') == ''
    if show_progress:
        print_stderr('reading file into hash\n')

    _id = ''
    seq = ''
    running_1 = 0
    in_record = False  # True once the first header line has been seen

    FASTA = open_or_die2(file_fasta, 'rb')
    try:
        while True:
            raw = FASTA.readline()
            if not raw:
                # Only a truly empty read means end of file.  Testing the
                # stripped line instead (as was done before) made a blank
                # line end the record early, and the next header then
                # discarded that record without counting it.
                break

            line = raw.strip()
            m = re.match(r'^>(\S+)', line)
            if m:
                if in_record:
                    # A new header closes the record collected so far.
                    cnt = find_cnt(_id)
                    seq = tr(seq, '[acgtun.]', '[ACGTTNN]')
                    # try/except instead of create_hash_key_chain: the
                    # latter was marked as a performance issue here.
                    try:
                        hsh[seq] = hsh[seq] + cnt
                    except KeyError:
                        hsh[seq] = cnt

                    running_1 += 1
                    if show_progress:
                        print_stderr('{}\r'.format(running_1))

                # NOTE(review): group() keeps the leading '>' in the id given
                # to find_cnt; the capture group suggests group(1) may have
                # been intended -- confirm against find_cnt.
                _id = m.group()
                seq = ''
                in_record = True
            elif in_record:
                # Blank lines contribute an empty string and are harmless.
                seq += line
    finally:
        FASTA.close()

    # Flush the last record (there is no following header to trigger it).
    cnt = find_cnt(_id)
    seq = tr(seq, '[acgtun.]', '[ACGTTNN]')
    create_hash_key_chain(hsh, 0, seq)
    hsh[seq] += cnt
    running_1 += 1

    if show_progress:
        print_stderr('{}\r'.format(running_1))
Beispiel #3
0
            if re.search(r'\s+', _id):
                die('Error in line {}: The identifier\n {}\n\ncontains white spaces\n\n{}\n\nYou could run remove_white_space_in_id.py inputfile > newfile\nThis will remove everything from the id line after the first whitespace\n'
                    .format(Nicenumber(counter), _id, hint))
            else:
                create_hash_key_chain(hash_num, 0, _id)
                hash_num[_id] += 1
        elif not re.match(r'^([A|C|G|T|U|N|a|c|g|t|u|n]+)$', rin):
            die('Error in line {}: The sequence\n{}\n\ncontains characters others than [acgtunACGTUN]\n\n{}'
                .format(Nicenumber(counter), rin, hint))


if __name__ == '__main__':
    # Module-level state; presumably read by read_handler, which only
    # receives the input handle explicitly -- verify against its definition.
    hash_num = {}
    _id = None

    hint = ('Please check your file for the following issues:\n\n'
            'I.  Sequences are allowed only to comprise characters [ACGTNacgtn].\n'
            'II. Identifiers are not allowed to have withespaces.\n')

    if len(sys.argv) != 1:
        # File names were given on the command line: check each in turn.
        for f in sys.argv[1:]:
            IN = open_or_die2(f, 'rb')

            read_handler(IN)

            IN.close()
    else:
        # No arguments: check whatever arrives on standard input.
        read_handler(sys.stdin)

    sys.exit(0)
Beispiel #4
0
    line = []

    thres = -50

    if options.get('-s') is not None:
        thres = options.get('-s')

    score = thres
    _max = 'na'
    maxs = 999999999999999999999999999
    if options.get('-t'):
        _max = options.get('-t')
        maxs = _max

    IN = open_or_die2(options.get('-r'), 'rb')
    seqcol = 15
    if options.get('-m') == '':
        seqcol = 13

    if options.get('-k') == '':
        seqcol = 14

    names = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
             '12', 'mature', 'star', 'pres')

    while True:
        l = IN.readline()
        if not l:
            break
Beispiel #5
0
def test_input_files():
    """Validate all input files and optionally pre-quantify known miRNAs.

    The function works in three stages:

    1. Pre-check.  Only the first two lines of each file are inspected: the
       first must be a ``>identifier`` header without whitespace, the second
       may only contain ``ACGTUN`` (either case).  This is done for the reads
       file, the genome file and -- unless their name contains ``none`` --
       the two mature miRNA reference files and the precursor file.  The
       first line of the mapping file must match the arf column layout, and
       every reference id in column 6 of the mapping file must occur as a
       header line of the genome file.  The second line of the precursor
       file must additionally be at least ``minpreslen`` long.
    2. Stringent check.  The external ``sanity_check_*.py`` scripts are run;
       any output they produce (stderr is merged into stdout) is treated as
       an error report.
    3. Pre-quantitation.  When both a precursor file and a mature reference
       for this species were given, ``quantifier.py`` is run and
       ``options['-q']`` is pointed at its ``miRBase.mrd`` result.

    Every failed check terminates the run through ``die``.

    Returns:
        None.
    """
    global file_reads, file_reads_vs_genome, file_genome, file_precursors, minpreslen, file_mature_ref_other_species, file_mature_ref_this_species

    # ---- reads file: header line, then first sequence line ----
    IN = open_or_die2(file_reads, 'rb')
    line = IN.readline().strip()
    if not re.search(r'^>\S+', line):
        printErr()
        # The file name is now interpolated; a literal "$file_reads" used to
        # be printed here.
        die("The first line of file {} does not start with '>identifier'\nReads file {} is not a valid fasta file\n\n"
            .format(file_reads, file_reads))

    if re.search(r'\s', line):
        printErr()
        # This check is about whitespace in the header, so say so (the old
        # text complained about characters in sequences).
        die('Reads file {} has not allowed whitespaces in its first identifier\n\n'
            .format(file_reads))

    line = IN.readline()
    if not re.search(r'^[ACGTUNacgtun]*$', line):
        printErr()
        die('File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nReads file {} is not a fasta file\n\n'
            .format(file_reads, file_reads))

    IN.close()

    # ---- genome file ----
    IN = open_or_die2(file_genome, 'rb')
    line = IN.readline().strip()
    if not re.search(r'>\S+', line):
        printErr()
        die("The first line of file {} does not start with '>identifier'\nGenome file {} is not a fasta file\n\n"
            .format(file_genome, file_genome))

    if re.search(r'\s', line):
        printErr()
        die('Genome file {} has not allowed whitespaces in its first identifier\n\n'
            .format(file_genome))

    # get genome ids: every line of the genome file that contains '>'
    tmps = os.popen('grep ">" {}'.format(file_genome)).read().strip()
    genomeids = set(tmps.split("\n"))

    line = IN.readline()
    if not re.search(r'^[ACGTUNacgtun]*$', line):
        printErr()
        die('File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nGenome file {} is not a fasta file\n\n'
            .format(file_genome, file_genome))

    IN.close()

    # ---- mapping (arf) file: only the first line is pattern-checked ----
    IN = open_or_die2(file_reads_vs_genome, 'rb')
    line = IN.readline()
    if not re.search(
            r'^(\S+_x\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+-])\s+(\d+)\s*([mDIM]*)$',
            line):
        printErr()
        die('Mapping file {} is not in arf format\n\nEach line of the mapping file must consist of the following fields\nreadID_wo_whitespaces  length  start  end read_sequence genomicID_wo_whitspaces length  start   end     genomic_sequence  strand  #mismatches editstring\nThe editstring is optional and must not be contained\nThe readID must end with _xNumber and is not allowed to contain whitespaces.\nThe genomeID is not allowed to contain whitespaces.'
            .format(file_reads_vs_genome))

    IN.close()

    # get ids from arf file and compare them with ids from the genome file
    tmps = os.popen(
        'cut -f6 {}|sort -u'.format(file_reads_vs_genome)).read().strip()
    for s in tmps.split("\n"):
        # genome header lines were stored including their leading '>'
        if ">{}".format(s) not in genomeids:
            die("The mapped reference id {} from file {} is not an id of the genome file {}\n\n"
                .format(s, file_reads_vs_genome, file_genome))

    # ---- optional: mature miRNA reference of this species ----
    if not re.search('none', file_mature_ref_this_species):
        IN = open_or_die2(file_mature_ref_this_species, 'rb')
        line = IN.readline().strip()
        if not re.search(r'>\S+', line):
            printErr()
            die("The first line of file {} does not start with '>identifier'\nmiRNA reference this species file {} is not a fasta file\n\n"
                .format(file_mature_ref_this_species,
                        file_mature_ref_this_species))

        if re.search(r'\s', line):
            printErr()
            die("miRNA reference this species file {} has not allowed whitespaces in its first identifier\n\n"
                .format(file_mature_ref_this_species))

        line = IN.readline()
        if not re.search(r'^[ACGTUNacgtun]*$', line):
            printErr()
            die("File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nmiRNA reference this species file {} is not a fasta file\n\n"
                .format(file_mature_ref_this_species,
                        file_mature_ref_this_species))

        IN.close()

    # ---- optional: mature miRNA reference of other species ----
    # (messages now name the "other species" file instead of "this species")
    if not re.search('none', file_mature_ref_other_species):
        IN = open_or_die2(file_mature_ref_other_species, 'rb')
        line = IN.readline().strip()
        if not re.search(r'>\S+', line):
            printErr()
            die("The first line of file {} does not start with '>identifier'\nmiRNA reference other species file {} is not a fasta file\n\n"
                .format(file_mature_ref_other_species,
                        file_mature_ref_other_species))

        if re.search(r'\s', line):
            printErr()
            die("miRNA reference other species file {} has not allowed whitespaces in its first identifier\n\n"
                .format(file_mature_ref_other_species))

        line = IN.readline()
        if not re.search(r'^[ACGTUNacgtun]*$', line):
            printErr()
            die("File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nmiRNA reference other species file {} is not a fasta file\n\n"
                .format(file_mature_ref_other_species,
                        file_mature_ref_other_species))

        IN.close()

    # ---- optional: known precursors ----
    # (messages now name the precursor file instead of the miRNA reference)
    if not re.search('none', file_precursors):
        IN = open_or_die2(file_precursors, 'rb')
        line = IN.readline().strip()
        if not re.search(r'>\S+', line):
            printErr()
            die("The first line of file {} does not start with '>identifier'\nprecursor file {} is not a fasta file\n\n"
                .format(file_precursors, file_precursors))

        if re.search(r'\s', line):
            printErr()
            die("precursor file {} has not allowed whitespaces in its first identifier\n\n"
                .format(file_precursors))

        line = IN.readline()
        if not re.search(r'^[ACGTUNacgtun]*$', line):
            printErr()
            die("File {} contains not allowed characters in sequences\nAllowed characters are ACGTUN\nprecursor file {} is not a fasta file\n\n"
                .format(file_precursors, file_precursors))

        # NOTE(review): `line` was read without strip(), so a trailing
        # newline, if present, is included in this length -- confirm that
        # this is intended.
        if len(line) < minpreslen:
            printErr()
            die("The precursor file {} does not contain sequences of at least {} nt\nPlease make sure that you provided the correct file and the correct parameter ordering when calling {}\nIf you have precursors with less than {} please use option -p <int> to specify this length\n"
                .format(file_precursors, minpreslen, sys.argv[0], minpreslen))

        IN.close()

    # #################################################
    # precheck finished
    # #################################################

    # do stringent testing of all input files
    # Each checker script is expected to stay silent on success: any text it
    # prints (stdout or stderr) is reported as a problem.
    pprint("#testing input files\n")
    print_stderr("#testing input files\n")

    if not re.search('none', file_mature_ref_this_species):
        start()
        cmd = "sanity_check_mature_ref.py {} 2>&1\n\n".format(
            file_mature_ref_this_species)
        print_stderr(cmd)
        ret_file_mature_ref_this_species = os.popen(cmd).read().strip()

        if ret_file_mature_ref_this_species:
            printErr()
            die("problem with {} {}\n".format(
                file_mature_ref_this_species,
                ret_file_mature_ref_this_species))
        end()

    if not re.search(r'none', file_mature_ref_other_species):
        start()

        cmd = "sanity_check_mature_ref.py {} 2>&1\n\n".format(
            file_mature_ref_other_species)
        print_stderr(cmd)
        ret_file_mature_ref_other_species = os.popen(cmd).read().strip()

        if ret_file_mature_ref_other_species:
            printErr()
            die("problem with {} {}\n".format(
                file_mature_ref_other_species,
                ret_file_mature_ref_other_species))
        end()

    cmd = "sanity_check_reads_ready_file.py {} 2>&1\n\n".format(file_reads)
    print_stderr(cmd)
    start()
    ret_test_file_reads = os.popen(cmd).read().strip()

    if ret_test_file_reads:
        printErr()
        die("problem with {} {}\n".format(file_reads, ret_test_file_reads))

    end()

    start()
    cmd = "sanity_check_genome.py {} 2>&1;\n\n".format(file_genome)
    print_stderr(cmd)
    ret_test_file_genome = os.popen(cmd).read().strip()

    if ret_test_file_genome:
        printErr()
        die("problem with {} {}\n".format(file_genome, ret_test_file_genome))

    end()
    start()

    cmd = "sanity_check_mapping_file.py {} 2>&1".format(file_reads_vs_genome)
    print_stderr(cmd)
    ret_test_file_reads_genome = os.popen(cmd).read().strip()

    if ret_test_file_reads_genome:
        printErr()
        die("problem with {} {}\n".format(file_reads_vs_genome,
                                          ret_test_file_reads_genome))

    end()

    if not re.search('none', file_precursors):
        start()

        cmd = "sanity_check_mature_ref.py {} 2>&1".format(file_precursors)
        print_stderr(cmd)
        ret_file_precursors = os.popen(cmd).read().strip()

        if ret_file_precursors:
            printErr()
            die("problem with {} {}\n".format(file_precursors,
                                              ret_file_precursors))

        end()

        # ---- pre-quantitation of known miRNAs ----
        # Needs both the precursor file and the mature reference of this
        # species; end() is only reached when quantifier.py was run.
        start()
        if not re.search('none', file_mature_ref_this_species, re.IGNORECASE):
            print_stderr("Quantitation of expressed miRNAs in data\n\n\n")

            species = ''
            if options.get('-t'):
                species = "-t {}".format(options.get('-t'))

            file_star = ''
            if options.get('-s'):
                if file_s(options.get('-s')):
                    file_star = "-s {}".format(options.get('-s'))
                else:
                    print_stderr(
                        "File {} specified by option -s is empty or not found\n"
                        .format(options.get('-s')))
                    # unusable star file: switch the option off
                    options['-s'] = 0

            print("#Quantitation of known miRNAs in data\n")
            dopt = ""
            Popt = ""
            # -d / -P are passed through only when set with an empty value
            if options.get('-d') == '':
                dopt = "-d"
            if options.get('-P') == '':
                Popt = "-P"

            quant = "quantifier.py -p {} -m {} -r {} {} {} -y {} -k {} {}".format(
                file_precursors, file_mature_ref_this_species, file_reads,
                file_star, species, ltime, dopt, Popt)
            print_stderr(quant, "\n")
            os.system(quant)
            # record the path of the result file under option -q
            options[
                '-q'] = "expression_analyses/expression_analyses_{}/miRBase.mrd".format(
                    ltime)

            end()
        else:
            print_stderr(
                "Pre-quantitation is skipped caused by missing file with known miRNAs\n\n\n"
            )

    else:
        print_stderr(
            "Pre-quantitation is skipped caused by missing file with known precursor miRNAs\n\n\n"
        )