Example #1
0
def excise_precursors():
    """Excise precursor candidates from the genome via an external script.

    Depending on option ``-a`` either ``excise_precursors.py`` (with the
    current ``stack_height_min``) or ``excise_precursors_iterative_final.py``
    (bounded by ``max_pres``) is run through the shell.  In the latter case
    ``stack_height_min`` is re-read from the first line of
    ``<dir_tmp>/precursors.fa_stack`` afterwards.

    Dies if ``<dir_tmp>/precursors.fa`` is empty or not a regular file.
    Returns 0 otherwise.
    """
    global file_genome, parsed_arf, dir_tmp, stack_height_min, max_pres

    for announce in (pprint, print_stderr):
        announce("#excising precursors\n")

    start()

    use_given_stack_height = options.get('-a')
    if use_given_stack_height:
        cmd = "excise_precursors.py {} {}/{}_parsed.arf {}/precursors.coords -a {} > {}/precursors.fa\n\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, stack_height_min,
            dir_tmp)
    else:
        cmd = "excise_precursors_iterative_final.py {} {}/{}_parsed.arf {}/precursors.fa {}/precursors.coords {}\n".format(
            file_genome, dir_tmp, parsed_arf, dir_tmp, dir_tmp, max_pres)

    print_stderr(cmd)
    # The captured output is not used; reading it waits for the command.
    os.popen(cmd).read()

    if not use_given_stack_height:
        # The iterative script leaves the chosen stack height in a side file.
        stack_file = open_or_die2('{}/precursors.fa_stack'.format(dir_tmp),
                                  'rb')
        stack_height_min = stack_file.readline().strip()
        stack_file.close()

    end()

    precursors_fa = '{}/precursors.fa'.format(dir_tmp)
    # Must be non-empty and a regular plain file.
    if not (file_s(precursors_fa) and os.path.isfile(precursors_fa)):
        die("No precursors excised\n")

    return 0
Example #2
0
def run_quantifier(mrna_hp, mature_this_file, collapsed_file, timestamp):
    """Run ``quantifier.py`` through the shell and die on a non-zero status.

    The command line is echoed to stderr before it is executed.
    """
    arguments = (mrna_hp, mature_this_file, collapsed_file, timestamp)
    cmd = 'quantifier.py -p {} -m {} -r {} -t cel -y {}'.format(*arguments)
    print_stderr(cmd, '\n')
    if os.system(cmd) != 0:
        die('Run quantifier.py failed.\n')
Example #3
0
def run_bedtools_cmd(run_times, fafile, mrna_places, mrna_hp):
    """Run ``bedtools getfasta`` ``run_times`` times, dying on the first failure.

    Every run uses the same command line, which is echoed to stderr before
    each execution.
    """
    cmd = 'bedtools getfasta -fi {} -bed {} -name -fo {}'.format(
        fafile, mrna_places, mrna_hp)
    for _ in range(run_times):
        print_stderr(cmd, '\n')
        if os.system(cmd) != 0:
            die('Run bedtools failed.\n')
Example #4
0
def test_first_argument():
    """Handle the special first command-line argument.

    * no argument at all, ``-h`` or ``--help``: die with the usage text;
    * ``-u``: run ``make_html.py -u -y 1`` and exit with status 0.

    Any other first argument leaves the function without side effects.
    """
    if len(sys.argv) < 2:
        die(usage)

    first = sys.argv[1]

    if first == '-u':
        os.system('make_html.py -u -y 1')
        sys.exit(0)

    if first in ('-h', '--help'):
        die(usage)
Example #5
0
def test_bedtools():
    """Die with installation hints if ``bedtools`` is not on the PATH.

    Availability is probed with the shell's ``which``; a non-zero exit
    status is treated as "not installed".
    """
    ret = os.system('which bedtools > /dev/null 2>&1')
    if ret:
        # The documentation link previously pointed at a non-existent domain.
        die('''bedtools is not installed in your environment.
For ubuntu/debian, run `apt-get install bedtools`.
For Redhat/CentOS, run `yum install BEDTools`.
For MacOS, run `brew install bedtools`.

If you need more information about how to install bedtools, please visit https://bedtools.readthedocs.io.
''')
Example #6
0
def run_mapper(
    reads_file,
    clip_seq,
    fa_prefix,
    collapsed_file,
    mapping_file,
):
    """Run ``mapper.py`` through the shell and die on a non-zero status.

    The command line is echoed to stderr before it is executed.
    """
    arguments = (reads_file, clip_seq, fa_prefix, collapsed_file,
                 mapping_file)
    cmd = 'mapper.py {} -c -j -k {} -l 18 -m -p {} -s {} -t {} -v'.format(
        *arguments)
    print_stderr(cmd, '\n')
    if os.system(cmd) != 0:
        die('Run mapper.py failed.\n')
Example #7
0
def check_line(line):
    """Die if a switch-only option is followed by a value in ``line``.

    ``-h``, ``-i``, ``-j``, ``-m`` and ``-q`` take no argument, so a word
    character right after the flag and whitespace indicates misuse.

    The patterns used to end in a literal ``/`` (a leftover of Perl's
    ``/.../`` regex delimiters), which made them match only when the value
    was followed by a slash.  ``\\w`` already covers digits, so one pattern
    per flag is enough.
    """
    for flag in ('-h', '-i', '-j', '-m', '-q'):
        if re.search(r'{}\s+\w'.format(flag), line):
            die("option {} should not be given with an integer or string\n"
                .format(flag))
Example #8
0
def core_algorithm():
    '''
    Run the external core_algorithm.py script and write its output to
    <_dir>/output.mrd.

    The command gets "-s" with the other-species mature reference unless
    that file name contains "none", and "-y" with the randfold file unless
    option -c was given.  With option -E a second run with "-t" writes
    <_dir>/error.output.mrd.  If output.mrd ends up empty, a "-t" debug run
    is written to error.output.mrd_<ltime> and the program dies.
    '''
    global _dir, dir_tmp, file_mature_ref_other_species, ltime

    for announce in (pprint, print_stderr):
        announce("#running moRNA Finder core algorithm\n")

    # The id length defaults to 40 unless a this-species reference exists.
    if re.search('none', file_mature_ref_this_species, re.IGNORECASE) is None:
        longest_id = get_longest_id("{}/{}".format(
            dir_tmp, file_mature_ref_this_species))
    else:
        longest_id = 40

    start()

    if re.search('none', file_mature_ref_other_species, re.IGNORECASE) is None:
        line = "core_algorithm.py {}/signature.arf {}/precursors.str -s {}/{} -v -50 -l {}".format(
            dir_tmp, dir_tmp, dir_tmp, file_mature_ref_other_species,
            longest_id)
    else:
        line = "core_algorithm.py {}/signature.arf {}/precursors.str -v -50 -l {}".format(
            dir_tmp, dir_tmp, longest_id)

    if options.get('-c') != '':
        line = line + " -y {}/precursors_for_randfold.rand".format(dir_tmp)

    cmd = "{} > {}/output.mrd\n".format(line, _dir)
    print_stderr(cmd)
    os.system(cmd)
    if options.get('-E'):
        os.system('{} -t > {}/error.output.mrd'.format(line, _dir))

    end()

    # Nothing more to do when the result file has content.
    fname = "{}/output.mrd".format(_dir)
    if file_s(fname):
        return

    print_stderr("Error:\n\tFile {} is empty\n\n".format(fname))
    print_stderr(
        "Now running core_algorithm.py with option -t to see why all precursors were discarded\n"
    )
    os.system('{} -t > error.output.mrd_{}'.format(line, ltime))
    print_stderr(
        "The debug file is called error.output.mrd_{}\n".format(ltime))
    die("\nExiting now\n\n")
def read_handler(handle):
    """Validate every row read from ``handle`` against the 13-column ARF layout.

    The module-level ``counter`` is incremented once per physical line so
    that error messages report the real line number.  Dies on the first
    non-blank row that does not match the expected format.

    End of file is detected on the raw line before stripping.  Previously
    the line was stripped first, so the first blank line ended the loop and
    the remainder of the file was never checked.  Blank lines are now
    skipped (but still counted).
    """
    global counter
    row_format = re.compile(
        r'^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+-])\s+(\d+)\s*([mDIM]*)$'
    )
    while True:
        rin = handle.readline()
        if not rin:  # real end of file
            break

        counter += 1
        rin = rin.strip()
        if not rin:  # blank line: nothing to validate
            continue

        if not row_format.match(rin):
            die('\nWrong format in line {}: The row\n{}\ndoes not correspond to the format\nread_id_wo_whitespaces\tlength\tstart\tend\tread_sequence         \tgenomicID_wo_whitspaces\tlength\tstart\tend\tgenomic_sequence       \tstrand\t#mismatches\teditstring\ne.g. read_22_x10000 \t22    \t1    \t22 \tagtcgtgactgactgactgacg\tchromosomeIII_x12312312\t22    \t1001 \t1022\tagtcgtgactgactgactgacg\t+-    \t0          \tmmmmmmmmmmmmmmmmmmmmm\nPlease make sure that all lines have the above described format.\n'.format(
                Nicenumber(counter),
                rin
            ))
def parse_file_command_line(file_command_line, file_structure, _dir):
    """Return the first non-blank line of ``file_command_line``, rewritten.

    In that line the first occurrence of ``file_structure`` is replaced by
    ``<_dir>/precursors_permuted.str`` and everything from the first ``>``
    that is followed by at least one character is removed.  Dies if the
    file contains no non-blank line.

    Changes from the previous version:

    * the file handle is always closed (it used to leak, also on the early
      return);
    * ``file_structure`` is replaced literally.  It used to be compiled as a
      regular expression, so ``.`` in a file name matched any character and
      backslashes in ``_dir`` were interpreted as template escapes.
    """
    FILE = open_or_die(file_command_line, 'rb',
                       'can not open {}'.format(file_command_line))
    try:
        while True:
            line = FILE.readline()
            if not line:
                break

            if re.search(r'(\S+)', line):
                line = line.strip()
                line = line.replace(
                    file_structure,
                    '{}/precursors_permuted.str'.format(_dir),
                    1)
                # Drop the output redirection of the recorded command.
                line = re.sub(r'>.+', '', line, count=1)
                return line
    finally:
        FILE.close()

    die('{} is empty\n'.format(file_command_line))
Example #11
0
def fold_precursors():
    '''
    Fold <dir_tmp>/precursors.fa with RNAfold into <dir_tmp>/precursors.str.

    RNAfold is first called with "-noPS" (stderr appended to
    error_<ltime>.log); if that fails it is retried with "--noPS".  Dies
    with the exit status if the retry fails as well.
    '''
    global dir_tmp, ltime

    for announce in (pprint, print_stderr):
        announce("#folding precursors\n")
    print_stderr(
        "RNAfold < {}/precursors.fa -noPS > {}/precursors.str\n\n".format(
            dir_tmp, dir_tmp))

    start()

    single_dash_cmd = "RNAfold < {}/precursors.fa -noPS > {}/precursors.str 2>>error_{}.log".format(
        dir_tmp, dir_tmp, ltime)
    double_dash_cmd = "RNAfold < {}/precursors.fa --noPS > {}/precursors.str".format(
        dir_tmp, dir_tmp)

    status = os.system(single_dash_cmd)
    if status:
        # Retry with the double-dash spelling of the option.
        status = os.system(double_dash_cmd)
    if status:
        die("Some RNAfold error occurred. Error {}\n".format(status))

    end()
Example #12
0
def read_handler(handle):
    """Validate a FASTA stream: identifiers without whitespace, ACGTUN sequences.

    The module-level ``counter`` is incremented once per physical line so
    that error messages report the real line number.  Every identifier is
    registered in the module-level ``hash_num`` and its count incremented.

    Changes from the previous version:

    * end of file is detected before stripping; a blank line used to end
      validation silently.  Blank lines are now skipped (but counted);
    * the sequence character class no longer contains ``|``, which made a
      literal pipe character an accepted "nucleotide".
    """
    global counter

    while True:
        rin = handle.readline()
        if not rin:  # real end of file
            break

        counter += 1
        rin = rin.strip()
        if not rin:  # blank line: nothing to validate
            continue

        m = re.match(r'^\>(.+)$', rin)
        if m:
            _id = m.group(1)

            if re.search(r'\s+', _id):
                die('Error in line {}: The identifier\n {}\n\ncontains white spaces\n\n{}\n\nYou could run remove_white_space_in_id.py inputfile > newfile\nThis will remove everything from the id line after the first whitespace\n'
                    .format(Nicenumber(counter), _id, hint))
            else:
                create_hash_key_chain(hash_num, 0, _id)
                hash_num[_id] += 1
        elif not re.match(r'^[ACGTUNacgtun]+$', rin):
            die('Error in line {}: The sequence\n{}\n\ncontains characters others than [acgtunACGTUN]\n\n{}'
                .format(Nicenumber(counter), rin, hint))
Example #13
0
def test_installed_binaries():
    """Die unless bowtie, RNAfold, randfold and Rfam_for_moR.fa are available.

    Each tool is probed with ``checkBIN(command, expected_text)``; a truthy
    result is treated as "not found".  The Rfam file is looked up in the
    ``scripts`` directory.  Returns 0 when every check passes.
    """
    global scripts
    stdm = 'If you used the install.py script make sure that you started a complete new shell window after installation.\nIf this did not help please restart youer workstation.\n\n'
    not_found = "Error: \t{0} not found\nCheck if {0} is correctly installed and all Pathes were set correctly.\n"

    # (tool name, probe command, text checkBIN is asked to look for)
    probes = (
        ("bowtie", "bowtie --version", "version"),
        ("RNAfold", "RNAfold -h", "gamma"),
        ("randfold", "randfold", "let7"),
    )
    for tool, command, marker in probes:
        if checkBIN(command, marker):
            die(not_found.format(tool), stdm)

    # TODO: the Perl PDF::API2 requirement is not checked here.

    rfam_file = '{}/Rfam_for_moR.fa'.format(scripts)
    if not os.path.isfile(rfam_file):
        die("Error:\t Rfam_for_moR.fa not found in your moRNA Finder scripts directory\nPlease copy this file from the moRNA Finder archive to your moRNA Finder scripts directory\n\n"
            )

    return 0
Example #14
0
    scripts = os.popen('which moR.py').read()
    # scripts = re.sub(r'moR.py', '', scripts, count=1)
    # scripts = re.sub(r'\s+', '', scripts)
    scripts = os.path.dirname(scripts) + '/'

    pprint('#Starting moRNA Finder\n')
    print_stderr('#Starting moRNA Finder\n{} {}\n\n'.format(
        sys.argv[0], ' '.join(sys.argv[1:])))
    print_stderr("moRNA Finder started at {}\n\n\n".format(sTimeG))

    test_first_argument()

    command_line = "{} {}\n".format(sys.argv[0], ' '.join(sys.argv[1:]))

    if len(sys.argv) < 7:
        die(usage)

    parser = argparse.ArgumentParser(usage=usage)
    parser.add_argument('reads', help='reads')
    parser.add_argument('genome', help='genome')
    parser.add_argument('mappings', help='mappings')
    parser.add_argument('miRNAs_ref', help='miRNAs_ref')
    parser.add_argument('miRNAs_other', help='miRNAs_other')
    parser.add_argument('precursors', help='precursors')
    args = parser.parse_args(sys.argv[1:7])
    file_reads = args.reads
    file_genome = args.genome
    file_reads_vs_genome = args.mappings
    file_mature_ref_this_species = args.miRNAs_ref
    file_mature_ref_other_species = args.miRNAs_other
    file_precursors = args.precursors
Example #15
0
            survey_known(score)

        if options.get('-a'):
            survey_signal_to_noise(score)

        if options.get('-d'):
            read_stack_min = options.get('-d')
            pprint('\t{}'.format(read_stack_min))

        pprint('\n')


if __name__ == '__main__':
    # The output file is the only mandatory positional argument.
    if len(sys.argv) < 2:
        die(usage)

    # argparse only sees the first positional argument; everything after it
    # is parsed separately with getopt below.
    parser = argparse.ArgumentParser(usage=usage)
    parser.add_argument('file_out', help='output file')
    args = parser.parse_args(sys.argv[1:2])

    file_out = args.file_out
    # Options -a, -b, -c and -d each take a value.
    opts, argss = getopt.getopt(sys.argv[2:], 'a:b:c:d:')
    options = dict(opts)

    # -b and -c are only meaningful as a pair: reject exactly-one-given.
    if (options.get('-b') and not options.get('-c')) or (not options.get('-b') and options.get('-c')):
        die('options -b and -c must be used in conjunction\n')

    # With both present, the -b file is parsed into hash_ref.
    if options.get('-b') and options.get('-c'):
        parse_file_ref(options.get('-b'), hash_ref)
Example #16
0
def _read_head_lines(handle, limit=4):
    """Return up to ``limit`` stripped leading lines, stopping at EOF or a blank line."""
    head = []
    while len(head) < limit:
        text = handle.readline().strip()
        if not text:
            break
        head.append(text)
    return head


def check_file_format_and_option(file_reads, aFormat):
    """Spot-check the first four lines of ``file_reads`` against a declared format.

    ``aFormat`` selects the check: ``a`` (seq.txt, 5 columns), ``b``
    (qseq.txt, 11 columns with a non-empty 10th field), ``c`` (FASTA) or
    ``e`` (FASTQ).  The letter may be given with or without a leading dash;
    any other value only emits the leading newline on stderr.  Dies with an
    explanatory message on the first offending line.

    Changes from the previous version:

    * the FASTA/FASTQ branches compared against ``'-c'``/``'-e'`` while the
      other branches (and the callers visible in this file) use bare
      letters, so those checks never ran; both spellings are accepted now;
    * the FASTQ checks for lines 2-4 were inverted (they died on *valid*
      lines);
    * the warning text carried un-interpolated ``$format``/``$file``
      placeholders;
    * the reads file is always closed.
    """
    print_stderr('\n')
    fmt = aFormat.lstrip('-')
    warning = '''\n\n***** Please check if the option you used (options -{}) designates the correct format of the supplied reads file {} *****\n\n
[options]
-a              input file is seq.txt format
-b              input file is qseq.txt format
-c              input file is fasta format
-e              input file is fastq format
-d              input file is a config file (see moRNA Finder documentation).
                options -a, -b, -c or -e must be given with option -d.
'''.format(fmt, file_reads)

    open_errors = {
        'a': 'Cannot open file {} supplied by option -a\n'.format(file_reads),
        'b': 'Cannot open qseq.txt file {} supplied by option -b\n'.format(file_reads),
        'c': 'Cannot open FASTA file supplied by option -c\n',
        'e': 'Cannot open FASTQ file supplied by option -e\n',
    }
    if fmt not in open_errors:
        return

    IN = open_or_die(file_reads, 'rb', open_errors[fmt])
    try:
        head = _read_head_lines(IN)
    finally:
        IN.close()

    if fmt == 'a':
        for text in head:
            if len(esplit(text)) != 5:
                die('The seq.txt file does not contain 5 columns. Please make sure to follow the _seq.txt file format conventions\n{}'.format(warning))
    elif fmt == 'b':
        for text in head:
            fields = esplit(text)

            if len(fields) != 11:
                die('The qseq.txt file does not contain 11 columns but {}. Please make sure to follow the qseq.txt file format conventions\n{}'.format(
                    len(fields), warning))

            if not re.search(r'^\S+', fields[9]):
                die('The sequence field in the qseq.txt file is invalid. Please make sure to follow the qseq.txt file format conventions\n{}'.format(warning))
    elif fmt == 'c':
        mes = 'Please make sure your file is in accordance with the fasta format specifications and does not contain whitespace in IDs or sequences'
        # One (required pattern, failure message) pair per leading line.
        checks = (
            (r'^>\S+$',
             "First line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                 mes, warning)),
            (r'^\S+$',
             "Second line of FASTA reads file contains whitespace in sequence\n{}\n".format(
                 mes)),
            (r'^>\S+$',
             "Second ID line of FASTA reads file is not in accordance with the fasta format specifications\n{}\n{}".format(
                 mes, warning)),
            (r'^\S+$',
             "Secdond sequence line of FASTA reads file contains whitespace in sequence\n{}\n{}".format(
                 mes, warning)),
        )
        for text, (pattern, message) in zip(head, checks):
            if not re.search(pattern, text):
                die(message)
    else:  # fmt == 'e'
        mes = 'Please make sure your file is in accordance with the FASTQ format specifications'
        checks = (
            (r'^@\S+',
             "First line of FASTQ reads file is not in accordance with the fastq format specifications\n{}\n{}".format(
                 mes, warning)),
            (r'^\S+$',
             "Second line of FASTQ reads file contains whitespace in sequence\n{}\n{}".format(
                 mes, warning)),
            (r'^\+',
             "Third line of FASTQ reads file does not start with a '+' character.\n{}\n{}".format(
                 mes, warning)),
            (r'^\S+$',
             "Fourth line of FASTQ reads file contains whitespace\n{}\n{}".format(
                 mes, warning)),
        )
        for text, (pattern, message) in zip(head, checks):
            if not re.search(pattern, text):
                die(message)
Example #17
0
def test_prefix(prefix):
    """Die unless ``prefix`` is exactly three word characters without an underscore.

    The error message used to contain the literal text ``$prefix`` (an
    un-interpolated Perl placeholder); it now shows the offending value.
    """
    if not (re.search(r'^\w\w\w$', prefix) and not re.search(r'_', prefix)):
        die('prefix {} does not contain exactly three alphabet letters\n'
            .format(prefix))
Example #18
0
def check_options(options, file_reads):
    """Validate the combination of command-line options; die on the first problem.

    ``options`` maps option flags (``'-a'`` ...) to their values.  Switches
    are tested with ``== ''`` and valued options by truthiness -- presumably
    the mapping comes from ``dict(getopt.getopt(...)[0])``, where a switch
    maps to an empty string (TODO confirm against the caller).

    Unless ``-d`` is given, the reads file is spot-checked against the
    declared input format.

    Changes from the previous version:

    * "raw illumina output must be parsed with -h" fired for every use of
      ``-a`` or ``-b`` because ``and`` bound tighter than ``or``;
    * the "-d needs a format option" test was malformed and fired for every
      use of ``-d``;
    * ``-q`` was checked against the key ``'p'`` instead of ``'-p'`` and
      therefore always rejected;
    * a non-numeric ``-o`` value such as ``1a`` is now rejected with the
      regular error message instead of raising ``ValueError``.
    """
    def given(flag):
        # True when the switch is present (stored with an empty value).
        return options.get(flag) == ''

    formats = 0
    for letter in ('a', 'b', 'c', 'e'):
        if given('-' + letter):
            formats += 1
            if not given('-d'):
                check_file_format_and_option(file_reads, letter)

    if formats != 1:
        die('exactly one input format (-a, -b , -e or -c) must be designated\n')

    # Steps that can follow once the reads are available in fasta format.
    downstream_step = (given('-i') or given('-j') or options.get('-k')
                       or options.get('-l') or given('-m')
                       or options.get('-p'))
    any_step = given('-h') or downstream_step
    # Reads are fasta either natively (-c) or after parsing (-h).
    fasta_ready = given('-c') or given('-h')

    if not any_step:
        die('at least one processing/mapping step (-h, -i, -j, -k, -l, -m or -p) must be designated\n')

    if '-o' in options:
        threads = options.get('-o')
        if not (re.search(r'^\d+$', threads) and int(threads) > 0):
            die('options -o must be positive integer\n')

    if not (options.get('-s') or options.get('-t')):
        die('at least one output file (-s or -t) must be designated\n')

    # Existing output files are only accepted when -n is given.
    if options.get('-s') and os.path.exists(options.get('-s')) and not given('-n'):
        die("file {} already exists\n".format(options.get('-s')))

    if options.get('-t') and os.path.exists(options.get('-t')) and not given('-n'):
        die("file {} already exists\n".format(options.get('-t')))

    if (given('-a') or given('-b') or given('-e')) and not given('-h'):
        die("raw illumina output must be parsed to fasta format with options -h\n")

    if given('-c') and given('-h'):
        die("input file is already designated as a fasta file, so option -h should not be used\n")

    if given('-c') and not downstream_step:
        die("at least one processing/mapping step (-i, -j, -k, -l, -m or -p) must be designated\n")

    if given('-d') and not (given('-a') or given('-b') or given('-c') or given('-e')):
        die("option -d must be given with option -a, -b, -c or -e \n")

    if given('-d') and options.get('-g'):
        die("option -d and -g are mutually exclusive. If -d is given, the prefixes must be contained in the config file\n")

    if options.get('-g'):
        test_prefix(options.get('-g'))

    if given('-i') and not fasta_ready:
        die("option -i must be used on reads in fasta format or with option -h \n")

    if given('-j') and not fasta_ready:
        die("option -j must be used on reads in fasta format or with option -h \n")

    if options.get('-k') and not fasta_ready:
        die("option -k must be used on reads in fasta format or with option - h\n")

    if options.get('-l') and not fasta_ready:
        die("option -l must be used on reads in fasta format or with option -h \n")

    if given('-m') and not fasta_ready:
        die("option -m must be used on reads in fasta format or with option -h \n")

    if options.get('-p') and not fasta_ready:
        die("option -p must be used on reads in fasta format or with option -h\n")

    if given('-q') and not options.get('-p'):
        die("option -q must be given with option -p\n")

    if options.get('-s') and not any_step:
        die("at least one processing step (-h, -i, -j, -k, -l, -m or -p) must be designated if processed file should be output (-s)\n")

    if options.get('-t') and not options.get('-p'):
        die("reads must be mapped (-p) if mappings are to be output (-t)\n")

    # A value starting with '-' means the next option was swallowed as the
    # argument of this one.
    if options.get('-k') and re.search(r'^-', options.get('-k')):
        die("please make sure that the adapter sequence designated with the -k option is correct\n")

    if options.get('-l') and re.search(r'^-', options.get('-l')):
        die("please make sure that the int given with the -l option is correct\n")

    if options.get('-p') and re.search(r'ebwt$', options.get('-p')):
        die("please make sure that you are using the -p option correctly.\nThe argument given after -p must be the _prefix_ of the bowtie\nindexed files and should not contain 'ebwt'. For instance,\nif the first indexed file is called 'h_sapiens_37_asm.1.ebwt'\nthen the prefix is 'h_sapiens_37_asm'.\n")

    if options.get('-p') and re.search(r'^-', options.get('-p')):
        die("please make sure that the genome index designated with the -p option is correct\n")

    # added by SM to check if bowtie is installed when reads should be mapped
    # to genome
    if options.get('-p'):
        # TODO: my $binst=`bowtie --version 2>&1`;
        binst = os.system('bowtie --version 2>&1')
        if binst:
            printErr()
            die("Bowtie mapping tool not installed.\nPlease download from http://downloads.sourceforge.net/project/bowtie-bio/bowtie/ the latest version and install it.\n\n")

    if options.get('-s') and re.search(r'^-', options.get('-s')):
        die("please make sure that the output file designated with the -s option is correct\n")

    if options.get('-t') and re.search(r'^-', options.get('-t')):
        die("please make sure that the output file designated with the -t option is correct\n")
                        hash_seq[tag]
                    except KeyError:
                        hash_seq[tag] = {}

                    hash_seq[tag][mm[0]] = counter

            else:
                die('Error in line {}: Either the sequence\n\n{}\n\ncontains less than 17 characters or contains characters others than [acgtunACGTUN]\n\n\nPlease make sure that your file only comprises sequences that have at least 17 characters\n\ncontaining letters [acgtunACGTUN]\n'.format(
                    Nicenumber(counter),
                    rin
                ))


if __name__ == '__main__':
    # The file to check is the only required argument.
    if len(sys.argv) < 2:
        die(usage)

    in_file = sys.argv[1]
    # FILE = open_or_die(in_file, 'rb', "Can't open {}".format(in_file))
    # while True:
    #     aBuffer = FILE.read(4096)
    #     if not aBuffer:
    #         break

    #     lines += int(tr(aBuffer, '\n', ''))
    # FILE.close()
    # Count the lines with the external `wc -l`; the count is the first
    # whitespace-separated token of its output.
    lines = os.popen("wc -l {}".format(in_file)).read().strip()
    lines = int(re.split(r'\s', lines)[0])

    # NOTE(review): lines / 2 presumably approximates the number of records
    # (one id line plus one sequence line each) -- confirm.
    if lines / 2 > 5000000:
        rhash = {}
def read_handler(handle):
    """Validate a collapsed-reads FASTA stream read from ``handle``.

    Identifier lines must look like ``>xxx_<number>_x<number>`` where
    ``xxx`` is a three-character sample tag; sequence lines must consist of
    at least 17 characters from ``ACGTUNacgtun`` and be unique within their
    sample.  Dies on the first violation.

    Module-level state updated: ``counter`` (line number), ``hash_num``
    (occurrences per identifier remainder after the tag) and ``hash_seq``
    (per tag: sequence -> line of first occurrence).

    Changes from the previous version:

    * the duplicate lookup used the undefined name ``seq`` instead of the
      matched sequence;
    * the sequence character class no longer contains ``|``, which made a
      literal pipe character an accepted "nucleotide";
    * a sequence line before any identifier line is reported with ``die``
      instead of failing on an unbound ``tag``.
    """
    global counter
    tag = None
    while True:
        rin = handle.readline()
        if not rin:
            break

        rin = rin.strip()

        counter += 1
        m = re.match(r'^\>(\S\S\S)(.+)$', rin)
        if m:
            tag, remainder = m.groups()
            _id = '{}{}'.format(tag, remainder)

            if re.search(r'\s+', _id):
                die('Error in line {}: The identifier\n \n{}\n \ncontains white spaces\n\n\nPlease make sure that none of the identifiers contain whitepaces.\nYou could run remove_white_space_in_id.py {} > newfile\nThis will remove everything from the id line after the first whitespace'.format(
                    Nicenumber(counter),
                    rin,
                    in_file
                ))
            elif not re.match(r'^(\S\S\S)_(\d+)_(x\d+)$', _id):
                die('Error in line {}: The identifier\n\n{}\n\nhas to have the format\nname_uniqueNumber_xnumber\n\n\nPlease make sure that all identifiers are unique and have the format described above.'.format(
                    Nicenumber(counter),
                    _id,
                ))
            else:
                try:
                    hash_num[remainder] += 1
                except KeyError:
                    hash_num[remainder] = 1

        else:
            mm = re.match(r'^([ACGTUNacgtun]{17,})$', rin)
            if mm:
                sequence = mm.group(1)

                if tag is None:
                    die('Error in line {}: The sequence\n\n{}\n\nis not preceded by an identifier line\n'.format(
                        Nicenumber(counter),
                        sequence
                    ))

                # Sequences already seen for this sample, keyed by sequence.
                sample_seqs = hash_seq.setdefault(tag, {})
                first_line = sample_seqs.get(sequence)

                if first_line:
                    die('Error in line {}: The sequence\n\n{}\n\noccures at least twice in sample {} in your reads file.\n\nAt first it occured at line {}\n\nPlease make sure that your reads file only contains unique sequences within each sample.\n'.format(
                        Nicenumber(counter),
                        sequence,
                        tag,
                        Nicenumber(first_line)
                    ))
                else:
                    sample_seqs[sequence] = counter

            else:
                die('Error in line {}: Either the sequence\n\n{}\n\ncontains less than 17 characters or contains characters others than [acgtunACGTUN]\n\n\nPlease make sure that your file only comprises sequences that have at least 17 characters\n\ncontaining letters [acgtunACGTUN]\n'.format(
                    Nicenumber(counter),
                    rin
                ))
Example #21
0
        if not line:
            break
        l = esplit(line)
        if l[5] not in index.keys():
            count += 1
            index[l[5]] = count

        IKT.write('{}\t{}'.format(index[l[5]], line))

    IK.close()
    IKT.close()


if __name__ == '__main__':
    # Three positional arguments are mandatory.
    if len(sys.argv) < 4:
        die(usage)

    # The usage text must be passed by keyword: the first positional
    # parameter of ArgumentParser is ``prog``, not ``usage``.
    parser = argparse.ArgumentParser(usage=usage)
    parser.add_argument('file_reads', help='file reads')
    parser.add_argument('file_precursors', help='file precursors')
    parser.add_argument('read_align_edit_distance',
                        help='read align edit distance')
    args = parser.parse_args(sys.argv[1:4])

    file_reads = args.file_reads
    file_precursors = args.file_precursors
    read_align_edit_distance = args.read_align_edit_distance

    # Remaining arguments: -a and -o take a value, -b is a switch.
    opts, argss = getopt.getopt(sys.argv[4:], "a:bo:")
    options = dict(opts)
Example #22
0
# Message templates shared by the FASTA pre-checks.  {0} is the file path and
# {1} the human-readable kind of file being checked.
_MSG_BAD_HEADER = ("The first line of file {0} does not start with "
                   "'>identifier'\n{1} file {0} is not a fasta file\n\n")
_MSG_BAD_IDENTIFIER = ('{1} file {0} has not allowed whitespaces in its '
                       'first identifier\n\n')
_MSG_BAD_SEQUENCE = ('File {0} contains not allowed characters in sequences\n'
                     'Allowed characters are ACGTUN\n'
                     '{1} file {0} is not a fasta file\n\n')


def _input_error(message):
    """Report an input-file problem via printErr() and abort with die()."""
    printErr()
    die(message)


def _read_leading_lines(path, count):
    """Return the first `count` raw lines of `path`, closing the file."""
    handle = open_or_die2(path, 'rb')
    try:
        return [handle.readline() for _ in range(count)]
    finally:
        # Close before any validation so no handle is left open if the
        # caller aborts on a bad file.
        handle.close()


def _precheck_fasta(path, kind, header_msg=None, identifier_msg=None):
    """Check the first header and first sequence line of a FASTA file.

    `kind` names the file in the error messages.  `header_msg` and
    `identifier_msg` override the default messages for a missing
    '>identifier' header and for whitespace inside the header.  Returns the
    raw (unstripped) first sequence line.
    """
    header, first_sequence = _read_leading_lines(path, 2)
    header = header.strip()

    # Anchored on purpose: the message promises the line *starts with*
    # '>identifier', so a '>' further into the line must not pass.
    if not re.search(r'^>\S+', header):
        _input_error(header_msg or _MSG_BAD_HEADER.format(path, kind))

    if re.search(r'\s', header):
        _input_error(identifier_msg or _MSG_BAD_IDENTIFIER.format(path, kind))

    # '$' also matches just before a trailing newline, so the raw line can be
    # tested without stripping it.
    if not re.search(r'^[ACGTUNacgtun]*$', first_sequence):
        _input_error(_MSG_BAD_SEQUENCE.format(path, kind))

    return first_sequence


def _die_if_output(cmd, path):
    """Run `cmd` through the shell and abort if it prints anything."""
    report = os.popen(cmd).read().strip()
    if report:
        printErr()
        die("problem with {} {}\n".format(path, report))


def test_input_files():
    """Validate all input files and optionally pre-quantify known miRNAs.

    Reads the module-level settings file_reads, file_reads_vs_genome,
    file_genome, file_precursors, file_mature_ref_this_species,
    file_mature_ref_other_species, minpreslen, options and ltime.

    Steps, in order:
      1. Cheap pre-check of the first lines of each FASTA file and of the
         first line of the arf mapping file.
      2. Check that every reference id in the mapping file (column 6) is a
         header of the genome file.
      3. Run the external sanity_check_*.py scripts; any output from them is
         treated as an error.
      4. If a precursor file and a known-miRNA file were given, run
         quantifier.py and store the resulting .mrd path in options['-q'].

    Every failure is reported through die(); nothing is returned.
    """
    # ---- step 1: quick look at the first lines of every file ------------
    # The reads file uses its own wording for the first two messages.
    _precheck_fasta(
        file_reads, 'Reads',
        header_msg="The first line of file {0} does not start with '>identifier'\nReads file {0} is not a valid fasta file\n\n".format(file_reads),
        identifier_msg=_MSG_BAD_SEQUENCE.format(file_reads, 'Reads'))

    _precheck_fasta(file_genome, 'Genome')

    # get genome ids (full header lines, including the leading '>')
    genome_headers = os.popen(
        'grep ">" {}'.format(file_genome)).read().strip()
    genomeids = set(genome_headers.split("\n"))

    first_mapping = _read_leading_lines(file_reads_vs_genome, 1)[0]
    if not re.search(
            r'^(\S+_x\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+-])\s+(\d+)\s*([mDIM]*)$',
            first_mapping):
        _input_error(
            'Mapping file {} is not in arf format\n\nEach line of the mapping file must consist of the following fields\nreadID_wo_whitespaces  length  start  end read_sequence genomicID_wo_whitspaces length  start   end     genomic_sequence  strand  #mismatches editstring\nThe editstring is optional and must not be contained\nThe readID must end with _xNumber and is not allowed to contain whitespaces.\nThe genomeID is not allowed to contain whitespaces.'
            .format(file_reads_vs_genome))

    # ---- step 2: ids from the arf file must exist in the genome file ----
    mapped_ids = os.popen(
        'cut -f6 {}|sort -u'.format(file_reads_vs_genome)).read().strip()
    for ref_id in mapped_ids.split("\n"):
        if ">{}".format(ref_id) not in genomeids:
            # This failure is reported without printErr(), as before.
            die("The mapped reference id {} from file {} is not an id of the genome file {}\n\n"
                .format(ref_id, file_reads_vs_genome, file_genome))

    if not re.search('none', file_mature_ref_this_species):
        _precheck_fasta(file_mature_ref_this_species,
                        'miRNA reference this species')

    if not re.search('none', file_mature_ref_other_species):
        _precheck_fasta(file_mature_ref_other_species,
                        'miRNA reference other species')

    if not re.search('none', file_precursors):
        first_precursor = _precheck_fasta(file_precursors, 'precursor')

        # NOTE: the raw line is measured, so a trailing newline is counted.
        if len(first_precursor) < minpreslen:
            _input_error(
                "The precursor file {} does not contain sequences of at least {} nt\nPlease make sure that you provided the correct file and the correct parameter ordering when calling {}\nIf you have precursors with less than {} please use option -p <int> to specify this length\n"
                .format(file_precursors, minpreslen, sys.argv[0], minpreslen))

    # #################################################
    # precheck finished
    # #################################################

    # ---- step 3: stringent testing of all input files -------------------
    pprint("#testing input files\n")
    print_stderr("#testing input files\n")

    if not re.search('none', file_mature_ref_this_species):
        start()
        cmd = "sanity_check_mature_ref.py {} 2>&1\n\n".format(
            file_mature_ref_this_species)
        print_stderr(cmd)
        _die_if_output(cmd, file_mature_ref_this_species)
        end()

    if not re.search(r'none', file_mature_ref_other_species):
        start()
        cmd = "sanity_check_mature_ref.py {} 2>&1\n\n".format(
            file_mature_ref_other_species)
        print_stderr(cmd)
        _die_if_output(cmd, file_mature_ref_other_species)
        end()

    cmd = "sanity_check_reads_ready_file.py {} 2>&1\n\n".format(file_reads)
    print_stderr(cmd)
    start()
    _die_if_output(cmd, file_reads)
    end()

    start()
    cmd = "sanity_check_genome.py {} 2>&1;\n\n".format(file_genome)
    print_stderr(cmd)
    _die_if_output(cmd, file_genome)
    end()

    start()
    cmd = "sanity_check_mapping_file.py {} 2>&1".format(file_reads_vs_genome)
    print_stderr(cmd)
    _die_if_output(cmd, file_reads_vs_genome)
    end()

    if not re.search('none', file_precursors):
        start()
        cmd = "sanity_check_mature_ref.py {} 2>&1".format(file_precursors)
        print_stderr(cmd)
        _die_if_output(cmd, file_precursors)
        end()

        # ---- step 4: optional pre-quantitation of known miRNAs ----------
        start()
        if not re.search('none', file_mature_ref_this_species, re.IGNORECASE):
            print_stderr("Quantitation of expressed miRNAs in data\n\n\n")

            species = ''
            if options.get('-t'):
                species = "-t {}".format(options.get('-t'))

            file_star = ''
            if options.get('-s'):
                if file_s(options.get('-s')):
                    file_star = "-s {}".format(options.get('-s'))
                else:
                    print_stderr(
                        "File {} specified by option -s is empty or not found\n"
                        .format(options.get('-s')))
                    # Disable the option for the rest of the run.
                    options['-s'] = 0

            print("#Quantitation of known miRNAs in data\n")
            # -d / -P are flag options: getopt stores them with an empty
            # string value when present.
            dopt = "-d" if options.get('-d') == '' else ""
            Popt = "-P" if options.get('-P') == '' else ""

            quant = "quantifier.py -p {} -m {} -r {} {} {} -y {} -k {} {}".format(
                file_precursors, file_mature_ref_this_species, file_reads,
                file_star, species, ltime, dopt, Popt)
            print_stderr(quant, "\n")
            # The exit status is deliberately not checked here (best effort).
            os.system(quant)
            options[
                '-q'] = "expression_analyses/expression_analyses_{}/miRBase.mrd".format(
                    ltime)

            end()
        else:
            print_stderr(
                "Pre-quantitation is skipped caused by missing file with known miRNAs\n\n\n"
            )

    else:
        print_stderr(
            "Pre-quantitation is skipped caused by missing file with known precursor miRNAs\n\n\n"
        )
        if re.search(r'(\S+)', line):
            line = line.strip()
            line = re.sub(file_structure,
                          '{}/precursors_permuted.str'.format(_dir),
                          line,
                          count=1)
            line = re.sub(r'>.+', '', line, count=1)
            return line

    die('{} is empty\n'.format(file_command_line))


if __name__ == '__main__':
    # Script entry point: three positional arguments are mandatory, so abort
    # with the usage text when fewer were supplied.
    if len(sys.argv) < 4:
        die(usage)

    parser = argparse.ArgumentParser(usage=usage)
    parser.add_argument('file_command_line', help='command list file')
    parser.add_argument('file_structure', help='structure file')
    parser.add_argument('rounds', help='rounds')
    # Only the first three arguments are parsed as positionals; the rest of
    # the command line is left for getopt below.
    args = parser.parse_args(sys.argv[1:4])

    # Publish the parsed values as module-level names.
    file_command_line = args.file_command_line
    file_structure = args.file_structure
    rounds = int(args.rounds)

    # The only recognised short option after the positionals is the flag -a.
    opt, argss = getopt.getopt(sys.argv[4:], "a")
    options = dict(opt)

    # Whole-second timestamp of this run.  `long` is a Python 2 builtin, so
    # this line requires a Python 2 interpreter.
    ltime = long(time.time())
Example #24
0
#!/usr/bin/env python
from __future__ import print_function

import re
import sys

from port import die, esplit, open_or_die

if __name__ == '__main__':
    # Script entry point: the csv file to convert is the first argument.
    if len(sys.argv) < 2:
        die('No csv file given for bed conversion\n')

    # Pre-declare the working variables.  Note that `line` appears twice in
    # the target list; both slots receive None, so this is harmless.
    known, novel, _not, line, thres, score, line, strand, label, end = (
        None, None, None, None, None, None, None, None, None, None)

    IN = open_or_die(sys.argv[1], 'r', 'cannot open {}\n'.format(sys.argv[1]))
    # Read the file line by line until readline() returns an empty string.
    while True:
        line = IN.readline()
        if not line:
            break

        # Section header lines switch which kind of entry is being read;
        # the three flags are reset together on every header.
        if re.search(r'novel miRNAs predicted by moRNA Finder', line):
            novel = 1
            known = 0
            _not = 0
        elif re.search(r'mature miRBase miRNAs detected', line):
            novel = 0
            known = 1
            _not = 0
        else:
            # Any other line is split into fields with the project helper
            # esplit (presumably on whitespace/tabs -- confirm in port.py).
            l = esplit(line)
Example #25
0
def run_bowtie_cmd(fafile, fa_prefix):
    """Run ``bowtie-build`` on `fafile`, writing the index as `fa_prefix`.

    The command line is echoed through print_stderr before it is executed
    with os.system; a non-zero exit status is reported through die().
    """
    build_command = 'bowtie-build {} {}'.format(fafile, fa_prefix)
    print_stderr(build_command, '\n')

    exit_status = os.system(build_command)
    if exit_status != 0:
        die('Run bowtie failed.\n')