Ejemplo n.º 1
0
def mature_generator(lines):
    """Screen putative precursors and build an output record for each survivor.

    ``lines`` is consumed three entries at a time: an info line, a sequence
    line and a dot-bracket structure line. Reading stops at the end of the
    input, at the first blank info line, or when fewer than three entries
    remain (an incomplete trailing record is dropped).

    A record is discarded when any of the following holds:
      * DISCARD_NO_READ_PREC_FLAG is set and SeqModule.check_no_read_prec
        returns True for it;
      * the left half of its structure has no "(" at all, or its ")" / "("
        ratio exceeds NON_CANONICAL_PREC_FACTOR;
      * SeqModule.star_identifier_v2 returns all-zero coordinates.

    Reads the module-level ``map_data`` and configuration constants.

    Returns:
        list of the values returned by SeqModule.generate_output_form, in
        input order.
    """
    output_list = []
    iterator = 0
    # each loop reads exactly 3 lines; never index past the end of the input
    while iterator + 2 < len(lines):
        line_info = lines[iterator].strip()
        if line_info == "":
            break
        line_seq = lines[iterator + 1].strip()
        line_db = lines[iterator + 2].strip()
        iterator += 3

        # if no read data is matched in putative precursors, discard it
        if DISCARD_NO_READ_PREC_FLAG and SeqModule.check_no_read_prec(
                line_info, map_data, MIN_READ_COUNT_THRESHOLD) is True:
            continue

        # check conserved sequence with blastn; a match updates line_info
        # TODO: the "updated" flag returned here is currently ignored; the
        # duplex search below runs for conserved sequences as well.
        if ANNOTATE_FLAG in ('true', 'True'):
            line_info, _ = SeqModule.check_conserved_seq(
                line_info, line_seq, blastn_path, mirbase_path,
                ARM_EXTEND_THRESHOLD)

        # Discard non-canonical (i.e. "hard to identify") precursors:
        # an asymmetric dot-bracket structure gives low accuracy, makes the
        # star sequence hard to identify and produces too many outputs.
        # A large ")" portion in the left half marks it as non-canonical.
        # Floor division keeps the slice index an int on Python 2 and 3.
        line_db_left = line_db[:len(line_db) // 2]
        num_open = line_db_left.count("(")
        num_close = line_db_left.count(")")
        # no "(" in the left half: the ratio is undefined, so discard as well
        if num_open == 0:
            continue
        if float(num_close) / num_open > NON_CANONICAL_PREC_FACTOR:
            continue

        # find valid star sequence from putative precursors
        start_5p, end_5p, start_3p, end_3p = SeqModule.star_identifier_v2(
            line_db, MATURE_MIN_LEN, MATURE_MAX_LEN, MAX_SERIAL_MISMATCH,
            MAX_MULT_MISMATCH, MAX_SERIAL_BULGE, MAX_MULT_BULGE)
        if start_5p == 0 and end_5p == 0 and start_3p == 0 and end_3p == 0:
            continue  # star seq not found

        # build the output record for this putative precursor
        output_list.append(
            SeqModule.generate_output_form(line_info, line_seq, line_db,
                                           start_5p, start_3p, end_5p, end_3p,
                                           map_data, MIN_READ_COUNT_THRESHOLD))
    return output_list
Ejemplo n.º 2
0
def mature_generator(lines):
    """Screen putative precursors and build an output record for each survivor.

    ``lines`` is consumed three entries at a time: an info line, a sequence
    line and a dot-bracket structure line. Reading stops at the end of the
    input, at the first blank info line, or when fewer than three entries
    remain (an incomplete trailing record is dropped).

    A record is discarded when any of the following holds:
      * DISCARD_NO_READ_PREC_FLAG is set and SeqModule.check_no_read_prec
        returns True for it;
      * the left half of its structure has no "(" at all, or its ")" / "("
        ratio exceeds NON_CANONICAL_PREC_FACTOR;
      * SeqModule.star_identifier_v2 returns all-zero coordinates.

    Reads the module-level ``map_data`` and configuration constants.

    Returns:
        list of the values returned by SeqModule.generate_output_form, in
        input order.
    """
    output_list = []
    iterator = 0
    # each loop reads exactly 3 lines; never index past the end of the input
    while iterator + 2 < len(lines):
        line_info = lines[iterator].strip()
        if line_info == "":
            break
        line_seq = lines[iterator + 1].strip()
        line_db = lines[iterator + 2].strip()
        iterator += 3

        # if no read data is matched in putative precursors, discard it
        if DISCARD_NO_READ_PREC_FLAG and SeqModule.check_no_read_prec(
                line_info, map_data, MIN_READ_COUNT_THRESHOLD) is True:
            continue

        # check conserved sequence with blastn; a match updates line_info
        # TODO: the "updated" flag returned here is currently ignored; the
        # duplex search below runs for conserved sequences as well.
        if ANNOTATE_FLAG in ('true', 'True'):
            line_info, _ = SeqModule.check_conserved_seq(
                line_info, line_seq, blastn_path, mirbase_path,
                ARM_EXTEND_THRESHOLD)

        # Discard non-canonical (i.e. "hard to identify") precursors:
        # an asymmetric dot-bracket structure gives low accuracy, makes the
        # star sequence hard to identify and produces too many outputs.
        # A large ")" portion in the left half marks it as non-canonical.
        # Floor division keeps the slice index an int on Python 2 and 3.
        line_db_left = line_db[:len(line_db) // 2]
        num_open = line_db_left.count("(")
        num_close = line_db_left.count(")")
        # no "(" in the left half: the ratio is undefined, so discard as well
        if num_open == 0:
            continue
        if float(num_close) / num_open > NON_CANONICAL_PREC_FACTOR:
            continue

        # find valid star sequence from putative precursors
        start_5p, end_5p, start_3p, end_3p = SeqModule.star_identifier_v2(
            line_db, MATURE_MIN_LEN, MATURE_MAX_LEN, MAX_SERIAL_MISMATCH,
            MAX_MULT_MISMATCH, MAX_SERIAL_BULGE, MAX_MULT_BULGE)
        if start_5p == 0 and end_5p == 0 and start_3p == 0 and end_3p == 0:
            continue  # star seq not found

        # build the output record for this putative precursor
        output_list.append(
            SeqModule.generate_output_form(line_info, line_seq, line_db,
                                           start_5p, start_3p, end_5p, end_3p,
                                           map_data, MIN_READ_COUNT_THRESHOLD))
    return output_list
Ejemplo n.º 3
0
def _passes_cut_site_screen(site_counts, position, ref_length):
    """Return True if ``position`` passes the Drosha/Dicer cutting-site screen.

    ``site_counts`` is indexed by reference position. The site fails when
    its own count is below 3, when a neighbour up to 19 positions away has a
    larger count, or when either DOMINANT_FACTOR ratio check fails. An offset
    is ignored when its left or right neighbour falls outside
    ``[0, ref_length)``.
    """
    count = site_counts[position]
    if count < 3:
        return False
    count_region = count  # site plus all in-range neighbours (offsets 1..19)
    count_sites = count   # site plus its closest neighbours (offsets 1..2)
    for offset in range(1, 20):
        left = position - offset
        right = position + offset
        if left < 0 or right >= ref_length:
            continue
        if site_counts[left] > count or site_counts[right] > count:
            return False
        count_region += site_counts[left]
        count_region += site_counts[right]
        if offset < 3:
            count_sites += site_counts[left]
            count_sites += site_counts[right]
    return not (float(count_sites) / count_region < DOMINANT_FACTOR
                or float(count) / count_sites < DOMINANT_FACTOR / 2.0)


def _fold_window(ref_seq, start, end, strand):
    """Fold ``ref_seq[start:end]`` with RNAfold and evaluate the result.

    For strand "-" the window is first passed through SeqModule.create_star.

    Returns a pair ``(skip_rest, candidate)``:
      * ``skip_rest`` is True when the window contains "N" or its structure
        is non-canonical; the caller then abandons the current distance step.
      * ``candidate`` is ``(seq, structure, start, end, abs_energy,
        norm_abs_energy)`` when the parsed energy reaches MIN_ABS_MFE,
        otherwise None.
    """
    if strand == "+":
        rna_fold_seq = ref_seq[start:end]
    else:
        rna_fold_seq = SeqModule.create_star(ref_seq[start:end])
    if "N" in rna_fold_seq:
        return True, None

    # universal_newlines=True makes the pipes text-mode, so a str sequence
    # can be sent and str output parsed on both Python 2 and Python 3.
    rnafold = subprocess.Popen(
        [RNAfold_path, "--noconv", "-d2", "--noPS", "--noLP"],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE,
        universal_newlines=True)
    output = rnafold.communicate(rna_fold_seq)[0].split()

    # Discard non-canonical (i.e. "hard to identify") precursors.
    # Floor division keeps the slice index an int on Python 2 and 3.
    structure = output[1].strip("\n")
    half = len(structure) // 2
    structure_left = structure[:half]
    structure_right = structure[half:]
    num_open_left = structure_left.count("(")
    num_close_left = structure_left.count(")")
    num_open_right = structure_right.count("(")
    num_close_right = structure_right.count(")")
    if num_open_left == 0 or num_close_right == 0:
        return True, None
    if (float(num_close_left) / num_open_left > NON_CANONICAL_PREC_FACTOR
            or float(num_open_right) / num_close_right
            > NON_CANONICAL_PREC_FACTOR):
        return True, None

    # WARNING: if the absolute free energy is less than 10, output[2] does
    # not contain a proper value and the pattern finds nothing. Skipping is
    # fine because the energy threshold is at least 18.
    abs_energy = re.findall(r'\d*\.\d*', str(output[2]))
    if not abs_energy:
        return False, None
    energy = float(abs_energy[0])
    if not energy >= MIN_ABS_MFE:
        return False, None
    return False, (output[0].strip(), structure, start, end, energy,
                   energy / len(rna_fold_seq))


def precursor_generator(lines):
    """Select the best-folding precursor candidate for each mapped read line.

    Lines that do not split into exactly 7 whitespace-separated fields are
    skipped. The fields are used as follows: [1] read count, [2] reference
    name (looked up in ref_name_list), [3] and [4] integer positions on the
    reference, [5] strand ("+" or "-"), [6] read sequence.

    For every well-formed line the read count is added to the running total
    and to the length distribution, keyed by sequence length followed by the
    first base. Lines with any other strand value are then skipped. The
    remaining lines go through the cutting-site screen and, if they pass,
    through RNAfold on a series of windows around the read; the window with
    the highest length-normalised absolute energy is kept.

    Returns:
        (info_list, db_list, reads_total, length_distribution_counter):
        ``info_list`` holds the input line plus tab-separated absolute energy,
        normalised energy, window start and window end; ``db_list`` holds the
        matching "sequence\\nstructure\\n" strings; ``reads_total`` is the sum
        of read counts; the last item is a Counter built from the length
        distribution so partial results can be merged later.
    """
    output_precursor_infolist = []
    output_precursor_dblist = []
    reads_total_partial = 0
    length_distribution_partial = count_list({})

    for line in lines:
        line_split = line.split()
        # Rare occasion of improper line data: skip it
        if len(line_split) != 7:
            continue

        # accumulate raw RNA-seq read counts for calculation of RPM
        read_count = int(line_split[1])
        reads_total_partial += read_count

        # accumulate length distribution information, keyed by the
        # sequence length followed by the 5' base
        read_seq = line_split[6]
        dict_key = str(len(read_seq)) + str(read_seq[0])
        length_distribution_partial[dict_key] += read_count

        # Screen for Drosha / Dicer cutting sites (inspired by miREAP)
        name_list_index = ref_name_list.index(line_split[2])
        map_start = int(line_split[3])
        map_end = int(line_split[4])
        strand = line_split[5]
        if strand == "+":
            site_counts = ref_count_list_pos[name_list_index]
            site = map_start
        elif strand == "-":
            site_counts = ref_count_list_neg[name_list_index]
            site = map_end
        else:
            # unknown strand: there is no sequence to fold for it
            continue
        ref_seq = ref_seq_list[name_list_index]
        if not _passes_cut_site_screen(site_counts, site, len(ref_seq)):
            continue

        # Find the fold with the highest normalised absolute MFE.
        # Only the reference the read was mapped to is searched.
        best = None
        read_span = map_end - map_start
        # 150907: the arm extension loop is effectively disabled (a single
        # value); miREAP only uses a constant flank as well.
        for i in range(ARM_EXTEND_THRESHOLD, ARM_EXTEND_THRESHOLD + 1):
            for j in range(read_span, DISTANCE_THRESHOLD + read_span,
                           RNAFOLD_STEP):  # distance loop
                windows = (
                    (map_start - i, map_end + j + i),  # assuming -5p mature
                    (map_start - j - i, map_end + i),  # assuming -3p mature
                )
                for start, end in windows:
                    # fold only if both indices are valid
                    if start < 0 or end >= len(ref_seq):
                        continue
                    skip_rest, candidate = _fold_window(ref_seq, start, end,
                                                        strand)
                    if skip_rest:
                        # NOTE(review): a rejected -5p window also skips the
                        # -3p window of the same distance step. This is kept
                        # as the existing behaviour; confirm it is intended.
                        break
                    if candidate is None:
                        continue
                    # the first qualifying fold is always kept; later ones
                    # must be strictly better
                    if best is None or candidate[5] > best[5]:
                        best = candidate

        if best is not None:
            (pc_seq, pc_structure, pc_start, pc_end, pc_abs_energy,
             pc_norm_abs_energy) = best
            output_precursor_infolist.append(
                "\t".join([line.strip(), str(pc_abs_energy),
                           str(pc_norm_abs_energy), str(pc_start),
                           str(pc_end)]) + "\n")
            output_precursor_dblist.append(pc_seq + "\n" + pc_structure + "\n")

    # create a Counter (subclass of dict) so partial length distributions
    # can be merged later
    length_distribution_counter = Counter(length_distribution_partial)
    return (output_precursor_infolist, output_precursor_dblist,
            reads_total_partial, length_distribution_counter)
Ejemplo n.º 4
0
    else:
        smrna_file_path = os.path.join(os.getcwd(), "smrna.fa")
    print("Mapping smrna-seq to reference genome with bowtie...")
    bowtie = subprocess.Popen([bowtie_path, str(ref_file.name),
                               "-f", smrna_file_path,
                               os.path.join(path, "map_bowtie"),
                               "-v", "0", "-m", str(MAX_MULTIPLE_LOCI), "-a", "-t", "-p", str(NUM_THREADS)],
                              stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    bowtie.wait()

    print("Converting bowtie map format to correct map format...")
    # open bowtie-generated map file (read-only, no need to be changed)
    output_bowtie = open(os.path.join(path, "map_bowtie"), "r")

    # convert bowtie map file format to correct form
    SeqModule.convert_bowtie_output(output_bowtie, output_map)
    output_map.seek(0, 0)

    print("Generating count data using map file...")
    # generate count data using map file
    ref_count_dump_pos, ref_count_dump_neg = SeqModule.count_generator(ref_name_list, output_map)
    output_map.seek(0, 0)

    # dump count data file for future usage and skip mapping
    cPickle.dump(ref_count_dump_pos, output_count_pos, -1)
    cPickle.dump(ref_count_dump_neg, output_count_neg, -1)
    output_count_pos.seek(0, 0)
    output_count_neg.seek(0, 0)
    end = time.time()
    print("Elapsed time for mapping : " + str(end - start) + " seconds")
    print("Mapping done")
Ejemplo n.º 5
0
def _passes_cut_site_screen(site_counts, position, ref_length):
    """Return True if ``position`` passes the Drosha/Dicer cutting-site screen.

    ``site_counts`` is indexed by reference position. The site fails when
    its own count is below 3, when a neighbour up to 19 positions away has a
    larger count, or when either DOMINANT_FACTOR ratio check fails. An offset
    is ignored when its left or right neighbour falls outside
    ``[0, ref_length)``.
    """
    count = site_counts[position]
    if count < 3:
        return False
    count_region = count  # site plus all in-range neighbours (offsets 1..19)
    count_sites = count   # site plus its closest neighbours (offsets 1..2)
    for offset in range(1, 20):
        left = position - offset
        right = position + offset
        if left < 0 or right >= ref_length:
            continue
        if site_counts[left] > count or site_counts[right] > count:
            return False
        count_region += site_counts[left]
        count_region += site_counts[right]
        if offset < 3:
            count_sites += site_counts[left]
            count_sites += site_counts[right]
    return not (float(count_sites) / count_region < DOMINANT_FACTOR
                or float(count) / count_sites < DOMINANT_FACTOR / 2.0)


def _fold_window(ref_seq, start, end, strand):
    """Fold ``ref_seq[start:end]`` with RNAfold and evaluate the result.

    For strand "-" the window is first passed through SeqModule.create_star.

    Returns a pair ``(skip_rest, candidate)``:
      * ``skip_rest`` is True when the window contains "N" or its structure
        is non-canonical; the caller then abandons the current distance step.
      * ``candidate`` is ``(seq, structure, start, end, abs_energy,
        norm_abs_energy)`` when the parsed energy reaches MIN_ABS_MFE,
        otherwise None.
    """
    if strand == "+":
        rna_fold_seq = ref_seq[start:end]
    else:
        rna_fold_seq = SeqModule.create_star(ref_seq[start:end])
    if "N" in rna_fold_seq:
        return True, None

    # universal_newlines=True makes the pipes text-mode, so a str sequence
    # can be sent and str output parsed on both Python 2 and Python 3.
    rnafold = subprocess.Popen(
        [RNAfold_path, "--noconv", "-d2", "--noPS", "--noLP"],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE,
        universal_newlines=True)
    output = rnafold.communicate(rna_fold_seq)[0].split()

    # Discard non-canonical (i.e. "hard to identify") precursors.
    # Floor division keeps the slice index an int on Python 2 and 3.
    structure = output[1].strip("\n")
    half = len(structure) // 2
    structure_left = structure[:half]
    structure_right = structure[half:]
    num_open_left = structure_left.count("(")
    num_close_left = structure_left.count(")")
    num_open_right = structure_right.count("(")
    num_close_right = structure_right.count(")")
    if num_open_left == 0 or num_close_right == 0:
        return True, None
    if (float(num_close_left) / num_open_left > NON_CANONICAL_PREC_FACTOR
            or float(num_open_right) / num_close_right
            > NON_CANONICAL_PREC_FACTOR):
        return True, None

    # WARNING: if the absolute free energy is less than 10, output[2] does
    # not contain a proper value and the pattern finds nothing. Skipping is
    # fine because the energy threshold is at least 18.
    abs_energy = re.findall(r'\d*\.\d*', str(output[2]))
    if not abs_energy:
        return False, None
    energy = float(abs_energy[0])
    if not energy >= MIN_ABS_MFE:
        return False, None
    return False, (output[0].strip(), structure, start, end, energy,
                   energy / len(rna_fold_seq))


def precursor_generator(lines):
    """Select the best-folding precursor candidate for each mapped read line.

    Lines that do not split into exactly 7 whitespace-separated fields are
    skipped. The fields are used as follows: [1] read count, [2] reference
    name (looked up in ref_name_list), [3] and [4] integer positions on the
    reference, [5] strand ("+" or "-"), [6] read sequence.

    For every well-formed line the read count is added to the running total
    and to the length distribution, keyed by sequence length followed by the
    first base. Lines with any other strand value are then skipped. The
    remaining lines go through the cutting-site screen and, if they pass,
    through RNAfold on a series of windows around the read; the window with
    the highest length-normalised absolute energy is kept.

    Returns:
        (info_list, db_list, reads_total, length_distribution_counter):
        ``info_list`` holds the input line plus tab-separated absolute energy,
        normalised energy, window start and window end; ``db_list`` holds the
        matching "sequence\\nstructure\\n" strings; ``reads_total`` is the sum
        of read counts; the last item is a Counter built from the length
        distribution so partial results can be merged later.
    """
    output_precursor_infolist = []
    output_precursor_dblist = []
    reads_total_partial = 0
    length_distribution_partial = count_list({})

    for line in lines:
        line_split = line.split()
        # Rare occasion of improper line data: skip it
        if len(line_split) != 7:
            continue

        # accumulate raw RNA-seq read counts for calculation of RPM
        read_count = int(line_split[1])
        reads_total_partial += read_count

        # accumulate length distribution information, keyed by the
        # sequence length followed by the 5' base
        read_seq = line_split[6]
        dict_key = str(len(read_seq)) + str(read_seq[0])
        length_distribution_partial[dict_key] += read_count

        # Screen for Drosha / Dicer cutting sites (inspired by miREAP)
        name_list_index = ref_name_list.index(line_split[2])
        map_start = int(line_split[3])
        map_end = int(line_split[4])
        strand = line_split[5]
        if strand == "+":
            site_counts = ref_count_list_pos[name_list_index]
            site = map_start
        elif strand == "-":
            site_counts = ref_count_list_neg[name_list_index]
            site = map_end
        else:
            # unknown strand: there is no sequence to fold for it
            continue
        ref_seq = ref_seq_list[name_list_index]
        if not _passes_cut_site_screen(site_counts, site, len(ref_seq)):
            continue

        # Find the fold with the highest normalised absolute MFE.
        # Only the reference the read was mapped to is searched.
        best = None
        read_span = map_end - map_start
        # 150907: the arm extension loop is effectively disabled (a single
        # value); miREAP only uses a constant flank as well.
        for i in range(ARM_EXTEND_THRESHOLD, ARM_EXTEND_THRESHOLD + 1):
            for j in range(read_span, DISTANCE_THRESHOLD + read_span,
                           RNAFOLD_STEP):  # distance loop
                windows = (
                    (map_start - i, map_end + j + i),  # assuming -5p mature
                    (map_start - j - i, map_end + i),  # assuming -3p mature
                )
                for start, end in windows:
                    # fold only if both indices are valid
                    if start < 0 or end >= len(ref_seq):
                        continue
                    skip_rest, candidate = _fold_window(ref_seq, start, end,
                                                        strand)
                    if skip_rest:
                        # NOTE(review): a rejected -5p window also skips the
                        # -3p window of the same distance step. This is kept
                        # as the existing behaviour; confirm it is intended.
                        break
                    if candidate is None:
                        continue
                    # the first qualifying fold is always kept; later ones
                    # must be strictly better
                    if best is None or candidate[5] > best[5]:
                        best = candidate

        if best is not None:
            (pc_seq, pc_structure, pc_start, pc_end, pc_abs_energy,
             pc_norm_abs_energy) = best
            output_precursor_infolist.append(
                "\t".join([line.strip(), str(pc_abs_energy),
                           str(pc_norm_abs_energy), str(pc_start),
                           str(pc_end)]) + "\n")
            output_precursor_dblist.append(pc_seq + "\n" + pc_structure + "\n")

    # create a Counter (subclass of dict) so partial length distributions
    # can be merged later
    length_distribution_counter = Counter(length_distribution_partial)
    return (output_precursor_infolist, output_precursor_dblist,
            reads_total_partial, length_distribution_counter)
Ejemplo n.º 6
0
        bowtie_path,
        str(ref_file.name), "-f", smrna_file_path,
        os.path.join(path, "map_bowtie"), "-v", "0", "-m",
        str(MAX_MULTIPLE_LOCI), "-a", "-t", "-p",
        str(NUM_THREADS), '--large-index'
    ],
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE)
    bowtie.wait()

    print("Converting bowtie map format to correct map format...")
    # open bowtie-generated map file (read-only, no need to be changed)
    output_bowtie = open(os.path.join(path, "map_bowtie"), "r")

    # convert bowtie map file format to correct form
    SeqModule.convert_bowtie_output(output_bowtie, output_map)
    output_map.seek(0, 0)

    print("Generating count data using map file...")
    # generate count data using map file
    ref_count_dump_pos, ref_count_dump_neg = SeqModule.count_generator(
        ref_name_list, output_map)
    output_map.seek(0, 0)

    # dump count data file for future usage and skip mapping
    cPickle.dump(ref_count_dump_pos, output_count_pos, -1)
    cPickle.dump(ref_count_dump_neg, output_count_neg, -1)
    output_count_pos.seek(0, 0)
    output_count_neg.seek(0, 0)
    end = time.time()
    print("Elapsed time for mapping : " + str(end - start) + " seconds")