def mature_generator(lines):
    """Build output entries for the putative precursors that pass all filters.

    ``lines`` is consumed as consecutive 3-line records:

    1. an info line,
    2. a sequence line,
    3. a dot-bracket structure line.

    Every line is stripped before use.  Processing stops at the first record
    whose info line is blank, or as soon as fewer than three lines remain
    (an incomplete trailing record is ignored rather than raising
    ``IndexError``).

    A record is skipped when any of these holds:

    * ``DISCARD_NO_READ_PREC_FLAG`` is truthy and
      ``SeqModule.check_no_read_prec`` returns ``True`` for it;
    * in the left half of the structure line, the ratio of ``)`` to ``(``
      exceeds ``NON_CANONICAL_PREC_FACTOR`` (a left half that has ``)`` but
      no ``(`` counts as exceeding it);
    * ``SeqModule.star_identifier_v2`` returns ``0`` for all four positions.

    When ``ANNOTATE_FLAG`` is ``'true'`` or ``'True'``, the info line is
    replaced by the one returned from ``SeqModule.check_conserved_seq``
    before the structure checks run.

    The function reads module-level configuration (``map_data``, the
    thresholds, ``blastn_path``, ``mirbase_path``) and does not modify it.

    Args:
        lines: Indexable sequence of strings laid out as described above.

    Returns:
        list: One ``SeqModule.generate_output_form`` result per kept record,
        in input order.
    """
    output_list = []
    total = len(lines)
    pos = 0

    # Only start a record when all three of its lines are present.
    while pos + 3 <= total:
        line_info = lines[pos].strip()
        if line_info == "":
            # A blank info line marks the end of the usable input.
            break
        line_seq = lines[pos + 1].strip()
        line_db = lines[pos + 2].strip()
        pos += 3

        # If no read data is matched in the putative precursor, discard it.
        if DISCARD_NO_READ_PREC_FLAG:
            no_read_prec_flag = SeqModule.check_no_read_prec(
                line_info, map_data, MIN_READ_COUNT_THRESHOLD)
            if no_read_prec_flag is True:
                continue

        # Check for a conserved sequence with blastn; when the record is
        # classified as conserved, the helper hands back an updated info line.
        # NOTE(review): the second return value (an "updated" flag) is
        # currently unused - the structure checks below always run.
        if ANNOTATE_FLAG in ('true', 'True'):
            line_info, _updated_flag = SeqModule.check_conserved_seq(
                line_info, line_seq, blastn_path, mirbase_path,
                ARM_EXTEND_THRESHOLD)

        # Discard non-canonical ("hard to identify") precursors: an
        # asymmetric dot-bracket structure gives low accuracy, makes the star
        # sequence hard to identify and produces too many outputs.  A large
        # share of ")" in the left half marks such a structure.
        # Floor division keeps the slice bound an int on Python 3 as well.
        line_db_left = line_db[:len(line_db) // 2]
        num_open = line_db_left.count("(")
        num_close = line_db_left.count(")")
        if num_open == 0:
            # The ratio is undefined here; avoid dividing by zero.  Closing
            # brackets with no opening ones are treated as non-canonical,
            # while a left half with no brackets at all is left for the star
            # identifier to judge.
            if num_close > 0:
                continue
        elif float(num_close) / num_open > NON_CANONICAL_PREC_FACTOR:
            continue

        # Find a valid star sequence in the putative precursor.
        start_5p, end_5p, start_3p, end_3p = SeqModule.star_identifier_v2(
            line_db, MATURE_MIN_LEN, MATURE_MAX_LEN,
            MAX_SERIAL_MISMATCH, MAX_MULT_MISMATCH,
            MAX_SERIAL_BULGE, MAX_MULT_BULGE)
        if (start_5p, end_5p, start_3p, end_3p) == (0, 0, 0, 0):
            # All-zero positions mean no star sequence was found.
            continue

        # Argument order (both starts, then both ends) is kept exactly as the
        # helper has always been called here.
        output_form = SeqModule.generate_output_form(
            line_info, line_seq, line_db,
            start_5p, start_3p, end_5p, end_3p,
            map_data, MIN_READ_COUNT_THRESHOLD)
        output_list.append(output_form)

    return output_list
def mature_generator(lines):
    """Build output entries for the putative precursors that pass all filters.

    ``lines`` is consumed as consecutive 3-line records:

    1. an info line,
    2. a sequence line,
    3. a dot-bracket structure line.

    Every line is stripped before use.  Processing stops at the first record
    whose info line is blank, or as soon as fewer than three lines remain
    (an incomplete trailing record is ignored rather than raising
    ``IndexError``).

    A record is skipped when any of these holds:

    * ``DISCARD_NO_READ_PREC_FLAG`` is truthy and
      ``SeqModule.check_no_read_prec`` returns ``True`` for it;
    * in the left half of the structure line, the ratio of ``)`` to ``(``
      exceeds ``NON_CANONICAL_PREC_FACTOR`` (a left half that has ``)`` but
      no ``(`` counts as exceeding it);
    * ``SeqModule.star_identifier_v2`` returns ``0`` for all four positions.

    When ``ANNOTATE_FLAG`` is ``'true'`` or ``'True'``, the info line is
    replaced by the one returned from ``SeqModule.check_conserved_seq``
    before the structure checks run.

    The function reads module-level configuration (``map_data``, the
    thresholds, ``blastn_path``, ``mirbase_path``) and does not modify it.

    Args:
        lines: Indexable sequence of strings laid out as described above.

    Returns:
        list: One ``SeqModule.generate_output_form`` result per kept record,
        in input order.
    """
    output_list = []
    total = len(lines)
    pos = 0

    # Only start a record when all three of its lines are present.
    while pos + 3 <= total:
        line_info = lines[pos].strip()
        if line_info == "":
            # A blank info line marks the end of the usable input.
            break
        line_seq = lines[pos + 1].strip()
        line_db = lines[pos + 2].strip()
        pos += 3

        # If no read data is matched in the putative precursor, discard it.
        if DISCARD_NO_READ_PREC_FLAG:
            no_read_prec_flag = SeqModule.check_no_read_prec(
                line_info, map_data, MIN_READ_COUNT_THRESHOLD)
            if no_read_prec_flag is True:
                continue

        # Check for a conserved sequence with blastn; when the record is
        # classified as conserved, the helper hands back an updated info line.
        # NOTE(review): the second return value (an "updated" flag) is
        # currently unused - the structure checks below always run.
        if ANNOTATE_FLAG in ('true', 'True'):
            line_info, _updated_flag = SeqModule.check_conserved_seq(
                line_info, line_seq, blastn_path, mirbase_path,
                ARM_EXTEND_THRESHOLD)

        # Discard non-canonical ("hard to identify") precursors: an
        # asymmetric dot-bracket structure gives low accuracy, makes the star
        # sequence hard to identify and produces too many outputs.  A large
        # share of ")" in the left half marks such a structure.
        # Floor division keeps the slice bound an int on Python 3 as well.
        line_db_left = line_db[:len(line_db) // 2]
        num_open = line_db_left.count("(")
        num_close = line_db_left.count(")")
        if num_open == 0:
            # The ratio is undefined here; avoid dividing by zero.  Closing
            # brackets with no opening ones are treated as non-canonical,
            # while a left half with no brackets at all is left for the star
            # identifier to judge.
            if num_close > 0:
                continue
        elif float(num_close) / num_open > NON_CANONICAL_PREC_FACTOR:
            continue

        # Find a valid star sequence in the putative precursor.
        start_5p, end_5p, start_3p, end_3p = SeqModule.star_identifier_v2(
            line_db, MATURE_MIN_LEN, MATURE_MAX_LEN,
            MAX_SERIAL_MISMATCH, MAX_MULT_MISMATCH,
            MAX_SERIAL_BULGE, MAX_MULT_BULGE)
        if (start_5p, end_5p, start_3p, end_3p) == (0, 0, 0, 0):
            # All-zero positions mean no star sequence was found.
            continue

        # Argument order (both starts, then both ends) is kept exactly as the
        # helper has always been called here.
        output_form = SeqModule.generate_output_form(
            line_info, line_seq, line_db,
            start_5p, start_3p, end_5p, end_3p,
            map_data, MIN_READ_COUNT_THRESHOLD)
        output_list.append(output_form)

    return output_list