def get_num_turns_in_file(path, bucket_size, max_turns):
    """Count complete (x, y) turn pairs in a conversation file.

    A "turn" is one speaker's contiguous run of lines. The first speaker
    seen is the x side; every time the dialogue switches back to them, the
    preceding x/y exchange is counted as two turns -- but only when both
    halves fit within bucket_size words.

    Args:
        path: conversation file; each line is parsed by split_line()
            (presumably into (text, speaker) -- confirm against its definition).
        bucket_size: maximum words per turn for a pair to be counted.
        max_turns: threshold above which a diagnostic is printed.

    Returns:
        Number of counted turns (always even).
    """
    user1_first_line = True
    num_turns = 0
    x_len = 0  # words accumulated in the current first-speaker (x) turn
    y_len = 0  # words accumulated in the current second-speaker (y) turn
    with open(path) as fileobject:
        for line in fileobject:
            text, current_user = split_line(line)
            num_words = len(text.split())
            if text == "":
                continue
            if user1_first_line:
                # First non-empty line fixes who "user 1" (the x side) is.
                init_user, previous_user = current_user, current_user
                user1_first_line = False
                num_turns = 0
            if current_user == previous_user:
                # Same speaker keeps talking: extend the current turn.
                if current_user == init_user:
                    x_len += num_words
                else:
                    y_len += num_words
            else:
                # Speaker switched. If a full x -> y exchange just finished
                # and both halves fit in the bucket, count the pair.
                if y_len != 0 and x_len <= bucket_size and y_len <= bucket_size:
                    num_turns += 2
                # Reset lengths once a y turn closed, i.e. a new pair starts.
                if y_len != 0:
                    x_len = 0
                    y_len = 0
                if current_user == init_user:
                    x_len = num_words
                else:
                    y_len = num_words
                previous_user = current_user
    # NOTE(review): the final pair at end-of-file is never counted here --
    # presumably intentional (the other preprocessors flush it); confirm.
    if num_turns > max_turns:
        # BUGFIX: original message read "Path <path>exceeds N turns" with no
        # spacing around the path.
        print("Path " + path + " exceeds " + str(max_turns) + " turns")
    return num_turns
def preprocess_training_file(path, x_train_path, y_train_path):
    """Split a conversation file into aligned x/y training samples.

    The first speaker seen becomes the "init" (x) user; each of their
    finished turns goes to x_train (prefixed with a context string built
    from the other speaker's reply) and each reply goes to y_train.
    Samples are appended line-by-line to x_train_path / y_train_path, but
    only when the two lists line up 1:1.

    NOTE(review): a later definition in this file reuses this exact name
    and shadows this one at import time -- confirm which variant is meant.
    """
    go_token = ""
    eos_token = " . "   # appended when an utterance lacks terminal punctuation
    eot_token = ""      # end-of-turn marker (currently empty)
    ending_symbols = tuple(["!", "?", ".", "_EMJ", "_EMJ "])
    user1_first_line = True
    x_train = []
    y_train = []
    context = ""        # context attached to the NEXT x sample
    new_context = ""    # context being built from the current reply turn
    sentence_holder = ""
    with open(path) as fileobject:
        for line in fileobject:
            # split_line() presumably yields (utterance_text, speaker_id) -- confirm.
            text, user = split_line(line)
            if text == "":
                continue
            current_user = user
            if user1_first_line:
                # First non-empty line fixes who the init (x-side) user is.
                init_user, previous_user = current_user, current_user
                user1_first_line = False
                sentence_holder = go_token
            # CONTEXT OPTION 1: context is just the final sentence in the turn
            if current_user != init_user:
                if text.endswith(ending_symbols):
                    context = new_context
                    new_context = text + " "
                else:
                    context = new_context
                    new_context = text + eos_token
            if current_user == previous_user:
                # The user is still talking: keep accumulating the turn.
                if text.endswith(ending_symbols):
                    sentence_holder += text + " "
                else:
                    sentence_holder += text + eos_token
            else:
                # A new user responds: archive the finished turn.
                sentence_holder += eot_token + "\n"
                if current_user == init_user:
                    # Init user talks (should add previous sentence to y_train)
                    y_train.append(sentence_holder)
                else:
                    x_train.append(context + " " + sentence_holder)
                if text.endswith(ending_symbols):
                    sentence_holder = go_token + text + " "
                else:
                    sentence_holder = go_token + text + eos_token
                previous_user = current_user
    # NOTE(review): current_user/init_user are unbound here if the file had
    # no non-empty lines -- an empty file would raise NameError.
    if current_user != init_user:
        # Conversation ended on the second speaker: flush the last y turn.
        y_train.append(sentence_holder + eot_token + "\n")
    with open(x_train_path, 'a') as xTrainObject, open(y_train_path, 'a') as yTrainObject:
        y_len = len(y_train)
        if (y_len == len(x_train)):
            for i in range(y_len):
                xTrainObject.write(x_train[i].strip() + "\n")
                yTrainObject.write(y_train[i].strip() + "\n")
        else:
            # Mismatched counts: nothing is written; dump sizes for debugging.
            print("-----")
            print("len x:")
            print(len(x_train))
            print("len y:")
            print(y_len)
            print("-----")
def preprocess_on_stateful(path, bucket_size, file_name_number, misspellings_dictionary, dest_path, fasttext_dictionary):
    """Preprocess one conversation file for stateful training and save x/y files.

    Each line is normalised (regex token replacement, misspelling fixes,
    unknown-word replacement via the fasttext dictionary), then grouped into
    alternating speaker turns.  If every turn fits within bucket_size words
    and both sides produced at least one turn, the samples are saved to
    "<dest_path><file_name_number>_x.txt" / "..._y.txt".

    Returns:
        True if the files were saved, False if the conversation was skipped.
    """
    go_token = ""
    eos_token = " . "   # appended when an utterance lacks terminal punctuation
    eot_token = ""      # end-of-turn marker (currently empty)
    ending_symbols = tuple(["!", "?", ".", "_EMJ", "_EMJ "])
    exceeds_bucket_size = False
    user1_first_line = True
    x_train = []
    y_train = []
    sentence_holder = ""
    with open(path) as fileobject:
        for line in fileobject:
            # split_line()/do_regex_on_line()/do_misspellings_on_line() are
            # project helpers -- presumably (text, speaker) parsing plus token
            # normalisation; confirm against their definitions.
            text, user = split_line(line)
            text = do_regex_on_line(text, tokens['url'][0], " " + tokens['emoji'][0], tokens['directory'][0])
            text = do_misspellings_on_line(text, misspellings_dictionary)
            line_words = text.split()
            # Words in this line PLUS the partial turn accumulated so far.
            num_words = len(line_words) + len(sentence_holder.split())
            # If a sentence is more than 30 (biggest bucket) words, the entire conversation should be skipped
            if num_words > bucket_size:
                exceeds_bucket_size = True
                break
            # Empty sentences should be skipped
            if text == "":
                continue
            else:
                current_user = user
                # Replace unknown words
                sentence = ""
                for word in line_words:
                    word = replace_word_helper(word, fasttext_dictionary)
                    sentence += word + " "
                text = sentence[:-1]
                if user1_first_line:
                    # First non-empty line fixes who the init (x-side) user is.
                    init_user, previous_user = current_user, current_user
                    user1_first_line = False
                    sentence_holder = go_token
                if current_user == previous_user:
                    # The user is still talking
                    if text.endswith(ending_symbols):
                        sentence_holder += text + " "
                    else:
                        sentence_holder += text + eos_token
                else:
                    # A new user responds
                    sentence_holder += eot_token + "\n"
                    if current_user == init_user:
                        # Init user talks (should add previous sentence to y_train)
                        y_train.append(sentence_holder)
                    else:
                        x_train.append(sentence_holder)
                    if text.endswith(ending_symbols):
                        sentence_holder = go_token + text + " "
                    else:
                        sentence_holder = go_token + text + eos_token
                    previous_user = current_user
    # Only save conversations where no turn overflowed and both sides spoke.
    # (The non-empty y_train guard also ensures current_user is bound here.)
    if not exceeds_bucket_size and y_train != [] and x_train != []:
        if current_user != init_user:
            # Conversation ended on the second speaker: flush the last y turn.
            y_train.append(sentence_holder + eot_token + "\n")
        save_to_file(dest_path + str(file_name_number) + "_x.txt", x_train)
        save_to_file(dest_path + str(file_name_number) +
                     "_y.txt", y_train)
        return True
    return False
def non_turns_exceed_max_turns_in_conv(file_path, fit_1, fit_2, fit_3, fit_4, fit_5, fit_6):
    """Count conversation turns and report which size buckets every turn fits.

    Splits the file into alternating speaker turns (first speaker -> x_train,
    second speaker -> y_train) and, for each of the six candidate bucket
    sizes, reports whether EVERY completed turn fits within that many words.

    Args:
        file_path: conversation file; lines parsed by split_line()
            (presumably into (text, speaker) -- confirm).
        fit_1..fit_6: candidate bucket sizes in words.

    Returns:
        (num_turns, fit_1_bool, ..., fit_6_bool); on an x/y count mismatch,
        (0, False, False, False, False, False, False).
    """
    go_token = ""
    eos_token = " . "   # appended when an utterance lacks terminal punctuation
    eot_token = ""      # end-of-turn marker (currently empty)
    ending_symbols = tuple(["!", "?", ".", "_EMJ", "_EMJ "])
    user1_first_line = True
    x_train = []
    y_train = []
    # Longest completed turn, in words. The six fit_i booleans are derived
    # from this single maximum at return time, replacing the original's
    # three copy-pasted blocks of six threshold checks each.
    max_turn_words = 0
    sentence_holder = ""
    with open(file_path) as fileobject:
        for line in fileobject:
            text, user = split_line(line)
            if text == "":
                continue
            current_user = user
            if user1_first_line:
                # First non-empty line fixes who the init (x-side) user is.
                init_user, previous_user = current_user, current_user
                user1_first_line = False
                sentence_holder = ""
            if current_user == previous_user:
                # The user is still talking: keep accumulating the turn.
                if text.endswith(ending_symbols):
                    sentence_holder += text + " "
                else:
                    sentence_holder += text + eos_token
            else:
                # A new user responds: archive and measure the finished turn.
                sentence_holder += eot_token + "\n"
                if current_user == init_user:
                    # Init user talks again -> previous turn belongs to y_train.
                    y_train.append(sentence_holder)
                else:
                    x_train.append(sentence_holder)
                max_turn_words = max(max_turn_words, len(sentence_holder.split()))
                if text.endswith(ending_symbols):
                    sentence_holder = go_token + text + " "
                else:
                    sentence_holder = go_token + text + eos_token
                previous_user = current_user
    # BUGFIX: guard against an empty/blank-only file, where current_user and
    # init_user would otherwise be unbound (NameError).
    if not user1_first_line and current_user != init_user:
        # Conversation ended on the second speaker: flush the last y turn.
        y_train.append(sentence_holder + eot_token + "\n")
        max_turn_words = max(max_turn_words, len(sentence_holder.split()))
    y_len = len(y_train)
    num_turns = len(x_train) + y_len
    if y_len == len(x_train):
        fits = tuple(max_turn_words <= fit
                     for fit in (fit_1, fit_2, fit_3, fit_4, fit_5, fit_6))
        return (num_turns,) + fits
    print("different y and x len")
    return 0, False, False, False, False, False, False
def preprocess_training_file(path, x_train_path, y_train_path):
    """Convert a conversation file into aligned x/y training files.

    Splits the dialogue into alternating speaker turns: the first speaker's
    turns become x samples (prefixed with the previous reply as context when
    that reply is at most 30 words) and the second speaker's turns become y
    samples.  Pairs whose x exceeds 60 words or whose y exceeds 30 words are
    dropped.  Samples are appended to x_train_path / y_train_path, one per
    line, but only when the two lists line up 1:1.

    NOTE(review): this redefines preprocess_training_file and shadows the
    earlier definition in this file -- presumably the intended variant.
    """
    go_token = ""
    eos_token = " . "   # appended when an utterance lacks terminal punctuation
    eot_token = ""      # end-of-turn marker (currently empty)
    ending_symbols = tuple(["!", "?", ".", "_EMJ", "_EMJ "])
    user1_first_line = True
    x_train = []
    y_train = []
    context = ""        # previous reply turn, prepended to the next x sample
    sentence_holder = ""
    with open(path) as fileobject:
        for line in fileobject:
            # split_line() presumably yields (utterance_text, speaker_id) -- confirm.
            text, user = split_line(line)
            if text == "":
                continue
            current_user = user
            if user1_first_line:
                # First non-empty line fixes who the init (x-side) user is.
                init_user, previous_user = current_user, current_user
                user1_first_line = False
                sentence_holder = go_token
            if current_user == previous_user:
                # The user is still talking: keep accumulating the turn.
                if text.endswith(ending_symbols):
                    sentence_holder += text + " "
                else:
                    sentence_holder += text + eos_token
            else:
                # A new user responds: archive the finished turn.
                if current_user == init_user:
                    # Finished turn was the reply -> y_train; reuse it as
                    # context for the next x sample unless it is too long.
                    if len(sentence_holder.split()) <= 30:
                        context = sentence_holder
                    else:
                        context = ""
                    sentence_holder += eot_token + "\n"
                    y_train.append(sentence_holder)
                else:
                    sentence_holder += eot_token + "\n"
                    x_train.append(context + " " + sentence_holder)
                if text.endswith(ending_symbols):
                    sentence_holder = go_token + text + " "
                else:
                    sentence_holder = go_token + text + eos_token
                # Drop a freshly completed pair that exceeds the length limits.
                len_y = len(y_train)
                if len_y > 0 and len(x_train) == len_y:
                    if len(x_train[len_y - 1].split()) > 60 or len(
                            y_train[len_y - 1].split()) > 30:
                        del x_train[-1]
                        del y_train[-1]
                previous_user = current_user
    # BUGFIX: guard against an empty/blank-only file, where current_user and
    # init_user would otherwise be unbound (NameError).
    if not user1_first_line and current_user != init_user:
        # Conversation ended on the second speaker: flush the last y turn.
        y_train.append(sentence_holder + eot_token + "\n")
    with open(x_train_path, 'a') as xTrainObject, open(y_train_path, 'a') as yTrainObject:
        y_len = len(y_train)
        if (y_len == len(x_train)):
            for i in range(y_len):
                xTrainObject.write(x_train[i].strip() + "\n")
                yTrainObject.write(y_train[i].strip() + "\n")
        else:
            # Mismatched counts: nothing is written; dump sizes for debugging.
            print("-----")
            print("len x:")
            print(len(x_train))
            print("len y:")
            print(y_len)
            print("-----")