Exemple #1
0
def get_num_turns_in_file(path, bucket_size, max_turns):
    """Count the turns in ``path`` that belong to exchanges fitting the bucket.

    Every line is handed to ``split_line``, which yields the line's text and
    the user who wrote it. The first user seen is the "init" user. Words are
    accumulated per side: ``x_len`` for the init user, ``y_len`` for anyone
    else. When the speaker changes after the responding side has spoken, the
    finished exchange adds 2 to the count, provided both sides hold at most
    ``bucket_size`` words.

    NOTE(review): an exchange is only evaluated when a later speaker change is
    seen, so the final exchange of a file is never counted. Confirm whether
    callers rely on that before changing it.

    Args:
        path: Conversation file to read.
        bucket_size: Maximum number of words allowed on each side of an
            exchange for it to be counted.
        max_turns: If the count exceeds this value a message is printed; the
            count is returned either way.

    Returns:
        The number of counted turns (always a multiple of 2).
    """
    is_first_line = True
    num_turns = 0

    x_len = 0  # words spoken by the init user in the current exchange
    y_len = 0  # words spoken by the responder in the current exchange
    with open(path) as fileobject:
        for line in fileobject:
            text, current_user = split_line(line)
            if text == "":
                continue
            num_words = len(text.split())

            if is_first_line:
                init_user, previous_user = current_user, current_user
                is_first_line = False

            if current_user == previous_user:
                # The user is still talking.
                if current_user == init_user:
                    x_len += num_words
                else:
                    y_len += num_words
            else:
                # A new user responds. A non-zero y_len means the responder
                # has already spoken, i.e. the previous exchange is complete.
                if y_len != 0:
                    if x_len <= bucket_size and y_len <= bucket_size:
                        num_turns += 2
                    x_len = 0
                    y_len = 0
                if current_user == init_user:
                    x_len = num_words
                else:
                    y_len = num_words
            previous_user = current_user

    if num_turns > max_turns:
        print("Path " + path + " exceeds " + str(max_turns) + " turns")
    return num_turns
Exemple #2
0
def preprocess_training_file(path, x_train_path, y_train_path):
    """Append the x/y training pairs found in one conversation file.

    Every line is handed to ``split_line``, which yields the line's text and
    its user; lines with empty text are skipped. The first user seen is the
    "init" user. Consecutive lines by the same user are joined into one turn,
    each line being followed by a space when it already ends with one of
    ``ending_symbols`` and by ``eos_token`` otherwise.

    * A turn by the init user becomes an x line, prefixed with the context:
      the most recent earlier line written by a non-init user.
    * The turn that answers it becomes the matching y line.

    The pairs are appended to ``x_train_path`` / ``y_train_path`` only when
    both lists have the same length; otherwise the two lengths are printed
    and nothing is written. Both output files are opened in append mode in
    either case.

    Args:
        path: Conversation file to read.
        x_train_path: File the x lines are appended to.
        y_train_path: File the y lines are appended to.
    """
    go_token = ""
    eos_token = " . "
    eot_token = ""

    ending_symbols = ("!", "?", ".", "_EMJ", "_EMJ ")

    def terminated(sentence):
        # Add the end-of-sentence marker unless the text already ends a sentence.
        if sentence.endswith(ending_symbols):
            return sentence + " "
        return sentence + eos_token

    user1_first_line = True
    # Pre-bound so that a file without any usable line reaches the end of the
    # function instead of raising UnboundLocalError in the check after the loop.
    init_user = None
    current_user = None
    previous_user = None

    x_train = []
    y_train = []

    context = ""
    new_context = ""

    sentence_holder = ""
    with open(path) as fileobject:
        for line in fileobject:
            text, user = split_line(line)
            if text == "":
                continue
            current_user = user
            if user1_first_line:
                init_user, previous_user = current_user, current_user
                user1_first_line = False
                sentence_holder = go_token

            # Context is just the previous line written by a non-init user.
            if current_user != init_user:
                context = new_context
                new_context = terminated(text)

            if current_user == previous_user:  # The user is still talking
                sentence_holder += terminated(text)
            else:  # A new user responds
                sentence_holder += eot_token + "\n"

                if current_user == init_user:
                    # The init user talks again: the finished turn was the reply.
                    y_train.append(sentence_holder)
                else:
                    x_train.append(context + " " + sentence_holder)
                sentence_holder = go_token + terminated(text)

            previous_user = current_user

    # A conversation ending on a non-init user leaves one reply still pending.
    if current_user != init_user:
        y_train.append(sentence_holder + eot_token + "\n")

    with open(x_train_path, 'a') as xTrainObject, open(y_train_path,
                                                       'a') as yTrainObject:
        y_len = len(y_train)
        if y_len == len(x_train):
            for x_line, y_line in zip(x_train, y_train):
                xTrainObject.write(x_line.strip() + "\n")
                yTrainObject.write(y_line.strip() + "\n")
        else:
            print("-----")
            print("len x:")
            print(len(x_train))
            print("len y:")
            print(y_len)
            print("-----")
Exemple #3
0
def preprocess_on_stateful(path, bucket_size, file_name_number,
                           misspellings_dictionary, dest_path,
                           fasttext_dictionary):
    """Split one conversation into x/y turn files, unless a turn is too long.

    Each line goes through ``split_line``, ``do_regex_on_line`` and
    ``do_misspellings_on_line``; every word of a non-empty line is then passed
    through ``replace_word_helper``. Consecutive lines by the same user are
    merged into one turn. Turns finished by the first user seen are collected
    as x, the turns answering them as y.

    Reading stops, and nothing is saved, as soon as the words of the current
    line plus the words of the turn being built exceed ``bucket_size``.

    Args:
        path: Conversation file to read.
        bucket_size: Maximum word count tolerated before the file is rejected.
        file_name_number: Used to build the output names
            ``<dest_path><file_name_number>_x.txt`` and ``..._y.txt``.
        misspellings_dictionary: Passed to ``do_misspellings_on_line``.
        dest_path: Prefix of the output file names.
        fasttext_dictionary: Passed to ``replace_word_helper``.

    Returns:
        True when both files were handed to ``save_to_file``, False when the
        conversation was rejected or produced no x or no y turn.
    """
    go_token = ""
    eos_token = " . "
    eot_token = ""
    ending_symbols = ("!", "?", ".", "_EMJ", "_EMJ ")

    def close_sentence(sentence):
        # Only add the end-of-sentence marker when the text lacks an ending.
        suffix = " " if sentence.endswith(ending_symbols) else eos_token
        return sentence + suffix

    too_long = False
    seen_first_line = False
    inputs = []
    targets = []
    turn_text = ""

    with open(path) as conversation:
        for raw_line in conversation:
            text, user = split_line(raw_line)
            text = do_regex_on_line(text, tokens['url'][0],
                                    " " + tokens['emoji'][0],
                                    tokens['directory'][0])
            text = do_misspellings_on_line(text, misspellings_dictionary)

            words = text.split()
            # The length check runs before the empty-line check, as the turn
            # being built counts towards the limit too.
            if len(words) + len(turn_text.split()) > bucket_size:
                too_long = True
                break

            if text == "":
                continue
            current_user = user

            text = " ".join(
                replace_word_helper(word, fasttext_dictionary)
                for word in words)

            if not seen_first_line:
                init_user = previous_user = current_user
                seen_first_line = True
                turn_text = go_token

            if current_user == previous_user:
                # Same speaker: keep extending the current turn.
                turn_text += close_sentence(text)
            else:
                # Speaker changed: file the finished turn, start a new one.
                finished_turn = turn_text + eot_token + "\n"
                if current_user == init_user:
                    targets.append(finished_turn)
                else:
                    inputs.append(finished_turn)
                turn_text = go_token + close_sentence(text)

            previous_user = current_user

    if too_long or not targets or not inputs:
        return False

    # A conversation ending on a non-init user leaves one reply still pending.
    if current_user != init_user:
        targets.append(turn_text + eot_token + "\n")

    output_prefix = dest_path + str(file_name_number)
    save_to_file(output_prefix + "_x.txt", inputs)
    save_to_file(output_prefix + "_y.txt", targets)
    return True
Exemple #4
0
def non_turns_exceed_max_turns_in_conv(file_path, fit_1, fit_2, fit_3, fit_4,
                                       fit_5, fit_6):
    """Count the turns of a conversation and check them against six limits.

    Every line is handed to ``split_line``, which yields the line's text and
    its user; lines with empty text are skipped. Consecutive lines by the
    same user are joined into one turn. Turns finished by the first user seen
    count as x turns, the turns answering them as y turns.

    Args:
        file_path: Conversation file to read.
        fit_1, fit_2, fit_3, fit_4, fit_5, fit_6: Word limits. The flag for a
            limit is False as soon as one turn holds more words than it.

    Returns:
        A 7-tuple ``(num_turns, fit_1_ok, ..., fit_6_ok)``. ``num_turns`` is
        the number of x turns plus y turns. When the x and y counts differ, a
        message is printed and ``(0, False, False, False, False, False,
        False)`` is returned. A file without any usable line yields
        ``(0, True, True, True, True, True, True)``.
    """
    go_token = ""
    eos_token = " . "
    eot_token = ""

    ending_symbols = ("!", "?", ".", "_EMJ", "_EMJ ")

    limits = (fit_1, fit_2, fit_3, fit_4, fit_5, fit_6)
    fits = [True] * len(limits)

    def terminated(sentence):
        # Add the end-of-sentence marker unless the text already ends a sentence.
        if sentence.endswith(ending_symbols):
            return sentence + " "
        return sentence + eos_token

    def record_turn(turn):
        # Clear the flag of every limit that this turn's word count exceeds.
        num_words = len(turn.split())
        for index, limit in enumerate(limits):
            if num_words > limit:
                fits[index] = False

    user1_first_line = True
    # Pre-bound so that a file without any usable line reaches the end of the
    # function instead of raising UnboundLocalError in the check after the loop.
    init_user = None
    current_user = None
    previous_user = None

    x_count = 0
    y_count = 0

    sentence_holder = ""
    with open(file_path) as fileobject:
        for line in fileobject:
            text, user = split_line(line)
            if text == "":
                continue
            current_user = user
            if user1_first_line:
                init_user, previous_user = current_user, current_user
                user1_first_line = False
                sentence_holder = ""

            if current_user == previous_user:  # The user is still talking
                sentence_holder += terminated(text)
            else:  # A new user responds
                sentence_holder += eot_token + "\n"
                if current_user == init_user:
                    # The init user talks again: the finished turn was a reply.
                    y_count += 1
                else:
                    x_count += 1
                record_turn(sentence_holder)
                sentence_holder = go_token + terminated(text)

            previous_user = current_user

    # A conversation ending on a non-init user leaves one reply still pending.
    if current_user != init_user:
        y_count += 1
        record_turn(sentence_holder + eot_token + "\n")

    if x_count != y_count:
        print("different y and x len")
        return 0, False, False, False, False, False, False
    return (x_count + y_count,) + tuple(fits)
Exemple #5
0
def preprocess_training_file(path, x_train_path, y_train_path):
    """Append the length-filtered x/y training pairs of one conversation file.

    Every line is handed to ``split_line``, which yields the line's text and
    its user; lines with empty text are skipped. The first user seen is the
    "init" user. Consecutive lines by the same user are joined into one turn,
    each line being followed by a space when it already ends with one of
    ``ending_symbols`` and by ``eos_token`` otherwise.

    * A turn by the init user becomes an x line, prefixed with the context:
      the previous reply, or nothing when that reply was longer than 30 words.
    * The turn that answers it becomes the matching y line.

    A completed pair is dropped when its x line has more than 60 words or its
    y line more than 30. This includes the pair completed at the end of the
    file.

    The pairs are appended to ``x_train_path`` / ``y_train_path`` only when
    both lists have the same length; otherwise the two lengths are printed
    and nothing is written. Both output files are opened in append mode in
    either case.

    Args:
        path: Conversation file to read.
        x_train_path: File the x lines are appended to.
        y_train_path: File the y lines are appended to.
    """
    go_token = ""
    eos_token = " . "
    eot_token = ""

    ending_symbols = ("!", "?", ".", "_EMJ", "_EMJ ")

    max_context_words = 30
    max_x_words = 60
    max_y_words = 30

    def terminated(sentence):
        # Add the end-of-sentence marker unless the text already ends a sentence.
        if sentence.endswith(ending_symbols):
            return sentence + " "
        return sentence + eos_token

    x_train = []
    y_train = []

    def drop_last_pair_if_too_long():
        # Only a complete pair (equal list lengths) can be judged.
        if y_train and len(x_train) == len(y_train):
            if (len(x_train[-1].split()) > max_x_words
                    or len(y_train[-1].split()) > max_y_words):
                del x_train[-1]
                del y_train[-1]

    user1_first_line = True
    # Pre-bound so that a file without any usable line reaches the end of the
    # function instead of raising UnboundLocalError in the check after the loop.
    init_user = None
    current_user = None
    previous_user = None

    context = ""

    sentence_holder = ""
    with open(path) as fileobject:
        for line in fileobject:
            text, user = split_line(line)
            if text == "":
                continue
            current_user = user
            if user1_first_line:
                init_user, previous_user = current_user, current_user
                user1_first_line = False
                sentence_holder = go_token

            if current_user == previous_user:  # The user is still talking
                sentence_holder += terminated(text)
            else:  # A new user responds
                if current_user == init_user:
                    # The init user talks again: the finished turn was the
                    # reply, which also serves as context for the next x line.
                    if len(sentence_holder.split()) <= max_context_words:
                        context = sentence_holder
                    else:
                        context = ""
                    sentence_holder += eot_token + "\n"
                    y_train.append(sentence_holder)
                else:
                    sentence_holder += eot_token + "\n"
                    x_train.append(context + " " + sentence_holder)
                sentence_holder = go_token + terminated(text)

            drop_last_pair_if_too_long()

            previous_user = current_user

    # A conversation ending on a non-init user leaves one reply still pending.
    # The pair it completes must pass the same length filter as the others.
    if current_user != init_user:
        y_train.append(sentence_holder + eot_token + "\n")
        drop_last_pair_if_too_long()

    with open(x_train_path, 'a') as xTrainObject, open(y_train_path,
                                                       'a') as yTrainObject:
        y_len = len(y_train)
        if y_len == len(x_train):
            for x_line, y_line in zip(x_train, y_train):
                xTrainObject.write(x_line.strip() + "\n")
                yTrainObject.write(y_line.strip() + "\n")
        else:
            print("-----")
            print("len x:")
            print(len(x_train))
            print("len y:")
            print(y_len)
            print("-----")