Beispiel #1
0
def add_noise_to_string(a_string, amount_of_noise):
    """Add some artificial spelling mistakes to the string.

    Up to four edits are attempted in a fixed order: replace, delete,
    insert, transpose.  Each edit fires when
    ``rand() < amount_of_noise * len(a_string)``, evaluated against the
    length the string has at that point.  Inserted and replacement
    characters are drawn from every entry of ``CHARS`` except the last one.

    :param a_string: text to corrupt
    :param amount_of_noise: factor multiplied by the string length to get
        the threshold each edit's random draw is compared against
    :return: the (possibly) corrupted string
    """
    if rand() < amount_of_noise * len(a_string):
        # Replace a character with a random character
        position = random_randint(len(a_string))
        a_string = (a_string[:position] + random_choice(CHARS[:-1]) +
                    a_string[position + 1:])
    if rand() < amount_of_noise * len(a_string):
        # Delete a character
        position = random_randint(len(a_string))
        a_string = a_string[:position] + a_string[position + 1:]
    if len(a_string) < MAX_INPUT_LEN and rand(
    ) < amount_of_noise * len(a_string):
        # Add a random character (never growing past MAX_INPUT_LEN)
        position = random_randint(len(a_string))
        a_string = (a_string[:position] + random_choice(CHARS[:-1]) +
                    a_string[position:])
    # rand() is drawn first so the random stream is the same as before for
    # every input; the length check then keeps one-character strings away
    # from random_randint(0), which raises ValueError.
    if rand() < amount_of_noise * len(a_string) and len(a_string) > 1:
        # Transpose 2 characters
        position = random_randint(len(a_string) - 1)
        a_string = (a_string[:position] +
                    a_string[position + 1] +
                    a_string[position] +
                    a_string[position + 2:])
    return a_string
Beispiel #2
0
    def add_noise_to_string(self, a_string):
        """Build a list of randomly corrupted variants of ``a_string``.

        Between one and three variants are started from the original string.
        Each variant receives one or two random edits (replace, delete,
        insert or transpose), and the string is recorded after every single
        edit attempt, so a two-edit variant contributes two entries.
        An edit that cannot be applied (for example a transposition on a
        string shorter than two characters) leaves the string unchanged but
        is still recorded.

        :param a_string: the clean string
        :return: list of noisy strings
        """
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")
        noisy_versions = []
        for _variant in range(random.randrange(1, 4)):
            # Every variant restarts from the untouched input.
            current = a_string
            for _hop in range(random.randrange(1, 3)):
                edit_kind = random.randrange(1, 5)
                if edit_kind == 1 and len(current) > 0:
                    # Substitute one character (the last alphabet entry is never used)
                    pos = random_randint(len(current))
                    head, tail = current[:pos], current[pos + 1:]
                    current = head + random_choice(alphabet[:-1]) + tail
                elif edit_kind == 2 and len(current) > 0:
                    # Drop one character
                    pos = random_randint(len(current))
                    current = current[:pos] + current[pos + 1:]
                elif edit_kind == 3:
                    # Insert one character; an empty string simply becomes that character
                    if len(current) > 0:
                        pos = random_randint(len(current))
                        head, tail = current[:pos], current[pos:]
                        current = head + random_choice(alphabet[:-1]) + tail
                    else:
                        current = random_choice(alphabet[:-1])
                elif len(current) > 1:
                    # Swap two neighbouring characters
                    pos = random_randint(len(current) - 1)
                    current = (current[:pos] + current[pos + 1] +
                               current[pos] + current[pos + 2:])
                noisy_versions.append(current)
        return noisy_versions
def add_noise_to_string(
        a_string,
        amount_of_noise):
    """Add artificial spelling mistakes to string.

    Four edits are attempted in order (replace, delete, insert, transpose).
    Each one fires when ``rand() < amount_of_noise * len(a_string)`` for the
    string's length at that moment.  Random characters come from the local
    ``CHARS`` list without its last entry.

    :param a_string: text to corrupt
    :param amount_of_noise: factor multiplied by the string length to get
        the threshold each edit's random draw is compared against
    :return: the (possibly) corrupted string
    """
    from numpy.random import choice as random_choice, randint as random_randint, rand

    CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")

    if rand() < amount_of_noise * len(a_string):
        # Replace a character with a random character
        pos = random_randint(len(a_string))
        a_string = a_string[:pos] + random_choice(
            CHARS[:-1]) + a_string[pos + 1:]
    if rand() < amount_of_noise * len(a_string):
        # Delete a character
        pos = random_randint(len(a_string))
        a_string = a_string[:pos] + a_string[pos + 1:]
    if len(a_string) < MAX_INPUT_LEN and rand(
    ) < amount_of_noise * len(a_string):
        # Add a random character (never growing past MAX_INPUT_LEN)
        pos = random_randint(len(a_string))
        a_string = a_string[:pos] + random_choice(
            CHARS[:-1]) + a_string[pos:]
    # The draw happens first to keep the random stream unchanged; the length
    # check prevents random_randint(0), which raises ValueError, when only
    # one character is left.
    if rand() < amount_of_noise * len(a_string) and len(a_string) > 1:
        # Transpose 2 characters
        pos = random_randint(len(a_string) - 1)
        a_string = (a_string[:pos] +
                    a_string[pos + 1] +
                    a_string[pos] +
                    a_string[pos + 2:])
    return a_string
Beispiel #4
0
def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
    """Fit the model repeatedly and print sample predictions after each fit.

    Iterations are numbered from 1 up to (but not including)
    NUMBER_OF_ITERATIONS.  After every fit, ten randomly picked validation
    rows are decoded with ``ctable`` and printed next to the model's guess.
    """
    for iteration in range(1, NUMBER_OF_ITERATIONS):
        print()
        print('-' * 50)
        print('Iteration', iteration)
        model.fit(X_train, y_train,
                  batch_size=BATCH_SIZE,
                  nb_epoch=EPOCHS_PER_ITERATION,
                  validation_data=(X_val, y_val))
        # Show 10 random validation samples so errors can be inspected by eye
        for _ in range(10):
            # A one-element index array keeps the selected row two-dimensional
            picked = np.array([random_randint(0, len(X_val))])  # pylint:disable=no-member
            rowX = X_val[picked]
            rowy = y_val[picked]
            preds = model.predict_classes(rowX, verbose=0)
            question = ctable.decode(rowX[0])
            expected = ctable.decode(rowy[0])
            guess = ctable.decode(preds[0], calc_argmax=False)
            # Undo the input reversal before displaying the question
            print('Q', question[::-1] if INVERTED else question)
            print('A', expected)
            if expected == guess:
                mark = Colors.ok + '☑' + Colors.close
            else:
                mark = Colors.fail + '☒' + Colors.close
            print(mark, guess)
            print('---')
Beispiel #5
0
def generate_news_data(corpus):
    """Cut corpus lines into unique answers and derive noisy questions.

    ``corpus`` is consumed: lines are popped from it until it is empty.
    Each line is cut at spaces into pieces longer than MIN_INPUT_LEN and at
    most MAX_INPUT_LEN characters; pieces seen before are skipped.  The
    answers are shuffled, padded with '.' to MAX_INPUT_LEN, and each one gets
    a question produced by add_noise_to_string (padded the same way and
    reversed when INVERTED is set).

    :return: tuple ``(questions, answers)`` of equally long lists
    """
    print("Generating Data")
    questions, answers, seen_answers = [], [], set()
    while corpus:
        remaining = corpus.pop()
        while len(remaining) > MIN_INPUT_LEN:
            if len(remaining) > MAX_INPUT_LEN:
                cut = remaining.rfind(" ", MIN_INPUT_LEN, MAX_INPUT_LEN - 1)
                if cut == -1:
                    # No usable space in the window: retry from the last
                    # space anywhere in the line, or give up on this line.
                    cut = remaining.rfind(" ")
                    if cut == -1:
                        break
                    remaining = remaining[cut + 1:]
                    continue
                answer = remaining[:cut]
                remaining = remaining[cut + 1:]
            else:
                # The rest of the line fits as a single answer
                answer, remaining = remaining, ""
            if answer and answer in seen_answers:
                continue
            seen_answers.add(answer)
            answers.append(answer)
        if random_randint(100000) == 8:  # Occasional progress marker
            print('.', end="")
    print('suffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for position, answer in enumerate(answers):
        question = add_noise_to_string(answer, AMOUNT_OF_NOISE)
        question += '.' * (MAX_INPUT_LEN - len(question))
        answer += "." * (MAX_INPUT_LEN - len(answer))
        answers[position] = answer
        assert len(answer) == MAX_INPUT_LEN
        if random_randint(100000) == 8:  # Occasional sample dump
            print(len(seen_answers))
            print("answer:   '{}'".format(answer))
            print("question: '{}'".format(question))
            print()
        if INVERTED:
            question = question[::-1]
        questions.append(question)

    return questions, answers
Beispiel #6
0
def generate_news_data(corpus):
    """Turn raw corpus lines into padded (question, answer) training pairs.

    Lines are popped off ``corpus`` until none are left.  Every line is split
    at spaces into chunks longer than MIN_INPUT_LEN and no longer than
    MAX_INPUT_LEN; duplicate chunks are dropped.  After shuffling, each
    answer is padded with '.' to MAX_INPUT_LEN and paired with a noisy copy
    from add_noise_to_string, which is padded too and reversed if INVERTED.

    :return: ``(questions, answers)``
    """
    print("Generating Data")
    questions, answers, seen_answers = [], [], set()
    while corpus:
        text = corpus.pop()
        while len(text) > MIN_INPUT_LEN:
            if len(text) <= MAX_INPUT_LEN:
                answer, text = text, ""
            else:
                split_at = text.rfind(" ", MIN_INPUT_LEN, MAX_INPUT_LEN - 1)
                if split_at < 0:
                    last_space = text.rfind(" ")  # unrestricted search
                    if last_space < 0:
                        break  # nothing left to split in this line
                    text = text[last_space + 1:]
                    continue
                answer, text = text[:split_at], text[split_at + 1:]
            # Only record chunks that have not been produced before
            if not (answer and answer in seen_answers):
                seen_answers.add(answer)
                answers.append(answer)
        if random_randint(100000) == 8:  # Show some progress
            print('.', end="")
    print('suffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for idx, answer in enumerate(answers):
        question = add_noise_to_string(answer, AMOUNT_OF_NOISE)
        question += '.' * (MAX_INPUT_LEN - len(question))
        answer += "." * (MAX_INPUT_LEN - len(answer))
        answers[idx] = answer
        assert len(answer) == MAX_INPUT_LEN
        if random_randint(100000) == 8:  # Show some progress
            print(len(seen_answers))
            print("answer:   '{}'".format(answer))
            print("question: '{}'".format(question))
            print()
        questions.append(question[::-1] if INVERTED else question)

    return questions, answers
def add_noise_to_sentence(sentence, amount_of_noise):
    """
     Inject artificial spelling mistakes into the words of a sentence.

     Four edits (replace, delete, insert, transpose) are attempted in order;
     each fires when ``rand() < amount_of_noise * len(sentence)`` and then
     targets one randomly chosen word.  The list is modified in place and
     also returned.  Random letters are taken from ``alphabet[:-1]``, so the
     last letter ('z') is never chosen.
    :param sentence: list of words
    :param amount_of_noise: factor multiplied by the number of words to get
        the threshold each edit's random draw is compared against
    :return: the same list, with mistakes applied
    """

    alphabet = list("abcdefghijklmnopqrstuvwxyz")

    # Replace a character with a random character (skipped for empty words)
    if rand() < amount_of_noise * len(sentence):
        word_idx = random_randint(len(sentence))
        word = sentence[word_idx]
        if len(word):
            char_idx = random_randint(len(word))
            prefix, suffix = word[:char_idx], word[char_idx + 1:]
            sentence[word_idx] = prefix + random_choice(alphabet[:-1]) + suffix

    # Delete a character (only from words with at least two characters)
    if rand() < amount_of_noise * len(sentence):
        word_idx = random_randint(len(sentence))
        word = sentence[word_idx]
        if len(word) > 1:
            char_idx = random_randint(len(word))
            sentence[word_idx] = word[:char_idx] + word[char_idx + 1:]

    # Add a random character (skipped for empty words)
    if rand() < amount_of_noise * len(sentence):
        word_idx = random_randint(len(sentence))
        word = sentence[word_idx]
        if len(word):
            char_idx = random_randint(len(word))
            prefix, suffix = word[:char_idx], word[char_idx:]
            sentence[word_idx] = prefix + random_choice(alphabet[:-1]) + suffix

    # Transpose 2 neighbouring characters (needs at least two characters)
    if rand() < amount_of_noise * len(sentence):
        word_idx = random_randint(len(sentence))
        word = sentence[word_idx]
        if len(word) > 1:
            char_idx = random_randint(len(word) - 1)
            sentence[word_idx] = (word[:char_idx] + word[char_idx + 1] +
                                  word[char_idx] + word[char_idx + 2:])
    return sentence
Beispiel #8
0
def add_noise_to_string(a_string, amount_of_noise):
    """Add some artificial spelling mistakes to the string.

    The edits replace, delete, insert and transpose are attempted in that
    order.  Each is applied when ``rand() < amount_of_noise * len(a_string)``
    for the current length of the string.  Random characters are taken from
    ``CHARS`` without its last entry.

    :param a_string: text to corrupt
    :param amount_of_noise: factor multiplied by the string length to get
        the threshold each edit's random draw is compared against
    :return: the (possibly) corrupted string
    """
    if rand() < amount_of_noise * len(a_string):
        # Replace a character with a random character
        idx = random_randint(len(a_string))
        a_string = a_string[:idx] + random_choice(CHARS[:-1]) + a_string[idx + 1:]
    if rand() < amount_of_noise * len(a_string):
        # Delete a character
        idx = random_randint(len(a_string))
        a_string = a_string[:idx] + a_string[idx + 1:]
    if len(a_string) < MAX_INPUT_LEN and rand() < amount_of_noise * len(a_string):
        # Add a random character (never growing past MAX_INPUT_LEN)
        idx = random_randint(len(a_string))
        a_string = a_string[:idx] + random_choice(CHARS[:-1]) + a_string[idx:]
    # Draw first (same random stream as before), then require two characters:
    # random_randint(0) raises ValueError for a one-character string.
    if rand() < amount_of_noise * len(a_string) and len(a_string) > 1:
        # Transpose 2 characters
        idx = random_randint(len(a_string) - 1)
        a_string = (a_string[:idx] + a_string[idx + 1] + a_string[idx] +
                    a_string[idx + 2:])
    return a_string
def print_random_predictions(model, ctable, X_val, y_val):
    """Print ten randomly chosen validation samples next to the model's guess.

    Each sample is shown as the decoded question (reversed back when
    CONFIG.inverted is set), the decoded expected answer and the decoded
    prediction, prefixed by a coloured check or cross mark.
    """
    print()
    for _ in range(10):
        # A one-element index array keeps the selected row two-dimensional
        picked = np.array([random_randint(0, len(X_val))])  # pylint:disable=no-member
        rowX = X_val[picked]
        rowy = y_val[picked]
        preds = model.predict_classes(rowX, verbose=0)
        question = ctable.decode(rowX[0])
        expected = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        print('Q', question[::-1] if CONFIG.inverted else question)
        print('A', expected)
        if expected == guess:
            mark = Colors.green + '☑' + Colors.close
        else:
            mark = Colors.red + '☒' + Colors.close
        print(mark, guess)
        print('---')
    print()
Beispiel #10
0
def show_samples(model, dataset, epoch, logs, X_dev_batch, y_dev_batch):
    """Print ten random dev-set samples together with the model's prediction.

    ``epoch`` and ``logs`` are accepted but not used here.  Rows are decoded
    through ``dataset.character_table``; the question is reversed back
    before printing when INVERTED is set.
    """
    for _ in range(10):
        # A one-element index array keeps the selected row two-dimensional
        picked = np.array([random_randint(0, len(X_dev_batch))])
        row_X = X_dev_batch[picked]
        row_y = y_dev_batch[picked]
        preds = model.predict_classes(row_X, verbose=0)
        question = dataset.character_table.decode(row_X[0])
        expected = dataset.character_table.decode(row_y[0])
        guess = dataset.character_table.decode(preds[0], calc_argmax=False)

        print('Q', question[::-1] if INVERTED else question)
        print('A', expected)
        if expected == guess:
            mark = Colors.ok + '☑' + Colors.close
        else:
            mark = Colors.fail + '☒' + Colors.close
        print(mark, guess)
        print('---')
Beispiel #11
0
def generate_news_data():
    """Load the pre-split news lines and build (question, answer) pairs.

    The file named by NEWS_FILE_NAME_SPLIT is read as UTF-8 and split on
    newlines.  The lines are shuffled, then each is passed through
    generate_question, which returns the question together with the answer
    to store; the answer must be CONFIG.max_input_len characters long.
    Questions are reversed when CONFIG.inverted is set.

    :return: tuple ``(questions, answers)``
    """
    print("Generating Data")
    # Read raw bytes and decode explicitly: this behaves the same on Python 2
    # and 3 (str has no .decode on Python 3), and the context manager makes
    # sure the file handle is closed.
    with open(NEWS_FILE_NAME_SPLIT, "rb") as news_file:
        answers = news_file.read().decode('utf-8').split("\n")
    questions = []
    print('shuffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for answer_index, answer in enumerate(answers):
        question, answer = generate_question(answer)
        answers[answer_index] = answer
        assert len(answer) == CONFIG.max_input_len
        if random_randint(100000) == 8:  # Show some progress
            print(len(answers))
            print("answer:   '{}'".format(answer))
            print("question: '{}'".format(question))
            print()
        question = question[::-1] if CONFIG.inverted else question
        questions.append(question)

    return questions, answers
Beispiel #12
0
def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
    """Train in rounds, printing ten decoded validation samples per round.

    Rounds are numbered from 1 up to (but not including)
    NUMBER_OF_ITERATIONS.  Each round fits the model once and then prints
    randomly picked validation rows beside the model's prediction.
    """
    for iteration in range(1, NUMBER_OF_ITERATIONS):
        print()
        print('-' * 50)
        print('Iteration', iteration)
        fit_options = dict(batch_size=BATCH_SIZE,
                           nb_epoch=EPOCHS_PER_ITERATION,
                           validation_data=(X_val, y_val),
                           show_accuracy=True)
        model.fit(X_train, y_train, **fit_options)
        # Show 10 random validation samples so errors can be inspected by eye
        for _ in range(10):
            picked = np.array([random_randint(0, len(X_val))])  # pylint:disable=no-member
            rowX = X_val[picked]
            rowy = y_val[picked]
            preds = model.predict_classes(rowX, verbose=0)
            question = ctable.decode(rowX[0])
            expected = ctable.decode(rowy[0])
            guess = ctable.decode(preds[0], calc_argmax=False)
            # Undo the input reversal before displaying the question
            print('Q', question[::-1] if INVERTED else question)
            print('A', expected)
            if expected == guess:
                mark = Colors.ok + '☑' + Colors.close
            else:
                mark = Colors.fail + '☒' + Colors.close
            print(mark, guess)
            print('---')
Beispiel #13
0
def show_samples(model, dataset, epoch, logs, X_dev_batch, y_dev_batch):
    """Append ten random dev-set samples and predictions to data/outFile.txt.

    For each sample four lines are written: the decoded question (reversed
    back when INVERTED is set), the decoded expected answer, the model's
    guess prefixed by Colors.ok or Colors.fail, and a '---' separator.
    ``epoch`` and ``logs`` are accepted but not used here.
    """
    # Open the output once for the whole batch instead of twice per sample.
    with open("data/outFile.txt", "a", encoding="utf-8") as out:
        for _ in range(10):
            ind = random_randint(0, len(X_dev_batch))
            # A one-element index array keeps the selected row two-dimensional
            row_X, row_y = X_dev_batch[np.array([ind])], y_dev_batch[np.array([ind])]
            preds = model.predict_classes(row_X, verbose=0)
            q = dataset.character_table.decode(row_X[0])
            correct = dataset.character_table.decode(row_y[0])
            guess = dataset.character_table.decode(preds[0], calc_argmax=False)

            # write() adds no line terminator by itself, so each record line
            # is ended explicitly; otherwise all samples run together.
            out.write('Q ' + (q[::-1] if INVERTED else q) + '\n')
            out.write('A ' + correct + '\n')
            status = Colors.ok if correct == guess else Colors.fail
            out.write(status + '?' + ' ' + guess + '\n')
            out.write('---' + '\n')
Beispiel #14
0
def add_noise_to_sentence(sentence, amount_of_noise, max_len=197):
    """
     Add artificial spelling mistakes to string

     Four edits (substitute, delete, insert, transpose) are attempted in
     order; each fires when ``rand() < amount_of_noise * len(sentence)`` for
     the current length of the string.
    :param sentence: string to corrupt (sliced and concatenated as text)
    :param amount_of_noise: factor multiplied by the string length to get
        the threshold each edit's random draw is compared against
    :param max_len: a character is only inserted while the string is shorter
        than this
    :return: the string with mistakes
    """

    CHARS = list("abcdefghijklmnopqrstuvwxyz")

    # Candidate replacements per character.  Every listed character currently
    # maps only to itself, so a substitution leaves the text unchanged.
    substitutions = {
        char: [char]
        for char in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ."
    }

    if rand() < amount_of_noise * len(sentence):
        # Replace a character with one of its substitutions
        random_char_position = random_randint(len(sentence))
        original_char = sentence[random_char_position]
        # Characters missing from the table (digits, punctuation, ...) fall
        # back to themselves instead of raising KeyError.
        candidates = substitutions.get(original_char, [original_char])
        sentence = sentence[:random_char_position] + random_choice(
            candidates) + sentence[random_char_position + 1:]

    if rand() < amount_of_noise * len(sentence):
        # Delete a character
        random_char_position = random_randint(len(sentence))
        sentence = sentence[:random_char_position] + sentence[
            random_char_position + 1:]

    if rand() < amount_of_noise * len(sentence) and len(sentence) < max_len:
        # Add a random character (CHARS[:-1] leaves out the last letter)
        random_char_position = random_randint(len(sentence))
        sentence = sentence[:random_char_position] + random_choice(
            CHARS[:-1]) + sentence[random_char_position:]

    # A swap needs two characters: random_randint(0) raises ValueError.
    if rand() < amount_of_noise * len(sentence) and len(sentence) > 1:
        # Transpose 2 characters
        random_char_position = random_randint(len(sentence) - 1)
        sentence = sentence[:random_char_position] + sentence[random_char_position + 1] + \
                   sentence[random_char_position] + sentence[random_char_position + 2:]

    return sentence