Example #1
def vectorize_data(vec_cleaned, data_name):  # training, dev, or test
    X_vec = np.zeros((len(vec_cleaned) // batchsize, batchsize, data_dim),
                     dtype=bool)  # np.bool was removed in NumPy 1.24; use the builtin bool
    Y_vec = np.zeros((len(vec_cleaned) // batchsize, batchsize, len(vocab)),
                     dtype=bool)
    X_token = []
    # easy minibatch
    # https://docs.python.org/2.7/library/functions.html?highlight=zip#zip
    for m, mini_batch_tokens in enumerate(zip(*[iter(vec_cleaned)] *
                                              batchsize)):
        X_token_m = []
        x_mini_batch = np.zeros((batchsize, data_dim), dtype=bool)
        y_mini_batch = np.zeros((batchsize, len(vocab)), dtype=bool)

        for j, token in enumerate(mini_batch_tokens):
            if not mistake_happen():
                x_mini_batch[j], x_token = binarize.noise_char(
                    token, "NO NOISE", alph)
            elif noise_type == 'OCR':
                rnd_noise = random.choice([
                    'DELETE', 'INSERT', 'REPLACE', 'REPLACETABLE',
                    'REPLACETABLE'
                ])  # 'REPLACETABLE' is listed twice to make it more probable
                x_mini_batch[j], x_token = binarize.noise_char(
                    token, rnd_noise, alph)
            elif jumble_type == 'NO':
                x_mini_batch[j], x_token = binarize.noise_char(
                    token, noise_type, alph)
            else:
                x_mini_batch[j], x_token = binarize.jumble_char(
                    token, jumble_type, alph)

            bin_label = [0] * len(vocab)

            if token in vocab:
                bin_label[vocab[token]] = 1
            else:
                bin_label[ID_UNKNOWN_WORD] = 1

            y_mini_batch[j] = np.array(bin_label)
            X_token_m.append(x_token)
        X_vec[m] = x_mini_batch
        Y_vec[m] = y_mini_batch
        X_token.append(X_token_m)

        #percentage = int(m*100. / (len(vec_cleaned)/batchsize))
        #sys.stdout.write("\r%d %% %s" % (percentage, data_name))
        #print(str(percentage) + '%'),
        #sys.stdout.flush()
    print()
    return X_vec, Y_vec, X_token
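
The zip(*[iter(vec_cleaned)] * batchsize) line is the standard Python chunking idiom referenced by the comment above: all batchsize positions share a single iterator, so each zip step consumes batchsize consecutive tokens and any trailing remainder is silently dropped. A minimal, self-contained demonstration:

tokens = ["the", "quick", "brown", "fox", "jumps",
          "over", "the", "lazy", "dog"]
batchsize = 4
for m, mini_batch in enumerate(zip(*[iter(tokens)] * batchsize)):
    print(m, mini_batch)
# 0 ('the', 'quick', 'brown', 'fox')
# 1 ('jumps', 'over', 'the', 'lazy')
# "dog" is dropped: len(tokens) // batchsize == 2 full batches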
Example #2
def vectorize_data(vec_cleaned, alph, noise_type='JUMBLE', jumble_type='INT'):
    X_vec = np.zeros((len(vec_cleaned), len(alph) * 3), dtype=bool)
    for i, word in enumerate(vec_cleaned):
        if jumble_type == 'NO':
            x_feat, _ = binarize.noise_char(word, noise_type, alph)
        else:
            x_feat, _ = binarize.jumble_char(word, jumble_type, alph)
        X_vec[i] = x_feat

    return X_vec.reshape((1, 20, 228))  # assumes exactly 20 tokens and len(alph) * 3 == 228
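
The hard-coded reshape implies a fixed window of 20 tokens with len(alph) * 3 == 228 features each (so len(alph) == 76). The factor of three matches the common semi-character word encoding (one-hot first character, bag of internal characters, one-hot last character). The sketch below assumes binarize follows that scheme; the snippets here do not confirm it, and it also assumes every character of word occurs in alph:

import numpy as np

def semi_char_vector(word, alph):
    # Assumed layout behind len(alph) * 3:
    # [one-hot first char | bag of internal chars | one-hot last char]
    idx = {c: i for i, c in enumerate(alph)}
    n = len(alph)
    vec = np.zeros(n * 3, dtype=bool)
    vec[idx[word[0]]] = True                # first-character block
    for c in word[1:-1]:
        vec[n + idx[c]] = True              # internal-characters block
    vec[2 * n + idx[word[-1]]] = True       # last-character block
    return vec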
Example #3
def vectorize_data(vec_cleaned, data_name):  # training, dev, or test
    X_vec = np.zeros(
        (len(vec_cleaned) // batchsize, batchsize, len(alph) * 3),
        dtype=bool)
    Y_vec = np.zeros(
        (len(vec_cleaned) // batchsize, batchsize, len(vocab)),
        dtype=bool)
    X_token = []
    # easy minibatch
    # https://docs.python.org/2.7/library/functions.html?highlight=zip#zip
    for m, mini_batch_tokens in enumerate(zip(*[iter(vec_cleaned)] *
                                              batchsize)):
        X_token_m = []
        x_mini_batch = np.zeros((batchsize, len(alph) * 3), dtype=bool)
        y_mini_batch = np.zeros((batchsize, len(vocab)), dtype=bool)

        for j, token in enumerate(mini_batch_tokens):
            if jumble_type == 'NO':
                x_mini_batch[j], x_token = binarize.noise_char(
                    token, noise_type, alph)
            else:
                x_mini_batch[j], x_token = binarize.jumble_char(
                    token, jumble_type, alph)

            bin_label = [0] * len(vocab)
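            # NOTE: unlike Example #1, this indexes vocab[token] directly,
            # so an out-of-vocabulary token raises KeyError here.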
            bin_label[vocab[token]] = 1
            y_mini_batch[j] = np.array(bin_label)
            X_token_m.append(x_token)
        X_vec[m] = x_mini_batch
        Y_vec[m] = y_mini_batch
        X_token.append(X_token_m)

        #percentage = int(m*100. / (len(vec_cleaned)/batchsize))
        #sys.stdout.write("\r%d %% %s" % (percentage, data_name))
        #print(str(percentage) + '%'),
        #sys.stdout.flush()
    print()
    return X_vec, Y_vec, X_token
Example #4
def vectorize_data(vec_cleaned, data_name):  # training, dev, or test
    X_vec = np.zeros((len(vec_cleaned) // batchsize, batchsize, data_dim),
                     dtype=bool)
    X_token = []
    # easy minibatch
    # https://docs.python.org/2.7/library/functions.html?highlight=zip#zip
    for m, mini_batch_tokens in enumerate(zip(*[iter(vec_cleaned)] *
                                              batchsize)):
        X_token_m = []
        x_mini_batch = np.zeros((batchsize, data_dim), dtype=bool)
        for j, token in enumerate(mini_batch_tokens):
            x_mini_batch[j], x_token = binarize.noise_char(
                token, "No noise", alph)  # NB: Example #1 passes "NO NOISE"; the casing may matter
            X_token_m.append(x_token)

        X_vec[m] = x_mini_batch
        X_token.append(X_token_m)

        #percentage = int(m*100. / (len(vec_cleaned)/batchsize))
        #sys.stdout.write("\r%d %% %s" % (percentage, data_name))
        #print(str(percentage) + '%'),
        #sys.stdout.flush()
    print()
    return X_vec, X_token
Example #5
    action='store_true',
    help='If True, results will be printed in the console only')

args = parser.parse_args()

SOURCE_PATH = args.source
OUTPUT_FILE = open(args.output, "w")
SOURCE_TEXT = args.text
PRINT_CONSOLE = args.console

if SOURCE_TEXT == "":
    SOURCE_TEXT = open(SOURCE_PATH, "r").read()

OUTPUT_WORDS = [""]

text_cleaned = my_tokenize(SOURCE_TEXT)

print("\n")

for w in text_cleaned:
    if True:  # replace True with mistake_happen() to corrupt only some tokens
        rnd_noise = random.choice(
            ['DELETE', 'INSERT', 'REPLACE', 'REPLACETABLE',
             'REPLACETABLE'])  # 'REPLACETABLE' is listed twice to make it more probable
        _, w = binarize.noise_char(w, rnd_noise, alph)  # keep only the noised string

    if PRINT_CONSOLE:
        print(w + " ", end="", flush=True)
    else:
        OUTPUT_FILE.write(w + " ")

OUTPUT_FILE.close()
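
mistake_happen() is called in Examples #1 and #5 but never defined in these snippets. A minimal sketch consistent with how it is used (a random coin flip deciding whether a token gets corrupted); the error rate is an assumed parameter, defined elsewhere in the original project:

import random

def mistake_happen(p=0.3):
    # Hypothetical reconstruction: returns True with probability p, so
    # roughly a fraction p of all tokens receive artificial noise.
    return random.random() < p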