def vectorize_data(vec_cleaned, data_name):  # training, dev, or test
    X_vec = np.zeros((len(vec_cleaned) // batchsize, batchsize, data_dim),
                     dtype=bool)
    Y_vec = np.zeros((len(vec_cleaned) // batchsize, batchsize, len(vocab)),
                     dtype=bool)
    X_token = []
    # easy minibatch
    # https://docs.python.org/2.7/library/functions.html?highlight=zip#zip
    for m, mini_batch_tokens in enumerate(
            zip(*[iter(vec_cleaned)] * batchsize)):
        X_token_m = []
        x_mini_batch = np.zeros((batchsize, data_dim), dtype=bool)
        y_mini_batch = np.zeros((batchsize, len(vocab)), dtype=bool)
        for j, token in enumerate(mini_batch_tokens):
            if not mistake_happen():
                # leave this token unchanged
                x_mini_batch[j], x_token = binarize.noise_char(
                    token, "NO NOISE", alph)
            elif noise_type == 'OCR':
                # REPLACETABLE is listed twice to make it more probable
                rnd_noise = random.choice([
                    'DELETE', 'INSERT', 'REPLACE', 'REPLACETABLE',
                    'REPLACETABLE'
                ])
                x_mini_batch[j], x_token = binarize.noise_char(
                    token, rnd_noise, alph)
            elif jumble_type == 'NO':
                x_mini_batch[j], x_token = binarize.noise_char(
                    token, noise_type, alph)
            else:
                x_mini_batch[j], x_token = binarize.jumble_char(
                    token, jumble_type, alph)
            # one-hot target over the vocabulary, with a shared unknown-word id
            bin_label = [0] * len(vocab)
            if token in vocab:
                bin_label[vocab[token]] = 1
            else:
                bin_label[ID_UNKNOWN_WORD] = 1
            y_mini_batch[j] = np.array(bin_label)
            X_token_m.append(x_token)
        X_vec[m] = x_mini_batch
        Y_vec[m] = y_mini_batch
        X_token.append(X_token_m)
    return X_vec, Y_vec, X_token
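
# `mistake_happen` is called above but not defined in this section. A minimal
# sketch of what it plausibly does, assuming a fixed per-token corruption
# probability (both the name NOISE_PROBABILITY and the 0.5 value are
# assumptions, not taken from the original):
import random

NOISE_PROBABILITY = 0.5  # assumed fraction of tokens that get corrupted

def mistake_happen():
    """Return True when the current token should be corrupted."""
    return random.random() < NOISE_PROBABILITY
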
def vectorize_data(vec_cleaned, alph, noise_type='JUMBLE', jumble_type='INT'):
    X_vec = np.zeros((len(vec_cleaned), len(alph) * 3), dtype=bool)
    for i, word in enumerate(vec_cleaned):
        if jumble_type == 'NO':
            x_feat, _ = binarize.noise_char(word, noise_type, alph)
        else:
            x_feat, _ = binarize.jumble_char(word, jumble_type, alph)
        X_vec[i] = x_feat
    # hard-coded for exactly 20 input tokens and len(alph) * 3 == 228 features
    return X_vec.reshape((1, 20, 228))
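
# The len(alph) * 3 feature width suggests the semi-character representation
# (one-hot first character, bag of interior characters, one-hot last
# character). A self-contained sketch of that encoding, offered as an
# assumption about what binarize produces rather than its actual code:
import numpy as np

def semi_character_vector(word, alph):
    """Encode a word as [first-char one-hot | interior counts | last-char one-hot]."""
    idx = {c: i for i, c in enumerate(alph)}
    n = len(alph)
    vec = np.zeros(3 * n, dtype=np.float32)
    if word and word[0] in idx:
        vec[idx[word[0]]] = 1              # first character
    for c in word[1:-1]:
        if c in idx:
            vec[n + idx[c]] += 1           # interior character counts
    if len(word) > 1 and word[-1] in idx:
        vec[2 * n + idx[word[-1]]] = 1     # last character
    return vec
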
def vectorize_data(vec_cleaned, data_name):  # training, dev, or test
    X_vec = np.zeros(
        (len(vec_cleaned) // batchsize, batchsize, len(alph) * 3),
        dtype=bool)
    Y_vec = np.zeros((len(vec_cleaned) // batchsize, batchsize, len(vocab)),
                     dtype=bool)
    X_token = []
    # easy minibatch
    # https://docs.python.org/2.7/library/functions.html?highlight=zip#zip
    for m, mini_batch_tokens in enumerate(
            zip(*[iter(vec_cleaned)] * batchsize)):
        X_token_m = []
        x_mini_batch = np.zeros((batchsize, len(alph) * 3), dtype=bool)
        y_mini_batch = np.zeros((batchsize, len(vocab)), dtype=bool)
        for j, token in enumerate(mini_batch_tokens):
            if jumble_type == 'NO':
                x_mini_batch[j], x_token = binarize.noise_char(
                    token, noise_type, alph)
            else:
                x_mini_batch[j], x_token = binarize.jumble_char(
                    token, jumble_type, alph)
            # one-hot target; assumes every token is in vocab (no UNK fallback)
            bin_label = [0] * len(vocab)
            bin_label[vocab[token]] = 1
            y_mini_batch[j] = np.array(bin_label)
            X_token_m.append(x_token)
        X_vec[m] = x_mini_batch
        Y_vec[m] = y_mini_batch
        X_token.append(X_token_m)
    return X_vec, Y_vec, X_token
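
# The zip(*[iter(seq)] * batchsize) idiom above splits the flat token list
# into fixed-size minibatches by pulling batchsize items at a time from one
# shared iterator; any trailing remainder that does not fill a whole batch
# is silently dropped. A standalone demonstration:
tokens = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
for batch in zip(*[iter(tokens)] * 3):
    print(batch)
# ('a', 'b', 'c')
# ('d', 'e', 'f')
# 'g' is dropped, mirroring the len(vec_cleaned) // batchsize allocation.
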
def vectorize_data(vec_cleaned, data_name):  # training, dev, or test
    X_vec = np.zeros((len(vec_cleaned) // batchsize, batchsize, data_dim),
                     dtype=bool)
    X_token = []
    # easy minibatch
    # https://docs.python.org/2.7/library/functions.html?highlight=zip#zip
    for m, mini_batch_tokens in enumerate(
            zip(*[iter(vec_cleaned)] * batchsize)):
        X_token_m = []
        x_mini_batch = np.zeros((batchsize, data_dim), dtype=bool)
        for j, token in enumerate(mini_batch_tokens):
            # "No noise" leaves the token intact; only the features are built
            x_mini_batch[j], x_token = binarize.noise_char(
                token, "No noise", alph)
            X_token_m.append(x_token)
        X_vec[m] = x_mini_batch
        X_token.append(X_token_m)
    return X_vec, X_token
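
# `data_dim` is not defined in this section. Given that the parallel variant
# above allocates len(alph) * 3 columns for the same features, it is
# presumably set once at module level; an assumed definition:
data_dim = len(alph) * 3  # assumption: the semi-character feature width
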
    action='store_true',
    help='If True, results will be printed in the console only')
args = parser.parse_args()

SOURCE_PATH = args.source
OUTPUT_FILE = open(args.output, "w")
SOURCE_TEXT = args.text
PRINT_CONSOLE = args.console

if SOURCE_TEXT == "":
    SOURCE_TEXT = open(SOURCE_PATH, "r").read()

OUTPUT_WORDS = [""]
text_cleaned = my_tokenize(SOURCE_TEXT)
print("\n")
for w in text_cleaned:
    if True:  # replace with mistake_happen() to corrupt only some tokens
        # REPLACETABLE is listed twice to make it more probable
        rnd_noise = random.choice(
            ['DELETE', 'INSERT', 'REPLACE', 'REPLACETABLE', 'REPLACETABLE'])
        token, w = binarize.noise_char(w, rnd_noise, alph)
    if PRINT_CONSOLE:
        print(w + " ", end="", flush=True)
    else:
        OUTPUT_FILE.write(w + " ")
OUTPUT_FILE.close()  # ensure buffered output is flushed to disk
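
# `my_tokenize` is used above but not defined in this section. A minimal
# whitespace-splitting sketch; the real implementation may lower-case or
# strip punctuation, so treat this as an assumption:
def my_tokenize(text):
    """Split raw text into tokens on whitespace."""
    return text.split()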