def setup(self):
    tokenizer = train.tokenize(TRAIN_PATH, VALID_PATH, delimiter='\t')
    batch_size = 3
    max_length = 20
    vocab_size = 864
    self.dg = train.DataGenerator(TRAIN_PATH, batch_size, max_length, vocab_size, tokenizer)
def main():
    counter, ngram_size, vocab_size = load_model()
    while True:
        line = input('Enter a sentence (EXIT to break):')
        if line == 'EXIT':
            break
        words = tokenize(line, ngram_size)
        probability = 1.
        for offset in range(0, len(words) - ngram_size + 1):
            history = tuple(words[offset:offset + ngram_size - 1])
            joint = tuple(words[offset:offset + ngram_size])
            history_count = counter[history]
            joint_count = counter[joint]
            logging.info(str(history) + '\t count = %d' % history_count)
            logging.info(str(joint) + '\t count = %d' % joint_count)
            # probability with additive smoothing
            probability *= (joint_count + 1) / (history_count + vocab_size)
        print()
        print('------------------------------------------')
        print('Probability: %.40f' % probability)
        print()
        print()
def line_reader(line, ngram_size, vocabulary):
    # tokenize
    words = tokenize(line, ngram_size)
    # update vocabulary
    vocabulary |= set(words)
    # skip lines that are too short to produce proper ngrams
    if len(words) < ngram_size - 1:
        return
    # yield ngrams of size ngram_size and (ngram_size - 1)
    for offset in range(0, len(words) - ngram_size + 1):
        yield tuple(words[offset:offset + ngram_size])
        yield tuple(words[offset:offset + ngram_size - 1])
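# A minimal sketch (not taken from the original source) of how line_reader could
# be used to build the n-gram counter and vocabulary size that load_model() in
# main() above presumably returns. The function name build_counter and the
# corpus_path parameter are illustrative assumptions.
from collections import Counter

def build_counter(corpus_path, ngram_size):
    counter = Counter()
    vocabulary = set()
    with open(corpus_path, 'r') as fp:
        for line in fp:
            # count every full n-gram and its (n - 1)-gram history
            for ngram in line_reader(line, ngram_size, vocabulary):
                counter[ngram] += 1
    return counter, ngram_size, len(vocabulary)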
def categorize(self, test_list_path, doc_path, outfile):
    '''
    Helper function to categorize one document and write the results
    to the outfile.
    '''
    # Generate list of tokens for the given document
    token_list = tokenize(os.path.join(test_list_path, doc_path))

    # Compute similarity metric for each of the categories
    similarities = {}
    for category in self.ii.category_count.keys():
        similarities[category] = self.similarity(token_list, category)

    # Pick the category with highest similarity and write results to
    # output file
    label = max(similarities, key=similarities.get)
    print(doc_path + ' ' + label, file=outfile)
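# self.similarity is not shown above. The sketch below is purely illustrative of
# one plausible implementation (term-frequency cosine similarity against a
# per-category term-count table); the name term_count_by_category and its layout
# are assumptions, not the original code.
import math
from collections import Counter

def similarity_sketch(term_count_by_category, token_list, category):
    # count term occurrences in the document
    doc_counts = Counter(token_list)
    # assumed: {term: count} for this category
    cat_counts = term_count_by_category[category]
    dot = sum(doc_counts[t] * cat_counts.get(t, 0) for t in doc_counts)
    doc_norm = math.sqrt(sum(c * c for c in doc_counts.values()))
    cat_norm = math.sqrt(sum(c * c for c in cat_counts.values()))
    if doc_norm == 0 or cat_norm == 0:
        return 0.0
    return dot / (doc_norm * cat_norm)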
def test_tokenize():
    tokenizer = train.tokenize(TRAIN_PATH, VALID_PATH, delimiter='\t')
    test_data = [['容疑者', 'が']]
    assert tokenizer.texts_to_sequences(test_data) == [[265, 3]]
from train import complete_prompt, tokenize
from flask import Flask, render_template
from tensorflow.keras.models import load_model

app = Flask(__name__)
model = load_model('./model')
tokenize_result = tokenize()


@app.route('/<name>')
def show_name_poem(name):
    poem = complete_prompt(model, tokenize_result, "Dear " + name)
    return render_template('output.html', name=name, poem=poem)


if __name__ == '__main__':
    # Threaded option so the server can handle multiple concurrent user requests
    app.run(threaded=True, port=5000)
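# Usage note (assuming the script is saved as app.py and that ./model and
# templates/output.html exist as referenced above): running `python app.py` and
# visiting http://localhost:5000/Alice renders a poem prompted with "Dear Alice".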
def main():
    # verify arg
    if len(sys.argv) != 2:
        print("Missing argument 1: absolute path to text to label")
        return

    # get arg as name
    filePath = sys.argv[1]

    # try to load file
    if not os.path.isfile(filePath):
        print("File at {} does not exist".format(filePath))
        return

    # check extension
    if filePath[-4:] != '.txt':
        print("File exists, but must be .txt format")
        return

    # load file to label
    contents = ''
    with open(filePath, 'r') as fp:
        for line in fp.readlines():
            contents += line

    # tokenize input text
    tokens = tokenize(contents)

    # enumerate classes
    classFiles = next(os.walk('../result'))[2]
    if len(classFiles) == 0:
        print("No classes to label from")
        return
    classFiles.remove('.dummy')

    # get class names from filenames
    classNames = []
    for f in classFiles:
        classNames.append(f[:-4])

    # load classes
    classWords = []
    for f in classFiles:
        classContents = ''
        with open('../result/' + f, 'r') as fp:
            for line in fp.readlines():
                classContents += line
        thisClassWords = classContents.split('\n')
        classWords.append(thisClassWords)

    # enumerate colors in 1D line: 256^3
    colors = []
    floorColor = (7 * 16 + 7)**3
    colorDist = math.floor(
        (256**3 - floorColor) / (len(classNames) * colorContrastMultiplier))
    for i in range(0, len(classNames)):
        color = floorColor + i * colorDist
        hexString = hex(color)
        colors.append(hexString[2:])

    # try to apply each class
    for token in tokens:
        if token[1] == True:
            for i in range(0, len(classNames)):
                # iterate words to look for
                for w in classWords[i]:
                    if token[0] == w:
                        token[0] = '<span style="background-color: #{}">{}</span>'.format(
                            colors[i], token[0])

    # rebuild text with stylized tokens
    rebuilt = ''
    for token in tokens:
        rebuilt += token[0]

    # write rebuilt text to file
    with open(filePath[:-4] + '.md', 'w') as fp:
        # build legend
        fp.write('Legend:<br />')
        for i in range(0, len(classNames)):
            fp.write(
                '<span style="background-color: #{}">{}</span><br />'.format(
                    colors[i], classNames[i]))
        fp.write('<br />')

        # fix tabs and newlines
        rebuilt = rebuilt.replace('\n', '<br />')
        rebuilt = rebuilt.replace('\t', ' ' * 4)

        # write contents
        fp.write(rebuilt)
model_state = data["model_state"] model = NeuralNet(input_size, hidden_size, output_size).to(device) model.load_state_dict(model_state) # Sets the module in evaluation mode. model.eval() bot_name = "Bot" print("Can I help you? (type 'quit' to exit)") while True: # sentence = "do you use credit cards?" sentence = input("You: ") if sentence == "quit": break sentence = tokenize(sentence) X = bag_of_words(sentence, all_words) X = X.reshape(1, X.shape[0]) X = torch.from_numpy(X).to(device) output = model(X) _, predicted = torch.max(output, dim=1) tag = tags[predicted.item()] probs = torch.softmax(output, dim=1) prob = probs[0][predicted.item()] if prob.item() > 0.75: for intent in intents['intents']: if tag == intent["tag"]: