def format(s, split_phrases=False, add_sol_eol=False, add_eol_only=False, only_one_phrase=False):
    """Normalize *s* with tokenize_weak and optionally restructure it.

    With split_phrases: break the text on sentence punctuation and wrap each
    phrase with the hparams sol/eol markers (eol only, when add_eol_only).
    With add_sol_eol: wrap the whole string instead of per-phrase.
    Returns the rebuilt string (or a single phrase when only_one_phrase).
    """
    text = tokenize_weak.format(s)
    if split_phrases:
        # Turn '?' and '!' into phrase boundaries, drop commas.
        text = text.replace(',', ' ').replace('?', ' ? . ').replace('!', ' . ')
        phrases = []
        for raw in text.split('.'):
            # Collapse internal whitespace runs down to single spaces.
            tokens = [tok.strip() for tok in raw.split(' ')]
            phrase = ' '.join(tok for tok in tokens if tok).strip()
            if len(phrase) <= 1 or phrase.isspace():
                continue
            if not add_eol_only:
                phrases.append(hparams['sol'] + ' ' + phrase + ' ' + hparams['eol'] + ' . ')
            elif phrase.split(' ')[-1] != hparams['eol']:
                phrases.append(phrase + ' ' + hparams['eol'] + ' . ')
        if only_one_phrase and len(phrases) > 1:
            # NOTE(review): the original comment said "return just first
            # phrase" but the code returns the LAST one; behavior preserved.
            return phrases[-1]
        return ' '.join(phrases)
    if add_sol_eol:
        if not add_eol_only:
            text = hparams['sol'] + ' ' + text
        text = text + ' ' + hparams['eol']
    return text
def make_vocab():
    """Populate the module-level vocab list `v` from the files in `train_file`.

    Reads every file, weak-tokenizes each line, counts word frequencies and
    appends up to `vocab_length` of the most common (lowercased) words to
    the global `v`, which is finally sorted in place.
    """
    words = []
    for fname in train_file:
        with open(fname, 'r') as fh:
            body = fh.read()
        for raw_line in body.split('\n'):
            cleaned = tokenize_weak.format(raw_line)
            words.extend(cleaned.lower().split())
    print('values read')
    counts = Counter(words)
    total = len(words)
    print(total, 'length of raw vocab data')
    # NOTE(review): this clamp is never read afterwards (kept for parity).
    if total > vocab_length:
        total = vocab_length
    pairs = counts.most_common()
    print(len(pairs), 'length of result list')
    # Ascending sort then reverse() -- kept exactly, since sorted(...,
    # reverse=True) would order equal-count ties differently.
    ranked = sorted(pairs, key=itemgetter(1))
    ranked.reverse()
    added = 0
    for token, _count in ranked:
        lowered = token.lower()
        if lowered not in v and added < vocab_length:
            v.append(lowered)
            added += 1
    v.sort()
def format(s):
    """Weak-tokenize *s* and wrap it with the hparams sol/eol markers.

    Empty (or None) tokenizer output is replaced with the fallback
    ' what ? ' so downstream code always receives a non-empty sentence.
    """
    z = tokenize_weak.format(s)
    # PEP 8: compare to None with `is`, not `==` (was `z == None`).
    if z is None or z.strip() == '':
        z = ' what ? '
    z = hparams['sol'] + ' ' + z
    z = z + ' ' + hparams['eol']
    return z
def task_interactive(self):
    """Console REPL: tokenize each input line, evaluate it, print the reply."""
    print('-------------------')
    while True:
        user_line = tokenize_weak.format(input("> "))
        print(user_line)
        reply, _ = self.evaluate(None, None, user_line)
        print(reply)
def task_interactive(self):
    """Console REPL for word prediction; reloads the embedding model first."""
    self.model, _, _ = self.embedding_model(
        self.model, self.model_encoder, self.model_inference, global_check=True)
    print('-------------------')
    while True:
        user_line = tokenize_weak.format(input("> "))
        print(user_line)
        self.predict_words(user_line, stop_at_eol=True)
def tokenize(sentence):
    """Weak-tokenize *sentence* and normalize all whitespace to single spaces.

    str.split() with no argument splits on any whitespace run and never
    yields empty strings, so the original per-token checks for '\n', '\r',
    '\n\r', '\r\n' and zero length were unreachable and have been removed.
    """
    sentence = tokenize_weak.format(sentence)
    return ' '.join(sentence.split())
def task_interactive(self):
    """Console REPL that drops out-of-vocabulary tokens before evaluating."""
    print('-------------------')
    while True:
        line = tokenize_weak.format(input("> "))
        # Keep only tokens the model's vocabulary knows about.
        known = [tok for tok in line.split() if tok in self.vocab_lang.word2index]
        if known:
            line = ' '.join(known)
        print(line)
        out, _ = self.evaluate(None, None, line)
        print(out)
def process_questions(questions, include_blacklisted=True, do_tokenize=True):
    """Run inference for one question or a list of questions.

    Returns one entry per question: None for empty input, otherwise a dict
    with the candidate answers, their scores, and the best index/score.
    """
    # Accept a single question as well as a list.
    if not isinstance(questions, list):
        questions = [questions]

    # Clean (and optionally tokenize) every question; empty ones get a
    # sentinel so their slot in the output can be recognized later.
    prepared_questions = []
    for question in questions:
        question = question.strip()
        if not question:
            prepared_questions.append('##emptyquestion##')
        elif do_tokenize:
            prepared_questions.append(tokenize_weak.format(question))
        else:
            prepared_questions.append(question)

    answers_list = inference_helper(prepared_questions)

    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        answers = detokenize(answers)
        answers = replace_in_answers(answers, 'answers')
        answers_score = score_answers(answers, 'answers')
        best_index, best_score = get_best_score(answers_score, include_blacklisted)
        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        else:
            prepared_answers_list.append({
                'answers': answers,
                'scores': answers_score,
                'best_index': best_index,
                'best_score': best_score,
            })
    return prepared_answers_list
def make_vocab():
    """Populate the global vocab list `v` from `train_file`.

    Counts word frequencies over the whole corpus and keeps up to
    `vocab_length` of the most common words (lowercased), then sorts `v`.
    """
    wordlist = []
    with open(train_file, 'r') as x:
        xx = x.read()
    for line in xx.split('\n'):
        line = tokenize_weak.format(line)
        for word in line.lower().split():
            wordlist.append(word)
    print('values read')
    # BUG FIX: the original counted `set(wordlist)`, which makes every
    # count 1 and turns most_common() into an arbitrary selection; count
    # the full word list so the cutoff keeps the genuinely frequent words
    # (matching the sibling make_vocab implementation).
    c = Counter(wordlist)
    l = len(set(wordlist))
    print(l, 'length of raw vocab data')
    if l > vocab_length:
        l = vocab_length
    cc = c.most_common(l)
    print(len(cc), 'length of result list')
    for z in sorted(cc):
        if z[0].lower() not in v:
            v.append(z[0].lower())
    v.sort()
def make_vocab(train_file, order=False, read_glove=False, contractions=False, no_limit=False):
    """Build and return the global vocab list `v` from the given files.

    train_file: list of paths; .csv files are column-parsed, other files are
    weak-tokenized line by line. Optionally also pulls tokens from a GloVe
    file (read_glove), ranks by frequency (order), and lifts the size cap
    (no_limit, which also rewrites hparams['num_vocab_total']).
    Mutates module-level `v` and `v_end`; returns the final list.
    """
    global v, v_end
    wordlist = []
    vocab_length = hparams['num_vocab_total']
    if contractions:
        whitelist.extend(directions)
    # Whitelisted tokens are always candidates, ahead of corpus words.
    wordlist.extend(whitelist)
    #wordlist.extend(directions)
    print('add whitelist.')
    # Pass 1: csv inputs -- take the interior columns of each row.
    for filename in train_file:
        if os.path.isfile(filename) and filename.endswith('.csv'):
            print('csv file:', filename)
            with open(filename, 'rb') as x:
                text = x.readlines()
            for xx in text:  #[:csv_cutoff]:
                line = xx.strip().decode('utf-8', errors='ignore')
                y = line.split(',')[1:-1]  # magic numbers -- which columns to use.
                y[0] = y[0].lower()
                #print(y)
                for word in y:
                    # NOTE(review): `or True` disables the dedup check, so
                    # every word is appended -- presumably intentional since
                    # Counter handles duplicates; confirm.
                    if word not in wordlist or True:
                        wordlist.append(word)
            pass
    # Pass 2: plain-text inputs -- weak-tokenize and lowercase every line.
    for filename in train_file:
        if os.path.isfile(filename) and not filename.endswith('.csv'):
            print('found:', filename)
            with open(filename, 'r') as x:
                xx = x.read()
            for line in xx.split('\n'):
                line = tokenize_weak.format(line)
                y = line.lower().split()
                for word in y:
                    wordlist.append(word)
            pass
    print('values read from text file.', ' '.join(train_file))
    # Optional pass 3: first token of each line of the GloVe file at FROM.
    if read_glove:
        with open(FROM, 'r') as x:
            xx = x.read()
        for line in xx.split('\n'):
            l = line.split(' ')
            #print( len(l))
            if len(l) > 2:
                wordlist.append(l[0].strip())
        pass
        print('values read from glove file.')
    #wordset = set(wordlist)
    c = Counter(wordlist)
    l = len(wordlist)
    print(l, 'length of raw vocab data')
    if l > vocab_length and not no_limit:
        l = vocab_length
    if no_limit:
        # No cap: take everything and record the new size in hparams.
        vocab_length = l
        hparams['num_vocab_total'] = vocab_length
    cc = c.most_common()[:l]
    print(len(cc), 'length of result list')
    #v = []
    num = 0
    if order:
        # Most frequent first (ascending sort, then reversed).
        ss = sorted(cc, key=itemgetter(1))
        #print(ss[0:10])
        ss.reverse()
    else:
        ss = cc
    #print(ss[0:10])
    #vocab_length -= m
    print(vocab_length, 'vl')
    for z in ss:  # sorted(cc, key= lambda word: word[1]):
        # Admit a word if there is room, or unconditionally if whitelisted
        # (and not already reserved in v_end).
        if (z[0].lower() not in v and num < vocab_length) or (z[0].lower() in whitelist and z[0].lower() not in v_end):
            v.append(z[0].lower())
            num += 1
    # Reserve the v_end tokens at the tail: drop ones already present,
    # then trim v to make room for the remainder.
    if len(v_end) > 0:
        v_temp = []
        for z in v_end:
            if z not in v:
                v_temp.append(z)
        v_end = v_temp
        v_temp_num = len(v_end)
        # NOTE(review): if every v_end token was already in v, v_temp_num
        # is 0 and v[:-0] empties v entirely -- verify this path.
        v = v[:-v_temp_num]
    if order:
        v.sort()
    if len(v_end) > 0:
        v.extend(v_end)
    # Rebuild with the special tokens forced to the front, capped at
    # vocab_length, with v_end re-appended at the very end.
    vv = [hparams['unk'], hparams['sol'], hparams['eol'], hparams['eow']]
    for z in v:
        if len(vv) < vocab_length and z not in vv:
            vv.append(z)
    if len(v_end) > 0:
        vv.extend(v_end)
    v = vv
    print('len', len(v))
    return v
def format(s):
    """Weak-tokenize *s*, substituting ' what ? ' for empty output.

    Robustness fix: also guard against the tokenizer returning None, which
    previously raised AttributeError on `.strip()` (the sibling format()
    in this codebase already guards this case).
    """
    z = tokenize_weak.format(s)
    if z is None or z.strip() == '':
        z = ' what ? '
    return z
if arg_stagger: print('stagger output.') for line in z: ## set autoencode here. auto_flag = False if args['autoencode'] is not None and random.uniform(0, 1) < arg_autoencode: auto_flag = True else: auto_flag = False save = '' if num >= arg_start and (arg_length == 0 or num < arg_start + arg_length): line = line.split('\t') line[0] = format(line[0]) line[1] = format(line[1]) line[0], line[1] = move_order(line[0], line[1]) if arg_eol and len(line[0]) > 1: line[0] += ' ' + hparams['eol'] if arg_eol and len(line[1]) > 1: line[1] += ' ' + hparams['eol'] if not arg_stagger and arg_classifier != "MRPC" and arg_classifier != "MNLI" and not arg_gpt2: src.write(line[0].lower()) save = line[0][:] if not line[0].endswith('\n'):
print(answers['answers'][answers['best_index']]) sys.exit() # Interactive mode print("\n\nStarting interactive mode (first response will take a while):") colorama.init() hparams['num_translations_per_input'] = 10 hparams['override_loaded_hparams'] = True print(hparams) # QAs while True: question = input(colorama.Fore.WHITE + "\n> ") question = tokenize_weak.format(question) answers = process_questions(question) print(answers) answers = answers[0] #answers = process_questions(question)[0] chosen = '' if answers is None: print(colorama.Fore.RED + "! Question can't be empty") else: for i, _ in enumerate(answers['scores']): if chosen == '': if answers['scores'][i] == 1 and question.strip().lower( ) != answers['answers'][i].strip().lower(): chosen = answers['answers'][i] print("{}- {}{}".format( colorama.Fore.GREEN