def create_basic_dataset():
    # Load train & test data
    positive_set = uu.load_text_file(POSITIVE_PATH, as_words=True)
    negative_set = uu.load_text_file(NEGATIVE_PATH, as_words=True)
    pos_split = int(len(positive_set) * train_ratio)
    neg_split = int(len(negative_set) * train_ratio)
    X_train = positive_set[:pos_split] + negative_set[:neg_split]
    X_test = positive_set[pos_split:] + negative_set[neg_split:]
    Y_train = [[1]] * pos_split + [[0]] * neg_split
    Y_test = [[1]] * (len(positive_set) - pos_split) + \
        [[0]] * (len(negative_set) - neg_split)

    # Translate words into vectors
    embedding_model = Word2Vec.load(EMBEDDING_PATH).wv
    X_train_t = np.array([embedding_model[words] for words in X_train])
    X_test_t = np.array([embedding_model[words] for words in X_test])
    Y_train_t = np.array(Y_train)
    Y_test_t = np.array(Y_test)

    return X_train_t, X_test_t, Y_train_t, Y_test_t
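
# Usage sketch for create_basic_dataset (illustrative; assumes the module-level
# constants POSITIVE_PATH, NEGATIVE_PATH, EMBEDDING_PATH and train_ratio are defined,
# and that the input sentences were padded/truncated to a common length, e.g. with
# adjust_sentence_len below):
#
#     X_train, X_test, Y_train, Y_test = create_basic_dataset()
#     # X_* -> (num_sentences, sentence_len, embedding_dim)
#     # Y_* -> (num_sentences, 1), where 1 = positive, 0 = negative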
def create_skip_grams(word_list_fn, sentences_fn, output_fn, window_size=2, log_per=100000):
    # Load sentences
    uu.print_dt('Load sentences...')
    sentences = uu.load_text_file(sentences_fn)

    # Load word list
    uu.print_dt('Load word list...')
    word_list = uu.load_text_file(word_list_fn)
    word_dict = {w: i for i, w in enumerate(word_list)}

    # Create skip-grams
    uu.print_dt('Create skip_grams...')
    skip_grams = []
    all_skip_grams = 0
    for idx, s in enumerate(sentences[:100000], 1):
        words_in_sentence = s.strip().split(' ')
        for w_idx, target in enumerate(words_in_sentence):
            # Skip out-of-vocabulary targets, but still count their window pairs
            if target not in word_dict:
                all_skip_grams += min(window_size, w_idx) + \
                    min(window_size, len(words_in_sentence) - w_idx - 1)
                continue
            # Collect (target, context) pairs inside the window
            for c_idx in range(w_idx - window_size, w_idx + window_size + 1):
                if c_idx == w_idx or c_idx < 0 or c_idx >= len(words_in_sentence):
                    continue
                all_skip_grams += 1
                context = words_in_sentence[c_idx]
                if context in word_dict:
                    skip_grams.append([word_dict[target], word_dict[context]])
        if idx % log_per == 0:
            print_set = (idx, len(skip_grams), all_skip_grams,
                         len(skip_grams) / all_skip_grams * 100)
            uu.print_dt("%7d sentences were parsed: %8d of %8d skip-grams can be used. (%.2f%%)"
                        % print_set)

    print_set = (len(skip_grams), all_skip_grams,
                 len(skip_grams) / all_skip_grams * 100)
    uu.print_dt("All sentences were parsed: %8d of %8d skip-grams can be used. (%.2f%%)"
                % print_set)

    uu.print_dt("Save skip-grams...")
    with open(output_fn, 'w') as writefile:
        for target_id, context_id in skip_grams:
            writefile.write("%d %d" % (target_id, context_id) + os.linesep)
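
# Example for create_skip_grams (hypothetical paths): every line of the output file
# is "<target_index> <context_index>" into the word list. For the corpus line
# "i like green tea" with window_size=2, the target "like" yields the pairs
# (like, i), (like, green) and (like, tea), each written as two word-list indexes.
#
#     create_skip_grams('data/word_list.txt', 'data/sentences.txt',
#                       'data/skip_grams.txt', window_size=2)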
def generate_usable_vrm_set(content_fn, vrm_fn, output_fn, logger):
    contents = uu.load_text_file(content_fn)
    vrms = uu.load_text_file(vrm_fn)

    # Pair each non-empty content line with its VRM tag, then shuffle the pairs
    vrm_sets = []
    for c, vrm in zip(contents, vrms):
        if c != '':
            vrm_sets.append({'content': c, 'vrm': vrm})
    shuffle(vrm_sets)
    logger.info(f'{len(vrm_sets)} usable VRM pairs collected.')

    with open(output_fn, 'w') as writefile:
        json.dump(vrm_sets, writefile)
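
# Output sketch for generate_usable_vrm_set (hypothetical file names): output_fn
# holds a shuffled JSON list pairing the i-th content line with the i-th VRM tag,
# e.g. [{"content": "how are you", "vrm": "QQ"}, ...].
#
#     generate_usable_vrm_set('data/contents.txt', 'data/vrms.txt',
#                             'data/vrm_set.json', logger)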
def select_with_word_txt(sentences_fn, output_fn, target_word, logger, max_len=10000):
    logger.info('Start processing...')
    sentences = uu.load_text_file(sentences_fn)
    shuffle(sentences)

    positive_ss = []
    negative_ss = []
    for s in tqdm(sentences):
        if s == '':
            continue
        is_positive = False
        for w in s.split(' '):
            if w == target_word:
                if len(positive_ss) < max_len:
                    positive_ss.append(s)
                is_positive = True
                break
        if not is_positive and len(negative_ss) < max_len:
            negative_ss.append(s)
        if len(positive_ss) >= max_len and len(negative_ss) >= max_len:
            break

    logger.info("Write results...")
    filename, file_extension = os.path.splitext(output_fn)
    with open(f"{filename}_positive{file_extension}", 'w') as writefile:
        writefile.write(os.linesep.join(positive_ss))
    with open(f"{filename}_negative{file_extension}", 'w') as writefile:
        writefile.write(os.linesep.join(negative_ss))
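
# Usage sketch for select_with_word_txt (hypothetical names): with
# output_fn='data/love.txt' and target_word='love', sentences containing the exact
# token 'love' go to 'data/love_positive.txt' and the rest to 'data/love_negative.txt',
# each file capped at max_len lines.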
def get_info_of_sentences(sentences_fn, sentences_num):
    if sentences_num < 1:
        print("ERROR: sentences_num MUST be at least 1.")
        return

    print("Start to read file...")
    sentences = uu.load_text_file(sentences_fn)[:sentences_num]

    single_word_sentence = 0
    total_words = []
    print("Get words from sentences...")
    for s in sentences:
        words = s.strip().split(' ')
        if len(words) == 1:
            single_word_sentence += 1
        total_words.extend(words)

    total_words_num = len(total_words)
    unique_words_num = len(set(total_words))

    logger = uu.get_custom_logger(
        'sentences_info', os.path.join(uu.get_base_path(), 'logs/sentences_info.log'))
    logger.info(f'{sentences_num} sentences INFO:')
    logger.info('Total words: %d | Unique words: %d (%.2f%% of total)'
                % (total_words_num, unique_words_num,
                   unique_words_num / total_words_num * 100))
    logger.info('Words per sentence: %.2f' % (total_words_num / sentences_num))
    logger.info("Single-word sentences: %d (%.2f%% of total)"
                % (single_word_sentence, single_word_sentence / sentences_num * 100))
    logger.info("=" * 50)
def adjust_sentence_len(sentences_fn, output_fn, sentence_len=16):
    sentences = uu.load_text_file(sentences_fn)

    new_sentences = []
    for s in tqdm(sentences):
        words = s.split(' ')
        if len(words) > sentence_len:
            # Truncate long sentences
            words = words[:sentence_len]
        elif len(words) < sentence_len:
            # Left-pad short sentences with the '0' placeholder token
            words = ['0'] * (sentence_len - len(words)) + words
        new_sentences.append(' '.join(words))

    with open(output_fn, 'w') as writefile:
        writefile.write(os.linesep.join(new_sentences))
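
# Example for adjust_sentence_len with sentence_len=4 (illustrative only):
#   'a b c d e f' -> 'a b c d'      (truncated to sentence_len)
#   'a b'         -> '0 0 a b'      (left-padded with the '0' token)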
def make_omitted_sentences(sentences_fn, output_fn, sentences_num, min_count):
    if sentences_num < 1:
        print("ERROR: sentences_num MUST be at least 1.")
        return

    print("Start to read file...")
    sentences = uu.load_text_file(sentences_fn)[:sentences_num]

    print("Get word_counts from sentences...")
    word_counts = {}
    for s in tqdm(sentences):
        for w in s.strip().split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

    print("Get frequent words list...")
    # Use a set for fast membership checks below
    frequent_words = {w for w, c in word_counts.items() if c >= min_count}

    logger = uu.get_custom_logger(
        'info_omitted', os.path.join(uu.get_base_path(), 'logs/omit.log'))
    logger.info("Omitting ~%d Sentences with min_count %d" % (sentences_num, min_count))
    frequent_len = len(frequent_words)
    total_len = len(word_counts)
    logger.info("Survived Vocabs: %d of Total %d (%.2f%%)"
                % (frequent_len, total_len, frequent_len / total_len * 100))

    print("Write results...")
    total_words_len = 0
    omitted_words_len = 0
    with open(output_fn, 'w') as writefile:
        for s in tqdm(sentences):
            words = s.strip().split(' ')
            omitted_words = []
            for idx, w in enumerate(words):
                if w not in frequent_words:
                    words[idx] = '()'
                    omitted_words.append(w)
            omitted_words_len += len(omitted_words)
            total_words_len += len(words)
            writefile.write("%s [%s]" % (' '.join(words), ', '.join(omitted_words))
                            + os.linesep)

    frequent_words_len = total_words_len - omitted_words_len
    logger.info("Survived Words: %d of Total %d (%.2f%%)"
                % (frequent_words_len, total_words_len,
                   frequent_words_len / total_words_len * 100))
    logger.info("-" * 50)
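
# Output line sketch for make_omitted_sentences (illustrative): if min_count drops
# the word 'xylophone', the sentence 'i play xylophone' is written as
# 'i play () [xylophone]' -- rare words are replaced by '()' in the text and listed
# in the trailing brackets.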
def vrm_script_to_json(input_fn, output_fn, logger):
    in_brackets_re = re.compile(r'\(.*?\)')
    sentences = uu.load_text_file(input_fn)

    dialogs = []
    speechs = []
    speakers = []
    speaker = 'A'
    for s in tqdm(sentences):
        s = s.strip()
        if s == '':
            # On a blank line, push collected speechs into dialogs & reset variables
            if len(speechs) > 0:
                dialogs.append(speechs)
            speechs = []
            speakers = []
            speaker = 'A'
        else:
            # Remove words in parentheses
            s = in_brackets_re.sub(' ', s)
            # Replace colons between numbers (e.g. timestamps) with spaces
            s = re.sub('([0-9]+[:][0-9]+)',
                       lambda obj: obj.group(0).replace(':', ' '), s)
            # Split content and VRM tag (the last two characters)
            content, vrm = s[:-2].strip(), s[-2:]
            # If there is speaker info, normalize it like 'A', 'B', ...
            if ':' in content:
                raw_speaker, content = (content.split(':')[0].strip(),
                                        content.split(':')[1].strip())
                if raw_speaker not in speakers:
                    speakers.append(raw_speaker)
                speaker = chr(65 + speakers.index(raw_speaker))
            # Save speech into speechs
            speechs.append({
                'speaker': speaker,
                'utterance': content,
                'vrm': vrm
            })
    # Keep the last dialog if the file does not end with a blank line
    if len(speechs) > 0:
        dialogs.append(speechs)

    with open(output_fn, 'w') as writefile:
        json.dump(dialogs, writefile)
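
# Input/output sketch for vrm_script_to_json (hypothetical script line): a line such as
# 'Client: I feel much better today. DD' becomes
# {'speaker': 'A', 'utterance': 'I feel much better today.', 'vrm': 'DD'},
# where the last two characters are the VRM tag and raw speaker names are normalized
# to 'A', 'B', ... per dialog; blank lines separate dialogs.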
def split_sentences_in_txt(input_fn, output_fn, log_fn):
    ELLIPSIS_RE = re.compile(r'\.\.+|…')
    # Limitation on nested brackets like '(a(b)c)'
    IN_BRACKETS_RE = re.compile(r'\(.*?\)')

    logger = uu.get_custom_logger('toolbox', log_fn)
    sentences = uu.load_text_file(input_fn)
    results = []

    logger.info('Split sentences...')
    for s in tqdm(sentences):
        s = s.strip()
        if s == '' or not s.startswith('[['):
            results.append('')
        else:
            if ' ' not in s:
                continue
            result = []
            speaker = s.split(' ')[0]
            replaced_s = IN_BRACKETS_RE.sub('', ' '.join(s.split(' ')[1:]))
            replaced_s = ELLIPSIS_RE.sub(' ', replaced_s)
            # Walk the utterance character by character and cut on sentence-ending marks
            splited_s = ''
            for ch in replaced_s.strip():
                if ch == '.':
                    if len(splited_s) > 0:
                        result.append(speaker + ' ' + splited_s)
                    splited_s = ''
                elif ch == '!' or ch == '?':
                    result.append(speaker + ' ' + splited_s + ch)
                    splited_s = ''
                else:
                    splited_s += ch
            if len(splited_s) > 0:
                result.append(speaker + ' ' + splited_s)
            results.extend(result)

    logger.info('Save results...')
    with open(output_fn, 'w') as writefile:
        for r in tqdm(results):
            writefile.write(r + os.linesep)
    logger.info(f'Done - {len(sentences)} sentences => {len(results)} sentences')
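
# Splitting sketch for split_sentences_in_txt (illustrative): a line like
# '[[S01]] It was great! Really. (laughs)' has the parenthesized aside and ellipses
# stripped, then is cut at '.', '!' and '?' into lines such as
# '[[S01]] It was great!' and '[[S01]] Really'; lines that do not start with the
# '[[' speaker marker become empty placeholder lines.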
def remove_less_frequent_words(sentences_fn, words_fn, output_fn, logger, frequent_num=5):
    sentences = uu.load_text_file(sentences_fn)
    with open(words_fn, 'r') as readfile:
        words_dict = json.load(readfile)
    # Keep only words that appear at least `frequent_num` times
    words = set(filter(lambda k: words_dict[k] >= frequent_num, words_dict))

    new_sentences = []
    for s in tqdm(sentences):
        new_words = []
        for w in s.split(' '):
            if w == '0' or w in words:
                new_words.append(w)
            else:
                # Replace the dropped word with a leading '0' pad token
                # so the sentence keeps its fixed length
                new_words = ['0'] + new_words
        new_sentences.append(' '.join(new_words))

    with open(output_fn, 'w') as writefile:
        writefile.write(os.linesep.join(new_sentences))
def tokenize_vrm_content(input_fn, output_fn, model_path, logger):
    token_re = re.compile(r"[a-zA-Z]+[']*[a-zA-Z]*|[0-9]")

    logger.info("Load word2vec model...")
    word_vectors = KeyedVectors.load_word2vec_format(model_path, binary=True)

    sentences = uu.load_text_file(input_fn)
    result = []
    for s in tqdm(sentences):
        tokens = token_re.findall(s)
        kept_tokens = []
        for t in tokens:
            # Drop tokens that are not in the embedding vocabulary
            try:
                word_vectors.get_vector(t)
                kept_tokens.append(t)
            except KeyError:
                logger.info(f'"{t}" is removed.')
        result.append(' '.join(kept_tokens))

    with open(output_fn, 'w') as writefile:
        writefile.write(os.linesep.join(result))
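
# Tokenization sketch for tokenize_vrm_content (illustrative): a line like
# "Well, I don't know." yields the tokens ['Well', 'I', "don't", 'know'];
# punctuation is discarded by the regex, and any token missing from the word2vec
# vocabulary is dropped and logged before the remaining tokens are re-joined.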
def draw_word_frequency_plot(input_fn, logger):
    sentences = uu.load_text_file(input_fn, as_words=True)

    # Count how many sentences have each word length
    count = {}
    for s in sentences:
        length = len(s)
        if length == 0:
            continue
        count[length] = count.get(length, 0) + 1

    logger.info('Drawing plot...')
    count_list = sorted(count.items())
    x, y = zip(*count_list)
    [i_25, i_50, i_75, i_90, i_95, i_99] = _get_proportion_indexes(
        y, [.25, .50, .75, .90, .95, .99])

    plt.plot(x, y, alpha=0.5)
    plt.scatter(x, y, s=10)
    plt.title('#. words in tokenized VRM script sentences')
    plt.xlabel("#. of words")
    plt.ylabel("Counts")
    plt.annotate(f"25% Value: {x[i_25]}", xy=(x[i_25], y[i_25]), xytext=(40, 30),
                 textcoords='offset points', arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"50% Value: {x[i_50]}", xy=(x[i_50], y[i_50]), xytext=(40, 10),
                 textcoords='offset points', arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"75% Value: {x[i_75]}", xy=(x[i_75], y[i_75]), xytext=(40, 30),
                 textcoords='offset points', arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"90% Value: {x[i_90]}", xy=(x[i_90], y[i_90]), xytext=(40, 50),
                 textcoords='offset points', arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"95% Value: {x[i_95]}", xy=(x[i_95], y[i_95]), xytext=(40, 35),
                 textcoords='offset points', arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"99% Value: {x[i_99]}", xy=(x[i_99], y[i_99]), xytext=(30, 20),
                 textcoords='offset points', arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"End Value: {x[-1]}", xy=(x[-1], y[-1]), xytext=(-60, 70),
                 textcoords='offset points', arrowprops=dict(arrowstyle="->"))
    plt.show()