Example #1
def create_basic_dataset():
  # Load train & test data
  positive_set = uu.load_text_file(POSITIVE_PATH, as_words=True)
  negative_set = uu.load_text_file(NEGATIVE_PATH, as_words=True)

  X_train = positive_set[:int(len(positive_set) * train_ratio)] + \
            negative_set[:int(len(negative_set) * train_ratio)]
  X_test = positive_set[int(len(positive_set) * train_ratio):] + \
          negative_set[int(len(negative_set) * train_ratio):]
  Y_train = [[1]] * int(len(positive_set) * train_ratio) + \
            [[0]] * int(len(negative_set) * train_ratio)
  Y_test = [[1]] * (len(positive_set) - int(len(positive_set) * train_ratio)) + \
          [[0]] * (len(negative_set) - int(len(negative_set) * train_ratio))

  # print(f'{len(X_train)}, {len(X_test)}, {len(Y_train)}, {len(Y_test)}')
  # print(X_test[916])

  # Translate words into vectors
  embedding_model = Word2Vec.load(EMBEDDING_PATH).wv
  X_train_t = np.array([embedding_model[x] for x in X_train])
  X_test_t = np.array([embedding_model[x] for x in X_test])
  Y_train_t = np.array(Y_train)
  Y_test_t = np.array(Y_test)

  # print(f"Train set: X - {X_train_t.shape}, Y - {Y_train_t.shape}")
  # print(f"Test set: X - {X_test_t.shape}, Y - {Y_test_t.shape}")

  return X_train_t, X_test_t, Y_train_t, Y_test_t
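A minimal usage sketch for Example #1, assuming the module-level names the function relies on (POSITIVE_PATH, NEGATIVE_PATH, EMBEDDING_PATH, train_ratio) are defined; the values below are placeholders, not from the original:

# Hypothetical setup -- the names come from the snippet, the values are guesses.
POSITIVE_PATH = 'data/positive.txt'
NEGATIVE_PATH = 'data/negative.txt'
EMBEDDING_PATH = 'models/word2vec.model'
train_ratio = 0.8

X_train_t, X_test_t, Y_train_t, Y_test_t = create_basic_dataset()
print(f"Train set: X - {X_train_t.shape}, Y - {Y_train_t.shape}")
print(f"Test set: X - {X_test_t.shape}, Y - {Y_test_t.shape}")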
Example #2
def create_skip_grams(word_list_fn,
                      sentences_fn,
                      output_fn,
                      window_size=2,
                      log_per=100000):
    # Load sentences
    uu.print_dt('Load sentences...')
    sentences = uu.load_text_file(sentences_fn)

    # Load word list
    uu.print_dt('Load word list...')
    word_list = uu.load_text_file(word_list_fn)
    word_dict = {w: i for i, w in enumerate(word_list)}

    # Create skip_grams
    uu.print_dt('Create skip_grams...')
    skip_grams = []
    all_skip_grams = 0
    for idx, s in enumerate(sentences[:100000], 1):
        words_in_sentences = s.strip().split(' ')
        for w_idx, w in enumerate(words_in_sentences, 0):
            target = w
            # Target is out of vocabulary: still count the skipped pairs so
            # the usable-pair percentage stays accurate.
            if target not in word_dict:
                all_skip_grams += min(window_size, w_idx) + \
                    min(window_size, len(words_in_sentences) - w_idx - 1)
                continue

            # Create word set
            for c_idx in range(w_idx - window_size, w_idx + window_size + 1):
                if c_idx == w_idx:
                    continue

                if c_idx < 0 or c_idx >= len(words_in_sentences):
                    continue

                all_skip_grams += 1
                content = words_in_sentences[c_idx]
                if content in word_dict:
                    skip_grams.append([word_dict[target], word_dict[content]])
        if idx % log_per == 0:
            print_set = (idx, len(skip_grams), all_skip_grams,
                         len(skip_grams) / all_skip_grams * 100)
            uu.print_dt(
                "%7d sentences were parsed: %8d of %8d skip-grams can be used. (%.2f%%)"
                % print_set)

    print_set = (len(skip_grams), all_skip_grams,
                 len(skip_grams) / all_skip_grams * 100)
    uu.print_dt(
        "All sentences were parsed: %8d of %8d skip-grams can be used. (%2f%%)"
        % print_set)

    uu.print_dt("Save skip-grams...")
    with open(output_fn, 'w') as writefile:
        for s_g in skip_grams:
            writefile.write("%d %d" % (s_g[0], s_g[1]) + os.linesep)
Example #3
def generate_usable_vrm_set(content_fn, vrm_fn, output_fn, logger):
    contents = uu.load_text_file(content_fn)
    vrms = uu.load_text_file(vrm_fn)
    vrm_sets = []
    for c, vrm in zip(contents, vrms):
        if c != '':
            vrm_sets.append({'content': c, 'vrm': vrm})
    shuffle(vrm_sets)
    with open(output_fn, 'w') as writefile:
        json.dump(vrm_sets, writefile)
Example #4
def select_with_word_txt(sentences_fn,
                         output_fn,
                         target_word,
                         logger,
                         max_len=10000):
    logger.info('Start processing...')
    sentences = uu.load_text_file(sentences_fn)
    shuffle(sentences)

    positive_ss = []
    negative_ss = []
    for s in tqdm(sentences):
        if s == '':
            continue
        is_positive = False
        for w in s.split(' '):
            if w == target_word:
                if len(positive_ss) < max_len:
                    positive_ss.append(s)
                is_positive = True
                break
        if not is_positive and len(negative_ss) < max_len:
            negative_ss.append(s)
        if len(positive_ss) >= max_len and len(negative_ss) >= max_len:
            break

    logger.info("Write results...")
    filename, file_extension = os.path.splitext(output_fn)
    with open(f"{filename}_positive{file_extension}", 'w') as writefile:
        writefile.write(os.linesep.join(positive_ss))
    with open(f"{filename}_negative{file_extension}", 'w') as writefile:
        writefile.write(os.linesep.join(negative_ss))
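A hypothetical call for Example #4 (file names, target word, and logger setup are placeholders): with output_fn='selected.txt', matching sentences land in selected_positive.txt and the rest in selected_negative.txt, up to max_len lines each.

# Hypothetical usage -- paths, target word and logger are assumptions.
import logging

logging.basicConfig(level=logging.INFO)
select_with_word_txt('sentences.txt', 'selected.txt', 'because',
                     logging.getLogger('select'), max_len=5000)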
Example #5
def get_info_of_sentences(sentences_fn, sentences_num):
    if sentences_num < 1:
        print("ERROR: sentences_num MUST be more than 1.")
        return

    print("Start to read file...")
    sentences = uu.load_text_file(sentences_fn)[:sentences_num]
    single_word_sentence = 0
    total_words = []

    print("Get words from sentences...")
    for s in sentences:
        words = s.strip().split(' ')
        if len(words) == 1:
            single_word_sentence += 1
        total_words.extend(words)

    total_words_num = len(total_words)
    unique_words_num = len(set(total_words))

    logger = uu.get_custom_logger(
        'sentences_info',
        os.path.join(uu.get_base_path(), 'logs/sentences_info.log'))
    logger.info(f'{sentences_num} sentences INFO:')
    logger.info('Total words: %d | Unique words: %d (%.2f%% of total)' %
                (total_words_num, unique_words_num,
                 unique_words_num / total_words_num * 100))
    logger.info('Words per sentence: %.2f' %
                (total_words_num / sentences_num))
    logger.info(
        "Single-word-sentences: %d (%.2f%% of total)" %
        (single_word_sentence, single_word_sentence / sentences_num * 100))
    logger.info("=" * 50)
Example #6
def adjust_sentence_len(sentences_fn, output_fn, sentence_len=16):
    sentences = uu.load_text_file(sentences_fn)
    new_sentences = []
    for s in tqdm(sentences):
        words = s.split(' ')
        if len(words) > sentence_len:
            words = words[:sentence_len]
        elif len(words) < sentence_len:
            words = ['0'] * (sentence_len - len(words)) + words
        new_sentences.append(' '.join(words))
    with open(output_fn, 'w') as writefile:
        writefile.write(os.linesep.join(new_sentences))
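Example #6 truncates long sentences to sentence_len words and left-pads short ones with the token '0'; the same rule shown on a single in-memory sentence (the helper is illustrative, not from the original):

# Illustrative helper: the pad/truncate rule used by adjust_sentence_len.
def adjust_one(sentence, sentence_len=16):
    words = sentence.split(' ')
    if len(words) > sentence_len:
        words = words[:sentence_len]
    elif len(words) < sentence_len:
        words = ['0'] * (sentence_len - len(words)) + words
    return ' '.join(words)

print(adjust_one('hello world', sentence_len=4))  # '0 0 hello world'
print(adjust_one('a b c d e', sentence_len=4))    # 'a b c d'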
Example #7
def make_omitted_sentences(sentences_fn, output_fn, sentences_num, min_count):
    if sentences_num < 1:
        print("ERROR: sentences_num MUST be more than 1.")
        return

    print("Start to read file...")
    sentences = uu.load_text_file(sentences_fn)[:sentences_num]

    print("Get word_counts from sentences...")
    word_counts = {}
    for s in tqdm(sentences):
        words = s.strip().split(' ')
        for w in words:
            word_counts[w] = word_counts.get(w, 0) + 1

    print("Get frequent words list...")
    frequent_words = []
    for k in tqdm(word_counts.keys()):
        if word_counts[k] >= min_count:
            frequent_words.append(k)
    logger = uu.get_custom_logger(
        'info_omitted', os.path.join(uu.get_base_path(), 'logs/omit.log'))
    logger.info("Omitting ~%d Sentences with min_count %d" %
                (sentences_num, min_count))
    frequent_len = len(frequent_words)
    total_len = len(word_counts)
    logger.info("Survived Vocabs: %d of Total %d (%.2f%%)" %
                (frequent_len, total_len, frequent_len / total_len * 100))

    print("Write results...")
    total_words_len = 0
    omitted_words_len = 0
    with open(output_fn, 'w') as writefile:
        for s in tqdm(sentences):
            words = s.strip().split(' ')
            omitted_words = []
            for idx, w in enumerate(words):
                if w not in frequent_words:
                    words[idx] = '()'
                    omitted_words.append(w)
            omitted_words_len += len(omitted_words)
            total_words_len += len(words)
            writefile.write("%s [%s]" %
                            (' '.join(words), ', '.join(omitted_words)) +
                            os.linesep)
    frequent_words_len = total_words_len - omitted_words_len
    logger.info("Survived Words: %d of Total %d (%.2f%%)" %
                (frequent_words_len, total_words_len,
                 frequent_words_len / total_words_len * 100))
    logger.info("-" * 50)
Example #8
def vrm_script_to_json(input_fn, output_fn, logger):
    in_brackets_re = re.compile(r'\(.*?\)')

    sentences = uu.load_text_file(input_fn)
    dialogs = []
    speechs = []
    speakers = []
    speaker = 'A'

    for s in tqdm(sentences):
        s = s.strip()
        if s == '':
            # If blank line, push speechs into dialogs & reset variables
            if len(speechs) > 0:
                dialogs.append(speechs)
            speechs = []
            speakers = []
            speaker = 'A'
        else:
            # Remove words in parentheses
            s = in_brackets_re.sub(' ', s)

            # Remove colons between numbers (e.g. '3:45' -> '3 45')
            s = re.sub('([0-9]+[:][0-9]+)',
                       (lambda m: m.group(0).replace(':', ' ')), s)

            # Split content and VRM tag
            content, vrm = s[:-2].strip(), s[-2:]

            # If there is info for speaker, normalize it like 'A', 'B', ...
            if ':' in content:
                raw_speaker, content = [
                    part.strip() for part in content.split(':', 1)
                ]
                if raw_speaker not in speakers:
                    speakers.append(raw_speaker)
                speaker = chr(65 + speakers.index(raw_speaker))

            # Save speech into speechs
            speechs.append({
                'speaker': speaker,
                'utterance': content,
                'vrm': vrm
            })

    with open(output_fn, 'w') as writefile:
        json.dump(dialogs, writefile)
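The input layout vrm_script_to_json expects can be inferred from the parsing code: an optional 'speaker:' prefix, the utterance, and a two-character VRM tag at the end of each line, with blank lines separating dialogs. A sketch of the transformation (the sample lines are invented for illustration):

# Invented sample input:
#   Counselor: How are you feeling today QQ
#   Client: Better than last week DD
#
# Resulting JSON (speakers normalized to 'A', 'B', ...):
#   [[{"speaker": "A", "utterance": "How are you feeling today", "vrm": "QQ"},
#     {"speaker": "B", "utterance": "Better than last week", "vrm": "DD"}]]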
Example #9
def split_sentences_in_txt(input_fn, output_fn, log_fn):
    ELLIPSIS_RE = re.compile(r'\.\.+|…')
    IN_BRACKETS_RE = re.compile(
        r'\(.*?\)')  # Limitation: cannot handle nested brackets like '(a(b)c)'

    logger = uu.get_custom_logger('toolbox', log_fn)
    sentences = uu.load_text_file(input_fn)
    results = []
    logger.info('Split sentences...')
    for s in tqdm(sentences):
        s = s.strip()
        if s == '' or not s.startswith('[['):
            results.append('')
        else:
            if ' ' not in s:
                continue
            result = []
            speaker = s.split(' ')[0]
            replaced_s = IN_BRACKETS_RE.sub('', ' '.join(s.split(' ')[1:]))
            replaced_s = ELLIPSIS_RE.sub(' ', replaced_s)
            # Walk characters and split on sentence-final punctuation
            splited_s = ''
            for ch in replaced_s.strip():
                if ch == '.':
                    if len(splited_s) > 0:
                        result.append(speaker + ' ' + splited_s)
                        splited_s = ''
                elif ch == '!' or ch == '?':
                    result.append(speaker + ' ' + splited_s + ch)
                    splited_s = ''
                else:
                    splited_s += ch
            if len(splited_s) > 0:
                result.append(speaker + ' ' + splited_s)
            results.extend(result)

    logger.info('Save results...')
    with open(output_fn, 'w') as writefile:
        for r in tqdm(results):
            writefile.write(r + os.linesep)
    logger.info(
        f'Done - {len(sentences)} sentences => {len(results)} sentences')
Example #10
def remove_less_frequent_words(sentences_fn,
                               words_fn,
                               output_fn,
                               logger,
                               frequent_num=5):
    sentences = uu.load_text_file(sentences_fn)
    with open(words_fn, 'r') as readfile:
        words_dict = json.load(readfile)
    words = {w for w, count in words_dict.items() if count >= frequent_num}
    new_sentences = []
    for s in tqdm(sentences):
        new_words = []
        for w in s.split(' '):
            if w == '0' or w in words:
                new_words.append(w)
            else:
                # Keep sentence length: prepend a '0' pad for each dropped word
                new_words = ['0'] + new_words
        new_sentences.append(' '.join(new_words))

    with open(output_fn, 'w') as writefile:
        writefile.write(os.linesep.join(new_sentences))
Example #11
def tokenize_vrm_content(input_fn, output_fn, model_path, logger):
    token_re = re.compile(r"[a-zA-Z]+[']*[a-zA-Z]*|[0-9]")

    logger.info("Load word2vec model...")
    model = KeyedVectors.load_word2vec_format(model_path, binary="True")
    word_vectors = model.wv

    sentences = uu.load_text_file(input_fn)
    result = []
    for s in tqdm(sentences):
        tokens = token_re.findall(s)
        # Keep only tokens present in the embedding vocabulary;
        # removing from a list while iterating it would skip tokens.
        kept_tokens = []
        for t in tokens:
            try:
                word_vectors.get_vector(t)
                kept_tokens.append(t)
            except KeyError:
                logger.info(f'"{t}" is removed.')
        result.append(' '.join(kept_tokens))

    with open(output_fn, 'w') as writefile:
        writefile.write(os.linesep.join(result))
Example #12
def draw_word_frequency_plot(input_fn, logger):
    sentences = uu.load_text_file(input_fn, as_words=True)

    count = {}
    for s in sentences:
        length = len(s)
        if length == 0:
            continue
        if length in count:
            count[length] += 1
        else:
            count[length] = 1

    logger.info('Drawing plot...')
    count_list = sorted(count.items())
    x, y = zip(*count_list)
    # i_25, i_50, i_75 = get_three_points(y)
    [i_25, i_50, i_75, i_90, i_95,
     i_99] = _get_proportion_indexes(y, [.25, .50, .75, .90, .95, .99])

    plt.plot(x, y, alpha=0.5)
    plt.scatter(x, y, s=10)
    plt.title('#. words in tokenized VRM script sentences')
    plt.xlabel("#. of words")
    plt.ylabel("Counts")
    plt.annotate(f"25% Value: {x[i_25]}",
                 xy=(x[i_25], y[i_25]),
                 xytext=(40, 30),
                 textcoords='offset points',
                 arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"50% Value: {x[i_50]}",
                 xy=(x[i_50], y[i_50]),
                 xytext=(40, 10),
                 textcoords='offset points',
                 arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"75% Value: {x[i_75]}",
                 xy=(x[i_75], y[i_75]),
                 xytext=(40, 30),
                 textcoords='offset points',
                 arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"90% Value: {x[i_90]}",
                 xy=(x[i_90], y[i_90]),
                 xytext=(40, 50),
                 textcoords='offset points',
                 arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"95% Value: {x[i_95]}",
                 xy=(x[i_95], y[i_95]),
                 xytext=(40, 35),
                 textcoords='offset points',
                 arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"99% Value: {x[i_99]}",
                 xy=(x[i_99], y[i_99]),
                 xytext=(30, 20),
                 textcoords='offset points',
                 arrowprops=dict(arrowstyle="->"))
    plt.annotate(f"End Value: {x[-1]}",
                 xy=(x[-1], y[-1]),
                 xytext=(-60, 70),
                 textcoords='offset points',
                 arrowprops=dict(arrowstyle="->"))
    plt.show()