from collections import Counter


def bp_encode(text_in, bp_vocab, subword2idx):
    # Encode raw text into subword tokens using a learned BPE vocabulary.
    sw_token_list = []
    for tmp_token in word_tokenizer(text_in.lower().strip()):
        sw_token_list.extend(subword_tokenize(
            tmp_token, bp_vocab, subword2idx))
    return sw_token_list
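
# The subword_tokenize helper used above is defined elsewhere in the original
# module. As a rough illustration only (not the author's implementation), a
# greedy longest-match segmenter over a known subword vocabulary could look
# like this:
def greedy_subword_tokenize(word, subword2idx, unk_token="<unk>"):
    # Wrap the word in boundary markers, then repeatedly take the longest
    # prefix that exists in the subword vocabulary.
    token = "<" + word + ">"
    pieces, start = [], 0
    while start < len(token):
        end = len(token)
        while end > start and token[start:end] not in subword2idx:
            end -= 1
        if end == start:
            pieces.append(unk_token)  # no known subword covers this character
            start += 1
        else:
            pieces.append(token[start:end])
            start = end
    return pieces
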
def learn_word_vocab(corpus):
    w_counter = Counter()
    
    for tmp_text in corpus:
        tmp_tokens = word_tokenizer(tmp_text.strip().lower())
        w_counter.update(tmp_tokens)
    
    word_counts = []
    for word, count in w_counter.items():
        tmp_word = "<" + word + ">"
        tmp_word = "".join([x+" " for x in tmp_word]).strip()
        word_counts.append((tmp_word, count))
    return dict(word_counts)
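
# learn_word_vocab returns entries such as "< h e l l o >": count, i.e. each
# word is boundary-marked and split into space-separated symbols. A minimal
# sketch of the standard BPE learning step that consumes this format is shown
# below; the helper names get_pair_stats / merge_pair are illustrative and not
# taken from the original code.
import re
import collections

def get_pair_stats(word_counts):
    # Count adjacent symbol pairs across all words, weighted by word frequency.
    pairs = collections.defaultdict(int)
    for word, freq in word_counts.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge_pair(pair, word_counts):
    # Replace every free-standing occurrence of the pair with its concatenation.
    pattern = re.compile(r"(?<!\S)" + re.escape(" ".join(pair)) + r"(?!\S)")
    return {pattern.sub("".join(pair), word): freq
            for word, freq in word_counts.items()}

# Illustrative usage: repeatedly merge the most frequent pair.
# word_counts = learn_word_vocab(corpus)
# for _ in range(num_merges):          # num_merges is a hyperparameter
#     pair_stats = get_pair_stats(word_counts)
#     best_pair = max(pair_stats, key=pair_stats.get)
#     word_counts = merge_pair(best_pair, word_counts)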
Example #3
        else:
            tmp_class = "good_joke"
        
        tmp_jokes_tuple.append((tmp_class, tmp_joke))
del tmp_row, tmp_joke
print("Total of", str(len(tmp_jokes_tuple)), "jokes loaded.")

# Process the data. #
tmp_jokes_filtered = []

w_counter = Counter()
for tmp_class, tmp_joke in tmp_jokes_tuple:
    tmp_joke = \
        tmp_joke.replace("\n", " \n ").replace("\'", " ")
    tmp_tokens = [
        x for x in word_tokenizer(tmp_joke.lower()) if x != ""]
    
    if len(tmp_tokens) <= max_len:
        w_counter.update(tmp_tokens)
        tmp_jokes_filtered.append((tmp_class, tmp_joke))
    del tmp_tokens

print("Total of", str(len(tmp_jokes_filtered)), "jokes filtered.")
del tmp_jokes_tuple

word_counts = []
for word, count in w_counter.items():
    tmp_word = "<" + word + ">"
    tmp_word = "".join([x+" " for x in tmp_word]).strip()
    word_counts.append((tmp_word, count))
word_counts = dict(word_counts)
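
# A plausible next step (assumed here, not shown in the original snippet):
# once a BPE vocabulary has been learned from word_counts, each filtered joke
# can be encoded with bp_encode and padded to max_len so the examples can be
# batched. PAD_ID and the bp_vocab / subword2idx objects are assumptions.
def pad_to_length(ids, seq_len, pad_id):
    # Truncate or right-pad a list of token ids to exactly seq_len entries.
    return ids[:seq_len] + [pad_id] * max(0, seq_len - len(ids))

# jokes_encoded = [
#     (tmp_class, pad_to_length(
#         bp_encode(tmp_joke, bp_vocab, subword2idx), max_len, PAD_ID))
#     for tmp_class, tmp_joke in tmp_jokes_filtered]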
Example #4
        for x in tmp_split.split(",")
    ]
    convs.append(tmp_ids)

q_len = 10
a_len = 10

w_counter = Counter()
tmp_corpus = []
tmp_data_tuple = []
for conv in convs:
    for i in range(len(conv) - 1):
        tmp_qns = id2line[conv[i]].lower().replace("\\u", " ").replace(
            "\\i", " ").replace("\n", " ").replace("\t", " ")
        #tmp_qns = re.sub(r"[^\w\s]", " ", tmp_qns)
        tmp_qns = [x for x in word_tokenizer(tmp_qns) if x != ""]

        tmp_ans = id2line[conv[i + 1]].lower().replace("\\u", " ").replace(
            "\\i", " ").replace("\n", " ").replace("\t", " ")
        #tmp_ans = re.sub(r"[^\w\s]", " ", tmp_ans)
        tmp_ans = [x for x in word_tokenizer(tmp_ans) if x != ""]

        if len(tmp_qns) == 0 or len(tmp_ans) == 0:
            continue
        elif len(tmp_qns) <= q_len and len(tmp_ans) <= a_len:
            w_counter.update(tmp_qns)
            w_counter.update(tmp_ans)
            tmp_data_tuple.append((" ".join(tmp_qns), " ".join(tmp_ans)))

elapsed_tm = (time.time() - start_tm) / 60
print("Elapsed Time:", str(elapsed_tm), "mins.")