# In[41]:

# Re-import the project module `ds` so edits made on disk are picked up
# before its helpers are called.
reload(ds)
post, mbti_type, user = ds.splitPosts(df)

# Split data: 80% train, 20% test (random_state pins the shuffle seed)
post_train, post_test, label_train, label_test = train_test_split(
    post, mbti_type, test_size=0.2, random_state=88)

print("MBIT posts", post_train[:5])
print('')
print("MBTI Labels: ", label_train[:5])

# Build a vocabulary (V size is defaulted to full text) for train corpus
# NOTE(review): the generator below hands each *element* of post_train to
# canonicalize_word, whereas the id lookup further down feeds
# whitespace-split tokens that are never passed through canonicalize_word.
# Confirm that vocabulary entries and lookup tokens share the same
# granularity and normalisation.
vocab_mbti = vocabulary.Vocabulary(
    (utils.canonicalize_word(w) for w in post_train))
print("Vocab Size: ", vocab_mbti.size)

# Convert every post into a list of vocabulary ids, one id per
# whitespace-separated token.
# The loop variable is deliberately NOT called `post`: reusing that name
# would overwrite the corpus returned by ds.splitPosts above and leave a
# single string behind for any later cell that reads `post`.
x_train = [vocab_mbti.words_to_ids(text.split()) for text in post_train]
x_test = [vocab_mbti.words_to_ids(text.split()) for text in post_test]

reload(ds)
# NOTE(review): the other copy of this cell in this file binds the
# one_hot_label result to y_train/y_test and unpacks THREE values
# (ids plus a label map) from label_to_id. Confirm which variant matches the
# current `ds` module before relying on these names downstream.
y_train_id, y_test_id = ds.one_hot_label(mbti_type, label_train, label_test)
y_train, y_test = ds.label_to_id(mbti_type, label_train, label_test)

# In[39]:

# Re-import the project module `ds` so edits made on disk are picked up
# before its helpers are called.
reload(ds)
post, mbti_type, user = ds.splitPosts(df)

# Split data: 80% train, 20% test (random_state pins the shuffle seed)
post_train, post_test, label_train, label_test = train_test_split(
    post, mbti_type, test_size=0.2, random_state=88)

print("MBIT posts", post_train[:5])
print('')
print("MBTI Labels: ", label_train[:5])

# Build a vocabulary (V size is defaulted to full text) for train corpus
# NOTE(review): canonicalize_word is applied to each element of post_train
# here, but the tokens looked up below come from str.split() and are not
# canonicalized. Confirm both sides use the same token granularity.
vocab_mbti = vocabulary.Vocabulary(
    (utils.canonicalize_word(w) for w in post_train))
print("Vocab Size: ", vocab_mbti.size)

# Convert every post into a list of vocabulary ids, one id per
# whitespace-separated token.
# A dedicated loop name (`text`) keeps the module-level `post` corpus intact;
# iterating with `for post in ...` would replace it with the last test post.
x_train = [vocab_mbti.words_to_ids(text.split()) for text in post_train]
x_test = [vocab_mbti.words_to_ids(text.split()) for text in post_test]

reload(ds)
# One-hot encoded labels for the train/test splits.
y_train, y_test = ds.one_hot_label(mbti_type, label_train, label_test)
# Integer-id labels plus the mapping returned alongside them.
y_train_id, y_test_id, label_map = ds.label_to_id(
    mbti_type, label_train, label_test)
# Example no. 3
# 0
def full_vocab_canon(x):
    """Build a full-size vocabulary from the items of ``x``.

    Each item of ``x`` is passed through ``utils.canonicalize_word`` and the
    resulting stream is handed to ``vocabulary.Vocabulary`` (no size cap is
    given, so the vocabulary defaults to the full text). The resulting size
    is printed as a progress message.

    Args:
        x: iterable of items to canonicalize and index.

    Returns:
        A ``(size, vocab)`` tuple: the vocabulary's ``size`` attribute and
        the ``Vocabulary`` object itself.
    """
    # Lazy stream of canonicalized items; consumed by the Vocabulary constructor.
    canonical_tokens = (utils.canonicalize_word(token) for token in x)
    vocab = vocabulary.Vocabulary(canonical_tokens)

    vocab_size = vocab.size
    print("Full Vocab Built, size: ", vocab_size)
    return vocab_size, vocab