# In[41]:
# Prepare the MBTI data: split posts, build the training vocabulary,
# convert posts to id sequences, and encode the labels.
reload(ds)
post, mbti_type, user = ds.splitPosts(df)

# Split data: 80% train, 20% test (fixed seed so the split is reproducible)
post_train, post_test, label_train, label_test = train_test_split(
    post, mbti_type, test_size=0.2, random_state=88)

print("MBIT posts", post_train[:5])
print('')
print("MBTI Labels: ", label_train[:5])

# Build a vocabulary (V size is defaulted to full text) for train corpus.
# NOTE(review): this iterates over the items of post_train, so each whole
# post is handed to canonicalize_word as one token, while the id lookup
# below works on whitespace-split words -- confirm this is intended.
vocab_mbti = vocabulary.Vocabulary(
    (utils.canonicalize_word(w) for w in post_train))
print("Vocab Size: ", vocab_mbti.size)

# Tokenize train and test posts on whitespace and map the words to ids.
# The loop variable is deliberately not called `post`, so the collection
# returned by ds.splitPosts above is not overwritten by a single string.
# NOTE(review): the words are not canonicalized before the lookup.
x_train = [vocab_mbti.words_to_ids(text.split()) for text in post_train]
x_test = [vocab_mbti.words_to_ids(text.split()) for text in post_test]

# Reload ds again before encoding the labels.
reload(ds)
# NOTE(review): the one-hot output is bound to the *_id names and the
# label_to_id output to the plain names -- the naming looks swapped;
# confirm against the cells that consume these variables before renaming.
y_train_id, y_test_id = ds.one_hot_label(mbti_type, label_train, label_test)
y_train, y_test = ds.label_to_id(mbti_type, label_train, label_test)
# In[39]:
# Prepare the MBTI data: split posts, build the training vocabulary,
# convert posts to id sequences, and encode the labels.
reload(ds)
post, mbti_type, user = ds.splitPosts(df)

# Split data: 80% train, 20% test (fixed seed so the split is reproducible)
post_train, post_test, label_train, label_test = train_test_split(
    post, mbti_type, test_size=0.2, random_state=88)

print("MBIT posts", post_train[:5])
print('')
print("MBTI Labels: ", label_train[:5])

# Build a vocabulary (V size is defaulted to full text) for train corpus.
# NOTE(review): this iterates over the items of post_train, so each whole
# post is handed to canonicalize_word as one token, while the id lookup
# below works on whitespace-split words -- confirm this is intended.
vocab_mbti = vocabulary.Vocabulary(
    (utils.canonicalize_word(w) for w in post_train))
print("Vocab Size: ", vocab_mbti.size)

# Tokenize train and test posts on whitespace and map the words to ids.
# The loop variable is deliberately not called `post`, so the collection
# returned by ds.splitPosts above is not overwritten by a single string.
# NOTE(review): the words are not canonicalized before the lookup.
x_train = [vocab_mbti.words_to_ids(text.split()) for text in post_train]
x_test = [vocab_mbti.words_to_ids(text.split()) for text in post_test]

# Reload ds again before encoding the labels.
reload(ds)
# One-hot encoded labels.
y_train, y_test = ds.one_hot_label(mbti_type, label_train, label_test)
# Id-encoded labels plus the third value returned by label_to_id, kept as
# label_map.
y_train_id, y_test_id, label_map = ds.label_to_id(
    mbti_type, label_train, label_test)
def full_vocab_canon(x):
    """Build a vocabulary from the canonicalized items of ``x``.

    Each item yielded by ``x`` is passed through
    ``utils.canonicalize_word`` and the resulting stream is handed to
    ``vocabulary.Vocabulary`` without a size argument (per the original
    note, the vocabulary size then defaults to the full text). The size of
    the built vocabulary is printed.

    Args:
        x: Iterable of items to canonicalize (presumably individual word
            tokens of the training corpus -- TODO confirm with callers).

    Returns:
        A ``(size, vocab)`` tuple: the ``size`` attribute of the new
        vocabulary and the vocabulary object itself.
    """
    canonical_tokens = (utils.canonicalize_word(token) for token in x)
    vocab = vocabulary.Vocabulary(canonical_tokens)
    vocab_size = vocab.size
    print("Full Vocab Built, size: ", vocab_size)
    return vocab_size, vocab