# # Vectorize cBoW
# <div style="position: absolute; right:0;top:0"><a href="./vectorizer_index.doc.ipynb" style="text-decoration: none"> <font size="5">←</font></a>
# <a href="../evaluation.ipynb" style="text-decoration: none"> <font size="5">↑</font></a></div>
#
# `Description`
#
# ---
# ## Setup and Settings
# ---

# In[ ]:

# Notebook-export setup cell: seed the module namespace with shared state.
# NOTE(review): `init_vars` presumably injects defaults ('info', 'runvars',
# 'num_docs') into vars() when not already set — confirm against __init__.py.
from __init__ import init_vars
init_vars(vars(), ('info', {}), ('runvars', {}), ('num_docs', 400))

import numpy as np
from scipy import sparse

# Project-local infrastructure (data access, config, logging, progress UI).
import data
import config
from base import nbprint
from util import ProgressIterator
from widgetbase import nbbox

# Embedding model loader and its out-of-vocabulary exception type.
from embedding.main import get_model
from embedding.common import OOVException

# Widgets/plots specific to the cBoW vectorizer notebook.
from vectorizer.widgets import cbow_vector_picker
from vectorizer.plots import plot_matrix
# # Classification
# <div style="position: absolute; right:0;top:0"><a href="./metrics.ipynb" style="text-decoration: none"> <font size="5">←</font></a>
# <a href="../evaluation.py.ipynb" style="text-decoration: none"> <font size="5">↑</font></a></div>
#
# `Description`
#
# ---
# ## Setup
# ---

# In[10]:

# Notebook-export setup cell: seed the module namespace with shared state.
from __init__ import init_vars
init_vars(vars(), ('info', {}))

import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Project-local infrastructure (data access, config, logging, progress UI).
import data
import config
from base import nbprint
from util import ProgressIterator
from widgetbase import nbbox

from metrics.widgets import h_mat_picker
from metrics.helper import load_ground_truth_classes

# NOTE(review): RUN_SCRIPT is not defined in this cell — presumably injected
# into vars() by init_vars; verify against __init__.py. When running as a
# notebook, show the H-matrix picker widget for `info`.
if RUN_SCRIPT:
    h_mat_picker(info)
# coding: utf-8

# # Vocab Builder
# <div style="position: absolute; right:0;top:0"><a href="./vocab.ipynb" style="text-decoration: none"> <font size="5">←</font></a>
# <a href="../evaluation.py.ipynb" style="text-decoration: none"> <font size="5">↑</font></a></div>
#
# This module provides the `count_tokens()` and the `filter_tokens()` functions.
#
# ---
# ## Setup and Settings
# ---

# In[1]:

# Notebook-export setup cell: seed the module namespace with shared state.
from __init__ import init_vars
init_vars(vars(), ('info', {}), ('runvars', {}))

import random
from operator import attrgetter

from nltk.corpus import stopwords
import nltk
# Ensure the NLTK stopwords corpus is present; download it on first run.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Project-local infrastructure (data access, config, logging, progress UI).
import data
import config
from base import nbprint
from util import ProgressIterator
from widgetbase import nbbox
# # PT Tokenizer
# <div style="position: absolute; right:0;top:0"><a href="./tokenizer.ipynb" style="text-decoration: none"> <font size="5">←</font></a>
# <a href="../evaluation.py.ipynb" style="text-decoration: none"> <font size="5">↑</font></a></div>
#
# This is a wrapper around the Penn Treebank tokenizer provided by the NLTK.
# For more information see https://www.nltk.org/api/nltk.tokenize.html
#
# ---
# ## Setup and Settings
# ---

# In[5]:

# Notebook-export setup cell: seed the module namespace with shared state.
from __init__ import init_vars
init_vars(vars())

import nltk
# Ensure the NLTK punkt sentence/word tokenizer models are present;
# download them on first run.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Project-local tokenizer base class and shared constants.
import tokenizer.common
from tokenizer.token_util import TokenizerBase

# ---
# ## Build PTTokenizer class
# ---
# coding: utf-8

# # Vectorize Phrase
# <div style="position: absolute; right:0;top:0"><a href="./vectorizer_index.doc.ipynb" style="text-decoration: none"> <font size="5">←</font></a>
# <a href="../evaluation.ipynb" style="text-decoration: none"> <font size="5">↑</font></a></div>
#
# `Description`
#
# ---
# ## Setup and Settings
# ---

# In[ ]:

# Notebook-export setup cell: seed the module namespace with shared state.
# NOTE(review): 'embedding_dim' default of 200 presumably matches the
# embedding model loaded via get_model — confirm in embedding.main.
from __init__ import init_vars
init_vars(vars(), ('info', {}), ('runvars', {}), ('num_docs', 400), ('embedding_dim', 200))

import numpy as np
from scipy import sparse

# Project-local infrastructure (data access, config, logging, progress UI).
import data
import config
from base import nbprint
from util import ProgressIterator
from widgetbase import nbbox

from embedding.main import get_model

# Widgets/plots specific to the phrase vectorizer notebook.
from vectorizer.widgets import phrase_vector_picker
from vectorizer.plots import plot_matrix