# --- Example 1 ---
# # Vectorize cBoW
# <div style="position: absolute; right:0;top:0"><a href="./vectorizer_index.doc.ipynb" style="text-decoration: none"> <font size="5">←</font></a>
# <a href="../evaluation.ipynb" style="text-decoration: none"> <font size="5">↑</font></a></div>
# 
# `Description`
# 
# ---
# ## Setup and Settings
# ---

# In[ ]:


from __init__ import init_vars
# Initialize/restore notebook-level variables with defaults; presumably
# init_vars injects each (name, default) pair into vars() if missing — TODO confirm.
init_vars(vars(), ('info', {}), ('runvars', {}), ('num_docs', 400))

import numpy as np
from scipy import sparse

import data
import config
from base import nbprint
from util import ProgressIterator
from widgetbase import nbbox

from embedding.main import get_model
from embedding.common import OOVException

from vectorizer.widgets import cbow_vector_picker
from vectorizer.plots import plot_matrix
# --- Example 2 ---
# # Classification
# <div style="position: absolute; right:0;top:0"><a href="./metrics.ipynb" style="text-decoration: none"> <font size="5">←</font></a>
# <a href="../evaluation.py.ipynb" style="text-decoration: none"> <font size="5">↑</font></a></div>
#
# `Description`
#
# ---
# ## Setup
# ---

# In[10]:

from __init__ import init_vars

# Initialize/restore the notebook-level 'info' dict; presumably init_vars
# injects the (name, default) pair into vars() if missing — TODO confirm.
init_vars(vars(), ('info', {}))

import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import data
import config
from base import nbprint
from util import ProgressIterator
from widgetbase import nbbox

from metrics.widgets import h_mat_picker
from metrics.helper import load_ground_truth_classes

if RUN_SCRIPT: h_mat_picker(info)
# coding: utf-8

# # Vocab Builder
# <div style="position: absolute; right:0;top:0"><a href="./vocab.ipynb" style="text-decoration: none"> <font size="5">←</font></a>
# <a href="../evaluation.py.ipynb" style="text-decoration: none"> <font size="5">↑</font></a></div>
#
# This module provides the `count_tokens()` and the `filter_tokens()` functions.
#
# ---
# ## Setup and Settings
# ---

# In[1]:

from __init__ import init_vars
# Initialize/restore the notebook-level 'info' and 'runvars' dicts; presumably
# init_vars injects each (name, default) pair into vars() if missing — TODO confirm.
init_vars(vars(), ('info', {}), ('runvars', {}))

import random
from operator import attrgetter
from nltk.corpus import stopwords
import nltk
# Ensure the NLTK stopwords corpus is available locally; nltk.data.find
# raises LookupError when the resource is missing, in which case it is
# downloaded once (network side effect on first run).
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

import data
import config
from base import nbprint
from util import ProgressIterator
from widgetbase import nbbox
# --- Example 4 ---
# # PT Tokenizer
# <div style="position: absolute; right:0;top:0"><a href="./tokenizer.ipynb" style="text-decoration: none"> <font size="5">←</font></a>
# <a href="../evaluation.py.ipynb" style="text-decoration: none"> <font size="5">↑</font></a></div>
#
# This is a wrapper around the Penn Treebank tokenizer provided by the NLTK.
# For more information see https://www.nltk.org/api/nltk.tokenize.html
#
# ---
# ## Setup and Settings
# ---

# In[5]:

from __init__ import init_vars
# Restore notebook-level state; no (name, default) pairs are passed here,
# so presumably only the baseline setup performed by init_vars applies — TODO confirm.
init_vars(vars())

import nltk
# Ensure the NLTK punkt tokenizer models are available locally;
# nltk.data.find raises LookupError when the resource is missing, in which
# case it is downloaded once (network side effect on first run).
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

from nltk.tokenize import word_tokenize

import tokenizer.common
from tokenizer.token_util import TokenizerBase

# ---
# ## Build PTTokenizer class
# ---
# --- Example 5 ---
# coding: utf-8

# # Vectorize Phrase
# <div style="position: absolute; right:0;top:0"><a href="./vectorizer_index.doc.ipynb" style="text-decoration: none"> <font size="5">←</font></a>
# <a href="../evaluation.ipynb" style="text-decoration: none"> <font size="5">↑</font></a></div>
#
# `Description`
#
# ---
# ## Setup and Settings
# ---

# In[ ]:

from __init__ import init_vars
# Initialize/restore notebook-level variables with defaults; presumably
# init_vars injects each (name, default) pair into vars() if missing — TODO confirm.
init_vars(vars(), ('info', {}), ('runvars', {}), ('num_docs', 400),
          ('embedding_dim', 200))

import numpy as np
from scipy import sparse

import data
import config
from base import nbprint
from util import ProgressIterator
from widgetbase import nbbox

from embedding.main import get_model

from vectorizer.widgets import phrase_vector_picker
from vectorizer.plots import plot_matrix