Esempio n. 1
0
def vocab_picker(info):
    """Display the picker widgets for data, token and vocab settings.

    Shows a data selector, a token selector and a vocab selector (the latter
    two created with the ``'B'`` option), a "Run Requirements" button, an
    info table and a notebook output box, then renders the table once.

    Args:
        info: Settings object handed to the selectors, to
            ``check_requirements`` and to ``wb.info_table``. When it is
            ``None`` the function returns immediately without building
            any widget.
    """
    if info is None:
        return

    output_box = wb.DynHTML(display=False)
    run_requirements_button = widgets.Button(description="Run Requirements")
    nbbox = wb.nbbox(display=False)

    def update_output(clear_nbbox=True):
        """Refresh the button state and redraw the info table."""
        nbbox.make_current()
        if clear_nbbox:
            nbbox.clear()
        # The button is disabled as soon as the tokenized documents exist.
        run_requirements_button.disabled = data.tokenized_document_exists(info)
        output_box.update(
            wb.info_table('data:short,token:short,vocab:long', info))

    def on_run_requirements_button_clicked(b):
        """Run ``check_requirements`` and show any failure in the output box."""
        nbbox.make_current()
        # Clear the box object itself and print tracebacks through nbprint,
        # matching every other click handler in this code base.
        nbbox.clear()
        try:
            check_requirements(info)
        except:
            # Callback boundary: every failure is reported in the box instead
            # of being lost, so the catch is intentionally broad.
            nbprint.print_traceback()
        # Keep whatever was just printed; only refresh button and table.
        update_output(False)

    data_selector = wb.get_data_selector(info, update_output)
    token_selector = wb.get_token_selector(info, update_output, 'B')
    count_vocab_selector = wb.get_vocab_selector(info, update_output, 'B')
    run_requirements_button.on_click(on_run_requirements_button_clicked)
    settings_box = widgets.VBox([
        data_selector,
        token_selector,
        count_vocab_selector,
        run_requirements_button,
    ])

    display(settings_box)
    output_box.display()
    nbbox.display()

    update_output()
Esempio n. 2
0
def vocab_app():
    """Display the vocab app: selectors, a "Run All" button and status output.

    Works on a fresh, empty ``info`` dict that the selectors receive. Clicking
    "Run All" calls ``check_requirements(info)`` followed by
    ``vocab.main.run_vocab(info)``; a traceback is printed through
    ``nbprint.print_traceback()`` if either step raises.
    """
    info = {}
    output_box = wb.DynHTML(display=False)
    nbbox = wb.nbbox(mini=True, display=False)

    def update_output(clear_nbbox=True):
        """Make the output box current, optionally clear it, redraw the table."""
        nbbox.make_current()
        if clear_nbbox:
            nbbox.clear()
        output_box.update(
            wb.info_table('data:short,token:short,vocab:long', info))

    def on_run_button_clicked(b):
        """Run the requirements check and the vocab step for ``info``."""
        nbbox.make_current()
        nbbox.clear()
        try:
            check_requirements(info)
            # Imported lazily so an import failure is reported in the box too.
            from vocab.main import run_vocab
            run_vocab(info)
        except:
            # Callback boundary: show every failure instead of losing it.
            nbprint.print_traceback()
        # Keep the run's output visible; only the table is refreshed.
        update_output(False)

    data_selector = wb.get_data_selector(info, update_output)
    token_vocab_selector = wb.get_linked_token_vocab_selector(
        info, update_output)
    run_button = widgets.Button(description="Run All")
    run_button.on_click(on_run_button_clicked)
    settings_box = widgets.VBox([
        data_selector,
        token_vocab_selector,
        run_button,
    ])

    display(settings_box)
    output_box.display()
    nbbox.display()

    # Initial render, as done by the sibling apps and pickers; without it the
    # info table stays empty until a selector fires its callback.
    update_output()
Esempio n. 3
0
def h_mat_and_bow_picker(info):
    """Display the h_mat selector together with a second data/vector setup.

    A new empty dict is stored under ``info['second_info']``. The h_mat
    selector is bound to ``info``; the data, linked token/vocab and vector
    selectors are bound to that second dict. The info table is always
    rendered from ``info``.
    """
    table_spec = 'data:short,token:short,vocab:short,vector:short,model:short'
    output_box = wb.DynHTML(display=False)
    nbbox = wb.nbbox(display=False)
    second_info = info['second_info'] = {}

    def update_output(clear_nbbox=True):
        """Make the output box current, optionally clear it, redraw the table."""
        nbbox.make_current()
        if clear_nbbox:
            nbbox.clear()
        output_box.update(wb.info_table(table_spec, info))

    # Built in display order; every selector reports changes to update_output.
    selectors = [
        wb.get_h_mat_selector(info, update_output),
        wb.get_data_selector(second_info, update_output),
        wb.get_linked_token_vocab_selector(second_info, update_output),
        wb.get_vector_selector(second_info, update_output, 'B'),
    ]

    display(widgets.VBox(selectors))
    output_box.display()
    nbbox.display()

    update_output()
Esempio n. 4
0
def c_vec_picker(info):
    """Display the c_vec selector with an info table and an output box.

    Args:
        info: Settings object handed to ``wb.get_c_vec_selector`` and
            ``wb.info_table``.
    """
    output_box = wb.DynHTML(display=False)
    nbbox = wb.nbbox(display=False)

    def update_output(clear_nbbox=True):
        """Make the output box current, optionally clear it, redraw the table.

        Args:
            clear_nbbox: When false, the current content of the output box is
                kept. The flag is honoured here the same way as in the other
                pickers; previously the box was cleared unconditionally.
        """
        nbbox.make_current()
        if clear_nbbox:
            nbbox.clear()
        output_box.update(
            wb.info_table(
                'data:short,token:short,vocab:short,vector:short,model:short',
                info))

    c_vec_selector = wb.get_c_vec_selector(info, update_output,
                                           labeled_only=False)
    display(c_vec_selector)
    output_box.display()
    nbbox.display()

    update_output()
Esempio n. 5
0
def token_app():
    """Display the tokenizer app: selectors, a "Run All" button and status.

    Works on a fresh, empty ``info`` dict. The info table carries an extra
    'Exists' row fed by ``data.tokenized_document_exists(info)``. Clicking
    "Run All" calls ``check_requirements(info)`` and then
    ``tokenizer.main.run_tokenizer(info)``.
    """
    info = {}
    output_box = wb.DynHTML(display=False)
    nbbox = wb.nbbox(display=False)

    def update_output(clear_nbbox=True):
        """Make the output box current, optionally clear it, redraw the table."""
        nbbox.make_current()
        if clear_nbbox:
            nbbox.clear()
        exists_row = ('Exists', 'header', data.tokenized_document_exists(info))
        table = wb.info_table('data:short,token:long',
                              info,
                              exists=[exists_row])
        output_box.update(table)

    def _run_all(_button):
        """Click handler: check requirements, then run the tokenizer."""
        nbbox.make_current()
        nbbox.clear()
        try:
            check_requirements(info)
            # Lazy import: a failing import is reported like any other error.
            from tokenizer.main import run_tokenizer
            run_tokenizer(info)
        except:
            # Callback boundary: show every failure instead of losing it.
            nbprint.print_traceback()
        # Keep the run's output visible; only the table is refreshed.
        update_output(False)

    data_selector = wb.get_data_selector(info, update_output)
    token_selector = wb.get_token_selector(info, update_output, 'BC')
    run_button = widgets.Button(description="Run All")
    run_button.on_click(_run_all)

    display(widgets.VBox([data_selector, token_selector, run_button]))
    output_box.display()
    nbbox.display()

    update_output()
Esempio n. 6
0
from vectorizer.vectorize_bow import make_term_doc_mat_count, make_term_doc_mat_tf_idf

# Show the picker for the current `info` only when RUN_SCRIPT is set.
if RUN_SCRIPT:
    cbow_vector_picker(info)


# ---
# ## Count
# ---
# 
# The count matrix is the basis for all further types. It is built with the function `make_term_doc_mat_count()` from [Vectorize BoW](./vectorize_bow.ipynb#Count-Tokens-and-build-matrix).

# In[ ]:


# When RUN_SCRIPT is set, call nbbox(mini=True) and then build the count
# matrix for the current `info` / `runvars`.
if RUN_SCRIPT:
    nbbox(mini=True)
    make_term_doc_mat_count(info, runvars)


# ---
# ## Tf-Idf
# ---
# This matrix can be computed by multiplying the embedding matrix $V$, in which each column is the word-embedding vector of the corresponding word, with the standard BoW tf-idf matrix $X$. $X$ is computed using the `make_term_doc_mat_tf_idf()` function from [Vectorize BoW](./vectorize_bow.ipynb#Build-complete-tokenizer-function). Note that $V$ is never computed explicitly.

# In[ ]:


def make_cbow_mat_tf_idf(info, runvars):
    """Run the BoW tf-idf step and read its result from ``runvars``.

    Calls ``make_term_doc_mat_tf_idf(info, runvars)`` and then looks up
    ``runvars['term_doc_mat_tf_idf']``.

    NOTE(review): in the lines shown here ``tf_idf_mat`` is neither used nor
    returned, so the function returns ``None`` -- confirm whether further
    steps belong to this function.
    """
    # Create tf-idf matrix
    make_term_doc_mat_tf_idf(info, runvars)
    tf_idf_mat = runvars['term_doc_mat_tf_idf']
Esempio n. 7
0
def model_app(info):
    """Display the model app: selectors, a "Run All" button and status output.

    The vector selector decides which of the token/vocab selectors are
    visible: the prefix of ``info['vector_version']`` (as returned by
    ``config.split``) is compared against ``'B'``, ``'C'`` and ``'P'``.
    Clicking "Run All" calls ``check_requirements(info)`` and then
    ``models.main.run_models(info)``.

    Args:
        info: Settings object shared by all selectors and the run step.
    """
    output_box = wb.DynHTML(display=False)
    nbbox = wb.nbbox(display=False)

    def update_output(clear_nbbox=True):
        """Make the output box current, optionally clear it, redraw the table."""
        nbbox.make_current()
        if clear_nbbox:
            nbbox.clear()
        table = wb.info_table(
            'data:short,token:long,vocab:short,vector:short,model:long', info)
        output_box.update(table)

    def _run_all(_button):
        """Click handler: check requirements, then run the models."""
        nbbox.make_current()
        nbbox.clear()
        try:
            check_requirements(info)
            # Lazy import: a failing import is reported like any other error.
            from models.main import run_models
            run_models(info)
        except:
            # Callback boundary: show every failure instead of losing it.
            nbprint.print_traceback()
        # Keep the run's output visible; only the table is refreshed.
        update_output(False)

    data_selector = wb.get_data_selector(info, update_output)
    token_vocab_selector = wb.get_linked_token_vocab_selector(
        info, update_output)
    token_selector = wb.get_token_selector(info, update_output, 'C')
    vocab_selector = wb.get_vocab_selector(info, update_output, 'C')
    model_selector = wb.get_model_selector(info, update_output)
    num_topics_selector = wb.get_num_topics_selector(info, update_output,
                                                     data_selector)
    run_button = widgets.Button(description="Run All")

    # Visibility of (linked token/vocab, token, vocab) selector per prefix.
    visibility_rules = (
        ('B', ('visible', 'hidden', 'hidden')),
        ('C', ('hidden', 'visible', 'visible')),
        ('P', ('hidden', 'hidden', 'hidden')),
    )

    def vector_changed():
        """Show/hide the token and vocab selectors, then redraw the table."""
        kind, _ = config.split(info['vector_version'])
        for code, (linked_state, token_state, vocab_state) in visibility_rules:
            if kind == code:
                token_vocab_selector.layout.visibility = linked_state
                token_selector.layout.visibility = token_state
                vocab_selector.layout.visibility = vocab_state
                break
        # An unknown prefix leaves the visibilities as they are.
        update_output(False)

    vector_selector = wb.get_vector_selector(info, vector_changed, 'BCP')

    run_button.on_click(_run_all)
    rows = [
        data_selector,
        vector_selector,
        token_vocab_selector,
        token_selector,
        vocab_selector,
        model_selector,
        num_topics_selector,
        run_button,
    ]

    display(widgets.VBox(rows))
    output_box.display()
    nbbox.display()

    # Applies the initial visibilities and renders the table once.
    vector_changed()
Esempio n. 8
0
# ## Tokenize Document
# ---
# The following functions constitute the `W2VTokenizer` class, which transforms the raw text of a document into tokens.

# In[170]:


class W2VTokenizer(TokenizerBase):
    """Tokenizer that carries the embedding model obtained for its ``info``.

    After the base-class initialisation, the model returned by
    ``get_model(self.info)`` is kept as ``embedding_model`` and that model's
    ``filter.filter`` is exposed as ``self.filter``.
    """

    def __init__(self, *args, **kwargs):
        # All arguments are forwarded untouched to TokenizerBase.
        super().__init__(*args, **kwargs)
        model = get_model(self.info)
        self.embedding_model = model
        self.filter = model.filter.filter


# When RUN_SCRIPT is set: call nbbox(), create a W2VTokenizer from `info`
# and assign it the text stored in runvars['document'].
if RUN_SCRIPT:
    nbbox()
    w2v_tokenizer = W2VTokenizer(info)
    w2v_tokenizer.text = runvars['document']['text']

# ### Prepare Text
#
# This step lowercases all characters and replaces the following:
# - `separator_token` by `separator_token_replacement`
# - all whitespaces by a single whitespace
# - `#` by nothing

# In[171]:

# Precompiled regular expressions. Raw strings are used so that `\s` and `\.`
# reach the regex engine as written instead of relying on invalid string
# escapes, which Python reports as deprecated.
# One or more consecutive whitespace characters.
_re_whitespace = re.compile(r'[\s]+', re.UNICODE)
# A token starting with "http://", "https://" or "www." and running up to the
# next whitespace character.
_re_url = re.compile(r'(http://[^\s]+)|(https://[^\s]+)|(www\.[^\s]+)')
Esempio n. 9
0
def token_picker(info, runvars, bcp='B'):
    """Display the picker for data, raw document and token settings.

    Besides the selectors and a "Run Requirements" button, the info table
    contains a custom 'document' section showing the id and text of the
    selected document. The document is looked up by ``info['document_id']``
    and stored in ``runvars['document']``.

    Args:
        info: Settings object shared by the selectors and the table.
        runvars: Mapping that receives the loaded document under the key
            ``'document'`` (reset to ``None`` on every refresh).
        bcp: Option forwarded as third argument to ``wb.get_token_selector``.
    """
    table_spec = 'data:short,token:long,custom:document'
    output_box = wb.DynHTML(display=False)
    run_requirements_button = widgets.Button(description="Run Requirements")
    nbbox = wb.nbbox(display=False)

    def document_table(replacement_text=None):
        """Return the rows of the 'document' section of the info table."""
        id_row = ('Id', 'std', info['document_id'])
        if replacement_text is None:
            try:
                body = runvars['document']['text']
            except:
                # No document loaded (or it has no text): show a placeholder.
                body = 'Not available'
        else:
            body = replacement_text
        return [('Document', 'header'), id_row, ('', 'text', body)]

    def load_document():
        """Best effort: store the document with the selected id in runvars."""
        try:
            with data.document_reader(info) as documents:
                matches = (doc for doc in documents
                           if doc["id"] == info["document_id"])
                runvars['document'] = next(matches)
        except:
            # Any failure, including "no such document", leaves
            # runvars['document'] untouched.
            pass

    def render_table(replacement_text=None):
        """Redraw the info table, optionally with a placeholder text."""
        output_box.update(
            wb.info_table(table_spec,
                          info,
                          document=document_table(replacement_text)))

    def update_output(clear_nbbox=True):
        """Reload the selected document and refresh button and table."""
        nbbox.make_current()
        if clear_nbbox:
            nbbox.clear()
        runvars['document'] = None
        documents_available = bool(data.documents_exists(info))
        # The requirements only need to be run while documents are missing.
        run_requirements_button.disabled = documents_available
        if documents_available:
            render_table('Loading...')
            load_document()
        render_table()

    def _run_requirements(_button):
        """Click handler: run the requirements check for ``info``."""
        nbbox.make_current()
        nbbox.clear()
        try:
            check_requirements(info)
        except:
            # Callback boundary: show every failure instead of losing it.
            nbprint.print_traceback()
        # Keep the check's output visible; only button and table are refreshed.
        update_output(False)

    data_selector = wb.get_data_selector(info, update_output)
    document_selector = wb.get_rawdocument_selector(info, update_output)
    token_selector = wb.get_token_selector(info, update_output, bcp)
    run_requirements_button.on_click(_run_requirements)

    display(
        widgets.VBox([
            data_selector,
            document_selector,
            token_selector,
            run_requirements_button,
        ]))
    output_box.display()
    nbbox.display()

    update_output()