Example #1
0
    def frequent_words(corpus, normalize, include_pos, n_top=100):
        """Return the most frequent words of ``corpus`` followed by ``'_gpe_'``.

        Args:
            corpus: Corpus handed unchanged to
                ``textacy_utility.get_most_frequent_words``.
            normalize: Forwarded as the ``normalize`` keyword argument.
            include_pos: POS tags forwarded as ``include_pos``. ``None`` and the
                tuple ``('',)`` are both replaced by an empty list.
            n_top: Number of words requested from the helper (default 100).

        Returns:
            A list holding the first element of every item returned by the
            helper (presumably the word of a (word, count) pair -- confirm
            against ``textacy_utility``), with the literal ``'_gpe_'`` appended.
        """
        # None and ('',) are both normalised to an empty tag list.
        no_selection = include_pos is None or include_pos == ('', )
        pos_tags = [] if no_selection else include_pos

        ranked = textacy_utility.get_most_frequent_words(
            corpus, n_top, normalize=normalize, include_pos=pos_tags)

        words = [entry[0] for entry in ranked]
        # The '_gpe_' entry is always offered, whatever the corpus contains.
        words.append('_gpe_')
        return words
 def pos_change_handler(*args):
     """Rebuild the stop-word options from the current normalize/POS widgets.

     Any positional arguments are accepted and ignored. The Compute button is
     disabled while the list is rebuilt. Previously selected stop words are
     kept only if they are still among the new options.
     """
     with gui.output:
         gui.compute.disabled = True
         try:
             selected = set(gui.stop_words.value)
             frequent_words = [
                 x[0] for x in textacy_utility.get_most_frequent_words(
                     corpus,
                     100,
                     normalize=gui.normalize.value,
                     include_pos=gui.include_pos.value)
             ]
             gui.stop_words.options = frequent_words
             # Drop selections that are no longer offered as options.
             selected = selected & set(gui.stop_words.options)
             gui.stop_words.value = list(selected)
         finally:
             # Re-enable the button even if the refresh raises, so the GUI is
             # never left with a permanently disabled Compute button.
             gui.compute.disabled = False
Example #3
0
 def pos_change_handler(*args):
     """Rebuild the stop-word options from the current normalize/POS widgets.

     Any positional arguments are accepted and ignored. The handler reads the
     widgets in ``self.corpus_widgets`` and toggles the Compute button in
     ``self.model_widgets`` while the list is rebuilt. Previously selected
     stop words are kept only if they are still among the new options.
     """
     with self.model_widgets.output:
         self.model_widgets.compute.disabled = True
         try:
             selected = set(self.corpus_widgets.stop_words.value)
             frequent_words = [
                 x[0] for x in textacy_utility.get_most_frequent_words(
                     corpus,
                     100,
                     normalize=self.corpus_widgets.normalize.value,
                     include_pos=self.corpus_widgets.include_pos.value)
             ]
             self.corpus_widgets.stop_words.options = frequent_words
             # Drop selections that are no longer offered as options.
             selected = selected & set(
                 self.corpus_widgets.stop_words.options)
             self.corpus_widgets.stop_words.value = list(selected)
         finally:
             # Re-enable the button even if the refresh raises, so the GUI is
             # never left with a permanently disabled Compute button.
             self.model_widgets.compute.disabled = False
def word_frequency_gui(wti_index, corpus):
    """Build, display and return the word-frequency GUI.

    Args:
        wti_index: Object supplying ``get_treaty_time_groupings()``,
            ``get_party_preset_options()`` and ``get_countries_list()``.
        corpus: Corpus passed to ``textacy_utility.get_most_frequent_words``
            and to ``compute_list_of_most_frequent_words``.

    Returns:
        A ``types.SimpleNamespace`` whose attributes are the created widgets.
    """
    treaty_time_groups = wti_index.get_treaty_time_groupings()

    def lw(width):
        """Return a widget layout with the given CSS width."""
        return widgets.Layout(width=width)

    include_pos_tags = ['ADJ', 'VERB', 'ADV', 'NOUN', 'PROPN']
    weighting_options = {'Count': 'count', 'Frequency': 'freq'}
    normalize_options = {'': False, 'Lemma': 'lemma', 'Lower': 'lower'}
    pos_options = include_pos_tags
    default_include_pos = ['NOUN', 'PROPN']
    # Initial value of the Normalize dropdown; also used for the initial
    # stop-word list so it matches what the change handler would produce.
    default_normalize = 'lemma'

    def most_frequent_words(normalize, include_pos):
        """Return the lower-cased top-100 words for the given settings."""
        return [
            x[0].lower() for x in textacy_utility.get_most_frequent_words(
                corpus, 100, normalize=normalize, include_pos=include_pos)
        ]

    frequent_words = most_frequent_words(default_normalize,
                                         default_include_pos)

    group_by_options = {
        treaty_time_groups[k]['title']: k
        for k in treaty_time_groups
    }
    output_type_options = [
        ('Sample', 'table'),
        ('Rank', 'rank'),
        ('Excel', 'excel'),
    ]
    ngrams_options = {'-': None, '1': [1], '1,2': [1, 2], '1,2,3': [1, 2, 3]}
    party_preset_options = wti_index.get_party_preset_options()
    parties_options = [
        x for x in wti_index.get_countries_list() if x != 'ALL OTHER'
    ]

    gui = types.SimpleNamespace(
        progress=widgets.IntProgress(value=0,
                                     min=0,
                                     max=5,
                                     step=1,
                                     description='',
                                     layout=lw('98%')),
        parties=widgets.SelectMultiple(description='Parties',
                                       options=parties_options,
                                       value=[],
                                       rows=7,
                                       layout=lw('200px')),
        party_preset=widgets_config.dropdown('Presets',
                                             party_preset_options,
                                             None,
                                             layout=lw('200px')),
        ngrams=widgets.Dropdown(description='n-grams',
                                options=ngrams_options,
                                value=None,
                                layout=lw('200px')),
        min_word=widgets.Dropdown(description='Min length',
                                  options=[1, 2, 3, 4],
                                  value=1,
                                  layout=lw('200px')),
        normalize=widgets.Dropdown(description='Normalize',
                                   options=normalize_options,
                                   value=default_normalize,
                                   layout=lw('200px')),
        weighting=widgets.Dropdown(description='Weighting',
                                   options=weighting_options,
                                   value='freq',
                                   layout=lw('200px')),
        include_pos=widgets.SelectMultiple(description='POS',
                                           options=pos_options,
                                           value=default_include_pos,
                                           rows=7,
                                           layout=lw('150px')),
        stop_words=widgets.SelectMultiple(description='STOP',
                                          options=frequent_words,
                                          value=[],
                                          rows=7,
                                          layout=lw('200px')),
        group_by_column=widgets.Dropdown(description='Group by',
                                         value='signed_year',
                                         options=group_by_options,
                                         layout=lw('200px')),
        output_type=widgets.Dropdown(description='Output',
                                     value='rank',
                                     options=output_type_options,
                                     layout=lw('200px')),
        compute=widgets.Button(description='Compute',
                               button_style='Success',
                               layout=lw('120px')),
        display_score=widgets.ToggleButton(description='Display score',
                                           icon='check',
                                           value=False,
                                           layout=lw('120px')),
        output=widgets.Output(layout={'border': '1px solid black'}),
        file_suffix=widgets.Text(
            value='',
            placeholder='(optional id)',
            description='ID:',
            disabled=False,
            layout=lw('200px'),
            tooltip="Optional plain text id that will be added to filename."))

    boxes = widgets.VBox([
        gui.progress,
        widgets.HBox([
            widgets.VBox([
                gui.normalize,
                gui.ngrams,
                gui.weighting,
                gui.group_by_column,
                gui.output_type,
            ]),
            widgets.VBox([
                gui.parties,
                gui.party_preset,
            ]),
            gui.include_pos,
            widgets.VBox([
                gui.stop_words,
                gui.file_suffix,
            ]),
            widgets.VBox([
                gui.display_score,
                gui.compute,
            ],
                         layout=widgets.Layout(align_items='flex-end')),
        ]), gui.output
    ])

    display(boxes)

    def on_party_preset_change(change):  # pylint: disable=W0613
        """Copy the chosen preset into the Parties selection."""
        if gui.party_preset.value is None:
            return
        # A preset containing 'ALL' selects every available party.
        gui.parties.value = gui.parties.options if 'ALL' in gui.party_preset.value else gui.party_preset.value

    gui.party_preset.observe(on_party_preset_change, names='value')

    def pos_change_handler(*args):
        """Rebuild the stop-word options from the current widget settings."""
        with gui.output:
            gui.compute.disabled = True
            try:
                selected = set(gui.stop_words.value)
                gui.stop_words.options = most_frequent_words(
                    gui.normalize.value, gui.include_pos.value)
                # Drop selections that are no longer offered as options.
                selected = selected & set(gui.stop_words.options)
                gui.stop_words.value = list(selected)
            finally:
                # Re-enable the button even if the refresh raises, so the GUI
                # is never left with a permanently disabled Compute button.
                gui.compute.disabled = False

    gui.include_pos.observe(pos_change_handler, 'value')
    gui.weighting.observe(pos_change_handler, 'value')
    # The handler reads gui.normalize.value, so the list must also be
    # refreshed when the normalization changes.
    gui.normalize.observe(pos_change_handler, 'value')

    def compute_callback_handler(*_args):
        """Compute and display the word list for the current settings."""
        gui.output.clear_output()
        with gui.output:
            try:
                gui.compute.disabled = True
                df_counts = compute_list_of_most_frequent_words(
                    corpus=corpus,
                    gui=gui,
                    target=gui.normalize.value,
                    treaty_time_groups=treaty_time_groups,
                    group_by_column=gui.group_by_column.value,
                    parties=gui.parties.value,
                    weighting=gui.weighting.value,
                    include_pos=gui.include_pos.value,
                    stop_words=set(gui.stop_words.value),
                    display_score=gui.display_score.value)
                display_list_of_most_frequent_words(gui, df_counts)
            except Exception as ex:
                logger.error(ex)
                raise
            finally:
                # Reset progress and re-enable the button on every exit path.
                gui.progress.value = 0
                gui.compute.disabled = False

    gui.compute.on_click(compute_callback_handler)
    return gui
def display_topic_model_gui(data_folder, state, corpus, **opts):
    """Build, display and return the topic-model GUI.

    Args:
        data_folder: Passed through to ``compute_topic_model``.
        state: Object that receives the result in ``state.data`` (set to
            ``None`` when the computation fails). Its ``topic_model``,
            ``id2term`` and ``relevant_topics`` attributes are read after a
            successful run.
        corpus: Corpus used for the frequent-word lists and the computation.
            Must support ``len()`` and indexing; ``corpus[0].spacy_doc.ents``
            is inspected when the corpus is non-empty.
        **opts: Must contain ``tagset`` (an object whose ``POS`` attribute
            offers ``unique()``). All options are forwarded to
            ``compute_topic_model``.

    Returns:
        A ``types.SimpleNamespace`` whose attributes are the created widgets.

    Raises:
        KeyError: If ``opts`` has no ``'tagset'`` entry.
        OSError: If the spinner image file cannot be opened.
    """
    def spinner_widget(filename="images/spinner-02.gif", width=40, height=40):
        """Return a hidden GIF image widget loaded from ``filename``."""
        with open(filename, "rb") as image_file:
            image = image_file.read()
        return widgets.Image(value=image,
                             format='gif',
                             width=width,
                             height=height,
                             layout={'visibility': 'hidden'})

    def most_frequent_words(normalize, include_pos):
        """Return the top-100 words for the settings, plus ``'_gpe_'``."""
        return [
            x[0] for x in textacy_utility.get_most_frequent_words(
                corpus, 100, normalize=normalize, include_pos=include_pos)
        ] + ['_gpe_']

    pos_options = [
        x for x in opts['tagset'].POS.unique() if x not in [
            'PUNCT', '', 'DET', 'X', 'SPACE', 'PART', 'CONJ', 'SYM', 'INTJ',
            'PRON'
        ]
    ]
    engine_options = [
        ('MALLET LDA', 'gensim_mallet-lda'),
        ('gensim LDA', 'gensim_lda'),
        ('gensim LSI', 'gensim_lsi'),
        ('gensim HDP', 'gensim_hdp'),
        ('gensim DTM', 'gensim_dtm'),
        ('scikit LDA', 'sklearn_lda'),
        ('scikit NMF', 'sklearn_nmf'),
        ('scikit LSA', 'sklearn_lsa'),
        ('STTM   LDA', 'gensim_sttm-lda'),
        ('STTM   BTM', 'gensim_sttm-btm'),
        ('STTM   PTM', 'gensim_sttm-ptm'),
        ('STTM  SATM', 'gensim_sttm-satm'),
        ('STTM   DMM', 'gensim_sttm-dmm'),
        ('STTM  WATM', 'gensim_sttm-watm'),
    ]
    normalize_options = {
        'None': False,
        'Use lemma': 'lemma',
        'Lowercase': 'lower'
    }
    ngrams_options = {'1': [1], '1, 2': [1, 2], '1,2,3': [1, 2, 3]}
    default_include_pos = ['NOUN', 'PROPN']
    frequent_words = most_frequent_words('lemma', default_include_pos)
    # Entity merging is only offered when the first document has entities.
    named_entities_disabled = len(corpus) == 0 or len(
        corpus[0].spacy_doc.ents) == 0
    gui = types.SimpleNamespace(
        progress=widgets.IntProgress(value=0,
                                     min=0,
                                     max=5,
                                     step=1,
                                     description='',
                                     layout=widgets.Layout(width='90%')),
        n_topics=widgets.IntSlider(description='#topics',
                                   min=2,
                                   max=100,
                                   value=20,
                                   step=1,
                                   layout=widgets.Layout(width='240px')),
        min_freq=widgets.IntSlider(description='Min word freq',
                                   min=0,
                                   max=10,
                                   value=2,
                                   step=1,
                                   layout=widgets.Layout(width='240px')),
        # NOTE(review): the label reads 'Min doc %' although the attribute is
        # named max_doc_freq -- confirm the intended wording.
        max_doc_freq=widgets.IntSlider(description='Min doc %',
                                       min=75,
                                       max=100,
                                       value=100,
                                       step=1,
                                       layout=widgets.Layout(width='240px')),
        max_iter=widgets.IntSlider(description='Max iterations',
                                   min=100,
                                   max=6000,
                                   value=2000,
                                   step=10,
                                   layout=widgets.Layout(width='240px')),
        ngrams=widgets.Dropdown(description='n-grams',
                                options=ngrams_options,
                                value=[1],
                                layout=widgets.Layout(width='200px')),
        normalize=widgets.Dropdown(description='Normalize',
                                   options=normalize_options,
                                   value='lemma',
                                   layout=widgets.Layout(width='200px')),
        filter_stops=widgets.ToggleButton(value=True,
                                          description='Remove stopword',
                                          tooltip='Filter out stopwords',
                                          icon='check'),
        named_entities=widgets.ToggleButton(value=False,
                                            description='Merge entities',
                                            tooltip='Merge entities',
                                            icon='check',
                                            disabled=named_entities_disabled),
        apply_idf=widgets.ToggleButton(
            value=False,
            description='Apply IDF',
            tooltip='Apply IDF (skikit-learn) or TF-IDF (gensim)',
            icon='check'),
        mask_gpe=widgets.ToggleButton(value=False,
                                      description='Mask GPE',
                                      tooltip='Mask GPE',
                                      icon='check'),
        include_pos=widgets.SelectMultiple(
            description='POS',
            options=pos_options,
            value=default_include_pos,
            rows=7,
            layout=widgets.Layout(width='160px')),
        stop_words=widgets.SelectMultiple(
            description='STOP',
            options=frequent_words,
            value=[],
            rows=7,
            layout=widgets.Layout(width='220px')),
        method=widgets.Dropdown(description='Engine',
                                options=engine_options,
                                value='gensim_lda',
                                layout=widgets.Layout(width='200px')),
        compute=widgets.Button(description='Compute',
                               button_style='Success',
                               layout=widgets.Layout(width='115px',
                                                     background_color='blue')),
        show_trace=widgets.Checkbox(
            value=False, description='Show trace',
            disabled=False),
        boxes=None,
        output=widgets.Output(
            layout={'border': '1px solid black'}),
        spinner=spinner_widget())

    gui.boxes = widgets.VBox([
        gui.progress,
        widgets.HBox([
            widgets.VBox(
                [gui.n_topics, gui.min_freq, gui.max_doc_freq, gui.max_iter]),
            widgets.VBox([
                gui.filter_stops, gui.named_entities, gui.apply_idf,
                gui.mask_gpe
            ]), gui.include_pos, gui.stop_words,
            widgets.VBox([
                gui.normalize, gui.ngrams, gui.method,
                widgets.HBox([gui.spinner, gui.compute],
                             layout=widgets.Layout(align_items='flex-end'))
            ],
                         layout=widgets.Layout(align_items='flex-end')),
            gui.show_trace
        ]),
        widgets.VBox([gui.output]),
    ])

    def tick(x=None):
        """Advance the progress bar by one, or set it to ``x`` when given."""
        gui.progress.value = gui.progress.value + 1 if x is None else x

    def buzy(is_buzy):
        """Toggle the Compute button and the spinner visibility together."""
        gui.compute.disabled = is_buzy
        gui.spinner.layout.visibility = 'visible' if is_buzy else 'hidden'

    def compute_topic_model_handler(*args):
        """Compute the topic model, store it in ``state`` and show topics."""
        gui.output.clear_output()
        buzy(True)

        with gui.output:

            try:
                state.data = compute_topic_model(data_folder,
                                                 corpus,
                                                 gui.method.value,
                                                 gui.n_topics.value,
                                                 gui,
                                                 tick=tick,
                                                 **opts)

                topics = topic_model_utility.get_topics_unstacked(
                    state.topic_model,
                    n_tokens=100,
                    id2term=state.id2term,
                    topic_ids=state.relevant_topics)

                display(topics)

            except Exception as ex:
                logger.error(ex)
                # Do not keep a stale or partial result after a failure.
                state.data = None
                raise
            finally:
                buzy(False)

    gui.compute.on_click(compute_topic_model_handler)

    def pos_change_handler(*args):
        """Rebuild the stop-word options from the current widget settings."""
        with gui.output:
            gui.compute.disabled = True
            try:
                selected = set(gui.stop_words.value)
                # Same helper as the initial list, so '_gpe_' stays available
                # after the options are refreshed.
                gui.stop_words.options = most_frequent_words(
                    gui.normalize.value, gui.include_pos.value)
                # Drop selections that are no longer offered as options.
                selected = selected & set(gui.stop_words.options)
                gui.stop_words.value = list(selected)
            finally:
                # Re-enable the button even if the refresh raises.
                gui.compute.disabled = False

    gui.include_pos.observe(pos_change_handler, 'value')
    # The handler reads gui.normalize.value, so the list must also be
    # refreshed when the normalization changes.
    gui.normalize.observe(pos_change_handler, 'value')

    def method_change_handler(*args):
        """Enable or disable widgets that do not apply to the chosen engine."""
        with gui.output:

            gui.compute.disabled = True
            try:
                # The dropdown value is the lowercase engine key from
                # engine_options (e.g. 'gensim_mallet-lda'), not the label.
                method = gui.method.value

                gui.apply_idf.disabled = False
                gui.apply_idf.description = 'Apply TF-IDF' if method.startswith(
                    'gensim') else 'Apply IDF'

                gui.ngrams.disabled = False
                if 'mallet' in method:
                    gui.ngrams.value = [1]
                    gui.ngrams.disabled = True
                    gui.apply_idf.description = 'TF-IDF N/A'
                    gui.apply_idf.disabled = True

                gui.n_topics.disabled = False
                if 'hdp' in method:
                    gui.n_topics.value = gui.n_topics.max
                    gui.n_topics.disabled = True
            finally:
                # Re-enable the button even if a widget update raises.
                gui.compute.disabled = False

    gui.method.observe(method_change_handler, 'value')
    method_change_handler()
    display(gui.boxes)
    return gui