def plot_network(
    nodes,
    edges,
    node_description=None,
    node_size=5,
    node_opts=None,
    line_opts=None,
    element_id='nx_id3',
    figsize=(900, 900),
    node_label='name',
    node_label_opts=None,
    edge_label='name',
    edge_label_opts=None,
    tools=None,
    **figkwargs
):

    edges_source = ColumnDataSource(edges)
    nodes_source = ColumnDataSource(nodes)

    node_opts = extend(DFLT_NODE_OPTS, node_opts or {})
    line_opts = extend(DFLT_EDGE_OPTS, line_opts or {})

    p = figure(plot_width=figsize[0], plot_height=figsize[1], tools=tools or TOOLS, **figkwargs)

    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None

    if 'line_color' in edges.keys():
        line_opts = extend(line_opts, { 'line_color': 'line_color', 'alpha': 1.0})

    _ = p.multi_line('xs', 'ys', line_width='weight', source=edges_source, **line_opts)
    r_nodes = p.circle('x', 'y', size=node_size, source=nodes_source, **node_opts)

    if 'fill_color' in nodes.keys():
        r_nodes.glyph.fill_color = 'fill_color'

    if node_description is not None:
        callback = WidgetUtility.glyph_hover_callback(
            nodes_source, 'node_id', text_ids=node_description.index,
            text=node_description, element_id=element_id)
        p.add_tools(HoverTool(renderers=[r_nodes], tooltips=None, callback=callback))

    if node_label is not None and node_label in nodes.keys():
        label_opts = extend({}, DFLT_LABEL_OPTS, node_label_opts or {})
        p.add_layout(LabelSet(source=nodes_source, x='x', y='y', text=node_label, **label_opts))

    if edge_label is not None and edge_label in edges.keys():
        label_opts = extend({}, DFLT_LABEL_OPTS, edge_label_opts or {})
        p.add_layout(LabelSet(source=edges_source, x='m_x', y='m_y', text=edge_label, **label_opts))

    handle = show(p, notebook_handle=True)

    return dict(
        handle=handle,
        edges=edges,
        nodes=nodes,
        edges_source=edges_source,
        nodes_source=nodes_source
    )
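# Usage sketch (illustrative, not part of the original source): plot_network expects
# column-oriented data. Nodes need 'x', 'y' (plus optional 'name', 'node_id', 'fill_color');
# edges need 'xs'/'ys' line segments, a 'weight' column and, for labels, 'm_x'/'m_y' midpoints.
example_nodes = dict(x=[0.0, 1.0], y=[0.0, 1.0], name=['a', 'b'], node_id=[0, 1])
example_edges = dict(xs=[[0.0, 1.0]], ys=[[0.0, 1.0]], weight=[1.0],
                     m_x=[0.5], m_y=[0.5], name=['a-b'])
# plot_state = plot_network(nodes=example_nodes, edges=example_edges, node_size=10)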
def get_corpus_documents(corpus):
    metadata = [ utility.extend({}, doc._.meta, _get_pos_statistics(doc)) for doc in corpus ]
    df = pd.DataFrame(metadata)[['treaty_id', 'filename', 'signed_year', 'party1', 'party2'] + POS_NAMES]
    df['title'] = df.treaty_id
    df['lang'] = df.filename.str.extract(r'\w{4,6}\_(\w\w)')
    df['words'] = df[POS_NAMES].apply(sum, axis=1)
    return df
def create_textacy_corpus(corpus_reader,
                          nlp,
                          tick=utility.noop,
                          n_chunk_threshold=100000):

    corpus = textacy.Corpus(nlp)
    counter = 0

    for filename, document_id, text, metadata in corpus_reader:

        metadata = utility.extend(
            metadata, dict(filename=filename, document_id=document_id))

        if len(text) > n_chunk_threshold:
            doc = textacy.spacier.utils.make_doc_from_text_chunks(
                text, lang=nlp, chunk_size=n_chunk_threshold)
            corpus.add_doc(doc)
            doc._.meta = metadata
        else:
            corpus.add((text, metadata))

        counter += 1
        if counter % 100 == 0:
            logger.info('%s documents added...', counter)
        tick(counter)

    logger.info('Done! %s documents added!', counter)

    return corpus
def create_textacy_corpus(corpus_reader,
                          nlp,
                          tick=utility.noop,
                          strip_tensor=True):
    logger.info('creating corpus (this might take some time)...')
    batch_size = 100
    corpus = textacy.Corpus(nlp)
    document_id = 0
    n_chunk_threshold = 50000
    for filename, text, metadata in corpus_reader:

        metadata = utility.extend(
            metadata, dict(filename=filename, document_id=document_id))

        if len(text) > n_chunk_threshold:
            spacy_doc = textacy.spacier.utils.make_doc_from_text_chunks(
                text, lang=nlp, chunk_size=n_chunk_threshold)
            corpus.add_doc(spacy_doc, metadata)
        else:
            corpus.add_text(text, metadata)

        if strip_tensor:
            # free memory by dropping the tensor of the newly added document
            corpus[-1].spacy_doc.tensor = None

        document_id += 1
        if document_id % batch_size == 0:
            logger.info('%s documents added...', document_id)
            tick(document_id)

    return corpus
Example #5
def create_textacy_corpus(documents, nlp, tick=utility.noop):
    corpus = textacy.Corpus(nlp)
    for filename, text, metadata in documents:
        corpus.add_text(text, utility.extend(dict(filename=filename),
                                             metadata))
        tick()
    return corpus
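# Usage sketch (illustrative): `documents` is an iterable of (filename, text, metadata) tuples, e.g.
# docs = [('treaty_0001.txt', 'The parties agree ...', {'treaty_id': '0001'})]
# corpus = create_textacy_corpus(docs, nlp)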
Example #6
    def create_float_slider(self, description, **args):
        args = utility.extend(
            dict(min=0.0,
                 max=0.0,
                 step=0.1,
                 value=0.0,
                 disabled=False,
                 continuous_update=False), args)
        return widgets.FloatSlider(description=description, **args)
Example #7
    def create_select_widget(self,
                             label='',
                             values=None,
                             default=None,
                             **kwargs):
        opts = dict(options=values or [],
                    value=default if default is not None and default in (values or [])
                    else values[0] if len(values or []) > 0 else None,
                    description=label,
                    disabled=False)
        opts = utility.extend(opts, kwargs)
        return widgets.Dropdown(**opts)
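    # Usage sketch (illustrative, not from the original source; widgets is assumed to be ipywidgets):
    #   slider = self.create_float_slider('Threshold', min=0.0, max=1.0, value=0.5)
    #   dropdown = self.create_select_widget(label='Party', values=['france', 'italy'], default='france')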
    def generate(self, stream, column_functions=None):

        pos_tags = {k: 0 for k in self.tag_set.keys()}
        pos_delimiter = stream.append_pos
        pos_statistics = []
        pos_total_counter = collections.Counter()
        for document, tokens in stream:

            counter = collections.Counter([
                x.split(pos_delimiter)[-1].upper()
                if pos_delimiter in x else '???' for x in tokens
            ])
            pos_total_counter.update(counter)

            counter_dict = dict(counter)

            pos_counts = extend(pos_tags, {
                k: v
                for k, v in counter_dict.items() if k in pos_tags.keys()
            })
            other_counts = [
                k for k in counter_dict.keys()
                if k not in pos_tags.keys() and k != ''
            ]

            if len(other_counts) > 0:
                logger.warning('Strange PoS tags: file %s, tags %s',
                               document, other_counts)

            pos_statistics.append(extend(pos_counts, filename=document))

        df = pd.DataFrame(pos_statistics)
        for (column_name, column_function) in (column_functions or []):
            df[column_name] = column_function(df)

        return df
Example #9
def get_positioned_edges(network, layout, sort_attr=None):
    """Extracts network edge attributes and assigns coordinates to endpoints, and computes midpont coordinate

        Parameters
        ----------
        network : nx.Graph
            The networkx graph.

        layout : dict of node + point pairs i.e. (node, [x,y])
            A dictionary that contains coordinates for all nodes.
            
        Returns
        -------
            Return list of dicts
             i.e. {
                 source:  source node,
                 target:  target-node,
                 xs:      [x1, x2],           
                 ys:      [y1, y2],           # Y-coordinate
                 m_x:     (x1 + x2) / 2,
                 y_x:     (y1 + y2) / 2,
                 attr-1:  value of attr-1
                 ...      
                 attr-n:  value of attr-n
            }
            
            x1, y1     source node's coordinate
            x2, y2     target node's coordinate
            m_x, m_y   midpoint coordinare
            
    """
    list_of_dicts = [
        extend(
            dict(
                source=u,
                target=v,
                xs=[layout[u][0], layout[v][0]],
                ys=[layout[u][1], layout[v][1]],
                m_x=[(layout[u][0] + layout[v][0])/2.0],
                m_y=[(layout[u][1] + layout[v][1])/2.0]),
            d)
        for u, v, d in network.edges(data=True)
    ]
    
    if sort_attr is not None:
        list_of_dicts.sort(key=lambda x: x[sort_attr])

    return list_of_dicts
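# Usage sketch (illustrative, not from the original source; assumes networkx and the
# surrounding module's extend() helper are available):
import networkx as nx

g = nx.Graph()
g.add_edge('a', 'b', weight=1.0)
layout = {'a': [0.0, 0.0], 'b': [1.0, 1.0]}
positioned = get_positioned_edges(g, layout)
# positioned[0] == {'source': 'a', 'target': 'b', 'xs': [0.0, 1.0], 'ys': [0.0, 1.0],
#                   'm_x': [0.5], 'm_y': [0.5], 'weight': 1.0}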
def get_corpus_data(corpus, document_index, title, columns_of_interest=None):
    metadata = [
        utility.extend({}, dict(document_id=doc._.meta['document_id']),
                       _get_pos_statistics(doc)) for doc in corpus
    ]
    df = pd.DataFrame(metadata)[['document_id'] + POS_NAMES]
    if columns_of_interest is not None:
        document_index = document_index[columns_of_interest]
    df = pd.merge(df,
                  document_index,
                  left_on='document_id',
                  right_index=True,
                  how='inner')
    df['title'] = df[title]
    df['words'] = df[POS_NAMES].apply(sum, axis=1)
    return df
def generate_field_filters(documents, opts):
    filters = []
    for opt in opts:  # if opt['type'] == 'multiselect':
        options = opt.get(
            'options',
            _get_field_values(documents,
                              opt['field'],
                              as_tuple=True,
                              query=opt.get('query', None)))
        description = opt.get('description', '')
        rows = min(4, len(options))
        gf = utility.extend(opt,
                            widget=widgets_config.selectmultiple(description,
                                                                 options,
                                                                 value=(),
                                                                 rows=rows))
        filters.append(gf)
    return filters
Example #12
def plot_network(nodes, edges, plot_opts, fig_opts=None):

    edges_source = bokeh.models.ColumnDataSource(edges)
    nodes_source = bokeh.models.ColumnDataSource(nodes)

    node_opts = utility.extend(DFLT_NODE_OPTS, plot_opts.get('node_opts', {}))
    line_opts = utility.extend(DFLT_EDGE_OPTS, plot_opts.get('line_opts', {}))
    fig_opts  = utility.extend(DFLT_FIG_OPTS, fig_opts or {})
    
    node_size = plot_opts.get('node_size', 1)
    
    p = figure(**fig_opts)

    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None

    if 'line_color' in edges.keys():
        line_opts = utility.extend(line_opts, { 'line_color': 'line_color', 'alpha': 1.0})

    _ = p.multi_line('xs', 'ys', line_width='weight', source=edges_source, **line_opts)
    r_nodes = p.circle('x', 'y', size=node_size, source=nodes_source, **node_opts)

    if 'fill_color' in nodes.keys():
        r_nodes.glyph.fill_color = 'fill_color'

    node_description = plot_opts.get('node_description', None)
    if node_description is not None:
        element_id = plot_opts.get('element_id', '_')
        text_source = ColumnDataSource(dict(text_id=node_description.index, text=node_description))
        callback = widgets_config.glyph_hover_callback(
            nodes_source, 'node_id', text_source=text_source, element_id=element_id)
        p.add_tools(bokeh.models.HoverTool(renderers=[r_nodes], tooltips=None, callback=callback))

    node_label = plot_opts.get('node_label', None)
    if node_label is not None and node_label in nodes.keys():
        label_opts = utility.extend({}, DFLT_LABEL_OPTS, plot_opts.get('node_label_opts', {}))
        p.add_layout(bokeh.models.LabelSet(source=nodes_source, x='x', y='y', text=node_label, **label_opts))

    edge_label = plot_opts.get('edge_label', None)
    if edge_label is not None and edge_label in edges.keys():
        label_opts = utility.extend({}, DFLT_LABEL_OPTS, plot_opts.get('edge_label_opts', {}))
        p.add_layout(bokeh.models.LabelSet(source=edges_source, x='m_x', y='m_y', text=edge_label, **label_opts))

    handle = bokeh.plotting.show(p, notebook_handle=True)

    return dict(
        handle=handle,
        edges=edges,
        nodes=nodes,
        edges_source=edges_source,
        nodes_source=nodes_source
    )
        def gen_docs(corpus_reader, nlp):

            document_id = 0
            for filename, text, metadata in corpus_reader:

                metadata = utility.extend(
                    metadata, dict(filename=filename, document_id=document_id))

                if len(text) > n_chunk_threshold:
                    spacy_doc = textacy.spacier.utils.make_doc_from_text_chunks(
                        text, lang=nlp, chunk_size=n_chunk_threshold)
                else:
                    spacy_doc = nlp(text)

                if "textacy" not in spacy_doc.user_data:
                    spacy_doc.user_data["textacy"] = {}

                spacy_doc.user_data["textacy"]["metadata"] = metadata

                if document_id == 0:
                    spacy_doc.user_data["textacy"][
                        "spacy_lang_meta"] = nlp.meta

                spacy_doc.tensor = None

                yield spacy_doc

                document_id += 1

                logger.info('%s documents added...size was %s...', document_id,
                            len(spacy_doc))

                tick(document_id)

                spacy_doc = None
def _get_pos_statistics(doc):
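    # NOTE (assumption): 96, 0 and 100 appear to be spaCy's integer POS IDs; the string-based
    # variant of this helper further down filters 'NUM', 'PUNCT' and 'SPACE' instead.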
    pos_iter = (x.pos_ for x in doc if x.pos not in [96, 0, 100])
    pos_counts = dict(collections.Counter(pos_iter))
    stats = utility.extend(dict(POS_TO_COUNT), pos_counts)
    return stats
Example #15
def extract_corpus_terms(corpus, extract_args):
    """ Extracts documents and terms from a corpus
    
    Parameters
    ----------
    corpus : textacy Corpus
        Corpus in textacy format.
        
    extract_args : dict
        Dict that contains args that specifies the filter and transforms
        extract_args['args'] positional arguments for textacy.Doc.to_terms_list
        extract_args['kwargs'] Keyword arguments for textacy.Doc.to_terms_list
        extract_args['extra_stop_words'] List of additional stopwords to use
        extract_args['substitutions'] Dict (map) with term substitution
        extract_args['mask_gpe'] Boolean flag indicating if GPE should be substituted
        extract_args['min_freq'] Integer value specifying min global word count.
        extract_args['max_doc_freq'] Float value between 0 and 1 indicating threshold
          for documentword frequency, Words that occur in more than `max_doc_freq`
          documents will be filtered out.
        
    None
    ----
        extract_args.min_freq and extract_args.min_freq is the same value but used differently
        kwargs.min_freq is passed directly as args to `textacy_doc.to_terms_list`
        tokens below extract_args.min_freq threshold are added to the `extra_stop_words` list
    Returns
    -------
    iterable of documents (which is iterable of terms)
        Documents where terms have ben filtered and transformed according to args.
        
    """

    kwargs = dict(extract_args.get('kwargs', {}))
    args = dict(extract_args.get('args', {}))
    normalize = args.get('normalize', 'lemma')
    substitutions = extract_args.get('substitutions', {})
    extra_stop_words = set(extract_args.get('extra_stop_words', None) or [])
    chunk_size = extract_args.get('chunk_size', None)
    min_length = extract_args.get('min_length', 2)

    mask_gpe = extract_args.get('mask_gpe', False)
    if mask_gpe is True:
        gpe_names = {x: '_gpe_' for x in get_gpe_names(corpus)}
        substitutions = utility.extend(substitutions, gpe_names)

    min_freq = extract_args.get('min_freq', 1)

    if min_freq > 1:
        words = infrequent_words(corpus,
                                 normalize=normalize,
                                 weighting='count',
                                 threshold=min_freq,
                                 as_strings=True)
        extra_stop_words = extra_stop_words.union(words)
        logger.info('Ignoring {} low-frequency words!'.format(len(words)))

    max_doc_freq = extract_args.get('max_doc_freq', 100)

    if max_doc_freq < 100:
        words = frequent_document_words(corpus,
                                        normalize=normalize,
                                        weighting='freq',
                                        dfs_threshold=max_doc_freq,
                                        as_strings=True)
        extra_stop_words = extra_stop_words.union(words)
        logger.info('Ignoring {} high-frequency words!'.format(len(words)))

    extract_args = {
        'args': args,
        'kwargs': kwargs,
        'substitutions': substitutions,
        'extra_stop_words': extra_stop_words,
        'chunk_size': None
    }

    terms = (extract_document_terms(doc, extract_args) for doc in corpus)

    return terms
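# Example extract_args (a sketch based on the keys this function reads; values are illustrative):
example_extract_args = {
    'args': {'normalize': 'lemma'},
    'kwargs': {},                                 # forwarded to textacy.Doc.to_terms_list
    'extra_stop_words': ['article', 'treaty'],    # added to the stopword set
    'substitutions': {'usa': 'united_states'},    # term -> replacement map
    'mask_gpe': False,                            # if True, GPE entities are replaced with '_gpe_'
    'min_freq': 2,                                # words below this global count become stopwords
}
# terms = extract_corpus_terms(corpus, example_extract_args)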
Example #16
def display_party_network(parties=None,
                          period_group_index=0,
                          treaty_filter='',
                          plot_data=None,
                          topic_group=None,
                          recode_is_cultural=False,
                          layout_algorithm='',
                          C=1.0,
                          K=0.10,
                          p1=0.10,
                          output='network',
                          party_name='party',
                          node_size_range=[40, 60],
                          palette_name=None,
                          width=900,
                          height=900,
                          node_size=None,
                          node_partition=None,
                          wti_index=None,
                          year_limit=None,
                          progress=utility.noop,
                          done_callback=None):
    try:

        if output == 'print_args':
            args = utility.filter_dict(locals(), ['progress', 'done_callback'],
                                       filter_out=True)
            args['wti_index'] = None
            args['plot_data'] = None
            args['output'] = 'network'
            pp(args)
            return

        plot_data = plot_data or utility.SimpleStruct(handle=None,
                                                      nodes=None,
                                                      edges=None,
                                                      slice_range_type=2,
                                                      slice_range=year_limit)
        weight_threshold = 0.0

        palette = get_palette(palette_name)

        progress(1)
        period_group = config.DEFAULT_PERIOD_GROUPS[period_group_index]
        kwargs = dict(period_group=period_group,
                      treaty_filter=treaty_filter,
                      recode_is_cultural=recode_is_cultural,
                      year_limit=year_limit)
        parties = list(parties)
        party_data = wti_index.get_party_network(party_name, topic_group,
                                                 parties, **kwargs)

        if party_data is None or party_data.shape[0] == 0:
            print('No data for selection')
            return

        if topic_group is not None:

            party_data = party_data.loc[(party_data.topic.isin(
                topic_group.keys()))]

            group_keys = topic_group.keys()
            line_palette = color_utility.DEFAULT_LINE_PALETTE
            line_palette_map = {
                k: i % len(line_palette)
                for i, k in enumerate(group_keys)
            }
            party_data['line_color'] = party_data.category.apply(
                lambda x: line_palette[line_palette_map[x]])

        else:
            party_data['category'] = party_data.topic

        party_data['edge_label'] = party_data.signed.apply(
            lambda x: x.year).astype(str) + '/' + party_data.category

        progress(2)

        #if not multigraph:
        #    data = data.groupby(['party', 'party_other']).size().reset_index().rename(columns={0: 'weight'})

        if party_data is None or party_data.shape[0] == 0:
            print('No data for selection')
            return

        G = create_party_network(party_data, K, node_partition,
                                 palette)  #, multigraph)

        progress(3)

        if output == 'network':

            if weight_threshold > 0:
                G = get_sub_network(G, weight_threshold)

            layout, _ = layout_network(G, layout_algorithm,
                                       **dict(scale=1.0, K=K, C=C, p=p1))

            progress(4)

            edges = get_positioned_edges2(G, layout, sort_attr='signed')
            nodes = get_positioned_nodes(G, layout)

            edges = {k: list(edges[k]) for k in edges}
            nodes = {k: list(nodes[k]) for k in nodes}

            node_size = setup_node_size(nodes, node_size, node_size_range)

            x_offset, y_offset = adjust_node_label_offset(nodes, node_size)

            plot_opts = utility.extend(NETWORK_PLOT_OPTS,
                                       figsize=(width, height),
                                       node_size=node_size,
                                       node_label_opts=dict(y_offset=y_offset,
                                                            x_offset=x_offset),
                                       edge_label_opts={})

            progress(5)

            data = plot_network(nodes=nodes,
                                edges=edges,
                                node_label='name',
                                edge_label='edge_label',
                                **plot_opts)

            plot_data.update(**data)

            progress(6)

            #bp.show(p)

            if done_callback is not None:
                done_callback(None)

        elif output == 'table':
            party_data.columns = [
                dict(party='source', party_other='target').get(x, x)
                for x in party_data.columns
            ]
            display(party_data)

    except Exception as ex:
        logger.error(ex)
        raise
    finally:
        progress(0)
def _get_pos_statistics(doc):
    pos_iter = (x.pos_ for x in doc if x.pos_ not in ['NUM', 'PUNCT', 'SPACE'])
    pos_counts = dict(collections.Counter(pos_iter))
    stats = utility.extend(dict(POS_TO_COUNT), pos_counts)
    return stats
Example #18
def compute(corpus, tick=utility.noop, method='sklearn_lda', vec_args=None, term_args=None, tm_args=None, **args):
    
    tick()
    
    vec_args = utility.extend({}, DEFAULT_VECTORIZE_PARAMS, vec_args or {})
    
    terms = [ list(doc) for doc in textacy_utility.extract_corpus_terms(corpus, term_args) ]
    fx_terms = lambda: terms # [ doc for doc in textacy_utility.extract_corpus_terms(corpus, term_args) ]
            
    perplexity_score = None
    coherence_score = None
    vectorizer = None
    doc_topic_matrix = None
    doc_term_matrix = None
    
    documents = textacy_utility.get_corpus_documents(corpus)

    if method.startswith('sklearn'):
        
        vectorizer = textacy.Vectorizer(**vec_args)
        doc_term_matrix = vectorizer.fit_transform(fx_terms())

        model = textacy.TopicModel(method.split('_')[1], **tm_args)
        model.fit(doc_term_matrix)
        
        tick()
        
        doc_topic_matrix = model.transform(doc_term_matrix)
        
        tick()
        
        id2word = vectorizer.id_to_term
        bow_corpus = gensim.matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False)
        
        # FIXME!!!
        perplexity_score = None
        coherence_score = None
        
    elif method.startswith('gensim_'):
        
        algorithm = method.split('_')[1].upper()
        
        id2word = gensim.corpora.Dictionary(fx_terms())
        bow_corpus = [ id2word.doc2bow(tokens) for tokens in fx_terms() ]
        
        if args.get('tfidf_weiging', False):
            # assert algorithm != 'MALLETLDA', 'MALLET training model cannot (currently) use TFIDF weighed corpus'
            tfidf_model = gensim.models.tfidfmodel.TfidfModel(bow_corpus)
            bow_corpus = [ tfidf_model[d] for d in bow_corpus ]
        
        algorithms = setup_gensim_algorithms(corpus, bow_corpus, id2word, tm_args)
        
        engine = algorithms[algorithm]['engine']
        engine_options = algorithms[algorithm]['options']
        
        model = engine(**engine_options)
        
        if hasattr(model, 'log_perplexity'):
            perplexity_score = model.log_perplexity(bow_corpus, len(bow_corpus))
        
        try:
            coherence_model_lda =  gensim.models.CoherenceModel(model=model, texts=fx_terms(), dictionary=id2word, coherence='c_v')
            coherence_score = coherence_model_lda.get_coherence()
        except Exception as ex:
            logger.error(ex)
            coherence_score = None
            
    processed = topic_model_utility.compile_metadata(
        model,
        bow_corpus,
        id2word,
        documents,
        vectorizer=vectorizer,
        doc_topic_matrix=doc_topic_matrix,
        n_tokens=200
    )
    
    model_data = types.SimpleNamespace(
        topic_model=model,
        id2term=id2word,
        bow_corpus=bow_corpus,
        doc_term_matrix=doc_term_matrix,
        #doc_topic_matrix=doc_topic_matrix,
        #vectorizer=vectorizer,
        processed=processed,
        perplexity_score=perplexity_score,
        coherence_score=coherence_score,
        options=dict(method=method, vec_args=vec_args, term_args=term_args, tm_args=tm_args, **args),
        coherence_scores=None
    )
    
    tick(0)
    
    return model_data
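# Usage sketch (assumed argument values; DEFAULT_VECTORIZE_PARAMS, setup_gensim_algorithms
# and related helpers come from the surrounding module):
# model_data = compute(corpus,
#                      method='gensim_lda',
#                      term_args={'args': {'normalize': 'lemma'}, 'kwargs': {}},
#                      tm_args={'num_topics': 50, 'passes': 3})
# model_data.topic_model, model_data.coherence_score, ...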
    def test_zip_archive(self):

        filename = '.\\temp\\daedalus_articles_pos_xml_1931-2017.zip'
        filepath = join_test_data_path(filename)
        source = ZipReader(filepath, '*.xml')
        pos_delimiter = "_"
        #for document, tokens in source:
        #    print("{}: {}".format(document, len(tokens)))

        import collections
        import pandas as pd
        import re

        stream = SparvCorpusSourceReader(
            source=source,
            transforms=[ lambda tokens: [ x.lower()  for x in tokens] ],
            postags="''",
            chunk_size=None,
            lemmatize=False,
            append_pos=pos_delimiter,
            ignores=""
        )
        pos_tags = {

            'AB': 0, #  'Adverb',
            'DT': 0, #  'Determiner',
            'HA': 0, #  'WH-adverb',
            'HD': 0, #  'WH-determiner',
            'HP': 0, #  'WH-pronoun',
            'HS': 0, #  'WH-possessive',
            'IE': 0, #  'Infinitival marker',
            'IN': 0, #  'Interjection',
            'JJ': 0, #  'Adjective',
            'KN': 0, #  'Coordinating conjunction',
            'NN': 0, #  'Noun',
            'PC': 0, #  'Participle',
            'PL': 0, #  'Particle',
            'PM': 0, #  'Proper Noun',
            'PN': 0, #  'Pronoun',
            'PP': 0, #  'Preposition',
            'PS': 0, #  'Possessive pronoun',
            'RG': 0, #  'Cardinal number',
            'RO': 0, #  'Ordinal number',
            'SN': 0, #  'Subordinating conjunction',
            'VB': 0, #  'Verb',
            'UO': 0, #  'Foreign word',

            'MAD': 0, #  'Major delimiter',
            'MID': 0, #  'Minor delimiter',
            'PAD': 0, #  'Pairwise delimiter',
            '???': 0
        }

        pos_statistics = []
        pos_total_counter = collections.Counter()
        for document, tokens in stream:

            counter = collections.Counter([ x.split(pos_delimiter)[-1].upper() if pos_delimiter in x else '???' for x in tokens ])
            pos_total_counter.update(counter)

            counter_dict = dict(counter)

            pos_counts = extend(pos_tags, { k: v for k, v in counter_dict.items() if k in pos_tags.keys() })
            other_counts = [ k for k in counter_dict.keys() if k not in pos_tags.keys() ]

            if len(other_counts) > 0:
                logger.warning('Strange PoS tags: file %s, tags %s', document, other_counts)

            pos_statistics.append(extend(pos_counts, filename=document))

        #v = {k: [dic[k] for dic in LD] for k in LD[0]}
        df = pd.DataFrame(pos_statistics)
        df['year'] = df.filename.apply(lambda x: int(re.search(r'daedalus_volume_(\d{4})', x).group(1)))
        df['article'] = df.filename.apply(lambda x: int(re.search(r'article_(\d{2})', x).group(1)))
        df['segment'] = df.filename.apply(lambda x: int(re.search(r'(\d{2})\.txt', x).group(1)))
        df.to_excel("stats.xlsx")
def plot(  # pylint: disable=W0102
        network,
        layout,
        scale=1.0,
        threshold=0.0,
        node_description=None,
        node_size=5,
        node_size_range=[20, 40],
        weight_scale=5.0,
        normalize_weights=True,
        node_opts=None,
        line_opts=None,
        text_opts=None,
        element_id='nx_id3',
        figsize=(900, 900),
        tools=None,
        palette=DFLT_PALETTE,
        **figkwargs):
    if threshold > 0:
        network = get_sub_network(network, threshold)

    edges = networkx_utility.get_positioned_edges(network, layout)

    if normalize_weights and 'weight' in edges.keys():
        max_weight = max(edges['weight'])
        edges['weight'] = [float(x) / max_weight for x in edges['weight']]

    if weight_scale != 1.0 and 'weight' in edges.keys():
        edges['weight'] = [weight_scale * float(x) for x in edges['weight']]

    # keep only the plotting columns and rename 'weight' to 'weights' (used as line_width below)
    edges = dict(source=edges['source'], target=edges['target'],
                 xs=edges['xs'], ys=edges['ys'], weights=edges['weight'])

    nodes = networkx_utility.get_positioned_nodes(network, layout)

    #node_size = setup_node_size(nodes, node_size, node_size_range)
    if node_size in nodes.keys() and node_size_range is not None:
        nodes['clamped_size'] = utility.clamp_values(nodes[node_size],
                                                     node_size_range)
        node_size = 'clamped_size'

    label_y_offset = 'y_offset' if node_size in nodes.keys() else node_size + 8
    if label_y_offset == 'y_offset':
        nodes['y_offset'] = [
            y + r
            for (y,
                 r) in zip(nodes['y'], [r / 2.0 + 8 for r in nodes[node_size]])
        ]

    edges = {k: list(edges[k]) for k in edges}
    nodes = {k: list(nodes[k]) for k in nodes}

    edges_source = bokeh.models.ColumnDataSource(edges)
    nodes_source = bokeh.models.ColumnDataSource(nodes)

    node_opts = utility.extend(DFLT_NODE_OPTS, node_opts or {})
    line_opts = utility.extend(DFLT_EDGE_OPTS, line_opts or {})

    p = figure(plot_width=figsize[0],
               plot_height=figsize[1],
               tools=tools or TOOLS,
               **figkwargs)

    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None

    _ = p.multi_line('xs',
                     'ys',
                     line_width='weights',
                     source=edges_source,
                     **line_opts)
    r_nodes = p.circle('x',
                       'y',
                       size=node_size,
                       source=nodes_source,
                       **node_opts)

    if 'fill_color' in nodes.keys():
        r_nodes.glyph.fill_color = 'fill_color'

    if node_description is not None:
        text_source = ColumnDataSource(
            dict(text_id=node_description.index, text=node_description))
        p.add_tools(
            bokeh.models.HoverTool(
                renderers=[r_nodes],
                tooltips=None,
                callback=widgets_config.glyph_hover_callback(
                    nodes_source,
                    'node_id',
                    text_source,
                    element_id=element_id)))

    label_opts = utility.extend(
        DFLT_TEXT_OPTS,
        dict(y_offset=label_y_offset,
             text_color='black',
             text_baseline='bottom'), text_opts or {})

    p.add_layout(bokeh.models.LabelSet(source=nodes_source, **label_opts))

    return p
Example #21
                },
                {
                    'engine_name': 'LdaMallet',
                    'engine_option': {
                        'iterations': 2000,
                        'passes': 3,
                        'engine_path': mallet_path
                    }
                }
            ]
        }
    ]

    for run_option in run_options:

        option = utility.extend(dict(DEFAULT_OPT), dict(run_option))

        if option.get('skip', False) is True:
            continue

        n_topics = to_sequence(run_option['num_topics'])

        engines = to_sequence(run_option['engines'])

        for engine in engines:

            print("engine: {}".format(engine['engine_name']))

            for n_topic in n_topics:

                option['engine_option'] = utility.extend(engine['engine_option'], dict(num_topics=n_topic))