def plot_network(
        nodes,
        edges,
        node_description=None,
        node_size=5,
        node_opts=None,
        line_opts=None,
        element_id='nx_id3',
        figsize=(900, 900),
        node_label='name',
        node_label_opts=None,
        edge_label='name',
        edge_label_opts=None,
        tools=None,
        **figkwargs):

    edges_source = ColumnDataSource(edges)
    nodes_source = ColumnDataSource(nodes)

    node_opts = extend(DFLT_NODE_OPTS, node_opts or {})
    line_opts = extend(DFLT_EDGE_OPTS, line_opts or {})

    p = figure(plot_width=figsize[0], plot_height=figsize[1], tools=tools or TOOLS, **figkwargs)
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None

    if 'line_color' in edges.keys():
        line_opts = extend(line_opts, {'line_color': 'line_color', 'alpha': 1.0})

    _ = p.multi_line('xs', 'ys', line_width='weight', source=edges_source, **line_opts)

    r_nodes = p.circle('x', 'y', size=node_size, source=nodes_source, **node_opts)

    if 'fill_color' in nodes.keys():
        r_nodes.glyph.fill_color = 'fill_color'

    if node_description is not None:
        p.add_tools(HoverTool(
            renderers=[r_nodes],
            tooltips=None,
            callback=WidgetUtility.glyph_hover_callback(
                nodes_source, 'node_id',
                text_ids=node_description.index,
                text=node_description,
                element_id=element_id)))

    if node_label is not None and node_label in nodes.keys():
        label_opts = extend({}, DFLT_LABEL_OPTS, node_label_opts or {})
        p.add_layout(LabelSet(source=nodes_source, x='x', y='y', text=node_label, **label_opts))

    if edge_label is not None and edge_label in edges.keys():
        label_opts = extend({}, DFLT_LABEL_OPTS, edge_label_opts or {})
        p.add_layout(LabelSet(source=edges_source, x='m_x', y='m_y', text=edge_label, **label_opts))

    handle = show(p, notebook_handle=True)

    return dict(
        handle=handle,
        edges=edges,
        nodes=nodes,
        edges_source=edges_source,
        nodes_source=nodes_source
    )
def get_corpus_documents(corpus):
    metadata = [utility.extend({}, doc._.meta, _get_pos_statistics(doc)) for doc in corpus]
    df = pd.DataFrame(metadata)[['treaty_id', 'filename', 'signed_year', 'party1', 'party2'] + POS_NAMES]
    df['title'] = df.treaty_id
    df['lang'] = df.filename.str.extract(r'\w{4,6}\_(\w\w)')
    df['words'] = df[POS_NAMES].apply(sum, axis=1)
    return df
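# Usage sketch (hypothetical, for illustration only): the `lang` column above is
# extracted from filenames of the form '<4-6 word chars>_<two-letter language code>',
# e.g. 'abcdef_en.txt' -> 'en'. The filenames below are made up.
def _example_lang_extract():
    import pandas as pd
    filenames = pd.Series(['abcdef_en.txt', 'abcd_fr.txt'])
    return filenames.str.extract(r'\w{4,6}\_(\w\w)')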
def create_textacy_corpus(corpus_reader, nlp, tick=utility.noop, n_chunk_threshold=100000):
    corpus = textacy.Corpus(nlp)
    counter = 0
    for filename, document_id, text, metadata in corpus_reader:
        metadata = utility.extend(metadata, dict(filename=filename, document_id=document_id))
        if len(text) > n_chunk_threshold:
            doc = textacy.spacier.utils.make_doc_from_text_chunks(text, lang=nlp, chunk_size=n_chunk_threshold)
            corpus.add_doc(doc)
            doc._.meta = metadata
        else:
            corpus.add((text, metadata))
        counter += 1
        if counter % 100 == 0:
            logger.info('%s documents added...', counter)
            tick(counter)
    logger.info('Done! %s documents added!', counter)
    return corpus
def create_textacy_corpus(corpus_reader, nlp, tick=utility.noop, strip_tensor=True):
    logger.info('creating corpus (this might take some time)...')
    batch_size = 100
    corpus = textacy.Corpus(nlp)
    document_id = 0
    n_chunk_threshold = 50000
    for filename, text, metadata in corpus_reader:
        metadata = utility.extend(metadata, dict(filename=filename, document_id=document_id))
        if len(text) > n_chunk_threshold:
            spacy_doc = textacy.spacier.utils.make_doc_from_text_chunks(text, lang=nlp, chunk_size=n_chunk_threshold)
            corpus.add_doc(spacy_doc, metadata)
        else:
            corpus.add_text(text, metadata)
        if strip_tensor:
            # note: this re-strips the tensor of every document added so far on
            # each iteration; only the most recently added doc strictly needs it
            for doc in corpus:
                doc.spacy_doc.tensor = None
        document_id += 1
        if document_id % batch_size == 0:
            logger.info('%s documents added...', document_id)
            tick(document_id)
    return corpus
def create_textacy_corpus(documents, nlp, tick=utility.noop):
    corpus = textacy.Corpus(nlp)
    for filename, text, metadata in documents:
        corpus.add_text(text, utility.extend(dict(filename=filename), metadata))
        tick()
    return corpus
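# Usage sketch (hypothetical): feeding the simple create_textacy_corpus variant
# above with an in-memory document stream. Assumes a spaCy model is installed;
# filenames, texts and metadata keys are made up.
def _example_create_corpus():
    import spacy
    nlp = spacy.load('en_core_web_sm')
    documents = [
        ('doc_01.txt', 'A first example text.', {'year': 1931}),
        ('doc_02.txt', 'A second example text.', {'year': 1932}),
    ]
    return create_textacy_corpus(documents, nlp)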
def create_float_slider(self, description, **args):
    # defaults below (including max=0.0) are placeholders; callers are expected
    # to override min/max/value via **args
    args = utility.extend(
        dict(min=0.0, max=0.0, step=0.1, value=0.0, disabled=False, continuous_update=False),
        args)
    return widgets.FloatSlider(description=description, **args)
def create_select_widget(self, label='', values=None, default=None, **kwargs):
    values = values or []
    value = default if default is not None and default in values \
        else values[0] if len(values) > 0 else None
    opts = dict(options=values, value=value, description=label, disabled=False)
    opts = utility.extend(opts, kwargs)
    return widgets.Dropdown(**opts)
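# Usage sketch (hypothetical): the two widget factories above never touch `self`,
# so for illustration they can be called with `None` standing in for the host
# object. Labels, ranges and values are made up.
def _example_widgets():
    slider = create_float_slider(None, 'Threshold', min=0.0, max=1.0, value=0.5)
    dropdown = create_select_widget(None, 'Language', values=['en', 'fr'], default='en')
    return slider, dropdown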
def generate(self, stream, column_functions=None):
    pos_tags = {k: 0 for k in self.tag_set.keys()}
    pos_delimiter = stream.append_pos
    pos_statistics = []
    pos_total_counter = collections.Counter()
    for document, tokens in stream:
        counter = collections.Counter([
            x.split(pos_delimiter)[-1].upper() if pos_delimiter in x else '???'
            for x in tokens
        ])
        pos_total_counter.update(counter)
        counter_dict = dict(counter)
        pos_counts = extend(pos_tags, {k: v for k, v in counter_dict.items() if k in pos_tags.keys()})
        other_counts = [k for k in counter_dict.keys() if k not in pos_tags.keys() and k != '']
        if len(other_counts) > 0:
            logger.warning('Warning strange PoS tags: File %s, tags %s', document, other_counts)
        pos_statistics.append(extend(pos_counts, filename=document))
    df = pd.DataFrame(pos_statistics)
    for (column_name, column_function) in (column_functions or []):
        df[column_name] = column_function(df)
    return df
def get_positioned_edges(network, layout, sort_attr=None):
    """Extracts network edge attributes, assigns coordinates to the endpoints,
    and computes the midpoint coordinate of each edge.

    Parameters
    ----------
    network : nx.Graph
        The networkx graph.
    layout : dict of node, point pairs i.e. (node, [x, y])
        A dictionary that contains coordinates for all nodes.
    sort_attr : str, optional
        Edge attribute to sort the resulting list by.

    Returns
    -------
    list of dicts i.e. {
        source: source node,
        target: target node,
        xs: [x1, x2],
        ys: [y1, y2],
        m_x: (x1 + x2) / 2,
        m_y: (y1 + y2) / 2,
        attr-1: value of attr-1,
        ...
        attr-n: value of attr-n
    }
    where (x1, y1) is the source node's coordinate, (x2, y2) the target
    node's coordinate, and (m_x, m_y) the midpoint coordinate.
    """
    list_of_dicts = [
        extend(
            dict(
                source=u,
                target=v,
                xs=[layout[u][0], layout[v][0]],
                ys=[layout[u][1], layout[v][1]],
                # midpoints are scalars: one label position per edge
                m_x=(layout[u][0] + layout[v][0]) / 2.0,
                m_y=(layout[u][1] + layout[v][1]) / 2.0),
            d)
        for u, v, d in network.edges(data=True)
    ]
    if sort_attr is not None:
        list_of_dicts.sort(key=lambda x: x[sort_attr])
    return list_of_dicts
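# Usage sketch (hypothetical): positioning the single edge of a two-node graph.
# The layout dict maps each node to an [x, y] coordinate as the function above
# expects; the 'weight' and 'signed' attributes are made up to show sort_attr.
def _example_positioned_edges():
    import networkx as nx
    g = nx.Graph()
    g.add_edge('a', 'b', weight=1.0, signed=1931)
    layout = {'a': [0.0, 0.0], 'b': [2.0, 2.0]}
    return get_positioned_edges(g, layout, sort_attr='signed')
    # -> [{'source': 'a', 'target': 'b', 'xs': [0.0, 2.0], 'ys': [0.0, 2.0],
    #      'm_x': 1.0, 'm_y': 1.0, 'weight': 1.0, 'signed': 1931}]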
def get_corpus_data(corpus, document_index, title, columns_of_interest=None):
    metadata = [
        utility.extend({}, dict(document_id=doc._.meta['document_id']), _get_pos_statistics(doc))
        for doc in corpus
    ]
    df = pd.DataFrame(metadata)[['document_id'] + POS_NAMES]
    if columns_of_interest is not None:
        document_index = document_index[columns_of_interest]
    df = pd.merge(df, document_index, left_on='document_id', right_index=True, how='inner')
    df['title'] = df[title]
    df['words'] = df[POS_NAMES].apply(sum, axis=1)
    return df
def generate_field_filters(documents, opts):
    filters = []
    for opt in opts:  # if opt['type'] == 'multiselect':
        options = opt.get(
            'options',
            _get_field_values(documents, opt['field'], as_tuple=True, query=opt.get('query', None)))
        description = opt.get('description', '')
        rows = min(4, len(options))
        gf = utility.extend(opt, widget=widgets_config.selectmultiple(description, options, value=(), rows=rows))
        filters.append(gf)
    return filters
def plot_network(nodes, edges, plot_opts, fig_opts=None):

    edges_source = bokeh.models.ColumnDataSource(edges)
    nodes_source = bokeh.models.ColumnDataSource(nodes)

    node_opts = utility.extend(DFLT_NODE_OPTS, plot_opts.get('node_opts', {}))
    line_opts = utility.extend(DFLT_EDGE_OPTS, plot_opts.get('line_opts', {}))
    fig_opts = utility.extend(DFLT_FIG_OPTS, fig_opts or {})
    node_size = plot_opts.get('node_size', 1)

    p = figure(**fig_opts)
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None

    if 'line_color' in edges.keys():
        line_opts = utility.extend(line_opts, {'line_color': 'line_color', 'alpha': 1.0})

    _ = p.multi_line('xs', 'ys', line_width='weight', source=edges_source, **line_opts)

    r_nodes = p.circle('x', 'y', size=node_size, source=nodes_source, **node_opts)

    if 'fill_color' in nodes.keys():
        r_nodes.glyph.fill_color = 'fill_color'

    node_description = plot_opts.get('node_description', None)
    if node_description is not None:
        element_id = plot_opts.get('element_id', '_')
        text_source = bokeh.models.ColumnDataSource(dict(text_id=node_description.index, text=node_description))
        p.add_tools(bokeh.models.HoverTool(
            renderers=[r_nodes],
            tooltips=None,
            callback=widgets_config.glyph_hover_callback(
                nodes_source, 'node_id', text_source=text_source, element_id=element_id)))

    node_label = plot_opts.get('node_label', None)
    if node_label is not None and node_label in nodes.keys():
        label_opts = utility.extend({}, DFLT_LABEL_OPTS, plot_opts.get('node_label_opts', {}))
        p.add_layout(bokeh.models.LabelSet(source=nodes_source, x='x', y='y', text=node_label, **label_opts))

    edge_label = plot_opts.get('edge_label', None)
    if edge_label is not None and edge_label in edges.keys():
        label_opts = utility.extend({}, DFLT_LABEL_OPTS, plot_opts.get('edge_label_opts', {}))
        p.add_layout(bokeh.models.LabelSet(source=edges_source, x='m_x', y='m_y', text=edge_label, **label_opts))

    handle = bokeh.plotting.show(p, notebook_handle=True)

    return dict(
        handle=handle,
        edges=edges,
        nodes=nodes,
        edges_source=edges_source,
        nodes_source=nodes_source
    )
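# Usage sketch (hypothetical): minimal columnar data for the plot_opts-based
# plot_network above. Column names follow what the function reads ('x'/'y'/'name'
# for nodes, 'xs'/'ys'/'weight'/'m_x'/'m_y' for edges); all values are made up.
def _example_plot_network():
    nodes = dict(node_id=[0, 1], x=[0.0, 1.0], y=[0.0, 1.0], name=['a', 'b'])
    edges = dict(xs=[[0.0, 1.0]], ys=[[0.0, 1.0]], weight=[1.0], m_x=[0.5], m_y=[0.5])
    return plot_network(nodes, edges, plot_opts=dict(node_size=10, node_label='name'))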
def gen_docs(corpus_reader, nlp, tick=utility.noop, n_chunk_threshold=100000):
    document_id = 0
    for filename, text, metadata in corpus_reader:
        metadata = utility.extend(metadata, dict(filename=filename, document_id=document_id))
        if len(text) > n_chunk_threshold:
            spacy_doc = textacy.spacier.utils.make_doc_from_text_chunks(text, lang=nlp, chunk_size=n_chunk_threshold)
        else:
            spacy_doc = nlp(text)
        if "textacy" not in spacy_doc.user_data:
            spacy_doc.user_data["textacy"] = {}
        spacy_doc.user_data["textacy"]["metadata"] = metadata
        if document_id == 0:
            spacy_doc.user_data["textacy"]["spacy_lang_meta"] = nlp.meta
        spacy_doc.tensor = None
        yield spacy_doc
        document_id += 1
        logger.info('%s documents added...size was %s...', document_id, len(spacy_doc))
        tick(document_id)
        spacy_doc = None
def _get_pos_statistics(doc):
    # filter out POS ids 96, 0, 100 (spaCy symbol ids; presumably the same
    # NUM/PUNCT/SPACE filter as the string-based variant of this function)
    pos_iter = (x.pos_ for x in doc if x.pos not in [96, 0, 100])
    pos_counts = dict(collections.Counter(pos_iter))
    stats = utility.extend(dict(POS_TO_COUNT), pos_counts)
    return stats
def extract_corpus_terms(corpus, extract_args):
    """ Extracts documents and terms from a corpus

    Parameters
    ----------
    corpus : textacy Corpus
        Corpus in textacy format.

    extract_args : dict
        Dict that contains args that specify the filters and transforms
        extract_args['args'] positional arguments for textacy.Doc.to_terms_list
        extract_args['kwargs'] keyword arguments for textacy.Doc.to_terms_list
        extract_args['extra_stop_words'] list of additional stopwords to use
        extract_args['substitutions'] dict (map) with term substitutions
        extract_args['mask_gpe'] boolean flag indicating if GPE should be substituted
        extract_args['min_freq'] integer value specifying min global word count
        extract_args['max_doc_freq'] document frequency threshold; words that
            occur in more than `max_doc_freq` documents will be filtered out

    Notes
    -----
    extract_args['min_freq'] and kwargs['min_freq'] hold the same value but are
    used differently: kwargs['min_freq'] is passed directly to
    `textacy_doc.to_terms_list`, while tokens below extract_args['min_freq']
    are added to the `extra_stop_words` list.

    Returns
    -------
    iterable of documents (each an iterable of terms)
        Documents where terms have been filtered and transformed according to args.

    """
    kwargs = dict(extract_args.get('kwargs', {}))
    args = dict(extract_args.get('args', {}))
    normalize = args.get('normalize', 'lemma')
    substitutions = extract_args.get('substitutions', {})
    extra_stop_words = set(extract_args.get('extra_stop_words', None) or [])
    chunk_size = extract_args.get('chunk_size', None)
    min_length = extract_args.get('min_length', 2)

    mask_gpe = extract_args.get('mask_gpe', False)
    if mask_gpe is True:
        gpe_names = {x: '_gpe_' for x in get_gpe_names(corpus)}
        substitutions = utility.extend(substitutions, gpe_names)

    min_freq = extract_args.get('min_freq', 1)
    if min_freq > 1:
        words = infrequent_words(corpus, normalize=normalize, weighting='count', threshold=min_freq, as_strings=True)
        extra_stop_words = extra_stop_words.union(words)
        logger.info('Ignoring {} low-frequent words!'.format(len(words)))

    max_doc_freq = extract_args.get('max_doc_freq', 100)
    if max_doc_freq < 100:
        words = frequent_document_words(corpus, normalize=normalize, weighting='freq', dfs_threshold=max_doc_freq, as_strings=True)
        extra_stop_words = extra_stop_words.union(words)
        logger.info('Ignoring {} high-frequent words!'.format(len(words)))

    extract_args = {
        'args': args,
        'kwargs': kwargs,
        'substitutions': substitutions,
        'extra_stop_words': extra_stop_words,
        'chunk_size': None
    }

    terms = (extract_document_terms(doc, extract_args) for doc in corpus)

    return terms
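# Usage sketch (hypothetical): a typical extract_args dict for the function
# above. The kwargs are forwarded to textacy's to_terms_list; the thresholds
# and word lists are illustrative only.
def _example_extract_args():
    return {
        'args': {'normalize': 'lemma'},
        'kwargs': {'ngrams': 1, 'named_entities': True, 'as_strings': True},
        'extra_stop_words': ['shall', 'whereas'],
        'substitutions': {'usa': 'united_states'},
        'mask_gpe': False,
        'min_freq': 2,       # words occurring only once globally become stopwords
        'max_doc_freq': 80,  # values below 100 activate the document-frequency filter
    }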
def display_party_network(parties=None, period_group_index=0, treaty_filter='', plot_data=None,
                          topic_group=None, recode_is_cultural=False, layout_algorithm='',
                          C=1.0, K=0.10, p1=0.10, output='network', party_name='party',
                          node_size_range=[40, 60], palette_name=None, width=900, height=900,
                          node_size=None, node_partition=None, wti_index=None, year_limit=None,
                          progress=utility.noop, done_callback=None):
    try:
        if output == 'print_args':
            args = utility.filter_dict(locals(), ['progress', 'done_callback'], filter_out=True)
            args['wti_index'] = None
            args['plot_data'] = None
            args['output'] = 'network'
            pp(args)
            return

        plot_data = plot_data or utility.SimpleStruct(
            handle=None, nodes=None, edges=None, slice_range_type=2, slice_range=year_limit)

        weight_threshold = 0.0
        palette = get_palette(palette_name)

        progress(1)

        period_group = config.DEFAULT_PERIOD_GROUPS[period_group_index]
        kwargs = dict(period_group=period_group, treaty_filter=treaty_filter,
                      recode_is_cultural=recode_is_cultural, year_limit=year_limit)

        parties = list(parties)
        party_data = wti_index.get_party_network(party_name, topic_group, parties, **kwargs)

        if party_data is None or party_data.shape[0] == 0:
            print('No data for selection')
            return

        if topic_group is not None:
            party_data = party_data.loc[(party_data.topic.isin(topic_group.keys()))]
            group_keys = topic_group.keys()
            line_palette = color_utility.DEFAULT_LINE_PALETTE
            line_palette_map = {k: i % len(line_palette) for i, k in enumerate(group_keys)}
            party_data['line_color'] = party_data.category.apply(lambda x: line_palette[line_palette_map[x]])
        else:
            party_data['category'] = party_data.topic

        party_data['edge_label'] = party_data.signed.apply(lambda x: x.year).astype(str) + '/' + party_data.category

        progress(2)

        #if not multigraph:
        #    data = data.groupby(['party', 'party_other']).size().reset_index().rename(columns={0: 'weight'})

        if party_data is None or party_data.shape[0] == 0:
            print('No data for selection')
            return

        G = create_party_network(party_data, K, node_partition, palette)  #, multigraph)

        progress(3)

        if output == 'network':

            if weight_threshold > 0:
                G = get_sub_network(G, weight_threshold)

            layout, _ = layout_network(G, layout_algorithm, **dict(scale=1.0, K=K, C=C, p=p1))

            progress(4)

            edges = get_positioned_edges2(G, layout, sort_attr='signed')
            nodes = get_positioned_nodes(G, layout)

            edges = {k: list(edges[k]) for k in edges}
            nodes = {k: list(nodes[k]) for k in nodes}

            node_size = setup_node_size(nodes, node_size, node_size_range)
            x_offset, y_offset = adjust_node_label_offset(nodes, node_size)

            plot_opts = utility.extend(
                NETWORK_PLOT_OPTS,
                figsize=(width, height),
                node_size=node_size,
                node_label_opts=dict(y_offset=y_offset, x_offset=x_offset),
                edge_label_opts={})

            progress(5)

            data = plot_network(nodes=nodes, edges=edges, node_label='name', edge_label='edge_label', **plot_opts)

            plot_data.update(**data)

            progress(6)

            #bp.show(p)

            if done_callback is not None:
                done_callback(None)

        elif output == 'table':
            party_data.columns = [dict(party='source', party_other='target').get(x, x) for x in party_data.columns]
            display(party_data)

    except Exception as ex:
        logger.error(ex)
        raise
    finally:
        progress(0)
def _get_pos_statistics(doc):
    pos_iter = (x.pos_ for x in doc if x.pos_ not in ['NUM', 'PUNCT', 'SPACE'])
    pos_counts = dict(collections.Counter(pos_iter))
    stats = utility.extend(dict(POS_TO_COUNT), pos_counts)
    return stats
def compute(corpus, tick=utility.noop, method='sklearn_lda', vec_args=None, term_args=None, tm_args=None, **args):

    tick()

    vec_args = utility.extend({}, DEFAULT_VECTORIZE_PARAMS, vec_args)

    terms = [list(doc) for doc in textacy_utility.extract_corpus_terms(corpus, term_args)]
    fx_terms = lambda: terms  # [ doc for doc in textacy_utility.extract_corpus_terms(corpus, term_args) ]

    perplexity_score = None
    coherence_score = None
    vectorizer = None
    doc_topic_matrix = None
    doc_term_matrix = None

    documents = textacy_utility.get_corpus_documents(corpus)

    if method.startswith('sklearn'):

        vectorizer = textacy.Vectorizer(**vec_args)
        doc_term_matrix = vectorizer.fit_transform(fx_terms())

        model = textacy.TopicModel(method.split('_')[1], **tm_args)
        model.fit(doc_term_matrix)

        tick()

        doc_topic_matrix = model.transform(doc_term_matrix)

        tick()

        id2word = vectorizer.id_to_term
        bow_corpus = gensim.matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False)

        # FIXME!!!
        perplexity_score = None
        coherence_score = None

    elif method.startswith('gensim_'):

        algorithm = method.split('_')[1].upper()

        id2word = gensim.corpora.Dictionary(fx_terms())
        bow_corpus = [id2word.doc2bow(tokens) for tokens in fx_terms()]

        if args.get('tfidf_weiging', False):
            # assert algorithm != 'MALLETLDA', 'MALLET training model cannot (currently) use TFIDF weighed corpus'
            tfidf_model = gensim.models.tfidfmodel.TfidfModel(bow_corpus)
            bow_corpus = [tfidf_model[d] for d in bow_corpus]

        algorithms = setup_gensim_algorithms(corpus, bow_corpus, id2word, tm_args)

        engine = algorithms[algorithm]['engine']
        engine_options = algorithms[algorithm]['options']

        model = engine(**engine_options)

        if hasattr(model, 'log_perplexity'):
            perplexity_score = model.log_perplexity(bow_corpus, len(bow_corpus))

        try:
            coherence_model_lda = gensim.models.CoherenceModel(
                model=model, texts=fx_terms(), dictionary=id2word, coherence='c_v')
            coherence_score = coherence_model_lda.get_coherence()
        except Exception as ex:
            logger.error(ex)
            coherence_score = None

    processed = topic_model_utility.compile_metadata(
        model,
        bow_corpus,
        id2word,
        documents,
        vectorizer=vectorizer,
        doc_topic_matrix=doc_topic_matrix,
        n_tokens=200
    )

    model_data = types.SimpleNamespace(
        topic_model=model,
        id2term=id2word,
        bow_corpus=bow_corpus,
        doc_term_matrix=doc_term_matrix,
        #doc_topic_matrix=doc_topic_matrix,
        #vectorizer=vectorizer,
        processed=processed,
        perplexity_score=perplexity_score,
        coherence_score=coherence_score,
        options=dict(method=method, vec_args=vec_args, term_args=term_args, tm_args=tm_args, **args),
        coherence_scores=None
    )

    tick(0)

    return model_data
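# Usage sketch (hypothetical): invoking compute above with a gensim LDA engine.
# The corpus is a textacy Corpus (e.g. from create_textacy_corpus); all argument
# values are illustrative, and setup_gensim_algorithms is assumed to map 'LDA'
# to a configured gensim engine.
def _example_compute(corpus):
    return compute(
        corpus,
        method='gensim_lda',
        vec_args={},
        term_args={'args': {'normalize': 'lemma'}, 'kwargs': {'as_strings': True}},
        tm_args={'num_topics': 50, 'passes': 3},
    )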
def test_zip_archive(self):
    filename = '.\\temp\\daedalus_articles_pos_xml_1931-2017.zip'
    filepath = join_test_data_path(filename)
    source = ZipReader(filepath, '*.xml')
    pos_delimiter = "_"

    #for document, tokens in source:
    #    print("{}: {}".format(document, len(tokens)))

    import collections
    import pandas as pd
    import re

    stream = SparvCorpusSourceReader(
        source=source,
        transforms=[lambda tokens: [x.lower() for x in tokens]],
        postags="''",
        chunk_size=None,
        lemmatize=False,
        append_pos=pos_delimiter,
        ignores=""
    )

    pos_tags = {
        'AB': 0,   # Adverb
        'DT': 0,   # Determiner
        'HA': 0,   # WH-adverb
        'HD': 0,   # WH-determiner
        'HP': 0,   # WH-pronoun
        'HS': 0,   # WH-possessive
        'IE': 0,   # Infinitival marker
        'IN': 0,   # Interjection
        'JJ': 0,   # Adjective
        'KN': 0,   # Coordinating conjunction
        'NN': 0,   # Noun
        'PC': 0,   # Participle
        'PL': 0,   # Particle
        'PM': 0,   # Proper Noun
        'PN': 0,   # Pronoun
        'PP': 0,   # Preposition
        'PS': 0,   # Possessive pronoun
        'RG': 0,   # Cardinal number
        'RO': 0,   # Ordinal number
        'SN': 0,   # Subordinating conjunction
        'VB': 0,   # Verb
        'UO': 0,   # Foreign word
        'MAD': 0,  # Major delimiter
        'MID': 0,  # Minor delimiter
        'PAD': 0,  # Pairwise delimiter
        '???': 0
    }

    pos_statistics = []
    pos_total_counter = collections.Counter()
    for document, tokens in stream:
        counter = collections.Counter([
            x.split(pos_delimiter)[-1].upper() if pos_delimiter in x else '???'
            for x in tokens
        ])
        pos_total_counter.update(counter)
        counter_dict = dict(counter)
        pos_counts = extend(pos_tags, {k: v for k, v in counter_dict.items() if k in pos_tags.keys()})
        other_counts = [k for k in counter_dict.keys() if k not in pos_tags.keys()]
        if len(other_counts) > 0:
            logger.warning('Warning strange PoS tags: File %s, tags %s', document, other_counts)
        pos_statistics.append(extend(pos_counts, filename=document))

    #v = {k: [dic[k] for dic in LD] for k in LD[0]}
    df = pd.DataFrame(pos_statistics)
    df['year'] = df.filename.apply(lambda x: int(re.search(r'daedalus_volume_(\d{4})', x).group(1)))
    df['article'] = df.filename.apply(lambda x: int(re.search(r'article_(\d{2})', x).group(1)))
    df['segment'] = df.filename.apply(lambda x: int(re.search(r'(\d{2})\.txt', x).group(1)))
    df.to_excel("stats.xlsx")
def plot(  # pylint: disable=W0102
        network,
        layout,
        scale=1.0,
        threshold=0.0,
        node_description=None,
        node_size=5,
        node_size_range=[20, 40],
        weight_scale=5.0,
        normalize_weights=True,
        node_opts=None,
        line_opts=None,
        text_opts=None,
        element_id='nx_id3',
        figsize=(900, 900),
        tools=None,
        palette=DFLT_PALETTE,
        **figkwargs):

    if threshold > 0:
        network = get_sub_network(network, threshold)

    edges = networkx_utility.get_positioned_edges(network, layout)

    if normalize_weights and 'weight' in edges.keys():
        max_weight = max(edges['weight'])
        edges['weight'] = [float(x) / max_weight for x in edges['weight']]

    if weight_scale != 1.0 and 'weight' in edges.keys():
        edges['weight'] = [weight_scale * float(x) for x in edges['weight']]

    # multi_line below reads its line widths from a 'weights' column
    if 'weight' in edges.keys():
        edges['weights'] = edges.pop('weight')

    nodes = networkx_utility.get_positioned_nodes(network, layout)

    #node_size = setup_node_size(nodes, node_size, node_size_range)
    if node_size in nodes.keys() and node_size_range is not None:
        nodes['clamped_size'] = utility.clamp_values(nodes[node_size], node_size_range)
        node_size = 'clamped_size'

    label_y_offset = 'y_offset' if node_size in nodes.keys() else node_size + 8
    if label_y_offset == 'y_offset':
        nodes['y_offset'] = [y + r for (y, r) in zip(nodes['y'], [r / 2.0 + 8 for r in nodes[node_size]])]

    edges = {k: list(edges[k]) for k in edges}
    nodes = {k: list(nodes[k]) for k in nodes}

    edges_source = bokeh.models.ColumnDataSource(edges)
    nodes_source = bokeh.models.ColumnDataSource(nodes)

    node_opts = utility.extend(DFLT_NODE_OPTS, node_opts or {})
    line_opts = utility.extend(DFLT_EDGE_OPTS, line_opts or {})

    p = figure(plot_width=figsize[0], plot_height=figsize[1], tools=tools or TOOLS, **figkwargs)
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None

    _ = p.multi_line('xs', 'ys', line_width='weights', source=edges_source, **line_opts)
    r_nodes = p.circle('x', 'y', size=node_size, source=nodes_source, **node_opts)

    if 'fill_color' in nodes.keys():
        r_nodes.glyph.fill_color = 'fill_color'

    if node_description is not None:
        text_source = ColumnDataSource(dict(text_id=node_description.index, text=node_description))
        p.add_tools(
            bokeh.models.HoverTool(
                renderers=[r_nodes],
                tooltips=None,
                callback=widgets_config.glyph_hover_callback(
                    nodes_source, 'node_id', text_source, element_id=element_id)))

    label_opts = utility.extend(
        DFLT_TEXT_OPTS,
        dict(y_offset=label_y_offset, text_color='black', text_baseline='bottom'),
        text_opts or {})

    p.add_layout(bokeh.models.LabelSet(source=nodes_source, **label_opts))

    return p
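# Usage sketch (hypothetical): rendering a small graph with the plot function
# above. Assumes get_positioned_nodes supplies the columns that DFLT_TEXT_OPTS
# references (e.g. a label column); karate-club edges carry a 'weight'
# attribute, which plot rescales into line widths.
def _example_plot():
    import networkx as nx
    import bokeh.plotting
    g = nx.karate_club_graph()
    layout = nx.spring_layout(g)
    p = plot(g, layout, node_size=10)
    bokeh.plotting.show(p)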
        }, {
            'engine_name': 'LdaMallet',
            'engine_option': {
                'iterations': 2000,
                'passes': 3,
                'engine_path': mallet_path
            }
        }]
    }
]

for run_option in run_options:

    option = utility.extend(dict(DEFAULT_OPT), dict(run_option))

    if option.get('skip', False) is True:
        continue

    n_topics = to_sequence(run_option['num_topics'])
    engines = to_sequence(run_option['engines'])

    for engine in engines:
        print("engine: {}".format(engine['engine_name']))
        for n_topic in n_topics:
            option['engine_option'] = utility.extend(engine['engine_option'], dict(num_topics=n_topic))