Example #1
0
	def __create_neighbor_df( self, query ):
		""" Builds a Data Frame listing the nearest neighbor terms for a query.

		One row is produced for each term in query, followed by a final
		"Combined Query" row holding the neighbors returned when the whole query
		is passed to the embedding at once. self.embed is used without a None
		check, so it must already be loaded when this is called. """
		# look the setting up once, so the column label and every lookup agree
		num_neighbors = config.get("num_neighbors", 10)
		top_label = "Top %d Neighbor Terms" % num_neighbors
		columns = [ "Query Term", top_label ]
		# generate the individual recommendations
		rows = []
		for query_term in query:
			neighbors = self.embed.get_neighbors( query_term, num_neighbors = num_neighbors )
			rows.append( { "Query Term" : query_term, top_label : ", ".join( neighbors ) } )
		# generate the overall combination
		neighbors = self.embed.get_neighbors( query, num_neighbors = num_neighbors )
		rows.append( { "Query Term" : "Combined Query", top_label : ", ".join( neighbors ) } )
		# state the column order explicitly rather than relying on the row dictionaries
		return pd.DataFrame( rows, columns = columns )
Example #2
0
	def generate_termlevel_card_text( self ):
		""" Generates the Markdown text which explains the term-level silhouette
		scores. The number of terms quoted comes from the "top_terms" setting. """
		text = "Individual term-level silhouette scores for the top %d terms in the descriptor of the topic selected below." % ( 
			config.get( "top_terms", 10) )
		# this is one sentence: previously it was ended after "score." and then
		# carried on with " where ...", which read as a broken sentence on the page
		text += " Terms are arranged in descending order by score,"
		text += " where a score close to 1 indicates a term that is semantically coherent with respect to its topic,"
		text += " while a score close to -1 indicates a term that does not fit well with its topic."
		return dcc.Markdown( text )
Example #3
0
	def generate_neighbor_card_text( self ):
		""" Generates the Markdown text for the neighbor card: a summary of the
		embedding taken from self.metadata, followed by usage instructions. """
		neighbor_count = config.get("num_neighbors", 10)
		meta = self.metadata
		# the fragments are joined with no separator, each one carries its own spacing
		fragments = [
			"The word embedding *%s* was generated by the *%s* algorithm on the %s, " % ( 
				meta["id"], meta["algorithm"]["id"], meta["description"] ),
			" where each term is represented by a vector of %d dimensions." % ( meta["dimensions"] ),
			"\n\nEnter one or more query terms and hit return to show a list of the %d most similar neighbor terms for each query term, based on the similarities in the word embedding." % neighbor_count,
			" The final row of the table shows the %d most similar neighbors for the combined set of query terms when considered together." % neighbor_count,
		]
		return dcc.Markdown( "".join( fragments ) )
Example #4
0
	def generate_vtable( self ):
		""" Builds the Dash table of validation scores for the models in
		self.all_metadata, evaluated with the currently selected embedding.
		Returns an empty string if no embedding is selected or no scores
		are available. """
		embed_id = self.current_embed_id
		if embed_id is None:
			return ""
		if embed_id in self.validation_cache:
			# reuse the scores computed earlier for this embedding
			df = self.validation_cache[embed_id]
			log.info("Using cached comparison validation scores for embedding %s" % embed_id )
		else:
			# nothing cached yet, so load the embedding and evaluate
			embed = self.webcore.get_embedding(embed_id)
			if embed is None:
				return ""
			log.info("Performing comparison on %d topic models using %s ..." % (len(self.all_metadata), embed_id) )
			df = self.validator.get_validation_df( self.all_metadata, embed )
			if df is None:
				return ""
			# only the rounded scores are cached
			df = df.round( config.get("precision", 3) )
			self.validation_cache[embed_id] = df
		if df is None:
			return ""
		# use the short measure name as the column heading where one exists
		columns = [
			{
				"name": measure_short_names[col] if col in measure_short_names else col,
				"id": col,
				"deletable": False,
				"selectable": False
			}
			for col in df.columns
		]
		header_style = {
			'backgroundColor': 'white',
			'fontWeight': 'bold',
			'border-top': '1px solid #dee2e6',
			'border-bottom': '2px solid #dee2e6',
			'line-height': 3.1
		}
		cell_style = {
			'textAlign': 'right',
			'border-top': '1px solid #dee2e6',
			'line-height': 3.1
		}
		# the two text columns are left-aligned, everything else is right-aligned
		left_aligned = [
			{ 'if': {'column_id': c}, 'textAlign': 'left' }
			for c in ['Name', 'Corpus']
		]
		return dash_table.DataTable(
			id='validation_model',
			columns=columns,
			data=df.to_dict('records'),
			sort_action='native',
			style_header=header_style,
			style_cell=cell_style,
			style_cell_conditional=left_aligned,
			style_as_list_view=True
		)
Example #5
0
    def __apply_mds(self, D):
        """ Fits a two-component Multidimensional Scaling (MDS) model to D and
        returns the coordinates stored on the fitted model (embedding_). """
        # NOTE(review): D has been described as a distance matrix, yet the model is
        # built with dissimilarity="euclidean" rather than "precomputed" -- confirm
        # that this is the intended setting
        with warnings.catch_warnings():
            # warnings raised while building and fitting the model are suppressed
            warnings.simplefilter("ignore")
            seed = config.get("random_seed", 100)
            model = manifold.MDS(n_components=2, random_state=seed, dissimilarity="euclidean")
            fitted = model.fit(D)
        # only the coordinates are of interest to the caller
        return fitted.embedding_
Example #6
0
	def generate_termlevel_chart( self ):
		""" Generates a horizontal bar chart of the term-level silhouette scores for
		the currently selected topic (self.current_topic_index, counted from 1).
		Returns an empty string if no embedding is selected or the scores cannot
		be computed. """
		if self.current_embed_id is None:
			return ""
		# already cached these results?
		if self.current_embed_id in self.termlevel_cache:
			scores = self.termlevel_cache[self.current_embed_id]
			log.info("Using cached term-level silhouette scores for embedding %s" % self.current_embed_id )
		else:
			# get the word embedding
			embed = self.webcore.get_embedding(self.current_embed_id)
			if embed is None:
				return ""
			log.info("Applying term-level silhouette analysis to topic model using %s ..." % self.current_embed_id )
			scores = self.validator.get_termlevel_silhouette_scores( self.metadata, embed )
			if scores is None:
				return ""
			self.termlevel_cache[self.current_embed_id] = scores
		# create the values for the chart, based on the currently selected topic
		precision = config.get("precision", 3)
		term_scores = pd.Series( scores[self.current_topic_index-1] ).sort_values(ascending=True)
		xvalues, yvalues = [], []
		# Series.items() is used because Series.iteritems() was removed in pandas 2.0
		for term, score in term_scores.items():
			xvalues.append( round( score, precision ) )
			yvalues.append( term )
		# the x-axis always spans the full range from -1 to 1
		min_value, max_value = -1, 1
		# get the color from the palette
		colors =  self.get_colors( self.metadata["k"] )
		s_rgb = self.format_color_string( colors[self.current_topic_index-1] )
		# generate the chart
		return dcc.Graph(
			id='chart_topicsil',
			figure={
				'data': [
					{
						'x': xvalues, 
						'y': yvalues, 
						'type': 'bar',
						'orientation' : 'h',
						'marker' : { 'color': s_rgb, 'opacity': 0.4 },
						'hovertemplate': '<b>%{y}</b>: %{x}<extra></extra>',
						'hoverlabel' : { 'bgcolor' : 'rgb(250, 246, 208)' }
					},
				],
				'layout': 
				{ 
					'margin': { "t" : 30, "l" : 120, "r" : 120 },
					'yaxis' : { 'tickfont' : { "size" : 14 } },
					'xaxis' : { 'title' : "Term Silhouette Score", 
						'tickfont' : { "size" : 14 },
						'titlefont' : { "size" : 15 },
						'range': [min_value, max_value] }
				}
			})
Example #7
0
	def generate_heatmap_card( self ):
		""" Builds the card containing the heatmap description and the heatmap,
		which is initially drawn for the "query_sample" setting. """
		sample_query = config.get("query_sample", "")
		header = dbc.CardHeader("Embedding Term Similarity Heatmap", className="card-header")
		# explanatory text first, then the chart itself
		description = html.Div( self.generate_heatmap_card_text(), className="card-text")
		heatmap = html.Div( self.generate_embed_heatmap( sample_query ), id='content_embed_heatmap')
		body = dbc.CardBody( [ description, heatmap ] )
		return dbc.Card( [ header, body ] )
Example #8
0
	def generate_vsummary( self ):
		""" Generates a Dash table containing overall model-level validation scores,
		i.e. the mean of each known measure over the rows of the validation
		Data Frame. Returns an empty string if no embedding is selected or no
		scores are available. """
		if self.current_embed_id is None:
			return ""
		# already cached these results?
		if self.current_embed_id in self.validation_cache:
			df = self.validation_cache[self.current_embed_id]
			log.info("Using cached validation scores for embedding %s" % self.current_embed_id )
		else:
			# get the word embedding
			embed = self.webcore.get_embedding(self.current_embed_id)
			if embed is None:
				return ""
			# perform the evaluation
			log.info("Evaluating overall topic model using %s ..." % self.current_embed_id )
			df = self.validator.get_validation_df( self.metadata, embed )
			self.validation_cache[self.current_embed_id] = df
		if df is None:
			return ""
		# average each column; numeric_only skips any non-numeric columns, which
		# pandas 2.0 and later refuse to average rather than silently dropping
		df_mean = df.mean( axis = 0, numeric_only = True )
		df_mean = df_mean.round( config.get("precision",3) )
		data = []
		columns = [ "Measure", "Mean Value" ]
		# Series.items() is used because Series.iteritems() was removed in pandas 2.0
		for i, value in df_mean.items():
			# only columns which correspond to a known measure are reported
			if i not in measure_names:
				continue
			label = "%s (%s)" %( measure_names[i], measure_short_names[i] )
			data.append( { "Measure" : label, "Mean Value" : value } )
		# generate the table
		return dash_table.DataTable(
			id='validation_summary',
			columns=[{"name": i, "id": i, "deletable": False, "selectable": False} for i in columns],
			data=data,
			style_cell=
				{
					'textAlign': 'center'
				},
			style_header={
					'backgroundColor': 'white',
					'fontWeight': 'bold',
					'border-bottom': '2px solid #808080'
				},
			style_cell_conditional=[
				{
					'if': {'column_id': c},
					'textAlign': 'left'
				} for c in ['Measure']
			],
			style_as_list_view=True
		)
Example #9
0
	def generate_document_association_chart( self ):
		""" Generates a horizontal bar chart of the documents most strongly associated
		with the currently selected topic (self.current_document_topic_index).
		Returns an empty string if the topic descriptors or the document
		associations are unavailable. """
		descriptors = self.metadata.get_descriptors()
		if descriptors is None:
			return ""
		# the associations are loaded on first use and then kept on the instance
		if self.document_associations is None:
			self.document_associations = self.metadata.get_document_associations()
		if self.document_associations is None:
			return ""
		# get the top documents for this topic
		# NOTE(review): the associations are indexed with the topic index as-is, whereas
		# the colors and descriptors below use index-1 -- presumably the association
		# columns are labelled from 1; confirm against get_document_associations()
		weights = self.document_associations[self.current_document_topic_index].sort_values(ascending=False).head( self.top_associations )
		# the x-axis is scaled to the largest value over all of the associations
		max_value = self.document_associations.max().max()
		# reverse the order
		weights = weights.sort_values(ascending=True)
		precision = config.get("precision", 3)
		xvalues, yvalues = [], []
		# Series.items() is used because Series.iteritems() was removed in pandas 2.0
		for doc_id, score in weights.items():
			xvalues.append( round( score, precision ) )
			# a trailing space is appended to each axis label
			yvalues.append( doc_id + " " )
		# get the color from the palette
		colors = self.get_colors( self.metadata["k"] )
		s_rgb = self.format_color_string( colors[self.current_document_topic_index-1] )
		# generate the chart
		title = "Topic %02d: %s" % ( self.current_document_topic_index, ", ".join( descriptors[self.current_document_topic_index-1] ) )
		chart_height = self.get_barchart_height( len(xvalues) )
		return dcc.Graph(
			id='chart_document_assoc',
			figure={
				'data': [
					{
						'x': xvalues, 
						'y': yvalues, 
						'type': 'bar',
						'orientation' : 'h',
						'marker' : { 'color': s_rgb, 'opacity': 0.4 },
						'hovertemplate': '<b>%{y}</b>: %{x}<extra></extra>',
						'hoverlabel' : { 'bgcolor' : 'rgb(250, 246, 208)' }
					},
				],
				'layout': 
				{ 
					'title' : { 'text': title, 'font' : { "size" : 15 } },
					'height' : chart_height,
					'margin': { "t" : 40, "l" : 250, "r" : 100 },
					'yaxis' : { 'tickfont' : { "size" : 14 } },
					'xaxis' : { 'title' : "Topic-Document Association", 
						'tickfont' : { "size" : 13 },
						'titlefont' : { "size" : 15 },
						'range': [0, max_value]
					},
				}
			})
Example #10
0
	def __init__( self, webcore, all_model_metadata ):
		""" Creates the comparison page layout for the given topic model metadata. """
		super(ComparisonLayout, self).__init__( webcore )
		# state: the models to compare, the selected embedding and the selected pair of models
		self.all_metadata = all_model_metadata
		self.current_embed_id = None
		self.current_metadata_indices = [0, 1]
		self.top_terms = config.get("top_terms", 10)
		# validation results which have already been computed
		self.validation_cache = {}
		# used to compute the validation measures
		self.validator = ModelValidator()
		# page details, extending the title which is already set on the instance
		self.page_suffix = "-comparison"
		self.page_title = "%s - Comparison" % self.page_title
Example #11
0
	def __calculate_similarity_df( self, terms ):
		""" Builds a Data Frame with the columns term1, term2 and sim, covering every
		ordered pair of the specified terms. A term paired with itself is given 1.0,
		and a pair where either term is missing from self.embed is given 0.0.
		self.embed is used directly, so it must already be loaded. """
		records = []
		count = len(terms)
		for i in range(count):
			first = terms[i]
			# a term is always fully similar to itself
			records.append( { "term1" : first, "term2" : first, "sim" : 1.0 } )
			for j in range(i+1, count):
				second = terms[j]
				sim = 0.0
				if first in self.embed and second in self.embed:
					sim = round( self.embed.similarity( first, second ), config.get("precision",3) )
				# the score is stored for both orderings of the pair
				records.append( { "term1" : first, "term2" : second, "sim" : sim } )
				records.append( { "term1" : second, "term2" : first, "sim" : sim } )
		return pd.DataFrame( records )
Example #12
0
	def __init__(self, model_id, meta_file_path):
		""" Loads and validates the metadata of a topic model from a JSON file.

		model_id is stored under the "id" key, although an "id" entry in the
		file will replace it. meta_file_path is the path of the JSON file, given
		as either a string or a Path. Raises Exception if the JSON is not a
		dictionary, has no "type" or "files" entry, or does not describe a
		topic model. """
		log.info("Loading model metadata from %s" % meta_file_path)
		# read the JSON, making sure the file is closed even if parsing fails
		with open(meta_file_path, "r") as fin:
			data = json.load(fin)
		if not isinstance(data, dict):
			raise Exception("Invalid JSON format in metadata file")
		if "type" not in data:
			raise Exception("No type specified in metadata file")
		if data["type"] != "topic_model":
			raise Exception("Metadata does not describe a topic model")
		if "files" not in data:
			raise Exception("No file paths specified in metadata file")
		# add the metadata
		self["id"] = model_id
		for key in data:
			self[key] = data[key]
		# ensure we have the mandatory metadata
		if "corpus" not in self:
			self["corpus"] = "unknown"
		if "algorithm" not in self:
			self["algorithm"] = { "id" : "unknown", "parameters" : {} }
		for key in ["topics", "documents", "terms"]:
			if key not in self:
				self[key] = 0
		# other properties
		self.meta_file_path = Path(meta_file_path)
		# take the directory from the Path object rather than from the raw argument,
		# so that a path given as a string works as well
		self.dir_base = self.meta_file_path.parent
		# these start out empty and are not populated by the constructor
		self.term_rankings = None
		self.partition = None
		self.term_associations = None
		self.document_associations = None
		# other settings
		self.top_terms = config.get("top_terms", 10)
		self.extended_top_terms = config.get("extended_top_terms", 20)
Example #13
0
	def __init__( self, webcore, all_model_metadata, show_navbar = True  ):
		""" Creates the validation page layout. The all_model_metadata argument is
		stored as self.metadata, and show_navbar is kept as a flag on the instance. """
		super(ValidationLayout, self).__init__( webcore )
		self.show_navbar = show_navbar
		# state: the metadata, the selected embedding and the selected measure
		self.metadata = all_model_metadata
		self.current_embed_id = None
		self.current_measure_id = config.get( "default_measure", "coherence" )
		# results which have already been computed
		self.validation_cache = {}
		self.term_distribution_cache = {}
		# used to compute the validation measures
		self.validator = TopicValidator()
		# page details, extending the title which is already set on the instance
		self.page_suffix = "-validation"
		self.page_title = "%s - Validation" % self.page_title
Example #14
0
	def __init__( self, webcore, model_metadata, show_navbar = True ):
		""" Creates the topic model page layout for the given model metadata.
		show_navbar is kept as a flag on the instance. """
		super(TopicModelLayout, self).__init__( webcore )
		self.show_navbar = show_navbar
		# state: the model, plus the topic selected for terms and for documents (both start at 1)
		self.metadata = model_metadata
		self.current_term_topic_index = 1
		self.current_document_topic_index = 1
		# how many of the top associations to use
		self.top_associations = config.get("num_associations", 20)
		# nothing is cached to begin with
		self.term_associations = None
		self.document_associations = None
		self.partition_df = None
		# page details, extending the title which is already set on the instance
		self.page_suffix = "-topics"
		self.page_title = "%s - Topic Model" % self.page_title
Example #15
0
	def generate_matching_table( self ):
		""" Generates a table which pairs up the topics of the two currently selected
		models, using a TopicMatcher built on the current embedding.

		There is one row for each topic of the first model, followed by one row for
		each topic of the second model which was not matched. Returns an empty
		string if no embedding is selected or if either model has no descriptors. """
		if self.current_embed_id is None:
			return ""
		embed = self.webcore.get_embedding(self.current_embed_id)
		if embed is None:
			return ""
		descriptors1 = self.all_metadata[self.current_metadata_indices[0]].get_descriptors()
		descriptors2 = self.all_metadata[self.current_metadata_indices[1]].get_descriptors()
		if descriptors1 is None or descriptors2 is None:
			return ""
		# perform the match
		matcher = TopicMatcher(embed)
		permutation, similarities = matcher.match(descriptors1, descriptors2)
		# create the table
		k1, k2 = len(descriptors1), len(descriptors2)
		num_fmt = "%02d" if max(k1,k2) < 100 else "%03d"
		float_fmt = config.get("float_format","%.3f")
		# the column order is fixed here, so that it does not depend on which kind of
		# row is created first and an empty table still has its columns
		columns = [ "Topic 1", "Descriptor 1", "Topic 2", "Descriptor 2", "Similarity" ]
		rows = []
		# a set, since it is only used for membership tests in the second loop
		matched_model2 = set()
		for topic_index1 in range(k1):
			topic_index2 = permutation[topic_index1]
			matched_model2.add(topic_index2)
			ranking1 = descriptors1[topic_index1]
			row = { "Topic 1":num_fmt % (topic_index1+1) }
			row["Descriptor 1"] = ", ".join(ranking1[0:self.top_terms])
			if topic_index2 < k2:
				ranking2 = descriptors2[topic_index2]
				row["Topic 2"] = num_fmt % (topic_index2+1)
				row["Descriptor 2"] = ", ".join(ranking2[0:self.top_terms])
				row["Similarity"] = float_fmt % similarities[topic_index1]
			else:
				# an index beyond the second model has no counterpart to show
				row["Topic 2"] = ""
				row["Descriptor 2"] = ""
				row["Similarity"] = ""
			rows.append( row )
		# add the topics of the second model which were not matched
		for topic_index2 in range(k2):
			if topic_index2 in matched_model2:
				continue
			ranking2 = descriptors2[topic_index2]
			row = { "Topic 2":num_fmt % (topic_index2+1) }
			row["Descriptor 2"] = ", ".join(ranking2[0:self.top_terms])
			row["Topic 1"] = ""
			row["Descriptor 1"] = ""
			row["Similarity"] = ""
			rows.append( row )
		df = pd.DataFrame(rows, columns=columns)
		alignments = { "Topic 1":"center", "Topic 2":"center", "Similarity":"center" }
		return DataFrameTable( df, id="matching-table", alignments=alignments, striped=False, hover=False ).generate_layout()
Example #16
0
    def generate_measure_dropdown(self):
        """ Builds the select component used to pick a topic evaluation measure.
        There is one option for each entry of measure_names, labelled with the
        full name followed by the short name in brackets. """
        measure_options = [
            {
                "label": "%s (%s)" % (measure_names[measure_id], measure_short_names[measure_id]),
                "value": measure_id
            }
            for measure_id in measure_names
        ]
        # the first option is the fallback if no default measure is configured
        fallback = measure_options[0]["value"]
        default_measure = config.get("default_measure", fallback)
        return dbc.Select(id='measure-dropdown',
                          options=measure_options,
                          value=default_measure)
Example #17
0
 def get_validation_df(self, all_meta, embed):
     """ Get a Data Frame with one row of model-level scores for each topic model
     in all_meta, skipping any model which has no descriptors. Returns None
     if no embedding is given. """
     if embed is None:
         return None
     measures = get_measures(measure_names.keys(), embed)
     records = []
     for meta in all_meta:
         descriptors = meta.get_descriptors()
         if descriptors is not None:
             # the identifying columns come first, then one column per measure
             record = {
                 "Name": meta["id"],
                 "Corpus": meta["corpus"],
                 "Topics": len(descriptors)
             }
             for measure_id in measures:
                 model_score = measures[measure_id].evaluate_model(descriptors)
                 record[measure_id] = round(model_score, config.get("precision", 3))
             records.append(record)
     return pd.DataFrame(records)
Example #18
0
	def generate_neighbor_card( self ):
		""" Builds the card containing the neighbor description, the query input box
		and the neighbor table. The input and the table both start out with the
		"query_sample" setting. """
		sample_query = config.get("query_sample", "")
		header = dbc.CardHeader("Embedding Term Neighbors", className="card-header")
		description = html.Div( self.generate_neighbor_card_text(), className="card-text")
		# the text box, with a label attached in front of it
		query_label = dbc.InputGroupAddon("Query Terms", addon_type="prepend")
		query_input = dbc.Input(id="query-embed", value=sample_query, placeholder="Enter a list of comma-separated terms...", 
			type="text", debounce=True, className="custom-text")
		query_group = dbc.InputGroup( [ query_label, query_input ] )
		# the table is drawn straight away for the sample query
		table = html.Div( self.generate_neighbor_table( sample_query ), id='content_neighbor_table' )
		body = dbc.CardBody( [ description, query_group, table ] )
		return dbc.Card( [ header, body ] )
Example #19
0
    def get_validation_df(self, meta, embed):
        """ Get a Data Frame containing validation scores for the individual topics in
        the specified topic model.

        There is one row per topic descriptor, with the columns "Topic" and
        "Descriptor" followed by one column for each measure. An empty Data Frame
        is returned if no embedding is given or the model has no descriptors. """
        if embed is None:
            return pd.DataFrame([])
        descriptors = meta.get_descriptors()
        if descriptors is None:
            return pd.DataFrame([])
        measures = get_measures(measure_names.keys(), embed)
        precision = config.get("precision", 3)
        num_fmt = "%02d" if len(descriptors) < 100 else "%03d"
        # the rows are built from the descriptors themselves rather than from
        # meta["k"], so that the rows, the numbering and the scores always refer
        # to the same list and a mismatched "k" cannot cause an IndexError
        rows = [
            {"Topic": num_fmt % (i + 1), "Descriptor": ", ".join(descriptor)}
            for i, descriptor in enumerate(descriptors)
        ]
        for measure_id in measures:
            scores = measures[measure_id].evaluate_topics(descriptors)
            for i, score in enumerate(scores):
                # TODO: move rounding elsewhere?
                rows[i][measure_id] = round(score, precision)
        return pd.DataFrame(rows)
Example #20
0
	def generate_embed_heatmap( self, query_string ):
		""" Generates a heatmap of the pairwise similarities between the query terms
		and the neighbors of each query term.

		The embedding is loaded on first use and then kept in self.embed. Returns
		an empty string if the query is empty or the embedding cannot be loaded. """
		# parse the query string
		query = self.__parse_query_string( query_string )
		if len(query) == 0:
			return ""
		# get the embedding
		if self.embed is None:
			self.embed = self.webcore.get_embedding(self.metadata["id"])
			if self.embed is None:
				return ""
		# get all unique terms, in order of first appearance
		num_neighbors = config.get("num_neighbors", 10)
		all_terms = []
		seen = set()
		for query_term in query:
			neighbors = self.embed.get_neighbors( query_term, num_neighbors = num_neighbors )
			# the query term goes through the same check as its neighbors, so that a
			# term which was already added as a neighbor of an earlier query term
			# (or which was entered twice) is not listed a second time
			for term in [ query_term ] + list( neighbors ):
				if term not in seen:
					seen.add( term )
					all_terms.append( term )
		df = self.__calculate_similarity_df( all_terms )
		return dcc.Graph(
			id='chart_topicheatmap',
			figure={
				'data': [
					{
						'x': df["term1"], 
						'y': df["term2"],
						'z': df["sim"],
						'type': 'heatmap',
						'hoverlabel' : { 'bgcolor' : 'rgb(250, 246, 208)' },
					},
				],
				'layout': 
				{ 
					'margin': { "t" : 2 },
					'height': 900,
					"yaxis" : { "autorange" : 'reversed'},
				}
			})
Example #21
0
	def __find_metadata(self):
		""" Searches self.dir_core recursively for metadata files and fills
		self.embedding_meta and self.model_meta, keyed by metadata ID. A file
		which cannot be read or processed is logged as a warning and skipped. """
		self.embedding_meta = {}
		self.model_meta = {}
		# match any file ending with the configured extension, at any depth
		pattern = '**/*' + config.get("file_extension", ".meta")
		for meta_file_path in self.dir_core.glob(pattern):
			try:
				with open(meta_file_path, "r") as fin:
					data = json.load(fin)
					# anything other than a JSON object is passed over without a message
					if type(data) is not dict:
						continue
					# the ID is derived from the file path and the core directory
					meta_id = filepath_to_metadata_id( meta_file_path, self.dir_core )
					meta_type = data["type"]
					if meta_type == "embedding":
						self.embedding_meta[meta_id] = EmbeddingMeta(meta_id, meta_file_path)
					elif meta_type == "topic_model":
						self.model_meta[meta_id] = TopicModelMeta(meta_id, meta_file_path)
					else:
						log.info("Unknown metadata type %s in file %s" % (data["type"], meta_file_path))
			except Exception as e:
				# deliberately broad: one bad file should not stop the search
				log.warning("Skipping file: %s" % meta_file_path)
				log.warning(e)
		found = (len(self.embedding_meta), len(self.model_meta))
		log.info("Found %d embeddings, %d topic models" % found)
Example #22
0
 def generate_topiclevel_heatmap(self):
     """ Builds a heatmap of the pairwise similarities between topics, for the
     currently selected embedding. Returns an empty string if no embedding is
     selected, the model has no descriptors or no similarities are available. """
     embed_id = self.current_embed_id
     if embed_id is None:
         return ""
     descriptors = self.metadata.get_descriptors()
     if descriptors is None:
         return ""
     if embed_id in self.topiclevel_cache:
         # reuse the similarities computed earlier for this embedding
         df = self.topiclevel_cache[embed_id]
         log.info("Using cached similarites for embedding %s" % embed_id)
     else:
         embed = self.webcore.get_embedding(embed_id)
         if embed is None:
             return ""
         log.info("Computing similarities for topic model using %s ..." % embed_id)
         df = self.validator.get_topic_pair_similarity_df(self.metadata, embed)
         if df is None:
             return ""
         # only the rounded values are cached
         df = df.round(config.get("precision", 3))
         self.topiclevel_cache[embed_id] = df
     # the hover text for each cell shows the descriptors of both topics
     hovertext = []
     for _, pair in df.iterrows():
         name1, name2 = pair["topic1"], pair["topic2"]
         # the topic number is taken from the name and turned into a list position
         pos1 = int(name1.replace("Topic ", "")) - 1
         pos2 = int(name2.replace("Topic ", "")) - 1
         hovertext.append("<b>%s</b>: %s<br><b>%s</b>: %s" % (
             name1, ", ".join(descriptors[pos1]),
             name2, ", ".join(descriptors[pos2])))
     trace = {
         'x': df["topic1"],
         'y': df["topic2"],
         'z': df["sim"],
         'hovertext': hovertext,
         'type': 'heatmap',
         'hoverlabel': {'bgcolor': 'rgb(250, 246, 208)'},
         'hovertemplate': '%{hovertext}<br>Similarity: %{z}<extra></extra>',
     }
     layout = {
         'margin': {"t": 2},
         'height': 600,
         "yaxis": {"autorange": 'reversed'},
     }
     return dcc.Graph(id='chart_topicheatmap', figure={'data': [trace], 'layout': layout})
Example #23
0
    def generate_termlevel_heatmap(self):
        """ Builds a heatmap of the similarities between the pairs of terms in the
        descriptor of the currently selected topic. Returns an empty string if no
        embedding is selected, the model has no descriptors or no similarities
        are available. """
        embed_id = self.current_embed_id
        if embed_id is None:
            return ""
        descriptors = self.metadata.get_descriptors()
        if descriptors is None:
            return ""
        if embed_id in self.termlevel_cache:
            # reuse the similarities computed earlier for this embedding
            df = self.termlevel_cache[embed_id]
            log.info("Using cached term similarites for embedding %s" % embed_id)
        else:
            embed = self.webcore.get_embedding(embed_id)
            if embed is None:
                return ""
            log.info("Computing term similarities for topic model using %s ..." % embed_id)
            df = self.validator.get_term_pair_similarity_df(self.metadata, embed)
            if df is None:
                return ""
            # only the rounded values are cached
            df = df.round(config.get("precision", 3))
            self.termlevel_cache[embed_id] = df
        # keep only the pairs where both terms belong to the selected topic
        current_descriptor = descriptors[self.current_topic_index - 1]
        wanted = set(current_descriptor)
        xvalues, yvalues, zvalues, hovertext = [], [], [], []
        # TODO: make this more efficient
        for _, pair in df.iterrows():
            if not (pair["term1"] in wanted and pair["term2"] in wanted):
                continue
            xvalues.append(pair["term1"])
            yvalues.append(pair["term2"])
            zvalues.append(pair["sim"])
            hovertext.append("<b>(%s, %s)</b>" % (pair["term1"], pair["term2"]))
        # generate the chart
        title = "Topic %02d: %s" % (self.current_topic_index, ", ".join(current_descriptor))
        trace = {
            'x': xvalues,
            'y': yvalues,
            'z': zvalues,
            'hovertext': hovertext,
            'type': 'heatmap',
            'hoverlabel': {'bgcolor': 'rgb(250, 246, 208)'},
            'hovertemplate': '%{hovertext}<br>Similarity: %{z}<extra></extra>',
        }
        layout = {
            'title': {'text': title, 'font': {"size": 15}},
            'margin': {"t": 40, "l": 200, "r": 200},
            'height': 600,
            'xaxis': {'tickfont': {"size": 14}},
            "yaxis": {"autorange": 'reversed', 'tickfont': {"size": 14}},
        }
        return dcc.Graph(id='chart_topicheatmap', figure={'data': [trace], 'layout': layout})
Example #24
0
	def generate_heatmap_card_text( self ):
		""" Generates the Markdown text which describes the heatmap, quoting the
		"num_neighbors" setting and the ID held in self.metadata. """
		# the fragments are joined with no separator, each one carries its own spacing
		fragments = [
			"The heatmap visualization below shows the similarities between all query terms entered above,",
			" and the %d neighbors of those terms." % config.get("num_neighbors", 10),
			" The similarities of the terms are based on the word embedding *%s* selected above." % self.metadata["id"],
		]
		return dcc.Markdown( "".join( fragments ) )
Example #25
0
	def generate_topiclevel_chart( self ):
		""" Builds a horizontal bar chart of the topic-level silhouette scores for the
		currently selected embedding, with one bar per topic. Returns an empty
		string if no embedding is selected or the scores cannot be computed. """
		embed_id = self.current_embed_id
		if embed_id is None:
			return ""
		if embed_id in self.topiclevel_cache:
			# reuse the scores computed earlier for this embedding
			df_sil = self.topiclevel_cache[embed_id]
			log.info("Using cached silhouette scores for embedding %s" % embed_id )
		else:
			embed = self.webcore.get_embedding(embed_id)
			if embed is None:
				return ""
			log.info("Applying silhouette analysis to topic model using %s ..." % embed_id )
			df_sil = self.validator.get_topiclevel_silhouette_df( self.metadata, embed )
			if df_sil is None:
				return ""
			# only the rounded scores are cached
			df_sil = df_sil.round( config.get("precision", 3) )
			self.topiclevel_cache[embed_id] = df_sil
		# the rows are sorted by ascending score before being added to the chart
		df_sil = df_sil.sort_values(by="Score", ascending=True)
		palette = self.get_colors( self.metadata["k"] )
		xvalues, yvalues, hovertext, s_colors = [], [], [], []
		for label, topic in df_sil.iterrows():
			xvalues.append( topic["Score"] )
			# a trailing space is appended to each axis label
			yvalues.append( label + " " )
			hovertext.append( topic["Descriptor"] )
			# each bar takes the palette color at position Number-1
			s_colors.append( self.format_color_string(palette[topic["Number"]-1]) )
		# widen the x-axis to the full range only if a score reaches +/-0.5
		wide = ( min(df_sil["Score"]) <= -0.5) or (max(df_sil["Score"]) >= 0.5)
		min_value, max_value = (-1, 1) if wide else (-0.5, 0.5)
		# the chart gets taller as the number of bars grows
		bar_count = len(xvalues)
		chart_height = 400 if bar_count <= 6 else ( 500 if bar_count <= 10 else 600 )
		trace = {
			'x': xvalues, 
			'y': yvalues, 
			'hovertext' : hovertext,
			'type': 'bar',
			'orientation' : 'h',
			'marker' : { 'color' : s_colors, 'opacity': 0.4 },
			'hovertemplate': '<b>%{y}</b>: %{x}<br>%{hovertext}<extra></extra>',
			'hoverlabel' : { 'bgcolor' : 'rgb(250, 246, 208)' }
		}
		layout = { 
			'height' : chart_height,
			'margin': { "t" : 20 },
			'yaxis' : { 'tickfont' : { "size" : 14 } },
			'xaxis' : {
				'title' : "Topic Silhouette Score", 
				'tickfont' : { "size" : 13 },
				'titlefont' : { "size" : 15 },
				'range': [min_value, max_value]
			}
		}
		return dcc.Graph( id='chart_topicsil', figure={ 'data': [ trace ], 'layout': layout } )