def setUp(self):
    self.fact1 = Fact(
        "corpus1",
        "corpus_type",
        "timestamp_from",
        "timestamp_to",
        "timestamp_type",
        "analysis_type",
        "result_key",
        "result_value",
        "outlierness",
    )
    self.message1 = Message(self.fact1, 0.1, 0.2, 0.3)
    self.fact2 = Fact(
        "corpus2",
        "corpus_type",
        "timestamp_from",
        "timestamp_to",
        "timestamp_type",
        "analysis_type",
        "result_key",
        "result_value",
        "outlierness",
    )
    self.message2 = Message(self.fact2, 0.1, 0.2, 0.3)
    self.document_plan_node = DocumentPlanNode([self.message1, self.message2], Relation.ELABORATION)

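# A minimal sketch of the data model these snippets assume. The field names and
# their order are inferred from the positional arguments and inline comments at
# the call sites in this section; the actual definitions in the code base may
# differ (e.g. Fact may be a full class rather than a NamedTuple).
from typing import List, NamedTuple, Optional, Union


class Fact(NamedTuple):
    corpus: str
    corpus_type: str
    timestamp_from: Optional[str]
    timestamp_to: Optional[str]
    timestamp_type: str
    analysis_type: str
    result_key: str
    result_value: object
    outlierness: float
    uuid: Optional[str] = None  # "[LINK:...]"; omitted by some test fixtures


class Message:
    """Hypothetical wrapper: accepts a single Fact or a list of Facts, plus
    optional numeric weights (as in Message(fact, 0.1, 0.2, 0.3) above), and
    exposes the first Fact as main_fact (used by the Summarization parser
    below)."""

    def __init__(self, facts: Union[Fact, List[Fact]], *weights: float) -> None:
        self.facts = facts if isinstance(facts, list) else [facts]
        self.weights = weights

    @property
    def main_fact(self) -> Fact:
        return self.facts[0]
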
def _shared_topics_message_parser(self, task_result: TaskResult) -> List[Message]:
    messages = []
    corpus, corpus_type = self.build_corpus_fields(task_result)
    topics = task_result.task_result.get("result").get("shared_topics")
    topics = [str(t) for t in topics]
    result_key = "[TopicModelDocsetComparison:TM:{}]".format(
        task_result.parameters.get("model_type", "LDA").upper()
    )
    if len(topics) == 0:
        analysis_type = "TopicModelDocsetComparison:Shared:Topics:None"
        result_value = "None"
    elif len(topics) == 1:
        analysis_type = "TopicModelDocsetComparison:Shared:Topics:Single"
        result_value = "[TopicModelDocsetComparison:Topic:{}]".format(topics[0])
    else:
        analysis_type = "TopicModelDocsetComparison:Shared:Topics:Multi"
        result_value = "[TopicModelDocsetComparison:TopicList:{}]".format("|".join(topics))
    interestingness = task_result.task_result.get("interestingness", {}).get("shared_topics", 0)
    messages.append(
        Message(
            Fact(
                corpus,
                corpus_type,
                None,
                None,
                "all_time",
                analysis_type,
                result_key,
                result_value,
                interestingness,
                "[LINK:{}]".format(task_result.uuid),  # uuid
            )
        )
    )
    for variant, field in [("Mean", "mean_jsd"), ("Cross", "cross_jsd")]:
        messages.append(
            Message(
                Fact(
                    corpus,
                    corpus_type,
                    None,
                    None,
                    "all_time",
                    "TopicModelDocsetComparison:Shared:JSD",
                    "[TopicModelDocsetComparison:JSD:{}]".format(variant),
                    task_result.task_result.get("result").get(field),
                    task_result.task_result.get("interestingness").get(field),
                    "[LINK:{}]".format(task_result.uuid),  # uuid
                )
            )
        )
    return messages

def setUp(self): self.fact1 = Fact("1", "_", "_", "_", "_", "_", "_", "_", "_") self.fact2 = Fact("2", "_", "_", "_", "_", "_", "_", "_", "_") self.message1 = Message(self.fact1) self.message2 = Message(self.fact2) self.expr = FactField("corpus") self.matcher = Matcher(self.expr, "=", "1") self.rules = [([self.matcher], [0])] self.slot = Slot(FactFieldSource("corpus")) self.literal = LiteralSlot("literal") self.components = [self.slot, self.literal] self.template = Template(self.components, self.rules)
def parse_messages(
    self, task_result: TaskResult, context: List[TaskResult], language: str
) -> List[Message]:
    if task_result.processor != "Summarization":
        raise WrongResourceException()
    corpus, corpus_type = self.build_corpus_fields(task_result)
    messages = []
    for summary, interestingness in zip(
        task_result.task_result["result"]["summary"],
        task_result.task_result["interestingness"]["sentence_scores"],
    ):
        interestingness *= task_result.task_result["interestingness"]["overall"]
        messages.append(
            Message(
                [
                    Fact(
                        corpus,  # corpus
                        corpus_type,  # corpus_type
                        None,  # timestamp_from
                        None,  # timestamp_to
                        "all_time",  # timestamp_type
                        "Summarization",  # analysis_type
                        "Summary",  # result_key
                        summary,  # result_value
                        interestingness,  # outlierness
                        "[LINK:{}]".format(task_result.uuid),  # uuid
                    )
                ]
            )
        )
    # For now, we limit the summaries to one per result. This needs to be re-evaluated later on.
    return [max(messages, key=lambda m: m.main_fact.outlierness)]

def parse_messages(
    self, task_result: TaskResult, context: List[TaskResult], language: str
) -> List[Message]:
    if task_result.processor != "TopicModelDocumentLinking":
        raise WrongResourceException()
    corpus, corpus_type = self.build_corpus_fields(task_result)
    articles_with_interestingness = list(
        zip(
            task_result.task_result["result"]["documents"],
            task_result.task_result["interestingness"]["documents"],
        )
    )
    articles_with_interestingness = sorted(
        articles_with_interestingness, key=lambda pair: pair[1], reverse=True
    )
    single_or_multiple = "Single" if len(articles_with_interestingness) == 1 else "Multiple"
    return [
        Message(
            Fact(
                corpus,
                corpus_type,
                None,
                None,
                "all_time",
                "TopicModel:DocumentLinking:" + single_or_multiple,
                "LinkedArticles",
                "[LinkedArticleList:{}]".format(
                    "|".join(article for (article, _) in articles_with_interestingness)
                ),
                task_result.task_result["interestingness"]["overall"],
                "[LINK:{}]".format(task_result.uuid),  # uuid
            )
        )
    ]

def parse_messages(
    self, task_result: TaskResult, context: List[TaskResult], language: str
) -> List[Message]:
    if task_result.processor != "ExtractFacets":
        raise WrongResourceException()
    corpus, corpus_type = self.build_corpus_fields(task_result)
    messages = []
    for facet_name, results in task_result.task_result["result"].items():
        interestingness_values = task_result.task_result["interestingness"][facet_name]
        for facet_value, result_value in results.items():
            interestingness = interestingness_values[facet_value]
            # In cases where we have a *ton* of different values (e.g. issues from
            messages.append(
                Message(
                    [
                        Fact(
                            corpus,  # corpus
                            corpus_type,  # corpus_type
                            None,  # timestamp_from
                            None,  # timestamp_to
                            "all_time",  # timestamp_type
                            "ExtractFacets:" + facet_name,  # analysis_type
                            "[{}:{}]".format(facet_name, facet_value),  # result_key
                            result_value,  # result_value
                            interestingness,  # interestingness
                            "[LINK:{}]".format(task_result.uuid),  # uuid
                        )
                    ]
                )
            )
    return messages

def setUp(self):
    self.fact = Fact(
        "corpus",
        "corpus_type",
        "timestamp_from",
        "timestamp_to",
        "timestamp_type",
        "analysis_type",
        "result_key",
        "result_value",
        "outlierness",
    )

def setUp(self):
    self.fact = Fact(
        "corpus name",
        "corpus_type",
        "timestamp_from",
        "timestamp_to",
        "timestamp_type",
        "analysis_type",
        "result_key",
        "result_value",
        "outlierness",
    )
    self.source = LiteralSource("Some literal")

def setUp(self):
    self.fact = Fact(
        "corpus name",
        "corpus_type",
        "timestamp_from",
        "timestamp_to",
        "timestamp_type",
        "analysis_type",
        "result_key",
        "result_value",
        "outlierness",
    )
    self.message = Message(self.fact, 0.1, 0.2, 0.3)
    self.source = FactFieldSource("corpus")

def setUp(self):
    self.to_value = LiteralSource("some literal")
    self.attributes = dict()
    self.fact = Fact(
        "corpus",
        "corpus_type",
        "timestamp_from",
        "timestamp_to",
        "timestamp_type",
        "analysis_type",
        "result_key",
        "result_value",
        "outlierness",
    )

def setUp(self): self.fact = Fact("1", "_", "_", "_", "_", "_", "_", "kissa", "_") self.message = Message(self.fact) self.expr = FactField("corpus") self.matcher = Matcher(self.expr, "=", "1") self.rules = [([self.matcher], [0])] self.slot = Slot(FactFieldSource("result_value")) self.literal = LiteralSlot("sana") self.components = [self.slot, self.literal] self.template = Template(self.components, self.rules) self.template.fill(self.message, [self.message]) self.realizer = FinnishUralicNLPMorphologicalRealizer()
def parse_complex_messages(
    self, task_result: TaskResult, context: List[TaskResult]
) -> List[Message]:
    corpus, corpus_type = self.build_corpus_fields(task_result)
    messages = []
    facet_name = task_result.parameters["facet_name"]
    for value_type, value_type_results in task_result.task_result["result"].items():
        if value_type != "absolute_counts":
            continue  # TODO: relative_counts are not percentages and are thus really hard to talk about.
        for facet_value, facet_value_results in value_type_results.items():
            from_year = str(min(int(y) for y in facet_value_results.keys() if y.isnumeric()))
            to_year = str(max(int(y) for y in facet_value_results.keys() if y.isnumeric()))
            for complex_key in ["max", "min", "avg"]:
                value = facet_value_results[complex_key]
                interestingness = task_result.task_result["interestingness"][facet_value][0]
                messages.append(
                    Message(
                        [
                            Fact(
                                corpus,  # corpus
                                corpus_type,  # corpus_type
                                from_year,  # timestamp_from
                                to_year,  # timestamp_to
                                "between_years",  # timestamp_type
                                "GenerateTimeSeries:{}:{}".format(value_type, complex_key),  # analysis_type
                                "[ENTITY:{}:{}]".format(facet_name, facet_value),  # result_key
                                value,  # result_value
                                interestingness,  # outlierness
                                "[LINK:{}]".format(task_result.uuid),  # uuid
                            )
                        ]
                    )
                )
    return messages

def parse_messages(
    self, task_result: TaskResult, context: List[TaskResult], language: str
) -> List[Message]:
    if task_result.processor != "ExtractWords":
        raise WrongResourceException()
    unit: str = task_result.parameters.get("unit")
    if unit == "tokens":
        unit = "TOKEN"
    elif unit == "stems":
        unit = "STEM"
    else:
        log.error(
            "Unexpected unit '{}', expected 'tokens' or 'stems'".format(
                task_result.parameters.get("unit")
            )
        )
        raise ParsingException()
    corpus, corpus_type = self.build_corpus_fields(task_result)
    messages = []
    for word in task_result.task_result["result"]["vocabulary"]:
        interestingness = task_result.task_result["interestingness"].get(
            word, ProcessorResource.EPSILON
        )
        for result_idx, result_name in enumerate(["Count", "RelativeCount", "TFIDF"]):
            result = task_result.task_result["result"]["vocabulary"][word][result_idx]
            messages.append(
                Message(
                    [
                        Fact(
                            corpus,  # corpus
                            corpus_type,  # corpus_type
                            None,  # timestamp_from
                            None,  # timestamp_to
                            "all_time",  # timestamp_type
                            "ExtractWords:" + result_name,  # analysis_type
                            "[{}:{}]".format(unit, word),  # result_key
                            result,  # result_value
                            interestingness,  # outlierness
                            "[LINK:{}]".format(task_result.uuid),  # uuid
                        )
                    ]
                )
            )
    return messages

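# For orientation, a hypothetical ExtractWords payload matching the indexing
# above: each vocabulary entry is a sequence whose positions 0-2 correspond to
# Count, RelativeCount and TFIDF. The concrete words and numbers are
# illustrative only.
example_extract_words_result = {
    "result": {
        "vocabulary": {
            "press": [42, 0.0013, 3.7],  # Count, RelativeCount, TFIDF
            "paper": [17, 0.0005, 2.1],
        }
    },
    "interestingness": {
        "press": 0.8,
        # "paper" is absent, so the parser falls back to ProcessorResource.EPSILON
    },
}
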
def parse_standard_messages(
    self, task_result: TaskResult, context: List[TaskResult]
) -> List[Message]:
    corpus, corpus_type = self.build_corpus_fields(task_result)
    messages = []
    facet_name = task_result.parameters["facet_name"]
    for value_type, value_type_results in task_result.task_result["result"].items():
        if value_type != "absolute_counts":
            continue  # TODO: relative_counts are not percentages and are thus really hard to talk about.
        for facet_value, facet_value_results in value_type_results.items():
            for time, value in facet_value_results.items():
                interestingness = task_result.task_result["interestingness"][facet_value][1].get(
                    time, ProcessorResource.EPSILON
                )
                if not time.isnumeric():
                    continue
                if interestingness == 0:
                    continue
                messages.append(
                    Message(
                        [
                            Fact(
                                corpus,  # corpus
                                corpus_type,  # corpus_type
                                time,  # timestamp_from
                                time,  # timestamp_to
                                "year",  # timestamp_type
                                "GenerateTimeSeries:" + value_type,  # analysis_type
                                "[ENTITY:{}:{}]".format(facet_name, facet_value),  # result_key
                                value,  # result_value
                                interestingness,  # outlierness
                                "[LINK:{}]".format(task_result.uuid),  # uuid
                            )
                        ]
                    )
                )
    return messages

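# A hypothetical GenerateTimeSeries payload consistent with parse_complex_messages
# and parse_standard_messages above: under "absolute_counts" each facet value maps
# year keys to counts and additionally carries "max"/"min"/"avg" aggregates, while
# the matching interestingness entry is a pair of (overall score, per-year scores).
# The facet value and numbers are illustrative only.
example_time_series_result = {
    "result": {
        "absolute_counts": {
            "NEWSPAPER_NAME": {"1900": 12, "1901": 7, "max": 12, "min": 7, "avg": 9.5},
        }
    },
    "interestingness": {
        "NEWSPAPER_NAME": (0.6, {"1900": 0.6, "1901": 0.2}),
    },
}
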
def _jsd_message_parser(
    task_result: TaskResult, corpus: str, corpus_type: str, input_processor: str
) -> List[Message]:
    jsd = task_result.task_result["result"]["jensen_shannon_divergence"]
    interestingness = task_result.task_result["interestingness"]["jensen_shannon_divergence"]
    return [
        Message(
            Fact(
                corpus,
                corpus_type,
                None,
                None,
                "all_time",
                "Compare:JSD",
                f"[Comparison:Processor:{input_processor}]",
                jsd,
                interestingness,
                f"[LINK:{task_result.uuid}]",
            )
        )
    ]

def parse_messages(
    self, task_result: TaskResult, context: List[TaskResult], language: str
) -> List[Message]:
    language = language.split("-")[0]
    if task_result.processor != "ExtractNames":
        raise WrongResourceException()
    corpus, corpus_type = self.build_corpus_fields(task_result)

    for entity in task_result.task_result["result"]:
        entity_name_map: Dict[str, str] = task_result.task_result["result"][entity].get("names", {})
        # Name resolution priority: requested language, then English, then any
        # available name, then the raw entity identifier.
        entity_names = [
            entity_name_map.get(language, None),
            entity_name_map.get("en", None),
            list(entity_name_map.values())[0] if list(entity_name_map.values()) else None,
            entity,
        ]
        if not entity_name_map:
            entity_names.insert(0, self._resolve_name_from_solr(entity, language))
        task_result.task_result["result"][entity]["entity"] = next(
            name for name in entity_names if name
        )

    entities_with_interestingness = [
        (entity, max(interestingness.values()))
        for (entity, interestingness) in zip(
            task_result.task_result["result"].values(),
            task_result.task_result["interestingness"].values(),
        )
    ]
    entities_with_interestingness = sorted(
        entities_with_interestingness, key=lambda pair: pair[1], reverse=True
    )

    max_interestingness = entities_with_interestingness[0][1]
    if max_interestingness < 0.01:
        # Keep only the single most interesting entity, as a one-element list.
        entities_with_interestingness = [entities_with_interestingness[0]]
    else:
        entities_with_interestingness = [
            (entity, interestingness)
            for (entity, interestingness) in entities_with_interestingness
            if interestingness >= 0.01
        ]

    if len(entities_with_interestingness) == 0:
        return []

    count = min(5, len(entities_with_interestingness))
    entities_with_interestingness = entities_with_interestingness[:count]
    single_or_multiple = "Single" if len(entities_with_interestingness) == 1 else "Multiple"

    return [
        Message(
            Fact(
                corpus,
                corpus_type,
                None,
                None,
                "all_time",
                "ExtractNames:" + single_or_multiple,
                "ExtractNames",
                "[ExtractNamesList:{}]".format(
                    "|".join(
                        "{}:{}:{}".format(entity["entity"], entity["salience"], entity["stance"])
                        for (entity, interestingness) in entities_with_interestingness
                    )
                ),
                task_result.task_result["interestingness"]["overall"],
                "[LINK:{}]".format(task_result.uuid),  # uuid
            )
        )
    ]

def _distinct_topics_message_parser(
    self, task_result: TaskResult, collection_id: int
) -> List[Message]:
    messages = []
    topics_label = "distinct_topics" + str(collection_id)
    collection = "collection" + str(collection_id)
    corpus, corpus_type = self.build_corpus_fields(task_result.parameters.get(collection))
    topics = task_result.task_result.get("result").get(topics_label)
    topics = [str(t) for t in topics]
    result_key = "[TopicModelDocsetComparison:TM:{}]".format(
        task_result.parameters.get("model_type", "LDA").upper()
    )
    if len(topics) == 0:
        analysis_type = "TopicModelDocsetComparison:Distinct:Topics:None"
        result_value = "None"
    elif len(topics) == 1:
        analysis_type = "TopicModelDocsetComparison:Distinct:Topics:Single"
        result_value = "[TopicModelDocsetComparison:Topic:{}]".format(topics[0])
    else:
        analysis_type = "TopicModelDocsetComparison:Distinct:Topics:Multi"
        result_value = "[TopicModelDocsetComparison:TopicList:{}]".format("|".join(topics))
    interestingness = task_result.task_result.get("interestingness", {}).get(topics_label, 0)
    messages.append(
        Message(
            Fact(
                corpus,
                corpus_type,
                None,
                None,
                "all_time",
                analysis_type,
                result_key,
                result_value,
                interestingness,
                "[LINK:{}]".format(task_result.uuid),  # uuid
            )
        )
    )
    jsd_label = "internal_jsd" + str(collection_id)
    messages.append(
        Message(
            Fact(
                corpus,
                corpus_type,
                None,
                None,
                "all_time",
                "TopicModelDocsetComparison:Distinct:JSD",
                "[TopicModelDocsetComparison:JSD:Internal]",
                task_result.task_result.get("result").get(jsd_label),
                task_result.task_result.get("interestingness").get(jsd_label),
                "[LINK:{}]".format(task_result.uuid),  # uuid
            )
        )
    )
    return messages

def setUp(self): self.fact = Fact("_", "_", "t1", "t2", "tt", "_", "_", "_", "_") self.source = TimeSource()
def _value_divergence_parser(
    task_result: TaskResult, corpus: str, corpus_type: str, input_processor: str
) -> List[Message]:
    # Obtain the result key that is *not* JSD (e.g. abs_diff)
    comparison_type = ""
    for val in task_result.task_result["result"]:
        if val != "jensen_shannon_divergence":
            comparison_type = val

    combined: List[Tuple[str, float, float]] = [
        (
            key,
            task_result.task_result["result"][comparison_type][key],
            task_result.task_result["interestingness"][comparison_type][key],
        )
        for key in task_result.task_result["result"][comparison_type]
    ]

    if len(combined) == 0:
        return [
            Message(
                Fact(
                    corpus,
                    corpus_type,
                    None,
                    None,
                    "all_time",
                    "Compare:None",
                    f"[Comparison:Processor:{input_processor}]",
                    None,
                    task_result.task_result["interestingness"]["overall"],
                    f"[LINK:{task_result.uuid}]",
                )
            )
        ]

    # Sort from least divergent to most divergent
    combined.sort(key=lambda x: x[1])

    messages: List[Message] = []
    if len(combined) == 1:
        key, val, interestingness = combined[0]
        messages.append(
            Message(
                Fact(
                    corpus,
                    corpus_type,
                    None,
                    None,
                    "all_time",
                    "Compare:Single",
                    f"[Comparison:Processor:{input_processor}]",
                    f"[Comparison:Value:{key}:{comparison_type}:{val}]",
                    interestingness,
                    f"[LINK:{task_result.uuid}]",
                )
            )
        )
    elif len(combined) <= 3:
        key, val, interestingness = combined[0]  # least divergent
        messages.append(
            Message(
                Fact(
                    corpus,
                    corpus_type,
                    None,
                    None,
                    "all_time",
                    "Compare:Least:Single",
                    f"[Comparison:Processor:{input_processor}]",
                    f"[Comparison:Value:{key}:{comparison_type}:{val}]",
                    interestingness,
                    f"[LINK:{task_result.uuid}]",
                )
            )
        )
        key, val, interestingness = combined[-1]  # most divergent
        messages.append(
            Message(
                Fact(
                    corpus,
                    corpus_type,
                    None,
                    None,
                    "all_time",
                    "Compare:Most:Single",
                    f"[Comparison:Processor:{input_processor}]",
                    f"[Comparison:Value:{key}:{comparison_type}:{val}]",
                    interestingness,
                    f"[LINK:{task_result.uuid}]",
                )
            )
        )
    else:  # at least four keys
        # Half-and-half, ignore middle value if odd n. Max 3 in any case.
        count = min(3, len(combined) // 2)
        least_vals = combined[:count]
        least_vals_max_interestingness = max(
            interestingness for (_, _, interestingness) in least_vals
        )
        messages.append(
            Message(
                Fact(
                    corpus,
                    corpus_type,
                    None,
                    None,
                    "all_time",
                    "Compare:Least:Multi",
                    f"[Comparison:Processor:{input_processor}]",
                    "[Comparison:ValueList:{}]".format(
                        "|".join(
                            "{}:{}:{}".format(key, comparison_type, value)
                            for (key, value, _) in least_vals
                        )
                    ),
                    least_vals_max_interestingness,
                    f"[LINK:{task_result.uuid}]",
                )
            )
        )
        most_vals = combined[-count:]
        most_vals_max_interestingness = max(
            interestingness for (_, _, interestingness) in most_vals
        )
        messages.append(
            Message(
                Fact(
                    corpus,
                    corpus_type,
                    None,
                    None,
                    "all_time",
                    "Compare:Most:Multi",
                    f"[Comparison:Processor:{input_processor}]",
                    "[Comparison:ValueList:{}]".format(
                        "|".join(
                            "{}:{}:{}".format(key, comparison_type, value)
                            for (key, value, _) in most_vals
                        )
                    ),
                    most_vals_max_interestingness,
                    f"[LINK:{task_result.uuid}]",
                )
            )
        )
    return messages

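# A worked example of the half-and-half selection above, assuming `combined`
# holds seven (key, value, interestingness) triples sorted by value:
# count = min(3, 7 // 2) = 3, so least_vals = combined[:3] and
# most_vals = combined[-3:], and the middle item combined[3] is ignored.
# With four items, count = min(3, 4 // 2) = 2 and every item is reported.
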
def setUp(self):
    self.fact = Fact("_", "_", "_", "_", "_", "_", "_", "_", "_")
    self.expr = LhsExpr()

def setUp(self): self.fact1 = Fact("1", "_", "_", "_", "_", "_", "_", "_", "_") self.fact2 = Fact("2", "_", "_", "_", "_", "_", "_", "_", "_") self.all_facts = [self.fact1, self.fact2] self.expr = ReferentialExpr(1, "corpus")
def setUp(self): self.fact1 = Fact("1", "_", "_", "_", "_", "_", "_", "_", "_") self.fact2 = Fact("2", "_", "_", "_", "_", "_", "_", "_", "_") self.all_facts = [self.fact1, self.fact2] self.expr = FactField("corpus")
def parse_messages(
    self, task_result: TaskResult, context: List[TaskResult], language: str
) -> List[Message]:
    language = language.split("-")[0]
    if task_result.processor != "TrackNameSentiment":
        raise WrongResourceException()
    corpus, corpus_type = self.build_corpus_fields(task_result)

    entries: Dict[str, Dict[int, Tuple[float, float]]] = {}
    for entity in task_result.task_result["result"]:
        entity_name_map: Dict[str, str] = task_result.task_result["result"][entity].get("names")
        if entity_name_map is None:
            entity_name_map = {}
        entity_name_priority_list = [
            entity_name_map.get(language, None),
            entity_name_map.get("en", None),
            list(entity_name_map.values())[0] if list(entity_name_map.values()) else None,
            entity,
        ]
        if not entity_name_map:
            entity_name_priority_list.insert(0, self._resolve_name_from_solr(entity, language))
        name = next(name for name in entity_name_priority_list if name)

        years: Dict[int, Tuple[float, float]] = {}
        for year in task_result.task_result["result"][entity]:
            if year == "names":
                # Skip the names-map
                continue
            sentiment = task_result.task_result["result"][entity][year]
            interestingness = task_result.task_result["interestingness"][entity][1][year]
            if sentiment != 0 or interestingness != 0:
                years[int(year)] = (sentiment, interestingness)
        entries[name] = years

    messages: List[Message] = []
    for entry, years in entries.items():
        if not years:
            continue
        max_interestingness = max(
            interestingness for (year, (sentiment, interestingness)) in years.items()
        )
        max_sentiment, max_sentiment_year = max(
            (sentiment, year) for (year, (sentiment, interestingness)) in years.items()
        )
        min_sentiment, min_sentiment_year = min(
            (sentiment, year) for (year, (sentiment, interestingness)) in years.items()
        )
        mean_sentiment = sum(
            sentiment for (year, (sentiment, interestingness)) in years.items()
        ) / len(years)
        min_year = min(years)
        max_year = max(years)
        year_count = len(years)

        messages.append(
            Message(
                Fact(
                    corpus,
                    corpus_type,
                    min_year,
                    max_year,
                    "between_years",
                    "TrackNameSentiment:Mean",
                    "[ENTITY:NAME:{}]".format(entry),
                    mean_sentiment,
                    max_interestingness,
                    "[LINK:{}]".format(task_result.uuid),  # uuid
                )
            )
        )
        if len(years) > 1:
            messages.append(
                Message(
                    Fact(
                        corpus,
                        corpus_type,
                        min_year,
                        max_year,
                        "between_years",
                        "TrackNameSentiment:CountYears",
                        "[ENTITY:NAME:{}]".format(entry),
                        year_count,
                        max_interestingness,
                        "[LINK:{}]".format(task_result.uuid),  # uuid
                    )
                )
            )
            messages.append(
                Message(
                    Fact(
                        corpus,
                        corpus_type,
                        min_sentiment_year,
                        min_sentiment_year,
                        "year",
                        "TrackNameSentiment:Min",
                        "[ENTITY:NAME:{}]".format(entry),
                        min_sentiment,
                        max_interestingness,
                        "[LINK:{}]".format(task_result.uuid),  # uuid
                    )
                )
            )
            messages.append(
                Message(
                    Fact(
                        corpus,
                        corpus_type,
                        max_sentiment_year,
                        max_sentiment_year,
                        "year",
                        "TrackNameSentiment:Max",
                        "[ENTITY:NAME:{}]".format(entry),
                        max_sentiment,
                        max_interestingness,
                        "[LINK:{}]".format(task_result.uuid),  # uuid
                    )
                )
            )
    return messages

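# A hypothetical TrackNameSentiment payload consistent with the indexing above:
# result[entity] maps year strings to sentiment values plus a "names" map, while
# interestingness[entity] is a pair whose second element maps years to
# interestingness scores. All identifiers and numbers here are illustrative.
example_track_name_sentiment_result = {
    "result": {
        "Q1757": {  # e.g. a Wikidata-style entity id
            "names": {"en": "Helsinki", "fi": "Helsinki"},
            "1905": 0.25,
            "1906": -0.1,
        }
    },
    "interestingness": {
        "Q1757": (0.9, {"1905": 0.9, "1906": 0.4}),
    },
}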