def detect_attributes_by_alias_similarity(self, query_ngrams, data_attributes, query_attributes, attribute_aliases):
    """
    Add attributes that a query n-gram matches through the alias-similarity pass.

    For every attribute that has not already been found by an alias exact
    match, each (alias, n-gram) pair is scored with
    ``helpers.compute_similarity(..., 'token_similarity')``, first on the
    lowercase forms and then, if that score is not 100, on the stemmed
    lowercase forms. A score of exactly 100 adds the attribute, but only when
    it is not already present in ``query_attributes``.

    Args:
        query_ngrams: mapping of n-gram -> dict with "lower" and "stemmed_lower".
        data_attributes: mapping of attribute -> dict with "lower" and "stemmed_lower".
        query_attributes: attributes detected so far; updated in place.
        attribute_aliases: mapping of attribute -> iterable of aliases.

    Returns:
        The same ``query_attributes`` object, possibly with new entries.
    """
    for attr in data_attributes:
        # If already found by an alias exact match, continue.
        if attr in query_attributes:
            found_by = query_attributes[attr]["metric"]
            # Tolerate entries whose metric was stored as a bare string.
            if isinstance(found_by, str):
                found_by = [found_by]
            if "attribute_alias_exact_match" in found_by:
                continue

        for alias in attribute_aliases[attr]:
            for ngram in query_ngrams:
                score = 0
                add_attribute = False

                # NOTE(review): the n-gram is compared with the attribute's own
                # name, not with `alias`; `alias` is only recorded in `meta`.
                # This looks unintended for an alias matcher - confirm against
                # the structure returned by get_attribute_aliases().
                string_similarity_score = helpers.compute_similarity(
                    query_ngrams[ngram]["lower"],
                    data_attributes[attr]["lower"],
                    'token_similarity')
                if string_similarity_score == 100:
                    add_attribute = True
                    score = string_similarity_score
                else:
                    # Fall back to the stemmed forms.
                    stemmed_string_similarity_score = helpers.compute_similarity(
                        query_ngrams[ngram]["stemmed_lower"],
                        data_attributes[attr]["stemmed_lower"],
                        'token_similarity')
                    if stemmed_string_similarity_score == 100:
                        add_attribute = True
                        score = stemmed_string_similarity_score

                if add_attribute and attr not in query_attributes:
                    is_label = self.nl4dv_instance.data_genie_instance.data_attribute_map[attr]["isLabelAttribute"]
                    query_attributes[attr] = {
                        'name': attr,
                        "queryPhrase": [query_ngrams[ngram]["lower"]],
                        'inferenceType': 'explicit',
                        'matchScore': self.nl4dv_instance.match_scores['attribute']['attribute_alias_similarity_match'],
                        # Stored as a list, like the other detectors do: the
                        # domain-value detector appends to this field.
                        'metric': ['attribute_alias_similarity_match'],
                        'isLabel': is_label,
                        'encode': not is_label,
                        'isAmbiguous': False,
                        'ambiguity': list(),
                        'meta': {
                            'score': score,
                            'threshold': self.nl4dv_instance.match_thresholds['string_similarity'],
                            'alias': alias,
                            'ambiguity': {}
                        }
                    }

                    # Update Keyword-Attribute-Score Mappings
                    self.update_keyword_attribute_mappings(
                        keyword=query_ngrams[ngram]["lower"],
                        attribute=attr,
                        score=query_attributes[attr]["matchScore"])

    return query_attributes
def extract_attributes(self, query_ngrams):
    """
    Return relevant attributes of query.

    Runs every attribute detector in sequence (exact match, similarity, alias
    exact match, alias similarity, synonymity, domain value) and then prunes
    and annotates the combined result:

    1. when one keyword selected several attributes, only the best-scoring
       ones are kept (ties are retained);
    2. keywords contained in a longer keyword are discarded, together with
       the attributes only they selected;
    3. every surviving attribute is marked ambiguous or not and given a
       ``meta["confidence"]`` value.

    Args:
        query_ngrams: n-grams of the query, forwarded unchanged to each detector.

    Returns:
        dict mapping attribute name -> detection details.

    Side effects:
        Resets ``self.nl4dv_instance.attribute_keyword_mapping`` and
        ``self.nl4dv_instance.keyword_attribute_mapping`` and leaves them
        pruned consistently with the returned attributes.
    """
    nl4dv = self.nl4dv_instance

    # Values to be returned
    query_attributes = dict()

    # Map between attribute and (score, corresponding word) from the query
    nl4dv.attribute_keyword_mapping = dict()

    # Map between keyword and the attribute to find ambiguous attributes
    nl4dv.keyword_attribute_mapping = dict()

    # map of attributes, and their variants - stemmed, lowercase, ...
    data_attributes = self.get_data_attributes()

    # map of attribute aliases and their variants - stemmed, lowercase, ...
    attribute_aliases = self.get_attribute_aliases()

    # Detect attributes by token exact match
    query_attributes = self.detect_attributes_by_exact_match(query_ngrams, data_attributes, query_attributes)

    # Detect attributes by token similarity
    query_attributes = self.detect_attributes_by_similarity(query_ngrams, data_attributes, query_attributes)

    # Detect attributes by alias exact match
    query_attributes = self.detect_attributes_by_alias_exact_match(query_ngrams, data_attributes, query_attributes, attribute_aliases)

    # Detect attributes by alias similarity
    query_attributes = self.detect_attributes_by_alias_similarity(query_ngrams, data_attributes, query_attributes, attribute_aliases)

    # Detect attributes by synonymity
    query_attributes = self.detect_attributes_by_synonymity(query_ngrams, data_attributes, query_attributes)

    # Detect attributes by domain value match
    query_attributes = self.detect_attributes_from_domain_value(query_ngrams, data_attributes, query_attributes)

    # Bind the mappings only now, after the detectors have populated them.
    keyword_map = nl4dv.keyword_attribute_mapping
    attribute_map = nl4dv.attribute_keyword_mapping

    # ---------------------------------------------------------------------
    # Rule Based Filter to ensure 1 keyword maps to the best attribute(s).
    # THIS IS BY KEYWORD AND BY SCORE
    # ---------
    # Need the one with higher score from 2 attributes selected by same keyword.
    # For eg. Querying 'date' in airplane_crashes dataset results in 'Date'
    # (attribute similarity match) as well as 'Summary' (attribute domain
    # value match). Choose Date and discard Summary.
    # ---------
    # If same score, retain both.
    # For eg. Querying 'expensive' in cars dataset results in 'Retail Price'
    # (attribute alias similarity match) as well as 'Dealer Cost' (attribute
    # alias similarity match). Retain both Retail Price and Dealer Cost.
    # ---------
    attributes_to_delete = set()
    used_keywords = set()
    for attr, details in query_attributes.items():
        keywords = details["queryPhrase"]
        score = details['matchScore']
        for keyword in keywords:
            if keyword not in keyword_map:
                continue
            used_keywords.add(keyword)
            for _attr, _score in keyword_map[keyword].items():
                if score > _score:
                    attributes_to_delete.add(_attr)
                elif score < _score:
                    attributes_to_delete.add(attr)

    # Delete unused keywords from the main keyword-attribute mapping.
    # A snapshot of the keys is enough to delete safely while iterating.
    for key in list(keyword_map):
        if key not in used_keywords:
            del keyword_map[key]

    # Now, again delete a few attributes if they are coming from different
    # keywords. Ensure 1 keyword contributes to 1 attribute.
    for _key, candidates in keyword_map.items():
        for _attr in list(candidates):
            if _key not in query_attributes[_attr]["queryPhrase"]:
                del candidates[_attr]

    # ---------------------------------------------------------------------
    # If a keyword is a subset of another keyword, DISCARD the attributes
    # with the smaller keyword.
    # For e.g, "highway miles per gallon" should select only "highway miles
    # per gallon" and not "city miles per gallon" (due to "miles per gallon").
    # The inner dicts are not modified in this loop, so a shallow snapshot of
    # the outer mapping is sufficient.
    keyword_snapshot = dict(keyword_map)
    for k1 in keyword_snapshot:
        for k2 in keyword_snapshot:
            if k1 != k2 and k1 in k2:
                # Remove the "smaller" keyword (e.g. science fiction v/s fiction)
                if k1 in keyword_map:
                    del keyword_map[k1]

                # Remove the attribute of the "smaller" keyword IF it is
                # different from the "bigger" one.
                for _attr in keyword_snapshot[k1]:
                    if _attr not in keyword_snapshot[k2]:
                        attributes_to_delete.add(_attr)

    # ---------------------------------------------------------------------
    # Delete the now-unwanted attributes and keywords mapped to these attributes.
    for attr in attributes_to_delete:
        if attr in query_attributes:
            # 1) delete unwanted attributes from the main attributes object
            del query_attributes[attr]

            # 2) delete unwanted attributes from the attribute-keyword mapping object
            del attribute_map[attr]

            # 3) delete unwanted attributes from the keyword-attributes mapping object
            for candidates in keyword_map.values():
                if attr in candidates:
                    del candidates[attr]

    # Delete unwanted keywords in the finalized attributes
    for keywords in attribute_map.values():
        for k in list(keywords):
            if k not in keyword_map:
                del keywords[k]

    # ---------------------------------------------------------------------
    # Mark attributes as AMBIGUOUS OR NOT. If Ambiguous, append the ambiguities
    for attr, details in query_attributes.items():
        # Iterate over its keywords
        for keyword in details["queryPhrase"]:
            candidates = keyword_map[keyword]
            if len(candidates) > 1:
                if 'ambiguity' not in details:
                    details['ambiguity'] = list()

                # Mark it as ambiguous
                details['isAmbiguous'] = True

                for ambiguous_attr in candidates:
                    if ambiguous_attr not in details["ambiguity"] and ambiguous_attr != attr:
                        details["ambiguity"].append(ambiguous_attr)

                # Since ambiguous attributes so far have the same score, we
                # compute Ratio Similarity to disambiguate among them.
                details["meta"]["confidence"] = round(
                    helpers.compute_similarity(attr, keyword, "ratio_similarity"), 3)
            else:
                # Set as unambiguous by default
                details['isAmbiguous'] = False
                details["ambiguity"] = list()
                details["meta"]["confidence"] = 100

    # Clean-up both attribute and keyword mappings, if they are EMPTY / None.
    for attr in list(attribute_map):
        if not attribute_map[attr]:
            del attribute_map[attr]

    for keyword in list(keyword_map):
        if not keyword_map[keyword]:
            del keyword_map[keyword]

    return query_attributes
def detect_attributes_from_domain_value(self, query_ngrams, data_attributes, query_attributes):
    """
    Detect attributes implicitly, through query n-grams that match their domain values.

    Attributes whose dataType is 'Q' or 'T' are skipped. For the others, each
    n-gram (with its digit characters removed) is compared with every domain
    value:

    * an exact match with the lowercased value always counts;
    * otherwise the 'token_similarity' score must be 100 AND the n-gram must
      have at least 2 tokens, or be a single token matching a 2-token value;
      the match is recorded only if the n-gram has no more whitespace-separated
      words than the value.

    A matched attribute is (re)written into ``query_attributes`` with
    inferenceType 'implicit'; metrics of a previously detected entry are kept
    and 'attribute_domain_value_match' is appended to them.

    Args:
        query_ngrams: mapping of n-gram -> dict with a "lower" string.
        data_attributes: iterable of attribute names to inspect.
        query_attributes: attributes detected so far; updated in place.

    Returns:
        The same ``query_attributes`` object.
    """
    # Do NOT check for n_grams with numeric entities in the domain. They tend
    # to produce erroneous results, especially due to the TOKEN based
    # similarity algorithm. The stripping does not depend on the attribute,
    # so it is done once up front.
    stripped_ngrams = [
        ''.join(ch for ch in query_ngrams[ngram]["lower"] if not ch.isdigit())
        for ngram in query_ngrams
    ]

    keyword_value_mapping = dict()

    for attr in data_attributes:
        attr_info = self.nl4dv_instance.data_genie_instance.data_attribute_map[attr]

        # Look for domain value matches ONLY for ordinal and nominal variables.
        # For timeseries and quantitative attribute types, it is difficult to
        # map numbers to attributes AND this is computationally inefficient
        # due to their domain size.
        if attr_info["dataType"] in ['Q', 'T']:
            continue

        # RESET for each Attribute
        keyword_value_mapping[attr] = dict()
        matches = keyword_value_mapping[attr]

        for ngram_str in stripped_ngrams:
            # An n-gram made only of digits collapses to an empty keyword,
            # which would otherwise exact-match empty domain values.
            if not ngram_str.strip():
                continue

            add_attribute = False
            for d in attr_info['domain']:
                value_raw = str(d)
                value = value_raw.lower()

                # Exact match
                if ngram_str == value:
                    matches.setdefault(ngram_str, set()).add(value_raw)
                    add_attribute = True
                    continue

                # Check 1: Token Similarity score should be 100, i.e. at least
                # 1 word/n-gram in the query must match the attribute domain value.
                if helpers.compute_similarity(ngram_str, value, 'token_similarity') != 100:
                    continue

                # Check 2: The matched n-gram must either have 2 or more
                # tokens, or be 1 token of a 2-token domain value.
                value_tokens = list(word_tokenize(value))
                ngram_tokens = list(word_tokenize(ngram_str))
                if not (len(ngram_tokens) >= 2 or (len(ngram_tokens) == 1 and len(value_tokens) == 2)):
                    continue

                if len(ngram_str.split()) <= len(value.split()):
                    matches.setdefault(ngram_str, set()).add(value_raw)
                    add_attribute = True

            if not add_attribute:
                continue

            # Required: To filter out keyword subsets that point to the same
            # attribute, e.g. science fiction, fiction, science
            known_keywords = list(matches)
            for small in known_keywords:
                if any(small != big and small in big for big in known_keywords):
                    del matches[small]

            # When attributes are double defined, keep the earlier metrics.
            metrics = ["attribute_domain_value_match"]
            if attr in query_attributes:
                previous = query_attributes[attr]["metric"]
                # A metric stored as a bare string has no .append(); wrap it.
                metrics = [previous] if isinstance(previous, str) else list(previous)
                if "attribute_domain_value_match" not in metrics:
                    metrics.append("attribute_domain_value_match")

            query_attributes[attr] = {
                'name': attr,
                "queryPhrase": list(matches.keys()),
                'inferenceType': 'implicit',
                'matchScore': self.nl4dv_instance.match_scores['attribute']['attribute_domain_value_match'],
                'metric': metrics,
                'isLabel': attr_info["isLabelAttribute"],
                'isAmbiguous': False,
                'ambiguity': list(),
                'encode': False,
                'meta': {
                    'score': None,
                    'threshold': None,
                    'alias': None,
                    # keyword -> list of the raw domain values it matched
                    'ambiguity': {k: list(v) for k, v in matches.items()}
                }
            }

            # Update Keyword-Attribute-Score Mappings
            self.update_keyword_attribute_mappings(
                keyword=ngram_str,
                attribute=attr,
                score=query_attributes[attr]["matchScore"])

    return query_attributes
def detect_attributes_from_domain_value(self, query_ngrams, data_attributes, query_attributes):
    """
    Detect attributes implicitly, through query n-grams that match their domain values.

    The label attribute (``self.nl4dv_instance.label_attribute``) and every
    attribute whose dataType is not 'N' or 'O' are skipped. For the others,
    each n-gram (with its digit characters removed) is compared with every
    domain value: an exact match with the lowercased value counts, and so does
    a 'token_similarity' score of 100 provided the n-gram has no more
    whitespace-separated words than the value.

    A matched attribute is (re)written into ``query_attributes`` with
    inferenceType 'implicit'; metrics of a previously detected entry are kept
    and 'attribute_domain_value_match' is appended to them.

    Args:
        query_ngrams: mapping of n-gram -> dict with a "lower" string.
        data_attributes: iterable of attribute names to inspect.
        query_attributes: attributes detected so far; updated in place.

    Returns:
        The same ``query_attributes`` object.
    """
    # Do NOT check for n_grams with numeric entities in the domain. They tend
    # to produce erroneous results, especially due to the TOKEN based
    # similarity algorithm. The stripping does not depend on the attribute,
    # so it is done once up front.
    stripped_ngrams = [
        ''.join(ch for ch in query_ngrams[ngram]["lower"] if not ch.isdigit())
        for ngram in query_ngrams
    ]

    keyword_value_mapping = dict()

    for attr in data_attributes:
        # Note: skipping attributes already found by another metric is NOT
        # right here, as the domain values are a prime way of applying
        # categorical filters.

        # ToDo:- Let's NOT look for domain value matches in the Label
        # Attribute. Controversial, but that's how we've designed this.
        if attr == self.nl4dv_instance.label_attribute:
            continue

        attr_info = self.nl4dv_instance.data_genie_instance.data_attribute_map[attr]

        # Look for domain value matches ONLY for ordinal and nominal variables.
        # For timeseries and quantitative attribute types, it is difficult to
        # map numbers to attributes AND this is computationally inefficient
        # due to their domain size.
        if attr_info["dataType"] not in ['N', 'O']:
            continue

        # RESET for each Attribute
        keyword_value_mapping[attr] = dict()
        matches = keyword_value_mapping[attr]

        for ngram_str in stripped_ngrams:
            # An n-gram made only of digits collapses to an empty keyword,
            # which would otherwise exact-match empty domain values.
            if not ngram_str.strip():
                continue

            add_attribute = False
            for d in attr_info['domain']:
                value_raw = str(d)
                value = value_raw.lower()

                # Exact match
                if ngram_str == value:
                    matches.setdefault(ngram_str, set()).add(value_raw)
                    add_attribute = True
                    continue

                string_similarity_score = helpers.compute_similarity(
                    ngram_str, value, 'token_similarity')
                if string_similarity_score != 100:
                    continue

                # Keep the keyword only if it is not longer (in words) than
                # the domain value it matched.
                if len(ngram_str.split()) <= len(value.split()):
                    matches.setdefault(ngram_str, set()).add(value_raw)
                    add_attribute = True

            if not add_attribute:
                continue

            # Required: To filter out keyword subsets that point to the same
            # attribute, e.g. science fiction, fiction, science
            known_keywords = list(matches)
            for small in known_keywords:
                if any(small != big and small in big for big in known_keywords):
                    del matches[small]

            # When attributes are double defined, keep the earlier metrics.
            metrics = ["attribute_domain_value_match"]
            if attr in query_attributes:
                previous = query_attributes[attr]["metric"]
                # A metric stored as a bare string has no .append(); wrap it.
                metrics = [previous] if isinstance(previous, str) else list(previous)
                if "attribute_domain_value_match" not in metrics:
                    metrics.append("attribute_domain_value_match")

            query_attributes[attr] = {
                'name': attr,
                "queryPhrase": list(matches.keys()),
                'inferenceType': 'implicit',
                'matchScore': self.nl4dv_instance.match_scores['attribute']['attribute_domain_value_match'],
                'metric': metrics,
                'isLabel': attr_info["isLabelAttribute"],
                'isAmbiguous': False,
                'ambiguity': list(),
                'encode': False,
                'meta': {
                    'score': None,
                    'threshold': None,
                    'alias': None,
                    # keyword -> list of the raw domain values it matched
                    'ambiguity': {k: list(v) for k, v in matches.items()}
                }
            }

            # Update Keyword-Attribute-Score Mappings
            self.update_keyword_attribute_mappings(
                keyword=ngram_str,
                attribute=attr,
                score=query_attributes[attr]["matchScore"])

    return query_attributes
def detect_attributes_by_similarity(self, query_ngrams, data_attributes, query_attributes):
    """
    Detect attributes whose name equals, or is token-similar to, a query n-gram.

    For each attribute the n-grams are scanned in order. An n-gram matches
    exactly (score 100) when the lowercase or the stemmed lowercase forms are
    equal; otherwise the 'token_similarity' score of the lowercase forms, and
    failing that of the stemmed forms, must reach
    ``match_thresholds['string_similarity']``. A match is written to
    ``query_attributes`` when the attribute is new or its stored
    ``meta.score`` does not exceed the new score. Scanning stops for the
    attribute at its first exact match.

    Args:
        query_ngrams: mapping of n-gram -> dict with "lower" and "stemmed_lower".
        data_attributes: mapping of attribute -> dict with "lower" and "stemmed_lower".
        query_attributes: attributes detected so far; updated in place.

    Returns:
        The same ``query_attributes`` object.
    """
    for attr, attr_forms in data_attributes.items():
        for ngram_forms in query_ngrams.values():
            matched_exactly = (
                attr_forms["lower"] == ngram_forms["lower"]
                or attr_forms["stemmed_lower"] == ngram_forms["stemmed_lower"]
            )

            if matched_exactly:
                score = 100
            else:
                # Try the plain tokens first, the stemmed tokens second; the
                # first form that reaches the threshold provides the score.
                score = None
                for form in ("lower", "stemmed_lower"):
                    candidate = helpers.compute_similarity(
                        attr_forms[form], ngram_forms[form], 'token_similarity')
                    if candidate >= self.nl4dv_instance.match_thresholds['string_similarity']:
                        score = candidate
                        break

            if score is not None:
                if attr not in query_attributes or query_attributes[attr]["meta"]["score"] <= score:
                    is_label = self.nl4dv_instance.data_genie_instance.data_attribute_map[attr]["isLabelAttribute"]
                    query_attributes[attr] = {
                        'name': attr,
                        "queryPhrase": [ngram_forms["lower"]],
                        'inferenceType': 'explicit',
                        'matchScore': self.nl4dv_instance.match_scores['attribute']['attribute_similarity_match'],
                        'metric': ['attribute_similarity_match'],
                        'isLabel': is_label,
                        'encode': not is_label,
                        'isAmbiguous': False,
                        'ambiguity': list(),
                        'meta': {
                            'score': score,
                            'threshold': self.nl4dv_instance.match_thresholds['string_similarity'],
                            'alias': None,
                            'ambiguity': {}
                        }
                    }

                    # Update Keyword-Attribute-Score Mappings
                    self.update_keyword_attribute_mappings(
                        keyword=ngram_forms["lower"],
                        attribute=attr,
                        score=query_attributes[attr]["matchScore"])

            # Important! An exact match settles this attribute, so the
            # remaining n-grams can be skipped.
            if matched_exactly:
                break

    return query_attributes