Esempio n. 1
0
    def detect_attributes_by_alias_similarity(self, query_ngrams, data_attributes, query_attributes, attribute_aliases):
        """
        Add attributes detected through the alias-similarity pass.

        For every data attribute that is not yet part of ``query_attributes``,
        each of its aliases is paired with each query n-gram. The first pair
        whose ``token_similarity`` score is exactly 100 (raw lowercase forms
        first, stemmed lowercase forms as a fallback) registers the attribute
        with the ``attribute_alias_similarity_match`` metric and records the
        alias that was in scope.

        :param query_ngrams: mapping of n-gram key -> variants; the "lower" and
            "stemmed_lower" entries are read.
        :param data_attributes: mapping of attribute name -> variants; the
            "lower" and "stemmed_lower" entries are read.
        :param query_attributes: attributes detected so far; updated in place.
        :param attribute_aliases: mapping of attribute name -> iterable of aliases.
        :return: the same ``query_attributes`` object that was passed in.
        """
        for attr in data_attributes:

            # An attribute that is already present is never overwritten by this pass
            # (entries are only created when the attribute is missing), so there is
            # nothing to compute for it. This also covers the "already found by alias
            # exact match" case.
            if attr in query_attributes:
                continue

            for alias in attribute_aliases[attr]:
                for ngram in query_ngrams:
                    # NOTE(review): the similarity is computed against the attribute's own
                    # name variants, not against `alias`; the alias is only recorded in
                    # `meta`. Kept as-is - confirm whether the alias text was meant to be compared.
                    score = helpers.compute_similarity(query_ngrams[ngram]["lower"], data_attributes[attr]["lower"], 'token_similarity')
                    if score != 100:
                        # Fall back to the stemmed forms
                        score = helpers.compute_similarity(query_ngrams[ngram]["stemmed_lower"], data_attributes[attr]["stemmed_lower"], 'token_similarity')
                        if score != 100:
                            continue

                    query_attributes[attr] = {
                        'name': attr,
                        "queryPhrase": [query_ngrams[ngram]["lower"]],
                        'inferenceType': 'explicit',
                        'matchScore': self.nl4dv_instance.match_scores['attribute']['attribute_alias_similarity_match'],
                        # A list, like every other detector: later passes iterate over it
                        # and append further metric names to it.
                        'metric': ['attribute_alias_similarity_match'],
                        'isLabel': self.nl4dv_instance.data_genie_instance.data_attribute_map[attr]["isLabelAttribute"],
                        'encode': not self.nl4dv_instance.data_genie_instance.data_attribute_map[attr]["isLabelAttribute"],
                        'isAmbiguous': False,
                        'ambiguity': list(),
                        'meta': {
                            'score': score,
                            'threshold': self.nl4dv_instance.match_thresholds['string_similarity'],
                            'alias': alias,
                            'ambiguity': {}
                        }
                    }

                    # Update Keyword-Attribute-Score Mappings
                    self.update_keyword_attribute_mappings(keyword=query_ngrams[ngram]["lower"], attribute=attr, score=query_attributes[attr]["matchScore"])

                    # First matching n-gram wins; nothing further can be added for this attribute.
                    break

                if attr in query_attributes:
                    break

        return query_attributes
Esempio n. 2
0
    def extract_attributes(self, query_ngrams):
        """
        Return relevant attributes of query

        Resets the keyword/attribute mappings stored on ``self.nl4dv_instance``,
        runs every attribute detector in a fixed order, and then prunes the result:

        1. when two attributes share a keyword, only the higher-scoring one(s) survive;
        2. when one keyword is contained in another, the shorter keyword is dropped
           together with the attributes that only it pointed to;
        3. surviving attributes are flagged as ambiguous when their keyword still
           maps to more than one attribute.

        :param query_ngrams: mapping of n-gram key -> variants, forwarded to the detectors.
        :return: dict of attribute name -> attribute description.
        """
        # Values to be returned
        query_attributes = dict()

        # Map between attribute and (score, corresponding word) from the query
        self.nl4dv_instance.attribute_keyword_mapping = dict()

        # Map between keyword and the attribute to find ambiguous attributes
        self.nl4dv_instance.keyword_attribute_mapping = dict()

        # map of attributes, and their variants - stemmed, lowercase, ...
        data_attributes = self.get_data_attributes()

        # map of attributes and their aliases
        attribute_aliases = self.get_attribute_aliases()

        # Detect attributes by token exact match
        query_attributes = self.detect_attributes_by_exact_match(query_ngrams, data_attributes, query_attributes)

        # Detect attributes by token similarity
        query_attributes = self.detect_attributes_by_similarity(query_ngrams, data_attributes, query_attributes)

        # Detect attributes by alias exact match
        query_attributes = self.detect_attributes_by_alias_exact_match(query_ngrams, data_attributes, query_attributes, attribute_aliases)

        # Detect attributes by alias similarity
        query_attributes = self.detect_attributes_by_alias_similarity(query_ngrams, data_attributes, query_attributes, attribute_aliases)

        # Detect attributes by synonymity
        query_attributes = self.detect_attributes_by_synonymity(query_ngrams, data_attributes, query_attributes)

        # Detect attributes by domain value match
        query_attributes = self.detect_attributes_from_domain_value(query_ngrams, data_attributes, query_attributes)

        # Bound only AFTER the detectors have run; from here on the two mappings are
        # mutated in place, never rebound.
        keyword_map = self.nl4dv_instance.keyword_attribute_mapping
        attribute_map = self.nl4dv_instance.attribute_keyword_mapping

        # ---------------------------------------------------------------------------------------------------
        # Rule Based Filter to ensure 1 keyword maps to the best attribute(s). THIS IS BY KEYWORD AND BY SCORE
        # ---------
        # Need the one with higher score from 2 attributes selected by same keyword.
        # For eg. Querying 'date' in airplane_crashes dataset results in 'Date'  (attribute similarity match) as well as 'Summary' (attribute domain value match).
        # Choose Date and discard Summary.
        # ---------
        # If same score, retain both.
        # For eg. Querying 'expensive' in cars dataset results in 'Retail Price'  (attribute alias similarity match) as well as 'Dealer Cost' (attribute alias similarity match).
        # Retain both Retail Price and Dealer Cost
        # ---------
        attributes_to_delete = set()
        used_keywords = set()

        for attr in query_attributes:
            score = query_attributes[attr]['matchScore']
            for keyword in query_attributes[attr]["queryPhrase"]:
                # A query phrase without a mapping entry has no competitors to compare
                # against; indexing the mapping unconditionally would raise KeyError.
                if keyword not in keyword_map:
                    continue

                used_keywords.add(keyword)
                for _attr, _score in keyword_map[keyword].items():
                    if score > _score:
                        attributes_to_delete.add(_attr)
                    elif score < _score:
                        attributes_to_delete.add(attr)

        # Delete unused keywords from the main keyword_attribute_mapping dictionary.
        # (Key snapshots are enough to delete while iterating; no deep copy needed.)
        for key in list(keyword_map):
            if key not in used_keywords:
                del keyword_map[key]

        # Now, again delete a few attributes if they are coming from different keywords. Ensure 1 keyword contributes to 1 attribute
        for _key in list(keyword_map):
            for _attr in list(keyword_map[_key]):
                if _key not in query_attributes[_attr]["queryPhrase"]:
                    del keyword_map[_key][_attr]
        # ---------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------
        # If a keyword is a subset of another keyword
        # DISCARD the attributes with the smaller keyword
        # For e.g, "highway miles per gallon" should select only "highway miles per gallon" and not "city miles per gallon" (due to "miles per gallon")
        # A shallow snapshot keeps the entries of already-removed keywords reachable;
        # the inner dictionaries are only read in this loop.
        keyword_snapshot = dict(keyword_map)
        for k1 in keyword_snapshot:
            for k2 in keyword_snapshot:
                if k1 != k2 and k1 in k2:
                    # Remove the "smaller" keyword (e.g. science fiction v/s fiction)
                    keyword_map.pop(k1, None)

                    # Remove the attribute of the "smaller" keyword IF it is different from the "bigger" one.
                    for _attr in keyword_snapshot[k1]:
                        if _attr not in keyword_snapshot[k2]:
                            attributes_to_delete.add(_attr)
        # ---------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------
        #  Delete the now-unwanted attributes and keywords mapped to these attributes.
        for attr in attributes_to_delete:
            if attr in query_attributes:
                # 1) delete unwanted attributes from the main attributes object
                del query_attributes[attr]

                # 2) delete unwanted attributes from the attribute-keyword mapping object
                del attribute_map[attr]

                # 3) delete unwanted attributes from the keyword-attributes mapping object
                for k in keyword_map:
                    if attr in keyword_map[k]:
                        del keyword_map[k][attr]

        # Delete unwanted keywords in the finalized attributes
        for attr in list(attribute_map):
            for k in list(attribute_map[attr]):
                if k not in keyword_map:
                    del attribute_map[attr][k]

        # ---------------------------------------------------------------------------------------------------
        # ---------------------------------------------------------------------------------------------------
        #  Mark attributes as AMBIGUOUS OR NOT. If Ambiguous, append the ambiguities
        for attr in query_attributes:
            # Iterate over its keywords
            for keyword in query_attributes[attr]["queryPhrase"]:
                # A keyword that no longer has a mapping entry cannot be ambiguous.
                candidates = keyword_map.get(keyword, {})
                if len(candidates) > 1:
                    for ambiguous_attr in candidates:
                        if 'ambiguity' not in query_attributes[attr]:
                            query_attributes[attr]['ambiguity'] = list()

                        # Mark it as ambiguous
                        query_attributes[attr]['isAmbiguous'] = True

                        if ambiguous_attr not in query_attributes[attr]["ambiguity"] and ambiguous_attr != attr:
                            query_attributes[attr]["ambiguity"].append(ambiguous_attr)

                    # Since ambiguous attributes so far have the same score, we compute Ratio Similarity to disambiguate among them.
                    query_attributes[attr]["meta"]["confidence"] = round(helpers.compute_similarity(attr, keyword, "ratio_similarity"), 3)
                else:
                    # Set as unambiguous by default
                    query_attributes[attr]['isAmbiguous'] = False
                    query_attributes[attr]["ambiguity"] = list()
                    query_attributes[attr]["meta"]["confidence"] = 100

        # Clean-up both attribute and keyword mappings, if they are EMPTY / None.
        for attr in list(attribute_map):
            if not attribute_map[attr]:
                del attribute_map[attr]

        for keyword in list(keyword_map):
            if not keyword_map[keyword]:
                del keyword_map[keyword]

        return query_attributes
Esempio n. 3
0
    def detect_attributes_from_domain_value(self, query_ngrams, data_attributes, query_attributes):
        """
        Detect attributes implicitly, through query n-grams that match one of their domain values.

        Attributes whose ``dataType`` is 'Q' or 'T' are skipped. For the others,
        every n-gram (with digit characters removed) is compared with every
        domain value:

        * an exact, case-insensitive match always counts;
        * otherwise the ``token_similarity`` score must be 100 AND either the
          n-gram has at least two tokens, or it has one token and the domain
          value has exactly two.

        A matching attribute is (re)written into ``query_attributes`` as an
        'implicit' attribute. Metric names recorded by earlier detectors are
        kept and ``attribute_domain_value_match`` is appended to them.

        :param query_ngrams: mapping of n-gram key -> variants; the "lower" entry is read.
        :param data_attributes: iterable of attribute names to inspect.
        :param query_attributes: attributes detected so far; updated in place.
        :return: the same ``query_attributes`` object that was passed in.
        """
        for attr in data_attributes:
            attr_info = self.nl4dv_instance.data_genie_instance.data_attribute_map[attr]

            # Look for domain value matches ONLY for ordinal and nominal variables.
            # For timeseries and quantitative attribute types, it is difficult to map numbers to attributes AND this is computationally inefficient due to their domain size.
            if attr_info["dataType"] in ['Q', 'T']:
                continue

            # keyword -> set of raw domain values it matched; RESET for each attribute
            keyword_values = dict()

            for ngram in query_ngrams:

                # Do NOT check for n_grams with numeric entities in the domain. They tend to produce erroneous results, especially due to the TOKEN based similarity algorithm.
                ngram_str = ''.join(ch for ch in query_ngrams[ngram]["lower"] if not ch.isdigit())

                # Nothing but digits/whitespace in this n-gram: there is no keyword left to match.
                if not ngram_str.strip():
                    continue

                # Tokenized lazily, at most once per n-gram (it does not depend on the domain value).
                ngram_tokens = None

                add_attribute = False
                for d in attr_info['domain']:
                    value_raw = str(d)
                    value = value_raw.lower()

                    # Exact match
                    if ngram_str == value:
                        keyword_values.setdefault(ngram_str, set()).add(value_raw)
                        add_attribute = True
                        continue

                    # Check 1: Token Similarity score should be 100, i.e. at least 1 word/n-gram in the query must match the attribute domain value
                    if helpers.compute_similarity(ngram_str, value, 'token_similarity') != 100:
                        continue

                    # Check 2: the n-gram must have 2 or more tokens, OR be a single token matching a 2-token domain value.
                    if ngram_tokens is None:
                        ngram_tokens = list(word_tokenize(ngram_str))
                    if not (len(ngram_tokens) >= 2 or (len(ngram_tokens) == 1 and len(list(word_tokenize(value))) == 2)):
                        continue

                    # Only remember the keyword when it is not longer (in words) than the value it matched.
                    if len(ngram_str.split()) <= len(value.split()):
                        keyword_values.setdefault(ngram_str, set()).add(value_raw)

                    add_attribute = True

                if not add_attribute:
                    continue

                # Required: To filter out keyword subsets that point to the same attribute, e.g. science fiction, fiction, science
                known_keywords = list(keyword_values)
                for k1 in known_keywords:
                    if any(k1 != k2 and k1 in k2 for k2 in known_keywords):
                        del keyword_values[k1]

                # When attributes are double defined, keep the metrics recorded so far
                metrics = ["attribute_domain_value_match"]
                if attr in query_attributes:
                    metrics = query_attributes[attr]["metric"]
                    # Tolerate a detector that stored a single metric name instead of a list.
                    if isinstance(metrics, str):
                        metrics = [metrics]
                    if "attribute_domain_value_match" not in metrics:
                        metrics.append("attribute_domain_value_match")

                query_attributes[attr] = {
                    'name': attr,
                    "queryPhrase": list(keyword_values.keys()),
                    'inferenceType': 'implicit',
                    'matchScore': self.nl4dv_instance.match_scores['attribute']['attribute_domain_value_match'],
                    'metric': metrics,
                    'isLabel': attr_info["isLabelAttribute"],
                    'isAmbiguous': False,
                    'ambiguity': list(),
                    'encode': False,
                    'meta': {
                        'score': None,
                        'threshold': None,
                        'alias': None,
                        # keyword -> list of the raw domain values it matched
                        'ambiguity': {k: list(v) for k, v in keyword_values.items()}
                    }
                }

                # Update Keyword-Attribute-Score Mappings
                self.update_keyword_attribute_mappings(keyword=ngram_str, attribute=attr, score=query_attributes[attr]["matchScore"])

        return query_attributes
Esempio n. 4
0
    def detect_attributes_from_domain_value(self, query_ngrams,
                                            data_attributes, query_attributes):
        """
        Detect attributes implicitly, through query n-grams that match one of their domain values.

        The label attribute and every attribute whose ``dataType`` is not 'N'
        or 'O' are skipped. For the others, every n-gram (with digit characters
        removed) is compared with every domain value; an exact,
        case-insensitive match or a ``token_similarity`` score of 100 counts
        as a hit.

        A matching attribute is (re)written into ``query_attributes`` as an
        'implicit' attribute. Metric names recorded by earlier detectors are
        kept and ``attribute_domain_value_match`` is appended to them.

        :param query_ngrams: mapping of n-gram key -> variants; the "lower" entry is read.
        :param data_attributes: iterable of attribute names to inspect.
        :param query_attributes: attributes detected so far; updated in place.
        :return: the same ``query_attributes`` object that was passed in.
        """
        for attr in data_attributes:

            # ToDo:- Let's NOT look for domain value matches in the Label Attribute. Controversial, but that's how we've designed this.
            if attr == self.nl4dv_instance.label_attribute:
                continue

            attr_info = self.nl4dv_instance.data_genie_instance.data_attribute_map[attr]

            # Look for domain value matches ONLY for ordinal and nominal variables.
            # For timeseries and quantitative attribute types, it is difficult to map numbers to attributes AND this is computationally inefficient due to their domain size.
            if attr_info["dataType"] not in ['N', 'O']:
                continue

            # keyword -> set of raw domain values it matched; RESET for each attribute
            keyword_values = dict()

            for ngram in query_ngrams:

                # Do NOT check for n_grams with numeric entities in the domain. They tend to produce erroneous results, especially due to the TOKEN based similarity algorithm.
                ngram_str = ''.join(
                    ch for ch in query_ngrams[ngram]["lower"]
                    if not ch.isdigit())

                # Nothing but digits/whitespace in this n-gram: there is no keyword left to match.
                if not ngram_str.strip():
                    continue

                add_attribute = False
                for d in attr_info['domain']:
                    value_raw = str(d)
                    value = value_raw.lower()

                    # Exact match
                    if ngram_str == value:
                        keyword_values.setdefault(ngram_str,
                                                  set()).add(value_raw)
                        add_attribute = True
                        continue

                    if helpers.compute_similarity(
                            ngram_str, value, 'token_similarity') != 100:
                        continue

                    # Only remember the keyword when it is not longer (in words) than the value it matched.
                    if len(ngram_str.split()) <= len(value.split()):
                        keyword_values.setdefault(ngram_str,
                                                  set()).add(value_raw)

                    add_attribute = True

                if not add_attribute:
                    continue

                # Required: To filter out keyword subsets that point to the same attribute, e.g. science fiction, fiction, science
                known_keywords = list(keyword_values)
                for k1 in known_keywords:
                    if any(k1 != k2 and k1 in k2 for k2 in known_keywords):
                        del keyword_values[k1]

                # When attributes are double defined, keep the metrics recorded so far
                metrics = ["attribute_domain_value_match"]
                if attr in query_attributes:
                    metrics = query_attributes[attr]["metric"]
                    # Tolerate a detector that stored a single metric name instead of a list.
                    if isinstance(metrics, str):
                        metrics = [metrics]
                    if "attribute_domain_value_match" not in metrics:
                        metrics.append("attribute_domain_value_match")

                query_attributes[attr] = {
                    'name': attr,
                    "queryPhrase": list(keyword_values.keys()),
                    'inferenceType': 'implicit',
                    'matchScore': self.nl4dv_instance.match_scores['attribute']['attribute_domain_value_match'],
                    'metric': metrics,
                    'isLabel': attr_info["isLabelAttribute"],
                    'isAmbiguous': False,
                    'ambiguity': list(),
                    'encode': False,
                    'meta': {
                        'score': None,
                        'threshold': None,
                        'alias': None,
                        # keyword -> list of the raw domain values it matched
                        'ambiguity': {k: list(v) for k, v in keyword_values.items()}
                    }
                }

                # Update Keyword-Attribute-Score Mappings
                self.update_keyword_attribute_mappings(
                    keyword=ngram_str,
                    attribute=attr,
                    score=query_attributes[attr]["matchScore"])

        return query_attributes
Esempio n. 5
0
    def detect_attributes_by_similarity(self, query_ngrams, data_attributes,
                                        query_attributes):
        """
        Detect attributes whose name equals, or is token-similar to, a query n-gram.

        Each attribute is compared with each n-gram. Identical lowercase or
        identical stemmed forms score 100; otherwise the ``token_similarity``
        of the lowercase forms, then of the stemmed forms, must reach the
        configured ``string_similarity`` threshold. A hit (re)writes the
        attribute into ``query_attributes`` unless the entry already stored
        there carries a strictly higher ``meta.score``.

        :param query_ngrams: mapping of n-gram key -> variants ("lower", "stemmed_lower").
        :param data_attributes: mapping of attribute name -> variants ("lower", "stemmed_lower").
        :param query_attributes: attributes detected so far; updated in place.
        :return: the same ``query_attributes`` object that was passed in.
        """
        for attr in data_attributes:
            # Set on the first identical match and never cleared for this attribute.
            is_exact_match = False

            for ngram in query_ngrams:
                attr_forms = data_attributes[attr]
                ngram_forms = query_ngrams[ngram]

                matched = False
                score = 0

                same_text = attr_forms["lower"] == ngram_forms["lower"]
                if same_text or attr_forms["stemmed_lower"] == ngram_forms["stemmed_lower"]:
                    # Exact Match
                    matched = True
                    is_exact_match = True
                    score = 100
                else:
                    # Similarity of the plain tokens first ...
                    plain_score = helpers.compute_similarity(
                        attr_forms["lower"], ngram_forms["lower"],
                        'token_similarity')
                    if plain_score >= self.nl4dv_instance.match_thresholds['string_similarity']:
                        matched = True
                        score = plain_score
                    else:
                        # ... then of the stemmed tokens
                        stemmed_score = helpers.compute_similarity(
                            attr_forms["stemmed_lower"],
                            ngram_forms["stemmed_lower"], 'token_similarity')
                        if stemmed_score >= self.nl4dv_instance.match_thresholds['string_similarity']:
                            matched = True
                            score = stemmed_score

                if not matched:
                    continue

                # Keep an existing entry that scored strictly higher.
                if attr in query_attributes and not (
                        query_attributes[attr]["meta"]["score"] <= score):
                    continue

                instance = self.nl4dv_instance
                query_attributes[attr] = {
                    'name': attr,
                    "queryPhrase": [ngram_forms["lower"]],
                    'inferenceType': 'explicit',
                    'matchScore': instance.match_scores['attribute']['attribute_similarity_match'],
                    'metric': ['attribute_similarity_match'],
                    'isLabel': instance.data_genie_instance.data_attribute_map[attr]["isLabelAttribute"],
                    'encode': not instance.data_genie_instance.data_attribute_map[attr]["isLabelAttribute"],
                    'isAmbiguous': False,
                    'ambiguity': list(),
                    'meta': {
                        'score': score,
                        'threshold': instance.match_thresholds['string_similarity'],
                        'alias': None,
                        'ambiguity': {}
                    }
                }

                # Update Keyword-Attribute-Score Mappings
                self.update_keyword_attribute_mappings(
                    keyword=ngram_forms["lower"],
                    attribute=attr,
                    score=query_attributes[attr]["matchScore"])

                # Important! If, the attribute is detected by exact match, then break the loop. We can SKIP subsequent n-grams.
                if is_exact_match:
                    break

        return query_attributes