Example 1

import pandas as pd
import yaml
from flashtext import KeywordProcessor
# (repo-internal imports omitted: Predictor, DatasetManager, merge_dict,
#  extract_annotations_as_generator)
class FromDatasetsNamedEntitiesPredictor(Predictor):
    """ Predicts named entities of a text by looking up terms in a dataset."""

    # note: class-level attributes, shared across all instances
    location_strings = {}
    dataset = pd.DataFrame(columns=["term", "entity_code", "parent_terms"],
                           dtype=str)
    flashtext = None
    marked_for_removal = []

    def __init__(self, predictor_config):
        super().__init__(predictor_config)
        self.load_datasets(predictor_config["datasets"])

    @property
    def config_validation_schema_custom_part(self):
        return yaml.load(
            """
            datasets:
                type: list
                schema:
                    type: dict
                    schema:
                        code:
                            type: string
                            required: True
                        location:
                            type: string
                            regex: "^.+?:.+"
                            required: True
            """,
            Loader=yaml.FullLoader,
        )

    def load_datasets(self, entity_code_location_string_dict):
        for entity_code_location_string in entity_code_location_string_dict:
            entity_code = entity_code_location_string["code"]
            location_string = entity_code_location_string["location"]
            # remember location string
            self.location_strings[entity_code] = location_string
            # load entities into dataset
            new_data = DatasetManager.load_dataset_from_location_string(
                location_string, {
                    "term": str,
                    "entity_code": str,
                    "parent_terms": str
                })[0]
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead
            self.dataset = pd.concat([self.dataset, new_data])
            # rebuild flashtext from the full (accumulated) dataset
            self.flashtext = KeywordProcessor()
            data_for_flashtext = pd.DataFrame({
                "against": [
                    "`{}``SN``{}`´".format(row["term"], row["entity_code"])
                    if not row["parent_terms"] else "`{}``PN``{}``{}`´".format(
                        row["term"], row["entity_code"], row["parent_terms"])
                    for index, row in self.dataset.iterrows()
                ],
                "replace":
                self.dataset["term"],
            })
            dict_for_flashtext = data_for_flashtext.set_index(
                "against").T.to_dict("list")
            self.flashtext.add_keywords_from_dict(dict_for_flashtext)

    def add_named_entity_term_to_dataset(self, term, entity_code,
                                         parent_terms):
        new_row = pd.DataFrame({
            "term": [term],
            "entity_code": [entity_code],
            "parent_terms": [parent_terms],
        })
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        self.dataset = pd.concat([self.dataset, new_row])
        if parent_terms != "":
            self.flashtext.add_keywords_from_dict({
                "`{}``PN``{}``{}`´".format(term, entity_code, parent_terms):
                [term]
            })
        else:
            self.flashtext.add_keywords_from_dict(
                {"`{}``SN``{}`´".format(term, entity_code): [term]})

    def remove_named_entity_term_from_dataset(self, term, entity_code):
        self.dataset = self.dataset[~(
            (self.dataset["term"] == term)
            & (self.dataset["entity_code"] == entity_code))]
        self.flashtext.remove_keyword(term)

    def save_dataset(self, location_string, entity_code):
        # get the named entities with the specified entity code
        filtered_named_entities = self.dataset[self.dataset["entity_code"] ==
                                               entity_code].copy()
        # sort the filtered named entities for convenience
        filtered_named_entities["sort"] = filtered_named_entities[
            "term"].str.lower()
        filtered_named_entities = filtered_named_entities.sort_values(
            by=["sort"])
        del filtered_named_entities["sort"]
        # save the dataset
        DatasetManager.save_dataset_to_location_string(filtered_named_entities,
                                                       location_string)

    def mark_named_entity_term_for_removal(self, term, entity_code):
        if (term, entity_code) not in self.marked_for_removal:
            self.marked_for_removal.append((term, entity_code))

    def reset_marked_for_removal(self):
        self.marked_for_removal = []

    def get_parent_terms_for_named_entity(self, term, entity_code):
        # check if we have corresponding parent terms in the named entities dataset
        dataset_query_result = list(
            self.dataset[(self.dataset["entity_code"] == entity_code)
                         & (self.dataset["term"] == term)]["parent_terms"])
        if len(dataset_query_result) > 0:
            # we got a row back
            # return either the parent terms or None depending on parent_terms value in dataset
            dataset_query_result = dataset_query_result[0]
            return (None if dataset_query_result is None or
                    pd.isnull(dataset_query_result) else dataset_query_result)
        else:
            # no, no parent terms found in dataset
            return None

    def learn_from_annotated_text(self, annotated_text, language):
        # note: the definition of a "term" within this function is a tuple of term and entity code
        # get terms to add/update
        terms_to_add = {}
        parented_terms_to_update = []
        affected_entity_codes = []
        for annotation in extract_annotations_as_generator(
                annotated_text,
                types_to_extract=[
                    "standalone_named_entity", "parented_named_entity"
                ],
        ):
            if (len(self.dataset[(self.dataset["term"] == annotation["term"])
                                 & (self.dataset["entity_code"] ==
                                    annotation["entity_code"])]) == 0):
                # term does not exist yet
                terms_to_add = merge_dict(
                    terms_to_add,
                    {
                        (annotation["term"], annotation["entity_code"]):
                        annotation["parent_terms"]
                        if "parent_terms" in annotation else ""
                    },
                )
                affected_entity_codes.append(annotation["entity_code"])
            else:
                # term exists but may need update due to different parent terms
                if "parent_terms" in annotation:
                    currently_stored_parent_terms = list(self.dataset[
                        (self.dataset["term"] == annotation["term"])
                        & (self.dataset["entity_code"] ==
                           annotation["entity_code"])]["parent_terms"])[0]
                    if currently_stored_parent_terms != annotation[
                            "parent_terms"]:
                        # needs update
                        terms_to_add = merge_dict(
                            terms_to_add,
                            {
                                (
                                    annotation["term"],
                                    annotation["entity_code"],
                                ):
                                annotation["parent_terms"]
                                if "parent_terms" in annotation else ""
                            },
                        )
                        parented_terms_to_update.append(
                            (annotation["term"], annotation["entity_code"]))
                        affected_entity_codes.append(annotation["entity_code"])

        # get total terms to remove
        terms_to_remove = []
        for term in self.marked_for_removal:
            if term in terms_to_add:
                continue
            terms_to_remove.append(term)
            affected_entity_codes.append(term[1])
        terms_to_remove.extend(parented_terms_to_update)

        # update key terms dataset (incl. flashtext)
        # remove
        if terms_to_remove:
            for term in terms_to_remove:
                self.remove_named_entity_term_from_dataset(term[0], term[1])
        # add
        if terms_to_add:
            for term in terms_to_add:
                self.add_named_entity_term_to_dataset(term[0], term[1],
                                                      terms_to_add[term])
        # save
        for affected_entity_code in affected_entity_codes:
            if affected_entity_code in self.location_strings:
                self.save_dataset(self.location_strings[affected_entity_code],
                                  affected_entity_code)

    def predict_inline_annotations(self, text, language="en-US"):
        return (self.flashtext.replace_keywords(text)
                if self.flashtext is not None else text)
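
The non-obvious part of this predictor is the marker format: each term is
mapped to an annotated form such as `term``SN``entity_code`´ (standalone) or
`term``PN``entity_code``parent_terms`´ (parented), and flashtext's
replace_keywords then swaps the plain term for that marker. A simplified,
self-contained sketch of the trick (the term and entity code are made up):

from flashtext import KeywordProcessor

kp = KeywordProcessor()
# clean name = annotated marker, keyword = plain term, so replace_keywords
# rewrites each occurrence of the plain term into its inline annotation
kp.add_keywords_from_dict({"`Berlin``SN``location`´": ["Berlin"]})
print(kp.replace_keywords("She moved to Berlin last year."))
# She moved to `Berlin``SN``location`´ last year.
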
Example 2

from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
# Keywords can also be added from a list:
keyword_processor.add_keywords_from_list(["java", "python"])
keyword_processor.extract_keywords(
    'I am a product manager for a java_2e platform')
# output ['product management', 'java']

# Remove keywords
keyword_processor = KeywordProcessor()
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(
    keyword_processor.extract_keywords(
        'I am a product manager for a java_2e platform'))
# output ['product management', 'java']
keyword_processor.remove_keyword('java_2e')
print(
    keyword_processor.extract_keywords(
        'I am a product manager for a java_2e platform'))
# ['product management']

# You can also remove keywords from a list or a dictionary
keyword_processor.remove_keywords_from_dict({"product management": ["PM"]})
keyword_processor.remove_keywords_from_list(["java programing"])
keyword_processor.extract_keywords(
    'I am a product manager for a java_2e platform')
# output ['product management']

# Count the added keywords
keyword_processor = KeywordProcessor()
# For dict-format keywords, the key is the clean term returned on a match;
# the key itself is not part of the searched vocabulary.
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(len(keyword_processor))
# output 4
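
Beyond counting, extract_keywords can also report where each match occurred.
A small sketch (not part of the original snippet) using the span_info flag,
which makes it return (clean_name, start, end) tuples:

keyword_processor.extract_keywords(
    'I am a product manager for a java_2e platform', span_info=True)
# output [('product management', 7, 22), ('java', 29, 36)]
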
Example 3

import pandas as pd
import yaml
from flashtext import KeywordProcessor
# (repo-internal imports omitted: Predictor, DatasetManager, merge_dict,
#  extract_annotations_as_generator)
class FromDatasetKeyTermsPredictor(Predictor):
    """ Predicts key terms of a text by looking up terms in a dataset."""

    location_string = None
    dataset = pd.DataFrame(columns=["term", "parent_terms"], dtype=str)
    flashtext = None
    key_terms_marked_for_removal = []

    def __init__(self, predictor_config):
        super().__init__(predictor_config)
        self.load_dataset(predictor_config["location"])

    @property
    def config_validation_schema_custom_part(self):
        return yaml.load(
            """
            location:
                type: string
                regex: "^.+?:.+"
                required: True
            """,
            Loader=yaml.FullLoader,
        )

    def load_dataset(self, location_string):
        # update location_string
        self.location_string = location_string
        # load data
        self.dataset = DatasetManager.load_dataset_from_location_string(
            location_string, {
                "term": str,
                "parent_terms": str
            })[0]
        # setup flashtext for later string replacements
        temp_replace_against_dataset = self.dataset.copy()
        temp_replace_against_dataset["replace"] = temp_replace_against_dataset[
            "term"]
        temp_replace_against_dataset["against"] = temp_replace_against_dataset[
            "replace"]
        temp_replace_against_dataset.loc[
            temp_replace_against_dataset["parent_terms"] != "",
            "against"] = ("`" + temp_replace_against_dataset["term"] +
                          "``PK``" +
                          temp_replace_against_dataset["parent_terms"] + "`´")
        temp_replace_against_dataset.loc[
            temp_replace_against_dataset["parent_terms"] == "",
            "against"] = ("`" + temp_replace_against_dataset["term"] +
                          "``SK`´")
        temp_replace_against_dataset = temp_replace_against_dataset[[
            "replace", "against"
        ]]
        temp_replace_against_dataset_as_dict = {
            row["against"]: [row["replace"]]
            for index, row in temp_replace_against_dataset.iterrows()
        }
        self.flashtext = KeywordProcessor()
        self.flashtext.add_keywords_from_dict(
            temp_replace_against_dataset_as_dict)

    def add_key_term_to_dataset(self, key_term, parent_terms):
        new_row = pd.DataFrame({
            "term": [key_term],
            "parent_terms": [parent_terms]
        })
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        self.dataset = pd.concat([self.dataset, new_row])
        if parent_terms != "":
            self.flashtext.add_keywords_from_dict(
                {"`{}``PK``{}`´".format(key_term, parent_terms): [key_term]})
        else:
            self.flashtext.add_keywords_from_dict(
                {"`{}``SK`´".format(key_term): [key_term]})

    def remove_key_term_from_dataset(self, key_term):
        self.dataset = self.dataset[self.dataset.term != key_term]
        self.flashtext.remove_keyword(key_term)

    def save_dataset(self, location_string):
        # sort the key terms dataset for convenience
        self.dataset["sort"] = self.dataset["term"].str.lower()
        self.dataset = self.dataset.sort_values(by=["sort"])
        del self.dataset["sort"]
        # save the dataset
        DatasetManager.save_dataset_to_location_string(self.dataset,
                                                       location_string)

    def mark_key_term_for_removal(self, key_term):
        if key_term not in self.key_terms_marked_for_removal:
            self.key_terms_marked_for_removal.append(key_term)

    def reset_key_terms_marked_for_removal(self):
        self.key_terms_marked_for_removal = []

    def learn_from_annotated_text(self, annotated_text, language):
        # get terms to add/update
        key_terms_to_add = {}
        parented_terms_to_update = []
        existing_terms_list = list(self.dataset["term"])
        for annotation in extract_annotations_as_generator(
                annotated_text,
                types_to_extract=["standalone_key_term", "parented_key_term"],
        ):
            if annotation["term"] not in existing_terms_list:
                # term does not exist yet
                key_terms_to_add = merge_dict(
                    key_terms_to_add,
                    {
                        annotation["term"]:
                        annotation["parent_terms"]
                        if "parent_terms" in annotation else ""
                    },
                )
            else:
                # term exists but may need update due to different parent terms
                if "parent_terms" in annotation:
                    currently_stored_parent_terms = list(
                        self.dataset[self.dataset["term"] ==
                                     annotation["term"]]["parent_terms"])[0]
                    if currently_stored_parent_terms != annotation[
                            "parent_terms"]:
                        # needs update
                        key_terms_to_add = merge_dict(
                            key_terms_to_add,
                            {
                                annotation["term"]:
                                annotation["parent_terms"]
                                if "parent_terms" in annotation else ""
                            },
                        )
                        parented_terms_to_update.append(annotation["term"])

        # get total terms to remove
        key_terms_to_remove = [
            key_term for key_term in self.key_terms_marked_for_removal
            if key_term not in key_terms_to_add
        ]
        key_terms_to_remove.extend(parented_terms_to_update)

        # update key terms dataset (incl. flashtext)
        # remove
        if key_terms_to_remove:
            for key_term in key_terms_to_remove:
                self.remove_key_term_from_dataset(key_term)
        # add
        if key_terms_to_add:
            for key_term in key_terms_to_add:
                self.add_key_term_to_dataset(key_term,
                                             key_terms_to_add[key_term])
        # save
        self.save_dataset(self.location_string)

    def predict_inline_annotations(self, text, language="en-US"):
        return (self.flashtext.replace_keywords(text)
                if self.flashtext is not None else text)
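
A minimal usage sketch for this predictor. Everything concrete here is an
assumption: "csv:key_terms.csv" stands in for a real DatasetManager location
string, and the Predictor base class may require further predictor_config
fields.

predictor = FromDatasetKeyTermsPredictor({"location": "csv:key_terms.csv"})
print(predictor.predict_inline_annotations("We discussed machine learning."))
# e.g. "We discussed `machine learning``SK`´." if the term is in the dataset
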
Example 4

from flashtext import KeywordProcessor
# (repo-internal imports omitted: SubPopulation, download_if_needed, read_json,
#  PREJUDICE_PATH, MAN_WORDS, WOMAN_WORDS)
class PrejudiceSubPopulation(SubPopulation):
    r"""
    Filter samples based on gender bias

    for example in mode 'man'::

        sample 1: "There is a boy.", score: 1
        sample 2: "There is a girl.", score: 1
        sample 3: "There are boys and girls.", score: 0
    """
    def __init__(self, mode='man'):
        super().__init__()

        self.mode = mode
        assert mode in ['man', 'woman', 'both', 'none'], \
            "Mode should be one of ['man', 'woman', 'both', 'none']"

        man_phrases, woman_phrases = self.get_data(
            download_if_needed(PREJUDICE_PATH))
        man_phrases.extend(self.get_words(MAN_WORDS))
        woman_phrases.extend(self.get_words(WOMAN_WORDS))
        self.processor = KeywordProcessor(case_sensitive=True)
        self.processor.add_keywords_from_dict({"man": man_phrases})
        self.processor.add_keywords_from_dict({"woman": woman_phrases})
        # TODO
        self.processor.remove_keyword('My')

    def __repr__(self):
        return "PrejudiceSubpopulation" + "-" + self.mode

    @staticmethod
    def get_data(path):
        # get the name dictionary

        for dic in read_json(path):
            _, dic = dic
            return dic['men'], dic['women']

    @staticmethod
    def get_words(words):
        tokens = []
        tokens.extend(words)
        tokens.extend([token.upper() for token in words])
        tokens.extend([token.title() for token in words])
        return tokens

    def word_match(self, texts, type):
        target = 'man' if type == 'man' else 'woman'
        for text in texts:
            if target in self.processor.extract_keywords(text):
                return True
        return False

    def _score(self, sample, fields, **kwargs):
        r"""
        1 or 0, indicating whether the sample's fields match the mode and its prejudice words

        :param sample: data sample
        :param list fields: list of field str
        :param kwargs:
        :return int: score for sample
        """

        texts = [sample.get_text(field) for field in fields]
        man_match = self.word_match(texts, type='man')
        woman_match = self.word_match(texts, type='woman')

        if self.mode == 'man':
            return man_match and not woman_match
        elif self.mode == 'woman':
            return woman_match and not man_match
        elif self.mode == 'both':
            return woman_match and man_match
        else:
            return not woman_match and not man_match

    def get_slice(self, scores, dataset):
        r"""
        Save the samples that match the phrase groups and mode

        """
        sub_samples = []
        for i, sample in enumerate(dataset):
            if scores[i]:
                sub_samples.append(sample)
        return sub_samples
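
A side note on the case_sensitive=True flag used above: it is the reason
get_words() must add upper-case and title-case variants explicitly. A small
self-contained sketch (made-up keywords) showing the effect:

from flashtext import KeywordProcessor

kp = KeywordProcessor(case_sensitive=True)
kp.add_keywords_from_dict({"man": ["boy"]})
print(kp.extract_keywords("There is a Boy."))  # [] - exact case only
kp.add_keywords_from_dict({"man": ["Boy", "BOY"]})
print(kp.extract_keywords("There is a Boy."))  # ['man']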