Example #1
    def candidate_weighting(self, df=None):
        """ Candidate weighting function using document frequencies.

            Args:
                df (dict): document frequencies, the number of documents should
                    be specified using the "--NB_DOC--" key.
        """

        # initialize default document frequency counts if none provided
        if df is None:
            df = load_document_frequency_file(self._df_counts, delimiter='\t')

        # initialize the number of documents as --NB_DOC-- + 1 (current)
        N = 1 + df.get('--NB_DOC--', 0)

        # loop through the candidates
        for k, v in self.candidates.items():

            # get candidate document frequency
            candidate_df = 1 + df.get(k, 0)

            # compute the idf score
            idf = math.log(N / candidate_df, 2)

            # add the idf score to the weights container
            self.weights[k] = len(v.surface_forms) * idf
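
For context, this weighting method is one step in pke's standard pipeline rather than something called on its own. A minimal usage sketch with pke's TfIdf model (the input text and the df.tsv.gz path are placeholders) could look like:

import pke

# a minimal sketch of the TfIdf pipeline; the input text and file paths are placeholders
extractor = pke.unsupervised.TfIdf()
extractor.load_document(input='Some text to analyse ...', language='en')

# select 1- to 3-gram candidates
extractor.candidate_selection(n=3)

# load precomputed document frequencies (the dict carries the '--NB_DOC--' key)
df = pke.load_document_frequency_file(input_file='df.tsv.gz')

# weight the candidates with the method shown above
extractor.candidate_weighting(df=df)

# keep the 10 best-scoring keyphrases as (candidate, score) pairs
keyphrases = extractor.get_n_best(n=10)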
Example #2
    def candidate_weighting(self, df=None, sigma=3.0, alpha=2.3):
        """Candidate weight calculation as described in the KP-Miner paper.

        Note:
            w = tf * idf * B * P_f
            with
            
              * B = N_d / (P_d * alpha) and B = min(sigma, B)
              * N_d = the number of all candidate terms
              * P_d = number of candidates whose length exceeds one
              * P_f = 1

        Args:
            df (dict): document frequencies, the number of documents should
                be specified using the "--NB_DOC--" key.
            sigma (float): parameter for the boosting factor, defaults to 3.0.
            alpha (float): parameter for the boosting factor, defaults to 2.3.
        """

        # initialize default document frequency counts if none provided
        if df is None:
            logging.warning('LoadFile._df_counts is hard coded to {}'.format(
                self._df_counts))
            df = load_document_frequency_file(self._df_counts, delimiter='\t')

        # initialize the number of documents as --NB_DOC-- + 1 (current)
        N = 1 + df.get('--NB_DOC--', 0)

        # compute the number of candidates whose length exceeds one
        P_d = sum([len(v.surface_forms) for v in self.candidates.values()
                   if len(v.lexical_form) > 1])

        # fall back to 1 if all candidates are single words
        P_d = max(1, P_d)

        # compute the number of all candidate terms
        N_d = sum([len(v.surface_forms) for v in self.candidates.values()])

        # compute the boosting factor
        B = min(N_d / (P_d * alpha), sigma)

        # loop through the candidates
        for k, v in self.candidates.items():

            # get candidate document frequency
            candidate_df = 1

            # get the df for unigram only
            if len(v.lexical_form) == 1:
                candidate_df += df.get(k, 0)

            # compute the idf score
            idf = math.log(N / candidate_df, 2)

            if len(v.lexical_form) == 1:
                # do not apply the boosting factor to single-word candidates
                self.weights[k] = len(v.surface_forms) * idf
            else:
                self.weights[k] = len(v.surface_forms) * B * idf
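
To make the boosting factor concrete, here is a small hand-worked sketch; the counts are invented for illustration only:

# hypothetical candidate counts, for illustration only
N_d = 120                 # occurrences of all candidates
P_d = 20                  # occurrences of candidates longer than one word
alpha, sigma = 2.3, 3.0

B = min(N_d / (P_d * alpha), sigma)
print(round(B, 2))        # 2.61, below sigma, so no capping is applied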
Example #3
File: kea.py Project: ztx0728/pke
    def feature_extraction(self, df=None, training=False):
        """Extract features for each keyphrase candidate. Features are the
        tf*idf of the candidate and its first occurrence relative to the
        document.

        Args:
            df (dict): document frequencies, the number of documents should be
                specified using the "--NB_DOC--" key.
            training (bool): indicates whether features are computed for the
                training set for computing IDF weights, defaults to false.
        """

        # initialize default document frequency counts if none provided
        if df is None:
            logging.warning('LoadFile._df_counts is hard coded to {}'.format(
                self._df_counts))
            df = load_document_frequency_file(self._df_counts, delimiter='\t')

        # initialize the number of documents as --NB_DOC-- (+1 for the current
        # document when it is not part of the training set)
        N = df.get('--NB_DOC--', 0) + 1
        if training:
            N -= 1

        # find the maximum offset
        maximum_offset = float(sum([s.length for s in self.sentences]))

        for k, v in self.candidates.items():

            # get candidate document frequency
            candidate_df = 1 + df.get(k, 0)

            # hack for handling training documents
            if training and candidate_df > 1:
                candidate_df -= 1

            # compute the tf*idf of the candidate
            idf = math.log(N / candidate_df, 2)

            # add the features to the instance container
            self.instances[k] = np.array([len(v.surface_forms) * idf,
                                          v.offsets[0] / maximum_offset])

        # scale features
        self.feature_scaling()
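
In pke this feature extraction is driven by the supervised Kea model and is invoked from candidate_weighting(). A hedged usage sketch, assuming a trained model is available as 'model.pickle' and using placeholder paths:

import pke

extractor = pke.supervised.Kea()
extractor.load_document(input='Some text to analyse ...', language='en')

# select candidates and load the document frequencies
extractor.candidate_selection()
df = pke.load_document_frequency_file(input_file='df.tsv.gz')

# candidate_weighting() runs feature_extraction() internally before scoring
# the candidates with the trained classifier
extractor.candidate_weighting(model_file='model.pickle', df=df)

keyphrases = extractor.get_n_best(n=10)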
Example #4
    def feature_extraction(self, df=None, training=False, features_set=None):
        """Extract features for each candidate.

        Args:
            df (dict): document frequencies, the number of documents should be
                specified using the "--NB_DOC--" key.
            training (bool): indicates whether features are computed for the
                training set for computing IDF weights, defaults to false.
            features_set (list): the set of features to use, defaults to
                [1, 4, 6].

        """

        # define the default features_set
        if features_set is None:
            features_set = [1, 4, 6]

        # initialize default document frequency counts if none provided
        if df is None:
            df = load_document_frequency_file(self._df_counts, delimiter='\t')

        # initialize the number of documents as --NB_DOC-- (+1 for the current
        # document when it is not part of the training set)
        N = df.get('--NB_DOC--', 0) + 1
        if training:
            N -= 1

        # find the maximum offset
        maximum_offset = float(sum([s.length for s in self.sentences]))

        # loop through the candidates
        for k, v in self.candidates.items():

            # initialize features array
            feature_array = []

            # get candidate document frequency
            candidate_df = 1 + df.get(k, 0)

            # hack for handling training documents
            if training and candidate_df > 1:
                candidate_df -= 1

            # compute the tf*idf of the candidate
            idf = math.log(N / candidate_df, 2)

            # [F1] TF*IDF
            feature_array.append(len(v.surface_forms) * idf)

            # [F2] -> TF
            feature_array.append(len(v.surface_forms))

            # [F3] -> term frequency of substrings
            tf_of_substrings = 0
            stoplist = stopwords.words(self.language)
            for i in range(len(v.lexical_form)):
                for j in range(i, min(len(v.lexical_form), i + 3)):
                    sub_words = v.lexical_form[i:j + 1]
                    sub_string = ' '.join(sub_words)

                    # skip if the substring is the full candidate string
                    if sub_string == ' '.join(v.lexical_form):
                        continue

                    # skip if substring contains a stopword
                    if set(sub_words).intersection(stoplist):
                        continue

                    # check whether the substring occurs "as is", i.e. is itself a candidate
                    if sub_string in self.candidates:

                        # loop through the substring offsets
                        for offset_1 in self.candidates[sub_string].offsets:
                            is_included = False
                            for offset_2 in v.offsets:
                                if offset_1 >= offset_2 and \
                                   offset_1 <= offset_2 + len(v.lexical_form):
                                    is_included = True
                            if not is_included:
                                tf_of_substrings += 1

            feature_array.append(tf_of_substrings)

            # [F4] -> relative first occurrence
            feature_array.append(v.offsets[0] / maximum_offset)

            # [F5] -> relative last occurrence
            feature_array.append(v.offsets[-1] / maximum_offset)

            # [F6] -> length of phrases in words
            feature_array.append(len(v.lexical_form))

            # [F7] -> typeface
            feature_array.append(0)

            # extract information from sentence meta information
            meta = [self.sentences[sid].meta for sid in v.sentence_ids]

            # extract meta information of candidate
            sections = [u['section'] for u in meta if 'section' in u]
            types = [u['type'] for u in meta if 'type' in u]

            # [F8] -> Is in title
            feature_array.append('title' in sections)

            # [F9] -> TitleOverlap
            feature_array.append(0)

            # [F10] -> Header
            feature_array.append('sectionHeader' in types
                                 or 'subsectionHeader' in types
                                 or 'subsubsectionHeader' in types)

            # [F11] -> abstract
            feature_array.append('abstract' in sections)

            # [F12] -> introduction
            feature_array.append('introduction' in sections)

            # [F13] -> related work
            feature_array.append('related work' in sections)

            # [F14] -> conclusions
            feature_array.append('conclusions' in sections)

            # [F15] -> HeaderF
            feature_array.append(
                types.count('sectionHeader') +
                types.count('subsectionHeader') +
                types.count('subsubsectionHeader'))

            # [F16] -> abstractF
            feature_array.append(sections.count('abstract'))

            # [F17] -> introductionF
            feature_array.append(sections.count('introduction'))

            # [F18] -> related workF
            feature_array.append(sections.count('related work'))

            # [F19] -> conclusionsF
            feature_array.append(sections.count('conclusions'))

            # add the features to the instance container
            self.instances[k] = np.array(
                [feature_array[i - 1] for i in features_set])

        # scale features
        self.feature_scaling()
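
Note that features_set holds 1-based indices into the feature array built above, so the default [1, 4, 6] keeps TF*IDF, the relative first occurrence, and the phrase length. A tiny standalone illustration of that indexing, with made-up feature values:

import numpy as np

feature_array = [2.5, 7, 0, 0.12, 0.85, 3]   # illustrative values for F1..F6 only
features_set = [1, 4, 6]                     # 1-based feature indices

selected = np.array([feature_array[i - 1] for i in features_set])
print(selected)                              # selects F1, F4 and F6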
Example #5
def extract_terms(core_nlp_folder):
    compute_document_frequency(core_nlp_folder,
                               os.path.join(INTERIM_DIR, "cargo_df.tsv.gz"),
                               stoplist=list(STOP_WORDS))
    log.info("Begin Extraction")
    n = 15
    cargo_df = load_document_frequency_file(
        os.path.join(INTERIM_DIR, "cargo_df.tsv.gz"))
    pke_factory = {
        "grammar": r"""
        NP:
            {<NOUN|PROPN|NUM|ADJ>*<NOUN|PROPN>}
        """,
        "filtering_params": {
            "stoplist": list(STOP_WORDS)
        },
        "extractors": {
            "tfidf": {
                "instance": terms.PKEBasedTermsExtractor(TfIdf),
                "weighting_params": {
                    "df": cargo_df
                }
            },
            "kpm": {
                "instance": terms.PKEBasedTermsExtractor(KPMiner),
                "weighting_params": {
                    "df": cargo_df
                }
            },
            "yake": {
                "instance": terms.PKEBasedTermsExtractor(YAKE),
                "filtering_params": {
                    "only_alphanum": True,
                    "strip_outer_stopwords": True
                },
                "weighting_params": {}
            },
            "singlerank": {
                "instance": terms.PKEBasedTermsExtractor(SingleRank),
                "weighting_params": {
                    "window": 10,
                    "pos": {"NOUN", "PROPN", "NUM", "ADJ"}
                }
            },
            "topicrank": {
                "instance": terms.PKEBasedTermsExtractor(TopicRank),
                "weighting_params": {}
            },
            "mprank": {
                "instance": terms.PKEBasedTermsExtractor(MultipartiteRank),
                "weighting_params": {}
            },
            "positionrank": {
                "instance": terms.PKEBasedTermsExtractor(PositionRank),
                "weighting_params": {}
            }
        }
    }
    for name in pke_factory["extractors"]:
        log.info(f"Begin Extraction with PKE based extractor: {name}")
        extractor = pke_factory["extractors"][name]["instance"]
        if "filtering_params" in pke_factory["extractors"][name]:
            filtering_params = {
                **pke_factory["filtering_params"],
                **pke_factory["extractors"][name]["filtering_params"]
            }
        else:
            filtering_params = pke_factory["filtering_params"]
        extractor.extract(
            core_nlp_folder,
            n,
            grammar=pke_factory["grammar"],
            filtering_params=filtering_params,
            weighting_params=pke_factory["extractors"][name]
            ["weighting_params"],
            output_file=os.path.join(EXTRACTED_DIR, f"{name}.csv"),
            auto_term_file=f"data/annotations/automatic/terms/{name}.jsonl")
    # EmbedRank
    log.info("Begin Extraction with EmbedRank extractor")
    embedrank_extractor = terms.EmbedRankTermsExtractor(
        emdib_model_path="pretrain_models/torontobooks_unigrams.bin")
    embedrank_extractor.extract(
        core_nlp_folder,
        n,
        grammar=r"""
            NALL:
                {<NN|NNP|NNS|NNPS>}

            NP:
                {<NALL|CD|JJ>*<NALL>}
            """,
        considered_tags={'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'CD'},
        output_file=os.path.join(EXTRACTED_DIR, "torontobooks_unigrams.csv"))
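
For reference, the document-frequency file computed above is a delimited term/count table (gzipped); load_document_frequency_file returns it as a plain dict, with the total number of documents stored under the '--NB_DOC--' key that the weighting methods in the earlier examples rely on. A quick check with a placeholder path:

from pke import load_document_frequency_file

df = load_document_frequency_file(input_file='cargo_df.tsv.gz')
print(df.get('--NB_DOC--', 0))   # number of documents the counts were computed from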