Example #1
    def runSingleDoc(self, doc):
        # Get TF-IDF keywords
        # 1. create a TFIDF extractor.
        extractor = pke.unsupervised.TfIdf()

        # 2. load the content of the document in a given language
        # Check whether the language has a spaCy model; if not, fall back to English
        if self.__lan not in ['en', 'pt', 'fr', 'it', 'nl', 'de']:
            extractor.load_document(input=doc, language='en', normalization=self.__normalization)
        else:
            extractor.load_document(input=doc, language=self.__lan, normalization=self.__normalization)

        # 3. select {1-3}-grams not containing punctuation marks as candidates.
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += load_stop_words(self.__lan)

        extractor.candidate_selection(n=3, stoplist=stoplist)

        try:
            # 4. weight the candidates using a `tf` x `idf`
            df = pke.load_document_frequency_file(input_file=self.__pathToDFFile)
            extractor.candidate_weighting(df=df)

            # 5. get the numOfKeywords-highest scored candidates as keyphrases
            keywords = extractor.get_n_best(n=self.__numOfKeywords)
        except:
            keywords = []

        return keywords
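The snippet above assumes a document-frequency file already exists at self.__pathToDFFile. Below is a minimal sketch of how such a file could be computed beforehand with pke's compute_document_frequency; the paths, extension and n-gram length are illustrative, and the call mirrors the one shown in Example #6 below.

import string

from pke import compute_document_frequency

# stoplist used to filter out n-grams containing punctuation
stoplist = list(string.punctuation)

# compute DF counts over the reference collection and write them as a gzipped TSV
compute_document_frequency(input_dir='path/to/collection/',  # illustrative path
                           output_file='path/to/df.tsv.gz',  # illustrative path
                           extension='txt',
                           language='en',
                           normalization='stemming',
                           stoplist=stoplist,
                           n=3)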
Example #2
def test(files, number_of_tags, trained_model, test_DF_zip):
    # create a Kea extractor and set the input language to English (used for
    # the stoplist in the candidate selection method)
    extractor = pke.supervised.Kea()

    # load the content of the document, here in CoreNLP XML format
    # the use_lemmas parameter lets you choose between CoreNLP lemmas and stems
    # computed with nltk
    extractor.load_document(files)

    # select the keyphrase candidates, for Kea the 1-3 grams that do not start or
    # end with a stopword.
    extractor.candidate_selection()

    # load the df counts
    df_counts = pke.load_document_frequency_file(input_file=test_DF_zip,
                                                 delimiter='\t')

    # weight the candidates using Kea model.
    extractor.candidate_weighting(model_file=trained_model, df=df_counts)

    key_list = []

    for (keyphrase, score) in extractor.get_n_best(n=number_of_tags):
        key_list.append(keyphrase)

    files = files.split('/')[-1].split('.')[0]
    return files, key_list
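A hypothetical call to the function above (the file and model names are illustrative, not part of the original snippet):

# returns the document name (without path and extension) and its predicted keyphrases
doc_name, keyphrases = test(files='corpus/C-1.xml',
                            number_of_tags=10,
                            trained_model='models/kea_model.pickle',
                            test_DF_zip='df.tsv.gz')
print(doc_name, keyphrases)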
Example #3
    def runSingleDoc(self, doc):
        # Get keywords for a single doc and return them for further processing:
        # they are printed when we just want keywords for one doc, or saved when
        # this method is called by runMultipleDocs for a collection of docs.
        # 1. create extractor.
        extractor = pke.supervised.Kea()

        # 2. load the content of the document in a given language
        extractor.load_document(input=doc, language=self.__lan, normalization=self.__normalization)

        # 3. select 1-3 grams that do not start or end with a stopword as
        #    candidates. Candidates that contain punctuation marks as words
        #    are discarded.
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += load_stop_words(self.__lan)
        extractor.candidate_selection(stoplist=stoplist)

        try:
            # 4. classify candidates as keyphrase or not keyphrase.
            #df = pke.load_document_frequency_file(input_file= self.__pathToDFFile +  "/df.tsv.gz")
            df = pke.load_document_frequency_file(input_file=self.__pathToDFFile)
            #extractor.candidate_weighting(model_file=self.__pathToKeaModelsFolder + "/model.pickle", df=df)
            extractor.candidate_weighting(model_file=self.__pathToKEAFile, df=df)

            # 5. get the numOfKeywords-highest scored candidates as keyphrases
            keywords = extractor.get_n_best(n=self.__numOfKeywords)
        except:
            keywords = []

        return keywords
Example #4
    def runSingleDoc(self, doc):
        # Get KP-Miner keywords
        # 1. create a KPMiner extractor.
        extractor = pke.unsupervised.KPMiner()

        # 2. load the content of the document in a given language
        extractor.load_document(input=doc,
                                language=self.__lan,
                                normalization=self.__normalization)

        # 3. select {1-5}-grams that do not contain punctuation marks or
        #    stopwords as keyphrase candidates. Set the least allowable seen
        #    frequency to 5 and the number of words after which candidates are
        #    filtered out to 200.
        lasf = 5
        cutoff = 200
        extractor.candidate_selection(lasf=lasf, cutoff=cutoff)

        try:
            # 4. weight the candidates using KPMiner weighting function.
            df = pke.load_document_frequency_file(
                input_file=self.__pathToDFFile)
            alpha = 2.3
            sigma = 3.0
            extractor.candidate_weighting(df=df, alpha=alpha, sigma=sigma)

            # 5. get the numOfKeywords-highest scored candidates as keyphrases
            keywords = extractor.get_n_best(n=self.__numOfKeywords)
        except:
            keywords = []

        return keywords
Example #5
    def TrainingKEAModel(self, pathToCollectionOfDocs, groundTruthFile, lang,
                         normalization, pathToDFFile, pathToKEAFile,
                         pathToKeaModelsFolder):
        print(f"\nSTEP 2: Compute Document Frequency")
        ComputeDF(pathToCollectionOfDocs, lang, normalization, pathToDFFile)
        df = pke.load_document_frequency_file(input_file=pathToDFFile)

        print(
            f"\nSTEP 3: Train KEA Model on top of the following set of docs: {pathToCollectionOfDocs}"
        )

        if os.path.exists(pathToKEAFile):
            print(f"KEA Model File already exists here: {pathToKEAFile}")
        else:
            print(
                f"KEA Model doesn't exist. Let's create it at {pathToKEAFile}. It may take a while."
            )
            # If folder Models does not exist: Create it
            if not os.path.exists(pathToKeaModelsFolder):
                os.makedirs(pathToKeaModelsFolder)

            pke.train_supervised_model(input_dir=pathToCollectionOfDocs,
                                       reference_file=groundTruthFile,
                                       model_file=pathToKEAFile,
                                       extension='txt',
                                       language=lang,
                                       normalization=normalization,
                                       df=df,
                                       model=pke.supervised.Kea())
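A hypothetical invocation of this training step (all paths are illustrative; trainer stands for an instance of the enclosing class, which is not shown in the snippet):

trainer.TrainingKEAModel(pathToCollectionOfDocs='data/train/',
                         groundTruthFile='data/reference.txt',
                         lang='en',
                         normalization='stemming',
                         pathToDFFile='models/df.tsv.gz',
                         pathToKEAFile='models/kea_model.pickle',
                         pathToKeaModelsFolder='models/')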
Example #6
import pdb
import string
from string import punctuation

import pke
from pke import compute_document_frequency


def main():

    # process the document frequency of the reference corpus
    """Compute Document Frequency (DF) counts from a collection of documents.

    N-grams up to 3-grams are extracted and converted to their n-stem forms.
    Those containing a token that occurs in a stoplist are filtered out.
    The output file is in compressed (gzip) tab-separated-values format (tsv.gz).
    """

    # stoplist for filtering n-grams
    stoplist = list(punctuation)

    # compute df counts and store as n-stem -> weight values
    compute_document_frequency(
        input_dir=
        '/Users/gmt28/Documents/Workspace/Docker_Engine/varad/Yale_Projects/shoah-foundation-data-restored/shoah-foundation-data/data/inputs/fortunoff/transcripts/',
        output_file=
        '/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/output.tsv.gz',
        extension='txt',  # input file extension
        language='en',  # language of files
        normalization=None,  # no normalization (use 'stemming' for Porter stemming)
        stoplist=stoplist,
        n=1)

    pdb.set_trace()
    """Keyphrase extraction using TfIdf and newly computed DF counts."""

    # initialize TfIdf model
    extractor = pke.unsupervised.TfIdf()

    # load the DF counts from file
    df_counts = pke.load_document_frequency_file(
        input_file=
        '/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/output.tsv.gz'
    )

    # load the content of the document
    extractor.load_document(
        input=
        '/Users/gmt28/Documents/Workspace/data_analysis_lts/Processes/Extract_Keywords/text.txt',
        normalization=None,
        language='en')

    # keyphrase candidate selection
    extractor.candidate_selection(n=1, stoplist=list(string.punctuation))

    # candidate weighting with the provided DF counts
    extractor.candidate_weighting(df=df_counts)

    # N-best selection, keyphrases contains the 10 highest scored candidates as
    # (keyphrase, score) tuples
    keyphrases = extractor.get_n_best(n=15)
    print(keyphrases)
    pdb.set_trace()
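The snippet relies on main() being called at module level; if run as a script, the usual entry-point guard (not present in the original) would be:

if __name__ == '__main__':
    main()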
Example #7
 def __init__(self, max_ngram_size=3, df=None, **kwargs):
     super().__init__(**kwargs)
     self.name = kwargs.get('name', 'TfIdf')
     self.max_ngram_size = max_ngram_size
     self.df = df
     if isinstance(self.df, str):
         self.df = pke.load_document_frequency_file(input_file=self.df)
     self.pos = {'NOUN', 'PROPN', 'ADJ'}
     self.stoplist = list(string.punctuation)
     self.stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
     self.stoplist += stopwords.words('english')
     self.kw_extractor = pke.unsupervised.TfIdf()
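A hypothetical instantiation of a wrapper built on this constructor (the enclosing class name KeywordRanker and the DF path are assumptions for illustration; df may be either a path to a gzipped DF file or already-loaded counts):

# passing a path: the constructor loads it via pke.load_document_frequency_file
ranker = KeywordRanker(max_ngram_size=3,
                       df='models/df-semeval2010.tsv.gz')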
Example #8
def try_export_jsonl():
    n = 10
    # snlp_folder = "../data/processed/news/relevant/train/"
    snlp_folder = "../data/processed/news/relevant/train/"
    compute_document_frequency(
        snlp_folder,
        os.path.join("../data/interim/news_cargo_df.tsv.gz"),
        stoplist=list(STOP_WORDS))
    cargo_df = load_document_frequency_file(
        "../data/interim/news_cargo_df.tsv.gz")
    pke_factory = {
        "grammar": r"""
                NBAR:
                    {<NOUN|PROPN|NUM|ADJ>*<NOUN|PROPN>}

                NP:
                    {<NBAR>}
                    {<NBAR><ADP><NBAR>}
                """,
        "filtering_params": {
            "stoplist": list(STOP_WORDS)
        },
        "extractors": {
            "kpm": {
                "instance": PKEBasedTermsExtractor(KPMiner),
                "weighting_params": {
                    "df": cargo_df
                }
            },
        }
    }
    for name in pke_factory["extractors"]:
        log.info(f"Begin Extraction with PKE based extractor: {name}")
        extractor_instance = pke_factory["extractors"][name]["instance"]
        if "filtering_params" in pke_factory["extractors"][name]:
            filtering_params = {
                **pke_factory["filtering_params"],
                **pke_factory["extractors"][name]["filtering_params"]
            }
        else:
            filtering_params = pke_factory["filtering_params"]
        extractor_instance.extract(
            snlp_folder,
            n,
            grammar=pke_factory["grammar"],
            filtering_params=filtering_params,
            weighting_params=pke_factory["extractors"][name]
            ["weighting_params"],
            output_file=f"../results/extracted_terms/train/{name}.csv",
            auto_term_file=f"../data/annotations/automatic/terms/{name}.jsonl")
Example #9
 def __init__(self, lasf=3, cutoff=200, alpha=2.3, sigma=3.0, df=None, **kwargs):
     super().__init__(**kwargs)
     self.name = kwargs.get('name', 'KPMiner')
     self.lasf = lasf
     self.cutoff = cutoff
     self.alpha = alpha
     self.sigma = sigma
     self.df = df
     if isinstance(self.df, str):
         self.df = pke.load_document_frequency_file(input_file=self.df)
     self.pos = {'NOUN', 'PROPN', 'ADJ'}
     self.stoplist = list(string.punctuation)
     self.stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
     self.stoplist += stopwords.words('english')
     self.kw_extractor = pke.unsupervised.KPMiner()
Example #10
def extract_keyphrases(data):
    gold_keyphrases = []  # save the gold keyphrases of documents
    pred_keyphrases = []  # save the predicted keyphrases of documents
    for indx, abstract_document in enumerate(data['abstract']):
        # print('train_test_combined/' + key + '.xml')
        # print(keyphrases_dictionary[key])

        #if 'json' in file:
        gold_keyphrases.append([
            [Stemmer('porter').stem(keyword) for keyword in keyphrase.split()]
            for keyphrase in data['keyword'][indx].split(';')
        ])  # split gold keywords to separate them from one another

        # ======================================================================================================================
        # TF-IDF Extractor
        # ======================================================================================================================

        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')

        # 1. create a TfIdf extractor.
        extractor = pke.unsupervised.TfIdf()
        #print(' '.join(abstract_document))
        print(abstract_document)
        # 2. load the content of the document.
        extractor.load_document(
            input=abstract_document,  # ' '.join(abstract_document
            language='en',
            normalization="stemming")

        # 3. select {1-3}-grams not containing punctuation marks as candidates.
        extractor.candidate_selection(n=3, stoplist=stoplist)

        # 4. weight the candidates using tf x idf
        # (input_file is expected to point to a precomputed DF counts file and
        #  is defined outside this snippet)
        df = pke.load_document_frequency_file(input_file=input_file)
        extractor.candidate_weighting(df=df)

        # 5. get the 10-highest scored candidates as keyphrases
        pred_kps = extractor.get_n_best(n=10)

        # keep only the predicted keyphrase (first position -> [0]) and discard the score
        pred_keyphrases.append([kp[0].split() for kp in pred_kps])

    print(pred_keyphrases)
    print(gold_keyphrases)

    return pred_keyphrases, gold_keyphrases
Example #11
def pke_unsupervised(cur_text,
                     top_k,
                     kw_extractor,
                     lang='en',
                     document_frequency_file=None):
    arg_tokens = kw_extractor.split('-')
    extractor = getattr(pke.unsupervised, arg_tokens[-1])()
    extractor.load_document(input=cur_text, language=lang)
    extractor.candidate_selection()
    if document_frequency_file is not None:
        df_counts = pke.load_document_frequency_file(
            input_file=document_frequency_file)
        extractor.candidate_weighting(df=df_counts)
    else:  # go back to the default values
        extractor.candidate_weighting()
    keyphrases = extractor.get_n_best(n=top_k)
    final_kw = [(score, term) for term, score in keyphrases]
    return final_kw
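A hypothetical call to the helper above (the extractor name is taken from the last '-'-separated token, so 'pke-TfIdf' selects pke.unsupervised.TfIdf; file names are illustrative):

with open('doc.txt') as f:
    text = f.read()

# returns a list of (score, term) pairs, highest-scored first
ranked = pke_unsupervised(cur_text=text,
                          top_k=10,
                          kw_extractor='pke-TfIdf',
                          lang='en',
                          document_frequency_file='df.tsv.gz')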
Example #12
import os

import pke

base = os.path.dirname(__file__)

# create a Kea extractor and set the input language to English (used for
# the stoplist in the candidate selection method)
extractor = pke.supervised.Kea()

# load the content of the document, here from a raw text file
with open(base + os.sep + '2.txt') as f:
    doc = f.read()
extractor.load_document(doc)

# select the keyphrase candidates, for Kea the 1-3 grams that do not start or
# end with a stopword.
extractor.candidate_selection()

# load the df counts
df_counts = pke.load_document_frequency_file(input_file=base + os.sep +
                                             'df.tsv.gz',
                                             delimiter='\t')

# weight the candidates using Kea model.
extractor.candidate_weighting(model_file=base + os.sep + 'model.pickle',
                              df=df_counts)

# print the n-highest (10) scored candidates
for (keyphrase, score) in extractor.get_n_best(n=10):
    print(keyphrase, score)
Example #13
import logging
import pke

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = 'train/'

# path to the reference file
reference_file = "gold-annotation.txt"

# path to the df file
df_file = "df.tsv.gz"
logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(input_file=df_file,
                                             delimiter='\t')

# path to the model, saved as a pickle
output_mdl = "model.pickle"

pke.train_supervised_model(input_dir=input_dir,
                           reference_file=reference_file,
                           model_file=output_mdl,
                           df=df_counts,
                           format="corenlp",
                           use_lemmas=False,
                           stemmer="porter",
                           model=pke.supervised.Kea(),
                           language='english',
                           extension="xml")
Example #14
    pke.compute_lda_model(input_dir=path_to_train,
                          output_file=path_to_lda_file,
                          n_topics=params["n_topics"],
                          extension=params["extension"],
                          language=params["language"],
                          normalization=params["normalization"])


# pre-compute pairwise similarities if needed
need_pairwise = any(model in ['ExpandRank'] for model in params['models'])
if need_pairwise and not os.path.isfile(path_to_pairwise_file):
    logging.info("computing pairwise similarities in {}".format(
        params["path"]))

    logging.info("loading DF counts from {}".format(path_to_df_file))
    df_counts = pke.load_document_frequency_file(input_file=path_to_df_file)

    pke.compute_pairwise_similarity_matrix(
        input_dir=path_to_test,
        output_file=path_to_pairwise_file,
        collection_dir=path_to_train,
        df=df_counts,
        extension=params["extension"],
        language=params["language"],
        normalization=params["normalization"],
        stoplist=stoplist)

###############################################################################

###############################################################################
# TRAINING SUPERVISED MODEL
Example #15
    stoplist += stopwords.words('english')

    # 1. create a TfIdf extractor.
    extractor = pke.unsupervised.TfIdf()
    #print(' '.join(abstract_document))
    print(abstract_document)
    # 2. load the content of the document.
    extractor.load_document(input=abstract_document,  # ' '.join(abstract_document
                            language='en',
                            normalization="stemming")

    # 3. select {1-3}-grams not containing punctuation marks as candidates.
    extractor.candidate_selection(n=3, stoplist=stoplist)

    # 4. weight the candidates using tf x idf
    # (input_file should point to a precomputed DF counts file, defined outside this snippet)
    df = pke.load_document_frequency_file(input_file=input_file)
    extractor.candidate_weighting(df=df)

    # 5. get the 10-highest scored candidates as keyphrases
    pred_kps = extractor.get_n_best(n=10)

    pred_keyphrases.append([kp[0].split() for kp in pred_kps])  # keep only the predicted keyphrase and discard the frequency number

print(pred_keyphrases)
print(gold_keyphrases)

# ======================================================================================================================
# Evaluation
# ======================================================================================================================

# traditional evaluation of the model's performance
def get_keywords(content):
    content = content.replace("-", "")
    if len(content) <= 500:
        NGraph = 8
        NStat = 5
    if 500 < len(content) < 1000:
        NGraph = 13
        NStat = 10
    if len(content) >= 1000:
        NGraph = 18
        NStat = 15

    PositionRank = []
    MultipartiteRank = []
    TFIDF = []
    TextRank = []

    # PKE - TF-IDF
    extractorTFIDF = pke.unsupervised.TfIdf()
    extractorTFIDF.load_document(input=content,
                                 language="en",
                                 normalization=None)
    extractorTFIDF.candidate_selection(n=4, stoplist=stoplist)
    df = pke.load_document_frequency_file(
        input_file=
        'C:/Users/admin/Anaconda3/Lib/site-packages/pke/models/df-semeval2010.tsv.gz'
    )
    extractorTFIDF.candidate_weighting(df=df)
    keyphrasesTFIDF = extractorTFIDF.get_n_best(n=NStat)
    for key in keyphrasesTFIDF:
        TFIDF.append(key[0])

    # PKE - TextRank
    pos = {'NOUN', 'PROPN', 'ADJ'}
    extractorTextRank = pke.unsupervised.TextRank()
    extractorTextRank.load_document(input=content,
                                    language='en',
                                    normalization=None)
    extractorTextRank.candidate_weighting(window=2, pos=pos, top_percent=0.33)
    keyphrasesTextRank = extractorTextRank.get_n_best(n=NGraph)
    for key in keyphrasesTextRank:
        TextRank.append(key[0])

    # PKE - PositionRank
    pos = {'NOUN', 'PROPN', 'ADJ'}
    grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
    extractorPositionRank = pke.unsupervised.PositionRank()
    extractorPositionRank.load_document(input=content,
                                        language='en',
                                        normalization=None)
    extractorPositionRank.candidate_selection(grammar=grammar,
                                              maximum_word_number=4)
    extractorPositionRank.candidate_weighting(window=2, pos=pos)
    keyphrasesPositionRank = extractorPositionRank.get_n_best(n=NGraph)
    for key in keyphrasesPositionRank:
        PositionRank.append(key[0])

    # PKE - MultipartiteRank
    extractorMultipartiteRank = pke.unsupervised.MultipartiteRank()
    extractorMultipartiteRank.load_document(input=content)
    pos = {'NOUN', 'PROPN', 'ADJ'}
    extractorMultipartiteRank.candidate_selection(pos=pos, stoplist=stoplist)
    extractorMultipartiteRank.candidate_weighting(alpha=3,
                                                  threshold=0.95,
                                                  method='average')
    keyphrasesMultipartiteRank = extractorMultipartiteRank.get_n_best(n=NGraph)
    for key in keyphrasesMultipartiteRank:
        MultipartiteRank.append(key[0])

    inter1 = set(PositionRank).intersection(set(MultipartiteRank))
    inter2 = set(TFIDF).intersection(set(TextRank))
    to_remove_fin = []
    to_add = []
    to_remove = []
    for elem1 in inter2:
        for elem2 in inter1:
            if (" " not in elem1) and (
                    " " not in elem2) and (lemmatizer.lemmatize(elem1)
                                           in lemmatizer.lemmatize(elem2)):
                to_remove_fin.append(elem2)
                to_remove.append(elem1)
                to_add.append(elem1)
            if (" " not in elem1) and (
                    " " not in elem2) and (lemmatizer.lemmatize(elem2)
                                           in lemmatizer.lemmatize(elem1)):
                to_remove_fin.append(elem2)
                to_remove.append(elem1)
                to_add.append(elem2)
            if (elem1 in elem2) and (' ' in elem1) and (elem1 != elem2):
                to_remove_fin.append(elem2)
            elif (elem1 in elem2) and (' ' not in elem1) and (elem1 != elem2):
                to_remove.append(elem1)
    to_remove = set(to_remove)
    for elem in to_remove:
        inter2.remove(elem)

    inter = set(inter1).union(set(inter2))
    inter = list(inter)

    new_inter = inter
    new_inter = new_inter + list(set(to_add))
    for i in range(0, len(inter)):
        count = 0
        poses = []
        tokens = [
            word for word in nltk.word_tokenize(inter[i])
            if word not in stoplist
        ]
        new_inter[i] = ' '.join(tokens)
        tags = list(nltk.pos_tag(tokens))
        for tag in tags:
            poses.append(tag[1])
        for pos in poses:
            if 'NN' in pos:
                count += 1
        if count == 0:
            to_remove_fin.append(new_inter[i])
        if len(poses) > 4:
            to_remove_fin.append(new_inter[i])
    to_remove_fin = list(set(to_remove_fin))
    new_inter = list(set(new_inter).difference(to_remove_fin))
    return new_inter
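A hypothetical driver for get_keywords; stoplist and lemmatizer are module-level names that the snippet assumes are already defined, so illustrative definitions are included here:

import string

import nltk
import pke
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stoplist = list(string.punctuation) + stopwords.words('english')

with open('article.txt') as f:  # illustrative input file
    print(get_keywords(f.read()))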
Example #17
import pke
#import logging
## Training the model on train set.
#train_input_dir = 'drive/My Drive/Recommendation systems/kea_trained/train_doc/'
reference_file = 'drive/My Drive/Recommendation systems/kea_trained/reference.txt'
output_mdl = "drive/My Drive/Recommendation systems/kea_trained/Models/kea_model.pickle"
#train_df_file = 'drive/My Drive/Recommendation systems/kea_trained/train_DF.tsv.gz'

#logging.info('Loading df counts from {}'.format(df_file))

df_counts = pke.load_document_frequency_file(input_file='train_DF.tsv.gz',
                                             delimiter='\t')

pke.train_supervised_model(input_dir='train_doc/',
                           reference_file='reference.txt',
                           model_file='model/kea_model.pickle',
                           extension='txt',
                           language='en',
                           normalization="stemming",
                           df=df_counts,
                           model=pke.supervised.Kea())
Example #18
    if args.verbose:
        logging.basicConfig(level=logging.INFO)

    # get class from module
    class_ = getattr(pke, args.approach, None)

    if not class_:
        logging.error('No valid extraction model given [' + args.approach +
                      ']')
        sys.exit(0)

    logging.info('keyphrase extraction using ' + args.approach)

    if args.df:
        logging.info('loading df weights from ' + args.df)
        df = pke.load_document_frequency_file(args.df, delimiter="\t")

    extr = class_(input_file=args.input)

    extr.read_document(format=args.format)

    extr.candidate_selection()

    if args.approach in ['TfIdf', 'TopicRank', 'SingleRank', 'KPMiner']:
        extr.candidate_weighting()
    elif args.approach in ['WINGNUS', 'Kea']:
        extr.feature_extraction(df=df)
        extr.classify_candidates(model=args.model)

    keyphrases = extr.get_n_best(n=args.nbest)
Example #19
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pke

# create a Kea extractor and set the input language to English (used for
# the stoplist in the candidate selection method)
extractor = pke.supervised.Kea()

# load the content of the document, here in CoreNLP XML format
# the use_lemmas parameter lets you choose between CoreNLP lemmas and stems
# computed with nltk
extractor.load_document('C-1.xml')

# select the keyphrase candidates, for Kea the 1-3 grams that do not start or
# end with a stopword.
extractor.candidate_selection()

# load the df counts
df_counts = pke.load_document_frequency_file(input_file="df.tsv.gz",
                                             delimiter='\t')

# weight the candidates using Kea model.
extractor.candidate_weighting(model_file="model.pickle", df=df_counts)

# print the n-highest (10) scored candidates
for (keyphrase, score) in extractor.get_n_best(n=10):
    print(keyphrase, score)
Example #20
import codecs
import logging
import sys

import pke

# setting info in terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = sys.argv[1]

# path to the reference file
reference_file = sys.argv[2]

# path to the df file
df_file = sys.argv[3]
logging.info('loading df counts from '+df_file)
df_counts = pke.load_document_frequency_file(df_file, delimiter='\t')

# path to the model, saved as a pickle
output_mdl = sys.argv[4]

pke.train_supervised_model(input_dir=input_dir,
                           reference_file=reference_file,
                           model_file=output_mdl,
                           df=df_counts,
                           format="corenlp",
                           use_lemmas=False,
                           stemmer="porter",
                           model=pke.Kea(),
                           language='english',
                           extension="xml")
Example #21
    # enabling verbose
    if args.verbose:
        logging.basicConfig(level=logging.INFO)

    # get class from module
    class_ = getattr(pke, args.approach, None)

    if not class_:
        logging.error('No valid extraction model given ['+args.approach+']')
        sys.exit(0)

    logging.info('keyphrase extraction using '+args.approach)

    if args.df:
        logging.info('loading df weights from '+args.df)
        df = pke.load_document_frequency_file(args.df, delimiter="\t")

    extr = class_(input_file=args.input)

    extr.read_document(format=args.format)

    extr.candidate_selection()

    if args.approach in ['TfIdf', 'TopicRank', 'SingleRank', 'KPMiner']:
        extr.candidate_weighting()
    elif args.approach in ['WINGNUS', 'Kea']:
        extr.feature_extraction(df=df)
        extr.classify_candidates(model=args.model)

    keyphrases = extr.get_n_best(n=args.nbest)
Example #22
labels_to_id = {}
rows_lst = []
article = pd.read_csv(
    "/Users/senresearchlab/PycharmProjects/cartograph-alg/data/georgraphy/article_text_gloss.csv"
)
for row in article.itertuples():
    text = row.text
    if not isinstance(text, float):
        extractor = pke.unsupervised.KPMiner()
        extractor.load_document(text, language='en', normalization=None)

        lasf = 4
        cutoff = 200
        extractor.candidate_selection(lasf=lasf, cutoff=cutoff)

        df = pke.load_document_frequency_file(
            input_file='./doc_frequency.tsv.gz')
        alpha = 2.3
        sigma = 3.0
        extractor.candidate_weighting(df=df, alpha=alpha, sigma=sigma)

        # 5. get the 10-highest scored candidates as keyphrases
        keyphrases = extractor.get_n_best(n=10)
        if len(keyphrases) != 0:
            for keyphrase, score in keyphrases:
                if keyphrase not in labels_to_id:
                    labels_to_id[keyphrase] = len(labels_to_id)
                id = labels_to_id.get(keyphrase, len(labels_to_id))
                rows_lst.append({
                    "article_id": row.article_id,
                    "label_id": id,
                    "score": score
Example #23
import pke
import sys
import os
from string import punctuation

# initialize TfIdf model
extractor = pke.unsupervised.TfIdf(input_file=sys.argv[1])

# load the DF counts from file
df_counts = pke.load_document_frequency_file(input_file=sys.argv[2])

# load the content of the document
extractor.read_document(format='raw')

# keyphrase candidate selection
extractor.candidate_selection(n=3)

# candidate weighting with the provided DF counts
extractor.candidate_weighting(df=df_counts)

# N-best selection, keyphrases contains the 10 highest scored candidates as
# (keyphrase, score) tuples
keyphrases = extractor.get_n_best(n=10000, stemming=False)

base = os.path.basename(sys.argv[1])
filename = os.path.splitext(base)[0]

file = open(os.getcwd() + "/corpus/" + filename + "_saliency.txt", "wb")
try:
    for k in keyphrases:
        file.write(k[0].encode('utf-8') + '  ' + str(k[1]).encode('utf-8') +
Example #24
     # CURRENT_VERSION
     extractor = KeywordExtractor(snlp)
     extractor.load_document(input=text, language='ru')
     extractor.candidate_selection()
     with open("./DF.txt", encoding='utf-8') as fp:
         df = json.load(fp)
     extractor.candidate_weighting(df=df)
 elif MODE == 3:
     # TFIDF
     extractor = pke.unsupervised.TfIdf()
     extractor.load_document(input=text,
                             language='ru',
                             spacy_model=spacy_pipelines)
     stoplist = stopwords.words('russian')
     extractor.candidate_selection(n=3, stoplist=stoplist)
     df = pke.load_document_frequency_file(
         input_file='./df-weight.tsv.gz')
     extractor.candidate_weighting(df=df)
 elif MODE == 4:
     # KEA
     extractor = pke.supervised.Kea()
     extractor.load_document(input=text,
                             language='ru',
                             spacy_model=spacy_pipelines)
     stoplist = stopwords.words('russian')
     df = pke.load_document_frequency_file(
         input_file='./df-weight.tsv.gz')
     extractor.candidate_selection(stoplist=stoplist)
     extractor.candidate_weighting(df=df)
 elif MODE == 5:
     # MULTIPARTITE
     extractor = pke.unsupervised.MultipartiteRank()
Example #25
#pke.utils.compute_document_frequency('./test2', 'df_2_test.tsv.gz', format='raw', extension='txt', use_lemmas=False, stemmer=None, stoplist=stoplist, delimiter='\t', n=3)

for i in range(25000):
    input_file = './test2/' + str(i) + '.txt'

    # 1. create a TfIdf extractor.
    extractor = pke.unsupervised.TfIdf(input_file=input_file)

    # 2. load the content of the document.
    extractor.read_document(format='raw',
                            use_lemmas=False,
                            stemmer=None,
                            sep='/')

    # 3. select {1-3}-grams not containing punctuation marks as candidates.
    n = 3

    extractor.candidate_selection(n=n, stoplist=stoplist)

    # 4. weight the candidates using a `tf` x `idf`
    df = pke.load_document_frequency_file(input_file='df_2_test.tsv.gz')
    extractor.candidate_weighting(df=df)

    # 5. get the 50 highest-scored candidates as keyphrases
    keyphrases = extractor.get_n_best(n=50)

    with open('./results/tfidf_2/' + str(i) + '.txt', 'w+') as file:
        for key in keyphrases:
            file.write(key[0] + '\n')
Example #26
logging.basicConfig(level=logging.INFO)

# path to the input set of documents
input_dir = sys.argv[1]

# path to the pairwise similarity scores
output_file = sys.argv[2]

# path to the collection of documents
collection_dir = sys.argv[3]

# path to the df counts, saved as a gzipped csv file
df_file = sys.argv[4]

# load the DF counts
df_counts = load_document_frequency_file(input_file=df_file)

# stoplist for terms in document vectors
stoplist = list(string.punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
stoplist += stopwords.words('english')

# compute the pairwise similarity measures and write output
compute_pairwise_similarity_matrix(input_dir=input_dir,
                                   output_file=output_file,
                                   collection_dir=collection_dir,
                                   df=df_counts,
                                   format="corenlp",
                                   extension="xml",
                                   use_lemmas=False,
                                   stemmer="porter",
Example #27
def run_trial():
    n = 10
    snlp_folder = "../data/test/core_nlp_samples"
    compute_document_frequency(
        snlp_folder,
        os.path.join("../data/test/interim/test_cargo_df.tsv.gz"),
        stoplist=list(STOP_WORDS))
    cargo_df = load_document_frequency_file(
        "../data/test/interim/test_cargo_df.tsv.gz")
    pke_factory = {
        "grammar": r"""
                NBAR:
                    {<NOUN|PROPN|NUM|ADJ>*<NOUN|PROPN>}

                NP:
                    {<NBAR>}
                    {<NBAR><ADP><NBAR>}
                """,
        "filtering_params": {
            "stoplist": list(STOP_WORDS)
        },
        "extractors": {
            "tfidf": {
                "instance": PKEBasedTermsExtractor(TfIdf),
                "weighting_params": {
                    "df": cargo_df
                }
            },
            "yake": {
                "instance": PKEBasedTermsExtractor(YAKE),
                "filtering_params": {
                    "only_alphanum": True,
                    "strip_outer_stopwords": True
                },
                "weighting_params": {
                    "stoplist": list(STOP_WORDS)
                }
            },
            "kpm": {
                "instance": PKEBasedTermsExtractor(KPMiner),
                "weighting_params": {
                    "df": cargo_df
                }
            },
            "mprank": {
                "instance": PKEBasedTermsExtractor(MultipartiteRank),
                "weighting_params": {}
            },
            "positionrank": {
                "instance": PKEBasedTermsExtractor(PositionRank),
                "weighting_params": {}
            }
        }
    }
    for name in pke_factory["extractors"]:
        log.info(f"Begin Extraction with PKE based extractor: {name}")
        extractor_instance = pke_factory["extractors"][name]["instance"]
        if "filtering_params" in pke_factory["extractors"][name]:
            filtering_params = {
                **pke_factory["filtering_params"],
                **pke_factory["extractors"][name]["filtering_params"]
            }
        else:
            filtering_params = pke_factory["filtering_params"]
        extractor_instance.extract(
            snlp_folder,
            n,
            grammar=pke_factory["grammar"],
            filtering_params=filtering_params,
            weighting_params=pke_factory["extractors"][name]
            ["weighting_params"],
            output_file=f"../data/test/extracted_terms_sample/{name}.csv",
            auto_term_file=f"../data/test/automatic_annotations/{name}.jsonl")