Code Example #1
File: base.py, Project: zuacubd/pke
    def read_raw_document(self, stemmer='porter'):
        """ Read the raw input file and populate the sentence list.

            Args:
                stemmer (str): the stemmer in nltk to use, defaults to porter
                    (can be set to None for using word surface forms instead of
                    stems).
        """

        # parse the document using the preprocessed text parser
        parse = RawTextReader(self.input_file)

        # loop through the parsed sentences
        for i, sentence in enumerate(parse.sentences):

            # add the sentence to the container
            self.sentences.append(Sentence(words=sentence['words']))

            # add the POS
            self.sentences[i].pos = sentence['POS']

            # add the stems
            if stemmer is not None:
                for j, word in enumerate(self.sentences[i].words):
                    self.sentences[i].stems.append(Stemmer(stemmer).stem(word))

            # otherwise computations are performed on surface forms
            else:
                self.sentences[i].stems = self.sentences[i].words

            # lowercase the stems/lemmas
            for j, stem in enumerate(self.sentences[i].stems):
                self.sentences[i].stems[j] = stem.lower()
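For reference, the stem-then-lowercase step used above can be reproduced in isolation with nltk's SnowballStemmer (the class imported as Stemmer in the standalone scripts further down); the sample words below are purely illustrative.

from nltk.stem.snowball import SnowballStemmer as Stemmer

words = ['Compatibility', 'of', 'Systems', 'Linear', 'Constraints']
stems = [Stemmer('porter').stem(w).lower() for w in words]
print(stems)  # e.g. ['compat', 'of', 'system', 'linear', 'constraint']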
Code Example #2
class Sentence:
    stemmer = Stemmer()

    def __init__(self, dictionary, startIndex: int, endIndex: int, sent: str,
                 start: int, end: int):
        self.startIndex = startIndex
        self.endIndex = endIndex
        self.sent = sent
        self.words = self.sentToWords()
        self.nGrams = self.wordsToTrigramsWithIndices(dictionary)
        self.start = start
        self.end = end

    def sentToWords(self) -> List[str]:
        # FIXME: remove_stops . remove_puncts ~> remove_sth(_, stops | puncts)
        return wordsToStemmed(
            remove_stops(remove_puncts(word_tokenize(self.sent))))

    def wordsToTrigramsWithIndices(self, dictionary):
        def getIndexedTuple(word: str):
            index = -1
            if word in dictionary.wordsToIndices:
                index = dictionary.wordsToIndices[word]
            return (index, word)

        return list(trigrams(list(map(getIndexedTuple, self.words))))
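The n-gram step above relies on nltk's trigrams helper; the following minimal sketch shows what wordsToTrigramsWithIndices produces, with a plain dict standing in for the project-specific dictionary object (names and values are illustrative).

from nltk.util import trigrams

words_to_indices = {'grid': 0, 'comput': 1, 'resourc': 2, 'alloc': 3}  # stand-in for dictionary.wordsToIndices
words = ['grid', 'comput', 'resourc', 'alloc']

indexed = [(words_to_indices.get(w, -1), w) for w in words]
print(list(trigrams(indexed)))
# [((0, 'grid'), (1, 'comput'), (2, 'resourc')), ((1, 'comput'), (2, 'resourc'), (3, 'alloc'))]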
Code Example #3
    def read_preprocessed_document(self, stemmer='porter', sep='/'):
        """ Read the preprocessed input file and populate the sentence list.

            Args:
                stemmer (str): the stemmer in nltk to use, defaults to porter.
                sep (str): the separator of the tagged word, defaults to /.
        """

        # parse the document using the preprocessed text parser
        parse = PreProcessedTextReader(self.input_file, sep=sep)

        # loop through the parsed sentences
        for i, sentence in enumerate(parse.sentences):

            # add the sentence to the container
            self.sentences.append(Sentence(words=sentence['words']))

            # add the POS
            self.sentences[i].pos = sentence['POS']

            # add the stems
            if stemmer is None:
                self.sentences[i].stems = list(self.sentences[i].words)
            else:
                for j, word in enumerate(self.sentences[i].words):
                    self.sentences[i].stems.append(Stemmer(stemmer).stem(word))

            # lowercase the stems/lemmas
            for j, stem in enumerate(self.sentences[i].stems):
                self.sentences[i].stems[j] = stem.lower()
Code Example #4
File: utils.py, Project: zuacubd/pke
def load_references(input_file,
                    sep_doc_id=':',
                    sep_ref_keyphrases=',',
                    reference_stemming=False,
                    stemmer='porter'):
    """ Load a reference file and returns a dictionary. """

    logging.info('loading reference keyphrases from ' + input_file)

    references = defaultdict(list)

    with codecs.open(input_file, 'r', 'utf-8') as f:
        for line in f:
            cols = line.strip().split(sep_doc_id)
            doc_id = cols[0].strip()
            keyphrases = cols[1].strip().split(sep_ref_keyphrases)
            for v in keyphrases:
                if '+' in v:
                    for s in v.split('+'):
                        references[doc_id].append(s)
                else:
                    references[doc_id].append(v)
            if reference_stemming:
                for i, k in enumerate(references[doc_id]):
                    stems = [Stemmer(stemmer).stem(u) for u in k.split()]
                    references[doc_id][i] = ' '.join(stems)

    return references
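For orientation, here is a hedged sketch of the reference-file layout this function expects, assuming load_references above is in scope; the document id and keyphrases are illustrative only.

import tempfile

# one line per document: "<doc_id> : <kp1>,<kp2>,...", with '+' joining alternative forms
sample_line = u"doc-1 : grid computing,resource management+resource allocation\n"

with tempfile.NamedTemporaryFile('w', suffix='.ref', delete=False, encoding='utf-8') as tmp:
    tmp.write(sample_line)

refs = load_references(tmp.name, reference_stemming=True)
print(refs['doc-1'])  # e.g. ['grid comput', 'resourc manag', 'resourc alloc'] after Porter stemming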
Code Example #5
    def read_raw_document(self, stemmer='porter'):
        """ Read the raw input file and populate the sentence list.

            Args:
                stemmer (str): the stemmer in nltk to use, defaults to porter.
        """

        # parse the document using the preprocessed text parser
        parse = RawTextReader(self.input_file)

        # loop through the parsed sentences
        for i, sentence in enumerate(parse.sentences):

            # add the sentence to the container
            self.sentences.append(Sentence(words=sentence['words']))

            # add the POS
            self.sentences[i].pos = sentence['POS']

            # add the stems
            for j, word in enumerate(self.sentences[i].words):
                self.sentences[i].stems.append(Stemmer(stemmer).stem(word))

            # lowercase the stems/lemmas
            for j, stem in enumerate(self.sentences[i].stems):
                self.sentences[i].stems[j] = stem.lower()
def tokenize_lowercase(text):
    '''
    Tokenize, stem, and lower-case the text of a document
    :param text: text of a specific document
    :return: formatted text
    '''
    words = word_tokenize(text)  # tokenize document text
    # stem and lower-case each token of the document text
    formatted_tok_text = [
        Stemmer('porter').stem(word_token.lower()) for word_token in words
    ]  # NOTE: drop the stemming step if the classifier should be trained on surface forms
    formatted_text = ' '.join(formatted_tok_text)
    return formatted_text
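A quick usage sketch of tokenize_lowercase, assuming word_tokenize and Stemmer are imported as in the surrounding code; the output shown is approximate.

print(tokenize_lowercase("Compatibility of linear constraints"))
# e.g. 'compat of linear constraint'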
Code Example #7
class Sentence:
    stemmer = Stemmer()
    lemmater = WordNetLemmatizer()

    def __init__(self, index: int, sent: str, start: int, end: int):
        self.index = index
        self.sent = sent
        self.words = self.sentToWords()
        self.nGrams = list(trigrams(self.words))
        self.start = start
        self.end = end

    def sentToWords(self) -> List[str]:
        return word_tokenize(self.sent)
Code Example #8
File: base.py, Project: midnitekoder/pke
    def read_corenlp_document(self, use_lemmas=False, stemmer='porter'):
        """ Read the input file in CoreNLP XML format and populate the sentence
            list.

            Args:
                use_lemmas (bool): whether lemmas from stanford corenlp are used
                    instead of stems (computed by nltk), defaults to False.
                stemmer (str): the stemmer in nltk to use (if used), defaults
                    to porter (can be set to None for using word surface forms
                    instead of stems).
        """

        # parse the document using the Minimal CoreNLP parser
        parse = MinimalCoreNLPParser(self.input_file)

        # loop through the parsed sentences
        for i, sentence in enumerate(parse.sentences):

            # add the sentence to the container
            self.sentences.append(Sentence(words=sentence['words']))

            # add the POS
            self.sentences[i].pos = sentence['POS']

            # add the lemmas
            self.sentences[i].stems = sentence['lemmas']

            # flatten with the stems if required
            if not use_lemmas:

                # if stemming is performed
                if stemmer is not None:
                    for j, word in enumerate(self.sentences[i].words):
                        self.sentences[i].stems[j] = Stemmer(stemmer).stem(
                            word)

                # else, all computations are performed on surface forms
                else:
                    self.sentences[i].stems = self.sentences[i].words

            # lowercase the stems/lemmas
            for j, stem in enumerate(self.sentences[i].stems):
                self.sentences[i].stems[j] = stem.lower()

            # add the meta-information
            # for k, infos in sentence.iteritems(): -- Python 2/3 compatible
            for (k, infos) in sentence.items():
                if k not in set(['POS', 'lemmas', 'words']):
                    self.sentences[i].meta[k] = infos
Code Example #9
def extract_keyphrases(data):
    gold_keyphrases = []  # save the gold keyphrases of documents
    pred_keyphrases = []  # save the predicted keyphrases of documents
    for indx, abstract_document in enumerate(data['abstract']):
        # print('train_test_combined/' + key + '.xml')
        # print(keyphrases_dictionary[key])

        #if 'json' in file:
        gold_keyphrases.append([
            [Stemmer('porter').stem(keyword) for keyword in keyphrase.split()]
            for keyphrase in data['keyword'][indx].split(';')
        ])  # split gold keywords to separate them from one another

        # ======================================================================================================================
        # TF-IDF Extractor
        # ======================================================================================================================

        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')

        # 1. create a TfIdf extractor.
        extractor = pke.unsupervised.TfIdf()
        #print(' '.join(abstract_document))
        print(abstract_document)
        # 2. load the content of the document.
        extractor.load_document(
            input=abstract_document,  # ' '.join(abstract_document
            language='en',
            normalization="stemming")

        # 3. select {1-3}-grams not containing punctuation marks as candidates.
        extractor.candidate_selection(n=3, stoplist=stoplist)

        # 4. weight the candidates using a `tf` x `idf`
        # (input_file must point to a precomputed document-frequency counts
        # file; it is defined outside this snippet)
        df = pke.load_document_frequency_file(input_file=input_file)
        extractor.candidate_weighting(df=df)

        # 5. get the 10-highest scored candidates as keyphrases
        pred_kps = extractor.get_n_best(n=10)

        # keep only the predicted keyphrase (first position -> [0]) and discard the frequency number
        pred_keyphrases.append([kp[0].split() for kp in pred_kps])

    print(pred_keyphrases)
    print(gold_keyphrases)

    return pred_keyphrases, gold_keyphrases
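The two lists returned above can be scored directly. Below is a minimal exact-match evaluation sketch (not part of the original project) that joins each token list back into a string and computes micro-averaged precision, recall, and F1.

def exact_match_f1(pred_keyphrases, gold_keyphrases):
    """Micro-averaged exact-match scores over per-document keyphrase lists."""
    tp = fp = fn = 0
    for preds, golds in zip(pred_keyphrases, gold_keyphrases):
        pred_set = {' '.join(kp) for kp in preds}
        gold_set = {' '.join(kp) for kp in golds}
        tp += len(pred_set & gold_set)
        fp += len(pred_set - gold_set)
        fn += len(gold_set - pred_set)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1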
Code Example #10
class Sentence:
    stemmer = Stemmer()

    def __init__(self, startIndex: int, endIndex: int, sent: str, start: int,
                 end: int):
        self.startIndex = startIndex
        self.endIndex = endIndex
        self.sent = sent
        self.words = self.sentToWords()
        self.nGrams = list(trigrams(self.words))
        self.start = start
        self.end = end

    def sentToWords(self) -> List[str]:
        # FIXME: remove_stops . remove_puncts ~> remove_sth(_, stops | puncts)
        return wordsToStemmed(
            remove_stops(remove_puncts(word_tokenize(self.sent))))
def extract_keyphrases(data):
    gold_keyphrases = []  # save the gold keyphrases of documents
    pred_keyphrases = []  # save the predicted keyphrases of documents
    for indx, abstract_document in enumerate(data['abstract']):
        # print('train_test_combined/' + key + '.xml')
        # print(keyphrases_dictionary[key])

        gold_keyphrases.append([
            [Stemmer('porter').stem(keyword) for keyword in keyphrase.split()]
            for keyphrase in data['keyword'][indx].split(';')
        ])  # split gold keywords to separate them from one another

        # ======================================================================================================================
        # MultipartiteRank Extractor
        # ======================================================================================================================

        # 1. create a MultipartiteRank extractor.
        extractor = pke.unsupervised.MultipartiteRank()

        # 2. load the content of the document.
        extractor.load_document(input=abstract_document,
                                normalization="stemming")

        # 3. select the longest sequences of nouns and adjectives, that do
        #    not contain punctuation marks or stopwords as candidates.
        pos = {'NOUN', 'PROPN', 'ADJ'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        extractor.candidate_selection(pos=pos, stoplist=stoplist)

        # 4. build the Multipartite graph and rank candidates using random walk,
        #    alpha controls the weight adjustment mechanism, see TopicRank for
        #    threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.74,
                                      method='average')

        # 5. get the 10-highest scored candidates as keyphrases
        pred_kps = extractor.get_n_best(n=10)

        pred_keyphrases.append([kp[0].split() for kp in pred_kps])  # keep only the predicted keyphrase and discard the frequency number

    print(pred_keyphrases)
    print(gold_keyphrases)

    return pred_keyphrases, gold_keyphrases
Code Example #12
def extract_keywords(sentence):
    sentence = sentence.lower()
    not_stopw = [
        "no", "nor", "not", "over", "under", "again", "further", "but",
        "against", "too", "very"
    ]
    stopw = stopwords.words('english')
    for x in not_stopw:
        stopw.remove(x)
    print(stopw)
    pattern = re.compile(r'\b(' + r'|'.join(stopw) + r')\b\s*')
    sentence = sentence.replace('\n', '')
    sentence = sentence.replace("n't", " not")
    sentence = clean_string(sentence)
    sentence = pattern.sub('', sentence)
    stemmer = Stemmer()
    s = [stemmer.stem(w) for w in sentence.split()]
    b = zip(*[s[i:] for i in [0, 1]])
    b = [bigram[0] + " " + bigram[1] for bigram in b]
    return s + b
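A self-contained sketch of the same unigram-plus-bigram idea; clean_string and the no-argument Stemmer() above are project-specific, so this version substitutes nltk's Porter variant of the Snowball stemmer and a toy input.

from nltk.stem.snowball import SnowballStemmer

def unigrams_and_bigrams(sentence):
    stemmer = SnowballStemmer('porter')
    s = [stemmer.stem(w) for w in sentence.lower().split()]
    b = [' '.join(pair) for pair in zip(s, s[1:])]
    return s + b

print(unigrams_and_bigrams("resource allocation in grid computing"))
# e.g. ['resourc', 'alloc', 'in', 'grid', 'comput', 'resourc alloc', 'alloc in', 'in grid', 'grid comput']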
for index, list_of_keyphrases in enumerate(data['keyword']):
    keyphrases_list = []
    for keyphrase in list_of_keyphrases:  # get words of all keyphrases in a single list
        # keyphrase = keyphrase.translate(remove_digits).strip()  # remove digits
        keyphrase = keyphrase.strip()  # remove whitespaces
        if len(keyphrase):  # check if the keyphrase is empty
            tokens = word_tokenize(keyphrase)  # tokenize
            # Replace the pure digit terms with DIGIT_REPL
            tokens = [
                tok if not re.match(r'^\d+$', tok) else 'DIGIT_REPL'
                for tok in tokens
            ]
            # Replace the combination of characters and digits with WORD_DIGIT_REPL
            # tokens = [tok if not re.match('.*\d+', tok) else 'WORD_DIGIT_REPL' for tok in tokens]
            tokens = [
                Stemmer('porter').stem(keyword.lower()) for keyword in tokens
            ]  # stem + lower case
            tokens = ' '.join(tokens)
            keyphrases_list.append(tokens)

    data['keyword'].iat[index] = keyphrases_list

# ======================================================================================================================
# Count logistics
# ======================================================================================================================

keywords_in_title = 0  # the count of keywords in title
keywords_in_abstract = 0  # the count of keywords in abstract
keywords_in_title_abstract = 0  # the count of keywords that are either in title or abstract
keywords_in_title_NOT_abstract = 0  # the count of keywords that are in title BUT NOT in abstract
total_keywords = 0  # the count of all keywords
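The loop that fills these counters is not included in this excerpt; the sketch below shows how they are typically updated, following the pattern used later in semeval_summarized_statistics (the 'title' and 'abstract' column names are assumed).

# Sketch only: count stemmed gold keyphrases against the pre-processed title and abstract.
for index, keywords in enumerate(data['keyword']):
    total_keywords += len(keywords)
    for keyword in keywords:
        in_title = keyword in data['title'][index]
        in_abstract = keyword in data['abstract'][index]
        keywords_in_title += in_title
        keywords_in_abstract += in_abstract
        keywords_in_title_abstract += (in_title or in_abstract)
        keywords_in_title_NOT_abstract += (in_title and not in_abstract)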
Code Example #14
print('tokenization - abstract finish')


# stem, tokenize and lower case keyphrases and keep them categorized by document
for index, list_of_keyphrases in enumerate(data['keywords']):
    keyphrases_list = []
    for keyphrase in list_of_keyphrases:  # get words of all keyphrases in a single list
        # keyphrase = keyphrase.translate(remove_digits).strip()  # remove digits
        keyphrase = keyphrase.strip()  # remove whitespaces
        if len(keyphrase):  # check if the keyphrase is empty
            tokens = word_tokenize(keyphrase)  # tokenize
            # Replace the pure digit terms with DIGIT_REPL
            tokens = [tok if not re.match(r'^\d+$', tok) else 'DIGIT_REPL' for tok in tokens]
            # Replace the combination of characters and digits with WORD_DIGIT_REPL
            #tokens = [tok if not re.match('.*\d+', tok) else 'WORD_DIGIT_REPL' for tok in tokens]
            keyphrases_list.append([Stemmer('porter').stem(keyword.lower()) for keyword in tokens])  # stem + lower case
    data['keywords'].iat[index] = keyphrases_list
#    print('THESE ARE THE KEYPHRASE LIST', len(keyphrases_list), keyphrases_list)


# ======================================================================================================================
# Write pre-processed keyphrases to csv file
# ======================================================================================================================

data['abstract'].to_csv(x_text_filename, index=False)  # save the preprocessed document text

# rename column "keywords" to "keyword" for uniformity between datasets
data.rename(columns={"keywords": "keyword"}, inplace=True)
data[['keyword', 'assemble_documents_index']].to_csv(y_text_filename, index=False)  # save the preprocessed keyphrases

def evaluation(y_pred=None, y_test=None, x_test=None, x_filename=None, y_filename=None, paragraph_assemble_docs=None):
    """
    Evaluate the performance
    :param y_pred: the predicted labels
    :param y_test: the test labels
    :param x_filename: the name of the GOLD document text file - NEED TO MATCH THE LOADED FILE WHEN MAKING PREDICTIONS (default evaluation dataset is KP20K)
    :param y_filename: the name of the GOLD keyphrase file - NEED TO MATCH THE LOADED FILE WHEN MAKING PREDICTIONS (default evaluation dataset is KP20K)
    :param paragraph_assemble_docs: (ONLY FOR UNSUPERVISED METHODS) the indices to re-assemble first 3 paragraphs
    :return: -
    """

    if y_test is None:  # evaluate the Bi-LSTM-CRF + unsupervised methods
        # ======================================================================================================================
        # Load all validation target data (y_test\labels) data on memory (needed for evaluation)
        # ======================================================================================================================

        # read preprocessed document text (x) and preprocessed keyphrases (y)
        x_test = pd.read_csv(x_filename, encoding="utf8")
        y_test = pd.read_csv(y_filename, encoding="utf8")

        # translate string back to list of lists (when reading dataframe, lists of lists are read as strings)
        x_test['abstract'] = x_test['abstract'].map(ast.literal_eval)
        if 'SENTENC' in x_filename or 'SENTEC' in x_filename or 'PARAGRAPH' in x_filename:
            assembl_docs = y_test['assemble_documents_index']
        y_test = y_test['keyword'].map(ast.literal_eval)

        # print(x_test)
        print(y_test)


        # ======================================================================================================================
        # Convert y_test and y_pred from categorical (two columns, 1 for each label) to a single value label (1 column)
        # ======================================================================================================================

        def pred2label(all_abstract_preds):
            '''
            Converts prediction set and test/validation set from two columns (one for each label value)
            to just one column with the number of the corresponding label
            [ initial array: [1, 0] => final array: [0] ]   -   [ initial array: [0, 1] => final array: [1] ]
            :param all_abstract_preds: array with predictions or test/validation set [documents/abstracts, number of words]
            :return: flattened array that contains the prediction for each word [number of total words of all abstracts]
            '''
            preds = []
            for abstract_preds in all_abstract_preds:
                # the position of the max value is corresponding to the actual label value (0: Non-KP, 1: KP)
                doc_preds = [np.argmax(word_pred) for word_pred in abstract_preds]
                preds.append(doc_preds)
            return preds

        # print('BEFORE y_pred', y_pred)
        y_pred = pred2label(y_pred)  # convert y_pred from categorical (two columns, 1 for each label) to a single value label
        # print('AFTER y_pred', y_pred)


        # ======================================================================================================================
        # Extract keyphrases from the predicted set
        # ======================================================================================================================

        pred_keyphrase_list = []  # save all predicted keyphrases
        for doc_index, doc_prediction in enumerate(y_pred):  # iterate through predictions for documents
            document_keyphrases = []  # save the keyphrases of a document
            consecutive_keywords = []  # save consecutive keywords that form a keyphrase
            for word_index, word_prediction in enumerate(doc_prediction):  # iterate through predictions for WORDS of documents
                if word_index >= len(x_test['abstract'][doc_index]):
                    break  # the abstract has ended (padding adds dummy words that are not in the real abstract)
                if word_index:  # not the FIRST WORD of the abstract (avoids a negative index just below)
                    if doc_prediction[word_index - 1]:  # check if the previous word is a keyword
                        if word_prediction:  # check if the current word is a keyword
                            #                        print(x_test['abstract'][doc_index])
                            #                        print(x_test['abstract'][doc_index][word_index])
                            consecutive_keywords.append(x_test['abstract'][doc_index][word_index])
                    else:
                        if len(consecutive_keywords):  # save keyword list if exists (not empty list)
                            document_keyphrases.append(consecutive_keywords)
                        consecutive_keywords = []  # re-initialize (empty) list
                        if word_prediction:  # check if the current word is a keyword
                            consecutive_keywords.append(x_test['abstract'][doc_index][word_index])
                else:  # save the FIRST WORD of the abstract if it is a keyword
                    if word_prediction:  # check if the current word is a keyword
                        #               print('HEREEEE', doc_index, word_index)
                        #               print(x_test['abstract'][doc_index])
                        consecutive_keywords.append(x_test['abstract'][doc_index][word_index])

            if len(consecutive_keywords):  # save the keywords that occur in the END of the abstract, if they exist (not empty list)
                document_keyphrases.append(consecutive_keywords)

            pred_keyphrase_list.append(document_keyphrases)
    else:  # evaluate the unsupervised methods that use .xml files
        # tokenize the text
        x_test['abstract'] = x_test['abstract'].apply(lambda row: row.split())
        print(x_test)

        # define pred_keyphrase_list - contains words
        pred_keyphrase_list = y_pred
        # define y_test if full-text is split into paragraphs/sentences
        if 'SENTENC' in x_filename or 'SENTEC' in x_filename or 'PARAGRAPH' in x_filename:
            assembl_docs = paragraph_assemble_docs

   # print(pred_keyphrase_list)
    #print(y_test)

    # FIND IF ANY KEYPHRASES EXIST ON THE PREDICTION SET
    here = [1 if any(doc) else 0 for doc in y_pred]
    print('\ny_pred', np.array(y_pred, dtype=object).shape)
    if any(here):
        print('THERE ARE KEYPHRASES')
    else:
        print('THERE ARE NO KEYPHRASES')


    # ======================================================================================================================
    # Calculate metrics
    # ======================================================================================================================

    def calculate_metrics(y_test_set, pred_keyphrase_list_set, eval_method):
        """
        Calculate and print metrics
        :param y_test_set: GOLD set
        :param pred_keyphrase_list_set: PREDICTION set
        :param eval_method: the name of the evaluation method (exact/partial match)
        :return: -
        """
        TP = 0  # True Positive
        FP = 0  # False Positive
        FN = 0  # False Negative
        for index_pred, doc_pred in enumerate(pred_keyphrase_list_set):
            for key_test in y_test_set[index_pred]:
                #if any(key_test not in keyp for keyp in doc_pred):
                if key_test not in doc_pred:  # FN: keyphrases that exist in GOLD but not in PREDICTED
                    FN += 1
            if len(doc_pred):  # continue if prediction list is NOT empty | if prediction list is empty -> skip checking
                for key_pred in doc_pred:
                    #if any(key_pred in keyp for keyp in y_test_set[index_pred]):
                    if key_pred in y_test_set[index_pred]:  # TP: keyphrases that exist both in PREDICTED and GOLD
                        TP += 1
                    else:  # FP: keyphrases that exist in PREDICTED but not in GOLD (if key_pred not in y_test_set)
                        FP += 1
        precision = 0
        recall = 0
        f1_score = 0
        # print(TP, FN, FP)
        # print('precision=', TP / (TP + FP), 'recall=', TP / (TP + FN))
        if not (TP == FP == 0):
            precision = TP / (TP + FP)
        if not (TP == FN == 0):
            recall = TP / (TP + FN)
        if not (precision == recall == 0):
            f1_score = 2 * (precision * recall) / (precision + recall)

        print('\n' + eval_method)
        print('Precision: %.4f' % precision)
        print('Recall: %.4f' % recall)
        print('F1-score: %.4f\n' % f1_score)


    # ======================================================================================================================
    # Calculate NEW metrics (semi-exact matching)
    # ======================================================================================================================

    def calculate_semi_exact_match_metrics(y_test_set, pred_keyphrase_list_set, eval_method):
        """
        Calculate and print metrics
        :param y_test_set: GOLD set
        :param pred_keyphrase_list_set: PREDICTION set
        :param eval_method: the name of the evaluation method (exact/partial match)
        :return: -
        """
        # each 0 and 1 represents a keyphrase; 1 means it exists in the gold/pred set, while 0 means it does not
        pred_list = []  # contains 0, 1 for predicted keyphrases depending on if a predicted keyphrase matches with a gold one
        gold_list = []  # contains 0, 1 for gold keyphrases depending on if a gold keyphrase matches with a predicted one
        for index_pred, doc_pred in enumerate(pred_keyphrase_list_set):
            pred_kps = [0] * len(doc_pred)  # initialize the list with 0s and length equal to the total predicted keyphrases
            gold_kps = [0.0] * len(y_test_set[index_pred])  # initialize the list with 0s and length equal to the total gold keyphrases

            if doc_pred:  # if predicted keyphrase set is not empty (the case of empty predicted keyphrase is handled by the initialization of pred_kps and gold_kps)
                # find if the gold keyphrases exist in the predicted set, and if so mark which gold and predicted keyphrases have a match
                for gold_kp_index, gold_keyphr in enumerate(y_test_set[index_pred]):
                    gold_keyphrase_tokens = gold_keyphr.split()
          #          print('gold: ', gold_keyphrase_tokens)
         #           print('pred: ', doc_pred)
                    avg_coverage_ratio_list = []
                    gold_coverage_ratio_list = []
                    for pred_kp in doc_pred:
                        kw_coverage = 0  # gold keyword coverage of a predicted keyphrase
                        for keyword_gold in gold_keyphrase_tokens:
                            if keyword_gold in pred_kp:
                                kw_coverage += 1
                        # a gold keyword might exist multiple times in a pred keyphrase, but with this approach we assume that it does not as this happens rarely
                        if len(pred_kp.split()):
                            pred_coverage_ratio = kw_coverage / len(pred_kp.split())  # calculate the ratio of the covered predicted kps
                        else:
                            pred_coverage_ratio = 0
                        if len(gold_keyphrase_tokens):
                            gold_coverage_ratio = kw_coverage / len(gold_keyphrase_tokens)  # calculate the ratio of the covered gold kps
                        else:
                            gold_coverage_ratio = 0
                        avg_coverage_ratio_list.append((gold_coverage_ratio + pred_coverage_ratio) / 2)  # save the average of the keyphrase coverage and the coverage ratio
                        gold_coverage_ratio_list.append(gold_coverage_ratio)
         #           print('percent: ', avg_coverage_ratio_list)
                    # find the max average coverage ratio and its position on the list
                    max_index, max_avg_coverage_ratio_list = max(enumerate(avg_coverage_ratio_list), key=itemgetter(1))
                    if max_avg_coverage_ratio_list > 0.5:
                        # set 1 or the average value of keyphrase coverage and ratio for possibly more accurate results
               #         gold_kps[gold_kp_index] = 1  # set 1 the gold kp that matched to a predicted one
                        gold_kps[gold_kp_index] = gold_coverage_ratio_list[max_index] #max_avg_coverage_ratio_list # gold_coverage_ratio_list[gold_kp_index]
                        pred_kps[max_index] = 1  # set 1 the predicted kp that was matched with a gold one

            # save the kp predicted/gold matches of each document
            pred_list.extend(pred_kps)
            gold_list.extend(gold_kps)

        FN = gold_list.count(0)  # False Negative: keyphrases that exist in GOLD but not in PREDICTED
   #     TP = gold_list.count(1)  # True Positive: keyphrases that exist both in PREDICTED and GOLD
        TP = sum(gold_list)
        FP = pred_list.count(0)  # False Positive: keyphrases that exist in PREDICTED but not in GOLD (if key_pred not in y_test_set)

        precision = 0
        recall = 0
        f1_score = 0
        # print(TP, FN, FP)
        # print('precision=', TP / (TP + FP), 'recall=', TP / (TP + FN))
        if not (TP == FP == 0):
            precision = TP / (TP + FP)
        if not (TP == FN == 0):
            recall = TP / (TP + FN)
        if not (precision == recall == 0):
            f1_score = 2 * (precision * recall) / (precision + recall)


        print('\n' + eval_method)
        print('Precision: %.4f' % precision)
        print('Recall: %.4f' % recall)
        print('F1-score: %.4f\n' % f1_score)


    # ======================================================================================================================
    # Get the SETS of (unique) keyphrases for predicted and gold set
    # ======================================================================================================================

    # assemble the sentences of a document into a whole document again (only for the SENTEC and PARAGRAPH)
    print(x_filename)
    if 'SENTENC' in x_filename or 'SENTEC' in x_filename or 'PARAGRAPH' in x_filename:
        print('ENTERED SENTENC & PARAGRAPH MODE')
        y_test_set = []  # set of original/all GOLD keyphrases for each document
        y_test_set_extraction = []  # keep only the GOLD keyphrases that exist in their corresponding document
        pred_keyphrase_list_set = []  # set of PREDICTED keyphrases for each document
        gold_same_document_keyphrases = []  # save the gold keyphrases that are from the same document (only for the SENTEC and PARAGRAPH)
        gold_extraction_same_document_keyphrases = []  # save the gold keyphrases that are from the same document - extraction (only for the SENTEC and PARAGRAPH)
        pred_same_document_keyphrases = []  # save the pred keyphrases that are from the same document (only for the SENTEC and PARAGRAPH)

        for doc_index, doc in enumerate(y_test):  # get the set of GOLD keyphrases for each document
            # y gold set
            gold_document_keyphrases = []  # save the keyphrases of a document as strings (each keyphrase -> string)
            # y gold set - extraction
            gold_document_keyphrases_extraction = []  # save the keyphrases of a document as strings (each keyphrase -> string)
            # y predicted
            pred_document_keyphrases = []  # save the keyphrases of a document as strings (each keyphrase -> string)

            abstract_as_string = ' '.join([Stemmer('porter').stem(word) for word in x_test['abstract'][doc_index]])

            for tokenized_keyphrase in doc:
                keyphrase = ' '.join(tokenized_keyphrase)  # STEMMING is already applied

                gold_document_keyphrases.append(keyphrase.strip())

                if keyphrase.strip() in abstract_as_string:  # keep only keyphrases that exist in the text - keyphrase EXTRACTION
                    gold_document_keyphrases_extraction.append(keyphrase.strip())



            for tokenized_keyphrase in pred_keyphrase_list[doc_index]:
                keyphrase = ''
                for word in tokenized_keyphrase:
                    keyphrase += Stemmer('porter').stem(word) + ' '  # apply STEMMING
                pred_document_keyphrases.append(keyphrase.strip())



            # check if the previous sentence is in the same document (has the same document id) as the current
            if doc_index == 0:
                # print('we are in the 1st document')
                gold_same_document_keyphrases.extend(gold_document_keyphrases)
                gold_extraction_same_document_keyphrases.extend(gold_document_keyphrases_extraction)
                pred_same_document_keyphrases.extend(pred_document_keyphrases)
            elif assembl_docs[doc_index] == assembl_docs[doc_index - 1]:
                # print('we are in the same document', y_test['assemble_documents_index'][doc_index], '==', y_test['assemble_documents_index'][doc_index - 1])
                gold_same_document_keyphrases.extend(gold_document_keyphrases)
                gold_extraction_same_document_keyphrases.extend(gold_document_keyphrases_extraction)
                pred_same_document_keyphrases.extend(pred_document_keyphrases)
            else:  # different documents
                # print('CHANGED document', y_test['assemble_documents_index'][doc_index], '==', y_test['assemble_documents_index'][doc_index - 1])
                # save keyphrases for the previous document
                y_test_set.append(set(gold_same_document_keyphrases))  # get each keyphrase just once
                y_test_set_extraction.append(set(gold_extraction_same_document_keyphrases))
                pred_keyphrase_list_set.append(set(pred_same_document_keyphrases))  # get each keyphrase just once

                # create the new document keyphrase set
                gold_same_document_keyphrases = gold_document_keyphrases
                gold_extraction_same_document_keyphrases = gold_document_keyphrases_extraction
                pred_same_document_keyphrases = pred_document_keyphrases

            # save the keyphrases for the last document
            if (doc_index + 2) > len(pred_keyphrase_list):  # (+2 because counting starts from 0 and we want the next element as well)
                # save keyphrases for the current document
                y_test_set.append(set(gold_same_document_keyphrases))  # get each keyphrase just once
                y_test_set_extraction.append(set(gold_extraction_same_document_keyphrases))
                pred_keyphrase_list_set.append(set(pred_same_document_keyphrases))  # get each keyphrase just once


        # count all keyphrases and keyphrases existing in text
        keyphrase_counter = 0
        extraction_keyphrase_counter = 0
        for doc_idx, y_test_extraction_doc in enumerate(y_test_set_extraction):
            extraction_keyphrase_counter += len(y_test_extraction_doc)
            keyphrase_counter += len(y_test_set[doc_idx])
        print('existing keyphrases', extraction_keyphrase_counter)
        print('all keyphrases', keyphrase_counter)

    else:  # for the full-text documents
        y_test_set = []  # set of original/all GOLD keyphrases for each document
        y_test_set_extraction = []  # keep only the GOLD keyphrases that exist in their corresponding document
        for doc_index, test_doc in enumerate(y_test):  # get the set of GOLD keyphrases for each document
            extraction_document_keyphrases = []  # save the keyphrases that exist in text (extraction) of a document as strings (each keyphrase -> string)
            document_keyphrases = []  # save all keyphrases of a document as strings (each keyphrase -> string)

            abstract_as_string = ' '.join([Stemmer('porter').stem(word) for word in x_test['abstract'][doc_index]])

            for tokenized_keyphrase in test_doc:
                keyphrase = ' '.join(tokenized_keyphrase)  # STEMMING is already applied

                document_keyphrases.append(keyphrase.strip())

                if keyphrase.strip() in abstract_as_string:  # keep only keyphrases that exist in the text - keyphrase EXTRACTION
                    extraction_document_keyphrases.append(keyphrase.strip())
                #            print(document_keyphrases)
            y_test_set.append(set(document_keyphrases))  # get each keyphrase just once
            y_test_set_extraction.append(set(extraction_document_keyphrases))  # get each keyphrase just once


        # count all keyphrases and keyphrases existing in text
        keyphrase_counter = 0
        extraction_keyphrase_counter = 0
        for doc_idx, y_test_extraction_doc in enumerate(y_test_set_extraction):
            extraction_keyphrase_counter += len(y_test_extraction_doc)
            keyphrase_counter += len(y_test_set[doc_idx])
        print('existing keyphrases', extraction_keyphrase_counter)
        print('all keyphrases', keyphrase_counter)


        pred_keyphrase_list_set = []  # set of PREDICTED keyphrases for each document
        for doc in pred_keyphrase_list:  # get the set of PREDICTED keyphrases for each document
            document_keyphrases = []  # save the keyphrases of a document as strings (each keyphrase -> string)
            for tokenized_keyphrase in doc:
                keyphrase = ''
                for word in tokenized_keyphrase:
                    keyphrase += Stemmer('porter').stem(word) + ' '  # apply STEMMING
                document_keyphrases.append(keyphrase.strip())
            pred_keyphrase_list_set.append(set(document_keyphrases))  # get each keyphrase just once



    # print y_test and y_pred

    #for i in range(len(pred_keyphrase_list_set)):
    for i in range(10):
        print('pred', pred_keyphrase_list_set[i])
        print('test', y_test_set[i])
        print('extraction test', y_test_set_extraction[i])


    # ======================================================================================================================
    # Exact Match - Model Evaluation
    # ======================================================================================================================

    # Exact Match: the keyphrases must be given as whole strings

    # extraction - only GOLD KPs existing in text
    calculate_metrics(y_test_set_extraction, pred_keyphrase_list_set, 'Exact Match - Extraction')
    # all GOLD KPs
    calculate_metrics(y_test_set, pred_keyphrase_list_set, 'Exact Match')


    # ======================================================================================================================
    # NEW METHOD - Semi-Exact Match - Model Evaluation
    # ======================================================================================================================

    # extraction - only GOLD KPs existing in text
    calculate_semi_exact_match_metrics(y_test_set_extraction, pred_keyphrase_list_set, 'Semi-exact Match - Extraction')
    # all GOLD KPs
    calculate_semi_exact_match_metrics(y_test_set, pred_keyphrase_list_set, 'Semi-exact Match')


    # ======================================================================================================================
    # Partial Match - Model Evaluation
    # ======================================================================================================================

    # Partial Match: the keyphrases must be given as a set of words

    # Get the sets of all gold keyphrases
    y_test_set_partial = []
    for doc in y_test_set:  # get the set of GOLD keyphrases for each document
        document_keywords = []
        for keyphrase in doc:
            keyphrase = word_tokenize(keyphrase)
            for word in keyphrase:
                document_keywords.append(word)
        y_test_set_partial.append(set(document_keywords))

    # Get the sets of all gold keyphrases existing in text (extraction)
    y_test_set_partial_extraction = []
    for doc in y_test_set_extraction:  # get the set of GOLD keyphrases for each document
        document_keywords = []
        for keyphrase in doc:
            keyphrase = word_tokenize(keyphrase)
            for word in keyphrase:
                document_keywords.append(word)
        y_test_set_partial_extraction.append(set(document_keywords))

    # Get the sets of all predicted keyphrases
    pred_keyphrase_list_set_partial = []
    for doc in pred_keyphrase_list_set:  # get the set of PREDICTED keyphrases for each document
        document_keywords = []
        for keyphrase in doc:
            keyphrase = word_tokenize(keyphrase)
            for word in keyphrase:
                document_keywords.append(word)
        pred_keyphrase_list_set_partial.append(set(document_keywords))

    # extraction - only GOLD KPs existing in text
    calculate_metrics(y_test_set_partial_extraction, pred_keyphrase_list_set_partial, 'Partial Match - Extraction')
    # all GOLD KPs
    calculate_metrics(y_test_set_partial, pred_keyphrase_list_set_partial, 'Partial Match')
def semeval_summarized_statistics():
    # reading the initial JSON data using json.load()
    file = '..\\data\\benchmark_data\\summarization_experiment\\SemEval-2010_summarized.csv'  # TEST data to evaluate the final model

    # ======================================================================================================================
    # Read data
    # ======================================================================================================================

    data = pd.read_csv(file, encoding="utf8")
    print(data)

    # ======================================================================================================================
    # Split keyphrases list of keyphrases from string that contains all the keyphrases
    # ======================================================================================================================

    for index, keywords in enumerate(data['keyword']):
        data['keyword'].iat[index] = keywords.split(
            ';')  # split keywords to separate them from one another

    # ======================================================================================================================
    # Isolate the title, abstract and the main body (+ remove section identifiers and '\n')
    # ======================================================================================================================

    # tokenize key-phrases and keep them categorized by document
    for index, abstract in enumerate(data['abstract']):
        title_summary = data['title'][
            index] + ' ' + abstract  # combine title + abstract + main body
        # remove '\n'
        title_summary = title_summary.replace('\n', ' ')

        data['abstract'].iat[index] = title_summary

    # ======================================================================================================================
    # Remove Contractions (pre-processing)
    # ======================================================================================================================

    # substitute contractions with full words
    data['abstract'] = data['abstract'].apply(replace_contractions)
    data['keyword'] = data['keyword'].apply(
        lambda set_of_keyphrases:
        [replace_contractions(keyphrase) for keyphrase in set_of_keyphrases])

    # ======================================================================================================================
    # Remove punctuation (with whitespace) + digits (from ABSTRACT) + clean empty strings
    # ======================================================================================================================

    # remove parenthesis, brackets and their contents
    data['abstract'] = data['abstract'].apply(remove_brackets_and_contents)

    # remove references of publications (in document text)
    data['abstract'] = data['abstract'].apply(remove_references)

    # remove punctuation
    data['abstract'] = data['abstract'].apply(remove_punct_and_non_ascii)
    data['keyword'] = data['keyword'].apply(keyword_remove_punct_and_non_ascii)

    # Replace the pure digit terms with DIGIT_REPL
    data['abstract'] = data['abstract'].apply(lambda text: " ".join([
        token if not re.match(r'^\d+$', token) else 'DIGIT_REPL'
        for token in text.split()
    ]))  # replace pure-digit tokens with DIGIT_REPL
    print('convert digits - abstract finish')

    # remove rows with empty and one word abstracts/sentences
    data = data[data['abstract'].str.strip().astype(bool)]
    data.reset_index(drop=True, inplace=True)

    # remove empty keyphrases
    data['keyword'] = data['keyword'].apply(
        lambda set_of_keyws:
        [key_text for key_text in set_of_keyws if key_text.strip()])
    # remove rows with empty keyphrases
    data = data[data['keyword'].map(len) > 0]

    # ======================================================================================================================
    # Tokenize each sentence + remove digits (from KEYPHRASES)
    # ======================================================================================================================

    # tokenize text
    data['abstract'] = data['abstract'].apply(tokenize_lowercase)
    print('tokenization - abstract finish')

    # stem, tokenize and lower case keyphrases and keep them categorized by document
    for index, list_of_keyphrases in enumerate(data['keyword']):
        keyphrases_list = []
        for keyphrase in list_of_keyphrases:  # get words of all keyphrases in a single list
            # keyphrase = keyphrase.translate(remove_digits).strip()  # remove digits
            keyphrase = keyphrase.strip()  # remove whitespaces
            if len(keyphrase):  # check if the keyphrase is empty
                tokens = word_tokenize(keyphrase)  # tokenize
                # Replace the pure digit terms with DIGIT_REPL
                tokens = [
                    tok if not re.match(r'^\d+$', tok) else 'DIGIT_REPL'
                    for tok in tokens
                ]
                # Replace the combination of characters and digits with WORD_DIGIT_REPL
                #tokens = [tok if not re.match('.*\d+', tok) else 'WORD_DIGIT_REPL' for tok in tokens]
                tokens = [
                    Stemmer('porter').stem(keyword.lower())
                    for keyword in tokens
                ]  # stem + lower case
                tokens = ' '.join(tokens)
                keyphrases_list.append(tokens)

        data['keyword'].iat[index] = keyphrases_list

    # ======================================================================================================================
    # Count logistics
    # ======================================================================================================================

    semeval_keywords_in_summary = 0  # the count of keywords in abstract
    semeval_total_keywords = 0  # the count of all keywords
    for index, keywords in enumerate(data['keyword']):
        semeval_total_keywords += len(keywords)
        # print('total_keywords', len(test))
        # print('total_keywords', test)

        for keyword in keywords:
            # check if keyword exists on abstract
            if keyword in data['abstract'][index]:
                semeval_keywords_in_summary += 1
                # print(keyword)
                # print(data['abstract'][index])

    print('SemEval summarized: ', semeval_keywords_in_summary)
    print('SemEval summarized - total keyphrases: ', semeval_total_keywords)

    print('SemEval summarized - count of keywords in abstract: ',
          semeval_keywords_in_summary / semeval_total_keywords)

    return semeval_keywords_in_summary
Code Example #17
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import sys
import os
import glob
import json
from nltk.stem.snowball import SnowballStemmer as Stemmer

references = {}

for input_file in glob.glob(sys.argv[1] + '/*.key'):
    file_id = input_file.split('/')[-1].split('.')[0]
    with open(input_file, 'r') as f:
        lines = f.readlines()
        keyphrases = []
        for line in lines:
            words = line.strip().split()
            stems = [Stemmer('porter').stem(w.lower()) for w in words]
            keyphrases.append([' '.join(stems)])
            # keyphrases.append([' '.join([w.lower() for w in words])])
        references[file_id] = keyphrases

with open(sys.argv[2], 'w') as o:
    json.dump(references, o, sort_keys=True, indent=4)
Code Example #18
for index, list_of_keyphrases in enumerate(data['keyword']):
    keyphrases_list = []
    for keyphrase in list_of_keyphrases:  # get words of all keyphrases in a single list
        # keyphrase = keyphrase.translate(remove_digits).strip()  # remove digits
        keyphrase = keyphrase.strip()  # remove whitespaces
        if len(keyphrase):  # check if the keyphrase is empty
            tokens = word_tokenize(keyphrase)  # tokenize
            # Replace the pure digit terms with DIGIT_REPL
            tokens = [
                tok if not re.match(r'^\d+$', tok) else 'DIGIT_REPL'
                for tok in tokens
            ]
            # Replace the combination of characters and digits with WORD_DIGIT_REPL
            #tokens = [tok if not re.match('.*\d+', tok) else 'WORD_DIGIT_REPL' for tok in tokens]
            keyphrases_list.append([
                Stemmer('porter').stem(keyword.lower()) for keyword in tokens
            ])  # stem + lower case
    data['keyword'].iat[index] = keyphrases_list
#    print('THESE ARE THE KEYPHRASE LIST', len(keyphrases_list), keyphrases_list)

# ======================================================================================================================
# Write pre-processed keyphrases to csv file
# ======================================================================================================================

data['abstract'].to_csv(x_text_filename,
                        index=False)  # save the preprocessed document text
data['keyword'].to_csv(y_text_filename,
                       index=False)  # save the preprocessed keyphrases

# ======================================================================================================================
# Give labels to each word of Abstract (fulltext) - keyword (KP) or Non-keyword (Non-KP)
Code Example #19
        doc_id = file_id.split('/')[-1][:-5]

        # print('Loading {}'.format(doc_id))

        with open(file_id, 'r') as f:
            lines = f.readlines()
            tags[doc_id].update([l.lower().strip() for l in lines])

references = {}

for doc_id in tags:

    # group tags by stem
    stem_to_tag = collections.defaultdict(list)
    for tag in tags[doc_id]:
        stem = [Stemmer('porter').stem(w) for w in tag.split()]
        for _ in range(tags[doc_id][tag]):
            stem_to_tag[' '.join(stem)].append(tag)

    valid_tags = []
    for tag in stem_to_tag:
        if len(stem_to_tag[tag]) > 1:
            valid_tags.append(tag)

    if len(valid_tags):
        if sys.argv[3] == 'stem':
            references[doc_id] = [[t] for t in valid_tags]
        else:
            references[doc_id] = [
                list(set(stem_to_tag[t])) for t in valid_tags
            ]
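To make the grouping step above concrete, here is a small self-contained sketch, assuming tags[doc_id] is a collections.Counter of lowercased tag strings as the earlier update() call suggests; the tag counts are illustrative.

import collections
from nltk.stem.snowball import SnowballStemmer as Stemmer

tag_counts = collections.Counter({'grid computing': 2, 'grids computing': 1, 'java': 1})

stem_to_tag = collections.defaultdict(list)
for tag, count in tag_counts.items():
    stem = ' '.join(Stemmer('porter').stem(w) for w in tag.split())
    stem_to_tag[stem].extend([tag] * count)

# only tags whose stemmed form was assigned more than once are kept
valid_tags = [s for s, variants in stem_to_tag.items() if len(variants) > 1]
print(valid_tags)  # e.g. ['grid comput']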
Code Example #20
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import sys
import os
import glob
import codecs
import json
from nltk.stem.snowball import SnowballStemmer as Stemmer

references = {}

for input_file in glob.glob(sys.argv[1] + '/*.key'):
    file_id = '.'.join(input_file.split('/')[-1].split('.')[0:-1])
    print(file_id)
    with codecs.open(input_file, 'r', 'iso-8859-1') as f:
        lines = f.readlines()
        keyphrases = []
        for line in lines:
            words = line.strip().split()
            stems = [Stemmer('portuguese').stem(w.lower()) for w in words]
            if sys.argv[3] == "stem":
                keyphrases.append([' '.join(stems)])
            else:
                keyphrases.append([' '.join([w.lower() for w in words])])
        references[file_id] = keyphrases

with open(sys.argv[2], 'w') as o:
    json.dump(references, o, sort_keys=True, indent=4)
Code Example #21
import argparse
import nltk
import math
import re
import random
import scipy
import pickle
import sys

import numpy as np  # numpy import was missing from this excerpt
from collections import Counter
from nltk.stem.snowball import SnowballStemmer as Stemmer
from scipy.special import expit
# from stemming.porter2 import stem

np.set_printoptions(threshold=sys.maxsize)  # print full arrays; threshold='nan' is rejected by current numpy

stemmer = Stemmer("english")

dictionary_counts = dict()
dictionary_indices = dict()
total_documents = 0
regex = re.compile('[^a-zA-Z ]')


def sigmoid(x):
    return expit(x)


def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

Code Example #22
references = {}
stemmed_references = {}

with open(sys.argv[1], 'r') as f:
    for file_number, line in enumerate(f.readlines()):
        document = json.loads(line)
        file_id = '{0:05d}'.format(file_number)
        output_file = sys.argv[2] + '/{}.txt'.format(file_id)

        logging.info("writting file {}".format(output_file))
        with codecs.open(output_file, 'w', 'utf-8') as o:
            o.write(document['title'] + "\n\n")
            o.write(document['abstract'])

        references[file_id] = []
        stemmed_references[file_id] = []

        keyphrases = document['keyword'].split(';')
        for keyphrase in keyphrases:
            words = keyphrase.lower().strip().split()
            stems = [Stemmer('porter').stem(w) for w in words]
            references[file_id].append([' '.join(words)])
            stemmed_references[file_id].append([' '.join(stems)])

with open(sys.argv[3], 'w') as o:
    json.dump(references, o, sort_keys=True, indent=4)

with open(sys.argv[4], 'w') as o:
    json.dump(stemmed_references, o, sort_keys=True, indent=4)
print(data)

# ======================================================================================================================
# Format keyphrases and retrieve document text
# ======================================================================================================================

list_of_document_title = []  # save the title of documents
list_of_document_abstract = []  # save the abstract of documents
list_of_document_text = []  # save the body of documents
gold_keyphrases = []  # save the gold keyphrases of documents
pred_keyphrases = []  # save the predicted keyphrases of documents
for indx, abstract_document in enumerate(data['abstract']):
    # print('train_test_combined/' + key + '.xml')
    # print(keyphrases_dictionary[key])

    gold_keyphrases.append([
        [Stemmer('porter').stem(keyword) for keyword in keyphrase.split()]
        for keyphrase in data['keyword'][indx].split(';')
    ])  # split gold keywords to separate them from one another



# ======================================================================================================================
# MultipartiteRank Extractor
# ======================================================================================================================

    # 1. create a MultipartiteRank extractor.
    extractor = pke.unsupervised.MultipartiteRank()

    # 2. load the content of the document.
    extractor.load_document(input=abstract_document,
                            normalization="stemming")

    # 3. select the longest sequences of nouns and adjectives, that do