def read_raw_document(self, stemmer='porter'):
    """ Read the raw input file and populate the sentence list.

        Args:
            stemmer (str): the stemmer in nltk to use, defaults to porter (can
                be set to None for using word surface forms instead of stems).
    """

    # parse the document using the raw text reader
    parse = RawTextReader(self.input_file)

    # loop through the parsed sentences
    for i, sentence in enumerate(parse.sentences):

        # add the sentence to the container
        self.sentences.append(Sentence(words=sentence['words']))

        # add the POS
        self.sentences[i].pos = sentence['POS']

        # add the stems
        if stemmer is not None:
            for j, word in enumerate(self.sentences[i].words):
                self.sentences[i].stems.append(Stemmer(stemmer).stem(word))
        # otherwise computations are performed on surface forms
        else:
            self.sentences[i].stems = self.sentences[i].words

        # lowercase the stems/lemmas
        for j, stem in enumerate(self.sentences[i].stems):
            self.sentences[i].stems[j] = stem.lower()
class Sentence:
    stemmer = Stemmer()

    def __init__(self, dictionary, startIndex: int, endIndex: int, sent: str,
                 start: int, end: int):
        self.startIndex = startIndex
        self.endIndex = endIndex
        self.sent = sent
        self.words = self.sentToWords()
        self.nGrams = self.wordsToTrigramsWithIndices(dictionary)
        self.start = start
        self.end = end

    def sentToWords(self) -> List[str]:
        # FIXME: remove_stops . remove_puncts ~> remove_sth(_, stops | puncts)
        return wordsToStemmed(
            remove_stops(remove_puncts(word_tokenize(self.sent))))

    def wordsToTrigramsWithIndices(self, dictionary):
        def getIndexedTuple(word: str):
            index = -1
            if word in dictionary.wordsToIndices:
                index = dictionary.wordsToIndices[word]
            return (index, word)

        return list(trigrams(list(map(getIndexedTuple, self.words))))
def read_preprocessed_document(self, stemmer='porter', sep='/'):
    """ Read the preprocessed input file and populate the sentence list.

        Args:
            stemmer (str): the stemmer in nltk to use, defaults to porter.
            sep (str): the separator of the tagged word, defaults to /.
    """

    # parse the document using the preprocessed text parser
    parse = PreProcessedTextReader(self.input_file, sep=sep)

    # loop through the parsed sentences
    for i, sentence in enumerate(parse.sentences):

        # add the sentence to the container
        self.sentences.append(Sentence(words=sentence['words']))

        # add the POS
        self.sentences[i].pos = sentence['POS']

        # add the stems
        if stemmer is None:
            self.sentences[i].stems = list(self.sentences[i].words)
        else:
            for j, word in enumerate(self.sentences[i].words):
                self.sentences[i].stems.append(Stemmer(stemmer).stem(word))

        # lowercase the stems/lemmas
        for j, stem in enumerate(self.sentences[i].stems):
            self.sentences[i].stems[j] = stem.lower()
def load_references(input_file,
                    sep_doc_id=':',
                    sep_ref_keyphrases=',',
                    reference_stemming=False,
                    stemmer='porter'):
    """ Load a reference file and return a dictionary. """

    logging.info('loading reference keyphrases from ' + input_file)
    references = defaultdict(list)

    with codecs.open(input_file, 'r', 'utf-8') as f:
        for line in f:
            cols = line.strip().split(sep_doc_id)
            doc_id = cols[0].strip()
            keyphrases = cols[1].strip().split(sep_ref_keyphrases)
            for v in keyphrases:
                if '+' in v:
                    for s in v.split('+'):
                        references[doc_id].append(s)
                else:
                    references[doc_id].append(v)
            if reference_stemming:
                for i, k in enumerate(references[doc_id]):
                    stems = [Stemmer(stemmer).stem(u) for u in k.split()]
                    references[doc_id][i] = ' '.join(stems)

    return references
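# Minimal usage sketch for load_references (hypothetical file name and line;
# assumes the codecs/logging/defaultdict/Stemmer imports used above are in scope).
# A reference file holds one "<doc_id> : <kp>, <kp>" line per document, with
# '+' separating alternative forms of the same keyphrase.
with codecs.open('references.txt', 'w', 'utf-8') as tmp:
    tmp.write('doc-1 : neural networks, keyphrase extraction+keyword extraction\n')
refs = load_references('references.txt', reference_stemming=True)
print(refs['doc-1'])  # one stemmed entry per keyphrase, '+' alternatives split into separate entries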
def read_raw_document(self, stemmer='porter'):
    """ Read the raw input file and populate the sentence list.

        Args:
            stemmer (str): the stemmer in nltk to use, defaults to porter.
    """

    # parse the document using the raw text reader
    parse = RawTextReader(self.input_file)

    # loop through the parsed sentences
    for i, sentence in enumerate(parse.sentences):

        # add the sentence to the container
        self.sentences.append(Sentence(words=sentence['words']))

        # add the POS
        self.sentences[i].pos = sentence['POS']

        # add the stems
        for j, word in enumerate(self.sentences[i].words):
            self.sentences[i].stems.append(Stemmer(stemmer).stem(word))

        # lowercase the stems/lemmas
        for j, stem in enumerate(self.sentences[i].stems):
            self.sentences[i].stems[j] = stem.lower()
def tokenize_lowercase(text):
    '''
    Tokenize, stem and convert the text of a document to lower case
    :param text: text of a specific document
    :return: formatted text
    '''
    words = word_tokenize(text)  # tokenize document text
    # stem and lowercase every token of the document text
    formatted_tok_text = [
        Stemmer('porter').stem(word_token.lower()) for word_token in words
    ]  # DO NOT STEM TEXT WORDS TO TRAIN THE CLASSIFIER
    formatted_text = ' '.join(formatted_tok_text)
    return formatted_text
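# Minimal usage sketch (assumes the word_tokenize and Stemmer imports used by
# tokenize_lowercase above; the sentence and the exact stems are illustrative).
sample = tokenize_lowercase('Neural networks extract keyphrases from abstracts.')
print(sample)  # space-joined lowercase Porter stems, e.g. 'neural network extract keyphras from abstract .'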
class Sentence:
    stemmer = Stemmer()
    lemmater = WordNetLemmatizer()

    def __init__(self, index: int, sent: str, start: int, end: int):
        self.index = index
        self.sent = sent
        self.words = self.sentToWords()
        self.nGrams = list(trigrams(self.words))
        self.start = start
        self.end = end

    def sentToWords(self) -> List[str]:
        return word_tokenize(self.sent)
def read_corenlp_document(self, use_lemmas=False, stemmer='porter'):
    """ Read the input file in CoreNLP XML format and populate the sentence
        list.

        Args:
            use_lemmas (bool): whether lemmas from stanford corenlp are used
                instead of stems (computed by nltk), defaults to False.
            stemmer (str): the stemmer in nltk to use (if used), defaults to
                porter (can be set to None for using word surface forms
                instead of stems).
    """

    # parse the document using the Minimal CoreNLP parser
    parse = MinimalCoreNLPParser(self.input_file)

    # loop through the parsed sentences
    for i, sentence in enumerate(parse.sentences):

        # add the sentence to the container
        self.sentences.append(Sentence(words=sentence['words']))

        # add the POS
        self.sentences[i].pos = sentence['POS']

        # add the lemmas
        self.sentences[i].stems = sentence['lemmas']

        # flatten with the stems if required
        if not use_lemmas:
            # if stemming is performed
            if stemmer is not None:
                for j, word in enumerate(self.sentences[i].words):
                    self.sentences[i].stems[j] = Stemmer(stemmer).stem(word)
            # else, all computations are performed on surface forms
            else:
                self.sentences[i].stems = self.sentences[i].words

        # lowercase the stems/lemmas
        for j, stem in enumerate(self.sentences[i].stems):
            self.sentences[i].stems[j] = stem.lower()

        # add the meta-information
        # for k, infos in sentence.iteritems(): -- Python 2/3 compatible
        for (k, infos) in sentence.items():
            if k not in set(['POS', 'lemmas', 'words']):
                self.sentences[i].meta[k] = infos
def extract_keyphrases(data):
    gold_keyphrases = []  # save the gold keyphrases of documents
    pred_keyphrases = []  # save the predicted keyphrases of documents
    for indx, abstract_document in enumerate(data['abstract']):
        # print('train_test_combined/' + key + '.xml')
        # print(keyphrases_dictionary[key])
        # if 'json' in file:
        gold_keyphrases.append([
            [Stemmer('porter').stem(keyword) for keyword in keyphrase.split()]
            for keyphrase in data['keyword'][indx].split(';')
        ])  # split gold keywords to separate them from one another

        # ==============================================================================================================
        # TF-IDF Extractor
        # ==============================================================================================================

        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')

        # 1. create a TfIdf extractor.
        extractor = pke.unsupervised.TfIdf()

        # print(' '.join(abstract_document))
        print(abstract_document)

        # 2. load the content of the document.
        extractor.load_document(
            input=abstract_document,  # ' '.join(abstract_document)
            language='en',
            normalization="stemming")

        # 3. select {1-3}-grams not containing punctuation marks as candidates.
        extractor.candidate_selection(n=3, stoplist=stoplist)

        # 4. weight the candidates using a `tf` x `idf`
        # `input_file` (path to the document frequency counts) is assumed to be defined elsewhere in the script
        df = pke.load_document_frequency_file(input_file=input_file)
        extractor.candidate_weighting(df=df)

        # 5. get the 10-highest scored candidates as keyphrases
        pred_kps = extractor.get_n_best(n=10)

        # keep only the predicted keyphrase (first position -> [0]) and discard the frequency number
        pred_keyphrases.append([kp[0].split() for kp in pred_kps])

    print(pred_keyphrases)
    print(gold_keyphrases)
    return pred_keyphrases, gold_keyphrases
class Sentence:
    stemmer = Stemmer()

    def __init__(self, startIndex: int, endIndex: int, sent: str, start: int,
                 end: int):
        self.startIndex = startIndex
        self.endIndex = endIndex
        self.sent = sent
        self.words = self.sentToWords()
        self.nGrams = list(trigrams(self.words))
        self.start = start
        self.end = end

    def sentToWords(self) -> List[str]:
        # FIXME: remove_stops . remove_puncts ~> remove_sth(_, stops | puncts)
        return wordsToStemmed(
            remove_stops(remove_puncts(word_tokenize(self.sent))))
def extract_keyphrases(data):
    gold_keyphrases = []  # save the gold keyphrases of documents
    pred_keyphrases = []  # save the predicted keyphrases of documents
    for indx, abstract_document in enumerate(data['abstract']):
        # print('train_test_combined/' + key + '.xml')
        # print(keyphrases_dictionary[key])
        gold_keyphrases.append(
            [[Stemmer('porter').stem(keyword) for keyword in keyphrase.split()]
             for keyphrase in data['keyword'][indx].split(';')])  # split gold keywords to separate them from one another

        # ==============================================================================================================
        # MultipartiteRank Extractor
        # ==============================================================================================================

        # 1. create a MultipartiteRank extractor.
        extractor = pke.unsupervised.MultipartiteRank()

        # 2. load the content of the document.
        extractor.load_document(input=abstract_document, normalization="stemming")

        # 3. select the longest sequences of nouns and adjectives, that do
        # not contain punctuation marks or stopwords as candidates.
        pos = {'NOUN', 'PROPN', 'ADJ'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        extractor.candidate_selection(pos=pos, stoplist=stoplist)

        # 4. build the Multipartite graph and rank candidates using random walk,
        # alpha controls the weight adjustment mechanism, see TopicRank for
        # threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1, threshold=0.74, method='average')

        # 5. get the 10-highest scored candidates as keyphrases
        pred_kps = extractor.get_n_best(n=10)

        pred_keyphrases.append([kp[0].split() for kp in pred_kps])  # keep only the predicted keyphrase and discard the frequency number

    print(pred_keyphrases)
    print(gold_keyphrases)
    return pred_keyphrases, gold_keyphrases
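# Note on the format of pred_kps above: get_n_best in pke returns
# (candidate, score) tuples, which is why kp[0].split() keeps only the
# tokenized candidate string and drops the score; any concrete values such as
# [('multipartit rank', 0.08), ('keyphras extract', 0.06)] are illustrative.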
def extract_keywords(sentence):
    sentence = sentence.lower()
    not_stopw = [
        "no", "nor", "not", "over", "under", "again", "further", "but",
        "against", "too", "very"
    ]
    stopw = stopwords.words('english')
    for x in not_stopw:
        stopw.remove(x)
    print(stopw)
    pattern = re.compile(r'\b(' + r'|'.join(stopw) + r')\b\s*')
    sentence = sentence.replace('\n', '')
    sentence = sentence.replace("n't", " not")
    sentence = clean_string(sentence)
    sentence = pattern.sub('', sentence)
    stemmer = Stemmer()
    s = [stemmer.stem(w) for w in sentence.split()]
    b = zip(*[s[i:] for i in [0, 1]])
    b = [bigram[0] + " " + bigram[1] for bigram in b]
    return s + b
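# Minimal usage sketch (assumes the stopwords/re/Stemmer/clean_string names
# used above are in scope and that clean_string leaves these tokens intact;
# the sentence and the exact stems are illustrative).
print(extract_keywords('The results were not significant'))
# -> stemmed unigrams followed by their adjacent bigrams, roughly
#    ['result', 'not', 'signific', 'result not', 'not signific']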
for index, list_of_keyphrases in enumerate(data['keyword']):
    keyphrases_list = []
    for keyphrase in list_of_keyphrases:  # get words of all keyphrases in a single list
        # keyphrase = keyphrase.translate(remove_digits).strip()  # remove digits
        keyphrase = keyphrase.strip()  # remove whitespaces
        if len(keyphrase):  # skip empty keyphrases
            tokens = word_tokenize(keyphrase)  # tokenize

            # Replace the pure digit terms with DIGIT_REPL
            tokens = [
                tok if not re.match('^\d+$', tok) else 'DIGIT_REPL'
                for tok in tokens
            ]
            # Replace the combination of characters and digits with WORD_DIGIT_REPL
            # tokens = [tok if not re.match('.*\d+', tok) else 'WORD_DIGIT_REPL' for tok in tokens]

            tokens = [
                Stemmer('porter').stem(keyword.lower()) for keyword in tokens
            ]  # stem + lower case
            tokens = ' '.join(tokens)
            keyphrases_list.append(tokens)
    data['keyword'].iat[index] = keyphrases_list


# ======================================================================================================================
# Count logistics
# ======================================================================================================================

keywords_in_title = 0  # the count of keywords in title
keywords_in_abstract = 0  # the count of keywords in abstract
keywords_in_title_abstract = 0  # the count of keywords that are either in title or abstract
keywords_in_title_NOT_abstract = 0  # the count of keywords that are in title BUT NOT in abstract
total_keywords = 0  # the count of all keywords
print('tokenization - abstract finish')

# stem, tokenize and lower case keyphrases and keep them categorized by document
for index, list_of_keyphrases in enumerate(data['keywords']):
    keyphrases_list = []
    for keyphrase in list_of_keyphrases:  # get words of all keyphrases in a single list
        # keyphrase = keyphrase.translate(remove_digits).strip()  # remove digits
        keyphrase = keyphrase.strip()  # remove whitespaces
        if len(keyphrase):  # skip empty keyphrases
            tokens = word_tokenize(keyphrase)  # tokenize

            # Replace the pure digit terms with DIGIT_REPL
            tokens = [tok if not re.match('^\d+$', tok) else 'DIGIT_REPL' for tok in tokens]
            # Replace the combination of characters and digits with WORD_DIGIT_REPL
            # tokens = [tok if not re.match('.*\d+', tok) else 'WORD_DIGIT_REPL' for tok in tokens]

            keyphrases_list.append([Stemmer('porter').stem(keyword.lower()) for keyword in tokens])  # stem + lower case
    data['keywords'].iat[index] = keyphrases_list
    # print('THESE ARE THE KEYPHRASE LIST', len(keyphrases_list), keyphrases_list)


# ======================================================================================================================
# Write pre-processed keyphrases to csv file
# ======================================================================================================================

data['abstract'].to_csv(x_text_filename, index=False)  # save the preprocessed document text

# rename column "keywords" to "keyword" for uniformity between datasets
data.rename(columns={"keywords": "keyword"}, inplace=True)

data[['keyword', 'assemble_documents_index']].to_csv(y_text_filename, index=False)  # save the preprocessed keyphrases
def evaluation(y_pred=None, y_test=None, x_test=None, x_filename=None, y_filename=None, paragraph_assemble_docs=None):
    """
    Evaluate the performance
    :param y_pred: the predicted labels
    :param y_test: the test labels
    :param x_filename: the name of the GOLD document text file - NEEDS TO MATCH THE LOADED FILE WHEN MAKING PREDICTIONS (default evaluation dataset is KP20K)
    :param y_filename: the name of the GOLD keyphrase file - NEEDS TO MATCH THE LOADED FILE WHEN MAKING PREDICTIONS (default evaluation dataset is KP20K)
    :param paragraph_assemble_docs: (ONLY FOR UNSUPERVISED METHODS) the indices to re-assemble the first 3 paragraphs
    :return: -
    """
    if y_test is None:  # evaluate the Bi-LSTM-CRF + unsupervised methods
        # ==============================================================================================================
        # Load all validation target data (y_test\labels) on memory (needed for evaluation)
        # ==============================================================================================================

        # read preprocessed document text (x) and preprocessed keyphrases (y)
        x_test = pd.read_csv(x_filename, encoding="utf8")
        y_test = pd.read_csv(y_filename, encoding="utf8")

        # translate string back to list of lists (when reading a dataframe, lists of lists are read as strings)
        x_test['abstract'] = x_test['abstract'].map(ast.literal_eval)
        if 'SENTENC' in x_filename or 'SENTEC' in x_filename or 'PARAGRAPH' in x_filename:
            assembl_docs = y_test['assemble_documents_index']
        y_test = y_test['keyword'].map(ast.literal_eval)

        # print(x_test)
        print(y_test)

        # ==============================================================================================================
        # Convert y_test and y_pred from categorical (two columns, 1 for each label) to a single value label (1 column)
        # ==============================================================================================================

        def pred2label(all_abstract_preds):
            '''
            Converts prediction set and test/validation set from two columns (one for each label value) to just one
            column with the number of the corresponding label
            [ initial array: [1, 0] => final array: [0] ] - [ initial array: [0, 1] => final array: [1] ]
            :param all_abstract_preds: array with predictions or test/validation set [documents/abstracts, number of words]
            :return: flattened array that contains the prediction for each word [number of total words of all abstracts]
            '''
            preds = []
            for abstract_preds in all_abstract_preds:
                # the position of the max value corresponds to the actual label value (0: Non-KP, 1: KP)
                doc_preds = [np.argmax(word_pred) for word_pred in abstract_preds]
                preds.append(doc_preds)
            return preds

        # print('BEFORE y_pred', y_pred)
        y_pred = pred2label(y_pred)  # convert y_pred from categorical (two columns, 1 for each label) to a single value label
        # print('AFTER y_pred', y_pred)

        # ==============================================================================================================
        # Extract keyphrases from the predicted set
        # ==============================================================================================================

        pred_keyphrase_list = []  # save all predicted keyphrases
        for doc_index, doc_prediction in enumerate(y_pred):  # iterate through predictions for documents
            document_keyphrases = []  # save the keyphrases of a document
            consecutive_keywords = []  # save consecutive keywords that form a keyphrase
            for word_index, word_prediction in enumerate(doc_prediction):  # iterate through predictions for WORDS of documents
                if word_index >= len(x_test['abstract'][doc_index]):
                    break  # the abstract reached its end (padding adds dummy words that do not exist in the real abstract)
                if word_index:  # check that this is NOT the FIRST WORD of the abstract [to avoid a negative index value]
                    if doc_prediction[word_index - 1]:  # check if the previous word is a keyword
                        if word_prediction:  # check if the current word is a keyword
                            # print(x_test['abstract'][doc_index])
                            # print(x_test['abstract'][doc_index][word_index])
                            consecutive_keywords.append(x_test['abstract'][doc_index][word_index])
                    else:
                        if len(consecutive_keywords):  # save the keyword list if it exists (not an empty list)
                            document_keyphrases.append(consecutive_keywords)
                            consecutive_keywords = []  # re-initialize (empty) list
                        if word_prediction:  # check if the current word is a keyword
                            consecutive_keywords.append(x_test['abstract'][doc_index][word_index])
                else:  # save the FIRST WORD of the abstract if it is a keyword
                    if word_prediction:  # check if the current word is a keyword
                        # print('HEREEEE', doc_index, word_index)
                        # print(x_test['abstract'][doc_index])
                        consecutive_keywords.append(x_test['abstract'][doc_index][word_index])
            if len(consecutive_keywords):  # save the keywords that occur at the END of the abstract, if any (not an empty list)
                document_keyphrases.append(consecutive_keywords)
            pred_keyphrase_list.append(document_keyphrases)
    else:  # evaluate the unsupervised methods that use .xml files
        # tokenize the text
        x_test['abstract'] = x_test['abstract'].apply(lambda row: row.split())
        print(x_test)

        # define pred_keyphrase_list - contains words
        pred_keyphrase_list = y_pred

        # define y_test if the full-text is split into paragraphs/sentences
        if 'SENTENC' in x_filename or 'SENTEC' in x_filename or 'PARAGRAPH' in x_filename:
            assembl_docs = paragraph_assemble_docs
        # print(pred_keyphrase_list)
        # print(y_test)

    # FIND IF ANY KEYPHRASES EXIST IN THE PREDICTION SET
    here = [1 if any(doc) else 0 for doc in y_pred]
    print('\ny_pred', np.array(y_pred, dtype=object).shape)
    if any(here):
        print('THERE ARE KEYPHRASES')
    else:
        print('THERE ARE NOOOOOOT KEYPHRASES')

    # ==================================================================================================================
    # Calculate metrics
    # ==================================================================================================================
    def calculate_metrics(y_test_set, pred_keyphrase_list_set, eval_method):
        """
        Calculate and print metrics
        :param y_test_set: GOLD set
        :param pred_keyphrase_list_set: PREDICTION set
        :param eval_method: the name of the evaluation method (exact/partial match)
        :return: -
        """
        TP = 0  # True Positive
        FP = 0  # False Positive
        FN = 0  # False Negative
        for index_pred, doc_pred in enumerate(pred_keyphrase_list_set):
            for key_test in y_test_set[index_pred]:
                # if any(key_test not in keyp for keyp in doc_pred):
                if key_test not in doc_pred:  # FN: keyphrases that exist in GOLD but not in PREDICTED
                    FN += 1
            if len(doc_pred):  # continue only if the prediction list is NOT empty | if it is empty -> skip checking
                for key_pred in doc_pred:
                    # if any(key_pred in keyp for keyp in y_test_set[index_pred]):
                    if key_pred in y_test_set[index_pred]:  # TP: keyphrases that exist both in PREDICTED and GOLD
                        TP += 1
                    else:  # FP: keyphrases that exist in PREDICTED but not in GOLD (if key_pred not in y_test_set)
                        FP += 1

        precision = 0
        recall = 0
        f1_score = 0
        # print(TP, FN, FP)
        # print('precision=', TP / (TP + FP), 'recall=', TP / (TP + FN))
        if not (TP == FP == 0):
            precision = TP / (TP + FP)
        if not (TP == FN == 0):
            recall = TP / (TP + FN)
        if not (precision == recall == 0):
            f1_score = 2 * (precision * recall) / (precision + recall)
        print('\n' + eval_method)
        print('Precision: %.4f' % precision)
        print('Recall: %.4f' % recall)
        print('F1-score: %.4f\n' % f1_score)

    # ==================================================================================================================
    # Calculate NEW metrics (semi-exact matching)
    # ==================================================================================================================
    def calculate_semi_exact_match_metrics(y_test_set, pred_keyphrase_list_set, eval_method):
        """
        Calculate and print metrics
        :param y_test_set: GOLD set
        :param pred_keyphrase_list_set: PREDICTION set
        :param eval_method: the name of the evaluation method (exact/partial match)
        :return: -
        """
        # each entry represents a keyphrase: 1 means it has a match in the other set, while 0 means it does not
        pred_list = []  # 0/1 for predicted keyphrases depending on whether a predicted keyphrase matches a gold one
        gold_list = []  # 0/1 for gold keyphrases depending on whether a gold keyphrase matches a predicted one
        for index_pred, doc_pred in enumerate(pred_keyphrase_list_set):
            pred_kps = [0] * len(doc_pred)  # initialize the list with 0s and length equal to the total predicted keyphrases
            gold_kps = [0.0] * len(y_test_set[index_pred])  # initialize the list with 0s and length equal to the total gold keyphrases

            if doc_pred:  # if the predicted keyphrase set is not empty (the empty case is handled by the initialization of pred_kps and gold_kps)
                # find if the gold keyphrases exist in the predicted set, and if so mark which gold and predicted keyphrases have a match
                for gold_kp_index, gold_keyphr in enumerate(y_test_set[index_pred]):
                    gold_keyphrase_tokens = gold_keyphr.split()
                    # print('gold: ', gold_keyphrase_tokens)
                    # print('pred: ', doc_pred)
                    avg_coverage_ratio_list = []
                    gold_coverage_ratio_list = []
                    for pred_kp in doc_pred:
                        kw_coverage = 0  # gold keyword coverage of a predicted keyphrase
                        for keyword_gold in gold_keyphrase_tokens:
                            if keyword_gold in pred_kp:
                                kw_coverage += 1  # a gold keyword might exist multiple times in a pred keyphrase, but we assume it does not as this happens rarely
                        if len(pred_kp.split()):
                            pred_coverage_ratio = kw_coverage / len(pred_kp.split())  # the ratio of the predicted kp covered
                        else:
                            pred_coverage_ratio = 0
                        if len(gold_keyphrase_tokens):
                            gold_coverage_ratio = kw_coverage / len(gold_keyphrase_tokens)  # the ratio of the gold kp covered
                        else:
                            gold_coverage_ratio = 0
                        avg_coverage_ratio_list.append((gold_coverage_ratio + pred_coverage_ratio) / 2)  # save the average of the gold and predicted coverage ratios
                        gold_coverage_ratio_list.append(gold_coverage_ratio)
                    # print('percent: ', avg_coverage_ratio_list)

                    # find the max average coverage ratio and its position in the list
                    max_index, max_avg_coverage_ratio_list = max(enumerate(avg_coverage_ratio_list), key=itemgetter(1))
                    if max_avg_coverage_ratio_list > 0.5:
                        # set 1 or the average value of keyphrase coverage and ratio for possibly more accurate results
                        # gold_kps[gold_kp_index] = 1  # set 1 the gold kp that matched to a predicted one
                        gold_kps[gold_kp_index] = gold_coverage_ratio_list[max_index]  # max_avg_coverage_ratio_list  # gold_coverage_ratio_list[gold_kp_index]
                        pred_kps[max_index] = 1  # set 1 the predicted kp that was matched with a gold one

            # save the kp predicted/gold matches of each document
            pred_list.extend(pred_kps)
            gold_list.extend(gold_kps)

        FN = gold_list.count(0)  # False Negative: keyphrases that exist in GOLD but not in PREDICTED
        # TP = gold_list.count(1)  # True Positive: keyphrases that exist both in PREDICTED and GOLD
        TP = sum(gold_list)
        FP = pred_list.count(0)  # False Positive: keyphrases that exist in PREDICTED but not in GOLD (if key_pred not in y_test_set)

        precision = 0
        recall = 0
        f1_score = 0
        # print(TP, FN, FP)
        # print('precision=', TP / (TP + FP), 'recall=', TP / (TP + FN))
        if not (TP == FP == 0):
            precision = TP / (TP + FP)
        if not (TP == FN == 0):
            recall = TP / (TP + FN)
        if not (precision == recall == 0):
            f1_score = 2 * (precision * recall) / (precision + recall)
        print('\n' + eval_method)
        print('Precision: %.4f' % precision)
        print('Recall: %.4f' % recall)
        print('F1-score: %.4f\n' % f1_score)

    # ==================================================================================================================
    # Get the SETS of (unique) keyphrases for predicted and gold set
    # ==================================================================================================================

    # assemble the sentences of a document into a whole document again (only for the SENTEC and PARAGRAPH modes)
    print(x_filename)
    if 'SENTENC' in x_filename or 'SENTEC' in x_filename or 'PARAGRAPH' in x_filename:
        print('ENTERED SENTENC & PARAGRAPH MODE')
        y_test_set = []  # set of original/all GOLD keyphrases for each document
        y_test_set_extraction = []  # keep only the GOLD keyphrases that exist in their corresponding document
        pred_keyphrase_list_set = []  # set of PREDICTED keyphrases for each document

        gold_same_document_keyphrases = []  # save the gold keyphrases that are from the same document (only for SENTEC and PARAGRAPH)
        gold_extraction_same_document_keyphrases = []  # save the gold keyphrases that are from the same document - extraction (only for SENTEC and PARAGRAPH)
        pred_same_document_keyphrases = []  # save the pred keyphrases that are from the same document (only for SENTEC and PARAGRAPH)
        for doc_index, doc in enumerate(y_test):  # get the set of GOLD keyphrases for each document
            # y gold set
            gold_document_keyphrases = []  # save the keyphrases of a document as strings (each keyphrase -> string)
            # y gold set - extraction
            gold_document_keyphrases_extraction = []  # save the keyphrases of a document as strings (each keyphrase -> string)
            # y predicted
            pred_document_keyphrases = []  # save the keyphrases of a document as strings (each keyphrase -> string)

            abstract_as_string = ' '.join([Stemmer('porter').stem(word) for word in x_test['abstract'][doc_index]])
            for tokenized_keyphrase in doc:
                keyphrase = ' '.join(tokenized_keyphrase)  # STEMMING is already applied
                gold_document_keyphrases.append(keyphrase.strip())
                if keyphrase.strip() in abstract_as_string:  # keep only keyphrases that exist in the text - keyphrase EXTRACTION
                    gold_document_keyphrases_extraction.append(keyphrase.strip())
            for tokenized_keyphrase in pred_keyphrase_list[doc_index]:
                keyphrase = ''
                for word in tokenized_keyphrase:
                    keyphrase += Stemmer('porter').stem(word) + ' '  # apply STEMMING
                pred_document_keyphrases.append(keyphrase.strip())

            # check if the previous sentence is in the same document (has the same document id) as the current one
            if doc_index == 0:
                # print('we are in the 1st document')
                gold_same_document_keyphrases.extend(gold_document_keyphrases)
                gold_extraction_same_document_keyphrases.extend(gold_document_keyphrases_extraction)
                pred_same_document_keyphrases.extend(pred_document_keyphrases)
            elif assembl_docs[doc_index] == assembl_docs[doc_index - 1]:
                # print('we are in the same document', y_test['assemble_documents_index'][doc_index], '==', y_test['assemble_documents_index'][doc_index - 1])
                gold_same_document_keyphrases.extend(gold_document_keyphrases)
                gold_extraction_same_document_keyphrases.extend(gold_document_keyphrases_extraction)
                pred_same_document_keyphrases.extend(pred_document_keyphrases)
            else:  # different documents
                # print('CHANGED document', y_test['assemble_documents_index'][doc_index], '==', y_test['assemble_documents_index'][doc_index - 1])
                # save keyphrases for the previous document
                y_test_set.append(set(gold_same_document_keyphrases))  # get each keyphrase just once
                y_test_set_extraction.append(set(gold_extraction_same_document_keyphrases))
                pred_keyphrase_list_set.append(set(pred_same_document_keyphrases))  # get each keyphrase just once

                # create the new document keyphrase set
                gold_same_document_keyphrases = gold_document_keyphrases
                gold_extraction_same_document_keyphrases = gold_document_keyphrases_extraction
                pred_same_document_keyphrases = pred_document_keyphrases

            # save the keyphrases for the last document
            if (doc_index + 2) > len(pred_keyphrase_list):  # (+2 because counting starts from 0 and we want the next element as well)
                # save keyphrases for the current document
                y_test_set.append(set(gold_same_document_keyphrases))  # get each keyphrase just once
                y_test_set_extraction.append(set(gold_extraction_same_document_keyphrases))
                pred_keyphrase_list_set.append(set(pred_same_document_keyphrases))  # get each keyphrase just once

        # count all keyphrases and keyphrases existing in text
        keyphrase_counter = 0
        extraction_keyphrase_counter = 0
        for doc_idx, y_test_extraction_doc in enumerate(y_test_set_extraction):
            extraction_keyphrase_counter += len(y_test_extraction_doc)
            keyphrase_counter += len(y_test_set[doc_idx])
        print('existing keyphrases', extraction_keyphrase_counter)
        print('all keyphrases', keyphrase_counter)
    else:  # for the full-text documents
        y_test_set = []  # set of original/all GOLD keyphrases for each document
        y_test_set_extraction = []  # keep only the GOLD keyphrases that exist in their corresponding document
        for doc_index, test_doc in enumerate(y_test):  # get the set of GOLD keyphrases for each document
            extraction_document_keyphrases = []  # save the keyphrases that exist in the text (extraction) of a document as strings (each keyphrase -> string)
            document_keyphrases = []  # save all keyphrases of a document as strings (each keyphrase -> string)

            abstract_as_string = ' '.join([Stemmer('porter').stem(word) for word in x_test['abstract'][doc_index]])
            for tokenized_keyphrase in test_doc:
                keyphrase = ' '.join(tokenized_keyphrase)  # STEMMING is already applied
                document_keyphrases.append(keyphrase.strip())
                if keyphrase.strip() in abstract_as_string:  # keep only keyphrases that exist in the text - keyphrase EXTRACTION
                    extraction_document_keyphrases.append(keyphrase.strip())
            # print(document_keyphrases)
            y_test_set.append(set(document_keyphrases))  # get each keyphrase just once
            y_test_set_extraction.append(set(extraction_document_keyphrases))  # get each keyphrase just once

        # count all keyphrases and keyphrases existing in text
        keyphrase_counter = 0
        extraction_keyphrase_counter = 0
        for doc_idx, y_test_extraction_doc in enumerate(y_test_set_extraction):
            extraction_keyphrase_counter += len(y_test_extraction_doc)
            keyphrase_counter += len(y_test_set[doc_idx])
        print('existing keyphrases', extraction_keyphrase_counter)
        print('all keyphrases', keyphrase_counter)
        pred_keyphrase_list_set = []  # set of PREDICTED keyphrases for each document
        for doc in pred_keyphrase_list:  # get the set of PREDICTED keyphrases for each document
            document_keyphrases = []  # save the keyphrases of a document as strings (each keyphrase -> string)
            for tokenized_keyphrase in doc:
                keyphrase = ''
                for word in tokenized_keyphrase:
                    keyphrase += Stemmer('porter').stem(word) + ' '  # apply STEMMING
                document_keyphrases.append(keyphrase.strip())
            pred_keyphrase_list_set.append(set(document_keyphrases))  # get each keyphrase just once

    # print y_test and y_pred
    # for i in range(len(pred_keyphrase_list_set)):
    for i in range(10):
        print('pred', pred_keyphrase_list_set[i])
        print('test', y_test_set[i])
        print('extraction test', y_test_set_extraction[i])

    # ==================================================================================================================
    # Exact Match - Model Evaluation
    # ==================================================================================================================
    # Exact Match: the keyphrases must be given as whole strings

    # extraction - only GOLD KPs existing in text
    calculate_metrics(y_test_set_extraction, pred_keyphrase_list_set, 'Exact Match - Extraction')
    # all GOLD KPs
    calculate_metrics(y_test_set, pred_keyphrase_list_set, 'Exact Match')

    # ==================================================================================================================
    # NEW METHOD - Semi-Exact Match - Model Evaluation
    # ==================================================================================================================

    # extraction - only GOLD KPs existing in text
    calculate_semi_exact_match_metrics(y_test_set_extraction, pred_keyphrase_list_set, 'Semi-exact Match - Extraction')
    # all GOLD KPs
    calculate_semi_exact_match_metrics(y_test_set, pred_keyphrase_list_set, 'Semi-exact Match')

    # ==================================================================================================================
    # Partial Match - Model Evaluation
    # ==================================================================================================================
    # Partial Match: the keyphrases must be given as a set of words

    # Get the sets of all gold keyphrases
    y_test_set_partial = []
    for doc in y_test_set:  # get the set of GOLD keyphrases for each document
        document_keywords = []
        for keyphrase in doc:
            keyphrase = word_tokenize(keyphrase)
            for word in keyphrase:
                document_keywords.append(word)
        y_test_set_partial.append(set(document_keywords))

    # Get the sets of all gold keyphrases existing in text (extraction)
    y_test_set_partial_extraction = []
    for doc in y_test_set_extraction:  # get the set of GOLD keyphrases for each document
        document_keywords = []
        for keyphrase in doc:
            keyphrase = word_tokenize(keyphrase)
            for word in keyphrase:
                document_keywords.append(word)
        y_test_set_partial_extraction.append(set(document_keywords))

    # Get the sets of all predicted keyphrases
    pred_keyphrase_list_set_partial = []
    for doc in pred_keyphrase_list_set:  # get the set of PREDICTED keyphrases for each document
        document_keywords = []
        for keyphrase in doc:
            keyphrase = word_tokenize(keyphrase)
            for word in keyphrase:
                document_keywords.append(word)
        pred_keyphrase_list_set_partial.append(set(document_keywords))

    # extraction - only GOLD KPs existing in text
    calculate_metrics(y_test_set_partial_extraction, pred_keyphrase_list_set_partial, 'Partial Match - Extraction')
    # all GOLD KPs
    calculate_metrics(y_test_set_partial, pred_keyphrase_list_set_partial, 'Partial Match')
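# A standalone sketch of the exact-match counting performed by calculate_metrics
# above, on illustrative toy sets of already-stemmed keyphrases for a single
# document (the values are made up for the example).
toy_gold = [{'neural network', 'keyphras extract'}]
toy_pred = [{'neural network', 'deep learn'}]
TP = sum(len(p & g) for p, g in zip(toy_pred, toy_gold))  # predicted and gold
FP = sum(len(p - g) for p, g in zip(toy_pred, toy_gold))  # predicted only
FN = sum(len(g - p) for p, g in zip(toy_pred, toy_gold))  # gold only
precision = TP / (TP + FP) if TP + FP else 0
recall = TP / (TP + FN) if TP + FN else 0
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
print(precision, recall, f1)  # 0.5 0.5 0.5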
def semeval_summarized_statistics():
    # path to the summarized SemEval-2010 data (read as CSV below)
    file = '..\\data\\benchmark_data\\summarization_experiment\\SemEval-2010_summarized.csv'  # TEST data to evaluate the final model

    # ==================================================================================================================
    # Read data
    # ==================================================================================================================

    data = pd.read_csv(file, encoding="utf8")
    print(data)

    # ==================================================================================================================
    # Split the list of keyphrases from the string that contains all the keyphrases
    # ==================================================================================================================

    for index, keywords in enumerate(data['keyword']):
        data['keyword'].iat[index] = keywords.split(';')  # split keywords to separate them from one another

    # ==================================================================================================================
    # Isolate the title, abstract and the main body (+ remove section identifiers and '\n')
    # ==================================================================================================================

    # tokenize key-phrases and keep them categorized by document
    for index, abstract in enumerate(data['abstract']):
        title_summary = data['title'][index] + ' ' + abstract  # combine title + abstract + main body
        # remove '\n'
        title_summary = title_summary.replace('\n', ' ')
        data['abstract'].iat[index] = title_summary

    # ==================================================================================================================
    # Remove Contractions (pre-processing)
    # ==================================================================================================================

    # substitute contractions with full words
    data['abstract'] = data['abstract'].apply(replace_contractions)
    data['keyword'] = data['keyword'].apply(
        lambda set_of_keyphrases: [replace_contractions(keyphrase) for keyphrase in set_of_keyphrases])

    # ==================================================================================================================
    # Remove punctuation (with whitespace) + digits (from ABSTRACT) + clean empty strings
    # ==================================================================================================================

    # remove parenthesis, brackets and their contents
    data['abstract'] = data['abstract'].apply(remove_brackets_and_contents)

    # remove references of publications (in document text)
    data['abstract'] = data['abstract'].apply(remove_references)

    # remove punctuation
    data['abstract'] = data['abstract'].apply(remove_punct_and_non_ascii)
    data['keyword'] = data['keyword'].apply(keyword_remove_punct_and_non_ascii)

    # Replace the pure digit terms with DIGIT_REPL
    data['abstract'] = data['abstract'].apply(lambda text: " ".join([
        token if not re.match('^\d+$', token) else 'DIGIT_REPL'
        for token in text.split()
    ]))  # remove spaces
    print('convert digits - abstract finish')

    # remove rows with empty and one-word abstracts/sentences
    data = data[data['abstract'].str.strip().astype(bool)]
    data.reset_index(drop=True, inplace=True)

    # remove empty keyphrases
    data['keyword'] = data['keyword'].apply(
        lambda set_of_keyws: [key_text for key_text in set_of_keyws if key_text.strip()])
    # remove rows with empty keyphrases
    data = data[data['keyword'].map(len) > 0]

    # ==================================================================================================================
    # Tokenize each sentence + remove digits (from KEYPHRASES)
    # ==================================================================================================================

    # tokenize text
    data['abstract'] = data['abstract'].apply(tokenize_lowercase)
    print('tokenization - abstract finish')

    # stem, tokenize and lower case keyphrases and keep them categorized by document
    for index, list_of_keyphrases in enumerate(data['keyword']):
        keyphrases_list = []
        for keyphrase in list_of_keyphrases:  # get words of all keyphrases in a single list
            # keyphrase = keyphrase.translate(remove_digits).strip()  # remove digits
            keyphrase = keyphrase.strip()  # remove whitespaces
            if len(keyphrase):  # skip empty keyphrases
                tokens = word_tokenize(keyphrase)  # tokenize

                # Replace the pure digit terms with DIGIT_REPL
                tokens = [
                    tok if not re.match('^\d+$', tok) else 'DIGIT_REPL'
                    for tok in tokens
                ]
                # Replace the combination of characters and digits with WORD_DIGIT_REPL
                # tokens = [tok if not re.match('.*\d+', tok) else 'WORD_DIGIT_REPL' for tok in tokens]

                tokens = [
                    Stemmer('porter').stem(keyword.lower()) for keyword in tokens
                ]  # stem + lower case
                tokens = ' '.join(tokens)
                keyphrases_list.append(tokens)
        data['keyword'].iat[index] = keyphrases_list

    # ==================================================================================================================
    # Count logistics
    # ==================================================================================================================

    semeval_keywords_in_summary = 0  # the count of keywords in abstract
    semeval_total_keywords = 0  # the count of all keywords
    for index, keywords in enumerate(data['keyword']):
        semeval_total_keywords += len(keywords)
        # print('total_keywords', len(test))
        # print('total_keywords', test)
        for keyword in keywords:
            # check if keyword exists in the abstract
            if keyword in data['abstract'][index]:
                semeval_keywords_in_summary += 1
                # print(keyword)
                # print(data['abstract'][index])

    print('SemEval summarized: ', semeval_keywords_in_summary)
    print('SemEval summarized - total keyphrases: ', semeval_total_keywords)
    print('SemEval summarized - count of keywords in abstract: ',
          semeval_keywords_in_summary / semeval_total_keywords)

    return semeval_keywords_in_summary
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import sys
import os
import glob
import json

from nltk.stem.snowball import SnowballStemmer as Stemmer

references = {}

for input_file in glob.glob(sys.argv[1] + '/*.key'):
    file_id = input_file.split('/')[-1].split('.')[0]
    with open(input_file, 'r') as f:
        lines = f.readlines()
        keyphrases = []
        for line in lines:
            words = line.strip().split()
            stems = [Stemmer('porter').stem(w.lower()) for w in words]
            keyphrases.append([' '.join(stems)])
            # keyphrases.append([' '.join([w.lower() for w in words])])
        references[file_id] = keyphrases

with open(sys.argv[2], 'w') as o:
    json.dump(references, o, sort_keys=True, indent=4)
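# Example invocation of the script above (the script and directory names are
# illustrative, not part of the original source):
#     python make_references.py path/to/gold_keys references.json
# Each .key file is expected to hold one keyphrase per line; the output JSON
# maps each file id to a list of single-element lists of stemmed keyphrases,
# e.g. {"doc01": [["neural network"], ["keyphras extract"]]}.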
for index, list_of_keyphrases in enumerate(data['keyword']):
    keyphrases_list = []
    for keyphrase in list_of_keyphrases:  # get words of all keyphrases in a single list
        # keyphrase = keyphrase.translate(remove_digits).strip()  # remove digits
        keyphrase = keyphrase.strip()  # remove whitespaces
        if len(keyphrase):  # skip empty keyphrases
            tokens = word_tokenize(keyphrase)  # tokenize

            # Replace the pure digit terms with DIGIT_REPL
            tokens = [
                tok if not re.match('^\d+$', tok) else 'DIGIT_REPL'
                for tok in tokens
            ]
            # Replace the combination of characters and digits with WORD_DIGIT_REPL
            # tokens = [tok if not re.match('.*\d+', tok) else 'WORD_DIGIT_REPL' for tok in tokens]

            keyphrases_list.append([
                Stemmer('porter').stem(keyword.lower()) for keyword in tokens
            ])  # stem + lower case
    data['keyword'].iat[index] = keyphrases_list
    # print('THESE ARE THE KEYPHRASE LIST', len(keyphrases_list), keyphrases_list)


# ======================================================================================================================
# Write pre-processed keyphrases to csv file
# ======================================================================================================================

data['abstract'].to_csv(x_text_filename, index=False)  # save the preprocessed document text
data['keyword'].to_csv(y_text_filename, index=False)  # save the preprocessed keyphrases


# ======================================================================================================================
# Give labels to each word of Abstract (fulltext) - keyword (KP) or Non-keyword (Non-KP)
# ======================================================================================================================
doc_id = file_id.split('/')[-1][:-5]
# print('Loading {}'.format(doc_id))
with open(file_id, 'r') as f:
    lines = f.readlines()
    tags[doc_id].update([l.lower().strip() for l in lines])

references = {}
for doc_id in tags:

    # group tags by stem
    stem_to_tag = collections.defaultdict(list)
    for tag in tags[doc_id]:
        stem = [Stemmer('porter').stem(w) for w in tag.split()]
        for _ in range(tags[doc_id][tag]):
            stem_to_tag[' '.join(stem)].append(tag)

    valid_tags = []
    for tag in stem_to_tag:
        if len(stem_to_tag[tag]) > 1:
            valid_tags.append(tag)

    if len(valid_tags):
        if sys.argv[3] == 'stem':
            references[doc_id] = [[t] for t in valid_tags]
        else:
            references[doc_id] = [
                list(set(stem_to_tag[t])) for t in valid_tags
            ]
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import sys
import os
import glob
import codecs
import json

from nltk.stem.snowball import SnowballStemmer as Stemmer

references = {}

for input_file in glob.glob(sys.argv[1] + '/*.key'):
    file_id = '.'.join(input_file.split('/')[-1].split('.')[0:-1])
    print(file_id)
    with codecs.open(input_file, 'r', 'iso-8859-1') as f:
        lines = f.readlines()
        keyphrases = []
        for line in lines:
            words = line.strip().split()
            stems = [Stemmer('portuguese').stem(w.lower()) for w in words]
            if sys.argv[3] == "stem":
                keyphrases.append([' '.join(stems)])
            else:
                keyphrases.append([' '.join([w.lower() for w in words])])
        references[file_id] = keyphrases

with open(sys.argv[2], 'w') as o:
    json.dump(references, o, sort_keys=True, indent=4)
# assumes `import numpy as np` earlier in the file; note that threshold='nan'
# only works on older NumPy versions (recent versions need a numeric value
# such as sys.maxsize)
np.set_printoptions(threshold='nan')

import argparse
import nltk
import math
import re
import random
import scipy
import pickle
import sys

from collections import Counter
from nltk.stem.snowball import SnowballStemmer as Stemmer
from scipy.special import expit
# from stemming.porter2 import stem

stemmer = Stemmer("english")

dictionary_counts = dict()
dictionary_indices = dict()
total_documents = 0

regex = re.compile('[^a-zA-Z ]')


def sigmoid(x):
    return expit(x)


def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))
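# Quick sanity check of the derivative identity above: sigmoid(0) is 0.5,
# so sigmoid_derivative(0) is 0.5 * (1 - 0.5) = 0.25.
print(sigmoid(0.0), sigmoid_derivative(0.0))  # 0.5 0.25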
references = {}
stemmed_references = {}

with open(sys.argv[1], 'r') as f:
    for file_number, line in enumerate(f.readlines()):
        document = json.loads(line)
        file_id = '{0:05d}'.format(file_number)
        output_file = sys.argv[2] + '/{}.txt'.format(file_id)
        logging.info("writing file {}".format(output_file))
        with codecs.open(output_file, 'w', 'utf-8') as o:
            o.write(document['title'] + "\n\n")
            o.write(document['abstract'])

        references[file_id] = []
        stemmed_references[file_id] = []
        keyphrases = document['keyword'].split(';')
        for keyphrase in keyphrases:
            words = keyphrase.lower().strip().split()
            stems = [Stemmer('porter').stem(w) for w in words]
            references[file_id].append([' '.join(words)])
            stemmed_references[file_id].append([' '.join(stems)])

with open(sys.argv[3], 'w') as o:
    json.dump(references, o, sort_keys=True, indent=4)

with open(sys.argv[4], 'w') as o:
    json.dump(stemmed_references, o, sort_keys=True, indent=4)
print(data)

# ======================================================================================================================
# Format keyphrases and retrieve document text
# ======================================================================================================================

list_of_document_title = []  # save the title of documents
list_of_document_abstract = []  # save the abstract of documents
list_of_document_text = []  # save the body of documents
# gold_keyphrases = []  # save the gold keyphrases of documents (assumed to be initialized earlier in the script)
pred_keyphrases = []  # save the predicted keyphrases of documents
for indx, abstract_document in enumerate(data['abstract']):
    # print('train_test_combined/' + key + '.xml')
    # print(keyphrases_dictionary[key])
    gold_keyphrases.append(
        [[Stemmer('porter').stem(keyword) for keyword in keyphrase.split()]
         for keyphrase in data['keyword'][indx].split(';')])  # split gold keywords to separate them from one another

    # ==================================================================================================================
    # MultipartiteRank Extractor
    # ==================================================================================================================

    # 1. create a MultipartiteRank extractor.
    extractor = pke.unsupervised.MultipartiteRank()

    # 2. load the content of the document.
    extractor.load_document(input=abstract_document, normalization="stemming")

    # 3. select the longest sequences of nouns and adjectives, that do
    # not contain punctuation marks or stopwords as candidates.