# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import print_function
import six
__author__ = 'a_medelyan'
import rake
import operator
import pandas as pd
import re
from bs4 import BeautifulSoup
import io  # used by the commented-out sample_file example below
# EXAMPLE ONE - SIMPLE
stoppath = "SmartStoplist.txt"
# 1. initialize RAKE by providing a path to a stopwords file, plus thresholds:
# each word must have at least 5 characters, each phrase at most 3 words,
# and each keyword must appear in the text at least 4 times
rake_object = rake.Rake(stoppath, 5, 3, 4)
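# (rake_object.run(text) returns a list of (phrase, score) tuples sorted by
# descending score; note that with these thresholds a short text often yields
# nothing, because every keyword must occur at least 4 times.)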
# # 2. run RAKE on a given text
# sample_file = io.open("data/docs/fao_test/w2167e.txt", 'r',encoding="iso-8859-1")
# text = sample_file.read()
#
# keywords = rake_object.run(text)
#
# # 3. print results
# print("Keywords:", keywords)
#
# print("----------")
# # EXAMPLE TWO - BEHIND THE SCENES (from https://github.com/aneesha/RAKE/rake.py)
#
# # 1. initialize RAKE by providing a path to a stopwords file
# rake_object = rake.Rake(stoppath)
# text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility " \
# "of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. " \
# "Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating"\
# " sets of solutions for all types of systems are given. These criteria and the corresponding algorithms " \
# "for constructing a minimal supporting set of solutions can be used in solving all the considered types of " \
# "systems and systems of mixed types."
def clean_str(string):
"""
Tokenization/string cleaning for dataset
Every dataset is lower cased except
"""
# string = re.sub("['`,!\"\'\\.,;?\t\n]", "", string)
string = re.sub("[?\t\n]", "", string)
# string = re.sub("[\t\n]", "", string)
sss = string.split()
# for jj in sss:
# if str != ' ':
# string = jj + ' '
string = ' '.join(str(jj) for jj in sss)
# string = string[0:len(string)-1]
return string
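# Illustrative examples (worked out from the regex above):
#   clean_str("What is AI?") -> "What is AI"
#   clean_str("a\tb")        -> "ab"  (tabs/newlines are deleted outright rather
#                                      than replaced by spaces, so they can glue tokens)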
def clean_word(string):
"""
Tokenization/string cleaning for dataset
Every dataset is lower cased except
"""
string = re.sub("^['\.`,/:()\-\[\]!\"\s\\;?\t\n]", "", string)
string = re.sub("['\.`,/:()\-\[\]!\"\s\\;?\t\n]$", "", string)
# string = string[0:len(string)-1]
return string
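# Illustrative examples: clean_word("(hello,") -> "hello". Note that only ONE
# character is stripped from each end, so clean_word("--word--") -> "-word-".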
def keyphrase_extract(text):
stoppath = "SmartStoplist.txt"
# generate candidate keywords
    sentenceList = [text]
stopwordpattern = rake.build_stop_word_regex(stoppath)
phraseList = rake.generate_candidate_keywords(sentenceList, stopwordpattern)
# print("Phrases:", phraseList)
# calculate individual word scores
wordscores = rake.calculate_word_scores(phraseList)
# generate candidate keyword scores
keywordcandidates = rake.generate_candidate_keyword_scores(phraseList, wordscores)
# for candidate in keywordcandidates.keys():
# print("Candidate: ", candidate, ", score: ", keywordcandidates.get(candidate))
# sort candidates by score to determine top-scoring keywords
sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True)
return sortedKeywords
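# Illustrative example (assuming "for" is in SmartStoplist.txt, so it splits the
# two candidate phrases, and scores are the usual degree/frequency sums):
#   keyphrase_extract("deep learning for natural language processing")
#   -> [('natural language processing', 9.0), ('deep learning', 4.0)]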
# for example, you could just take the top third as the final keywords
# if len(sortedKeywords) > 1:
# for keyword in sortedKeywords[0:3]:
# print("Keyword: ", keyword[0], ", score: ", keyword[1])
if __name__ == '__main__':
# data = pd.read_csv('F://workspace_mpk0//dataset//quora_duplicate_questions.tsv', sep='\t')
# f0=file('F://workspace_mpk0//dataset//quora_duplicate_questions_keyphrase2.tsv',"a+")
# # f0.writelines('id\tqid1\tqid2\tquestion1\tquestion2\tkeyphrase1\tkeyphrase2\tis_duplicate\n')
    data = pd.read_csv('F://workspace_mpk0//dataset//Quora_question_pair_partition//test.tsv', sep='\t')
    f0 = open('F://workspace_mpk0//dataset//Quora_question_pair_partition//test_keyphrase_v3.tsv', "a+")
    f0.writelines('is_duplicate\tquestion1\tquestion2\tkeyphrase1\tkeyphrase2\tentity1\tentity2\tid\n')
count = 0
max_keynum1 = 0
max_keynum2 = 0
num1 = 0
num2 = 0
    # load Freebase entity names; a set gives O(1) membership tests in the loop below
    entity = set()
    with open('F://workspace_mpk0//dataset//Quora_question_pair_partition//freebase_code_names.txt', 'r') as f:
        for line in f:
            # each line is assumed to hold a '/'-delimited Freebase id ending in
            # the entity name before the first '>'
            entity.add(line.split('>')[0].split('/')[6])
for idx in range(data.id.shape[0]):
        res = ''
        # strip HTML markup, drop non-ASCII characters, then normalise whitespace
        text10 = BeautifulSoup(data.question1[idx], "lxml")
        text1 = clean_str(text10.get_text().encode('ascii', 'ignore').decode('ascii'))
        text20 = BeautifulSoup(data.question2[idx], "lxml")
        text2 = clean_str(text20.get_text().encode('ascii', 'ignore').decode('ascii'))
if text1 and text2:
t1 = text1.split('\t')
t2 = text2.split('\t')
entity1 = []
entity2 = []
temp1 = text1.split(' ')
temp2 = text2.split(' ')
for t in temp1:
if t in entity:
entity1.append(t)
for t in temp2:
if t in entity:
entity2.append(t)
if len(t1) == 1 and len(t2) == 1 and len(entity1) > 0 and len(entity2) > 0:
# if len(t1) == 1 and len(t2) == 1:
                # extract and score keyphrases for both questions
sortedKeywords1 = keyphrase_extract(text1)
sortedKeywords2 = keyphrase_extract(text2)
key1 = ''
key2 = ''
if len(sortedKeywords1) > 0 and len(sortedKeywords2) > 0:
count += 1
num1 += len(sortedKeywords1)
num2 += len(sortedKeywords2)
if max_keynum1 < len(sortedKeywords1): max_keynum1 = len(sortedKeywords1)
if max_keynum2 < len(sortedKeywords2): max_keynum2 = len(sortedKeywords2)
                    # to keep only the top-scoring phrases, slice e.g. sortedKeywords1[0:3]
                    for keyword in sortedKeywords1:
                        # join each phrase's words with underscores, e.g. "natural_language"
                        words = [clean_word(str(w)) for w in keyword[0].split()]
                        key1 += '_'.join(w for w in words if w) + ' '
                    key1 = key1[:-1]
                    print(key1)
                    # same underscore-joining for the second question's phrases
                    for keyword in sortedKeywords2:
                        words = [clean_word(str(w)) for w in keyword[0].split()]
                        key2 += '_'.join(w for w in words if w) + ' '
                    key2 = key2[:-1]
                    print(key2)
                    Entity1 = '_'.join(entity1)
                    Entity2 = '_'.join(entity2)
                    res = (str(int(data.is_duplicate[idx])) + '\t' + data.question1[idx] + '\t' +
                           data.question2[idx] + '\t' + key1 + '\t' + key2 + '\t' +
                           Entity1 + '\t' + Entity2 + '\t' + str(data.id[idx]) + '\n')
                    print(count)
                    f0.write(res)
f0.close()
    print(max_keynum1, max_keynum2)
    if count > 0:
        # average number of keyphrases per retained question pair
        print(num1 / float(count), num2 / float(count))