getPage.py
from bs4 import BeautifulSoup
import math, random, time
import re, os
import requests
from static_parameters import my_parameters
import json
from pro import *

# Journal impact-factor cache: if a journal's info has already been stored,
# its impact factors are reused directly instead of being fetched again.
try:
    with open('Journal_Point.json', 'r') as f:
        Journal_Point = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    Journal_Point = {}
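# Expected shape of Journal_Point.json — a sketch inferred from how the cache is
# written further down (in get_artical_detail / get_Detail_Info); the journal code
# "ZGDJ" used as the key here is a made-up example:
# {"ZGDJ": ["1.234", "0.987"]}   # journal code (pykm) -> [复合影响因子, 综合影响因子]
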
GET_ARTICAL_DETAIL_URL = 'https://kns.cnki.net/KCMS/detail/detail.aspx?'
class GetPageInfo(object):
    def __init__(self):
        self.all_info = []
        self.count = 0  # recursion counter used to cap retries
    def get_Impact_Factor(self, href):
        '''Fetch the journal page by following its redirect link and extract impact factors.'''
        bad_response = '<script>window.location.href=\'//www.cnki.net\'</script>'
        # time.sleep(0.3)
        source_url = 'https://kns.cnki.net' + href
        source = requests.get(source_url, stream=True)
        source_text = source.text
        print(source_text == bad_response)
        if source_text == bad_response:
            # The server bounced us back to the home page; retry the request.
            return self.get_Impact_Factor(href)
        pattern_get_fuhe = re.compile(r'复合影响因子:.*?(\d\.\d+).*?')
        pattern_get_zonghe = re.compile(r'综合影响因子:.*?(\d\.\d+).*?')
        # If the journal lists impact factors, extract them; otherwise return empty
        # strings. Both return values are str.
        try:
            fuhe_impact_factor = re.search(pattern_get_fuhe, source_text).group(1)
            zonghe_impact_factor = re.search(pattern_get_zonghe, source_text).group(1)
            return str(fuhe_impact_factor), str(zonghe_impact_factor)
        except AttributeError:
            return '', ''
    def get_Impact_Factor2(self, parameters):
        '''Having obtained the target page's key parameters, request that page directly.'''
        self.count += 1
        if self.count > 20:
            self.count = 0
            print('无法请求到相应页面,请稍后或者更换网络再试。')
            return
        bad_response = '<script>window.location.href=\'//www.cnki.net\'</script>'
        # time.sleep(0.3)
        source_url = 'http://navi.cnki.net/KNavi/JournalDetail?'
        source = requests.get(source_url, headers=my_parameters.headers_navi, params=parameters)
        source_text = source.text
        if source_text == bad_response:
            return self.get_Impact_Factor2(parameters)
        pattern_get_fuhe = re.compile(r'复合影响因子:.*?(\d\.\d+).*?')
        pattern_get_zonghe = re.compile(r'综合影响因子:.*?(\d\.\d+).*?')
        # If the journal lists impact factors, extract them; otherwise return empty
        # strings. Both return values are str.
        try:
            fuhe_impact_factor = re.search(pattern_get_fuhe, source_text).group(1)
            zonghe_impact_factor = re.search(pattern_get_zonghe, source_text).group(1)
            self.count = 0
            return str(fuhe_impact_factor), str(zonghe_impact_factor)
        except AttributeError:
            self.count = 0
            return '', ''
    def find_ajax(self, url, parameters):
        # time.sleep(0.1)
        # Fetch the asynchronously-loaded list for the given url and parameters.
        # Returns a flat list alternating between each entry's text and its href.
        info = []
        try:
            ajax_list_page = requests.get(url, params=parameters, headers=my_parameters.headers_kns)
            soup = BeautifulSoup(ajax_list_page.text, 'lxml')
            ajax_list = soup.find('div', attrs={'class': 'ebBd'}).find_all('li')
            for item in ajax_list:
                href = 'https://kns.cnki.net' + item.find('a')['href']
                text = item.text.strip(' \n\r')
                text = re.sub(r'(  )', '', text)
                text = re.sub(r'[\n\r\s]+', '', text)
                info.append(text)
                info.append(href)
        except Exception:
            pass
        return info
    def get_artical_detail(self, artical_href, dict_artcical):
        # time.sleep(0.1)
        # The article-detail URL is built from three key parameters found in the
        # article's href.
        parameters = {
            'DbCode': '',
            'DbName': '',
            'FileName': '',
        }
        pattern_DbCode = re.compile(r'.*?[dD]b[cC]ode=\s?(.*?)&')
        pattern_DbName = re.compile(r'.*?[dD]b[nN]ame=\s?(.*?)&')
        pattern_FileName = re.compile(r'.*?[fF]ile[nN]ame=\s?(.*?)&')
        parameters['DbCode'] = re.search(pattern_DbCode, artical_href).group(1)
        parameters['DbName'] = re.search(pattern_DbName, artical_href).group(1)
        parameters['FileName'] = re.search(pattern_FileName, artical_href).group(1)
        print('FileName=' + parameters['FileName'])
        req = requests.get(GET_ARTICAL_DETAIL_URL, params=parameters, headers=my_parameters.headers_kns)
        # After fetching the article-detail page, extract the article's keywords.
        soup = BeautifulSoup(req.text, 'lxml')
        keyword = []
        try:
            keyword_list = soup.find('label', attrs={'id': 'catalog_KEYWORD'}).parent.find_all('a')
            for item in keyword_list:
                keyword.append(item.text.strip(';\r\n\t '))
        except AttributeError:
            pass
        # Store the extracted keywords as a list in the article's info dict.
        dict_artcical['关键词'] = keyword
        # Extract the abstract.
        try:
            summary = soup.find('span', attrs={'id': 'ChDivSummary'}).text
        except AttributeError:
            summary = "kong"  # placeholder value used when no abstract is found
        dict_artcical['摘要'] = summary
        # Fetch the "similar articles" list.
        parameters.update({
            'curdbcode': 'CJFQ',
            'reftype': '604',
            'catalogId': 'lcatalog_func604',
            'catalogName': '相似文献',
        })
        ajax_url = 'https://kns.cnki.net/kcms/detail/frame/asynlist.aspx?'
        dict_artcical['相似文献'] = self.find_ajax(ajax_url, parameters)
        # Fetch the "reader recommendations" list.
        parameters.update({
            'curdbcode': 'CJFQ',
            'reftype': '605',
            'catalogId': 'lcatalog_func605',
            'catalogName': '读者推荐',
        })
        dict_artcical['读者推荐'] = self.find_ajax(ajax_url, parameters)
        # Get the composite and comprehensive impact factors of the source journal.
        parameters_fators = {
            'pcode': '',
            'pykm': '',
        }
        infomation = soup.select('.sourinfo .title a')
        pattern = re.compile(r'.*?\(\'(.*?)\',\'(.*?)\',\'(.*?)\',\'(.*?)\'\);')
        parameters_fators['pcode'] = pattern.search(str(infomation)).group(2)
        parameters_fators['pykm'] = pattern.search(str(infomation)).group(4)
        if parameters_fators['pykm'] in Journal_Point.keys():
            # Cache hit: reuse the stored impact factors.
            dict_artcical['复合影响因子'], dict_artcical['综合影响因子'] = Journal_Point[parameters_fators['pykm']][0:2]
        else:
            try:
                dict_artcical['复合影响因子'], dict_artcical['综合影响因子'] = self.get_Impact_Factor2(parameters_fators)
            except Exception:
                dict_artcical['复合影响因子'], dict_artcical['综合影响因子'] = 0, 0
                print("没找到")
        # Record the journal's impact factors in the cache, keyed by its journal code.
        if (dict_artcical['复合影响因子'], dict_artcical['综合影响因子']) != (0, 0):
            Journal_Point[parameters_fators['pykm']] = [dict_artcical['复合影响因子'], dict_artcical['综合影响因子']]
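
    # Illustrative shape of one filled-in article dict (the values below are
    # made-up examples, not real data; the keys come from get_Detail_Info and
    # from the assignments above):
    # {'题名': '…', '作者': '…', '来源': '…', '发表时间': '2020-01-01',
    #  '被引量': '12', '下载量': '345', '复合影响因子': '1.234', '综合影响因子': '0.987',
    #  '关键词': ['…'], '摘要': '…', '相似文献': [text, href, …], '读者推荐': [text, href, …]}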
    def get_Detail_Info(self, text, session):
        # time.sleep(0.1)
        '''
        Extract each article's information from a result-list page.
        '''
        soup = BeautifulSoup(text, 'lxml')
        # Collect the info of every article on this page into one list.
        dict_pre_page = []
        artical_list = soup.find('table', attrs={'class': 'GridTableContent'}).find_all('tr')
        for i in range(1, len(artical_list)):  # artical_list[0] is the table header
            print('正在获取第' + str(i) + '条信息,', end='')
            dict_artcical = {'题名': '',
                             '作者': '',
                             '来源': '',
                             '发表时间': '',
                             '被引量': '',
                             '下载量': '',
                             '复合影响因子': '',
                             '综合影响因子': '',
                             }
            # Get the title and the link to the article-detail page
            dict_artcical['题名'] = artical_list[i].find('a', attrs={'class': 'fz14'}).text
            artical_href = artical_list[i].find('a', attrs={'class': 'fz14'})['href']
            self.get_artical_detail(artical_href, dict_artcical)
            # Get the authors (strip removes stray whitespace at both ends)
            dict_artcical['作者'] = artical_list[i].find('td', attrs={'class': 'author_flag'}).text.strip('\n')
            # Get the source journal
            dict_artcical['来源'] = artical_list[i].find_all('a', attrs={'target': '_blank'})[1].text
            # Alternative way to get the composite/comprehensive impact factors.
            # NOTE: this approach does not use the cache.
            # href = artical_list[i].find_all('a', attrs={'target': '_blank'})[1]['href']
            # dict_artcical['复合影响因子'], dict_artcical['综合影响因子'] = self.get_Impact_Factor(href)
            # Get the publication date
            dict_artcical['发表时间'] = artical_list[i].find('td', attrs={'align': 'center'}).text.strip('\r\n ')
            # Get the citation count
            try:
                dict_artcical['被引量'] = artical_list[i].find('td', attrs={'align': 'right'}).text.strip('\n ')
            except AttributeError:
                dict_artcical['被引量'] = '0'
            # Get the download count
            try:
                dict_artcical['下载量'] = artical_list[i].find('span', attrs={'class': 'downloadCount'}).text
            except AttributeError:
                dict_artcical['下载量'] = '0'
            dict_pre_page.append(dict_artcical)
        # Persist the updated impact-factor cache.
        with open('Journal_Point.json', 'w') as f:
            json.dump(Journal_Point, f)
        # Return the list of article dicts for this page.
        return dict_pre_page
get_page_info = GetPageInfo()
if __name__ == '__main__':
    pass
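
# Minimal usage sketch. Assumptions: the search/request flow that produces the
# result-list HTML lives outside this module (e.g. in `pro`), and the names
# `session` and `list_page_html` below are hypothetical placeholders, not part
# of this file:
#
#     session = requests.Session()
#     list_page_html = ...  # HTML of one CNKI result-list page, fetched elsewhere
#     articles = get_page_info.get_Detail_Info(list_page_html, session)
#     print(json.dumps(articles, ensure_ascii=False, indent=2))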