-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
320 lines (258 loc) · 13.4 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# coding:utf-8
import os
import re
import urllib.request
import shutil
import docx
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Cm, RGBColor
url = "http://222.197.183.99/TutorList.aspx"
pattern = "div"
html_class = "NewsDetails"
domain = "http://222.197.183.99"
key_words = ["导师代码", "导师姓名", "性别", "出生年月", "特称", "职称", "学位", "属性", "电子邮件",
"学术经历", "个人简介", "科研项目", "发表文章", "博士招生专业", "硕士招生专业"]
def instructor_pageaddr_list(html_str, school_name):
inst_soup = BeautifulSoup(html_str[html_str.find(school_name):], "lxml")
addrs_list = {}
for link_div in inst_soup.find("td").find_all("div"):
url = link_div.find("a").get("href")
name = re.sub("\d+\s", "", link_div.find("a").get_text().strip())
url = url if url[0] != "/" else url[1:]
addrs_list[name] = domain + "/" + url
return addrs_list
def schools_list(html_str):
schools = {}
all_list_soup = BeautifulSoup(html_str, "lxml")
for school in all_list_soup.find_all("b"):
if re.match("\d{3}", school.get_text()):
schools[int(school.get_text()[:3])] = school.get_text()[3:].strip()
return schools
def construct_instructor_info(url):
response = None
for i in range(2, 6):
try:
response = urllib.request.urlopen(url, timeout=10)
break
except Exception as e:
print(e, "获取导师信息超时, 尝试第" + str(i) + "次连接: " + url)
if response is None:
return None
html = response.read().decode(encoding="utf8", errors="ignore")
html_soup = BeautifulSoup(str(html).replace(" ", ""), "lxml").find("table")
instructor = {}
instructor["网页地址"] = url
instructor["学院"] = html_soup.find("span", attrs={"id": "Labelxymc"}).get_text().strip()
instructor["照片"] = domain + html_soup.find("img").get("src").strip()[1:]
html = re.sub("\s{2,}", "", str(html_soup))
html = re.sub("<style((.*)\n)+</style>", "", html)
pos = 0
for key_word in key_words:
pos = html.find(key_word, pos)
soup = BeautifulSoup(html[pos:], "lxml")
instructor[key_word] = soup.find("span").get_text().strip()
table_soup = BeautifulSoup(html, "lxml").find("table", attrs={"cellspacing": "0", "cellpadding": "0", "width": "100%", "border": "0"})
'''
tuple[0]两个td标签, 一个博士专业代码及名称, 一个硕士专业代码及名称
tuple[1]两个table标签分别嵌套在td中, 一个博士方向, 一个硕士方向, 如果没有方向, 则没有table
'''
pos_mark = [(2, 3), (5, 6), (8, 9), (11, 12), (14, 15)]
trs = table_soup.find_all("tr", recursive=False)
bs_spciality_code_label = "Labelbszydm"
bs_spciality_name_label = "Labelbszymc"
ss_spciality_code_label = "Labelsszydm"
ss_spciality_name_label = "Labelsszymc"
bs_spcialitys = []
ss_spcialitys = []
for i in range(len(pos_mark)):
speciality = trs[pos_mark[i][0]]
# 获取单个条目的专业名称(包含代码)
# 博士专业
bs_spciality_name = speciality.find("span", attrs={"id": bs_spciality_code_label + str(i + 1)}).get_text() + " " + speciality.find("span", attrs={"id": bs_spciality_name_label + str(i + 1)}).get_text()
bs_spciality_name = bs_spciality_name.strip()
# 硕士专业
ss_spciality_name = speciality.find("span", attrs={"id": ss_spciality_code_label + str(i + 1)}).get_text() + " " + speciality.find("span", attrs={"id": ss_spciality_name_label + str(i + 1)}).get_text()
ss_spciality_name = ss_spciality_name.strip()
# 获取博士和硕士的专业方向, html分别放在table1, table2
tds = trs[pos_mark[i][1]].find_all("td", recursive=False)
table1 = tds[0].find("table")
table2 = tds[1].find("table")
single_bs_speciality = {}
single_ss_speciality = {}
bs_orientations = []
ss_orientations = []
if table1:
for tr in table1.find_all("tr", recursive=False):
bs_orientations.append(tr.find("td", attrs={"class": "width4em"}).get_text().strip() +
tr.find("td", attrs={"class": "alignleft"}).get_text().strip())
if table2:
for tr in table2.find_all("tr", recursive=False):
ss_orientations.append(tr.find("td", attrs={"class": "width4em"}).get_text().strip() +
tr.find("td", attrs={"class": "alignleft"}).get_text().strip())
if len(bs_spciality_name.strip()) != 0:
single_bs_speciality[bs_spciality_name] = bs_orientations
bs_spcialitys.append(single_bs_speciality)
if len(ss_spciality_name.strip()) != 0:
single_ss_speciality[ss_spciality_name] = ss_orientations
ss_spcialitys.append(single_ss_speciality)
instructor["博士招生专业"] = bs_spcialitys
instructor["硕士招生专业"] = ss_spcialitys
return instructor
def show_instructor(instructor, file):
for key_word in ["网页地址", "照片", "学院"] + key_words:
if (key_word == "博士招生专业") or (key_word == "硕士招生专业"):
if len(instructor[key_word]) == 0:
# print("\n[" + key_word + "]: 不招生", end="")
file.write("\n[" + key_word + "]: 不招生\n")
else:
# print("\n[" + key_word + "]: ")
file.write("\n[" + key_word + "]: \n")
for speciality in instructor[key_word]: # 专业到方向列表的映射
for x in speciality:
# print("\t(" + x +"): ")
file.write("\t(" + x +"): \n")
for oriention in speciality[x]:
# print("\t\t\t", oriention)
file.write("\t\t\t" + oriention + "\n")
# print()
file.write("\n")
elif key_word == "学术经历" or key_word == "个人简介" or key_word == "科研项目" or key_word == "发表文章":
if len(instructor[key_word]) == 0:
# print("[" + key_word + "]: 未介绍")
file.write("[" + key_word + "]: 未介绍\n")
else:
# print("\n[" + key_word + "]: ")
# print("\t\t", instructor[key_word])
# print()
if key_word == "发表文章" and len(instructor["发表文章"]) > 500:
file.write("\n[" + key_word + "]: \n" + "\t\t" + "该项太长, 请到网页中浏览" + "\n\n")
else:
file.write("\n[" + key_word + "]: \n" + "\t\t" + instructor[key_word] + "\n\n")
else:
wrap_word = key_word if len(key_word) == 4 else " " + key_word
# print("%6s: %s" % (wrap_word, instructor[key_word] if len(instructor[key_word]) != 0 else "未介绍"))
file.write("%6s: %s\n" % (wrap_word, instructor[key_word] if len(instructor[key_word]) != 0 else "未介绍"))
file.write("===============================================================================================\n\n\n")
# print("===============================================================================================\n")
def gen_speciality_str(specialitys):
speciality_str = ""
if len(specialitys) == 0:
speciality_str = "博士不招生"
else:
for speciality in specialitys: # 专业到方向列表的映射
for x in speciality:
speciality_str += "[" + x + "]:"
for oriention in speciality[x]:
speciality_str += ("\n\t" + oriention)
speciality_str += "\n"
return speciality_str
def write_docx(instructor, sample_docx, dir_name):
document = Document(sample_docx)
outer_table = document.tables[0]
inner_table = outer_table.cell(0, 1).tables[0]
pic_file_name = instructor["照片"][instructor["照片"].rfind("/") + 1:]
for i in range(2, 6):
try:
response = urllib.request.urlopen(instructor["照片"], timeout=10)
f = open(pic_file_name, "w+b")
f.write(response.read())
f.close()
break
except Exception as e:
print(e, "获取头像超时, 尝试第" + str(i) + "次连接: " + instructor["照片"])
paragraph = outer_table.rows[0].cells[0].paragraphs[0]
run = paragraph.add_run()
if os.path.exists(pic_file_name) == True:
# run.add_picture(temp_pic_file, width=Cm(4.2), height=Cm(6.2)) # 窄边框使用
try:
run.add_picture(pic_file_name, width=Cm(4.0), height=Cm(5.7)) # 厚边框使用
os.remove(pic_file_name)
except Exception as e:
run.add_picture("default.gif", width=Cm(4.0), height=Cm(5.7)) # 厚边框使用
else:
# run.add_picture(temp_pic_file, width=Cm(4.2), height=Cm(6.2)) # 窄边框使用
run.add_picture("default.gif", width=Cm(4.0), height=Cm(5.7)) # 厚边框使用
inner_table.cell(0, 1).text = instructor["导师代码"]
inner_table.cell(1, 1).text = instructor["导师姓名"]
inner_table.cell(2, 1).text = instructor["性别"]
inner_table.cell(3, 1).text = instructor["出生年月"]
inner_table.cell(4, 1).text = instructor["特称"] if (len(instructor["特称"])) != 0 else "未介绍"
inner_table.cell(5, 1).text = instructor["职称"] if (len(instructor["职称"])) != 0 else "未介绍"
inner_table.cell(6, 1).text = instructor["学位"] if (len(instructor["学位"])) != 0 else "未介绍"
inner_table.cell(7, 1).text = instructor["属性"] if (len(instructor["属性"])) != 0 else "未介绍"
inner_table.cell(8, 1).text = instructor["电子邮件"]
inner_table.cell(9, 1).text = instructor["学院"]
inner_table.cell(10, 1).text = instructor["网页地址"]
paragraphs = [p for p in document.paragraphs]
# 添加导师姓名在最开头并且是一级标题
run = paragraphs[0].add_run()
font = run.font
run.add_text(instructor["导师姓名"])
font.color.rgb = RGBColor(255, 0, 0)
# 分别添加导师的每一项信息
for i in range(len(paragraphs)):
if paragraphs[i].text == "个人简介":
document.paragraphs[i + 1].text = instructor["个人简介"] if len(instructor["个人简介"]) != 0 else "未介绍"
elif paragraphs[i].text == "学术经历":
document.paragraphs[i + 1].text = instructor["学术经历"] if len(instructor["学术经历"]) != 0 else "未介绍"
elif paragraphs[i].text == "科研项目":
document.paragraphs[i + 1].text = instructor["科研项目"] if len(instructor["科研项目"]) != 0 else "未介绍"
elif paragraphs[i].text == "发表文章":
if len(instructor["发表文章"]) <= 500:
document.paragraphs[i + 1].text = instructor["发表文章"] if len(instructor["发表文章"]) != 0 else "未介绍"
else:
document.paragraphs[i + 1].text = "该导师发表文章项内容超级长, 为了不影响参阅, 详见请打开导师介绍页面"
elif paragraphs[i].text == "博士招生专业":
document.paragraphs[i + 1].text = gen_speciality_str(instructor["博士招生专业"])
elif paragraphs[i].text == "硕士招生专业":
document.paragraphs[i + 1].text = gen_speciality_str(instructor["硕士招生专业"])
document.add_page_break()
document.save(dir_name + "/" + "[" + instructor["导师代码"] + "]" + instructor["导师姓名"] + ".docx")
def debug():
ins_url = "http://222.197.183.99/TutorDetails.aspx?id=308"
#ins_url = "http://222.197.183.99/TutorDetails.aspx?id=2257" #出现以为HTML标签
ins_url = "http://222.197.183.99/TutorDetails.aspx?id=113"
ins_url = "http://222.197.183.99/TutorDetails.aspx?id=2177"
instructor = construct_instructor_info(ins_url)
write_docx(instructor, "sample.docx", ".")
show_instructor(instructor, open("uestc.txt", "w"))
exit(0)
if 1 == 91:
debug()
# ================================start==================================== #
response = None
for i in range(2, 6):
try:
response = urllib.request.urlopen(url, timeout=10)
break
except Exception as e:
print(e, "网络访问超时, 尝试第" + str(i) + "次连接: " + url)
if response is None:
print("网络访问失败")
exit(0)
html = response.read().decode(encoding="utf8", errors="ignore")
full_page_soup = BeautifulSoup(str(html), "lxml")
all_list_soup = full_page_soup.body.find(pattern, attrs = {"class": html_class})
schools = schools_list(str(all_list_soup))
for x in schools.items():
print(x)
school_id = input("输入学院ID:")
list_url = instructor_pageaddr_list(str(all_list_soup), schools[int(school_id)])
dir_name = "generatefiles"
sample_docx = "sample.docx"
if os.path.exists(dir_name):
shutil.rmtree(dir_name)
os.mkdir(dir_name)
f = open(dir_name + "/" + "导师列表纯文本.txt", "w")
for index, x in enumerate(list_url.items()):
print("抓取页面: " + x[0] + " " + x[1])
instructor_info = construct_instructor_info(x[1])
if instructor_info is not None:
show_instructor(instructor_info, f)
write_docx(instructor_info, sample_docx, dir_name)
else:
print("获取导师信息失败:", x)
print(index + 1, "完成创建: " + "[" + instructor_info["导师代码"] + "]" + instructor_info["导师姓名"] + ".docx\n")
print("任务完成...")
f.close()