/
loader.py
128 lines (116 loc) · 5.04 KB
/
loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import requests
import regex as re
from author import Author
from source import Source
from publication import Publication
class Loader:
    """Load publication records from a text bibliography ("txt" format)
    or by scraping elibrary.ru author pages ("id" format)."""

    def __init__(self):
        pass

    def load(self, load_format=None, filename=None):
        """Load publications and return them as a list of dicts.

        load_format -- "txt" (default): parse one bibliography entry per
            line of the input file; "id": read elibrary.ru author ids
            from the file and scrape each author's publication list.
        filename -- input file; defaults to "data.<load_format>".

        Returns a list of publication dicts, or None when load_format is
        not recognized.
        """
        self.load_format = load_format if load_format else "txt"
        self.filename = filename if filename else "data." + self.load_format
        if self.load_format == "txt":
            return self.__load_txt()
        if self.load_format == "id":
            return self.__load_id()
        return None

    def __load_txt(self):
        """Parse the bibliography file, one citation per line.

        Each line is matched against a Russian or an English citation
        pattern of the form "Authors Title // Source misc"; lines that
        do not match are reported on stdout and skipped.
        """
        # Author list: "I. I. Surname" / "Surname I. I." (any script), or
        # "Firstname Lastname"; one or more names, optionally comma-separated.
        # Requires the third-party `regex` module for \pL / \p{Lu} classes.
        rn1 = r"(?P<authors>((\pL\. ?(\pL\. )?\pL+,? )|(\pL+ \pL\. ?(\pL\.)?,? )"
        rn2 = r"|(\p{Lu}\p{Ll}+ \p{Lu}\p{Ll}+,? )"
        rn3 = r")+)"
        ra_ru = r"(?P<article>\p{Lu}\p{Ll}+ \p{Ll}+.*?) *\/\/ *"  # title, up to " // "
        ra_eng = r"(?P<article>\p{Lu}.*?) *\/\/ *"                # title, up to " // "
        rj = r'(?P<source>[ \pL"“”]+)'                            # journal / collection name
        rm = r"(?P<misc>.+)"                                      # volume / year / pages tail
        reg_ru = re.compile(rn1 + rn2 + rn3 + ra_ru + rj + rm, re.UNICODE)
        reg_eng = re.compile(rn1 + rn3 + ra_eng + rj + rm, re.UNICODE)
        data = []
        # FIX: context manager guarantees the file handle is closed
        # (the original never closed it).
        # NOTE(review): reads with the platform default encoding — confirm
        # the data files match it (they contain Cyrillic text).
        with open(self.filename, 'r') as f:
            content = f.read()
        for item in content.split('\n'):
            # Sniff the line's language from its first 15 characters to
            # choose the matching citation pattern.
            reg = reg_eng if isEnglish(item[:15]) else reg_ru
            res = reg.match(item.strip())
            if res is not None:
                # FIX: removed dead Publication()/Author.parseAuthors()
                # statements — their results were never used.
                data.append({"authors": split_authors(res.group("authors")),
                             "article": res.group("article"),
                             "source": res.group("source"),
                             "misc": res.group("misc")})
            else:
                print("Wrong line: " + item)
        return data

    def __load_id(self):
        """Scrape elibrary.ru for every author id listed in the input file.

        For each author id, pages through the author's publication list,
        fetches every article page, and extracts the RSCI / Scopus /
        Web of Science / HAC inclusion flags and impact factors.
        """
        data = []
        # FIX: context manager guarantees the file handle is closed.
        with open(self.filename, 'r') as f:
            author_ids = f.read().split('\n')
        print(author_ids)
        # One table row per publication: id, title, authors, optional
        # source link, trailing misc text.
        article_reg_string = r'id="arw(?P<id>\d*).+?".*?a href=.*?<b>(?P<article>.*?)<\/b>.*?<i>(?P<authors>.*?)<\/i>.*?<font color=.+?>((.*?В сборнике: )?<a .+?>(?P<source>.+?)<\/a>)?(?P<misc>.+?)<\/td'
        article_reg = re.compile(article_reg_string, re.UNICODE)
        rsci_reg_string = r'Входит в РИНЦ.*?<font color=#00008f>(?P<rsci>.*?)<\/font>.*?(Импакт-фактор журнала в РИНЦ.*?<font color=#00008f>(?P<rsci_factor>\d+)<\/font>)?'
        scopus_reg_string = r'.*?Входит в Scopus.*?<font color=#00008f>(?P<scopus>.*?)<\/font>.*?(Импакт-фактор журнала в Scopus.*?<font color=#00008f>(?P<scopus_factor>\d+)<\/font>)?'
        wos_reg_string = r'.*?Входит в Web of Science.*?<font color=#00008f>(?P<wos>.*?)<\/font>.*?(Импакт-фактор журнала в Web of Science.*?<font color=#00008f>(?P<wos_factor>\d+)<\/font>)?'
        hac_reg_string = r'(Входит в ВАК.*?<font color=#00008f>(?P<hac>(да)|(нет))</font>)?.*(Импакт-фактор журнала в ВАК.*?<font color=#00008f>(?P<hac_factor>\d+)</font>)?'
        status_reg_string = rsci_reg_string + scopus_reg_string + wos_reg_string + hac_reg_string
        status_reg = re.compile(status_reg_string, re.UNICODE)
        # Strips HTML tags, no-break-space characters and leading punctuation.
        p = re.compile(r'(<.*?>)|( )|(^[., ])')
        for author_id in author_ids:
            # NOTE(review): retries forever on any non-200 response —
            # consider a retry cap / backoff.
            request_status_code = 0
            while request_status_code != 200:
                r = requests.get('http://elibrary.ru/author_items.asp?authorid=' + author_id)
                request_status_code = r.status_code
            html = r.text
            # Count pagination links to learn how many result pages exist.
            number_of_pages = 1
            while 'javascript:goto_page(' + str(number_of_pages) + ')' in html:
                number_of_pages += 1
            if number_of_pages != 1:
                number_of_pages -= 1
            for page_number in range(number_of_pages):
                request_status_code = 0
                while request_status_code != 200:
                    page_request = requests.post('http://elibrary.ru/author_items.asp',
                                                 data={'authorid': author_id,
                                                       'pagenum': '' + str(page_number + 1)})
                    request_status_code = page_request.status_code
                page = page_request.text.replace('\n', '').replace('\r', '')
                matches = article_reg.findall(page)
                for article_res in matches:
                    request_status_code = 0
                    while request_status_code != 200:
                        article_request = requests.get('http://elibrary.ru/item.asp?id=' + article_res[0])
                        request_status_code = article_request.status_code
                    article_page = article_request.text.replace('\n', '').replace('\r', '')
                    status_res = status_reg.findall(article_page)
                    # Positional group indices into the combined patterns:
                    # article_res: 0=id, 1=article, 2=authors, 3=source group, 6=misc.
                    # NOTE(review): article_res[3] is the OUTER optional group
                    # (may include the "В сборнике: " prefix), and the hac pair
                    # [9]/[11] looks off by one vs. the named groups — verify
                    # against live elibrary HTML before changing.
                    data.append({"authors": split_authors(clean(article_res[2], p)),
                                 "article": clean(article_res[1], p),
                                 "source": clean(article_res[3], p),
                                 "misc": clean(article_res[6], p),
                                 "scopus": get_result(status_res[0][3], status_res[0][5]),
                                 "wos": get_result(status_res[0][6], status_res[0][8]),
                                 "hac": get_result(status_res[0][9], status_res[0][11]),
                                 "rsci": get_result(status_res[0][0], status_res[0][2])
                                 })
        return data
def split_authors(res):
    """Split a comma-separated author string into a list of names.

    Each name is stripped and the space after every initial is removed,
    turning "I. I. Surname" into "I.I.Surname".
    """
    return [name.strip().replace(". ", ".") for name in res.split(', ')]
def isEnglish(s):
    """Return True when *s* contains only ASCII characters.

    Used to decide whether a citation line should be matched with the
    English or the Russian pattern.
    """
    try:
        s.encode('ascii')
        return True
    except UnicodeEncodeError:
        return False
def clean(string, p):
    """Remove every match of the pre-compiled pattern *p* from *string*."""
    return p.sub('', string)
def get_result(found, factor):
    """Combine an elibrary inclusion flag with its impact factor.

    Returns 0 when the flag is not "да" (i.e. not included), -1 when
    included but no impact factor was captured, otherwise the factor
    value unchanged (a digit string from the regex capture).
    """
    if found == "да":
        return -1 if factor == '' else factor
    return 0