# sentenceCrawler.py
# Crawls Korean short stories from ko.wikisource.org, splits them into
# sentences with KoNLPy's Kkma, and stores each sentence in the DB via
# the Django ORM (bot.models.Sentence).
import os
# Django settings must be configured BEFORE importing any model module.
# setdefault keeps an externally provided DJANGO_SETTINGS_MODULE intact.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "kakao.settings")
import django
django.setup()  # populate the app registry so bot.models can be imported
from bot.models import Sentence  # Django model used to persist crawled sentences
from konlpy.tag import Kkma  # Korean morphological analyzer / sentence splitter
from sentenceMod import sentenceModifierSTR  # project-local sentence cleanup helper
def crawler(base, root):
    """Crawl every work listed on a Wikisource category page.

    Parameters:
        base: site base URL, e.g. 'https://ko.wikisource.org'.
        root: URL of the category index page listing the works.

    Returns:
        dict mapping each refined sentence to its tag, where the tag is
        the work's title + author with all spaces removed.
    """
    import requests
    from bs4 import BeautifulSoup

    # Collect the links to the individual works from the index page.
    page = requests.get(root)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.select('body .mw-category-group a')

    ret = {}
    # Hoisted out of the loops: Kkma startup (JVM + dictionaries) is very
    # expensive, so build the analyzer exactly once for the whole crawl.
    sentencer = Kkma()
    for link in links:
        # Pre-bind so the except handler below never hits a NameError when
        # the failure happens before title/author are assigned.
        title = author = None
        try:
            print('getting into ' + link.text)
            page = requests.get(base + link['href'])
            soup = BeautifulSoup(page.text, 'html.parser')
            title = soup.find('span', {'id': 'header_title_text'}).text
            author = soup.find('span', {'class': 'fn'}).text
            tag = title.replace(" ", '') + author.replace(" ", '')
            # Drop the last <p>: presumably a license/footer paragraph — TODO confirm.
            paragraphs = soup.select(' .mw-parser-output p')[:-1]
        except Exception as e:
            # Best-effort crawl: skip pages whose layout doesn't match.
            print(e, title, author)
            continue
        for paragraph in paragraphs:
            text = paragraph.text.replace("\n", '')
            sentences = sentencer.sentences(text)
            # Refine each sentence before using it as a dict key.
            for s in sentences:
                s = sentenceModifierSTR(s)
                ret[s] = tag
        print('Store Done for' + tag)
    return ret
# Crawl the Wikisource category of Korean short stories and persist the result.
base = 'https://ko.wikisource.org'
root = 'https://ko.wikisource.org/wiki/%EB%B6%84%EB%A5%98:%EB%8B%A8%ED%8E%B8%EC%86%8C%EC%84%A4'

sentence_dict = crawler(base, root)
for s, t in sentence_dict.items():
    # Store only reasonably sized sentences; over-long parses are skipped.
    if len(s) <= 300:
        Sentence(sentence=s, tag=t).save()  # Store in DB