import re
import threading
import time
from string import ascii_uppercase

import requests
from bs4 import BeautifulSoup

from db_maker import DB_Maker

# Limit on concurrent detail-page requests (the original value is not shown;
# 10 is an assumption).
sem = threading.Semaphore(10)


class IdiomCrawler:
    def __init__(self):
        self.url = "http://chengyu.haoshiwen.org"
        self.headers = {
            'Host': "chengyu.haoshiwen.org",
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')
        }
        self.db = DB_Maker()

    def get_response(self, url, params=None):
        r = requests.get(url, headers=self.headers, params=params)
        r.encoding = 'utf-8'
        return BeautifulSoup(r.text, "lxml")

    def traverse_character(self):
        # Page style: http://chengyu.haoshiwen.org/list.php?t=A&page=1
        url = self.url + '/list.php'
        for char in ascii_uppercase:
            for page_num in range(1, 100):
                params = {"t": char, "page": page_num}
                soup = self.get_response(url, params)
                links = soup.findAll("a", {"href": re.compile(r"/view\.php\?id=\d")})
                if not links:
                    break  # no more result pages for this letter
                for link in links:
                    t = threading.Thread(target=self.idiom_index,
                                         args=(self.url + link["href"],))
                    t.start()
                    time.sleep(0.1)  # throttle thread creation

    def idiom_index(self, url):
        sem.acquire()
        try:
            soup = self.get_response(url)
            rows = soup.find("table").findAll("tr")[:6]
            # Each row holds a (label, value) pair of <td> cells.
            info = {row.findAll('td')[0].text: row.findAll('td')[1].text
                    for row in rows}
            # '人气' ("popularity") carries a trailing unit character; strip it.
            info['人气'] = int(info['人气'][:-1])
            sql = "insert into idiom (name,speak,meaning,source,example,hot) " \
                  "values (?,?,?,?,?,?)"
            self.db.insert(sql, list(info.values()))
        except Exception as error_info:
            print(error_info)
        finally:
            sem.release()
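# --- Usage sketch (not part of the original module) ---
# Assumes DB_Maker can insert into an existing `idiom` table with columns
# name/speak/meaning/source/example/hot, matching the insert statement above.
if __name__ == '__main__':
    crawler = IdiomCrawler()
    crawler.traverse_character()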
def create_table(self):
    db = DB_Maker()
    sql = """create table car_info (
        [id] integer PRIMARY KEY autoincrement,
        [name] varchar (10),
        [image] varchar (30),
        [founded] varchar (30),
        [models] varchar (30),
        [website] varchar (30)
    )"""
    print(sql)
    db.create_table_by_sql(sql=sql)
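# The make_data() method further below references self.insert_sql, which never
# appears in the extracted code. A plausible definition matching the car_info
# schema above, presumably set as an attribute on the crawler class (an
# assumption, not the original):
insert_sql = ("insert into car_info (name,image,founded,models,website) "
              "values (?,?,?,?,?)")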
def correct_names(self):
    with open('./data/author_url_dic.json', 'r') as f:
        self.author_url_dic = json.load(f)
    with open('./data/author_dic.json', 'r') as f:
        self.author_dic = json.load(f)
    with open('./data/skip_author.json', 'r') as f:
        skip = set(json.load(f))
    from db_maker import DB_Maker
    db_maker = DB_Maker()

    candidates = []
    for x in self.author_url_dic.keys():
        if db_maker.is_kr(x):  # ('.' in x or '-' in x or len(x.split()) > 3)
            candidates.append(x)
    candidates += [smooth(x) for x in get_file('./data/kr_hard_coding.txt')]
    candidates = sorted(set(candidates))
    print(len(candidates))

    for i, author in enumerate(candidates):
        print(i, '/', len(candidates))
        if author not in self.author_url_dic or author in skip:
            continue
        url = self.author_url_dic[author]
        html = BS(url)
        primary = smooth(html.find('span', {'class': 'name primary'}).text)
        secondary_list = [smooth(x.text) for x in
                          html.find_all('span', {'class': 'name secondary'})]
        print(primary, secondary_list)
        skip.add(primary)
        for name in secondary_list:
            # Only map aliases containing an uppercase letter; all-lowercase
            # strings are treated as noise.
            if name and name != name.lower():
                skip.add(name)
                self.author_dic[name] = primary
        # Checkpoint after each author so progress survives interruption.
        with open('./data/author_dic.json', 'w') as f:
            json.dump(self.author_dic, f)
        with open('./data/skip_author.json', 'w') as f:
            json.dump(sorted(skip), f)
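# Hypothetical sketches of the undefined helpers correct_names() relies on
# (smooth, get_file, BS); their real implementations are not shown, so the
# behavior here is assumed.
import re
import requests
from bs4 import BeautifulSoup

def smooth(s):
    # Assumed: collapse whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', s).strip()

def get_file(path):
    # Assumed: read a text file as a list of stripped, non-empty lines.
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

def BS(url):
    # Assumed: fetch a URL and return the parsed document.
    return BeautifulSoup(requests.get(url).text, 'lxml')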
def __init__(self): self.url = "http://chengyu.haoshiwen.org" self.headers = { 'Host': "chengyu.haoshiwen.org", 'Connection': 'keep-alive', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 ' '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36') } self.db = DB_Maker()
def make_data(self):
    db = DB_Maker()
    for uppercase in au:  # au: assumed alias for string.ascii_uppercase
        url = "http://www.chebiaow.com/logo/{}.html".format(uppercase)
        response = requests.get(url=url, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        items = soup.select("li .zq")
        for item in items:
            url2 = "http://www.chebiaow.com{}".format(item.attrs['href'])
            response2 = requests.get(url2, headers=self.headers)
            soup2 = BeautifulSoup(response2.content, 'html.parser')
            # Detail page: logo image on the left, brand facts on the right.
            image = soup2.select(".xq-left>.img>img")[0].get("src")
            name = soup2.select(".xq-right>li>a")[0].get_text()
            founded = soup2.select(".xq-right>li>span")[2].get_text()
            models = soup2.select(".xq-right>li>span")[4].get_text()
            website = soup2.select(".xq-right>li>span")[6].get_text()
            db.insert(self.insert_sql, (name, image, founded, models, website))
def __init__(self): self.url = "http://114.xixik.com/country-flag/" self.headers = { 'Host': "114.xixik.com", 'Connection': 'keep-alive', 'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 ' '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36') } self.db = DB() self.save_path = 'static/images'
import os
import re

import requests
from bs4 import BeautifulSoup

from db import DB  # assumed import path for the DB helper


class CountryFlagCrawler:
    def __init__(self):
        self.url = "http://114.xixik.com/country-flag/"
        self.headers = {
            'Host': "114.xixik.com",
            'Connection': 'keep-alive',
            'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')
        }
        self.db = DB()
        self.save_path = 'static/images'

    def check_dir(self):
        if not os.path.exists(self.save_path):
            os.mkdir(self.save_path)

    def get_response(self, url, params=None):
        r = requests.get(url, headers=self.headers, params=params)
        r.encoding = 'gb2312'
        return BeautifulSoup(r.text, "lxml")

    def down_load_flag(self):
        soup = self.get_response(self.url)

        # Flag images live in the sixth "lindBox" block.
        tds = soup.findAll("div", {"class": "lindBox"})[5].findAll("td")
        for td in tds:
            try:
                picture_url = td.find('img')['src']
                country_name = '%s.gif' % re.sub(r'\s', '', td.text)
                print("Download %s" % country_name)
                r = requests.get(picture_url)
                with open(os.path.join(self.save_path, country_name), 'wb') as f:
                    f.write(r.content)
            except Exception:
                pass  # skip cells without an image

        # Country info (name, capital, population, area) from the next block.
        country_list = []
        trs = soup.findAll("div", {"class": "lindBox"})[6].findAll("tr")
        for tr in trs:
            try:
                info = list(map(lambda x: re.sub(r'\s|,', '', x.text),
                                tr.findAll('td')[1:]))
                if len(info) == 4:
                    print(info)
                    # Patch a known error in the source data: the area listed
                    # for Russia ('俄罗斯') is wrong, so hard-code it.
                    if info[0] == '俄罗斯':
                        country_list.append([info[0], info[1], int(info[2]), 17098246])
                    else:
                        country_list.append([info[0], info[1], int(info[2]), int(info[3])])
            except Exception:
                pass  # skip malformed rows

        sql = "insert into country_flag (country,capital,population,area) " \
              "values (?,?,?,?)"
        for line in sorted(country_list, key=lambda x: x[3], reverse=True):
            self.db.insert(sql, line)
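# --- Usage sketch (not part of the original module) ---
# The insert in down_load_flag() assumes an existing country_flag table; a
# matching schema would be (an assumption, the original DDL is not shown):
#   create table country_flag (country varchar(30), capital varchar(30),
#                              population integer, area integer)
if __name__ == '__main__':
    crawler = CountryFlagCrawler()
    crawler.check_dir()       # ensure static/images exists
    crawler.down_load_flag()  # download flags and populate the table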
import json
import traceback
from datetime import datetime

from updater import Updater
from db_maker import DB_Maker
from utils import webhook

if __name__ == '__main__':
    try:
        webhook("Update start!")
        current_year = datetime.now().year
        my_updater = Updater()
        my_db_maker = DB_Maker()
        my_db_maker.load_model()
        recent_year_dict = json.load(open('./data/recent_year_dict.json'))
        for conf, dblp in my_updater.get_conf2dblp().items():
            fromyear = recent_year_dict[conf] + 1
            toyear = current_year
            print(conf, fromyear, toyear)
            success_years = my_updater.update_conf(conf, dblp, fromyear, toyear)
            for year in success_years:
                # Retry until the per-year DB build reports success.
                while not my_db_maker.make_conf_year_db(conf, year):
                    pass
            # if len(success_years) == 0:
            #     webhook(conf + " is already updated")
        # For manual update:
        # while not my_db_maker.make_conf_year_db('iclr', 2020):
    except Exception:
        # Assumed handler: report failures through the webhook (the original
        # except clause is not shown in the extracted text).
        webhook(traceback.format_exc())
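# A minimal sketch of the utils.webhook helper imported above; its real
# implementation is not shown, so the endpoint and payload shape are
# assumptions in the style of a Slack/Discord-type incoming webhook.
import requests

WEBHOOK_URL = "https://example.com/webhook"  # hypothetical endpoint

def webhook(message):
    # Post a plain-text notification; swallow delivery failures so a broken
    # webhook never kills the update run.
    try:
        requests.post(WEBHOOK_URL, json={"text": message}, timeout=10)
    except requests.RequestException:
        pass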
def query_data(self):
    db = DB_Maker()
    print(db.fetch_one(self.query_sql))
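# Several of these snippets lean on a DB_Maker helper whose implementation is
# not shown. A minimal SQLite-backed sketch of the generic interface they call
# (insert, fetch_one, create_table_by_sql); the project-specific methods
# (is_kr, load_model, make_conf_year_db) are omitted. An assumption, not the
# original class:
import sqlite3

class DB_Maker:
    def __init__(self, path='data.db'):
        # check_same_thread=False because IdiomCrawler calls insert() from
        # worker threads; a production version would also serialize writes
        # with a lock.
        self.conn = sqlite3.connect(path, check_same_thread=False)

    def create_table_by_sql(self, sql):
        self.conn.execute(sql)
        self.conn.commit()

    def insert(self, sql, values):
        self.conn.execute(sql, values)
        self.conn.commit()

    def fetch_one(self, sql):
        return self.conn.execute(sql).fetchone()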