Code example #1
class IdiomCrawler:
    def __init__(self):
        self.url = "http://chengyu.haoshiwen.org"
        self.headers = {
            'Host':
            "chengyu.haoshiwen.org",
            'Connection':
            'keep-alive',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent':
            ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
             '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')
        }
        self.db = DB_Maker()

    def get_response(self, url, params=None):
        r = requests.get(url, headers=self.headers, params=params)
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, "lxml")
        return soup

    def traverse_character(self):
        # page style: http://chengyu.haoshiwen.org/list.php?t=A&page=1
        url = self.url + '/list.php'
        for char in ascii_uppercase:
            for page_num in range(1, 100):
                params = {"t": char, "page": page_num}
                soup = self.get_response(url, params)
                # use a raw string so the regex escapes are not treated as string escapes
                links = soup.findAll(
                    "a", {"href": re.compile(r"/view\.php\?id=\d")})
                if not links:
                    continue
                for link in links:
                    t = threading.Thread(target=self.idiom_index,
                                         args=(self.url + link["href"], ))
                    t.start()
                    time.sleep(0.1)

    def idiom_index(self, url):
        sem.acquire()
        try:
            soup = self.get_response(url)
            rows = soup.find("table").findAll("tr")[:6]
            info = {
                row.findAll('td')[0].text: row.findAll('td')[1].text
                for row in rows
            }
            # strip the trailing non-numeric character from the popularity ('人气') value
            info['人气'] = int(info['人气'][:-1])
            sql = "insert into idiom (name,speak,meaning,source,example,hot) values " \
                  "(?,?,?,?,?,?)"

            self.db.insert(sql, list(info.values()))
        except Exception as error_info:
            print(error_info)
        finally:
            sem.release()
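
The excerpt above also relies on module-level imports and a shared semaphore `sem` that are not part of the snippet. A minimal sketch of that surrounding setup (the import path for DB_Maker and the semaphore limit of 10 are assumptions, not taken from the project):

import re
import time
import threading
from string import ascii_uppercase

import requests
from bs4 import BeautifulSoup

from db_maker import DB_Maker  # assumed module name; not shown in this excerpt

# Cap the number of concurrent idiom_index threads; the real limit is not visible here.
sem = threading.Semaphore(10)

if __name__ == '__main__':
    IdiomCrawler().traverse_character()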
Code example #2
File: data_maker.py Project: AIOSCloud/chebiao
 def create_table(self):
     db = DB_Maker()
     sql = """create table car_info (
                             [id]            integer PRIMARY KEY autoincrement,
                             [name]         varchar (10),
                             [image]      varchar (30),
                             [founded]      varchar (30),
                             [models]      varchar (30),
                             [website]      varchar (30)
                         )"""
     print(sql)
     db.create_table_by_sql(sql=sql)
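
DB_Maker itself is not included in any of these excerpts. A minimal sqlite3-backed sketch that would satisfy the create_table_by_sql, insert, and fetch_one calls used in the chebiao examples (the file name data.db and the constructor signature are assumptions) might look like:

import sqlite3

class DB_Maker:
    """Hypothetical sqlite3 wrapper; the real class is not shown in these excerpts."""

    def __init__(self, db_path='data.db'):  # db_path is an assumed default
        self.conn = sqlite3.connect(db_path)

    def create_table_by_sql(self, sql):
        self.conn.execute(sql)
        self.conn.commit()

    def insert(self, sql, params):
        # the insert statements above use '?' placeholders, so pass params positionally
        self.conn.execute(sql, tuple(params))
        self.conn.commit()

    def fetch_one(self, sql):
        return self.conn.execute(sql).fetchone()

Note that the KCSS-beta- project's DB_Maker (examples #3 and #8) exposes a different interface (is_kr, load_model, make_conf_year_db), so this sketch only covers the sqlite-style usage.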
Code example #3
File: updater.py Project: rheehot/KCSS-beta-
    def correct_names(self):
        with open('./data/author_url_dic.json', 'r') as f:
            self.author_url_dic = json.load(f)

        with open('./data/author_dic.json', 'r') as f:
            self.author_dic = json.load(f)

        with open('./data/skip_author.json', 'r') as f:
            skip = set(json.load(f))

        from db_maker import DB_Maker
        db_maker = DB_Maker()

        candidates = []
        for x in self.author_url_dic.keys():
            # previous heuristic: ('.' in x or '-' in x or len(x.split()) > 3)
            if db_maker.is_kr(x):
                candidates.append(x)

        candidates += [
            smooth(x) for x in get_file('./data/kr_hard_coding.txt')
        ]
        candidates = sorted(list(set(candidates)))

        print(len(candidates))

        for i, author in enumerate(candidates):
            print(i, '/', len(candidates))
            if author not in self.author_url_dic or author in skip:
                continue
            url = self.author_url_dic[author]
            html = BS(url)

            primary = smooth(html.find('span', {'class': 'name primary'}).text)
            secondary_list = [
                smooth(x.text)
                for x in html.find_all('span', {'class': 'name secondary'})
            ]

            print(primary, secondary_list)

            skip.add(primary)
            for name in secondary_list:
                if name and name != name.lower():
                    skip.add(name)
                    self.author_dic[name] = primary

            with open('./data/author_dic.json', 'w') as f:
                json.dump(self.author_dic, f)
            with open('./data/skip_author.json', 'w') as f:
                json.dump(sorted(list(skip)), f)
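
This method also leans on three helpers defined elsewhere in the project and not shown here: BS, smooth, and get_file. Their behaviour below is inferred from how they are called, so treat this as an assumption rather than the project's actual code:

import requests
from bs4 import BeautifulSoup

def BS(url):
    # Fetch a page and return its parsed soup (signature inferred from html = BS(url) above).
    return BeautifulSoup(requests.get(url).text, 'html.parser')

def smooth(name):
    # Normalize whitespace in an author name; the real normalization may differ.
    return ' '.join(name.split())

def get_file(path):
    # Return the non-empty lines of a text file.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]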
Code example #4
 def __init__(self):
     self.url = "http://chengyu.haoshiwen.org"
     self.headers = {
         'Host':
         "chengyu.haoshiwen.org",
         'Connection':
         'keep-alive',
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
         'user-agent':
         ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')
     }
     self.db = DB_Maker()
Code example #5
File: data_maker.py Project: AIOSCloud/chebiao
 def make_data(self):
     db = DB_Maker()
     # "au" is presumably string.ascii_uppercase, imported elsewhere in data_maker.py
     for uppercase in au:
         url = "http://www.chebiaow.com/logo/{}.html".format(uppercase)
         response = requests.get(url=url, headers=self.headers)
         soup = BeautifulSoup(response.content, 'html.parser')
         items = soup.select("li .zq")
         for item in items:
             url2 = "http://www.chebiaow.com{}".format(item.attrs['href'])
             response2 = requests.get(url2, headers=self.headers)
             soup2 = BeautifulSoup(response2.content, 'html.parser')
             image = soup2.select(".xq-left>.img>img")[0].get("src")
             name = soup2.select(".xq-right>li>a")[0].get_text()
             founded = soup2.select(".xq-right>li>span")[2].get_text()
             models = soup2.select(".xq-right>li>span")[4].get_text()
             website = soup2.select(".xq-right>li>span")[6].get_text()
             db.insert(self.insert_sql, (name, image, founded, models, website))
Code example #6
 def __init__(self):
     self.url = "http://114.xixik.com/country-flag/"
     self.headers = {
         'Host':
         "114.xixik.com",
         'Connection':
         'keep-alive',
         'user-agent':
         ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')
     }
     self.db = DB()
     self.save_path = 'static/images'
Code example #7
class CountryFlagCrawler:
    def __init__(self):
        self.url = "http://114.xixik.com/country-flag/"
        self.headers = {
            'Host':
            "114.xixik.com",
            'Connection':
            'keep-alive',
            'user-agent':
            ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
             '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')
        }
        self.db = DB()
        self.save_path = 'static/images'

    def check_dir(self):
        if not os.path.exists(self.save_path):
            os.mkdir(self.save_path)

    def get_response(self, url, params=None):
        r = requests.get(url, headers=self.headers, params=params)
        r.encoding = 'gb2312'
        soup = BeautifulSoup(r.text, "lxml")
        return soup

    def down_load_flag(self):
        soup = self.get_response(self.url)
        tds = soup.findAll("div", {"class": "lindBox"})[5].findAll("td")
        for td in tds:
            try:
                picture_url = td.find('img')['src']
                country_name = '%s.gif' % re.sub(r'\s', '', td.text)
                print("Download %s" % country_name)
                r = requests.get(picture_url)
                with open(os.path.join(self.save_path, country_name),
                          'wb') as f:
                    f.write(r.content)
            except Exception:
                # skip cells without a usable flag image
                pass
        # country info
        country_list = []
        trs = soup.findAll("div", {"class": "lindBox"})[6].findAll("tr")
        for tr in trs:
            try:
                info = list(
                    map(lambda x: re.sub(r'\s|,', '', x.text),
                        tr.findAll('td')[1:]))
                if len(info) == 4:
                    # patch for an error in the source data:
                    print(info)
                    if info[0] == '俄罗斯':  # Russia: hard-code the area because the scraped value is unusable
                        country_list.append(
                            [info[0], info[1],
                             int(info[2]), 17098246])
                    else:
                        country_list.append(
                            [info[0], info[1],
                             int(info[2]),
                             int(info[3])])
            except Exception:
                # skip malformed rows
                pass

        sql = "insert into country_flag (country,capital,population,area) values " \
              "(?,?,?,?)"
        for line in sorted(country_list, key=lambda x: x[3], reverse=True):
            self.db.insert(sql, line)
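
A plausible way to drive this class, assuming these imports and the DB helper live in the same module (none of that is shown in the excerpt):

import os
import re

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    crawler = CountryFlagCrawler()
    crawler.check_dir()       # make sure static/images exists
    crawler.down_load_flag()  # download the flag images and store the country rows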
Code example #8
File: auto.py Project: rheehot/KCSS-beta-
import json
import traceback
from datetime import datetime

from db_maker import DB_Maker
from updater import Updater
from utils import webhook


if __name__ == '__main__':
    try:
        webhook("Update start!")
        current_year = datetime.now().year
        my_updater = Updater()
        my_db_maker = DB_Maker()
        my_db_maker.load_model()

        with open('./data/recent_year_dict.json') as f:
            recent_year_dict = json.load(f)
        for conf, dblp in my_updater.get_conf2dblp().items():
            fromyear = recent_year_dict[conf] + 1
            toyear = current_year
            print(conf, fromyear, toyear)
            success_years = my_updater.update_conf(conf, dblp, fromyear, toyear)
            for year in success_years:
                while not my_db_maker.make_conf_year_db(conf, year):
                    pass
            #if len(success_years) == 0:
            #    webhook(conf + " is already updated")
        
        # For manual update
        # while not my_db_maker.make_conf_year_db('iclr', 2020):
Code example #9
File: data_maker.py Project: AIOSCloud/chebiao
 def query_data(self):
     db = DB_Maker()
     print(db.fetch_one(self.query_sql))