Example #1
0
def data_parse(url):
    """Fetch one PTT board index page and queue a body-parse job per post.

    Parameters
    ----------
    url : str
        Absolute URL of a PTT board index page.

    Side effects: enqueues one 'ptt' job per post link via ``Job.add_job``.
    (Fixes: removed the unused ``page_data`` local and stopped clobbering
    the ``url`` parameter inside the loop.)
    """
    job = Job()
    # The 'over18' cookie bypasses PTT's age-confirmation interstitial.
    res = requests.get(url=url, cookies={'over18': '1'}).text

    soup = BeautifulSoup(res, 'lxml')
    # Each '.r-ent' row is one post entry; its first <a> holds the
    # board-relative link to the post body.
    for row in soup.select('.r-ent'):
        post_path = row.select('a')[0]['href']
        job.add_job('ptt', body_parse, 'https://www.ptt.cc' + post_path)
Example #2
0
def parse(year, semester):
    """Queue a department-parse ('thu') job for every department link on
    THU's course index page for *year*/*semester*.

    Parameters
    ----------
    year, semester : int or str
        Spliced directly into the index URL.

    Side effects: enqueues jobs via ``Job.add_job``; returns nothing.
    """
    job = Job()
    domain = 'http://course.thu.edu.tw'
    index_url = '%s/view-dept/%s/%s/everything' % (domain, year, semester)
    response = requests.get(index_url)
    soup = BeautifulSoup(response.text, 'lxml')
    # Every <a> inside a table row is a department link (href is site-relative).
    for link in soup.select('tr a'):
        job.add_job('thu', dp_parse, domain + link['href'])
Example #3
0
def parse(start, end):
    """Queue a data-parse ('s591') job for every 30-row page of 591 rental
    listings (region=1), covering rows *start* up to *end*.

    When *end* is -1 the total record count is first fetched from the API
    and used as the upper bound.

    Side effects: enqueues jobs via ``Job.add_job``; returns nothing.
    """
    job = Job()
    base = ('https://rent.591.com.tw/home/search/rsList'
            '?is_new_list=1&type=1&kind=0&searchtype=1&region=1')
    if end == -1:
        headers = {
            'Host': 'rent.591.com.tw',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/59.0.3071.115 Safari/537.36',
        }
        response = requests.get(base, headers=headers)
        payload = json.loads(response.text)
        # 'records' comes back formatted like "1,234" — strip the commas.
        end = int(payload['records'].replace(",", ""))
    for first_row in range(start, end, 30):
        # Preserve the original quirk: request firstRow=1 instead of 0
        # (presumably the API's first page starts at 1 — unverified).
        if first_row == 0:
            first_row = 1
        page_url = base + '&firstRow=' + str(first_row) + '&totalRows=' + str(end)
        job.add_job('s591', data_parse, page_url)
Example #4
0
def page_parse(board, start, end):
    """Queue a data-parse ('ptt') job for each index page of *board*
    from page *start* through page *end*, inclusive.

    Side effects: enqueues jobs via ``Job.add_job``; returns nothing.
    """
    job = Job()
    page_no = int(start)
    last_page = int(end)
    while page_no <= last_page:
        page_url = 'https://www.ptt.cc/bbs/' + board + '/index%d.html' % page_no
        job.add_job('ptt', data_parse, page_url)
        page_no += 1