def data_parse(url):
    """Scan one PTT board index page and enqueue a body-parse job per article.

    :param url: absolute URL of a PTT board index page
                (e.g. ``https://www.ptt.cc/bbs/<board>/index<N>.html``).
    Side effects: builds a ``Job`` and schedules one 'ptt' job per article
    link via ``job.add_job``.

    Fixes over the original: removed the unused ``page_data`` local and
    stopped the loop from shadowing the ``url`` parameter.
    """
    job = Job()
    # The over18 cookie bypasses the age-confirmation interstitial that
    # some boards (e.g. Gossiping) put in front of their pages.
    res = requests.get(url=url, cookies={'over18': '1'}).text
    soup = BeautifulSoup(res, 'lxml')
    for row in soup.select('.r-ent'):
        # NOTE(review): deleted articles render a .r-ent with no <a>, which
        # would make select('a')[0] raise IndexError — confirm against live
        # board pages whether that case needs guarding.
        article_href = row.select('a')[0]['href']
        job.add_job('ptt', body_parse, 'https://www.ptt.cc' + article_href)
def parse(year, semester):
    """Fetch the THU course catalogue for *year*/*semester* and queue one
    'thu' department-parse job per department link found on the page.

    :param year: academic year (int or str).
    :param semester: semester number (int or str).
    """
    domain = 'http://course.thu.edu.tw'
    catalogue_url = f'{domain}/view-dept/{year}/{semester}/everything'
    job = Job()
    response = requests.get(catalogue_url)
    soup = BeautifulSoup(response.text, 'lxml')
    # Each <tr><a> on the catalogue page links to one department listing.
    for link in soup.select('tr a'):
        job.add_job('thu', dp_parse, domain + link['href'])
def parse(start, end):
    """Page through rent.591.com.tw search results, 30 rows per page,
    queueing one 's591' data-parse job per listing page.

    :param start: first row offset to fetch.
    :param end: last row to fetch; pass -1 to first ask the API for the
                total record count and page through everything.

    BUG FIX: the query string previously contained '®ion=1' — mojibake of
    '&region=1', where the HTML entity '&reg' had been decoded to '®'.
    As written, the region filter was silently dropped from every request.
    """
    job = Job()
    base = ('https://rent.591.com.tw/home/search/rsList'
            '?is_new_list=1&type=1&kind=0&searchtype=1&region=1')
    if end == -1:
        # One probe request to learn the total record count ('records' comes
        # back as a comma-grouped string, e.g. "12,345").
        res = requests.get(
            base,
            headers={
                'Host': 'rent.591.com.tw',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/59.0.3071.115 Safari/537.36',
            },
        )
        d = json.loads(res.text)
        end = int(d['records'].replace(",", ""))
    for i in range(start, end, 30):
        if i == 0:
            # Original quirk kept: row 0 is requested as firstRow=1 —
            # presumably the API treats them the same; verify if paging
            # ever skips the very first listing.
            i = 1
        url = base + '&firstRow=' + str(i) + '&totalRows=' + str(end)
        job.add_job('s591', data_parse, url)
def page_parse(board, start, end):
    """Schedule a 'ptt' data-parse job for every index page of *board*.

    :param board: PTT board name (string, inserted into the URL path).
    :param start: first index page number (coerced with int()).
    :param end: last index page number, inclusive (coerced with int()).
    """
    job = Job()
    first, last = int(start), int(end)
    for page in range(first, last + 1):
        page_url = 'https://www.ptt.cc/bbs/' + board + '/index%d.html' % page
        job.add_job('ptt', data_parse, page_url)