Example #1
import re
import requests
from bs4 import BeautifulSoup
# The Crawler class comes from the surrounding project; its import is not shown here.

def content_func(res):
    # Collect programming-language keywords from each job-detail block
    words = []
    soup = BeautifulSoup(res.text, 'lxml')
    for dd in soup.select('div.job-detail-box'):
        # Strip URLs, then replace newlines and the full-width comma with spaces
        string_word = re.sub(r'https?://\S+', '', dd.text.replace('\n', ' ').replace('、', ' '))
        words += list(set(re.findall('objective c|visual basic|[A-Za-z.+#]+', string_word, re.IGNORECASE)))
    return list(set(words))

# Get the total number of pages from the site you want to crawl
url = 'https://www.518.com.tw/job-index.html?i=1&am=1&ab=2032001,2032002,&i=1&am=1&ai=1&scr=0&ac='
res = requests.get(url)
soup = BeautifulSoup(res.text, 'lxml')
page = int(soup.select('span.pagecountnum > span')[0].text.split('/')[1])


# Create an instance of the Crawler class
crawler = Crawler(open_thread=True)
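# page_func is used below but not shown in this excerpt. A hypothetical
# sketch, assuming each 518.com.tw listing page links to job-detail pages
# through <a> tags whose href contains "job-" (the selector is a guess):
def page_func(res):
    soup = BeautifulSoup(res.text, 'lxml')
    return [a['href'] for a in soup.select('a[href*="job-"]')]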

#  Call grab_pagelinks_th_auto to get all links
page_url = 'https://www.518.com.tw/job-index-P-{}.html?i=1&am=1&ab=2032001,2032002,&ai=1,'
crawler.grab_pagelinks_th_auto(page_url, page_func, page, sleep_time=1)

# Call get_alinks to get all links crawled from previous pages
links = crawler.get_alinks()

# Call grab_content_th_auto to get content page by page
crawler.grab_content_th_auto(links, content_func, sleep_time=2)

# Call get_counter to get word count result
print(crawler.get_counter().most_common())

with open('518_1_new.csv', 'w') as f:
    for lang, counts in crawler.get_counter().most_common():
        f.write('{},{}\n'.format(lang, counts))
    "cookies": "over18=1"
}
board = "Soft_Job"
# We want to crawl a PTT board, but the links scraped from each page lack the base URL, so it has to be prepended manually
res = requests.get(URL + "bbs/" + board + "/index.html", headers=headers)
soup = BeautifulSoup(res.text, 'lxml')
buttons = soup.select('a.btn.wide')
# totalpage = the page number in the "previous page" button's link + 1
totalpage = int(buttons[1]['href'].split('index')[1].split('.')[0]) + 1

crawler = Crawler(open_thread=True)
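# pttURL_crawler is used below but not defined in this excerpt. A hypothetical
# sketch, assuming PTT's standard index markup: article links live in
# div.title > a and their hrefs are relative, so the base URL is prepended.
def pttURL_crawler(res):
    soup = BeautifulSoup(res.text, 'lxml')
    return [URL.rstrip('/') + a['href'] for a in soup.select('div.title > a')]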

page_url = URL + "bbs/" + board + "/index{}.html"
crawler.grab_pagelinks_th_auto(page_url,
                               pttURL_crawler,
                               totalpage,
                               sleep_time=1,
                               header=headers)

links = crawler.get_alinks()

print(links)
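# content_func is not defined in this excerpt either; a hypothetical sketch
# that pulls language keywords out of each article body (div#main-content is
# PTT's standard content container):
def content_func(res):
    soup = BeautifulSoup(res.text, 'lxml')
    text = soup.select_one('div#main-content').text.replace('\n', ' ')
    return list(set(re.findall('objective c|visual basic|[A-Za-z.+#]+', text, re.IGNORECASE)))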
# Call grab_content_th_auto to get content page by page
crawler.grab_content_th_auto(links, content_func, sleep_time=2)

# Call get_counter to get word count result
print(crawler.get_counter().most_common())

with open('pttsoft_1_new.csv', 'w') as f:
    for lang, counts in crawler.get_counter().most_common():
        f.write('{},{}\n'.format(lang, counts))
Example #3
import re
from bs4 import BeautifulSoup

def content_func(res):
    words = []
    soup = BeautifulSoup(res.text, 'lxml')
    for dl in soup.select('div.content > dl'):
        # Strip URLs, then replace newlines and the full-width comma with spaces
        string_word = re.sub(r'https?://\S+', '', dl.text.replace('\n', ' ').replace('、', ' '))
        words += list(set(re.findall('java script|objective c|visual basic|[A-Za-z.+#]+', string_word, re.IGNORECASE)))
    return words


# Create an instance of the Crawler class
crawler = Crawler(open_thread=True)
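# page_func is used below but not shown in this excerpt. A hypothetical
# sketch, assuming the 104.com.tw result list links to job pages through
# <a> tags whose href contains "/job/" (the selector is a guess):
def page_func(res):
    soup = BeautifulSoup(res.text, 'lxml')
    return [a['href'] for a in soup.select('a[href*="/job/"]')]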

#  Call grab_pagelinks_th_auto to get all links
page_url = ('https://www.104.com.tw/jobbank/joblist/joblist.cfm?jobsource=n104bank1&ro=0&'
            'jobcat=2007000000&order=2&asc=0&page={}&psl=N_A')
crawler.grab_pagelinks_th_auto(page_url, page_func, 150, sleep_time=2)  # crawl the first 150 result pages

# Call get_alinks to get all links crawled from previous pages
links = crawler.get_alinks()


# Call grab_content_th_auto to get content page by page
crawler.grab_content_th_auto(links, content_func, sleep_time=2)

# Call get_counter to get word count result
print(crawler.get_counter().most_common())

with open('104_1_new.csv', 'w') as f:
    for lang, counts in crawler.get_counter().most_common():
        f.write('{},{}\n'.format(lang, counts))