def main():
    # Python 2: force UTF-8 as the default string encoding
    reload(sys)
    sys.setdefaultencoding('utf8')

    # search for 'python' jobs in Hangzhou (杭州)
    spider = Spider('python', '杭州')
    # salary filter thresholds
    spider.setSalay(5.9, 16, 10.9, 31.0)
    # companies to skip
    spider.addShieldCompany('畅唐网络')
    spider.addShieldCompany('中国亿教亿学网')
    # text the posting should mention
    spider.addContainText('C++')
    spider.addContainText('c++')
    #spider.addContainText('爬虫')  # '爬虫' = "crawler"
    spider.analyse()
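The Spider driven here is not defined in this excerpt. Below is a minimal sketch of its filter half, written to match the calls above: the constructor arguments and method names (setSalay, addShieldCompany, addContainText) come from main(), but the stored fields and the accepts() check are assumptions about how such filters are typically kept, not the original implementation. In particular, whether addContainText marks required or excluded text is not visible from the call site; the sketch treats it as required.

# -*- coding: utf-8 -*-

class Spider(object):
    def __init__(self, keyword, city):
        self.keyword = keyword        # search keyword, e.g. 'python'
        self.city = city              # city filter, e.g. '杭州'
        self.shield_companies = []    # companies whose postings are skipped
        self.contain_texts = []       # strings a posting should mention
        self.salary = None

    def setSalay(self, min_low, max_low, min_high, max_high):
        # assumed meaning: bounds on the low and high ends of the
        # advertised salary range
        self.salary = (min_low, max_low, min_high, max_high)

    def addShieldCompany(self, name):
        self.shield_companies.append(name)

    def addContainText(self, text):
        self.contain_texts.append(text)

    def accepts(self, job):
        # hypothetical helper; job is assumed to be a dict with
        # 'company' and 'description' keys
        if job['company'] in self.shield_companies:
            return False
        if self.contain_texts:
            return any(t in job['description'] for t in self.contain_texts)
        return True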
def main():
    # project dir
    create_dir(ROOT)
    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)

    # read the URL list
    file = open('msglist.json')
    text = file.read()
    file.close()
    urls = json.loads(text)

    # load URLs already crawled on a previous run
    urls_visited = []
    if os.path.exists('visited.txt'):
        file = open('visited.txt', 'r')
        for line in file:
            urls_visited.append(line.rstrip())
        file.close()

    # queue every unvisited URL, remembering its title
    urlmap = {}
    for item in urls:
        title = item['title']
        url = item['url']
        if url in urls_visited:
            print 'visited', url
            continue
        urlmap[url] = title
        queue.put(url)

    # start; append each finished URL to visited.txt
    file = open('visited.txt', 'a')
    while not queue.empty():
        url = queue.get()
        print "crawl ", url
        logging.info('now crawl %s', url)
        Spider.crawl(url)
        print "analyse ", url
        logging.info('now analyse %s', url)
        images = Spider.analyse()
        queue.task_done()
        visited.add(url)
        save(images, urlmap[url])
        file.write(url + '\n')
        file.flush()
    file.close()

    print 'finished'
    logging.info('finished')
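The save() and create_dir() helpers called above are not shown in this excerpt. Here is a minimal sketch under the assumption that images is a list of image URLs and that each article's images land in a directory named after its title under ROOT; the urllib2 download and the filename scheme are illustrative, not the original code.

# -*- coding: utf-8 -*-
import os
import urllib2

ROOT = 'download'  # assumed project directory

def create_dir(path):
    # create the directory if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)

def save(images, title):
    # drop characters that are illegal in directory names
    safe_title = ''.join(c for c in title if c not in '\\/:*?"<>|')
    target = os.path.join(ROOT, safe_title)
    create_dir(target)
    for i, src in enumerate(images):
        data = urllib2.urlopen(src, timeout=10).read()
        ext = os.path.splitext(src)[1] or '.jpg'
        path = os.path.join(target, str(i) + ext)
        with open(path, 'wb') as f:
            f.write(data)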
def main():
    # project dir
    create_dir(ROOT)
    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)
    queue.put(URL)

    # start the breadth-first crawl
    while not queue.empty():
        url = queue.get()
        print "crawl ", url
        logging.info('now crawl %s', url)
        html = Spider.crawl(url)
        images = Spider.analyse(html)
        links = Spider.analyse_links(html)
        queue.task_done()
        visited.add(url)
        save(images)

        # enqueue new same-site links that match none of the IGNORES patterns
        for link in links:
            if (link not in visited) and link.startswith('http://pp.163.com/'):
                exist = False
                for ignore in IGNORES:
                    if re.search(ignore, link):
                        #logging.info("exclude %s", link)
                        exist = True
                        break
                if not exist:
                    queue.put(link)

    print 'done'
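Spider.analyse_links, which feeds the queue above, is also not shown. One plausible minimal reading is a regex scan for same-site href values; the pattern, the function name as a free function, and the deduplication are assumptions, not the original method.

# -*- coding: utf-8 -*-
import re

# hypothetical pattern: absolute pp.163.com links found in href attributes
HREF_RE = re.compile(r'href=["\'](http://pp\.163\.com/[^"\']+)["\']')

def analyse_links(html):
    # return candidate links, deduplicated while preserving page order
    seen = set()
    links = []
    for link in HREF_RE.findall(html):
        if link not in seen:
            seen.add(link)
            links.append(link)
    return links

Because main() re-checks every link against visited and the IGNORES patterns before queueing it, the extractor itself can stay this simple.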