# -*- coding: utf-8 -*-
import datetime
import hashlib
import time

import requests
from lxml import etree
from selenium import webdriver

# get_urls(), get_keyword(), get_authorurl(), author_urls and item_fileds()
# are defined elsewhere in the project and are used as-is below.


def selenium_uc_spider():
    # Crawl UC (UC号) articles linked from the index page.
    driver = webdriver.PhantomJS()
    driver.get("http://www.vyi.cc/uc/index.php")
    time.sleep(1)
    urls = []
    # Collect the article hrefs first, then visit them one by one.
    for url in driver.find_elements_by_xpath(u"//a[contains(.,'进入UC')]"):
        uri = url.get_attribute("href")
        urls.append(uri)
    for url in urls:
        item = {}
        item["url"] = url
        try:
            driver.get(url)
            item['title'] = driver.find_element_by_xpath("//h1").text
            item['pubtime'] = driver.find_element_by_xpath(
                "//p[@class='wmAuthor__header-wm-info_detail__3ccfd1e8a9']/span[last()-1]"
            ).text
            item['content'] = driver.find_element_by_xpath(
                "//div[@class='article-content uc-nf-fontsize-change-dom simple-ui']"
            ).text
            item['author'] = driver.find_element_by_xpath("//h3/p").text
        except Exception:
            continue
        item['site_name'] = u'UC号'
        # Fingerprint the URL for de-duplication.
        md5 = hashlib.md5()
        md5.update(item["url"])
        item['url_md5'] = md5.hexdigest()
        item_fileds(item, "data_wemedia", False)
    driver.quit()
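# All of the spiders in this file hand their results to item_fileds(item,
# table, flag), which is defined elsewhere in the project and is not shown
# here. To run any snippet in isolation, a minimal stand-in like the
# following can be used; the validation and the print are placeholders, and
# the meaning of the boolean flag is an assumption (presumably an
# insert-or-update switch in the real helper).
try:
    item_fileds  # provided by the project
except NameError:
    def item_fileds(item, table, flag):
        # Placeholder: sanity-check the fields and echo the record.
        for key in ("url", "url_md5", "site_name"):
            if key not in item:
                raise ValueError("missing field: %s" % key)
        print "[%s] %s %s" % (table, item["url_md5"], item.get("title", ""))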
def selenium_toutiao_spider():
    # Crawl Toutiao (头条号) articles linked from the index page.
    driver = webdriver.PhantomJS()
    driver.get("http://vyi.wangzherongyao.cn/toutiao/index.php")
    time.sleep(1)
    urls = []
    for url in driver.find_elements_by_xpath(u"//a[contains(.,'查看')]"):
        uri = url.get_attribute("href")
        urls.append(uri)
    for url in urls:
        item = {}
        item["url"] = url
        try:
            driver.get(url)
            item['title'] = driver.find_element_by_xpath("//h1").text
            item['pubtime'] = driver.find_element_by_xpath(
                "//div[@class='article-sub']/span[last()]").text
            item['content'] = driver.find_element_by_xpath(
                "//div[@class='article-content']").text
            item['author'] = driver.find_element_by_xpath(
                "//div[@class='article-sub']/span[last()-1]").text
            item['author_url'] = driver.find_element_by_xpath(
                "//div[@class='user-card-name']/a").get_attribute("href")
        except Exception:
            continue
        item['site_name'] = u'头条号'
        md5 = hashlib.md5()
        md5.update(item["url"])
        item['url_md5'] = md5.hexdigest()
        item_fileds(item, "data_wemedia", False)
    driver.quit()
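# Every spider in this file repeats the same three md5 lines to build
# url_md5. If that were factored out, the helper would look like this; it is
# a suggested refactor, not code the functions currently call. Note that
# under Python 2 a unicode URL containing non-ASCII characters must be
# encoded before hashing.
def url_fingerprint(url):
    # Hex md5 of the article URL, used for de-duplication.
    md5 = hashlib.md5()
    md5.update(url.encode("utf-8") if isinstance(url, unicode) else url)
    return md5.hexdigest()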
def yidian_spider():
    # Crawl Yidian (一点号) articles from a list of author pages.
    driver = webdriver.PhantomJS()
    for url in author_urls:
        driver.get(url)
        time.sleep(1)
        try:
            # Extract the hrefs before navigating away; the elements go
            # stale as soon as driver.get() loads the first article.
            hrefs = [a.get_attribute("href") for a in driver.find_elements_by_xpath(
                "//div[@class='channel-news channel-news-0']/a")]
        except Exception:
            continue
        for href in hrefs:
            item = {}
            try:
                driver.get(href)
                item['url'] = driver.current_url
                item['title'] = driver.find_element_by_xpath("//h2").text
                item['pubtime'] = driver.find_element_by_xpath(
                    "//div[@class='meta']/span[last()-1]").text
                item['content'] = driver.find_element_by_xpath(
                    "//div[@class='imedia-article']").text
                item['author'] = driver.find_element_by_xpath(
                    "//a[@class='doc-source']").text
                item['author_url'] = url
            except Exception:
                continue
            item['site_name'] = u'一点号'
            md5 = hashlib.md5()
            md5.update(item["url"])
            item['url_md5'] = md5.hexdigest()
            item_fileds(item, "data_wemedia", False)
    driver.quit()
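# yidian_spider() iterates over author_urls, a module-level list of Yidian
# author-page URLs that is not defined in this file; presumably it comes
# from the same source that backs get_urls()/get_authorurl(). A hypothetical
# fallback for local runs (the file name is ours, not the project's):
try:
    author_urls  # provided by the project
except NameError:
    def _load_author_urls(path="yidian_author_urls.txt"):
        # One author-page URL per line.
        with open(path) as fp:
            return [line.strip() for line in fp if line.strip()]
    author_urls = _load_author_urls()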
def selenium_spider():
    # Collect Baijiahao (百家号) author links and store them as seed URLs.
    urls = get_urls()
    driver = webdriver.Firefox()
    for url in urls:
        driver.get(url)
        a = driver.find_element_by_xpath("//div[@class='detail']/a")
        uri = a.get_attribute("href")
        item = {}
        item["url"] = uri
        item['site_name'] = u'百家号'
        md5 = hashlib.md5()
        md5.update(item["url"])
        item['url_md5'] = md5.hexdigest()
        item_fileds(item, "author_url", False)
    driver.quit()
def selenium_toutiaoauthor_spider():
    # Crawl Baijiahao (百家号) articles from each author page returned by get_urls().
    uris = get_urls()
    for uri in uris:
        try:
            # A fresh PhantomJS instance is started for every author page.
            driver = webdriver.PhantomJS()
            driver.get(uri)
            time.sleep(1)
            urls = []
            for url in driver.find_elements_by_xpath(u"//a[@class='typeNews']"):
                url = url.get_attribute("href")
                urls.append(url)
        except Exception:
            continue
        for urll in urls:
            item = {}
            item["url"] = urll
            try:
                driver.get(urll)
                item['title'] = driver.find_element_by_xpath("//h1").text
                item['pubtime'] = driver.find_element_by_xpath(
                    "//div[@class='infoSet']/span[last()]").text
                item['content'] = driver.find_element_by_xpath(
                    "//div[@class='mainContent iosStyle']").text
                item['author'] = driver.find_element_by_xpath(
                    "//div[@class='authorName']").text
                item['author_url'] = uri
            except Exception:
                print "ERROR"
                continue
            item['site_name'] = u'百家号'
            md5 = hashlib.md5()
            md5.update(item["url"])
            item['url_md5'] = md5.hexdigest()
            # An 11-character timestamp (presumably "MM-DD HH:MM") lacks the
            # year, so prepend the current one.
            if len(item['pubtime']) == 11:
                item['pubtime'] = "%s-%s" % (datetime.datetime.now().year,
                                             item['pubtime'])
            item_fileds(item, "data_wemedia", False)
            time.sleep(10)
    driver.quit()
def spider_run():
    chrome_options = webdriver.ChromeOptions()
    # Skip image downloads to speed up page loads.
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    # Headless browsing
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    driver.get("http://www.yidianzixun.com")
    keywords = get_keyword()
    urls = []
    # Search each keyword on Yidian and collect the result links.
    for word in keywords:
        driver.find_element_by_xpath(
            "//input[@class='input input-search']").send_keys(u"%s" % word)
        driver.find_element_by_xpath(
            "//button[@class='btn btn-search']").click()
        time.sleep(5)
        for uri in driver.find_elements_by_xpath(
                "//div[@class='channel-news channel-news-0']/a"):
            urls.append(uri.get_attribute("href"))
    # The browser is no longer needed once the result links are collected.
    driver.quit()
    # The article pages themselves are fetched with requests + lxml.
    for url in urls:
        html = requests.get(url).content
        dom = etree.HTML(html)
        item = {}
        try:
            item['url'] = url
            item['title'] = "".join(dom.xpath("//h2//text()"))
            item['pubtime'] = "".join(
                dom.xpath("//div[@class='meta']/span[last()-1]//text()"))
            item['content'] = "".join(
                dom.xpath("//div[@class='imedia-article']//text()"))
            item['author'] = "".join(
                dom.xpath("//a[@class='doc-source']//text()"))
            item['author_url'] = "http://www.yidianzixun.com" + "".join(
                dom.xpath("//a[@class='wemedia-name']/@href"))
        except Exception:
            continue
        item['site_name'] = u'一点号'
        md5 = hashlib.md5()
        md5.update(item["url"])
        item['url_md5'] = md5.hexdigest()
        item_fileds(item, "data_wemedia", False)
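# spider_run() above and the two Chrome-based spiders below each rebuild the
# same ChromeOptions (images disabled and, optionally, headless mode). If
# that setup is to be reused, it could be pulled into a small factory like
# this; a refactoring sketch, not code the functions currently call.
def make_chrome_driver(headless=True, load_images=False):
    chrome_options = webdriver.ChromeOptions()
    if not load_images:
        # Skip image downloads to speed up page loads.
        chrome_options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 2})
    if headless:
        chrome_options.add_argument('--headless')
    return webdriver.Chrome(chrome_options=chrome_options)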
def selenium_toutiao_spider():
    # Collect article links from Sohu (搜狐号) author pages with Selenium,
    # then parse the articles themselves with requests + lxml.
    chrome_options = webdriver.ChromeOptions()
    # firefox_options = webdriver.FirefoxOptions()
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # chrome_options.add_experimental_option("prefs", prefs)
    # Headless browsing
    # chrome_options.add_argument('--headless')
    # firefox_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    # driver = webdriver.Firefox(firefox_options=firefox_options)
    author_urls = get_authorurl()
    urls = []
    for url in author_urls:
        driver.get(url)
        time.sleep(1)
        for url in driver.find_elements_by_xpath(
                "//div[@class='content-inner']//a"):
            uri = url.get_attribute("href")
            urls.append(uri)
    driver.quit()
    for url in urls:
        item = {}
        item["url"] = url
        try:
            html = requests.get(url).content
            dom = etree.HTML(html)
            item['title'] = "".join(dom.xpath("//h1//text()"))
            item['pubtime'] = "".join(
                dom.xpath("//span[@class='at-time']/text()"))
            item['content'] = "".join(
                dom.xpath("//div[@class='at-cnt-main']//text()"))
            item['author'] = "".join(
                dom.xpath("//span[@class='at-media-name']//text()"))
        except Exception:
            continue
        item['site_name'] = u'搜狐号'
        md5 = hashlib.md5()
        md5.update(item["url"])
        item['url_md5'] = md5.hexdigest()
        item_fileds(item, "data_wemedia", True)
def selenium_toutiao_spider():
    # Search Toutiao (头条号) for each keyword and crawl the result articles.
    chrome_options = webdriver.ChromeOptions()
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # chrome_options.add_experimental_option("prefs", prefs)
    # Headless browsing
    # chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    keywords = get_keyword()
    for word in keywords:
        driver.get(u"https://www.toutiao.com/search/?keyword={}".format(word))
        time.sleep(10)
        urls = []
        for url in driver.find_elements_by_xpath(u"//a[@class='link title']"):
            uri = url.get_attribute("href")
            urls.append(uri)
        for url in urls:
            item = {}
            item["url"] = url
            try:
                driver.get(url)
                item['title'] = driver.find_element_by_xpath("//h1").text
                item['pubtime'] = driver.find_element_by_xpath(
                    "//div[@class='article-sub']/span[last()]").text
                item['content'] = driver.find_element_by_xpath(
                    "//div[@class='article-content']").text
                item['author'] = driver.find_element_by_xpath(
                    "//div[@class='article-sub']/span[last()-1]").text
                item['author_url'] = driver.find_element_by_xpath(
                    "//div[@class='user-card-name']/a").get_attribute("href")
            except Exception:
                continue
            item['site_name'] = u'头条号'
            md5 = hashlib.md5()
            md5.update(item["url"])
            item['url_md5'] = md5.hexdigest()
            item_fileds(item, "data_wemedia", True)
    driver.quit()
def selenium_toutiaoauthor_spider():
    # Crawl Toutiao (头条号) articles from each author page returned by get_urls().
    uris = get_urls()
    for uri in uris:
        try:
            driver = webdriver.PhantomJS()
            driver.get(uri)
            time.sleep(1)
            urls = []
            for url in driver.find_elements_by_xpath(u"//a[@class='link title']"):
                url = url.get_attribute("href")
                urls.append(url)
        except Exception:
            continue
        for urll in urls:
            item = {}
            item["url"] = urll
            try:
                driver.get(urll)
                item['title'] = driver.find_element_by_xpath("//h1").text
                item['pubtime'] = driver.find_element_by_xpath(
                    "//div[@class='article-sub']/span[2]").text
                item['content'] = driver.find_element_by_xpath(
                    "//div[@class='article-content']").text
                item['author'] = driver.find_element_by_xpath(
                    "//div[@class='article-sub']/span[1]").text
                item['author_url'] = driver.find_element_by_xpath(
                    "//div[@class='user-card-name']/a").get_attribute("href")
            except Exception:
                continue
            item['site_name'] = u'头条号'
            md5 = hashlib.md5()
            md5.update(item["url"])
            item['url_md5'] = md5.hexdigest()
            item_fileds(item, "data_wemedia", False)
    driver.quit()
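# In several of the functions above the trailing driver.quit() is skipped
# whenever an exception escapes the loops, and the author-page spiders start
# a new PhantomJS per author, so browser processes can be leaked. A small
# context manager along these lines would guarantee cleanup; it is a sketch
# of one possible fix, not part of the original code.
from contextlib import contextmanager

@contextmanager
def managed_driver(factory=webdriver.PhantomJS):
    driver = factory()
    try:
        yield driver
    finally:
        # Always close the browser, even if scraping raised.
        driver.quit()

# Usage sketch:
# with managed_driver() as driver:
#     driver.get("http://www.vyi.cc/uc/index.php")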