# Beispiel #1
# 0
def selenium_uc_spider():
    """Crawl UC (UC号) articles linked from the index page and persist them.

    Collects every link whose text contains '进入UC', visits each article,
    extracts title/pubtime/content/author and stores the record via
    item_fileds() into "data_wemedia".
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get("http://www.vyi.cc/uc/index.php")
        time.sleep(1)  # give the JS-rendered page time to load
        # Collect hrefs up front: navigating away invalidates live elements.
        urls = [a.get_attribute("href")
                for a in driver.find_elements_by_xpath(
                    u"//a[contains(.,'进入UC')]")]
        for url in urls:
            item = {"url": url}
            try:
                driver.get(url)
                item['title'] = driver.find_element_by_xpath("//h1").text
                item['pubtime'] = driver.find_element_by_xpath(
                    "//p[@class='wmAuthor__header-wm-info_detail__3ccfd1e8a9']/span[last()-1]"
                ).text
                item['content'] = driver.find_element_by_xpath(
                    "//div[@class='article-content uc-nf-fontsize-change-dom simple-ui']"
                ).text
                item['author'] = driver.find_element_by_xpath("//h3/p").text
            except Exception:
                # Best-effort: skip pages whose layout does not match.
                continue
            item['site_name'] = u'UC号'
            item['url_md5'] = hashlib.md5(item["url"]).hexdigest()
            item_fileds(item, "data_wemedia", False)
    finally:
        # Always release the PhantomJS process, even on unexpected errors.
        driver.quit()
# Beispiel #2
# 0
def selenium_toutiao_spider():
    """Crawl Toutiao (头条号) articles linked from the index page.

    Collects every link whose text contains '查看', visits each article,
    extracts title/pubtime/content/author/author_url and stores the record
    via item_fileds() into "data_wemedia".
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get("http://vyi.wangzherongyao.cn/toutiao/index.php")
        time.sleep(1)  # give the JS-rendered page time to load
        # Collect hrefs up front: navigating away invalidates live elements.
        urls = [a.get_attribute("href")
                for a in driver.find_elements_by_xpath(
                    u"//a[contains(.,'查看')]")]
        for url in urls:
            item = {"url": url}
            try:
                driver.get(url)
                item['title'] = driver.find_element_by_xpath("//h1").text
                item['pubtime'] = driver.find_element_by_xpath(
                    "//div[@class='article-sub']/span[last()]").text
                item['content'] = driver.find_element_by_xpath(
                    "//div[@class='article-content']").text
                item['author'] = driver.find_element_by_xpath(
                    "//div[@class='article-sub']/span[last()-1]").text
                item['author_url'] = driver.find_element_by_xpath(
                    "//div[@class='user-card-name']/a").get_attribute("href")
            except Exception:
                # Best-effort: skip pages whose layout does not match.
                continue
            item['site_name'] = u'头条号'
            item['url_md5'] = hashlib.md5(item["url"]).hexdigest()
            item_fileds(item, "data_wemedia", False)
    finally:
        # Always release the PhantomJS process, even on unexpected errors.
        driver.quit()
# Beispiel #3
# 0
def yidian_spider():
    """Crawl Yidian (一点号) articles from each author page in author_urls.

    For every author page, collects the channel-news article links, visits
    each, extracts title/pubtime/content/author and stores the record via
    item_fileds() into "data_wemedia".

    NOTE: relies on the module-level ``author_urls`` iterable.
    """
    driver = webdriver.PhantomJS()
    try:
        for author_url in author_urls:
            driver.get(author_url)
            time.sleep(1)  # give the JS-rendered page time to load
            try:
                # Materialize hrefs immediately: the original iterated live
                # WebElements while navigating, which makes them stale.
                hrefs = [a.get_attribute("href")
                         for a in driver.find_elements_by_xpath(
                             "//div[@class='channel-news channel-news-0']/a")]
            except Exception:
                continue
            for href in hrefs:
                item = {}
                try:
                    driver.get(href)
                    item['url'] = driver.current_url
                    item['title'] = driver.find_element_by_xpath("//h2").text
                    item['pubtime'] = driver.find_element_by_xpath(
                        "//div[@class='meta']/span[last()-1]").text
                    item['content'] = driver.find_element_by_xpath(
                        "//div[@class='imedia-article']").text
                    item['author'] = driver.find_element_by_xpath(
                        "//a[@class='doc-source']").text
                    item['author_url'] = author_url
                except Exception:
                    # Best-effort: skip pages whose layout does not match.
                    continue
                item['site_name'] = u'一点号'
                item['url_md5'] = hashlib.md5(item["url"]).hexdigest()
                item_fileds(item, "data_wemedia", False)
    finally:
        # Always release the PhantomJS process, even on unexpected errors.
        driver.quit()
def selenium_spider():
    """Resolve Baijia (百家号) author page links and store them.

    Visits every url from get_urls(), extracts the first detail link and
    stores it via item_fileds() into "author_url".
    """
    urls = get_urls()
    driver = webdriver.Firefox()
    try:
        for url in urls:
            try:
                driver.get(url)
                anchor = driver.find_element_by_xpath(
                    "//div[@class='detail']/a")
                uri = anchor.get_attribute("href")
            except Exception:
                # Missing element previously aborted the whole run and
                # leaked the browser; skip this page instead.
                continue
            item = {"url": uri}
            item['site_name'] = u'百家号'
            item['url_md5'] = hashlib.md5(item["url"]).hexdigest()
            item_fileds(item, "author_url", False)
    finally:
        # Always release the Firefox process, even on unexpected errors.
        driver.quit()
def selenium_toutiaoauthor_spider():
    """Crawl Baijia (百家号) articles from each author page in get_urls().

    A fresh PhantomJS instance is started per author page; article links
    (class 'typeNews') are collected, each article is visited and the
    extracted record is stored via item_fileds() into "data_wemedia".
    """
    uris = get_urls()

    for uri in uris:
        # One driver per author page; try/finally guarantees it is quit.
        # (The original leaked the process when the link-collection step
        # hit `except: continue` before reaching driver.quit().)
        driver = webdriver.PhantomJS()
        try:
            try:
                driver.get(uri)
                time.sleep(1)  # give the JS-rendered page time to load
                urls = [a.get_attribute("href")
                        for a in driver.find_elements_by_xpath(
                            u"//a[@class='typeNews']")]
            except Exception:
                continue
            for article_url in urls:
                item = {"url": article_url}
                try:
                    driver.get(article_url)
                    item['title'] = driver.find_element_by_xpath("//h1").text
                    item['pubtime'] = driver.find_element_by_xpath(
                        "//div[@class='infoSet']/span[last()]").text
                    item['content'] = driver.find_element_by_xpath(
                        "//div[@class='mainContent iosStyle']").text
                    item['author'] = driver.find_element_by_xpath(
                        "//div[@class='authorName']").text
                    item['author_url'] = uri
                except Exception:
                    print "ERROR"
                    continue
                item['site_name'] = u'百家号'
                item['url_md5'] = hashlib.md5(item["url"]).hexdigest()
                # Site shows "MM-DD HH:MM" (11 chars) for current-year
                # posts; prepend the year to normalize.
                if len(item['pubtime']) == 11:
                    item['pubtime'] = "%s-%s" % (
                        datetime.datetime.now().year, item['pubtime'])
                item_fileds(item, "data_wemedia", False)
                time.sleep(10)  # throttle to avoid anti-crawler bans
        finally:
            driver.quit()
# Beispiel #6
# 0
def spider_run():
    """Search Yidianzixun for each keyword and store matching articles.

    Uses headless Chrome to run the on-site search for every keyword from
    get_keyword(), collects result links, then fetches each article with
    requests/lxml and stores the record via item_fileds().
    """
    chrome_options = webdriver.ChromeOptions()
    # Disable image loading to speed up page rendering.
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    # Headless browsing
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    urls = []
    try:
        driver.get("http://www.yidianzixun.com")
        keywords = get_keyword()
        for word in keywords:
            search_box = driver.find_element_by_xpath(
                "//input[@class='input input-search']")
            # BUG FIX: clear the box first — send_keys appends, so every
            # keyword used to be concatenated onto the previous ones.
            search_box.clear()
            search_box.send_keys(u"%s" % word)
            driver.find_element_by_xpath(
                "//button[@class='btn btn-search']").click()
            time.sleep(5)  # wait for the search results to render
            for anchor in driver.find_elements_by_xpath(
                    "//div[@class='channel-news channel-news-0']/a"):
                urls.append(anchor.get_attribute("href"))
    finally:
        # The original never quit the browser; release it here.
        driver.quit()
    for url in urls:
        html = requests.get(url).content
        dom = etree.HTML(html)
        item = {}
        try:
            item['url'] = url
            item['title'] = "".join(dom.xpath("//h2//text()"))
            item['pubtime'] = "".join(
                dom.xpath("//div[@class='meta']/span[last()-1]//text()"))
            item['content'] = "".join(
                dom.xpath("//div[@class='imedia-article']//text()"))
            item['author'] = "".join(
                dom.xpath("//a[@class='doc-source']//text()"))
            item['author_url'] = "http://www.yidianzixun.com" + "".join(
                dom.xpath("//a[@class='wemedia-name']/@href"))
        except Exception:
            # Best-effort: skip documents whose layout does not match.
            continue
        item['site_name'] = u'一点号'
        item['url_md5'] = hashlib.md5(item["url"]).hexdigest()
        item_fileds(item, "data_wemedia", False)
# Beispiel #7
# 0
def selenium_toutiao_spider():
    """Crawl Sohu (搜狐号) articles from each author page.

    Uses Chrome to collect article links from every author page returned
    by get_authorurl(), then fetches each article with requests/lxml and
    stores the record via item_fileds() into "data_wemedia" (with upsert).
    """
    chrome_options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(chrome_options=chrome_options)
    urls = []
    try:
        author_url_list = get_authorurl()
        for author_url in author_url_list:
            driver.get(author_url)
            time.sleep(1)  # give the JS-rendered page time to load
            for anchor in driver.find_elements_by_xpath(
                    "//div[@class='content-inner']//a"):
                urls.append(anchor.get_attribute("href"))
    finally:
        # Release the browser even if link collection raises.
        driver.quit()
    for url in urls:
        item = {"url": url}
        try:
            html = requests.get(url).content
            dom = etree.HTML(html)
            item['title'] = "".join(dom.xpath("//h1//text()"))
            item['pubtime'] = "".join(
                dom.xpath("//span[@class='at-time']/text()"))
            item['content'] = "".join(
                dom.xpath("//div[@class='at-cnt-main']//text()"))
            item['author'] = "".join(
                dom.xpath("//span[@class='at-media-name']//text()"))
        except Exception:
            # Best-effort: skip documents that fail to fetch or parse.
            continue
        item['site_name'] = u'搜狐号'
        item['url_md5'] = hashlib.md5(item["url"]).hexdigest()
        item_fileds(item, "data_wemedia", True)
# Beispiel #8
# 0
def selenium_toutiao_spider():
    """Search Toutiao for each keyword and store matching articles.

    Runs the Toutiao site search for every keyword from get_keyword(),
    collects result links, visits each article and stores the record via
    item_fileds() into "data_wemedia" (with upsert).
    """
    chrome_options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(chrome_options=chrome_options)
    try:
        keywords = get_keyword()
        for word in keywords:
            driver.get(
                u"https://www.toutiao.com/search/?keyword={}".format(word))
            time.sleep(10)  # search results render slowly; wait them out
            urls = [a.get_attribute("href")
                    for a in driver.find_elements_by_xpath(
                        u"//a[@class='link title']")]
            for url in urls:
                item = {"url": url}
                try:
                    driver.get(url)
                    item['title'] = driver.find_element_by_xpath("//h1").text
                    item['pubtime'] = driver.find_element_by_xpath(
                        "//div[@class='article-sub']/span[last()]").text
                    item['content'] = driver.find_element_by_xpath(
                        "//div[@class='article-content']").text
                    item['author'] = driver.find_element_by_xpath(
                        "//div[@class='article-sub']/span[last()-1]").text
                    item['author_url'] = driver.find_element_by_xpath(
                        "//div[@class='user-card-name']/a").get_attribute(
                            "href")
                except Exception:
                    # Best-effort: skip pages whose layout does not match.
                    continue
                item['site_name'] = u'头条号'
                item['url_md5'] = hashlib.md5(item["url"]).hexdigest()
                item_fileds(item, "data_wemedia", True)
    finally:
        # BUG FIX: quit() was inside the keyword loop, so the browser was
        # killed after the first keyword and later driver.get calls failed.
        driver.quit()
# Beispiel #9
# 0
def selenium_toutiaoauthor_spider():
    """Crawl Toutiao (头条号) articles from each author page in get_urls().

    A fresh PhantomJS instance is started per author page; article links
    (class 'link title') are collected, each article is visited and the
    extracted record is stored via item_fileds() into "data_wemedia".
    """
    uris = get_urls()

    for uri in uris:
        # One driver per author page; try/finally guarantees it is quit.
        # (The original leaked the process when the link-collection step
        # hit `except: continue` before reaching driver.quit().)
        driver = webdriver.PhantomJS()
        try:
            try:
                driver.get(uri)
                time.sleep(1)  # give the JS-rendered page time to load
                urls = [a.get_attribute("href")
                        for a in driver.find_elements_by_xpath(
                            u"//a[@class='link title']")]
            except Exception:
                continue
            for article_url in urls:
                item = {"url": article_url}
                try:
                    driver.get(article_url)
                    item['title'] = driver.find_element_by_xpath("//h1").text
                    item['pubtime'] = driver.find_element_by_xpath(
                        "//div[@class='article-sub']/span[2]").text
                    item['content'] = driver.find_element_by_xpath(
                        "//div[@class='article-content']").text
                    item['author'] = driver.find_element_by_xpath(
                        "//div[@class='article-sub']/span[1]").text
                    item['author_url'] = driver.find_element_by_xpath(
                        "//div[@class='user-card-name']/a").get_attribute(
                            "href")
                except Exception:
                    # Best-effort: skip pages whose layout does not match.
                    continue
                item['site_name'] = u'头条号'
                item['url_md5'] = hashlib.md5(item["url"]).hexdigest()
                item_fileds(item, "data_wemedia", False)
        finally:
            driver.quit()