def newshares(self, response): sel = Selector(response) shares = sel.xpath('//*[@id="main"]/div/div[1]/div[@class="post"]') coll = mongo.get_coll("newshares") for share in shares: item = {} title = share.xpath( 'div[2]/h3[@class="title"]/a/text()').extract()[0] href = "https://toutiao.io" + share.xpath( 'div[2]/h3[@class="title"]/a/@href').extract()[0] source = share.xpath('div[2]/div/text()').extract()[0].strip() item["title"] = title item["href"] = href item["source"] = source item["last_read_time"] = "" query_params = { "href": href, } if coll.find(query_params).count() > 0: coll.update({"href": href}, item) print "update:" print item else: coll.insert_one(item) print "create:" print item
def parse_post(self, response):
    """Parse a subject page's post listing into ``NewbieItem``s.

    Returns every scraped item; additionally persists items not already
    present in the ``link`` collection (deduped on href + account).
    """
    sel = Selector(response)
    posts = sel.xpath("//div[@class='posts']/div[@class='post']")
    items = []
    coll = mongo.get_coll("link")
    # FIX: use the public Response.url property rather than the private
    # _get_url() accessor, which is an internal scrapy implementation
    # detail.
    url = response.url
    # The subject's numeric account id is the path segment after
    # "subjects/", with any query string stripped first.
    account = int(url.split("?")[0].split("subjects/")[1])
    for post in posts:
        article = post.xpath("div[@class='content']")
        title = article.xpath("h3/a/text()").extract()[0].strip()
        # NOTE(review): this stores the href as site-relative, while the
        # other parse_post variant in this project prefixes
        # "https://toutiao.io" — confirm whether existing documents in
        # ``link`` rely on relative hrefs before unifying.
        href = article.xpath("h3/a[@href]").xpath("@href").extract()[0].strip()
        source = article.xpath(
            "div[@class='meta']/text()").extract()[0].strip()
        item = NewbieItem()
        item["title"] = title
        item["href"] = href
        item["source"] = source
        item["account"] = account
        item["type"] = "toutiao.share"
        # All items are returned, even ones we skip persisting below.
        items.append(item)
        query_params = {
            "href": href,
            "account": account,
        }
        if coll.find(query_params).count() > 0:
            continue
        coll.save(item)
    return items
def account_settings(self,response): sel = Selector(response) user_avatar = parse_text(sel.xpath('//*[@id="edit_user_73244"]/div[1]/div/img/@src').extract()) nickname = parse_text(sel.xpath('//*[@id="user_name"]/@value').extract()) github = parse_text(sel.xpath('//*[@id="user_github"]/@value').extract()) toutiaoblog = parse_text(sel.xpath('//*[@id="user_blog"]/@value').extract()) description = parse_text(sel.xpath('//*[@id="user_bio"]/@value').extract()) email = parse_text(sel.xpath('//*[@id="user_email"]/@value').extract()) account = dict( user_avatar = user_avatar, nickname = nickname, github = github, toutiaoblog = toutiaoblog, description = description, email = email, account=self.account, ) coll = mongo.get_coll('account') if coll.find({"email":email}).count() == 0: coll.insert_one(account) self.email = email print account else: coll.update({"email":email},account)
def favorites(self,response): sel = Selector(response) favorites = sel.xpath('//*[@id="main"]/div/div/div[@class="post"]') coll = mongo.get_coll("link") for favorite in favorites: item = {} title = favorite.xpath('div[2]/h3[@class="title"]/a/text()').extract()[0] href = "https://toutiao.io" + favorite.xpath('div[2]/h3[@class="title"]/a/@href').extract()[0] source = favorite.xpath('div[2]/div/text()').extract()[0].strip() account = self.account item["title"] = title item["href"] = href item["source"] = source item["account"] = account item["type"] = "toutiao.favorite" item["last_read_time"] = "" query_params = { "href": href, "account": account, "type":item["type"], } if coll.find(query_params).count() > 0: coll.update(query_params,item) print "update:" print item else: coll.save(item) print "create:" print item
def parse_post(self, response):
    """Parse a subject page's post listing into ``SharesItem``s.

    Returns every scraped item; additionally persists items not already
    present in the ``link`` collection (deduped on href + account).
    """
    sel = Selector(response)
    posts = sel.xpath("//div[@class='posts']/div[@class='post']")
    items = []
    coll = mongo.get_coll("link")
    # FIX: use the public Response.url property rather than the private
    # _get_url() accessor, which is an internal scrapy implementation
    # detail.
    url = response.url
    # The subject's numeric account id is the path segment after
    # "subjects/", with any query string stripped first.
    account = int(url.split("?")[0].split("subjects/")[1])
    for post in posts:
        article = post.xpath("div[@class='content']")
        title = article.xpath("h3/a/text()").extract()[0].strip()
        href = "https://toutiao.io" + article.xpath("h3/a[@href]").xpath(
            "@href").extract()[0].strip()
        source = article.xpath(
            "div[@class='meta']/text()").extract()[0].strip()
        item = SharesItem()
        item["title"] = title
        item["href"] = href
        item["source"] = source
        item["account"] = account
        item["type"] = "toutiao.share"
        item["last_read_time"] = ""
        # All items are returned, even ones we skip persisting below.
        items.append(item)
        query_params = {
            "href": href,
            "account": account,
        }
        if coll.find(query_params).count() > 0:
            continue
        coll.save(item)
    return items
#coding=utf-8 from scrapy.spiders import Spider from scrapy.selector import Selector from scrapy import Request from newbie.items import NewbieItem import newbie.mongo as mongo import sys,pdb subject_coll = mongo.get_coll("subject") class MyShareSpider(Spider): name = "myshare_spider" allowed_domains = ["toutiao.io"] subjects = subject_coll.find() start_urls = [] for subject in subjects: start_urls.append("http://toutiao.io/subjects/"+str(subject['account'])) def parse(self, response): sel = Selector(response) pagination = sel.xpath("//div[@class='text-center']/ul/li[@class='last']/a") try: page = int(pagination.xpath("@href").extract()[0].split("=")[1]) except: page = 1 share_url = response._get_url() new_share_urls = [] for i in range(page): new_share_urls.append(share_url + "?page=" + str(i+1))
import os
import time
from logging import log
import json
from urllib import urlencode  # Python 2 location (urllib.parse in Py3)
import scrapy
from scrapy import Spider
from newbie.items import UserItem
from scrapy.selector import Selector
from scrapy.shell import inspect_response
import newbie.mongo as mongo
import pdb

# NOTE(review): bound as ``link_coll`` but actually the "subject"
# collection — confirm which collection downstream code expects.
link_coll = mongo.get_coll("subject")


def parse_text(extract):
    """Return the first extracted string, or "" when the xpath matched
    nothing (guards against IndexError on an empty extract list)."""
    if len(extract) == 0:
        return ""
    else:
        return extract[0]


class NewShares(Spider):
    # Crawls the toutiao.io "latest shares" feed.
    name = 'newshares'
    domain = 'https://toutiao.io/latest'

    def start_requests(self):
        """Kick off the crawl at the latest-shares page; responses are
        handled by ``self.latest_page`` (defined elsewhere in the file)."""
        yield scrapy.Request(url=self.domain, callback=self.latest_page)