Esempio n. 1
0
    def newshares(self, response):
        """Extract every post on the new-shares listing page and upsert it
        into the ``newshares`` collection, keyed by its absolute URL.

        :param response: scrapy Response for the listing page.
        """
        sel = Selector(response)
        shares = sel.xpath('//*[@id="main"]/div/div[1]/div[@class="post"]')
        coll = mongo.get_coll("newshares")
        for share in shares:
            titles = share.xpath('div[2]/h3[@class="title"]/a/text()').extract()
            hrefs = share.xpath('div[2]/h3[@class="title"]/a/@href').extract()
            sources = share.xpath('div[2]/div/text()').extract()
            if not (titles and hrefs and sources):
                # Malformed post markup: skip it instead of raising IndexError.
                continue
            item = {
                "title": titles[0],
                # Listing hrefs are site-relative; store the absolute URL.
                "href": "https://toutiao.io" + hrefs[0],
                "source": sources[0].strip(),
                "last_read_time": "",
            }
            # replace_one(..., upsert=True) collapses the original
            # find().count() + update()/insert pair into a single round trip
            # and uses the pymongo 3 API consistently (collection.update() and
            # cursor.count() are deprecated; insert_one is already used
            # elsewhere in this file).
            result = coll.replace_one({"href": item["href"]}, item, upsert=True)
            if result.matched_count > 0:
                print("update:")
            else:
                print("create:")
            print(item)
Esempio n. 2
0
 def parse_post(self, response):
     """Parse one subject page: build a NewbieItem per post and insert any
     link not already stored for this subject account.

     :param response: scrapy Response for a ``/subjects/<account>`` page.
     :returns: list of all items scraped from the page (including ones
         that were already present in the collection).
     """
     sel = Selector(response)
     posts = sel.xpath("//div[@class='posts']/div[@class='post']")
     items = []
     coll = mongo.get_coll("link")
     # response.url is the public accessor; _get_url() is a private method.
     url = response.url
     # URL shape: .../subjects/<account>?page=N
     account = int(url.split("?")[0].split("subjects/")[1])
     for post in posts:
         article = post.xpath("div[@class='content']")
         title = article.xpath("h3/a/text()").extract()[0].strip()
         href = article.xpath("h3/a[@href]").xpath("@href").extract()[0].strip()
         source = article.xpath("div[@class='meta']/text()").extract()[0].strip()
         item = NewbieItem()
         item["title"] = title
         item["href"] = href
         item["source"] = source
         item["account"] = account
         item["type"] = "toutiao.share"
         items.append(item)
         query_params = {
             "href": href,
             "account": account,
         }
         # find_one replaces the deprecated cursor.count() existence check;
         # insert_one replaces the deprecated collection.save().
         if coll.find_one(query_params) is None:
             coll.insert_one(dict(item))
     return items
Esempio n. 3
0
    def account_settings(self, response):
        """Scrape the account-settings form fields and store the profile in
        the ``account`` collection, keyed by email.

        Side effect: on first insert, remembers the scraped email on
        ``self.email``.
        """
        sel = Selector(response)
        # NOTE(review): this XPath hard-codes user id 73244, so the avatar is
        # only found on that specific account's settings page — confirm intent.
        user_avatar = parse_text(sel.xpath('//*[@id="edit_user_73244"]/div[1]/div/img/@src').extract())
        nickname = parse_text(sel.xpath('//*[@id="user_name"]/@value').extract())
        github = parse_text(sel.xpath('//*[@id="user_github"]/@value').extract())
        toutiaoblog = parse_text(sel.xpath('//*[@id="user_blog"]/@value').extract())
        description = parse_text(sel.xpath('//*[@id="user_bio"]/@value').extract())
        email = parse_text(sel.xpath('//*[@id="user_email"]/@value').extract())
        account = dict(
            user_avatar=user_avatar,
            nickname=nickname,
            github=github,
            toutiaoblog=toutiaoblog,
            description=description,
            email=email,
            account=self.account,
        )

        coll = mongo.get_coll('account')
        # find_one avoids the deprecated cursor.count(); replace_one is the
        # pymongo 3 spelling of the removed collection.update() (consistent
        # with insert_one, already used here).
        if coll.find_one({"email": email}) is None:
            coll.insert_one(account)
            self.email = email
            print(account)
        else:
            coll.replace_one({"email": email}, account)
Esempio n. 4
0
    def favorites(self, response):
        """Extract the user's favorited posts and upsert each into the
        ``link`` collection, keyed by (href, account, type).

        :param response: scrapy Response for the favorites listing page.
        """
        sel = Selector(response)
        favorites = sel.xpath('//*[@id="main"]/div/div/div[@class="post"]')
        coll = mongo.get_coll("link")
        account = self.account
        for favorite in favorites:
            titles = favorite.xpath('div[2]/h3[@class="title"]/a/text()').extract()
            hrefs = favorite.xpath('div[2]/h3[@class="title"]/a/@href').extract()
            sources = favorite.xpath('div[2]/div/text()').extract()
            if not (titles and hrefs and sources):
                # Skip posts with unexpected markup instead of raising
                # IndexError on extract()[0].
                continue
            item = {
                "title": titles[0],
                # Listing hrefs are site-relative; store the absolute URL.
                "href": "https://toutiao.io" + hrefs[0],
                "source": sources[0].strip(),
                "account": account,
                "type": "toutiao.favorite",
                "last_read_time": "",
            }
            query_params = {
                "href": item["href"],
                "account": account,
                "type": item["type"],
            }
            # One replace_one(..., upsert=True) replaces the deprecated
            # find().count() / update() / save() trio (pymongo 3 API,
            # consistent with insert_one elsewhere in this file).
            result = coll.replace_one(query_params, item, upsert=True)
            if result.matched_count > 0:
                print("update:")
            else:
                print("create:")
            print(item)
Esempio n. 5
0
 def parse_post(self, response):
     """Parse one subject page into SharesItem objects, persisting links not
     yet stored for this subject account.

     :param response: scrapy Response for a ``/subjects/<account>`` page.
     :returns: list of every item scraped from the page.
     """
     sel = Selector(response)
     posts = sel.xpath("//div[@class='posts']/div[@class='post']")
     items = []
     coll = mongo.get_coll("link")
     # Public accessor instead of the private Response._get_url().
     url = response.url
     # URL shape: .../subjects/<account>?page=N
     account = int(url.split("?")[0].split("subjects/")[1])
     for post in posts:
         article = post.xpath("div[@class='content']")
         title = article.xpath("h3/a/text()").extract()[0].strip()
         # hrefs are site-relative on the listing page.
         href = "https://toutiao.io" + article.xpath("h3/a[@href]").xpath(
             "@href").extract()[0].strip()
         source = article.xpath(
             "div[@class='meta']/text()").extract()[0].strip()
         item = SharesItem()
         item["title"] = title
         item["href"] = href
         item["source"] = source
         item["account"] = account
         item["type"] = "toutiao.share"
         item["last_read_time"] = ""
         items.append(item)
         query_params = {
             "href": href,
             "account": account,
         }
         # find_one + insert_one replace the deprecated cursor.count() and
         # collection.save() (pymongo 3 API, as used elsewhere in this file).
         if coll.find_one(query_params) is None:
             coll.insert_one(dict(item))
     return items
Esempio n. 6
0
#coding=utf-8
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import Request
from newbie.items import NewbieItem
import newbie.mongo as mongo

import sys,pdb

# Module-level handle to the "subject" collection; consumed at import time
# below to seed the spider's start_urls.
subject_coll = mongo.get_coll("subject")

class MyShareSpider(Spider):
    """Crawl the toutiao.io page of every subject stored in mongo."""
    name = "myshare_spider"
    allowed_domains = ["toutiao.io"]
    # NOTE(review): this runs at class-definition (import) time, so loading
    # the module requires a live mongo connection.
    subjects = subject_coll.find()
    start_urls  = []
    for subject in subjects:
        start_urls.append("http://toutiao.io/subjects/"+str(subject['account']))

    def parse(self, response):
        """Read the pagination block of a subject page and build one URL per
        result page.

        NOTE(review): new_share_urls is built but never yielded/returned in
        the visible code — the method looks truncated; confirm the requests
        are actually scheduled.
        """
        sel = Selector(response)
        pagination = sel.xpath("//div[@class='text-center']/ul/li[@class='last']/a")
        try:
            # The "last" pagination link ends in "?page=N"; N = page count.
            page = int(pagination.xpath("@href").extract()[0].split("=")[1])
        except:
            # No pagination block present -> assume a single page.
            page = 1

        share_url = response._get_url()
        new_share_urls = []
        for i in range(page):
            new_share_urls.append(share_url + "?page=" + str(i+1))
Esempio n. 7
0
import os
import time
from logging import log
import json
from urllib import urlencode

import scrapy
from scrapy import Spider
from newbie.items import UserItem
from scrapy.selector import Selector
from scrapy.shell import inspect_response
import newbie.mongo as mongo

import pdb
# Handle to the "subject" collection (note: the variable is named link_coll,
# but get_coll is called with "subject").
link_coll = mongo.get_coll("subject")


def parse_text(extract):
    """Return the first string from an XPath ``extract()`` result, or the
    empty string when the selector matched nothing.

    :param extract: list of strings as returned by ``Selector.extract()``.
    :returns: ``extract[0]`` if present, else ``""``.
    """
    # extract() yields a (possibly empty) list; an empty list is falsy, so a
    # truthiness test replaces the non-idiomatic len(...) == 0 check.
    return extract[0] if extract else ""


class NewShares(Spider):
    """Spider whose entry point is the https://toutiao.io/latest page."""
    name = 'newshares'
    # Seed URL for the crawl (not a domain despite the attribute name).
    domain = 'https://toutiao.io/latest'

    def start_requests(self):
        # Kick off the crawl at the latest-shares listing; the latest_page
        # callback is presumably defined further down this file — not visible
        # in this chunk.
        yield scrapy.Request(url=self.domain, callback=self.latest_page)