Esempio n. 1
0
    def newshares(self, response):
        """Extract every post on the new-shares listing page and upsert it
        into the ``newshares`` collection, keyed by its absolute URL.

        :param response: scrapy Response for the listing page.
        """
        sel = Selector(response)
        shares = sel.xpath('//*[@id="main"]/div/div[1]/div[@class="post"]')
        coll = mongo.get_coll("newshares")
        for share in shares:
            titles = share.xpath('div[2]/h3[@class="title"]/a/text()').extract()
            hrefs = share.xpath('div[2]/h3[@class="title"]/a/@href').extract()
            sources = share.xpath('div[2]/div/text()').extract()
            if not (titles and hrefs and sources):
                # Malformed post markup: skip it instead of raising IndexError.
                continue
            item = {
                "title": titles[0],
                # Listing hrefs are site-relative; store the absolute URL.
                "href": "https://toutiao.io" + hrefs[0],
                "source": sources[0].strip(),
                "last_read_time": "",
            }
            # replace_one(..., upsert=True) collapses the original
            # find().count() + update()/insert pair into a single round trip
            # and uses the pymongo 3 API consistently (collection.update() and
            # cursor.count() are deprecated; insert_one is already used
            # elsewhere in this file).
            result = coll.replace_one({"href": item["href"]}, item, upsert=True)
            if result.matched_count > 0:
                print("update:")
            else:
                print("create:")
            print(item)
Esempio n. 2
0
 def parse_post(self, response):
     """Parse one subject page: build a NewbieItem per post and insert any
     link not already stored for this subject account.

     :param response: scrapy Response for a ``/subjects/<account>`` page.
     :returns: list of all items scraped from the page (including ones
         that were already present in the collection).
     """
     sel = Selector(response)
     posts = sel.xpath("//div[@class='posts']/div[@class='post']")
     items = []
     coll = mongo.get_coll("link")
     # response.url is the public accessor; _get_url() is a private method.
     url = response.url
     # URL shape: .../subjects/<account>?page=N
     account = int(url.split("?")[0].split("subjects/")[1])
     for post in posts:
         article = post.xpath("div[@class='content']")
         title = article.xpath("h3/a/text()").extract()[0].strip()
         href = article.xpath("h3/a[@href]").xpath("@href").extract()[0].strip()
         source = article.xpath("div[@class='meta']/text()").extract()[0].strip()
         item = NewbieItem()
         item["title"] = title
         item["href"] = href
         item["source"] = source
         item["account"] = account
         item["type"] = "toutiao.share"
         items.append(item)
         query_params = {
             "href": href,
             "account": account,
         }
         # find_one replaces the deprecated cursor.count() existence check;
         # insert_one replaces the deprecated collection.save().
         if coll.find_one(query_params) is None:
             coll.insert_one(dict(item))
     return items
Esempio n. 3
0
    def account_settings(self, response):
        """Scrape the account-settings form fields and store the profile in
        the ``account`` collection, keyed by email.

        Side effect: on first insert, remembers the scraped email on
        ``self.email``.
        """
        sel = Selector(response)
        # NOTE(review): this XPath hard-codes user id 73244, so the avatar is
        # only found on that specific account's settings page — confirm intent.
        user_avatar = parse_text(sel.xpath('//*[@id="edit_user_73244"]/div[1]/div/img/@src').extract())
        nickname = parse_text(sel.xpath('//*[@id="user_name"]/@value').extract())
        github = parse_text(sel.xpath('//*[@id="user_github"]/@value').extract())
        toutiaoblog = parse_text(sel.xpath('//*[@id="user_blog"]/@value').extract())
        description = parse_text(sel.xpath('//*[@id="user_bio"]/@value').extract())
        email = parse_text(sel.xpath('//*[@id="user_email"]/@value').extract())
        account = dict(
            user_avatar=user_avatar,
            nickname=nickname,
            github=github,
            toutiaoblog=toutiaoblog,
            description=description,
            email=email,
            account=self.account,
        )

        coll = mongo.get_coll('account')
        # find_one avoids the deprecated cursor.count(); replace_one is the
        # pymongo 3 spelling of the removed collection.update() (consistent
        # with insert_one, already used here).
        if coll.find_one({"email": email}) is None:
            coll.insert_one(account)
            self.email = email
            print(account)
        else:
            coll.replace_one({"email": email}, account)
Esempio n. 4
0
    def favorites(self, response):
        """Extract the user's favorited posts and upsert each into the
        ``link`` collection, keyed by (href, account, type).

        :param response: scrapy Response for the favorites listing page.
        """
        sel = Selector(response)
        favorites = sel.xpath('//*[@id="main"]/div/div/div[@class="post"]')
        coll = mongo.get_coll("link")
        account = self.account
        for favorite in favorites:
            titles = favorite.xpath('div[2]/h3[@class="title"]/a/text()').extract()
            hrefs = favorite.xpath('div[2]/h3[@class="title"]/a/@href').extract()
            sources = favorite.xpath('div[2]/div/text()').extract()
            if not (titles and hrefs and sources):
                # Skip posts with unexpected markup instead of raising
                # IndexError on extract()[0].
                continue
            item = {
                "title": titles[0],
                # Listing hrefs are site-relative; store the absolute URL.
                "href": "https://toutiao.io" + hrefs[0],
                "source": sources[0].strip(),
                "account": account,
                "type": "toutiao.favorite",
                "last_read_time": "",
            }
            query_params = {
                "href": item["href"],
                "account": account,
                "type": item["type"],
            }
            # One replace_one(..., upsert=True) replaces the deprecated
            # find().count() / update() / save() trio (pymongo 3 API,
            # consistent with insert_one elsewhere in this file).
            result = coll.replace_one(query_params, item, upsert=True)
            if result.matched_count > 0:
                print("update:")
            else:
                print("create:")
            print(item)
Esempio n. 5
0
 def parse_post(self, response):
     """Parse one subject page into SharesItem objects, persisting links not
     yet stored for this subject account.

     :param response: scrapy Response for a ``/subjects/<account>`` page.
     :returns: list of every item scraped from the page.
     """
     sel = Selector(response)
     posts = sel.xpath("//div[@class='posts']/div[@class='post']")
     items = []
     coll = mongo.get_coll("link")
     # Public accessor instead of the private Response._get_url().
     url = response.url
     # URL shape: .../subjects/<account>?page=N
     account = int(url.split("?")[0].split("subjects/")[1])
     for post in posts:
         article = post.xpath("div[@class='content']")
         title = article.xpath("h3/a/text()").extract()[0].strip()
         # hrefs are site-relative on the listing page.
         href = "https://toutiao.io" + article.xpath("h3/a[@href]").xpath(
             "@href").extract()[0].strip()
         source = article.xpath(
             "div[@class='meta']/text()").extract()[0].strip()
         item = SharesItem()
         item["title"] = title
         item["href"] = href
         item["source"] = source
         item["account"] = account
         item["type"] = "toutiao.share"
         item["last_read_time"] = ""
         items.append(item)
         query_params = {
             "href": href,
             "account": account,
         }
         # find_one + insert_one replace the deprecated cursor.count() and
         # collection.save() (pymongo 3 API, as used elsewhere in this file).
         if coll.find_one(query_params) is None:
             coll.insert_one(dict(item))
     return items
Esempio n. 6
0
#coding=utf-8
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import Request
from newbie.items import NewbieItem
import newbie.mongo as mongo

import sys,pdb

# Module-level handle to the "subject" collection; consumed at import time
# below to seed the spider's start_urls.
subject_coll = mongo.get_coll("subject")

class MyShareSpider(Spider):
    """Crawl the toutiao.io page of every subject stored in mongo."""
    name = "myshare_spider"
    allowed_domains = ["toutiao.io"]
    # NOTE(review): this runs at class-definition (import) time, so loading
    # the module requires a live mongo connection.
    subjects = subject_coll.find()
    start_urls  = []
    for subject in subjects:
        start_urls.append("http://toutiao.io/subjects/"+str(subject['account']))

    def parse(self, response):
        """Read the pagination block of a subject page and build one URL per
        result page.

        NOTE(review): new_share_urls is built but never yielded/returned in
        the visible code — the method looks truncated; confirm the requests
        are actually scheduled.
        """
        sel = Selector(response)
        pagination = sel.xpath("//div[@class='text-center']/ul/li[@class='last']/a")
        try:
            # The "last" pagination link ends in "?page=N"; N = page count.
            page = int(pagination.xpath("@href").extract()[0].split("=")[1])
        except:
            # No pagination block present -> assume a single page.
            page = 1

        share_url = response._get_url()
        new_share_urls = []
        for i in range(page):
            new_share_urls.append(share_url + "?page=" + str(i+1))
Esempio n. 7
0
import os
import time
from logging import log
import json
from urllib import urlencode

import scrapy
from scrapy import Spider
from newbie.items import UserItem
from scrapy.selector import Selector
from scrapy.shell import inspect_response
import newbie.mongo as mongo

import pdb
# Handle to the "subject" collection (note: the variable is named link_coll,
# but get_coll is called with "subject").
link_coll = mongo.get_coll("subject")


def parse_text(extract):
    """Return the first string from an XPath ``extract()`` result, or the
    empty string when the selector matched nothing.

    :param extract: list of strings as returned by ``Selector.extract()``.
    :returns: ``extract[0]`` if present, else ``""``.
    """
    # extract() yields a (possibly empty) list; an empty list is falsy, so a
    # truthiness test replaces the non-idiomatic len(...) == 0 check.
    return extract[0] if extract else ""


class NewShares(Spider):
    """Spider whose entry point is the https://toutiao.io/latest page."""
    name = 'newshares'
    # Seed URL for the crawl (not a domain despite the attribute name).
    domain = 'https://toutiao.io/latest'

    def start_requests(self):
        # Kick off the crawl at the latest-shares listing; the latest_page
        # callback is presumably defined further down this file — not visible
        # in this chunk.
        yield scrapy.Request(url=self.domain, callback=self.latest_page)