def newshares(self, response): sel = Selector(response) shares = sel.xpath('//*[@id="main"]/div/div[1]/div[@class="post"]') coll = mongo.get_coll("newshares") for share in shares: item = {} title = share.xpath( 'div[2]/h3[@class="title"]/a/text()').extract()[0] href = "https://toutiao.io" + share.xpath( 'div[2]/h3[@class="title"]/a/@href').extract()[0] source = share.xpath('div[2]/div/text()').extract()[0].strip() item["title"] = title item["href"] = href item["source"] = source item["last_read_time"] = "" query_params = { "href": href, } if coll.find(query_params).count() > 0: coll.update({"href": href}, item) print "update:" print item else: coll.insert_one(item) print "create:" print item
def parse_post(self, response):
    """Parse a subject page's post listing into ``NewbieItem``s.

    Returns every scraped item; additionally persists items not already
    present in the ``link`` collection (deduped on href + account).
    """
    sel = Selector(response)
    posts = sel.xpath("//div[@class='posts']/div[@class='post']")
    items = []
    coll = mongo.get_coll("link")
    # FIX: use the public Response.url property rather than the private
    # _get_url() accessor, which is an internal scrapy implementation
    # detail.
    url = response.url
    # The subject's numeric account id is the path segment after
    # "subjects/", with any query string stripped first.
    account = int(url.split("?")[0].split("subjects/")[1])
    for post in posts:
        article = post.xpath("div[@class='content']")
        title = article.xpath("h3/a/text()").extract()[0].strip()
        # NOTE(review): this stores the href as site-relative, while the
        # other parse_post variant in this project prefixes
        # "https://toutiao.io" — confirm whether existing documents in
        # ``link`` rely on relative hrefs before unifying.
        href = article.xpath("h3/a[@href]").xpath("@href").extract()[0].strip()
        source = article.xpath(
            "div[@class='meta']/text()").extract()[0].strip()
        item = NewbieItem()
        item["title"] = title
        item["href"] = href
        item["source"] = source
        item["account"] = account
        item["type"] = "toutiao.share"
        # All items are returned, even ones we skip persisting below.
        items.append(item)
        query_params = {
            "href": href,
            "account": account,
        }
        if coll.find(query_params).count() > 0:
            continue
        coll.save(item)
    return items
def account_settings(self,response): sel = Selector(response) user_avatar = parse_text(sel.xpath('//*[@id="edit_user_73244"]/div[1]/div/img/@src').extract()) nickname = parse_text(sel.xpath('//*[@id="user_name"]/@value').extract()) github = parse_text(sel.xpath('//*[@id="user_github"]/@value').extract()) toutiaoblog = parse_text(sel.xpath('//*[@id="user_blog"]/@value').extract()) description = parse_text(sel.xpath('//*[@id="user_bio"]/@value').extract()) email = parse_text(sel.xpath('//*[@id="user_email"]/@value').extract()) account = dict( user_avatar = user_avatar, nickname = nickname, github = github, toutiaoblog = toutiaoblog, description = description, email = email, account=self.account, ) coll = mongo.get_coll('account') if coll.find({"email":email}).count() == 0: coll.insert_one(account) self.email = email print account else: coll.update({"email":email},account)
def favorites(self,response): sel = Selector(response) favorites = sel.xpath('//*[@id="main"]/div/div/div[@class="post"]') coll = mongo.get_coll("link") for favorite in favorites: item = {} title = favorite.xpath('div[2]/h3[@class="title"]/a/text()').extract()[0] href = "https://toutiao.io" + favorite.xpath('div[2]/h3[@class="title"]/a/@href').extract()[0] source = favorite.xpath('div[2]/div/text()').extract()[0].strip() account = self.account item["title"] = title item["href"] = href item["source"] = source item["account"] = account item["type"] = "toutiao.favorite" item["last_read_time"] = "" query_params = { "href": href, "account": account, "type":item["type"], } if coll.find(query_params).count() > 0: coll.update(query_params,item) print "update:" print item else: coll.save(item) print "create:" print item
def parse_post(self, response):
    """Parse a subject page's post listing into ``SharesItem``s.

    Returns every scraped item; additionally persists items not already
    present in the ``link`` collection (deduped on href + account).
    """
    sel = Selector(response)
    posts = sel.xpath("//div[@class='posts']/div[@class='post']")
    items = []
    coll = mongo.get_coll("link")
    # FIX: use the public Response.url property rather than the private
    # _get_url() accessor, which is an internal scrapy implementation
    # detail.
    url = response.url
    # The subject's numeric account id is the path segment after
    # "subjects/", with any query string stripped first.
    account = int(url.split("?")[0].split("subjects/")[1])
    for post in posts:
        article = post.xpath("div[@class='content']")
        title = article.xpath("h3/a/text()").extract()[0].strip()
        href = "https://toutiao.io" + article.xpath("h3/a[@href]").xpath(
            "@href").extract()[0].strip()
        source = article.xpath(
            "div[@class='meta']/text()").extract()[0].strip()
        item = SharesItem()
        item["title"] = title
        item["href"] = href
        item["source"] = source
        item["account"] = account
        item["type"] = "toutiao.share"
        item["last_read_time"] = ""
        # All items are returned, even ones we skip persisting below.
        items.append(item)
        query_params = {
            "href": href,
            "account": account,
        }
        if coll.find(query_params).count() > 0:
            continue
        coll.save(item)
    return items
#coding=utf-8 from scrapy.spiders import Spider from scrapy.selector import Selector from scrapy import Request from newbie.items import NewbieItem import newbie.mongo as mongo import sys,pdb subject_coll = mongo.get_coll("subject") class MyShareSpider(Spider): name = "myshare_spider" allowed_domains = ["toutiao.io"] subjects = subject_coll.find() start_urls = [] for subject in subjects: start_urls.append("http://toutiao.io/subjects/"+str(subject['account'])) def parse(self, response): sel = Selector(response) pagination = sel.xpath("//div[@class='text-center']/ul/li[@class='last']/a") try: page = int(pagination.xpath("@href").extract()[0].split("=")[1]) except: page = 1 share_url = response._get_url() new_share_urls = [] for i in range(page): new_share_urls.append(share_url + "?page=" + str(i+1))
import os
import time
from logging import log
import json
from urllib import urlencode  # Python 2 location (urllib.parse in Py3)
import scrapy
from scrapy import Spider
from newbie.items import UserItem
from scrapy.selector import Selector
from scrapy.shell import inspect_response
import newbie.mongo as mongo
import pdb

# NOTE(review): bound as ``link_coll`` but actually the "subject"
# collection — confirm which collection downstream code expects.
link_coll = mongo.get_coll("subject")


def parse_text(extract):
    """Return the first extracted string, or "" when the xpath matched
    nothing (guards against IndexError on an empty extract list)."""
    if len(extract) == 0:
        return ""
    else:
        return extract[0]


class NewShares(Spider):
    # Crawls the toutiao.io "latest shares" feed.
    name = 'newshares'
    domain = 'https://toutiao.io/latest'

    def start_requests(self):
        """Kick off the crawl at the latest-shares page; responses are
        handled by ``self.latest_page`` (defined elsewhere in the file)."""
        yield scrapy.Request(url=self.domain, callback=self.latest_page)