Esempio n. 1
0
__author__ = 'Xiaomin'

from scrapy.spider import Spider
from scrapy.utils.response import open_in_browser
import scrapy
import os
from superqq_spider.items import Paper
import datetime
import json

from superqq_spider.utils import utils

# Module-level helper obtained by calling `utils` from superqq_spider.utils.
# NOTE(review): not referenced in the visible part of this file -- confirm
# it is still needed further down.
tool = utils()


def getUrls():
    """Return the arXiv CS listing pages to crawl, highest offset first.

    The pages are the ``/list/cs/12`` listings with ``skip`` offsets from
    1000 to 6000, each requesting 1000 entries.

    Returns:
        list: The listing URLs in descending ``skip`` order. A real list is
        returned instead of a ``reversed()`` iterator so the result can be
        iterated more than once and supports ``len()`` and indexing (for
        example when it is stored as a spider's ``start_urls``).
    """
    urls = [
        'http://arxiv.org/list/cs/12?skip=1000&show=1000',
        'http://arxiv.org/list/cs/12?skip=2000&show=1000',
        'http://arxiv.org/list/cs/12?skip=3000&show=1000',
        'http://arxiv.org/list/cs/12?skip=4000&show=1000',
        'http://arxiv.org/list/cs/12?skip=5000&show=1000',
        'http://arxiv.org/list/cs/12?skip=6000&show=1000',
    ]
    # Slicing builds a new reversed list; reversed() would hand back a
    # one-shot iterator that is empty after its first traversal.
    return urls[::-1]


class CS499Spider(Spider):
    """Scrapy spider named ``xxu46_4`` that targets arxiv.org."""

    # Identifier under which this spider is known to Scrapy.
    name = 'xxu46_4'
    # Scheme and host of the arXiv site.
    hostname = 'http://arxiv.org'
Esempio n. 2
0
__author__ = 'Xiaomin'

from scrapy.spider import Spider
from scrapy.utils.response import open_in_browser
import scrapy
import os
from superqq_spider.items import Paper
import datetime
import json

from superqq_spider.utils import utils

# Module-level helper obtained by calling `utils` from superqq_spider.utils.
# NOTE(review): not referenced in the visible part of this file -- confirm
# it is still needed further down.
tool = utils()
def getUrls():
    """Return the arXiv CS listing pages to crawl, in reverse listing order.

    The pages are the ``/list/cs/12`` listings with ``skip`` offsets from
    7000 to 12000 plus the first ``/list/cs/13`` page, each requesting 1000
    entries.

    Returns:
        list: The listing URLs with the ``cs/13`` page first, followed by
        the ``cs/12`` pages in descending ``skip`` order. A real list is
        returned instead of a ``reversed()`` iterator so the result can be
        iterated more than once and supports ``len()`` and indexing (for
        example when it is stored as a spider's ``start_urls``).
    """
    urls = [
        'http://arxiv.org/list/cs/12?skip=7000&show=1000',
        'http://arxiv.org/list/cs/12?skip=8000&show=1000',
        'http://arxiv.org/list/cs/12?skip=9000&show=1000',
        'http://arxiv.org/list/cs/12?skip=10000&show=1000',
        'http://arxiv.org/list/cs/12?skip=11000&show=1000',
        'http://arxiv.org/list/cs/12?skip=12000&show=1000',
        'http://arxiv.org/list/cs/13?skip=0&show=1000',
    ]
    # Slicing builds a new reversed list; reversed() would hand back a
    # one-shot iterator that is empty after its first traversal.
    return urls[::-1]

class CS499Spider(Spider):
    hostname = 'http://arxiv.org'
    name = 'xxu46_5'
    #allowed_domains = ['http://cs.illinois.edu']
    start_urls = getUrls()

    def __init__(self):