Example #1
import time
from collections import OrderedDict, defaultdict

import requests
import traceback
from xiaoscript.config import get_root_path

try:
    from github import Github
    import matplotlib.pyplot as plt
except ImportError:
    # Optional dependencies: GitHub API access and plotting degrade gracefully.
    pass

# plt.rcParams['font.sans-serif'] = ['SimHei']

ps_file = '{}/ps.txt'.format(get_root_path())
out_file = '{}/github.json'.format(get_root_path())

out_file2 = 'C:\\Users\\xiaobao\\Desktop\\github-lang.txt'

start = 1000
step = 50
end = 10000

focus_keys = [
    'id', 'url', 'name', 'description', 'language', 'forks', 'stars',
    'created_at', 'updated_at', 'full_name'
]
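
# Added sketch, not part of the original file: run() below is truncated in
# this example. One plausible reading of start/step/end is paginating the
# GitHub search API by star ranges; a minimal, hypothetical version
# (run_sketch is an invented name, a PyGithub client is assumed):
def run_sketch():
    gh = Github()  # anonymous client; pass a token for higher rate limits
    for low in range(start, end, step):
        query = 'stars:{}..{}'.format(low, low + step - 1)
        for repo in gh.search_repositories(query=query):
            print(repo.full_name, repo.stargazers_count, repo.language)
        time.sleep(2)  # stay well under the search-API rate limit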


def run():
Example #2
#!/usr/bin/env python
# encoding: utf-8
"""
@description: crawler for studygolang.com (a Go-language community site)

@author: baoqiang
@time: 2018/11/6 4:35 PM
"""

import scrapy
from scrapy.http import FormRequest
import json
from xiaoscript.config import get_root_path

out_file = '{}/study_go.json'.format(get_root_path())
url_fmt = 'https://studygolang.com/articles?p={}'
root_url = 'https://studygolang.com'


class StudyGoSpider(scrapy.Spider):
    name = 'studygo'

    def start_requests(self):
        for i in range(1, 908):
            # for i in range(1, 3):
            url = url_fmt.format(i)
            yield FormRequest(url, callback=self.parse_cate)

    def parse_cate(self, response):
        datas = []
Example #3
"""
@description: Xiaomi themes crawler (zhuti.xiaomi.com)

@author: baoqiang
@time: 2018/12/6 8:26 PM
"""

import scrapy
from scrapy.http import FormRequest
import json
from xiaoscript.config import get_root_path
import time

url_fmt = 'http://zhuti.xiaomi.com/compound?page={}&sort=New'
comment_fmt = 'http://zhuti.xiaomi.com/comment/listall/{}?page=0&t={}&status=3'
out_file = '{}/miui.json'.format(get_root_path())
root_url = 'http://zhuti.xiaomi.com'
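
# Added sketch (assumption, not in the original): the second placeholder in
# comment_fmt looks like a cache-busting millisecond timestamp; a hypothetical
# helper (build_comment_url is an invented name) might fill it like this:
def build_comment_url(theme_id):
    return comment_fmt.format(theme_id, int(time.time() * 1000))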


class MiuiSpider(scrapy.Spider):
    name = 'miui'

    def start_requests(self):
        for i in range(1, 1225):
            # for i in range(1, 3):
            url = url_fmt.format(i)
            yield FormRequest(url, callback=self.parse_cate)

    def parse_cate(self, response):
        datas = []
Example #4
# encoding: utf-8
"""
@description: Beijing attractions crawler

@author: baoqiang
@time: 2019-05-07 20:50
"""

import scrapy
from scrapy import Request
from xiaoscript import config
import json

page_size = 30
url_fmt = 'https://www.tripadvisor.com.hk/Attractions-g294212-Activities-oa{}-Beijing.html'
out_file = '{}/bj_tour.json'.format(config.get_root_path())


class TripAdvisorSpider(scrapy.Spider):
    name = 'trip_advisor'

    def start_requests(self):
        for i in range(0, 55):
            # for i in range(1, 2):
            url = url_fmt.format(page_size * i)

            yield Request(url, callback=self.parse_page, meta={'page': i})

    def parse_page(self, response):
        meta = response.meta
Example #5
    "stand_ids": [287],
    "latitude": 0,
    "key_self": 0,
    "region_ids": [],
    "logicSort": "0",
    "plate_ids": [],
    "longitude": 0,
    "distance": "0",
    "update_time": 0,
    "ab_test": "A",
    "line_ids": [],
    "type_no": 0,
    "key": ""
}

out_file = '{}/hizhu.json'.format(config.get_root_path())


def run():
    # for i in range(1, 30):
    for i in range(1, 20):
        req_body.update({'pageno': i})
        resp = requests.post(url, json=req_body, verify=False, headers=headers)

        print(resp.status_code)

        resp_data = format_data(resp.json())
        # print(resp_data)

        with open(out_file, 'a', encoding='utf-8') as fw:
            json.dump(resp_data, fw, ensure_ascii=False)
            fw.write('\n')  # newline so appended objects stay one per line
Example #6
"""
import random
import sys

from xiaoscript import config
from threading import Lock
import scrapy
from scrapy.http import FormRequest
import json
import re

start_url = 'https://www.douban.com/group/513717/discussion?start={}'

id_pat = re.compile('https://www.douban.com/group/topic/([\\d]+)')

out_file = '{}/douban_xiaozu.json'.format(config.get_root_path())
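
# The spider below uses a module-level `headers` dict and a get_random_ip()
# helper, both defined elsewhere in the original file. A minimal, hypothetical
# stand-in so the snippet reads end-to-end (assumption, not the author's code):
headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder headers (assumption)

def get_random_ip():
    # Random dotted-quad used to rotate the X-Real-IP header per request.
    return '.'.join(str(random.randint(1, 254)) for _ in range(4))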


class DoubanXiaozuSpider(scrapy.Spider):
    name = 'douban_xiaozu_spider'

    lock = Lock()
    ids = set()

    def start_requests(self):
        # for i in range(0, 731):
        for i in range(0, 100):
            url = start_url.format(i * 25)
            headers.update({'X-Real-IP': get_random_ip()})
            yield FormRequest(url, headers=headers, callback=self.parse_cate)
Example #7
"""
@description: Dianping, Beijing escape rooms

@author: baoqiang
@time: 2019-06-30 12:52
"""

import scrapy
from scrapy import FormRequest
from threading import Lock
import json
from xiaoscript import config

url_fmt = 'http://www.dianping.com/beijing/ch30/g2754p{}'

out_file = '{}/chamber.json'.format(config.get_root_path())


class DianpingChamberSpider(scrapy.Spider):
    name = 'dianping_chamber'

    lock = Lock()

    def start_requests(self):
        for i in range(1, 24):
            url = url_fmt.format(i)
            yield FormRequest(url, callback=self.parse_page, headers=headers)

    def parse_page(self, response):
        classes = response.selector.xpath('.//div[@class="content"]//ul/li')
Example #8
"""

import json
import re
import urllib.parse
import logging
import threading

import scrapy
from scrapy import Request

from xiaoscript import config

start_url = 'https://coolshell.cn/page/{}'

out_file = '{}/coolshell.json'.format(config.get_root_path())

id_pat = re.compile(r'https://coolshell.cn/articles/(\d+)\.html')


class CollShellSpider(scrapy.Spider):
    name = 'cool_shell'

    lock = threading.Lock()

    def start_requests(self):
        for i in range(1, 72):
            # for i in range(1, 3):
            url = start_url.format(i)
            yield Request(url, callback=self.parse_page)
Example #9
@time: 2019-05-15 20:45
"""

import json

import scrapy
from scrapy import FormRequest
import sys
import re
import requests
import logging

from xconcurrent import threadpool
from xiaoscript.config import get_root_path

out_file = '{}/kaola.json'.format(get_root_path())
task_file = '{}/kaola_task.json'.format(get_root_path())
start_url = 'https://www.kaola.com'


class KaolaSpider(scrapy.Spider):
    name = 'kaola'
    cnt = 0
    processed_set = set()

    def start_requests(self):
        yield FormRequest(start_url, callback=self.parse_cate)

    def parse_cate(self, response):
        datas = []
Example #10
@time: 2019-07-24 12:49
"""

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from xiaoscript import config
from threading import Lock
import json
import re
from scrapy import Request

start_url = 'https://bj.lianjia.com/xiaoqu/'

id_pat = re.compile('https://bj.lianjia.com/xiaoqu/([\\d]+)')

out_file = '{}/lianjia_xiaoqu.json'.format(config.get_root_path())


class LjXiaoquSpider(CrawlSpider):
    name = 'lj_xiaoqu_spider'
    start_urls = [
        start_url,
    ]

    links1 = LinkExtractor(allow='.*/xiaoqu/[a-z]+/(pg[\\d]+){0,1}$')

    rules = (Rule(links1, callback='parse_cate', follow=True), )

    lock = Lock()
    ids = set()
Example #11
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from xiaoscript import config
from threading import Lock
import json
import requests
import re

# Books > Fiction category
start_url = 'https://list.jd.com/list.html?cat=1713,3258&page=1&delivery=1&sort=sort_rank_asc'
comment_url_fmt = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={}'

id_pat = re.compile(r'https://item.jd.com/([\d]+)\.html')

out_file = '{}/jd_book.json'.format(config.get_root_path())


# https://club.jd.com/comment/productCommentSummaries.action?referenceIds=12178407
# First crawl the product ids; comments are updated later in batches with coroutines
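# A minimal sketch of that second pass (added, hypothetical; it uses plain
# requests rather than coroutines for brevity). The endpoint accepts a
# comma-separated id list, so collected ids can be fetched in batches:
def fetch_comment_summaries(ids):
    resp = requests.get(comment_url_fmt.format(','.join(ids)))
    return resp.json()  # response layout is an assumption, verify before use
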
class JdBookSpider(CrawlSpider):
    name = 'jdbook_spider'
    start_urls = [start_url, ]

    links1 = LinkExtractor(allow='.*cat=\\d+,\\d+$')
    links2 = LinkExtractor(allow='.*cat=\\d+,\\d+,\\d+$')
    links3 = LinkExtractor(allow='.*cat=\\d+,\\d+,\\d+&page=[\\d]&sort=sort_rank_asc.*')

    rules = (
        Rule(links1, follow=True),
        Rule(links2, follow=True),
Example #12
#!/usr/bin/env python
# encoding: utf-8

"""
@description: process the Ziroom crawl data and work out which areas are the best to rent in

@author: pacman
@time: 2018/3/2 17:29
"""

import json
from xiaoscript import config
import re

root_path = config.get_root_path()

floor_pat = re.compile(r'([\d]+)/([\d]+)层')
distance_pat = re.compile(r'([\d]+)米')
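
# Illustrative matches (added, not in the original):
#   floor_pat.search('3/6层').groups()    -> ('3', '6')   # floor 3 of 6
#   distance_pat.search('500米').groups() -> ('500',)     # 500 meters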

processed_ids = set()


def process():
    not_print_key = True

    with open('{}/ziru2.json'.format(root_path), 'r', encoding='utf-8') as f, \
            open('{}/ziru2.txt'.format(root_path), 'w', encoding='utf-8') as fw:
        for idx, line in enumerate(f, start=1):
            line = line.strip()
            json_data = json.loads(line)
Example #13
@author: baoqiang
@time: 2019/1/4 12:56 PM
"""
import scrapy
from scrapy.http import FormRequest
import json
from xiaoscript.config import get_root_path
import time
from scrapy.selector import Selector
import re
import threading

start_cate_url = 'https://www.wandoujia.com/category/app'
item_cate_fmt = 'https://www.wandoujia.com/wdjweb/api/category/more?catId={}&subCatId={}&page={}&ctoken=ZnrB6v38kAfy6a1GyghJGGtM'
out_file = '{}/wandou2.json'.format(get_root_path())

root_url = 'https://www.wandoujia.com'
cate_url = 'https://www.wandoujia.com/category/'


class Wandou2Spider(scrapy.Spider):
    name = 'wandou2'
    num = 1
    lock = threading.Lock()

    def start_requests(self):
        yield FormRequest(start_cate_url, callback=self.parse_cate)

    def parse_cate(self, response):
        data = []
Example #14
@author: baoqiang
@time: 2018/11/28 10:05 PM
"""

import requests
import json

from xiaoscript import config
import pandas as pd

ZIROOM = 'ziroom'

keywords = ['来广营', '东湖渠', '望京']

root_path = '/Users/baoqiang/Downloads/'
out_file = '{}/ziru3.json'.format(config.get_root_path())


def run():
    for keyword in keywords:
        print('process {}'.format(keyword))
        run_item(keyword)


def run_item(keyword):
    datas = []

    for i in range(10, 10001, 10):
        # for i in range(10, 30, 10):
        payload = {'step': i, 'key_word': keyword}
        res = requests.post('http://m.ziroom.com/list/ajax-get-data',
Example #15
#!/usr/bin/env python
# encoding: utf-8
"""
@description: Nuanfang (rental-listings crawler)

@author: pacman
@time: 2017/11/1 15:01
"""

import scrapy
from scrapy.http import FormRequest
import json
from xiaoscript.config import get_root_path

out_file = '{}/nuanfang.json'.format(get_root_path())
out_file2 = '{}/nuanfang.txt'.format(get_root_path())

headers = {
    'User-Agent':
    'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13E234 MicroMessenger/6.5.20 NetType/WIFI Language/zh_CN'
}


class NuanfangSpider(scrapy.Spider):
    name = 'nuanfang'

    def start_requests(self):
        for i in range(1, 1001):
            # for i in range(1, 3):
            url = url_fmt.format(i)
            yield FormRequest(url, headers=headers, callback=self.parse_cate)
Example #16
@time: 2018/10/19 12:10 PM
"""

import json
import re
import urllib.parse
import logging

import scrapy
from scrapy import Request

from xiaoscript import config

start_url = 'https://www.dankegongyu.com/room/bj'

out_file = '{}/danke.json'.format(config.get_root_path())

id_pat = re.compile(r'https://www.dankegongyu.com/room/([\d]+)\.html')
id_pat2 = re.compile(r'https://www.dankegongyu.com/duanzu/([\d]+)\.html')


class DankeSpider(scrapy.Spider):
    name = 'danke'

    def start_requests(self):
        yield Request(start_url, callback=self.parse_area)

    def parse_area(self, response):
        filter_div = './/div[@class="filter_options"]/dl[contains(@class,"area")]/dd/div[@class="option_list"]/div[@class="area-ls-wp"]'
        classes = response.selector.xpath(filter_div)
Example #17
"""
@description: Jike crawler

@author: baoqiang
@time: 2018/12/18 12:45 PM
"""
import json

import scrapy
from scrapy import FormRequest
import sys

from xiaoscript.config import get_root_path

out_file = '{}/jike.json'.format(get_root_path())

url_fmt = 'https://app.jike.ruguoapp.com/1.0/topics/listSimilarTopics?id={}'
web_fmt = 'https://web.okjike.com/topic/{}/official'
app_fmt = 'http://m.jike.ruguoapp.com/topics/{}'


class JikeSpider(scrapy.Spider):
    name = 'jike'
    cnt = 0
    processed_set = set()

    def start_requests(self):
        with open('../data/jike.txt') as f:
            for line in f:
                url = url_fmt.format(line.strip())