import time
from collections import OrderedDict, defaultdict

import requests
import traceback

from xiaoscript.config import get_root_path

try:
    from github import Github
    import matplotlib.pyplot as plt
except ImportError:
    pass

# plt.rcParams['font.sans-serif'] = ['SimHei']

ps_file = '{}/ps.txt'.format(get_root_path())
out_file = '{}/github.json'.format(get_root_path())
out_file2 = 'C:\\Users\\xiaobao\\Desktop\\github-lang.txt'

start = 1000
step = 50
end = 10000

focus_keys = [
    'id', 'url', 'name', 'description', 'language', 'forks', 'stars',
    'created_at', 'updated_at', 'full_name'
]


def run():
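# The body of run() is not part of this fragment. A minimal sketch of what it might
# look like given the constants above: page through GitHub repository search in star
# buckets of `step`, since a single search query is capped at 1000 results. The
# token-file format, the bucket query, and the helper name are assumptions, not
# taken from the original file.
def run_sketch():
    with open(ps_file) as f:
        token = f.read().strip()  # assumed: ps.txt holds a personal access token

    g = Github(token)
    for lo in range(start, end, step):
        # one search query per star bucket, e.g. 'stars:1000..1049'
        repos = g.search_repositories(query='stars:{}..{}'.format(lo, lo + step - 1))
        for repo in repos:
            print(repo.full_name, repo.language, repo.stargazers_count)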
#!/usr/bin/env python
# encoding: utf-8

"""
@description: crawler for studygolang.com, a Go-language community site
@author: baoqiang
@time: 2018/11/6 4:35 PM
"""

import scrapy
from scrapy.http import FormRequest
import json

from xiaoscript.config import get_root_path

out_file = '{}/study_go.json'.format(get_root_path())
url_fmt = 'https://studygolang.com/articles?p={}'
root_url = 'https://studygolang.com'


class StudyGoSpider(scrapy.Spider):
    name = 'studygo'

    def start_requests(self):
        for i in range(1, 908):
            # for i in range(1, 3):
            url = url_fmt.format(i)
            yield FormRequest(url, callback=self.parse_cate)

    def parse_cate(self, response):
        datas = []
""" @description: 小米的主题爬虫 @author: baoqiang @time: 2018/12/6 下午8:26 """ import scrapy from scrapy.http import FormRequest import json from xiaoscript.config import get_root_path import time url_fmt = 'http://zhuti.xiaomi.com/compound?page={}&sort=New' comment_fmt = 'http://zhuti.xiaomi.com/comment/listall/{}?page=0&t={}&status=3' out_file = '{}/miui.json'.format(get_root_path()) root_url = 'http://zhuti.xiaomi.com' class MiuiSpider(scrapy.Spider): name = 'miui' def start_requests(self): for i in range(1, 1225): # for i in range(1, 3): url = url_fmt.format(i) yield FormRequest(url, callback=self.parse_cate) def parse_cate(self, response): datas = []
# encoding: utf-8

"""
@description: crawler for Beijing attractions on TripAdvisor
@author: baoqiang
@time: 2019-05-07 20:50
"""

import scrapy
from scrapy import Request

from xiaoscript import config
import json

page_size = 30
url_fmt = 'https://www.tripadvisor.com.hk/Attractions-g294212-Activities-oa{}-Beijing.html'
out_file = '{}/bj_tour.json'.format(config.get_root_path())


class TripAdvisorSpider(scrapy.Spider):
    name = 'trip_advisor'

    def start_requests(self):
        for i in range(0, 55):
            # for i in range(1, 2):
            url = url_fmt.format(page_size * i)
            yield Request(url, callback=self.parse_page, meta={'page': i})

    def parse_page(self, response):
        meta = response.meta
"stand_ids": [287], "latitude": 0, "key_self": 0, "region_ids": [], "logicSort": "0", "plate_ids": [], "longitude": 0, "distance": "0", "update_time": 0, "ab_test": "A", "line_ids": [], "type_no": 0, "key": "" } out_file = '{}/hizhu.json'.format(config.get_root_path()) def run(): # for i in range(1, 30): for i in range(1, 20): req_body.update({'pageno': i}) resp = requests.post(url, json=req_body, verify=False, headers=headers) print(resp.status_code) resp_data = format_data(resp.json()) # print(resp_data) with open(out_file, 'a', encoding='utf-8') as fw: json.dump(resp_data, fw, ensure_ascii=False)
""" import random import sys from xiaoscript import config from threading import Lock import scrapy from scrapy.http import FormRequest import json import re start_url = 'https://www.douban.com/group/513717/discussion?start={}' id_pat = re.compile('https://www.douban.com/group/topic/([\\d]+)') out_file = '{}/douban_xiaozu.json'.format(config.get_root_path()) class DoubanXiaozuSpider(scrapy.Spider): name = 'douban_xiaozu_spider' lock = Lock() ids = set() def start_requests(self): # for i in range(0, 731): for i in range(0, 100): url = start_url.format(i * 25) headers.update({'X-Real-IP': get_random_ip()}) yield FormRequest(url, headers=headers, callback=self.parse_cate)
""" @description: 大众点评 北京密室 @author: baoqiang @time: 2019-06-30 12:52 """ import scrapy from scrapy import FormRequest from threading import Lock import json from xiaoscript import config url_fmt = 'http://www.dianping.com/beijing/ch30/g2754p{}' out_file = '{}/chamber.json'.format(config.get_root_path()) class DianpingChamberSpider(scrapy.Spider): name = 'dianping_chamber' lock = Lock() def start_requests(self): for i in range(1, 24): url = url_fmt.format(i) yield FormRequest(url, callback=self.parse_page, headers=headers) def parse_page(self, response): classes = response.selector.xpath('.//div[@class="content"]//ul/li')
""" import json import re import urllib.parse import logging import threading import scrapy from scrapy import Request from xiaoscript import config start_url = 'https://coolshell.cn/page/{}' out_file = '{}/coolshell.json'.format(config.get_root_path()) id_pat = re.compile('https://coolshell.cn/articles/(\d+).html') class CollShellSpider(scrapy.Spider): name = 'cool_shell' lock = threading.Lock() def start_requests(self): for i in range(1, 72): # for i in range(1, 3): url = start_url.format(i) yield Request(url, callback=self.parse_page)
@time: 2019-05-15 20:45
"""

import json
import scrapy
from scrapy import FormRequest
import sys
import re
import requests
import logging

from xconcurrent import threadpool
from xiaoscript.config import get_root_path

out_file = '{}/kaola.json'.format(get_root_path())
task_file = '{}/kaola_task.json'.format(get_root_path())
start_url = 'https://www.kaola.com'


class KaolaSpider(scrapy.Spider):
    name = 'kaola'
    cnt = 0
    processed_set = set()

    def start_requests(self):
        yield FormRequest(start_url, callback=self.parse_cate)

    def parse_cate(self, response):
        datas = []
@time: 2019-07-24 12:49
"""

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from xiaoscript import config
from threading import Lock
import json
import re
from scrapy import Request

start_url = 'https://bj.lianjia.com/xiaoqu/'
id_pat = re.compile(r'https://bj.lianjia.com/xiaoqu/(\d+)')
out_file = '{}/lianjia_xiaoqu.json'.format(config.get_root_path())


class LjXiaoquSpider(CrawlSpider):
    name = 'lj_xiaoqu_spider'
    start_urls = [
        start_url,
    ]

    links1 = LinkExtractor(allow=r'.*/xiaoqu/[a-z]+/(pg\d+){0,1}$')
    rules = (
        Rule(links1, callback='parse_cate', follow=True),
    )

    lock = Lock()
    ids = set()
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from xiaoscript import config
from threading import Lock
import json
import requests
import re

# books: the fiction category
start_url = 'https://list.jd.com/list.html?cat=1713,3258&page=1&delivery=1&sort=sort_rank_asc'
comment_url_fmt = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={}'
id_pat = re.compile(r'https://item.jd.com/(\d+).html')
out_file = '{}/jd_book.json'.format(config.get_root_path())


# https://club.jd.com/comment/productCommentSummaries.action?referenceIds=12178407
# crawl the item ids first; comments are back-filled later in a batched coroutine pass
class JdBookSpider(CrawlSpider):
    name = 'jdbook_spider'
    start_urls = [start_url, ]

    links1 = LinkExtractor(allow=r'.*cat=\d+,\d+$')
    links2 = LinkExtractor(allow=r'.*cat=\d+,\d+,\d+$')
    links3 = LinkExtractor(allow=r'.*cat=\d+,\d+,\d+&page=[\d]&sort=sort_rank_asc.*')

    rules = (
        Rule(links1, follow=True),
        Rule(links2, follow=True),
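# A sketch of the second pass described in the comment above: batch-fetch comment
# summaries for the collected item ids via `comment_url_fmt`, which accepts a
# comma-separated id list. The helper name, batch size, and response shape
# (CommentsCount / SkuId / CommentCount) are assumptions about this JD endpoint,
# not taken from the original file:
def fetch_comment_summaries(ids, batch_size=50):
    counts = {}
    for i in range(0, len(ids), batch_size):
        batch = ids[i:i + batch_size]  # ids are strings captured by id_pat
        resp = requests.get(comment_url_fmt.format(','.join(batch)))
        for item in resp.json().get('CommentsCount', []):
            counts[str(item['SkuId'])] = item.get('CommentCount')
    return counts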
#!/usr/bin/env python
# encoding: utf-8

"""
@description: process the Ziroom crawl data and work out which areas are the best value to rent in
@author: pacman
@time: 2018/3/2 17:29
"""

import json
from xiaoscript import config
import re

root_path = config.get_root_path()

floor_pat = re.compile(r'(\d+)/(\d+)层')
distance_pat = re.compile(r'(\d+)米')

processed_ids = set()


def process():
    not_print_key = True

    with open('{}/ziru2.json'.format(root_path), 'r', encoding='utf-8') as f, \
            open('{}/ziru2.txt'.format(root_path), 'w', encoding='utf-8') as fw:
        for idx, line in enumerate(f, start=1):
            line = line.strip()
            json_data = json.loads(line)
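# Illustration of what the two patterns above extract; the sample strings are made up
# but match the listing format they target ('层' = floor, '米' = metres):
#   floor_pat.search('6/18层').groups()    -> ('6', '18')   # floor 6 of 18
#   distance_pat.search('350米').groups()  -> ('350',)      # 350 m to the subway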
@author: baoqiang
@time: 2019/1/4 12:56 PM
"""

import scrapy
from scrapy.http import FormRequest
import json

from xiaoscript.config import get_root_path
import time
from scrapy.selector import Selector
import re
import threading

start_cate_url = 'https://www.wandoujia.com/category/app'
item_cate_fmt = 'https://www.wandoujia.com/wdjweb/api/category/more?catId={}&subCatId={}&page={}&ctoken=ZnrB6v38kAfy6a1GyghJGGtM'
out_file = '{}/wandou2.json'.format(get_root_path())
root_url = 'https://www.wandoujia.com'
cate_url = 'https://www.wandoujia.com/category/'


class Wandou2Spider(scrapy.Spider):
    name = 'wandou2'
    num = 1
    lock = threading.Lock()

    def start_requests(self):
        yield FormRequest(start_cate_url, callback=self.parse_cate)

    def parse_cate(self, response):
        data = []
@author: baoqiang
@time: 2018/11/28 10:05 PM
"""

import requests
import json

from xiaoscript import config
import pandas as pd

ZIROOM = 'ziroom'

keywords = ['来广营', '东湖渠', '望京']

root_path = '/Users/baoqiang/Downloads/'
out_file = '{}/ziru3.json'.format(config.get_root_path())


def run():
    for keyword in keywords:
        print('process {}'.format(keyword))
        run_item(keyword)


def run_item(keyword):
    datas = []

    for i in range(10, 10001, 10):
        # for i in range(10, 30, 10):
        payload = {'step': i, 'key_word': keyword}
        res = requests.post('http://m.ziroom.com/list/ajax-get-data',
                            data=payload)  # assumed: the truncated call posts the payload built above
#!/usr/bin/env python
# encoding: utf-8

"""
@description: Nuanfang (rental-listing app) crawler
@author: pacman
@time: 2017/11/1 15:01
"""

import scrapy
from scrapy.http import FormRequest
import json

from xiaoscript.config import get_root_path

out_file = '{}/nuanfang.json'.format(get_root_path())
out_file2 = '{}/nuanfang.txt'.format(get_root_path())

headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13E234 MicroMessenger/6.5.20 NetType/WIFI Language/zh_CN'
}


class NuanfangSpider(scrapy.Spider):
    name = 'nuanfang'

    def start_requests(self):
        for i in range(1, 1001):
            # for i in range(1, 3):
            url = url_fmt.format(i)
            yield FormRequest(url, headers=headers, callback=self.parse_cate)
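# `url_fmt` is used in start_requests but not defined in this fragment; the real
# endpoint is not recoverable from the excerpt, so this placeholder is purely an
# assumption to make the sketch self-contained:
url_fmt = 'https://example.com/nuanfang/list?page={}'  # hypothetical paged-list endpoint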
@time: 2018/10/19 12:10 PM
"""

import json
import re
import urllib.parse
import logging

import scrapy
from scrapy import Request

from xiaoscript import config

start_url = 'https://www.dankegongyu.com/room/bj'
out_file = '{}/danke.json'.format(config.get_root_path())
id_pat = re.compile(r'https://www.dankegongyu.com/room/(\d+).html')
id_pat2 = re.compile(r'https://www.dankegongyu.com/duanzu/(\d+).html')


class DankeSpider(scrapy.Spider):
    name = 'danke'

    def start_requests(self):
        yield Request(start_url, callback=self.parse_area)

    def parse_area(self, response):
        filter_div = './/div[@class="filter_options"]/dl[contains(@class,"area")]/dd/div[@class="option_list"]/div[@class="area-ls-wp"]'
        classes = response.selector.xpath(filter_div)
""" @description: 即刻爬虫 @author: baoqiang @time: 2018/12/18 下午12:45 """ import json import scrapy from scrapy import FormRequest import sys from xiaoscript.config import get_root_path out_file = '{}/jike.json'.format(get_root_path()) url_fmt = 'https://app.jike.ruguoapp.com/1.0/topics/listSimilarTopics?id={}' web_fmt = 'https://web.okjike.com/topic/{}/official' app_fmt = 'http://m.jike.ruguoapp.com/topics/{}' class JikeSpider(scrapy.Spider): name = 'jike' cnt = 0 processed_set = set() def start_requests(self): with open('../data/jike.txt') as f: for line in f: url = url_fmt.format(line.strip())