def _get_rank_by_item(item_name, url):
    """Fetch one item's ranking page and parse its table.

    Returns (item_name, rows): rows is a list of token lists, one per
    table row, split on newlines/spaces from the first ".list_tab " node.
    """
    print("crawling {} rank".format(item_name))
    page = load_html_to_bs(urls.get('baseUrl') + url)
    table = page.select(".list_tab ")[0]
    raw_rows = table.text.strip('\n\n').split('\n\n\n')
    result = []
    for raw in raw_rows:
        result.append(re.split('\n| ', raw))
    return item_name, result
def get_trend_data_concurrent():
    """Crawl price-trend series for every (period, province, item) combo.

    Fans out one request per combination over a 10-thread pool and
    assembles the results as they complete.

    Returns (res, 'trend') where res[date][province][item] is the parsed
    JSON series for that combination.
    """
    bs = load_html_to_bs(urls.get('trend'))
    provinces = bs.select("#pri_province")[0].text.strip('\n').replace(
        "更多\n", "").split('\n')
    # item label -> numeric pid used by the trend-data endpoint
    items = {div.text: div.attrs.get('value')
             for div in bs.select(".charts_type > div")}
    dateType = {'month': 30, 'year': 365}
    res = {}
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(_get_data, date, dateType, province, name, pid)
            for date in dateType
            for province in provinces
            for name, pid in items.items()
        ]
        for future in as_completed(futures):
            date, province, name, series = future.result()
            res.setdefault(date, {}).setdefault(province, {})[name] = series
    return res, 'trend'
def _get_data(date, dateType, province, k, v):
    """Request one trend series and echo the call context back.

    The remote endpoint expects the province name GBK-encoded and then
    URL-quoted. Returns (date, province, k, parsed_json).
    """
    print("crawling {} {} {}".format(date, province, k))
    quoted_province = parse.quote(province.encode('gbk'))
    url = urls.get('trend_data').format(
        province=quoted_province,
        days=dateType.get(date),
        pid=v,
    )
    res = bot.get(url)
    return date, province, k, json.loads(res.text)
def get_pig_rank():
    """Crawl the ranking table for every item linked on the rank index page.

    Returns (ranks, 'rank') where ranks maps item name -> parsed table rows
    as produced by _get_rank_by_item.
    """
    items = {}
    for link in load_html_to_bs(urls.get('rank_index')).select(".list_nav div a"):
        items[link.text] = link.attrs['href']
    ranks = {}
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(_get_rank_by_item, name, href)
            for name, href in items.items()
        ]
        for future in as_completed(futures):
            name, table = future.result()
            ranks[name] = table
    return ranks, 'rank'
def get_trend_data():
    """Sequentially crawl price-trend series for every (period, province, item).

    Serial counterpart of get_trend_data_concurrent: one request at a time.

    Returns (res, 'trend') where res[date][province][item] is the parsed
    JSON series for that combination.
    """
    res = {}
    bs = load_html_to_bs(urls.get('trend'))
    provinces = bs.select("#pri_province")[0].text.strip('\n').replace(
        "更多\n", "").split('\n')
    # item label -> numeric pid used by the trend-data endpoint
    items = {}
    for item in bs.select(".charts_type > div"):
        items[item.text] = item.attrs.get('value')
    dateType = {'month': 30, 'year': 365}
    for date in dateType:
        for province in provinces:
            for k, v in items.items():
                # _get_data merely echoes date/province/k back; discard the
                # echoes instead of rebinding the active loop variables
                # (the original did, which would silently desync iteration
                # if the callee ever returned altered values).
                _, _, _, result = _get_data(date, dateType, province, k, v)
                res.setdefault(date, {}).setdefault(province, {})[k] = result
    return res, 'trend'
# Script: interactive setup for crawling one province's price trend.
import sys
sys.path.append("./")  # make the local "spider" package importable when run as a script
from spider.request_factory import bot, urls, get_input
from bs4 import BeautifulSoup
from urllib import parse
from pprint import pprint
import json
import pandas as pd
import matplotlib.pyplot as plt

# Price categories of interest (pig grades plus corn / soybean meal feed).
category = ['外三元', '内三元', '土杂猪', '玉米', '豆粕']
# Provinces/municipalities whose trends we care about.
cities = [
    '北京市', '上海市', '天津市', '重庆市', '广东省', '福建省', '浙江省', '江苏省', '山东省', '辽宁省',
    '江西省', '四川省', '陕西省', '湖北省', '河南省', '河北省', '山西省', '内蒙古'
]
# Fetch the trend landing page and scrape the available provinces from it.
res = bot.get(url=urls.get('trend'))
bs = BeautifulSoup(res.text, "html.parser")
provinces = bs.select("#pri_province")[0].text.strip('\n').replace(
    "更多\n", "").split('\n')
# Lookback windows (label -> number of days) accepted by the endpoint.
dateType = {'month': 30, 'year': 365}
# days = get_input(dateType, 'dateType')
days = 'year'  # interactive prompt disabled; default to the one-year window
# item label -> numeric pid used by the trend-data endpoint
items = {}
for item in bs.select(".charts_type > div"):
    items[item.text] = item.attrs.get('value')
# province = get_input(provinces, 'province')
province = cities[0]  # interactive prompt disabled; default to Beijing
# item = get_input(items, 'item')
# Script: crawl the ranking table for each price category and prepare
# per-category DataFrames for plotting.
import sys
sys.path.append("./")  # make the local "spider" package importable when run as a script
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from spider.request_factory import load_html_to_bs, urls, get_input
import re

# Price categories of interest (pig grades plus corn / soybean meal feed).
category = ['外三元', '内三元', '土杂猪', '玉米', '豆粕']
# item label -> relative URL of its ranking page, scraped from the index.
items = {}
for i in load_html_to_bs(urls.get('rank_index')).select(".list_nav div a"):
    items[i.text] = i.attrs['href']
# Use a CJK-capable font so Chinese labels render; keep the minus sign ASCII.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig, axes = plt.subplots(5, 1, figsize=(12, 50))  # one subplot per category
num = 0  # subplot index; presumably incremented later — loop body looks truncated here
for cg in category:
    # Resolve the category label to a concrete item key (cg passed as default).
    item = get_input(items, 'item', cg)
    a = load_html_to_bs(urls.get('baseUrl') +
                        items.get(item)).select(".list_tab ")
    # Split the table text into rows, then each row into tokens.
    result = [
        re.split('\n| ', i) for i in a[0].text.strip('\n\n').split('\n\n\n')
    ]
    # First row is the header; the rest are data rows.
    df = pd.DataFrame(result[1:], columns=result[0])
    # NOTE(review): '11-13'/'11-12' are date-specific column headers baked in
    # from one crawl day — this breaks on other days; confirm against the site.
    df['11-13'] = df['11-13'].apply(float)
    df['11-12'] = df['11-12'].apply(float)
    df['排名'] = df['排名'].apply(float)
    df = df.sort_values(by='排名', ascending=False)
    df['排名'] = df['排名'].apply(str)
    # NOTE(review): plotting into axes presumably follows; remainder of the
    # loop body is outside this chunk.
# Script: fetch one raw trend-data response for the first category/province.
import sys
sys.path.append("./")  # make the local "spider" package importable when run as a script
from spider.request_factory import bot, urls, get_input
from bs4 import BeautifulSoup
from urllib import parse
from pprint import pprint
import json
import pandas as pd
import matplotlib.pyplot as plt

#category = ['外三元', '内三元', '土杂猪', '玉米', '豆粕','白条肉']
# Numeric pids for the endpoint, in the same order as the labels above.
category = ['19', '22', '20', '8', '9','10']
# Provinces/municipalities whose trends we care about.
cities = ['北京市', '上海市', '天津市', '重庆市', '广东省', '福建省', '浙江省', '江苏省', '山东省', '辽宁省', '江西省', '四川省', '陕西省', '湖北省', '河南省', '河北省', '山西省', '内蒙古']
# Fetch the trend landing page and scrape the available provinces from it.
res = bot.get(url=urls.get('trend'))
bs = BeautifulSoup(res.text, "html.parser")
provinces = bs.select("#pri_province")[0].text.strip('\n').replace("更多\n", "").split('\n')
# Lookback windows (label -> number of days) accepted by the endpoint.
dateType = {
    'month': 30,
    'year': 365
}
days = 'year'  # default to the one-year window
province = cities[0]  # default to Beijing
#item = category[0]
# The endpoint expects the province name GBK-encoded then URL-quoted.
res1 = bot.get(urls.get('trend_data').format(province=parse.quote(province.encode('gbk')), days=dateType.get(days), pid=category[0]), )