Beispiel #1
0
def _get_rank_by_item(item_name, url):
    """Fetch the rank table for one item and split it into rows of fields.

    Returns (item_name, rows) where rows is a list of lists of strings,
    the first row being the column headers.
    """
    print("crawling {} rank".format(item_name))
    # The first ".list_tab " element carries the whole table as
    # newline-separated text: rows split on '\n\n\n', fields on '\n' or ' '.
    tabs = load_html_to_bs(urls.get('baseUrl') + url).select(".list_tab ")
    raw_table = tabs[0].text.strip('\n\n')
    rows = []
    for row_text in raw_table.split('\n\n\n'):
        rows.append(re.split('\n| ', row_text))
    return item_name, rows
Beispiel #2
0
def get_trend_data_concurrent():
    """Crawl trend data for every (date range, province, item) combination.

    Fans the requests out over a thread pool (the work is network-bound) and
    assembles the responses into ``res[date][province][item] -> parsed JSON``.

    Returns:
        (res, 'trend') — the nested result dict plus a tag for the caller.
    """
    res = {}
    bs = load_html_to_bs(urls.get('trend'))
    # Province list is newline-separated text; drop the trailing "更多"
    # ("more") link before splitting.
    provinces = bs.select("#pri_province")[0].text.strip('\n').replace(
        "更多\n", "").split('\n')
    # Map chart label -> pid value used by the trend-data endpoint.
    items = {}
    for item in bs.select(".charts_type > div"):
        items[item.text] = item.attrs.get('value')
    dateType = {'month': 30, 'year': 365}
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(_get_data, date, dateType, province, k, v)
            for date in dateType
            for province in provinces
            for k, v in items.items()
        ]
        for future in as_completed(futures):
            date, province, k, result = future.result()
            # setdefault replaces the clobber-prone get()/assign dance and
            # cannot mis-handle falsy-but-present keys.
            res.setdefault(date, {}).setdefault(province, {})[k] = result
    return res, 'trend'
Beispiel #3
0
def _get_data(date, dateType, province, k, v):
    """Request one trend series; returns (date, province, k, parsed_json)."""
    print("crawling {} {} {}".format(date, province, k))
    # The site expects the province name GBK-encoded and percent-escaped.
    quoted_province = parse.quote(province.encode('gbk'))
    url = urls.get('trend_data').format(province=quoted_province,
                                        days=dateType.get(date),
                                        pid=v)
    res = bot.get(url)
    return date, province, k, json.loads(res.text)
Beispiel #4
0
def get_pig_rank():
    """Crawl the rank table for every item linked from the rank index page.

    Returns:
        (res, 'rank') — res maps item name -> parsed rank rows.
    """
    res = {}
    # Map link text -> href for every item on the index page.
    items = {}
    for i in load_html_to_bs(urls.get('rank_index')).select(".list_nav div a"):
        items[i.text] = i.attrs['href']

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(_get_rank_by_item, k, v) for k, v in items.items()
        ]
        # Collect results inside the with-block, matching
        # get_trend_data_concurrent. The original iterated as_completed()
        # after the block, which only worked because executor shutdown
        # blocks until all futures finish — fragile and inconsistent.
        for future in as_completed(futures):
            item, rows = future.result()
            res[item] = rows
    return res, 'rank'
Beispiel #5
0
def get_trend_data():
    """Crawl trend data sequentially for every (date, province, item) combo.

    Sequential counterpart of get_trend_data_concurrent; builds
    ``res[date][province][item] -> parsed JSON``.

    Returns:
        (res, 'trend') — the nested result dict plus a tag for the caller.
    """
    res = {}
    bs = load_html_to_bs(urls.get('trend'))
    # Province list is newline-separated text; drop the trailing "更多"
    # ("more") link before splitting.
    provinces = bs.select("#pri_province")[0].text.strip('\n').replace(
        "更多\n", "").split('\n')
    # Map chart label -> pid value used by the trend-data endpoint.
    items = {}
    for item in bs.select(".charts_type > div"):
        items[item.text] = item.attrs.get('value')
    dateType = {'month': 30, 'year': 365}

    for date in dateType:
        for province in provinces:
            for k, v in items.items():
                # Unpack only the payload. The original rebound date,
                # province and k from the return value — harmless only
                # because _get_data echoes them back, and a latent bug if
                # that ever changes.
                *_, result = _get_data(date, dateType, province, k, v)
                res.setdefault(date, {}).setdefault(province, {})[k] = result
    return res, 'trend'
Beispiel #6
0
import sys
sys.path.append("./")
from spider.request_factory import bot, urls, get_input
from bs4 import BeautifulSoup
from urllib import parse
from pprint import pprint
import json
import pandas as pd
import matplotlib.pyplot as plt

# Product categories and the provinces/municipalities the site covers.
category = ['外三元', '内三元', '土杂猪', '玉米', '豆粕']
cities = [
    '北京市', '上海市', '天津市', '重庆市', '广东省', '福建省', '浙江省', '江苏省', '山东省', '辽宁省',
    '江西省', '四川省', '陕西省', '湖北省', '河南省', '河北省', '山西省', '内蒙古',
]

res = bot.get(url=urls.get('trend'))
bs = BeautifulSoup(res.text, "html.parser")
# Province list is newline-separated text; drop the trailing "更多" link.
provinces = bs.select("#pri_province")[0].text.strip('\n').replace(
    "更多\n", "").split('\n')
dateType = {'month': 30, 'year': 365}
# days = get_input(dateType, 'dateType')  # interactive pick, hard-coded below
days = 'year'

# Map chart label -> pid value used by the trend-data endpoint.
items = {div.text: div.attrs.get('value')
         for div in bs.select(".charts_type > div")}

# province = get_input(provinces, 'province')  # interactive pick, hard-coded below
province = cities[0]

# item = get_input(items, 'item')
Beispiel #7
0
import sys
sys.path.append("./")
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from spider.request_factory import load_html_to_bs, urls, get_input
import re

# Product categories to chart, one subplot per category.
category = ['外三元', '内三元', '土杂猪', '玉米', '豆粕']

# Map link text -> href for every item on the rank index page.
items = {}
for i in load_html_to_bs(urls.get('rank_index')).select(".list_nav div a"):
    items[i.text] = i.attrs['href']

# SimHei so Chinese labels render; keep the minus sign drawable alongside it.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig, axes = plt.subplots(5, 1, figsize=(12, 50))
num = 0
for cg in category:
    item = get_input(items, 'item', cg)
    # Rank table comes as newline-separated text inside ".list_tab ":
    # rows split on '\n\n\n', fields on '\n' or ' '.
    a = load_html_to_bs(urls.get('baseUrl') +
                        items.get(item)).select(".list_tab ")
    result = [
        re.split('\n| ', i) for i in a[0].text.strip('\n\n').split('\n\n\n')
    ]
    # First row is the header; remaining rows are the data.
    df = pd.DataFrame(result[1:], columns=result[0])
    # '11-13' / '11-12' look like MM-DD price columns — TODO confirm against
    # the site's current table layout before reusing.
    df['11-13'] = df['11-13'].apply(float)
    df['11-12'] = df['11-12'].apply(float)
    df['排名'] = df['排名'].apply(float)
    df = df.sort_values(by='排名', ascending=False)
    df['排名'] = df['排名'].apply(str)
    # NOTE(review): fig/axes/num are set up but never used here — the
    # plotting tail of this loop appears to be truncated in this excerpt.
Beispiel #8
0
import sys
sys.path.append("./")
from spider.request_factory import bot, urls, get_input
from bs4 import BeautifulSoup
from urllib import parse
from pprint import pprint
import json
import pandas as pd
import matplotlib.pyplot as plt
#category = ['外三元', '内三元', '土杂猪', '玉米', '豆粕','白条肉']
# pid values — presumably matching the category names in the comment above
# one-for-one; verify against the site's chart controls.
category = ['19', '22', '20', '8', '9','10']
cities = ['北京市', '上海市', '天津市', '重庆市', '广东省', '福建省', '浙江省', '江苏省', '山东省', '辽宁省', '江西省', '四川省', '陕西省', '湖北省', '河南省', '河北省', '山西省', '内蒙古']

res = bot.get(url=urls.get('trend'))
bs = BeautifulSoup(res.text, "html.parser")
# Province list is newline-separated text; drop the trailing "更多" link.
provinces = bs.select("#pri_province")[0].text.strip('\n').replace("更多\n", "").split('\n')

# Supported date ranges in days.
dateType = {
    'month': 30,
    'year': 365
}

days = 'year'

province = cities[0]

#item = category[0]
# The endpoint wants the province name GBK-encoded then percent-escaped.
# NOTE(review): res1 is never used in this excerpt — the script appears to
# be truncated here.
res1 = bot.get(urls.get('trend_data').format(province=parse.quote(province.encode('gbk')), days=dateType.get(days),
                                            pid=category[0]),
              )