Esempio n. 1
0
def _get_rank_by_item(item_name, url):
    """Download one rank page and split its table text into rows of cells.

    Args:
        item_name: Label of the item; used in the progress message and
            returned unchanged so callers can match results to requests.
        url: Path that is appended to ``urls['baseUrl']`` to form the
            address of the page to load.

    Returns:
        A tuple ``(item_name, rows)`` where ``rows`` is a list of lists of
        strings, one inner list per table row.

    Note:
        Only the first element matching ``".list_tab "`` is read; an empty
        selection is not handled here.
    """
    print("crawling {} rank".format(item_name))

    page = load_html_to_bs(urls.get('baseUrl') + url)
    tables = page.select(".list_tab ")
    table_text = tables[0].text.strip('\n\n')

    rows = []
    # Rows are separated by a triple newline; cells within a row by a
    # single newline or a space.
    for row_text in table_text.split('\n\n\n'):
        rows.append(re.split('\n| ', row_text))
    return item_name, rows
Esempio n. 2
0
def get_trend_data_concurrent():
    """Collect trend data for every date range, province and item in parallel.

    The trend page is loaded once to discover the province names and the
    chart items (display text -> ``value`` attribute). One ``_get_data``
    task is then submitted to a 10-worker thread pool for each
    (date range, province, item) combination.

    Returns:
        A tuple ``(trend, 'trend')`` where ``trend`` is a nested dict indexed
        as ``trend[date][province][item]`` using the keys that ``_get_data``
        returns alongside each result.
    """
    soup = load_html_to_bs(urls.get('trend'))

    # The province selector's text is newline separated and contains a
    # "更多\n" entry that is removed before splitting.
    province_text = soup.select("#pri_province")[0].text
    provinces = province_text.strip('\n').replace("更多\n", "").split('\n')

    items = {
        node.text: node.attrs.get('value')
        for node in soup.select(".charts_type > div")
    }
    # The whole mapping is handed to _get_data together with the chosen key.
    date_types = {'month': 30, 'year': 365}

    trend = {}
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = []
        for date_key in date_types:
            for province_name in provinces:
                for item_name, item_value in items.items():
                    job = pool.submit(_get_data, date_key, date_types,
                                      province_name, item_name, item_value)
                    pending.append(job)

        # Results are merged in completion order, not submission order.
        for finished in as_completed(pending):
            date_key, province_name, item_name, payload = finished.result()
            by_province = trend.setdefault(date_key, {})
            by_item = by_province.setdefault(province_name, {})
            by_item[item_name] = payload
    return trend, 'trend'
Esempio n. 3
0
def get_pig_rank():
    """Crawl the rank table of every item listed on the rank index page.

    The index page's navigation links (``.list_nav div a``) provide the
    item names and their relative URLs; each one is fetched through
    ``_get_rank_by_item`` on a 10-worker thread pool.

    Returns:
        A tuple ``(ranks, 'rank')`` where ``ranks`` maps each item name to
        the rows returned by ``_get_rank_by_item``.
    """
    index_page = load_html_to_bs(urls.get('rank_index'))
    links = {
        anchor.text: anchor.attrs['href']
        for anchor in index_page.select(".list_nav div a")
    }

    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = []
        for name, href in links.items():
            pending.append(pool.submit(_get_rank_by_item, name, href))

    # Leaving the ``with`` block waits for all submitted tasks, so every
    # future has already finished by the time it is read here.
    ranks = {}
    for finished in as_completed(pending):
        name, rows = finished.result()
        ranks[name] = rows
    return ranks, 'rank'
Esempio n. 4
0
def get_trend_data():
    """Collect trend data for every date range, province and item, serially.

    The trend page is loaded once to discover the province names and the
    chart items (display text -> ``value`` attribute). ``_get_data`` is then
    called once per (date range, province, item) combination.

    Returns:
        A tuple ``(res, 'trend')`` where ``res`` is a nested dict indexed as
        ``res[date][province][item]`` using the keys that ``_get_data``
        returns alongside each result.
    """
    res = {}
    bs = load_html_to_bs(urls.get('trend'))
    # The province selector's text is newline separated and contains a
    # "更多\n" entry that is removed before splitting.
    provinces = bs.select("#pri_province")[0].text.strip('\n').replace(
        "更多\n", "").split('\n')
    items = {
        item.text: item.attrs.get('value')
        for item in bs.select(".charts_type > div")
    }
    dateType = {'month': 30, 'year': 365}

    for date in dateType:
        for province in provinces:
            for k, v in items.items():
                # Keep the returned keys in their own names. Unpacking them
                # into ``date``/``province``/``k`` would overwrite the loop
                # variables, so later inner iterations would pass whatever
                # _get_data returned instead of the values being iterated.
                res_date, res_province, res_item, result = _get_data(
                    date, dateType, province, k, v)
                res.setdefault(res_date, {}).setdefault(
                    res_province, {})[res_item] = result
    return res, 'trend'
Esempio n. 5
0
import sys
sys.path.append("./")
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from spider.request_factory import load_html_to_bs, urls, get_input
import re

# Item names to process; each one is passed to get_input() in the loop below.
category = ['外三元', '内三元', '土杂猪', '玉米', '豆粕']

# Map the text of every navigation link on the rank index page to its href.
items = {}
for i in load_html_to_bs(urls.get('rank_index')).select(".list_nav div a"):
    items[i.text] = i.attrs['href']

# Use the SimHei font for sans-serif text and disable the Unicode minus sign
# (presumably so that Chinese labels and negative ticks render correctly).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# A 5-row, 1-column grid of subplots (matches the five entries in `category`).
fig, axes = plt.subplots(5, 1, figsize=(12, 50))
# Counter that is not used in the visible part of the loop
# (presumably the subplot index further down; confirm).
num = 0
for cg in category:
    # NOTE(review): get_input is defined in spider.request_factory; it is
    # presumably resolving `cg` to a key of `items` -- confirm.
    item = get_input(items, 'item', cg)
    a = load_html_to_bs(urls.get('baseUrl') +
                        items.get(item)).select(".list_tab ")
    # Split the first matched table's text into rows (triple newline) and
    # then into cells (single newline or space).
    result = [
        re.split('\n| ', i) for i in a[0].text.strip('\n\n').split('\n\n\n')
    ]
    # The first row supplies the column names, the remaining rows the data.
    df = pd.DataFrame(result[1:], columns=result[0])
    # NOTE(review): '11-13' and '11-12' are hard-coded column names; they look
    # like dates taken from the page header at the time of writing -- confirm
    # they still exist before running.
    df['11-13'] = df['11-13'].apply(float)
    df['11-12'] = df['11-12'].apply(float)
    # Convert the rank column to float so the sort is numeric, sort in
    # descending order, then turn it back into strings.
    df['排名'] = df['排名'].apply(float)
    df = df.sort_values(by='排名', ascending=False)
    df['排名'] = df['排名'].apply(str)