def getIndex(keywords, endDate):
    """Crawl the index of ``keywords`` and save the 'all' records as CSV.

    The crawl window always starts at 2015-01-01 and ends at ``endDate``.
    Every record yielded by ``BaiduIndex.get_index()`` is converted with
    ``dict_to_df``; only converted records whose ``'type'`` entry equals
    ``'all'`` are kept. The result is written to a fixed CSV path.

    :param keywords: keywords, passed straight through to ``BaiduIndex``.
    :param endDate: end date, passed straight through to ``BaiduIndex``.
    :return: None; the output is the CSV file.
    """
    baidu_index = BaiduIndex(keywords, '2015-01-01', endDate)

    pieces = []
    for index in baidu_index.get_index():
        baidu_index2 = dict_to_df(index)
        # NOTE(review): this test only works if dict_to_df returns something
        # whose ['type'] lookup is a scalar -- confirm against dict_to_df.
        if baidu_index2['type'] == 'all':
            pieces.append(baidu_index2)

    # Concatenate once at the end: ``DataFrame.append`` was removed in
    # pandas 2.0, and appending inside the loop re-copied all accumulated
    # rows on every iteration.
    df = pd.concat(pieces) if pieces else pd.DataFrame()
    df.to_csv(
        'D:/coin/spider-BaiduIndex-master/spider-BaiduIndex-master/SearchIndex.csv',
        encoding='utf_8_sig')
def get_index():
    """Fetch index averages for a batch of pending words and store them.

    Selects up to five ``t_tail_word`` rows with ``extr_flag = '0'`` whose
    ``tail_word`` contains "php", sets ``extr_flag = '1'`` on up to five such
    rows, queries ``BaiduIndex`` for 2018-01-01..2019-04-01 and inserts one
    ``baidu_index`` row per word: status '0' with the all/pc/wise averages
    when the word has data, status '1' when it has none. Progress and the
    elapsed time are printed.

    :return: None
    """
    start_time = time.time()
    start_date, end_date = '2018-01-01', '2019-04-01'

    # NOTE(review): the select and the update each use their own
    # ``limit 5`` without an ORDER BY, so they are not guaranteed to touch
    # the same rows -- confirm this is acceptable.
    select_sql = "select `tail_word` from `t_tail_word` where extr_flag = '0' and tail_word like '%php%' limit 5"
    update_sql = "update `t_tail_word` set extr_flag = '1' where extr_flag = '0' and tail_word like '%php%' limit 5"

    db = mysql_operation.getcon()
    try:
        results = mysql_operation.baseselect(db, select_sql)
        mysql_operation.baseoperation(db, update_sql)

        search_list = [row[0] for row in results]
        if search_list:  # the query returned at least one word
            baidu_index = BaiduIndex(search_list, start_date, end_date)
            for item in search_list:
                word_result = baidu_index.result[item]
                print(item, word_result['all'])
                if len(word_result['all']) > 0:  # the word has index data
                    all_avg = index_avg(word_result, 'all')
                    pc_avg = index_avg(word_result, 'pc')
                    wise_avg = index_avg(word_result, 'wise')
                    print(all_avg, pc_avg, wise_avg)
                    # NOTE(review): values are interpolated straight into the
                    # SQL text, so a word containing a quote breaks the
                    # statement. Use parameter binding if
                    # mysql_operation.baseoperation supports it.
                    insert_sql = (
                        "INSERT INTO `baidu_index` "
                        "(`word`, `start_time`, `end_time`, `status`, `all_avg`, `pc_avg`, `wise_avg`) "
                        "VALUES ('%s', '%s', '%s', '0', '%s','%s','%s')"
                    ) % (item, start_date, end_date, all_avg, pc_avg, wise_avg)
                else:  # the word is not indexed
                    insert_sql = (
                        "insert into `baidu_index` (`word`, `start_time`, `end_time`, `status`)"
                        "values ('%s', '%s', '%s', '1')"
                    ) % (item, start_date, end_date)
                mysql_operation.baseoperation(db, insert_sql)
    finally:
        # Always release the connection, even if the crawl or an insert fails.
        mysql_operation.closecon(db)

    print("times : ", time.time() - start_time)
def main(keywords_list, start_date, end_date):
    """
    Crawler entry point: fetch the 'all' index of each keyword and store it.

    :param keywords_list: keywords to crawl -->list
    :param start_date: start date -->str
    :param end_date: end date -->str
    :return: None; every (keyword, index, date) tuple is passed to save_to_sql
    """
    for keyword in keywords_list:
        # One BaiduIndex request per keyword. The crawler object gets its own
        # name so it is no longer rebound to an index value inside the loop.
        baidu_index = BaiduIndex(keyword, start_date, end_date)
        for record in baidu_index.result[keyword]["all"]:
            save_to_sql((keyword, record["index"], record["date"]))
# -*- coding: utf-8 -*-
"""
Created on 2019/2/15 16:37
@Author: Johnson
@Email:[email protected]
@File: demo.py
"""
from get_index import BaiduIndex

if __name__ == "__main__":
    # At most 5 keywords can be requested at once.

    # Show the code tables for cities and provinces.
    print(BaiduIndex.city_code)
    print(BaiduIndex.province_code)

    # NOTE(review): 901 is presumably an area code from the tables printed
    # above -- confirm against the BaiduIndex signature.
    query_words = ['张艺兴', 'lol', '极限挑战', '吃鸡']
    baidu_index = BaiduIndex(query_words, '2018-12-25', '2019-02-14', 901)

    # Iterate the 'all' records of a single keyword.
    for record in baidu_index('lol', 'all'):
        print(record)

    # All data of all requested keywords.
    result = baidu_index.result
    print(result)

    # All data of one keyword, then its mobile ('wise') and pc parts.
    one_keyword = result['极限挑战']
    print(one_keyword)
    print(one_keyword['wise'])
    print(one_keyword['pc'])
from get_index import BaiduIndex

if __name__ == "__main__":
    # Crawl six keywords for 2018-01-01..2019-05-02 and print every record
    # yielded by the crawler.
    search_terms = ['爬虫', 'lol', '张艺兴', '人工智能', '华为', '武林外传']
    crawler = BaiduIndex(search_terms, '2018-01-01', '2019-05-02')
    for record in crawler.get_index():
        print(record)
import pandas as pd
from key_word import *
import datetime
import time
from get_index import BaiduIndex

if __name__ == "__main__":
    # Time the whole run; the elapsed seconds are printed at the end.
    began_at = time.time()
    date_stamp = datetime.datetime.now().strftime('%Y-%m-%d')

    # getkeywords is not imported by name here; presumably it comes from the
    # key_word star import. Only the first 500 keywords are crawled.
    stock_words = getkeywords('stockname.csv')[:500]

    crawler = BaiduIndex(stock_words)
    crawler.run(10)  # the argument is how many threads to use

    # Save the collected index under a file name prefixed with today's date.
    collected = crawler.get_index()
    collected.to_csv(f'{date_stamp}-index.csv')

    print(time.time() - began_at)
from get_index import BaiduIndex

if __name__ == "__main__":
    # At most 5 keywords per request.
    # Default call: crawl the data of all prefecture-level cities nationwide.
    landmarks = ['故宫', '天坛', '长城', '天安门', '颐和园']
    baidu_index = BaiduIndex(landmarks, '2017-09-30', '2018-10-01')

    # Alternative: crawl the prefecture-level cities of the given province codes.
    # province = ["931", "933", "934"]
    # baidu_index = BaiduIndex(['故宫', '天坛'], '2017-09-30', '2018-10-01', 'province', province)

    # Alternative: crawl only the given city codes.
    # city = ["678", "691"]
    # baidu_index = BaiduIndex(['故宫', '天坛'], '2017-09-30', '2018-10-01', 'city', city)
if __name__ == "__main__":
    # To write the i-th file, set i = i - 1 and start = i * 10.
    start = 1870
    i = 187

    # Crawl window, identical for every batch.
    start_time = '2011-01-01'
    end_time = '2020-04-01'

    keywords = get_keywords(start)
    while keywords and i < 200:
        print(i)
        i += 1

        # Result frame: one row per calendar day, one column per keyword.
        resDf = pd.DataFrame(index=pd.date_range(start_time, end_time),
                             columns=keywords)

        baidu_index = BaiduIndex(keywords, start_time, end_time)
        for record in baidu_index.get_index():
            # Store the value under the first column for which
            # ``record['keyword'] in column`` holds; skip records that
            # match no column.
            column = next(
                (kw for kw in keywords if record['keyword'] in kw), None)
            if column is not None:
                resDf.loc[record['date'], column] = record['index']

        resDf.to_csv('./index/stkindex_' + str(i) + '.csv')

        # Move on to the next batch of keywords.
        start += 10
        keywords = get_keywords(start)
from get_index import BaiduIndex

if __name__ == "__main__":
    # At most 5 keywords per request.

    # Show the code tables for cities and provinces:
    # print(BaiduIndex.city_code)
    # print(BaiduIndex.province_code)

    query_words = ['张艺兴', 'lol', '极限挑战', 'python 免费空间']
    baidu_index = BaiduIndex(query_words, '2019-04-01', '2019-04-02')

    # for data in baidu_index('lol', 'all'):
    #     print(data)

    # All data of all requested keywords:
    # print(baidu_index.result)

    result = baidu_index.result

    # All data of one keyword.
    print(result['python 免费空间'])

    # The 'all' and pc parts of another keyword.
    one_keyword = result['极限挑战']
    print(one_keyword['all'])
    print(one_keyword['pc'])
from get_index import BaiduIndex
import requests
from config import COOKIES, PROVINCE_CODE, CITY_CODE

if __name__ == "__main__":
    # Test request (disabled):
    # url = "http://i.baidu.com"
    # wcookie = {"BDUSS":COOKIES}
    # HTML = requests.get(url,cookies=wcookie).content
    # print(HTML)

    # Real run: crawl one keyword and print every record the crawler yields.
    search_terms = ['猪肉']
    crawler = BaiduIndex(search_terms, '2019-08-01', '2020-01-21')
    for record in crawler.get_index():
        print(record)
from get_index import BaiduIndex

if __name__ == "__main__":
    # At most 5 keywords per request.
    query_words = ['张艺兴', 'lol', '极限挑战', '吃鸡']
    baidu_index = BaiduIndex(query_words, '2016-10-01', '2018-10-02')

    # Iterate the 'all' records of a single keyword.
    for record in baidu_index('lol', 'all'):
        print(record)

    # All data of all requested keywords.
    result = baidu_index.result
    print(result)

    # All data of one keyword, then its mobile ('wise') and pc parts.
    one_keyword = result['极限挑战']
    print(one_keyword)
    print(one_keyword['wise'])
    print(one_keyword['pc'])
# Earlier single-keyword version, kept for reference:
# baidu_index = BaiduIndex("无限极", '2018-12-13', '2019-02-15', 0)
#
# 无限极 = pd.DataFrame()
#
# for data in baidu_index('无限极', 'all'):
#     temp = pd.DataFrame(pd.Series(data)).T
#     无限极 = pd.concat([无限极, temp])
#
# # 无限极.index = range(len(无限极.shape))
# print(无限极)

# Crawl each keyword separately and stack the 'all' records of every keyword
# into one frame, tagging each row with the keyword it belongs to.
starttime = '2018-12-13'
endtime = '2019-02-20'

keyword_frames = []
for i in ["无限极", "直销", "保健品", "权健", "华林酸碱平"]:
    baidu_index = BaiduIndex(i, starttime, endtime, 0)

    # One single-row frame per record. Rows are collected first and
    # concatenated once, instead of re-copying the growing frame on every
    # record as repeated pd.concat inside the loop did.
    rows = [pd.DataFrame(pd.Series(data)).T for data in baidu_index(i, 'all')]
    Temp = pd.concat(rows) if rows else pd.DataFrame()
    Temp['keyword'] = [i] * Temp.shape[0]
    keyword_frames.append(Temp)

Df = pd.concat(keyword_frames)

print(Df)
Df.to_csv("e:/Df.csv", encoding="gbk")

# # All data of all requested keywords
# print(baidu_index.result)
# # All data of one keyword
# print(baidu_index.result['无限极'])
# # Mobile ('wise') data of one keyword
# print(baidu_index.result['无限极']['wise'])
#%%
from get_index import BaiduIndex
import pandas as pd

# Cell-style script: the __main__ guard below is commented out.
#if __name__ == "__main__":
keywords = ['比特币']
baidu_index = BaiduIndex(keywords, '2013-04-01', '2014-04-30')

# Keep only the records whose type is 'all'.
all_records = [rec for rec in baidu_index.get_index() if rec['type'] == 'all']

# Column-oriented layout: one list per field, in a fixed column order.
baidu_index_dict = {
    field: [rec[field] for rec in all_records]
    for field in ('keyword', 'type', 'date', 'index')
}

baidu_index_df = pd.DataFrame(baidu_index_dict)
# -*- coding: utf8 -*-
import codecs
import os
from datetime import datetime
from get_index import BaiduIndex

if __name__ == "__main__":
    # At most 5 keywords per request.

    # Show the code tables for cities and provinces:
    # print(BaiduIndex.city_code)
    # print(BaiduIndex.province_code)

    keyword = '人民币贬值'
    csv_path = "test.csv"
    baidu_index = BaiduIndex([keyword], '2009-01-01', '2019-03-31')

    # The UTF-8 BOM belongs at the very start of the file only, so it is
    # written just once, and only when the file is new or empty (the file is
    # opened in append mode and may hold rows from an earlier run).
    needs_bom = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

    # Open the file once for the whole export instead of once per row.
    with open(csv_path, 'ab') as f:
        if needs_bom:
            f.write(codecs.BOM_UTF8)
        for data in baidu_index(keyword, 'all'):
            # Keep Monday..Friday only (weekday(): Monday == 0, Sunday == 6).
            if datetime.strptime(data["date"], "%Y-%m-%d").weekday() >= 5:
                continue
            # Single-argument print() behaves the same on Python 2 and 3.
            print("%s , %s" % (data["date"], data["index"]))
            # NOTE(review): rows end with a bare '\r' as in the original;
            # confirm whether '\r\n' was intended.
            row = u'"{0}","{1}"\r'.format(data["date"], data["index"])
            f.write(row.encode("utf-8"))

    # The label used to be followed by a bare expression whose value was
    # discarded; print the data the label announces.
    print('获取1个关键词的全部数据')
    print(baidu_index.result[keyword])

    # Mobile ('wise') data of one keyword:
    # print(baidu_index.result['人民币贬值']['wise'])
    # pc data of one keyword:
    # print(baidu_index.result['人民币贬值']['pc'])
# NOTE(review): this header was the tail of a module docstring whose opening
# quotes are not present in the file; it is kept as comments so the module
# parses.
# @Author: Johnson
# @Email:[email protected]
# @File: demo.py
from get_index import BaiduIndex
import pandas as pd

if __name__ == "__main__":
    # At most 5 keywords per request.

    # Show the code tables for cities and provinces:
    # print(BaiduIndex.city_code)
    # print(BaiduIndex.province_code)

    query_words = ["无限极", "直销", "保健品", "权健", "华林酸碱平"]
    baidu_index = BaiduIndex(query_words, '2018-12-13', '2019-02-20', 0)

    # One single-row frame per 'all' record of the first keyword. Rows are
    # collected first and concatenated once, instead of re-copying the
    # growing frame on every record.
    rows = [pd.DataFrame(pd.Series(data)).T
            for data in baidu_index('无限极', 'all')]
    wuxianji_df = pd.concat(rows) if rows else pd.DataFrame()

    # wuxianji_df.index = range(len(wuxianji_df.shape))
    print(wuxianji_df)

    # # All data of all requested keywords
    # print(baidu_index.result)
    # # All data of one keyword
    # print(baidu_index.result['无限极'])
    # # Mobile ('wise') data of one keyword
from get_index import BaiduIndex

if __name__ == "__main__":
    # At most 5 keywords per request.

    # Show the code tables for cities and provinces.
    print(BaiduIndex.city_code)
    print(BaiduIndex.province_code)

    query_words = ['找工作', '失业', '裁员']
    baidu_index = BaiduIndex(query_words, '2011-01-01', '2016-10-01')

    # Iterate the 'all' records of a single keyword.
    for record in baidu_index('找工作', 'all'):
        print(record)

    # All data of all requested keywords:
    ## print(baidu_index.result)

    # All data of one keyword.
    result = baidu_index.result
    print(result['找工作'])

    # Mobile ('wise') data of one keyword:
    ## print(baidu_index.result['找工作']['wise'])
    # pc data of one keyword:
    ## print(baidu_index.result['找工作']['pc'])