def push2DB():
    """Fetch keyword category rows and persist them into taobaodata.keyword."""
    from ms_spider_fw.DBSerivce import DBService
    keyword_rows = getKeyword()
    db = DBService('taobaodata', 'keyword')
    db.createTable(tableTitle=['categoryFi', 'categorySe', 'categoryTi'])
    db.data2DB(data=keyword_rows)
def spiderMain():
    """Main entry: download pages and store parsed third-party shop info.

    Runs forever: pulls (url, page source) pairs off the shared download
    queue, parses each page, and appends a row to jddata.thirdPartShopInfo.
    Sleeps one second whenever the queue is empty.
    """
    downloader = Dler()
    downloader.downLoad(100)
    db = DBService(dbName='jddata', tableName='thirdPartShopInfo')
    db.createTable(
        tableTitle=['productHref', 'companyName', 'shopName', 'shopHref',
                    'scoreSum', 'scoreProduct', 'scoreProductAvg',
                    'scoreService', 'scoreServiceAvg', 'scoreExpress',
                    'scoreExpressAvg', 'gradeHref'])
    while True:
        pending = DBN.queueForDownLoad
        if pending.empty():
            time.sleep(1)
            continue
        url, page_src = pending.get()
        parsed = PPer(page_src).pageParser()
        print(parsed[0])
        db.data2DB(data=[url] + parsed)
def spiderMain():
    """Main entry: download shop-grade pages and store parsed score rows.

    Pulls (url, page source) pairs from the shared download queue, parses
    each page into the score columns below, and appends [url] + result to
    jddata.shop_grade_score. Runs forever; queue.get() blocks when empty.
    """
    from ms_spider_fw.CSVService import CSV  # kept: imported here in the original
    dler = Dler()
    dler.downLoad(100)
    DB = DBService(dbName="jddata", tableName="shop_grade_score")
    DB.createTable(
        tableTitle=[
            "gradeHref",
            "totalScore", "totalScore_avg",
            "productScore", "productScore_avg",
            "productQualityScore", "productQualityScore_avg",
            "productDescribeScore", "productDescribeScore_avg",
            "returnExchangeRate", "returnExchangeRate_avg",
            "serviceScore", "serviceScore_avg",
            "sellerCSI", "sellerCSI_avg",
            "distributionCSI", "distributionCSI_avg",
            "onlineServiceCSI", "onlineServiceCSI_avg",
            "returnExchangeCSI", "returnExchangeCSI_avg",
            "temporalityScore", "temporalityScore_avg",
            "expScore", "expScore_avg",
            "sendPromptnessScore", "sendPromptnessScore_avg",
            "returnExchangeTime", "returnExchangeTime_avg",
            "onLineSeriveTime", "onLineSeriveTime_avg",
            "spider_time",
        ]
    )
    que = DBN.queueForDownLoad
    while True:
        url, src = que.get()
        try:
            pPer = PPer(src)
            result = pPer.pageParser()
            total = [url] + result
            DB.data2DB(data=total)
            print(result)
        except Exception as e:
            # Was a bare `except: continue`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid every failure silently.
            # Narrowed to Exception and logged so bad pages are visible.
            print('parse/store failed for %s: %s' % (url, e))
            continue
def run(thread_count=1000):
    """Test proxies with `thread_count` workers and store the working ones.

    Drains the qu_proxy_ok queue into a list and bulk-inserts it into the
    proxy_ok table (columns: proxy_port, test_time).
    """
    run_test(thread_count)
    db_server_c = DBService(dbName=db_name, tableName='proxy_ok', **connect_dict)
    db_server_c.createTable(tableTitle=['proxy_port', 'test_time'], x='Y')
    res = []
    # Fixed Python-2-only `print expr` statements to the call form used by
    # the rest of this file; the parenthesized form is valid in both 2 and 3.
    print('#' * 100)
    print(qu_proxy_ok.qsize())
    while qu_proxy_ok.qsize():
        res.append(qu_proxy_ok.get())
    db_server_c.data2DB(data=res)
def run(thread_count=20000):
    """Run the multi-threaded proxy test and persist working proxies.

    Each stored row is [proxy, timestamp]; the timestamp is taken when the
    proxy is drained from the result queue.
    """
    muti_thread_test(thread_count)
    db_server_c = DBService(dbName=db_name, tableName='proxy_ok', **connect_dict)
    db_server_c.createTable(tableTitle=['proxy_port', 'test_time'], x='Y')
    rows = []
    while qu_proxy_ok.qsize():
        stamp = time.strftime('%Y-%m-%d %X', time.localtime())
        rows.append([qu_proxy_ok.get(), stamp])
    db_server_c.data2DB(data=rows)
def dataGen():
    """Join company and product info and store the widened rows.

    Keeps only companies that also appear in productInfo(), appends the
    product value as one extra column, and writes the result to
    jddata.thirdPartShopInfoAddtest using the source table's title plus a
    comment-count column.
    """
    com_info = companyInfo()
    pro_info = productInfo()
    # Dict comprehension replaces the manual loop; also stops shadowing the
    # builtin `dict`, which the original used as a variable name.
    merged = {key: value + [pro_info[key]]
              for key, value in com_info.items() if key in pro_info}
    data = list(merged.values())
    db1 = DBService(dbName='jddata', tableName='thirdPartShopInfo')
    title = db1.getTableTitle()
    # NOTE(review): 'commnetCount' is misspelled but kept verbatim — it is an
    # existing DB column name; renaming it would break the stored table.
    title = title + ['commnetCount']
    print(title)
    db2 = DBService(dbName='jddata', tableName='thirdPartShopInfoAddtest')
    db2.createTable(tableTitle=title)
    db2.data2DB(data=data)
def spiderMain():
    """Main entry: crawl Alibaba cow-powder listing pages and store rows.

    Runs forever: pulls (url, page source) pairs off the shared download
    queue, parses each page into listing rows, appends the source url to
    every row, and bulk-inserts them. Sleeps one second when idle.
    """
    downloader = Dler()
    downloader.downLoad(10)
    db = DBService(dbName='alibaba', tableName='alibaba_cow_powder_3')
    db.createTable(tableTitle=[
        'company_name', 'keyword', 'sale', 'href', 'member_id', 'offer_id',
        'cxt_year', 'credit_detail_href', 'goods_from',
        'product_title_sample', 'product_detail_sample', 'location',
        'url_base'])
    while True:
        pending = DBN.queueForDownLoad
        if pending.empty():
            time.sleep(1)
            continue
        url, page_src = pending.get()
        rows = PPer(page_src).pageParser()
        if rows:
            rows = map(lambda row: row + [url], rows)
            db.data2DB(data=rows)
            print(u'++成功:%s' % url)
        else:
            print(u'--失败:%s' % url)
def spiderMain():
    """Main entry: crawl search pages and print parsed items.

    The DB write is intentionally disabled in this variant — results are
    only printed for inspection; the table is still created up front.
    """
    downloader = Dler()
    downloader.downLoad(100)
    db = DBService(dbName="jddata", tableName="thirdPartShopSearchPage")
    db.createTable(tableTitle=["tttt"])
    while True:
        pending = DBN.queueForDownLoad
        if pending.empty():
            time.sleep(1)
            continue
        url, page_src = pending.get()
        items = PPer(page_src).pageParser()
        print("=" * 30)
        print(url)
        for entry in items:
            print(entry)
        # DB write disabled in the original:
        # db.data2DB(data=[url] + items)
def spiderMain():
    """Main entry: crawl shop pages and store (shopHref, appID) rows.

    Runs forever: takes (url, page source) pairs from the shared download
    queue, parses out the app ID, and inserts [url] + parsed when parsing
    succeeded. Sleeps one second when the queue is empty.
    """
    downloader = Dler()
    downloader.downLoad(100)
    db = DBService(dbName='jddata', tableName='thirdPartShopAppID')
    db.createTable(tableTitle=['shopHref', 'appID'])
    while True:
        pending = DBN.queueForDownLoad
        if pending.empty():
            time.sleep(1)
            continue
        url, page_src = pending.get()
        parsed = PPer(page_src).pageParser()
        print('=' * 30)
        print(url)
        print(parsed)
        if parsed:
            db.data2DB(data=[url] + parsed)
def getCategoryAndStartUrl():
    """Fetch JD's category tree, store it, and seed the start-URL queue.

    Side effects: (re)binds the global queue_for_url_targetBase and fills it
    with (name, id) pairs for the deepest category level; writes all category
    rows to jddata.jdkeyword. Returns the flattened category list.
    """
    import json
    global queue_for_url_targetBase
    queue_for_url_targetBase = Queue(0)
    raw = myUrlOpen.requestByProxy('http://dc.3.cn/category/get?callback=getCategoryCallback')
    # Strip the JSONP wrapper: keep everything after the first '(' and drop
    # the trailing ')'.
    payload = raw.split('(', 1)[1][:-1]
    payload = payload.decode('gbk', 'ignore')  # NOTE(review): bytes.decode — Python 2 str assumed; verify
    tree = json.loads(payload)['data']
    category = []
    for level1 in tree:
        for level2 in level1['s']:
            se_title = level2['n']
            for level3 in level2['s']:
                ti_title = level3['n']
                for level4 in level3['s']:
                    fo_title = level4['n']
                    # Each 'n' field is "id|name"; store name before id.
                    category.append([
                        se_title.split('|')[1], se_title.split('|')[0],
                        ti_title.split('|')[1], ti_title.split('|')[0],
                        fo_title.split('|')[1], fo_title.split('|')[0],
                    ])
                    queue_for_url_targetBase.put(
                        (fo_title.split('|')[1], fo_title.split('|')[0]))
    db = DBService(dbName='jddata', tableName='jdkeyword')
    db.createTable(tableTitle=['category_fi_name', 'category_fi',
                               'category_se_name', 'category_se',
                               'category_ti_name', 'category_ti'])
    db.data2DB(data=category)
    return category
import Queue  # Python 2 stdlib module; renamed `queue` in Python 3

# Unbounded queue of JSON payloads awaiting processing.
json_file_queue = Queue.Queue(0)

# Direct connection to the platform_data MySQL instance.
# NOTE(review): credentials are hard-coded here — consider moving to config.
connect_jd = pymysql.connect(
    host='10.118.187.12',
    user='******',
    passwd='admin',
    database='platform_data'
)
# Shared connection kwargs for DBService instances.
connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'}

# Destination table for the merged JD rows; titles come from the
# comma-separated string below, stripped of surrounding whitespace.
dbs = DBService(dbName='platform_data', tableName='jd_data_temp_0326', **connect_dict)
dbs.createTable(
    tableTitle=
    map(lambda x: x.strip(),
        'shop_name, addr, com_name, shop_href, cate_0, score_summary, '
        'express_score, product_score, service_score,product_href, vender_id, '
        'sku_id, size_count'.split(','))
)


def get_min_max_id():
    """Return (min_id, max_id) of jd_product_detail via two aggregate queries."""
    sql_min = 'SELECT MIN(id) FROM jd_product_detail'
    sql_max = 'SELECT MAX(id) FROM jd_product_detail'
    cur = connect_jd.cursor()
    cur.execute(sql_min)
    min_id = cur.fetchall()
    cur.execute(sql_max)
    max_id = cur.fetchall()
    cur.close()
    # fetchall() returns a tuple of rows; each aggregate yields one row
    # with one column.
    return min_id[0][0], max_id[0][0]
# config_text db_name = 'platform_data' table_name = 'suning' table_title = 'product_url,catalogue,sub_catalogue,product_title,promotion_desc,origin_price,price,' \ 'product_stars,comment_count,sending_service,other_service,product_params,shop_name,' \ 'shop_href,product_rating,product_rating_avg,serice_rating,service_rating_avg,express_rating,' \ 'express_rating_avg,com_name_tel,crawl_time' url_start = 'http://www.suning.com/emall/pgv_10052_10051_1_.html' # start url for crawl,string connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'} # script db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict) if not db_server.isTableExist(): db_server.createTable(tableTitle=table_title.split(',')) class Handler(BaseHandler): crawl_config = { } @every(minutes=24 * 60) def on_start(self): self.crawl(url_start, callback=self.step_first) @config(age=2 * 24 * 60 * 60) def step_first(self, response): d = response.doc for t in d('.listLeft>dl>dd>span>a').items(): self.crawl(t.attr.href, callback=self.step_second)
# Target DB and the two 58city second-hand-housing tables
# (base info and per-listing detail).
db_name = 'address'
table_name_1 = 'ershoufang_58city_baseinfo'
table_name_2 = 'ershoufang_58city_detail'
table_title_1 = 'detail,crawl_time'
table_title_2 = 'url,detail,crawl_time'
url_start = 'http://www.58.com/ershoufang/changecity/'
# connect string , usually no need to modify
connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'}
db_server_1 = DBService(dbName=db_name, tableName=table_name_1, **connect_dict)
db_server_2 = DBService(dbName=db_name, tableName=table_name_2, **connect_dict)
# if create table for store result in mysql , no need to be changed
if not db_server_1.isTableExist():
    db_server_1.createTable(tableTitle=table_title_1.split(','))
if not db_server_2.isTableExist():
    db_server_2.createTable(tableTitle=table_title_2.split(','))

# Precompiled patterns: digit runs, whitespace squeeze, and the inline JS
# comment array ("var arr=...;") embedded in listing pages.
pat_num = re.compile('\d+')
pat_replace_space = re.compile('\s+?')
pat_comment = re.compile('var arr=(.+?)\;')


class Handler(BaseHandler):
    # pyspider handler for the 58city crawl.
    crawl_config = {
        # 'proxy': '10.10.10.10:80',
        'headers': {
            'User-Agent': 'User-Agent:Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) '
                          'Gecko/20100101 Firefox/4.0.1'
# NOTE(review): chunk ends mid-dict — crawl_config and the rest of the class
# continue beyond this view; nothing was removed here.
""" from pyspider.libs.base_handler import * from ms_spider_fw.DBSerivce import DBService import time db_server = DBService(dbName='b2c_base', tableName='b2c_website_list_meidebi', host='10.118.187.12', user='******', passwd='admin', charset='utf8') # create table for store result in mysql db_server.createTable(tableTitle=[ 'name', 'summary', 'url', 'evaluation_num', 'total_score', 'quality_score', 'express_service_score', 'customer_service_score', 'crawl_time' ]) class Handler(BaseHandler): crawl_config = { } @every(minutes=30 * 24 * 60) def on_start(self): self.crawl('http://www.meidebi.com/company/', callback=self.step_first)
"productsize", "integral", "anonymousflag", "userlevelname", "recommend", "userclientshow", "ismobile", "negwords", "negwordsnum", "goodwords", "goodwordsnum", "days", "industry" ] if not db_server.isTableExist(): db_server.createTable(tableTitle=table_title) # re_sub_p = re.compile('<.+?>') re_sub_p = re.compile(u'回复|#.+?#|@.+?[\s::]|\[.+?\]|@.+$|\s+?') res = list() def extract_info(x): try: d_t = json.loads(x) d = d_t['comments'] except Exception, e: # raise ValueError('No comments exists!') return None if isinstance(d, list):
if __name__ == "__main__":
    # Column layout for the Alibaba contact-info result table.
    phone_table_columns = [
        "contacts_name", "contacts_sex", "contacts_job", "cell_phone",
        "tel_phone", "fax_phone", "shop_addr", "spider_time", "url",
    ]
    # `DB` stays module-level: temp_main() may read it as a global.
    DB = DBService(dbName="alibaba", tableName="alibaba_cow_powder_phone")
    DB.createTable(tableTitle=phone_table_columns)
    temp_main()