def spiderMain(): """ # main主程序 :return: """ dler = Dler() dler.downLoad(100) DB = DBService(dbName='jddata', tableName='thirdPartShopInfo') DB.createTable( tableTitle=['productHref', 'companyName', 'shopName', 'shopHref', 'scoreSum', 'scoreProduct', 'scoreProductAvg', 'scoreService', 'scoreServiceAvg', 'scoreExpress', 'scoreExpressAvg', 'gradeHref']) while True: que = DBN.queueForDownLoad if not que.empty(): url, src = que.get() pPer = PPer(src) temp = pPer.pageParser() # proxy_test=temp[0] # if proxy_test=='-': # continue # else: # print(proxy_test) print(temp[0]) DB.data2DB(data=[url] + temp) else: time.sleep(1)
def push2DB():
    from ms_spider_fw.DBSerivce import DBService
    data = getKeyword()
    db = DBService('taobaodata', 'keyword')
    tableTitle = ['categoryFi', 'categorySe', 'categoryTi']
    db.createTable(tableTitle=tableTitle)
    db.data2DB(data=data)
def spiderMain():
    """Main entry point: download JD shop grade/score pages, parse them and store the results."""
    from ms_spider_fw.CSVService import CSV
    dler = Dler()
    dler.downLoad(100)
    DB = DBService(dbName="jddata", tableName="shop_grade_score")
    DB.createTable(
        tableTitle=[
            "gradeHref",
            "totalScore", "totalScore_avg",
            "productScore", "productScore_avg",
            "productQualityScore", "productQualityScore_avg",
            "productDescribeScore", "productDescribeScore_avg",
            "returnExchangeRate", "returnExchangeRate_avg",
            "serviceScore", "serviceScore_avg",
            "sellerCSI", "sellerCSI_avg",
            "distributionCSI", "distributionCSI_avg",
            "onlineServiceCSI", "onlineServiceCSI_avg",
            "returnExchangeCSI", "returnExchangeCSI_avg",
            "temporalityScore", "temporalityScore_avg",
            "expScore", "expScore_avg",
            "sendPromptnessScore", "sendPromptnessScore_avg",
            "returnExchangeTime", "returnExchangeTime_avg",
            "onLineSeriveTime", "onLineSeriveTime_avg",
            "spider_time",
        ])
    que = DBN.queueForDownLoad
    while True:
        url, src = que.get()
        try:
            pPer = PPer(src)
            result = pPer.pageParser()
            total = [url] + result
            DB.data2DB(data=total)
            print(result)
        except Exception:
            continue
def run(thread_count=1000):
    """Test proxies with `thread_count` threads and store the ones that passed."""
    run_test(thread_count)
    db_server_c = DBService(dbName=db_name, tableName='proxy_ok', **connect_dict)
    db_server_c.createTable(tableTitle=['proxy_port', 'test_time'], x='Y')
    res = []
    print '#' * 100
    print qu_proxy_ok.qsize()
    while qu_proxy_ok.qsize():
        res.append(qu_proxy_ok.get())
    db_server_c.data2DB(data=res)
def run(thread_count=20000):
    """Test proxies with `thread_count` threads and store working proxies with a timestamp."""
    muti_thread_test(thread_count)
    db_server_c = DBService(dbName=db_name, tableName='proxy_ok', **connect_dict)
    db_server_c.createTable(tableTitle=['proxy_port', 'test_time'], x='Y')
    res = []
    while qu_proxy_ok.qsize():
        res.append([
            qu_proxy_ok.get(),
            time.strftime('%Y-%m-%d %X', time.localtime())
        ])
    db_server_c.data2DB(data=res)
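# Hedged sketch, not part of the original source: both run() variants above assume a
# queue `qu_proxy_ok` that worker threads fill with proxies that answered a probe
# request. A minimal version of such a worker could look like the following, where
# `qu_proxy_raw`, the probe URL and the timeout are illustrative assumptions.
import threading
import Queue
import requests

qu_proxy_raw = Queue.Queue()  # candidate 'ip:port' strings (assumed input)
qu_proxy_ok = Queue.Queue()   # proxies that passed the probe

def _probe_worker(test_url='http://httpbin.org/ip', timeout=5):
    # Pop candidates until the input queue is drained; keep only reachable proxies.
    while not qu_proxy_raw.empty():
        proxy = qu_proxy_raw.get()
        try:
            requests.get(test_url, proxies={'http': 'http://' + proxy}, timeout=timeout)
            qu_proxy_ok.put(proxy)
        except Exception:
            continue

def muti_thread_test(thread_count):
    # Fan out `thread_count` probe workers and wait for all of them to finish.
    threads = [threading.Thread(target=_probe_worker) for _ in range(thread_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()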
def dataGen():
    """Join company info with product info on the shared key and write the merged rows to a new table."""
    comDict = companyInfo()
    proDict = productInfo()
    merged = {}
    for key in comDict:
        if key in proDict:
            merged[key] = comDict[key] + [proDict[key]]
    data = [item[1] for item in merged.items()]
    db1 = DBService(dbName='jddata', tableName='thirdPartShopInfo')
    title = db1.getTableTitle()
    title = title + ['commnetCount']
    print(title)
    db2 = DBService(dbName='jddata', tableName='thirdPartShopInfoAddtest')
    db2.createTable(tableTitle=title)
    db2.data2DB(data=data)
def spiderMain(): """ # main主程序 :return: """ dler = Dler() dler.downLoad(10) DB = DBService(#host='localhost', # user='******', # passwd='', # charset='utf8', # dbName='spider', dbName='alibaba', tableName='alibaba_cow_powder_3') DB.createTable(tableTitle= ['company_name', 'keyword', 'sale', 'href', 'member_id', 'offer_id', 'cxt_year', 'credit_detail_href', 'goods_from', 'product_title_sample', 'product_detail_sample', 'location', 'url_base']) while True: que = DBN.queueForDownLoad if not que.empty(): url, src = que.get() pPer = PPer(src) temp = pPer.pageParser() if temp: temp = map(lambda x: x + [url], temp) DB.data2DB(data=temp) print(u'++成功:%s'%url) else: print(u'--失败:%s'%url) else: time.sleep(1)
def spiderMain():
    """Main entry point: map each shop href to its appID and store the pairs."""
    dler = Dler()
    dler.downLoad(100)
    DB = DBService(dbName='jddata', tableName='thirdPartShopAppID')
    DB.createTable(tableTitle=['shopHref', 'appID'])
    while True:
        que = DBN.queueForDownLoad
        if not que.empty():
            url, src = que.get()
            pPer = PPer(src)
            temp = pPer.pageParser()
            print('=' * 30)
            print(url)
            print(temp)
            if temp:
                DB.data2DB(data=[url] + temp)
        else:
            time.sleep(1)
def get_parser(url, driver):
    """Open a contact page with Selenium, extract contact details and store them."""
    import random
    # Random pause (mean 5 s) to avoid hammering the site.
    time.sleep(abs(random.gauss(5, 5)))
    driver.get(url)
    print(driver.title)
    contacts_name = "-"
    contacts_sex = "-"
    contacts_job = "-"
    try:
        contacts_name = driver.find_element_by_css_selector(".contact-info .membername").text
        contacts_sex = driver.find_element_by_css_selector(".contact-info>dl>dd").text.split(" ")[1]
        contacts_job = driver.find_element_by_css_selector(".contact-info>dl>dd").text.split("(")[1]
        contacts_job = contacts_job.split(")")[0]
    except Exception:
        pass
    phone_frames = driver.find_elements_by_css_selector(".contcat-desc dl")
    cell_phone = "-"
    tel_phone = "-"
    fax_phone = "-"
    shop_addr = "-"
    for i in range(len(phone_frames)):
        selector = ".contcat-desc dl:nth-child(" + str(i + 1) + ")"
        text = driver.find_element_by_css_selector(selector + " dt").text.strip()
        if text == u"移动电话:":    # mobile phone
            cell_phone = driver.find_element_by_css_selector(selector + " dd").text
        elif text == u"电 话:":    # telephone
            tel_phone = driver.find_element_by_css_selector(selector + " dd").text
        elif text == u"传 真:":    # fax
            fax_phone = driver.find_element_by_css_selector(selector + " dd").text
        elif text == u"地 址:":    # address
            shop_addr = driver.find_element_by_css_selector(selector + " dd").text
    spider_time = time.strftime("%Y-%m-%d %X", time.localtime())
    result = [contacts_name, contacts_sex, contacts_job, cell_phone, tel_phone,
              fax_phone, shop_addr, spider_time, url]
    DB = DBService(dbName="alibaba", tableName="alibaba_cow_powder_phone")
    DB.data2DB(data=result)
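# Hedged usage sketch, not part of the original source: get_parser() expects an already
# constructed Selenium WebDriver plus a contact-page URL. The helper below and its URL
# list are illustrative assumptions; in the real spider the URLs presumably come from
# the database.
from selenium import webdriver

def crawl_contacts(urls):
    driver = webdriver.Firefox()
    try:
        for url in urls:
            get_parser(url, driver)
    finally:
        driver.quit()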
def getCategoryAndStartUrl():
    """Fetch the JD category tree, queue the leaf categories and save the whole tree to the DB."""
    import json
    global queue_for_url_targetBase
    queue_for_url_targetBase = Queue(0)
    src = myUrlOpen.requestByProxy('http://dc.3.cn/category/get?callback=getCategoryCallback')
    # Strip the JSONP wrapper ("getCategoryCallback(...)") before decoding.
    srcTemp = src.split('(', 1)[1][:-1]
    srcTemp = srcTemp.decode('gbk', 'ignore')
    srcJson = json.loads(srcTemp)['data']
    category = []
    for Fi in srcJson:
        targetFi = Fi['s']
        for Se in targetFi:
            targetSeTitle = Se['n']
            targetSe = Se['s']
            for Ti in targetSe:
                targetTiTitle = Ti['n']
                targetTi = Ti['s']
                for Fo in targetTi:
                    # Each 'n' field looks like "id|name"; keep both parts for every level.
                    targetFoTitle = Fo['n']
                    categoryTemp = [targetSeTitle.split('|')[1], targetSeTitle.split('|')[0],
                                    targetTiTitle.split('|')[1], targetTiTitle.split('|')[0],
                                    targetFoTitle.split('|')[1], targetFoTitle.split('|')[0]]
                    category.append(categoryTemp)
                    queue_for_url_targetBase.put((targetFoTitle.split('|')[1], targetFoTitle.split('|')[0]))
    db = DBService(dbName='jddata', tableName='jdkeyword')
    db.createTable(tableTitle=['category_fi_name', 'category_fi',
                               'category_se_name', 'category_se',
                               'category_ti_name', 'category_ti'])
    db.data2DB(data=category)
    # Alternative: insert row by row, skipping failures.
    # for item in category:
    #     try:
    #         db.data2DB(data=item)
    #     except Exception:
    #         continue
    # print('=' * 50)
    return category
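# Hedged illustration, not part of the original source: the endpoint above wraps its JSON
# in a getCategoryCallback(...) JSONP call, so getCategoryAndStartUrl() keeps everything
# after the first '(' and drops the trailing ')'. The sample payload below is made up.
import json

sample = 'getCategoryCallback({"data": [{"n": "1713-1716|books", "s": []}]})'
body = sample.split('(', 1)[1][:-1]      # -> '{"data": [{"n": "1713-1716|books", "s": []}]}'
print(json.loads(body)['data'][0]['n'])  # -> 1713-1716|books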
    return judgePageHref


def craweldhref():
    """Return the hrefs already crawled, normalised without a trailing slash."""
    db = DBService('elec_platform', 'yms_tmall_shopinfo_com_withoutjudge')
    href = db.getData(var='href')
    href = [item[0] for item in href]
    F = lambda x: x[:-1] if x[-1] == '/' else x
    href = map(F, href)
    print(len(href))
    return href


def href():
    """Return shops that have not been crawled yet, together with their judge-page URLs."""
    temp1 = commentHrefList()
    temp2 = craweldhref()
    temp2 = set(temp2)
    temp3 = []
    for item in temp1:
        if not item[1] in temp2:
            temp3.append(list(item))
    temp3 = [[item[0], item[1] + '/', 'http://rate.taobao.com/user-rate-' + item[2] + '.htm']
             for item in temp3]
    return temp3


temp = href()
db = DBService('elec_platform', 'yms_tmall_shopinfo_com_withoutjudge')
db.data2DB(data=temp, tableTitle=['name', 'href', 'judgepage_href'])
def putDataIntoDB(path):
    """Read data from `path` and write it to the tmall_baseinfo_weekly_2016 table."""
    data = getData(path=path)
    dbs = DBService(dbName='elec_platform', tableName='tmall_baseinfo_weekly_2016')
    dbs.data2DB(data=data)
        map(lambda x: end_day - timedelta(days=x), range(1, days + 1)))
    _start_time.reverse()
    _end_time.reverse()
    time_step = zip(_start_time, _end_time)
    t_k = target_keyword(industry)
    for item in time_step:
        start, end = item
        print start
        for k_w in t_k:
            try:
                print k_w[1]
                api = weibo_api(start_time=start, end_time=end, key_word=k_w[0] + ' ' + industry)
                response = requests.get(url=api, headers=headers)
                db_server.data2DB(data=[add_info(response.content, industry, k_w[1]),
                                        time.strftime('%Y-%m-%d %X', time.localtime())])
                page_total = json.loads(response.content).get('total_number')
                if not page_total:
                    continue
                # total_number / 10 pages of results; cap the extra requests at pages 2..100.
                for i in range(2, 101 if page_total / 10 > 101 else page_total / 10):
                    try:
                        api_t = weibo_api(start_time=start, end_time=end, page=i,
                                          key_word=k_w[0] + ' ' + industry)
                        response_t = requests.get(url=api_t, headers=headers)
                        db_server.data2DB(data=[add_info(response_t.content, industry, k_w[1]),
                                                time.strftime('%Y-%m-%d %X', time.localtime())])
                        print 'is the ' + str(i) + ' request successful.'
                    except Exception, e:
                        print e.message
                        continue
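# Hedged worked example, not part of the original source, showing what the paging window
# above evaluates to: with total_number = 2370 the loop requests pages 2..100 (the cap),
# and with total_number = 57 it requests pages 2..4 (Python 2 integer division).
page_total = 2370
print(range(2, 101 if page_total / 10 > 101 else page_total / 10))  # [2, 3, ..., 100]
page_total = 57
print(range(2, 101 if page_total / 10 > 101 else page_total / 10))  # [2, 3, 4]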
it.get("userProvince"), it.get("userRegisterTime"), it.get("viewCount"), it.get("orderId"), it.get("isReplyGrade"), it.get("nickname"), it.get("userClient"), it.get("productColor"), it.get("productSize"), it.get("integral"), it.get("anonymousFlag"), it.get("userLevelName"), it.get("recommend"), re.sub(re_sub_p, '', it.get("userClientShow")), it.get("isMobile"), st[2], st[3], st[0], st[1], it.get("days"), u'手机'] res.append(t) try: db_server.data2DB(data=t) except Exception,e: print e.message if __name__ == "__main__": for item in data: extract_info(item)