def getWaitCrawler(self):
    """Return how many league rows are still waiting to be crawled."""
    db = Mysql()
    row = db.getOne(ConfigStart.SELECTCOUNTFROMLEAGUECRAWLER)
    db.dispose()
    return row[ConfigStart.RESULT]
def startLeagueMain(self, mutex): #获取数据库资源 mysql = Mysql() p_id, p_name, p_type, p_country, p_main_url = 0, '', 0, '', '' if mutex.acquire(): sqlString = ConfigStart.SELECTFROMLEAGUECRAWLER result = mysql.getOne(sqlString) if isinstance(result, bool): print "分析完毕" mutex.release() mysql.dispose() return pass else: mysql.update(ConfigStart.UPDATELEAGUESETCRAWLER, result[League.p_id]) print result[League.p_name] p_id,p_name,p_type,p_country,p_main_url=result[League.p_id],\ result[League.p_name],result[League.p_type],result[League.p_country],result[League.p_main_url] mutex.release() pass #获取每个联赛现存所有赛季并得到对应的url print "正在获取联赛___(%s)___的所有赛季信息获取的url为:%s " % (p_name, p_main_url) webfile = urllib.urlopen(p_main_url) webContent = webfile.read() webfile.close() webContent = unicode(webContent, ConfigStart.GBK) soup = BeautifulSoup(webContent, ConfigStart.PARSEMETHOD) leagueYears = soup.find_all(class_=ConfigStart.DROPLISTCLASS) for leagueYear in leagueYears[0].children: if (type(leagueYear) == bs4.element.Tag): print leagueYear.a[ConfigStart.STRINGTITLE] print leagueYear.a[ConfigStart.HREF] #print re.findall(r'\b\d+\b',leagueYear.a['title']) l = [[ p_id, ConfigStart.STARTURL + leagueYear.a[ConfigStart.HREF], leagueYear.a[ConfigStart.STRINGTITLE], leagueYear.a.string ]] sqlInsert = ConfigStart.INSERTINTOLEAGUEYEARINFO result = mysql.insertMany(sqlInsert, l) print result pass mysql.dispose()
def getMatchUrl(self, limit): mysql = Mysql() sqlAll = ConfigStart.SELECTFROMLEAGUEYEARINFOLIMIT resultSelect = mysql.getAll(sqlAll, limit) if resultSelect == False: return for resultChild in resultSelect: webfile = urllib.urlopen(resultChild[LeagueYearInfo.p_jifen_url]) webcontext = webfile.read() webfile.close() webContent = unicode(webcontext, ConfigStart.GBK) soup = BeautifulSoup(webContent, ConfigStart.PARSEMETHOD) getUrls = soup.find_all(class_="ltab_btn") sqlInsert = ConfigStart.INSERTINTOMATCHURL_TOP l = [] i = 0 print resultChild['p_jifen_url'] for getUrl in getUrls: if getUrl.string.find("赛制") >= 0: # resultOne=mysql.getOne("select * from institution where p_matchid=%s",resultChild['p_leagueid']) # if resultOne == False: # t=[] # t.append(resultChild['p_leagueid']) # mysql.insertOne("INSERT INTO institution(p_matchid,p_institution) VALUES (%s,%s) ",t) # pass break if i == 0: sqlInsert += "(%s,%s,%s)" i = 1 pass else: sqlInsert += ",(%s,%s,%s)" pass l.append(resultChild['p_id']) l.append(ConfigStart.STARTURL + getUrl['href']) l.append(getUrl.string) pass if l.__len__() == 0: continue resultInsert = mysql.update(sqlInsert, l) print resultInsert pass pass
def getData(self): # url ='http://www.kuaidaili.com/free/inha/%s/' mysql = Mysql() s = 1 while s <= 50: url = 'http://www.kuaidaili.com/free/inha/%s/' % (s) insertSql = "insert into proxyip(address,port) values" request = urllib2.Request(url) request.add_header( "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s.%s Safari/537.36" % (random.uniform(1, 1000), random.uniform(0, 8000))) request.add_header( "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" ) request.add_header("Accept-Encoding", "gzip, deflate, sdch") webfile = urllib2.urlopen(request) webcontext = webfile.read() webcontext = gzip.GzipFile(fileobj=StringIO.StringIO(webcontext), mode="r") webcontext = webcontext.read().decode('utf8') soup = BeautifulSoup(webcontext, "html.parser") resultTr = soup.find_all('tr') l = [] i = 0 for trChild in resultTr: print trChild res = trChild.find_all('td') if res.__len__() > 0: if i == 0: insertSql += '(%s,%s)' else: insertSql += ',(%s,%s)' i += 1 else: continue l.append(res[0].string) l.append(res[1].string) print webcontext print mysql.update(insertSql, l) s += 1 time.sleep(1) pass
def getJifen(self, limit): mysql = Mysql() sqlAll = ConfigStart.SELECTFROMLEAGUEYEARINFOLIMIT resultSelect = mysql.getAll(sqlAll, limit) l = [] i = 0 if resultSelect == False: return sqlInsert = ConfigStart.UPDATELEAGUEYEARINFO_TOP for resultChild in resultSelect: webfile = urllib.urlopen(resultChild[LeagueYearInfo.p_league_url]) webcontext = webfile.read() webfile.close() webContent = unicode(webcontext, ConfigStart.GBK) soup = BeautifulSoup(webContent, ConfigStart.PARSEMETHOD) jifenUrl = soup.find_all(href=re.compile(ConfigStart.COMPILEJIFEN)) for getJifen in jifenUrl: if (getJifen.string == ConfigStart.STRINGJIFEN): print getJifen[ConfigStart.HREF] l.append(resultChild[LeagueYearInfo.p_id]) l.append(ConfigStart.STARTURL + getJifen[ConfigStart.HREF]) if i == ConfigStart.FALSE: sqlInsert += "(%s,%s)" i = 1 else: sqlInsert += ",(%s,%s)" pass sqlInsert += ConfigStart.UPDATELEAGUEYEARINFO_BOTTOM print sqlInsert result = mysql.update(sqlInsert, l) print result pass mysql.dispose()
#!/usr/bin/env python # -*- coding:utf8 -*- import urllib2 import time from MySqlDB.MySqlConn import Mysql from Service.ShowCharType import * import sys reload(sys) sys.setdefaultencoding("utf-8") mysql = Mysql() req_header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', #'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'en-us', 'Connection': 'keep-alive', 'Referer': 'http://www.baidu.com/' } req_timeout = 3 testUrl = "http://www.baidu.com/" # url = "" # req = urllib2.Request(url,None,req_header) # jsondatas = urllib2.urlopen(req,None,req_timeout).read() cookies = urllib2.HTTPCookieProcessor() checked_num = 0 grasp_num = 0 req = urllib2.Request('http://api.xicidaili.com/free2016.txt', None,
def getMatchInfo(self, limit): mysql = Mysql() sqlAll = "select * from matchurl WHERE p_use=0 limit %s,10" resultSelect = mysql.getAll(sqlAll, limit) if resultSelect == False: return for resultChild in resultSelect: webfile = urllib.urlopen(resultChild['p_url']) webcontext = webfile.read() webfile.close() webContent = unicode(webcontext, 'gbk') soup = BeautifulSoup(webContent, ConfigStart.PARSEMETHOD) listInfo = soup.find_all(id='div_group_list') rounds = [] #当前第几回合或第几组 stid = resultChild['p_url'].split("jifen-")[1].split("/")[0] c = 'score' a = 'getmatch' if (listInfo.__len__() > 0): for listChild in listInfo[0].children: if (type(listChild) == bs4.element.Tag): if (listChild['data-group'] != 'all'): rounds.append(listChild['data-group']) pass pass pass pass listInfo = soup.find_all(id='match_group') if (listInfo.__len__() > 0): for listChild in listInfo[0].children: if (type(listChild) == bs4.element.Tag): if (listChild.a['data-group'] != 'all'): rounds.append(listChild.a['data-group']) pass pass pass pass #lmb3 listInfo = soup.find_all(class_='lmb3') asc = 0 for listC in listInfo: asc += 1 rounds.append(asc) pass pass urlInfo = "http://liansai.500.com/index.php?" 
if (rounds.__len__() == 0): insertContext = [] sqlInsert = "INSERT INTO `matchinfo` (`p_leagueid`, `fid`, `ghalfscore`, `gid`, `gname`, `gscore`, `gstanding`, `gsxname`, `handline`, `hhalfscore`, `hid`, `hname`, `hscore`, `hstanding`, `hsxname`, `round`, `status`, `stime`) VALUES " urlInfo += "c=" + c urlInfo += "&a=" + a urlInfo += "&stid=" + stid jsonContext = urllib.urlopen(urlInfo) jsonData = jsonContext.read() jsonContext.close() jsonData = unicode(jsonData, 'gbk') jsonData = json.loads(jsonData) index = 0 for jsonDataChild in jsonData: print jsonDataChild if index == 0: sqlInsert += "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" index = 1 else: sqlInsert += ",(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" pass insertContext.append(resultChild['pid']) insertContext.append(jsonDataChild['fid']) insertContext.append(jsonDataChild['ghalfscore']) insertContext.append(jsonDataChild['gid']) insertContext.append(jsonDataChild['gname']) insertContext.append(jsonDataChild['gscore']) insertContext.append(jsonDataChild['gstanding']) insertContext.append(jsonDataChild['gsxname']) insertContext.append(jsonDataChild['handline']) insertContext.append(jsonDataChild['hhalfscore']) insertContext.append(jsonDataChild['hid']) insertContext.append(jsonDataChild['hname']) insertContext.append(jsonDataChild['hscore']) insertContext.append(jsonDataChild['hstanding']) insertContext.append(jsonDataChild['hsxname']) insertContext.append(jsonDataChild['round']) insertContext.append(jsonDataChild['status']) insertContext.append(jsonDataChild['stime']) pass if index == 0: continue resInfo = mysql.update(sqlInsert, insertContext) print resInfo else: for roundChild in rounds: insertContext = [] sqlInsert = "INSERT INTO `matchinfo` (`p_leagueid`, `fid`, `ghalfscore`, `gid`, `gname`, `gscore`, `gstanding`, `gsxname`, `handline`, `hhalfscore`, `hid`, `hname`, `hscore`, `hstanding`, `hsxname`, `round`, `status`, `stime`) VALUES " urlInfo += 
"c=" + c urlInfo += "&a=" + a urlInfo += "&stid=" + stid urlInfo += "&round=" + str(roundChild) jsonContext = urllib.urlopen(urlInfo) jsonData = jsonContext.read() jsonContext.close() jsonData = unicode(jsonData, 'gbk') jsonData = json.loads(jsonData) index = 0 for jsonDataChild in jsonData: print jsonDataChild if index == 0: sqlInsert += "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" index = 1 else: sqlInsert += ",(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" pass insertContext.append(resultChild['pid']) insertContext.append(jsonDataChild['fid']) insertContext.append(jsonDataChild['ghalfscore']) insertContext.append(jsonDataChild['gid']) insertContext.append(jsonDataChild['gname']) insertContext.append(jsonDataChild['gscore']) insertContext.append(jsonDataChild['gstanding']) insertContext.append(jsonDataChild['gsxname']) insertContext.append(jsonDataChild['handline']) insertContext.append(jsonDataChild['hhalfscore']) insertContext.append(jsonDataChild['hid']) insertContext.append(jsonDataChild['hname']) insertContext.append(jsonDataChild['hscore']) insertContext.append(jsonDataChild['hstanding']) insertContext.append(jsonDataChild['hsxname']) insertContext.append(jsonDataChild['round']) insertContext.append(jsonDataChild['status']) insertContext.append(jsonDataChild['stime']) pass if index == 0: continue resInfo = mysql.update(sqlInsert, insertContext) print resInfo pass pass #INSERT INTO `matchinfo` (`p_leagueid`, `fid`, `ghalfscore`, `gid`, `gname`, `gscore`, `gstanding`, `gsxname`, `handline`, `hhalfscore`, `hid`, `hname`, `hscore`, `hstanding`, `hsxname`, `round`, `status`, `stime`) VALUES ('1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '2017-04-21 17:24:58') #设置p_use已经抓取标志 useSql = "update matchurl set p_use=1 where pid = %s" mysql.update(useSql, resultChild['pid']) print "matchurl更新成功" pass mysql.dispose()
主程序启动开始 1.先加载配置文件 ''' from Service.MainToSecond import * from Service.LeagueMain import * from Service.SecondToJifen import * from Service.JifenToMatchUrl import * from Service.AnalysisDataAll import * from MySqlDB.MySqlConn import Mysql import threading import threadPoolBase import ConfigStart mutex = threading.Lock() if __name__ == ConfigStart.MAIN: mysql = Mysql() pool = threadPoolBase.ThreadPool(ConfigStart.THREADCOUNT) if (MainToSecond.getSecondUrl() == ConfigStart.TRUE): # leagueMain=LeagueMain() # while leagueMain.getWaitCrawler()>ConfigStart.NULL: # func_var = [] # for i in range(ConfigStart.THREADCOUNT): # #只使用一个数据库连接时会出现mysql崩溃 # func_var.append(([mutex],None)) # pass # requests = threadPoolBase.makeRequests(leagueMain.startLeagueMain, func_var) # [pool.putRequest(req) for req in requests] # pool.wait() # pass # # sqlAll = ConfigStart.LEAGUEYEARINFO_COUNT
def getDataFromMatchInfo(self, limit):
    """Crawl the European-odds (oupei) history for a batch of matches.

    For each match row returned by SELECTFROMMATCHINFOLIMIT (paging offset
    ``limit``): record the fid in the `log` table once, delete any
    previously stored analysis rows for that match, then page through the
    odds company listing 30 entries at a time and insert one `oupei` row
    per (company, odds-change) pair, joining the odds feed with the Kelly
    feed by position.

    :param limit: offset parameter for the paging SELECT.
    """
    mysql = Mysql()
    sqlAll = ConfigStart.SELECTFROMMATCHINFOLIMIT
    resultSelect = mysql.getAll(sqlAll, limit)
    if resultSelect == False:
        #print "没有要查找的数据"
        return
    # Log each fid once, then refresh all of its derived data.
    for resultChild in resultSelect:
        fid = resultChild['fid']
        selectLogSql = "select count(*) as result from log where fid =%s "
        selRes = mysql.getOne(selectLogSql, fid)
        if selRes['result'] == 0:
            logSql = "insert into log(fid) values(%s)"
            mysql.update(logSql, fid)
            mysql.end()
        #print fid
        # Wipe every analysis table for this match before re-crawling,
        # so a re-run never duplicates rows.
        deleteSqls = [
            "DELETE FROM yazhi WHERE matchinfoid=%s ",
            " DELETE FROM oupei WHERE matchinfoid=%s ",
            " DELETE FROM rangqiu WHERE matchinfoid=%s ",
            " DELETE FROM daxiao WHERE matchinfoid=%s ",
            " DELETE FROM befen WHERE matchinfoid=%s ",
            " DELETE FROM jinqiu WHERE matchinfoid=%s ",
            " DELETE FROM dsjinqiu WHERE matchinfoid=%s ",
            " DELETE FROM bqc WHERE matchinfoid=%s ",
            " DELETE FROM teamstatistics WHERE matchinfoid=%s ",
            " DELETE FROM playerstatistics WHERE matchinfoid=%s"
        ]
        for deleteSqlsChild in deleteSqls:
            mysql.delete(deleteSqlsChild, fid)
            mysql.end()
        #print "清理数据成功"
        i = 0
        ''' =====================================欧赔开始================================================ '''
        # Page through the company list 30 at a time. count_cursor tracks
        # the absolute index of the last company seen; when a page returns
        # fewer than 30 entries (cursor != i*30 at loop top) we are done.
        count_cursor = 0
        while True:
            if count_cursor != i * 30:
                break
            url = ConfigStart.ANALYSISOUZHIURL % (fid, i * 30)
            #print "=============================================%s==================================="%url
            openUrls = OpenUrls()
            webcontext = openUrls.getWebContent(url, mysql, i, 1)
            # if webcontext.find('500.com')==-1 and webcontext!='':
            #     #print "查看webcontext:%s"%webcontext
            #     continue
            #     pass
            # else:
            #     if webcontext =='':
            #         break
            soup = BeautifulSoup(webcontext, "html.parser")
            # ttl='zy' marks one table row per odds company.
            ouzhiData1 = soup.find_all(ttl='zy')
            if ouzhiData1.__len__() == 0:
                #print '获取完毕' -- no more companies on this page
                break
            j = 0
            for ouzhiDataChild in ouzhiData1:
                #print "------------------------%s------------------------" % (i * 30 + j+1)
                count_cursor = i * 30 + j + 1
                #print ouzhiDataChild['id']
                insertSql = "INSERT INTO `oupei` (`matchinfoid`, `companyid`, `op_s`, `op_p`, `op_f`, `ret`, `kl_s`, `kl_p`, `kl_f`, `update_time`) VALUES "
                insertContext = []
                companyName = ouzhiDataChild.find_all('td', class_='tb_plgs')
                #print companyName[0]['title']
                companyId = self.selectRetCompanyId(
                    companyName[0]['title'], mysql, fid)
                webjson = 0
                # Intended to decrement the proxy's remaining quota on each
                # failure, but the flag is never read afterwards — TODO confirm intent.
                reduceCount = 0
                # NOTE(review): unbounded retry — if every proxy fails this
                # loop spins forever; consider capping the attempts.
                while True:
                    try:
                        webjson = openUrls.useProxy(
                            ConfigStart.ANALYSISOUZHIDATAURL %
                            (fid, ouzhiDataChild['id']), mysql, 0)
                        webjson = json.loads(webjson)
                        break
                        pass
                    except Exception, e:
                        reduceCount = 1
                        continue
                        pass
                    pass
                pass
                #print webjson
                if webjson == None:
                    continue
                if webjson.__len__() == 0:
                    continue
                kellyjson = 0
                # Same unbounded-retry pattern for the Kelly-index feed.
                while True:
                    try:
                        kellyjson = openUrls.useProxy(
                            ConfigStart.ANALYSISOUZHIKELLYURL %
                            (fid, ouzhiDataChild['id']), mysql, 0)
                        kellyjson = json.loads(kellyjson)
                        break
                        pass
                    except Exception, e:
                        continue
                        pass
                    pass
                pass
                # Pair webjson[k] with kellyjson[k] by position and emit one
                # VALUES tuple per pair.
                index = 0
                for webjsonChild in webjson:
                    indexT = 0
                    for kellyjsonChild in kellyjson:
                        if index == indexT:
                            #TODO: add the row to the database
                            if index == 0:
                                insertSql += "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
                                pass
                            else:
                                insertSql += ",(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
                                pass
                            insertContext.append(fid)
                            insertContext.append(companyId)
                            insertContext.append(webjsonChild[0])
                            insertContext.append(webjsonChild[1])
                            insertContext.append(webjsonChild[2])
                            insertContext.append(webjsonChild[3])
                            insertContext.append(kellyjsonChild[0])
                            insertContext.append(kellyjsonChild[1])
                            insertContext.append(kellyjsonChild[2])
                            insertContext.append(kellyjsonChild[3])
                            pass
                            break
                            pass
                        pass
                        indexT += 1
                        pass
                    index += 1
                    pass
                mysql.update(insertSql, insertContext)
                mysql.end()
                j += 1
                pass
            i += 1
            pass
def getSecondUrl(): # allraceMainWrap try: webfile = urllib.urlopen(ConfigStart.STARTURL) webContent = webfile.read() webfile.close() soup = BeautifulSoup(webContent, ConfigStart.PARSEMETHOD) allraceMainWrap = soup.find_all(id=ConfigStart.LEAGUESDIV) #获取到网页后开始分配数据库资源 mysql = Mysql() #i为分区赛事 i = -1 # 数据模型为p_name p_type p_country p_sport_type p_main_type p_name = ConfigStart.NULLSTRING p_type = 1 p_country = ConfigStart.NULLSTRING p_sport_type = 1 p_main_type = ConfigStart.NULLSTRING sqlAll = ConfigStart.INSERTINTOLEAGUETABLE for child in allraceMainWrap[ConfigStart.DIVTOPINDEX].children: if (type(child) == bs4.element.Tag): i = i + ConfigStart.INC #print child singleUrl = child.find_all(class_=[ ConfigStart.ALLLEAGUECLASS_1, ConfigStart.ALLLEAGUECLASS_2 ]) #print singleUrl for psingleUrl in singleUrl: #print psingleUrl for getUrlTag in psingleUrl: if (type(getUrlTag) == bs4.element.Tag): if (type(getUrlTag.div) == type(None)): print getUrlTag.a[ConfigStart.HREF] print getUrlTag.span.string.encode( ConfigStart.UTF8).replace( ConfigStart.SPACESTRING, ConfigStart.NULLSTRING) p_name = getUrlTag.span.string.encode( ConfigStart.UTF8).replace( ConfigStart.SPACESTRING, ConfigStart.NULLSTRING) p_type = i + 1 p_country = ConfigStart.NULLSTRING p_main_type = ConfigStart.STARTURL + getUrlTag.a[ ConfigStart.HREF] sqlString = ConfigStart.SELECTCOUNTFROMLEAGUETABLE lSelect = [p_name, p_type, p_country] resultSelect = mysql.getOne( sqlString, lSelect) if resultSelect[ ConfigStart. 
RESULT] == ConfigStart.NULL: l = [[ p_name, p_type, p_country, p_sport_type, p_main_type ]] result = mysql.insertMany(sqlAll, l) print result pass else: #获取到国家 print getUrlTag.span.string.encode( ConfigStart.UTF8).replace( ConfigStart.SPACESTRING, ConfigStart.NULLSTRING) #获取联赛及各个Url for leagueInfo in getUrlTag.div.children: if (type(leagueInfo) == bs4.element.Tag ): print leagueInfo.string print leagueInfo[ConfigStart.HREF] p_name = leagueInfo.string.encode( ConfigStart.UTF8).replace( ConfigStart.SPACESTRING, ConfigStart.NULLSTRING) p_main_type = ConfigStart.STARTURL + leagueInfo[ ConfigStart.HREF] p_type = i + 1 p_country = getUrlTag.span.string.encode( ConfigStart.UTF8).replace( ConfigStart.SPACESTRING, ConfigStart.NULLSTRING) sqlString = ConfigStart.SELECTCOUNTFROMLEAGUETABLE lSelect = [ p_name, p_type, p_country ] resultSelect = mysql.getOne( sqlString, lSelect) if resultSelect[ ConfigStart. RESULT] == ConfigStart.NULL: l = [[ p_name, p_type, p_country, p_sport_type, p_main_type ]] result = mysql.insertMany( sqlAll, l) print result pass pass print ConfigStart.MATCHPARTION[i] pass pass pass pass #各洲的杯赛 lrace_bei allraceCup = soup.find_all(class_=ConfigStart.CPUMATCHTAG) for cup in allraceCup: i = i + 1 print cup print cup.a.string print cup.a[ConfigStart.HREF] for cupChild in cup.find_all(ConfigStart.A): print cupChild p_name = cupChild.string.encode(ConfigStart.UTF8).replace( ConfigStart.SPACESTRING, ConfigStart.NULLSTRING) p_type = i + ConfigStart.INC p_country = ConfigStart.NULLSTRING p_main_type = ConfigStart.STARTURL + cupChild[ ConfigStart.HREF] sqlString = ConfigStart.SELECTCOUNTFROMLEAGUETABLE lSelect = [p_name, p_type, p_country] resultSelect = mysql.getOne(sqlString, lSelect) if resultSelect[ConfigStart.RESULT] == ConfigStart.NULL: l = [[ p_name, p_type, p_country, p_sport_type, p_main_type ]] result = mysql.insertMany(sqlAll, l) mysql.end() print result pass pass mysql.dispose() except Exception, e: print Exception, ":", e pass