def getWaitCrawler(self):
    """Run the league-crawler count query and return its result column.

    Executes ``ConfigStart.SELECTCOUNTFROMLEAGUECRAWLER`` on a fresh
    ``Mysql`` handle and returns the ``ConfigStart.RESULT`` entry of the
    row that ``getOne`` yields.
    """
    mysql = Mysql()
    try:
        result = mysql.getOne(ConfigStart.SELECTCOUNTFROMLEAGUECRAWLER)
    finally:
        # Release the handle even when the query raises.
        mysql.dispose()
    return result[ConfigStart.RESULT]
def getJifen(self, limit):
    """Find the "jifen" link on each league-season page and store it.

    Selects rows with ``ConfigStart.SELECTFROMLEAGUEYEARINFOLIMIT`` (``limit``
    is passed as the query parameter), downloads each row's
    ``p_league_url`` page, and collects every anchor whose ``href`` matches
    ``ConfigStart.COMPILEJIFEN`` and whose text equals
    ``ConfigStart.STRINGJIFEN``.  All ``(p_id, absolute url)`` pairs are then
    written with one multi-row statement built from
    ``UPDATELEAGUEYEARINFO_TOP`` / ``UPDATELEAGUEYEARINFO_BOTTOM``.

    Returns ``None``.  Nothing is written when the select yields no rows or
    when no matching link is found.
    """
    mysql = Mysql()
    try:
        seasons = mysql.getAll(ConfigStart.SELECTFROMLEAGUEYEARINFOLIMIT,
                               limit)
        # NOTE(review): getAll appears to return False when nothing matches;
        # an empty result is treated the same way.
        if not seasons:
            return

        # Compile once instead of once per downloaded page.
        jifenPattern = re.compile(ConfigStart.COMPILEJIFEN)
        params = []
        for season in seasons:
            webfile = urllib.urlopen(season[LeagueYearInfo.p_league_url])
            try:
                rawPage = webfile.read()
            finally:
                webfile.close()
            soup = BeautifulSoup(unicode(rawPage, ConfigStart.GBK),
                                 ConfigStart.PARSEMETHOD)
            for link in soup.find_all(href=jifenPattern):
                if link.string == ConfigStart.STRINGJIFEN:
                    print(link[ConfigStart.HREF])
                    params.append(season[LeagueYearInfo.p_id])
                    params.append(ConfigStart.STARTURL +
                                  link[ConfigStart.HREF])

        if not params:
            # No links found: a statement with an empty VALUES list would be
            # invalid SQL, so skip the write entirely.
            return

        # Two parameters per row -> one "(%s,%s)" group per row.
        placeholders = ",".join(["(%s,%s)"] * (len(params) // 2))
        sqlInsert = (ConfigStart.UPDATELEAGUEYEARINFO_TOP + placeholders +
                     ConfigStart.UPDATELEAGUEYEARINFO_BOTTOM)
        print(sqlInsert)
        result = mysql.update(sqlInsert, params)
        print(result)
    finally:
        # Always hand the connection back, including on the early returns.
        mysql.dispose()
def startLeagueMain(self, mutex):
    """Claim one league under ``mutex`` and store all of its season links.

    While holding ``mutex`` the method fetches a row with
    ``ConfigStart.SELECTFROMLEAGUECRAWLER`` and flags it through
    ``ConfigStart.UPDATELEAGUESETCRAWLER``.  It then downloads the league's
    ``p_main_url`` page and inserts one row per season entry found in the
    first element carrying the ``ConfigStart.DROPLISTCLASS`` class.

    Args:
        mutex: lock object offering ``acquire()`` / ``release()``
            (presumably shared between worker threads - confirm with caller).

    Returns ``None``.  When ``getOne`` returns a bool (no row available) the
    method prints a completion message and returns without crawling.
    """
    # Acquire the database handle.
    mysql = Mysql()
    try:
        p_id, p_name, p_main_url = 0, '', ''
        if mutex.acquire():
            try:
                result = mysql.getOne(ConfigStart.SELECTFROMLEAGUECRAWLER)
                if isinstance(result, bool):
                    print("分析完毕")
                    return
                mysql.update(ConfigStart.UPDATELEAGUESETCRAWLER,
                             result[League.p_id])
                print(result[League.p_name])
                p_id = result[League.p_id]
                p_name = result[League.p_name]
                p_main_url = result[League.p_main_url]
            finally:
                # Release on every path so a failing query cannot leave the
                # other workers blocked on the lock.
                mutex.release()

        # Fetch every season currently listed for the league, with its URL.
        print("正在获取联赛___(%s)___的所有赛季信息获取的url为:%s " % (p_name, p_main_url))
        webfile = urllib.urlopen(p_main_url)
        try:
            webContent = webfile.read()
        finally:
            webfile.close()
        webContent = unicode(webContent, ConfigStart.GBK)
        soup = BeautifulSoup(webContent, ConfigStart.PARSEMETHOD)
        leagueYears = soup.find_all(class_=ConfigStart.DROPLISTCLASS)
        for leagueYear in leagueYears[0].children:
            # Skip the text nodes between the entries.
            if not isinstance(leagueYear, bs4.element.Tag):
                continue
            anchor = leagueYear.a
            print(anchor[ConfigStart.STRINGTITLE])
            print(anchor[ConfigStart.HREF])
            row = [[
                p_id,
                ConfigStart.STARTURL + anchor[ConfigStart.HREF],
                anchor[ConfigStart.STRINGTITLE],
                anchor.string,
            ]]
            result = mysql.insertMany(ConfigStart.INSERTINTOLEAGUEYEARINFO,
                                      row)
            print(result)
    finally:
        # Dispose exactly once, whether we finished, bailed out or failed.
        mysql.dispose()
def _collectRounds(self, soup):
    """Return the round/group identifiers found on a parsed standings page."""
    rounds = []

    groupLists = soup.find_all(id='div_group_list')
    if groupLists:
        for node in groupLists[0].children:
            if (isinstance(node, bs4.element.Tag)
                    and node['data-group'] != 'all'):
                rounds.append(node['data-group'])

    matchGroups = soup.find_all(id='match_group')
    if matchGroups:
        for node in matchGroups[0].children:
            if (isinstance(node, bs4.element.Tag)
                    and node.a['data-group'] != 'all'):
                rounds.append(node.a['data-group'])

    # Each element with class 'lmb3' contributes one numbered round (1..n).
    rounds.extend(range(1, len(soup.find_all(class_='lmb3')) + 1))
    return rounds


def _storeMatches(self, mysql, leagueId, stid, roundId=None):
    """Download one match list and insert it; return True if rows were sent."""
    fields = ('fid', 'ghalfscore', 'gid', 'gname', 'gscore', 'gstanding',
              'gsxname', 'handline', 'hhalfscore', 'hid', 'hname', 'hscore',
              'hstanding', 'hsxname', 'round', 'status', 'stime')

    # The URL is built from scratch for every request so parameters from a
    # previous round are never carried over.
    url = "http://liansai.500.com/index.php?c=score&a=getmatch&stid=" + stid
    if roundId is not None:
        url += "&round=" + str(roundId)

    response = urllib.urlopen(url)
    try:
        payload = response.read()
    finally:
        response.close()
    matches = json.loads(unicode(payload, 'gbk'))

    params = []
    rowCount = 0
    for match in matches:
        print(match)
        params.append(leagueId)
        params.extend(match[name] for name in fields)
        rowCount += 1
    if rowCount == 0:
        return False

    # One "(%s, ..., %s)" group per row: p_leagueid plus the JSON fields.
    rowPlaceholder = "(" + ", ".join(["%s"] * (len(fields) + 1)) + ")"
    sqlInsert = ("INSERT INTO `matchinfo` (`p_leagueid`, `fid`, `ghalfscore`, `gid`, `gname`, `gscore`, `gstanding`, `gsxname`, `handline`, `hhalfscore`, `hid`, `hname`, `hscore`, `hstanding`, `hsxname`, `round`, `status`, `stime`) VALUES "
                 + ",".join([rowPlaceholder] * rowCount))
    print(mysql.update(sqlInsert, params))
    return True


def getMatchInfo(self, limit):
    """Crawl match results for pending ``matchurl`` rows into ``matchinfo``.

    Selects up to 10 rows with ``p_use=0`` (``limit`` is the offset passed
    to the query), downloads each row's ``p_url`` page, works out which
    rounds/groups the page lists, and requests the match list for each of
    them from ``http://liansai.500.com/index.php``.  When the page lists no
    rounds a single request without a ``round`` parameter is made.  The
    returned matches are inserted into ``matchinfo`` and the row is flagged
    with ``p_use=1``.

    Returns ``None``.
    """
    mysql = Mysql()
    try:
        pending = mysql.getAll(
            "select * from matchurl WHERE p_use=0 limit %s,10", limit)
        # NOTE(review): getAll appears to return False when nothing matches.
        if not pending:
            return

        for row in pending:
            webfile = urllib.urlopen(row['p_url'])
            try:
                rawPage = webfile.read()
            finally:
                webfile.close()
            soup = BeautifulSoup(unicode(rawPage, 'gbk'),
                                 ConfigStart.PARSEMETHOD)

            # The id sits between "jifen-" and the next "/" of the page URL.
            stid = row['p_url'].split("jifen-")[1].split("/")[0]

            # Current round number or group name(s) shown on the page.
            rounds = self._collectRounds(soup)
            if rounds:
                for roundId in rounds:
                    self._storeMatches(mysql, row['pid'], stid, roundId)
            elif not self._storeMatches(mysql, row['pid'], stid):
                # NOTE(review): as before, a page without rounds AND without
                # match data is left at p_use=0 - confirm this is intended.
                continue

            # Flag the URL as crawled.
            mysql.update("update matchurl set p_use=1 where pid = %s",
                         row['pid'])
            print("matchurl更新成功")
    finally:
        # Release the handle on every exit path, including errors.
        mysql.dispose()
# Earlier pipeline stage (match-info crawl), currently disabled:
# func_var = []
# for i in range(threadCount):
#     func_var.append(([i * ConfigStart.THREADCOUNT], None))
#     pass
# requests = threadPoolBase.makeRequests(jifenToMatchUrl.getMatchInfo, func_var)
# [pool.putRequest(req) for req in requests]
# pool.wait()
# mysql.dispose()

# Work out how much data there is, then fan the analysis out over the pool.
try:
    analysisData = AnalysisData()
    sqlAll = ConfigStart.SELECTFROMMATCHINFO
    resultSelect = mysql.getOne(sqlAll)
    # Floor division keeps the count an int on every Python version;
    # the +1 covers the final, partially filled batch.
    threadCount = int(
        resultSelect[ConfigStart.RESULT]) // ConfigStart.THREADCOUNT + 1
    # NOTE(review): secondToJifen is not used below; kept in case the
    # constructor is relied on for side effects.
    secondToJifen = SecondToJifen()
    # One request per batch; each carries its starting offset as the only
    # positional argument.
    func_var = [([i * ConfigStart.THREADCOUNT], None)
                for i in range(threadCount)]
    requests = threadPoolBase.makeRequests(
        analysisData.getDataFromMatchInfo, func_var)
    for req in requests:
        pool.putRequest(req)
    pool.wait()
    mysql.dispose()
except Exception as e:
    # 'wb+' truncates main.log, so only the latest failure is kept.
    with open('main.log', 'wb+') as logFile:
        logFile.write("%s\r\n" % e)
def _cleanLeagueName(text):
    """Encode ``text`` with ConfigStart.UTF8 and drop ConfigStart.SPACESTRING."""
    return text.encode(ConfigStart.UTF8).replace(ConfigStart.SPACESTRING,
                                                 ConfigStart.NULLSTRING)


def _insertLeagueIfAbsent(mysql, p_name, p_type, p_country, p_sport_type,
                          p_main_type, commit=False):
    """Insert one league row unless the count query already finds it."""
    found = mysql.getOne(ConfigStart.SELECTCOUNTFROMLEAGUETABLE,
                         [p_name, p_type, p_country])
    if found[ConfigStart.RESULT] == ConfigStart.NULL:
        result = mysql.insertMany(
            ConfigStart.INSERTINTOLEAGUETABLE,
            [[p_name, p_type, p_country, p_sport_type, p_main_type]])
        if commit:
            mysql.end()
        print(result)


def getSecondUrl():
    """Scrape the start page and register every league and cup it links to.

    Downloads ``ConfigStart.STARTURL``, walks the children of the
    ``ConfigStart.LEAGUESDIV`` element (one child tag per region) and then
    the ``ConfigStart.CPUMATCHTAG`` elements (cups).  Each link is inserted
    into the league table unless ``SELECTCOUNTFROMLEAGUETABLE`` reports it
    as already present.  Any exception is caught and printed; the function
    returns ``None``.
    """
    mysql = None
    try:
        webfile = urllib.urlopen(ConfigStart.STARTURL)
        try:
            webContent = webfile.read()
        finally:
            webfile.close()
        soup = BeautifulSoup(webContent, ConfigStart.PARSEMETHOD)
        allraceMainWrap = soup.find_all(id=ConfigStart.LEAGUESDIV)

        # Allocate the database handle only once the page has been fetched.
        mysql = Mysql()
        # i is the index of the current region / cup section.
        i = -1
        # Row model: p_name, p_type, p_country, p_sport_type, p_main_type.
        p_sport_type = 1

        for child in allraceMainWrap[ConfigStart.DIVTOPINDEX].children:
            if not isinstance(child, bs4.element.Tag):
                continue
            i = i + ConfigStart.INC
            singleUrl = child.find_all(class_=[
                ConfigStart.ALLLEAGUECLASS_1, ConfigStart.ALLLEAGUECLASS_2
            ])
            for psingleUrl in singleUrl:
                for getUrlTag in psingleUrl:
                    if not isinstance(getUrlTag, bs4.element.Tag):
                        continue
                    if getUrlTag.div is None:
                        # Plain entry: the tag itself links to a league.
                        print(getUrlTag.a[ConfigStart.HREF])
                        p_name = _cleanLeagueName(getUrlTag.span.string)
                        print(p_name)
                        _insertLeagueIfAbsent(
                            mysql, p_name, i + 1, ConfigStart.NULLSTRING,
                            p_sport_type,
                            ConfigStart.STARTURL +
                            getUrlTag.a[ConfigStart.HREF])
                    else:
                        # Country entry: its nested div lists the leagues.
                        p_country = _cleanLeagueName(getUrlTag.span.string)
                        print(p_country)
                        # Walk the leagues and their URLs.
                        for leagueInfo in getUrlTag.div.children:
                            if not isinstance(leagueInfo, bs4.element.Tag):
                                continue
                            print(leagueInfo.string)
                            print(leagueInfo[ConfigStart.HREF])
                            _insertLeagueIfAbsent(
                                mysql, _cleanLeagueName(leagueInfo.string),
                                i + 1, p_country, p_sport_type,
                                ConfigStart.STARTURL +
                                leagueInfo[ConfigStart.HREF])
            print(ConfigStart.MATCHPARTION[i])

        # Continental cups (lrace_bei).
        allraceCup = soup.find_all(class_=ConfigStart.CPUMATCHTAG)
        for cup in allraceCup:
            i = i + 1
            print(cup)
            print(cup.a.string)
            print(cup.a[ConfigStart.HREF])
            for cupChild in cup.find_all(ConfigStart.A):
                print(cupChild)
                # Cup rows call mysql.end() right after the insert, exactly
                # as the original code did for this section only.
                _insertLeagueIfAbsent(
                    mysql, _cleanLeagueName(cupChild.string),
                    i + ConfigStart.INC, ConfigStart.NULLSTRING,
                    p_sport_type,
                    ConfigStart.STARTURL + cupChild[ConfigStart.HREF],
                    commit=True)
    except Exception as e:
        print("%s : %s" % (Exception, e))
    finally:
        # Dispose on success and on failure; previously an exception
        # skipped the dispose call and leaked the handle.
        if mysql is not None:
            mysql.dispose()