def parse_infos_course(urlsource):
    """Scrape the header block of a canalturf race page.

    Parameters:
        urlsource: URL of the race's pronostics page; the race number and
            date are extracted from the URL itself.

    Returns a list:
        [num, nom, reunion, lieu, meteo, temp, type, distance, prix,
         date, heure] -- all strings.
    """
    req = urllib2.Request(urlsource)
    html = urllib2.urlopen(req)
    try:
        pronos_page = BeautifulSoup(html, "lxml")
    finally:
        html.close()  # fix: the HTTP response was never closed (leak)
    boite_entete = pronos_page.find_all("div", "boite4rond")[0]
    # Race number comes from the URL tail: ".../<num>_...".
    num_course = urlsource[urlsource.rfind("/") + 1:]
    num_course = num_course[:num_course.find("_")]
    nom_course = boite_entete.find_all("h1")[0].contents[0]
    nom_course = sanitize.s(nom_course[:nom_course.find(' - ')])
    # The <h3> holds "heure - ... Reunion <n> | lieu"-style text that is
    # sliced three ways below (reunion, lieu, heure) -- TODO confirm layout.
    reunion_course = boite_entete.find_all("h3")[0].contents[0]
    reunion_course = sanitize.s(reunion_course[reunion_course.rfind('union') + 6:])
    lieu_course = boite_entete.find_all("h3")[0].contents[0]
    lieu_course = lieu_course[:lieu_course.find(' | ')]
    lieu_course = sanitize.s(lieu_course[lieu_course.find(' - ') + 3:])
    # Weather is encoded by the icon filename, mapped through meteo_dico.
    meteo_course = boite_entete.find_all("div", "boite4rond")[0].find_all("img")
    if not meteo_course:
        meteo_course = ''
    else:
        meteo_course = meteo_course[0].get("src")
        meteo_course = meteo_course[meteo_course.rfind('/') + 1:]
        meteo_course = meteo_dico[meteo_course]
    # str(None) == 'None' when no <h4> is present.
    temp_course = str(boite_entete.find("h4"))
    if temp_course == 'None':
        temp_course = ''
    else:
        # "\xc2" is presumably the first byte of a UTF-8 degree sign used
        # as a terminator -- TODO confirm against a live page.
        temp_course = temp_course[temp_course.find(">") + 1:temp_course.find("\xc2")] \
            + ' - ' + temp_course[temp_course.rfind("/>") + 2:temp_course.rfind("\xc2")]
    # The first <p> holds "type - distance m - ... - prix&..." and is sliced
    # three ways below.
    type_course = boite_entete.find_all("p")[0].contents[0]
    type_course = type_course[:type_course.find(' - ')]
    distance_course = boite_entete.find_all("p")[0].contents[0]
    distance_course = distance_course[:distance_course.find('m - ')]
    distance_course = distance_course[distance_course.find(' - ') + 3:]
    prix_course = boite_entete.find_all("p")[0].contents[0]
    prix_course = prix_course[prix_course.rfind(' - ') + 3:]
    prix_course = prix_course[:prix_course.find('&')]
    # Date: a fixed 10-char window after "prono" in the URL.
    date_course = urlsource[urlsource.find("prono") + 15:urlsource.find("prono") + 25]
    heure_course = boite_entete.find_all("h3")[0].contents[0]
    heure_course = heure_course[:heure_course.find(' - ')]
    return [num_course, nom_course, reunion_course, lieu_course, meteo_course,
            temp_course, type_course, distance_course, prix_course,
            date_course, heure_course]
def find_nom_cheval(urlsource):
    """Fetch a horse's fact-sheet page and return its sanitized name.

    Returns '' when the page lacks the expected div.ficheinfo/h1 structure.
    """
    req = urllib2.Request(urlsource)
    html = urllib2.urlopen(req)
    try:
        page = BeautifulSoup(html, "lxml")
    finally:
        html.close()  # fix: the HTTP response was never closed (leak)
    try:
        nom_cheval = page.find("div", "ficheinfo").find("h1").contents[0].encode("ascii", "ignore")
    # fix: bare except also swallowed KeyboardInterrupt/SystemExit; only the
    # missing-node lookups (find() -> None, empty contents) are expected here.
    except (AttributeError, IndexError):
        nom_cheval = ''
    return sanitize.s(nom_cheval)
def find_nom_cheval(urlsource):
    """Return the sanitized horse name from a canalturf fact sheet, or ''.

    NOTE(review): this redefines the identical function declared just above;
    being the later definition, this copy is the one that takes effect.
    """
    response = urllib2.urlopen(urllib2.Request(urlsource))
    soup = BeautifulSoup(response, "lxml")
    try:
        heading = soup.find("div", "ficheinfo").find("h1")
        name = heading.contents[0].encode("ascii", "ignore")
    except:
        name = ''
    return sanitize.s(name)
def parse_infos_course(urlsource):
    """Scrape the header block of a canalturf race page.

    Returns [num, nom, reunion, lieu, meteo, temp, type, distance, prix,
    date, heure] as strings.

    NOTE(review): this redefines the identical function earlier in the file;
    being the later definition, this copy is the one that takes effect.
    """
    req = urllib2.Request(urlsource)
    html = urllib2.urlopen(req)
    pronos_page = BeautifulSoup(html, "lxml")
    boite_entete = pronos_page.find_all("div", "boite4rond")[0]
    # Race number comes from the URL tail: ".../<num>_...".
    num_course = urlsource[urlsource.rfind("/") + 1:]
    num_course = num_course[:num_course.find("_")]
    nom_course = boite_entete.find_all("h1")[0].contents[0]
    nom_course = sanitize.s(nom_course[:nom_course.find(' - ')])
    # The same <h3> text is sliced three ways (reunion / lieu / heure) --
    # presumably "heure - ... Reunion <n> | lieu"; confirm against a page.
    reunion_course = boite_entete.find_all("h3")[0].contents[0]
    reunion_course = sanitize.s(reunion_course[reunion_course.rfind('union') + 6:])
    lieu_course = boite_entete.find_all("h3")[0].contents[0]
    lieu_course = lieu_course[:lieu_course.find(' | ')]
    lieu_course = sanitize.s(lieu_course[lieu_course.find(' - ') + 3:])
    # Weather is carried by the icon filename, mapped through meteo_dico.
    meteo_course = boite_entete.find_all("div", "boite4rond")[0].find_all("img")
    if not meteo_course:
        meteo_course = ''
    else:
        meteo_course = meteo_course[0].get("src")
        meteo_course = meteo_course[meteo_course.rfind('/') + 1:]
        meteo_course = meteo_dico[meteo_course]
    # str(None) == 'None' when no <h4> exists.
    temp_course = str(boite_entete.find("h4"))
    if temp_course == 'None':
        temp_course = ''
    else:
        # "\xc2" is presumably the lead byte of a UTF-8 degree marker used
        # as terminator -- TODO confirm.
        temp_course = temp_course[temp_course.find(">") + 1:temp_course.find(
            "\xc2")] + ' - ' + temp_course[temp_course.rfind("/>") + 2:temp_course.rfind("\xc2")]
    # The first <p> holds "type - distance m - ... - prix&..." and is sliced
    # three ways below.
    type_course = boite_entete.find_all("p")[0].contents[0]
    type_course = type_course[:type_course.find(' - ')]
    distance_course = boite_entete.find_all("p")[0].contents[0]
    distance_course = distance_course[:distance_course.find('m - ')]
    distance_course = distance_course[distance_course.find(' - ') + 3:]
    prix_course = boite_entete.find_all("p")[0].contents[0]
    prix_course = prix_course[prix_course.rfind(' - ') + 3:]
    prix_course = prix_course[:prix_course.find('&')]
    # Date: a fixed 10-char window after "prono" in the URL.
    date_course = urlsource[urlsource.find("prono") + 15:urlsource.find("prono") + 25]
    heure_course = boite_entete.find_all("h3")[0].contents[0]
    heure_course = heure_course[:heure_course.find(' - ')]
    return [
        num_course, nom_course, reunion_course, lieu_course, meteo_course, temp_course,
        type_course, distance_course, prix_course, date_course, heure_course]
def get_id_chevaux(mini, maxi):
    """Scrape canalturf horse fact sheets for ids mini..maxi (inclusive).

    Returns (cheval_id_table, nb_errors) where each row is
    [num, nom, sexe, age, robe, pere, mere, pere_mere, proprio, entraineur,
    eleveur, gain, perfs, courus, victoires, places].  Any field that cannot
    be parsed is left as '' and nb_errors is incremented; a page without the
    expected structure is skipped entirely.
    """
    nb_errors = 0
    url_root = "http://www.canalturf.com/courses_fiche_cheval.php?idcheval="
    cheval_id_table = []
    for i in range(mini, maxi + 1):
        urlsource = url_root + str(i)
        req = urllib2.Request(urlsource)
        html = urllib2.urlopen(req)
        try:
            page = BeautifulSoup(html, "lxml").find("div", "ficheinfo")
        finally:
            html.close()  # fix: one HTTP response was leaked per horse
        try:
            cartouche = page.find_all("div")[0]
            cart_infos = cartouche.find_all("div")[1]
            palmares = page.find("div", "fiche_bloc")
            palm_infos = palmares.find("p")
        except Exception:  # fix: bare except also trapped KeyboardInterrupt
            nb_errors = nb_errors + 1
            continue
        num = str(i)
        # All slice offsets below presumably strip fixed French field labels
        # ("Robe : ", "Pere : ", ...) -- TODO confirm against a live page.
        try:
            nom = cartouche.find("h1").contents[0].encode("ascii", "ignore")
            nom = sanitize.s(nom[nom.find("chev") + 7:])
        except Exception:
            nom = ''
            nb_errors = nb_errors + 1
        try:
            # One text node holds both sex (1 char) and age.
            sex = cart_infos.contents[0].encode("ascii", "ignore")
            sexe = sex[11:12]
            age = sex[12:]
        except Exception:
            sexe = ''
            age = ''
            nb_errors = nb_errors + 1
        try:
            robe = cart_infos.contents[2].encode("ascii", "ignore")[7:]
        except Exception:
            robe = ''
            nb_errors = nb_errors + 1
        try:
            pere = sanitize.s(cart_infos.contents[4].encode("ascii", "ignore")[6:])
        except Exception:
            pere = ''
            nb_errors = nb_errors + 1
        try:
            mere = sanitize.s(cart_infos.contents[6].encode("ascii", "ignore")[6:])
        except Exception:
            mere = ''
            nb_errors = nb_errors + 1
        try:
            pere_mere = sanitize.s(cart_infos.contents[8].encode("ascii", "ignore")[13:])
        except Exception:
            pere_mere = ''
            nb_errors = nb_errors + 1
        try:
            proprio = sanitize.s(cart_infos.contents[11].encode("ascii", "ignore")[14:])
        except Exception:
            proprio = ''
            nb_errors = nb_errors + 1
        try:
            entraineur = sanitize.s(cart_infos.contents[13].encode("ascii", "ignore")[13:])
        except Exception:
            entraineur = ''
            nb_errors = nb_errors + 1
        try:
            eleveur = sanitize.s(cart_infos.contents[15].encode("ascii", "ignore")[10:])
        except Exception:
            eleveur = ''
            nb_errors = nb_errors + 1
        try:
            gain = palm_infos.contents[0].encode("ascii", "ignore")[7:]
        except Exception:
            gain = ''
            nb_errors = nb_errors + 1
        try:
            perfs = palm_infos.contents[2].encode("ascii", "ignore")[8:]
        except Exception:
            perfs = ''
            nb_errors = nb_errors + 1
        try:
            courus = palm_infos.contents[4].encode("ascii", "ignore")[12:]
        except Exception:
            courus = ''
            nb_errors = nb_errors + 1
        try:
            victoires = palm_infos.contents[6].encode("ascii", "ignore")[14:]
        except Exception:
            victoires = ''
            nb_errors = nb_errors + 1
        try:
            places = palm_infos.contents[8].encode("ascii", "ignore")[10:]
        except Exception:
            places = ''
            nb_errors = nb_errors + 1
        cheval_id = [num, nom, sexe, age, robe, pere, mere, pere_mere, proprio,
                     entraineur, eleveur, gain, perfs, courus, victoires, places]
        cheval_id_table.append(cheval_id)
    return (cheval_id_table, nb_errors)
def get_id_chevaux(mini, maxi):
    """Scrape canalturf horse fact sheets for ids mini..maxi (inclusive).

    Returns (cheval_id_table, nb_errors) where each row is
    [num, nom, sexe, age, robe, pere, mere, pere_mere, proprio, entraineur,
    eleveur, gain, perfs, courus, victoires, places].  Unparsable fields are
    left as '' and counted in nb_errors.

    NOTE(review): this redefines the identical function earlier in the file;
    being the later definition, this copy is the one that takes effect.
    """
    nb_errors = 0
    url_root = "http://www.canalturf.com/courses_fiche_cheval.php?idcheval="
    cheval_id_table = []
    for i in range(mini, maxi + 1):
        urlsource = url_root + str(i)
        req = urllib2.Request(urlsource)
        html = urllib2.urlopen(req)
        page = BeautifulSoup(html, "lxml").find("div", "ficheinfo")
        # Pages missing the expected structure are skipped wholesale.
        try:
            cartouche = page.find_all("div")[0]
            cart_infos = cartouche.find_all("div")[1]
            palmares = page.find("div", "fiche_bloc")
            palm_infos = palmares.find("p")
        except:
            nb_errors = nb_errors + 1
            continue
        num = str(i)
        # The slice offsets below presumably strip fixed French field labels
        # ("Robe : ", "Pere : ", ...) -- TODO confirm against a live page.
        try:
            nom = cartouche.find("h1").contents[0].encode("ascii", "ignore")
            nom = sanitize.s(nom[nom.find("chev") + 7:])
        except:
            nom = ''
            nb_errors = nb_errors + 1
        try:
            # One text node holds both sex (1 char) and age.
            sex = cart_infos.contents[0].encode("ascii", "ignore")
            sexe = sex[11:12]
            age = sex[12:]
        except:
            sexe = ''
            age = ''
            nb_errors = nb_errors + 1
        try:
            robe = cart_infos.contents[2].encode("ascii", "ignore")[7:]
        except:
            robe = ''
            nb_errors = nb_errors + 1
        try:
            pere = sanitize.s(cart_infos.contents[4].encode("ascii", "ignore")[6:])
        except:
            pere = ''
            nb_errors = nb_errors + 1
        try:
            mere = sanitize.s(cart_infos.contents[6].encode("ascii", "ignore")[6:])
        except:
            mere = ''
            nb_errors = nb_errors + 1
        try:
            pere_mere = sanitize.s(cart_infos.contents[8].encode(
                "ascii", "ignore")[13:])
        except:
            pere_mere = ''
            nb_errors = nb_errors + 1
        try:
            proprio = sanitize.s(cart_infos.contents[11].encode(
                "ascii", "ignore")[14:])
        except:
            proprio = ''
            nb_errors = nb_errors + 1
        try:
            entraineur = sanitize.s(cart_infos.contents[13].encode(
                "ascii", "ignore")[13:])
        except:
            entraineur = ''
            nb_errors = nb_errors + 1
        try:
            eleveur = sanitize.s(cart_infos.contents[15].encode(
                "ascii", "ignore")[10:])
        except:
            eleveur = ''
            nb_errors = nb_errors + 1
        # Career summary ("palmares") fields.
        try:
            gain = palm_infos.contents[0].encode("ascii", "ignore")[7:]
        except:
            gain = ''
            nb_errors = nb_errors + 1
        try:
            perfs = palm_infos.contents[2].encode("ascii", "ignore")[8:]
        except:
            perfs = ''
            nb_errors = nb_errors + 1
        try:
            courus = palm_infos.contents[4].encode("ascii", "ignore")[12:]
        except:
            courus = ''
            nb_errors = nb_errors + 1
        try:
            victoires = palm_infos.contents[6].encode("ascii", "ignore")[14:]
        except:
            victoires = ''
            nb_errors = nb_errors + 1
        try:
            places = palm_infos.contents[8].encode("ascii", "ignore")[10:]
        except:
            places = ''
            nb_errors = nb_errors + 1
        cheval_id = [num,nom,sexe,age,robe,pere,mere,pere_mere,proprio,entraineur,eleveur,gain,perfs,\
            courus,victoires,places]
        cheval_id_table.append(cheval_id)
    return (cheval_id_table, nb_errors)
def parse_infos_chevaux(urlsource):
    """Parse the runners table of a canalturf race page.

    Returns (result_cheval, nb_errors); each row is
    [num_course, nom, id, num, def, ecurie, corde, oeil, jockey, poids,
    dist, cote_10h, cote_pmu, variation, cote_zeturf, cote_betclic].
    Non-runners (first cell == "NP") get a mostly-empty row.  Any field that
    cannot be parsed becomes '' and increments nb_errors.
    """
    nb_errors = 0
    req = urllib2.Request(urlsource)
    html = urllib2.urlopen(req)
    try:
        pronos_page = BeautifulSoup(html, "lxml")
    finally:
        html.close()  # fix: the HTTP response was never closed (leak)
    table_chevaux = pronos_page.find(
        "table", "course").find_all("tbody")[1].find_all("tr")
    # First tbody is the header row; its cell contents become the label
    # strings searched by find_index below.
    label = pronos_page.find("table", "course").find_all("tbody")[0].find_all("td")
    for i in range(0, len(label)):
        label[i] = str(label[i].contents)
    # fix: the race number only depends on the URL -- hoisted out of the
    # per-runner loop where it was recomputed each iteration.
    num_course = urlsource[urlsource.rfind("/") + 1:]
    num_course = num_course[:num_course.find("_")]
    result_cheval = []
    for cheval in table_chevaux:
        sstable = cheval.find_all("td")
        if sstable[0].string == "NP":
            # Non-partant: keep name and the "NP" marker, blank the rest.
            result_cheval.append([num_course, sstable[1].string,
                                  '', "NP", '', '', '', '', '', '', '', '', '', '', '', ''])
            continue
        nom_cheval = sanitize.s(
            cheval.find("strong").contents[0].encode("ascii", "ignore"))
        # Horse id is the idcheval=... query parameter of the fact-sheet link.
        id_cheval = cheval.find("a", "fiche").get("href")
        id_cheval = id_cheval[id_cheval.find("idcheval") + 9:]
        id_cheval = id_cheval[:id_cheval.find("&")]
        num_cheval = sstable[0].string.encode("ascii", "ignore")
        ind = find_index(label, "Def")
        try:
            def_cheval = sstable[ind].contents[0].encode("ascii", "ignore")
        except Exception:  # fix: bare except also trapped KeyboardInterrupt
            nb_errors = nb_errors + 1
            def_cheval = ''
        ind = find_index(label, "Ec")
        try:
            ecurie_cheval = sstable[ind].contents[0].encode("ascii", "ignore")
        except Exception:
            nb_errors = nb_errors + 1
            ecurie_cheval = ''
        ind = find_index(label, "Corde")
        try:
            corde_cheval = sstable[ind].contents[0].encode("ascii", "ignore")
        except Exception:
            nb_errors = nb_errors + 1
            corde_cheval = ''
        ind = find_index(label, "Oeil")
        try:
            oeil_cheval = sstable[ind].contents[0].encode("ascii", "ignore")
        except Exception:
            nb_errors = nb_errors + 1
            oeil_cheval = ''
        ind = find_index(label, "Entrai")
        try:
            # NOTE(review): the column label is "Entrai..." (trainer) but the
            # value is stored as the jockey -- presumably the cell holds both;
            # confirm against a live page.
            jockey_cheval = sanitize.s(
                sstable[ind].find_all("a")[0].contents[1].encode("ascii", "ignore"))
        except Exception:
            nb_errors = nb_errors + 1
            jockey_cheval = ''
        ind = find_index(label, "Poids")
        try:
            poids_cheval = sstable[ind].contents[0].encode("ascii", "ignore")
        except Exception:
            nb_errors = nb_errors + 1
            poids_cheval = ''
        ind = find_index(label, "Dist")
        try:
            dist_cheval = sstable[ind].contents[0].encode("ascii", "ignore")
        except Exception:
            nb_errors = nb_errors + 1
            dist_cheval = ''
        # The four odds columns sit at fixed offsets after the "10h" column.
        ind = find_index(label, "10h")
        try:
            cote10h_cheval = sstable[ind].contents[0].string.encode("ascii", "ignore")
        except Exception:
            nb_errors = nb_errors + 1
            cote10h_cheval = ''
        try:
            cotepmu_cheval = sstable[ind + 1].find(
                "strong").contents[0].string.encode("ascii", "ignore")
        except Exception:
            nb_errors = nb_errors + 1
            cotepmu_cheval = ''
        try:
            variation = sstable[ind + 2].contents[0].string.encode("ascii", "ignore")
        except Exception:
            nb_errors = nb_errors + 1
            variation = ''
        try:
            cotezeturf_cheval = sstable[ind + 3].find(
                "strong").contents[0].string.encode("ascii", "ignore")
        except Exception:
            nb_errors = nb_errors + 1
            cotezeturf_cheval = ''
        try:
            cotebetclic_cheval = sstable[ind + 4].find(
                "strong").contents[0].string.encode("ascii", "ignore")
        except Exception:
            nb_errors = nb_errors + 1
            cotebetclic_cheval = ''
        cheval_infos = [num_course, nom_cheval, id_cheval, num_cheval,
                        def_cheval, ecurie_cheval, corde_cheval, oeil_cheval,
                        jockey_cheval, poids_cheval, dist_cheval,
                        cote10h_cheval, cotepmu_cheval, variation,
                        cotezeturf_cheval, cotebetclic_cheval]
        result_cheval.append(cheval_infos)
    return (result_cheval, nb_errors)
def parse_infos_chevaux(urlsource):
    """Parse the runners table of a canalturf race page.

    Returns (result_cheval, nb_errors); each row is
    [num_course, nom, id, num, def, ecurie, corde, oeil, jockey, poids,
    dist, cote_10h, cote_pmu, variation, cote_zeturf, cote_betclic].
    Non-runners ("NP") get a mostly-empty row; unparsable fields become ''
    and increment nb_errors.

    NOTE(review): this redefines the identical function earlier in the file;
    being the later definition, this copy is the one that takes effect.
    """
    nb_errors = 0
    req = urllib2.Request(urlsource)
    html = urllib2.urlopen(req)
    pronos_page = BeautifulSoup(html,"lxml")
    table_chevaux = pronos_page.find("table", "course").find_all("tbody")[1].find_all("tr")
    # First tbody is the header row; its cell contents become the label
    # strings searched by find_index below.
    label = pronos_page.find("table", "course").find_all("tbody")[0].find_all("td")
    for i in range(0,len(label)):
        label[i] = str(label[i].contents)
    result_cheval = []
    for cheval in table_chevaux:
        sstable = cheval.find_all("td")
        # Race number comes from the URL tail: ".../<num>_...".
        num_course = urlsource[urlsource.rfind("/")+1:]
        num_course = num_course[:num_course.find("_")]
        if sstable[0].string == "NP":
            # Non-partant: keep name and the "NP" marker, blank the rest.
            result_cheval.append([num_course,sstable[1].string,\
                '',"NP",'','','','','','','','','','','',''])
            continue
        nom_cheval = sanitize.s(cheval.find("strong").contents[0].encode("ascii","ignore"))
        # Horse id is the idcheval=... query parameter of the fact-sheet link.
        id_cheval = cheval.find("a","fiche").get("href")
        id_cheval = id_cheval[id_cheval.find("idcheval")+9:]
        id_cheval = id_cheval[:id_cheval.find("&")]
        num_cheval = sstable[0].string.encode("ascii","ignore")
        ind = find_index(label,"Def")
        try:
            def_cheval = sstable[ind].contents[0].encode("ascii","ignore")
        except:
            nb_errors = nb_errors+1
            def_cheval = ''
        ind = find_index(label,"Ec")
        try:
            ecurie_cheval = sstable[ind].contents[0].encode("ascii","ignore")
        except:
            nb_errors = nb_errors+1
            ecurie_cheval = ''
        ind = find_index(label,"Corde")
        try:
            corde_cheval = sstable[ind].contents[0].encode("ascii","ignore")
        except:
            nb_errors = nb_errors+1
            corde_cheval = ''
        ind = find_index(label,"Oeil")
        try:
            oeil_cheval = sstable[ind].contents[0].encode("ascii","ignore")
        except:
            nb_errors = nb_errors+1
            oeil_cheval = ''
        ind = find_index(label,"Entrai")
        try:
            # NOTE(review): label is "Entrai..." (trainer) but the value is
            # stored as the jockey -- presumably the cell holds both; confirm.
            jockey_cheval = sanitize.s(sstable[ind].find_all("a")[0].contents[1].encode("ascii","ignore"))
        except:
            nb_errors = nb_errors+1
            jockey_cheval = ''
        ind = find_index(label,"Poids")
        try:
            poids_cheval = sstable[ind].contents[0].encode("ascii","ignore")
        except:
            nb_errors = nb_errors+1
            poids_cheval = ''
        ind = find_index(label,"Dist")
        try:
            dist_cheval = sstable[ind].contents[0].encode("ascii","ignore")
        except:
            nb_errors = nb_errors+1
            dist_cheval = ''
        # The four odds columns sit at fixed offsets after the "10h" column.
        ind = find_index(label,"10h")
        try:
            cote10h_cheval = sstable[ind].contents[0].string.encode("ascii","ignore")
        except:
            nb_errors = nb_errors+1
            cote10h_cheval = ''
        try:
            cotepmu_cheval = sstable[ind+1].find("strong").contents[0].string.encode("ascii","ignore")
        except:
            nb_errors = nb_errors+1
            cotepmu_cheval = ''
        try:
            variation = sstable[ind+2].contents[0].string.encode("ascii","ignore")
        except:
            nb_errors = nb_errors+1
            variation = ''
        try:
            cotezeturf_cheval = sstable[ind+3].find("strong").contents[0].string.encode("ascii","ignore")
        except:
            nb_errors = nb_errors+1
            cotezeturf_cheval = ''
        try:
            cotebetclic_cheval = sstable[ind+4].find("strong").contents[0].string.encode("ascii","ignore")
        except:
            nb_errors = nb_errors+1
            cotebetclic_cheval = ''
        cheval_infos = [num_course, nom_cheval, id_cheval, num_cheval,\
            def_cheval,ecurie_cheval, corde_cheval,oeil_cheval,jockey_cheval,\
            poids_cheval,dist_cheval,cote10h_cheval,cotepmu_cheval,variation,\
            cotezeturf_cheval,cotebetclic_cheval]
        result_cheval.append(cheval_infos)
    return (result_cheval,nb_errors)