def seller_info(seller_link):
    page = proxy_module.main(seller_link)
    soup = BeautifulSoup(page)
    page.close()
    view_item = soup.find_all("td", attrs={"id": "viewItemId"})
    try:
        view_item = str(view_item[0].a.get("href"))
        view_link = view_item
        print view_link
        page2 = proxy_module.main(view_item)
        soup2 = BeautifulSoup(page2)
        page2.close()
        last_item_content = soup2.find_all("h1", attrs={"class": "vi-is1-titleH1"})
        last_item_content = str(last_item_content[0].get_text()).encode('ascii', 'ignore')
        item_condition = soup2.find_all("span", attrs={"class": "vi-is1-condText"})
        item_condition = str(item_condition[0].string).encode('ascii', 'ignore')
        ended = soup2.find_all("span", attrs={"class": "vi-is1-dt"})
        ended = str(ended[0].get_text()).encode('ascii', 'ignore')
        selling_price = soup2.find_all("span", attrs={"id": "v4-27"})
        selling_price = str(selling_price[0].get_text()).encode('ascii', 'ignore')
        shipping = soup2.find_all("span", attrs={"id": "fshippingCost"})
        shipping = str(shipping[0].get_text()).encode('ascii', 'ignore')
    except:
        view_item = "private"
    if view_item.lower() == "private":
        last_item_content = "None as it is private "
        view_link = "No link as it is private "
        item_condition = "Private"
        ended = "Not Known "
        selling_price = "Not Known "
        shipping = "Not Known"
    #print last_item_content, view_link, item_condition, ended, selling_price, shipping
    return last_item_content, view_link, item_condition, ended, selling_price, shipping
def seller_info(link):
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("td", attrs={"id": "viewItemId"})
    try:
        view_link = str(data[0].a.get("href"))
        page = proxy_module.main(view_link)
        print view_link
        soup = BeautifulSoup(page)
        data = soup.find_all("h1", attrs={"class": "vi-is1-titleH1"})
        last_item_content = str(data[0].get_text())
        data = soup.find_all("span", attrs={"class": "vi-is1-condText"})
        item_condition = str(data[0].string)
        try:
            data = soup.find_all("span", attrs={"class": "vi-is1-dt"})
            ended = str(data[0].get_text())
        except:
            ended = "Not yet ended"
        data = soup.find_all("span", attrs={"class": "vi-is1-prcp"})
        selling_price = str(data[0].get_text())
        data = soup.find_all("span", attrs={"id": "fshippingCost"})
        shipping = str(data[0].get_text())
    except:
        view_link = "private"
    if view_link.lower() == "private":
        last_item_content = "None as it is private "
        view_link = "No link as it is private "
        item_condition = "Private"
        ended = "Not Known "
        selling_price = "Not Known "
        shipping = "Not Known"
    return last_item_content, view_link, item_condition, ended, selling_price, shipping
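# --- Hedged sketch, not part of the original source -----------------------------------
# Every scraper here calls proxy_module.main(url) and expects a file-like response
# (BeautifulSoup reads it, callers close it with page.close()). The real proxy_module is
# not shown; the helper below is only a guess at its shape, assuming urllib2 and an
# optional HTTP proxy. The proxy address and User-Agent values are placeholders.
def _proxy_main_sketch(url, proxy=None):
    import urllib2
    handlers = []
    if proxy:
        # e.g. "127.0.0.1:8080" -- placeholder, not taken from the original code
        handlers.append(urllib2.ProxyHandler({"http": proxy}))
    opener = urllib2.build_opener(*handlers)
    opener.addheaders = [("User-Agent", "Mozilla/5.0")]  # assumed header
    return opener.open(url)  # response supports read() and close()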
def a2zthird(db, cursor, link, movie_name, movie_link):
    page = proxy_module.main(movie_link)
    soup = BeautifulSoup(page)
    data = soup.find_all("strong")
    for l in data:
        s = l.get_text().encode("ascii", "ignore")
        if re.search(r"Dailymotion", s):
            print "*"*10
            print movie_name, movie_link
            print "Dailymotion"
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                #print m.get_text(), m.get("href")   ok here
                watch = m.get_text().encode("ascii", "ignore")
                watch_link = m.get("href").encode("ascii", "ignore")
                a2zdaily(db, cursor, link, movie_name, movie_link, watch, watch_link)
            #sys.exit()
        elif re.search(r"Youtube", s):
            print "*"*10
            print movie_name, movie_link
            print "Youtube"
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                #print m.get_text(), m.get("href")   ok here
                watch = m.get_text().encode("ascii", "ignore")
                watch_link = m.get("href").encode("ascii", "ignore")
                a2zyou(db, cursor, link, movie_name, movie_link, watch, watch_link)
            #sys.exit()
        else:
            pass
def a2z():
    link = "http://www.hindilinks4u.net/hindi-movies-a-to-z"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div", attrs={"id": "wp_page_numbers"})
    soup = BeautifulSoup(str(data))
    data = soup.find_all("a")
    page_list = []
    threads = []
    for l in data:
        if l.get("href") not in page_list:
            page_list.append(l.get("href"))
    for link in page_list:
        link = link.encode("ascii", "ignore")
        #a2zsecond(link)
        t = threading.Thread(target=a2zsecond, args=(link,))
        threads.append(t)
        t.start()
        logging.debug("a2z")
        if len(threads) > 10:
            t.join()
            del threads[:]
def year():
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div", attrs={"class": "tagcloud"})
    year2(str(data))
def archives4(db, cursor, links, archives_name, archives_link, movie_name, movie_link):
    page = proxy_module.main(movie_link)
    soup = BeautifulSoup(page)
    page.close()
    #print movie_name, movie_link
    image_link = image_finder(soup, movie_name, movie_link)
    data = soup.find_all("strong")
    for l in data:
        s = l.get_text().encode("ascii", "ignore")
        if re.search(r"Dailymotion", s):
            print
            print "Dailymotion"
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                #print m.get_text(), m.get("href")   ok here
                watch = m.get_text().encode("ascii", "ignore")
                watch_link = m.get("href").encode("ascii", "ignore")
                archivesdaily(db, cursor, links, archives_name, archives_link, movie_name, movie_link, watch, watch_link, image_link)
            #sys.exit()
        elif re.search(r"Youtube", s):
            print
            print "You tube"
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                #print m.get_text(), m.get("href")   ok here
                watch = m.get_text().encode("ascii", "ignore")
                watch_link = m.get("href").encode("ascii", "ignore")
                archivesyou(db, cursor, links, archives_name, archives_link, movie_name, movie_link, watch, watch_link, image_link)
            #sys.exit()
        else:
            pass
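# --- Hedged sketch, not part of the original source -----------------------------------
# archives4() and director4() call image_finder(soup, movie_name, movie_link) to get an
# image URL for the movie page, but that helper is not included here. A minimal guess at
# its behaviour: prefer an <img> whose alt text mentions the movie, else the first <img>,
# else fall back to the page link. Selector choices are assumptions.
def _image_finder_sketch(soup, movie_name, movie_link):
    for img in soup.find_all("img"):
        alt = img.get("alt") or ""
        if movie_name and movie_name.lower() in alt.encode("ascii", "ignore").lower():
            return img.get("src").encode("ascii", "ignore")
    first = soup.find("img")
    if first and first.get("src"):
        return first.get("src").encode("ascii", "ignore")
    return movie_link  # nothing found; use the movie page URL itself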
def a2zsecond(db, cursor, link):
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    data = soup.find_all("div", attrs={"class": "results_content"})
    for l in data:
        movie_name = l.get_text().encode("ascii", "ignore")
        movie_link = l.a.get("href").encode("ascii", "ignore")
        #print movie_name, movie_link
        a2zthird(db, cursor, link, movie_name, str(movie_link))
def archives3(db, cursor, links, archives_name, archives_link):
    page = proxy_module.main(archives_link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div", attrs={"class": "results_content"})
    for l in data:
        movie_link = l.a.get("href").encode("ascii", "ignore")
        movie_name = l.a.get_text().encode("ascii", "ignore")
        archives4(db, cursor, links, archives_name, archives_link, movie_name, movie_link)
def cato2(db, cursor, cat_name, cat_link):
    page = proxy_module.main(cat_link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div", attrs={"class": "results_content"})
    for l in data:
        movie_name = l.a.get_text().encode("ascii", "ignore")
        movie_link = l.a.get("href").encode("ascii", "ignore")
        #print film_name, film_link   #ok till here
        cato3(db, cursor, cat_name, cat_link, movie_name, movie_link)
def archives(db, cursor):
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("h2", attrs={"class": "widgettitle"})
    for l in data:
        if str(l.get_text()).strip() == "Archives":
            links = l.find_next("ul")
            archives2(db, cursor, str(links))
def year(db, cursor):
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div", attrs={"class": "tagcloud"})
    #for l in data:
    #    if str(l.get_text()).strip() == "Movies By years":
    #        links = l.find_next("ul")
    year2(db, cursor, str(data))
def latest_movie():
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("h2", attrs={"class": "widgettitle"})
    for l in data:
        if str(l.get_text()).strip() == "Latest Movies":
            links = l.find_next("ul")
            latest_movie2(str(links))
def tvs():
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("h2", attrs={"class": "widgettitle"})
    for l in data:
        if str(l.get_text()).strip() == "TV Shows and Awards":
            links = l.find_next("ul")
            tvs2(str(links))
def main():
    collection = ["main_link", "title", "sub_link", "actual_price", "final_price", "colours"]
    in_file(collection)
    link = "http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?p[]=facets.ideal_for%255B%255D%3DWomen&p[]=sort%3Dpopularity&sid=reh%2Cihu%2Cm08&facetOrder[]=ideal_for&otracker=nmenu_sub_women_0_Handbags"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    data = soup.find_all("span", attrs={"class": "items"})
    number = data[0].get_text()
    number = number.strip()
    number = int(number)
    for num in range(1, number, 15):
        main2(num)
def a2zyou(db, cursor, link, movie_name, movie_link, watch, watch_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    # attrs must map "src" to the pattern (a dict, not a set), and the <embed>
    # check should also match a youtube src rather than an empty pattern
    if soup.find_all("iframe", attrs={"src": re.compile("youtube")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("youtube")})
        em_link = em[0]["src"].encode("ascii", "ignore")
        print movie_name, watch, em_link
    elif soup.find_all("embed", attrs={"src": re.compile(r"youtube")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"youtube")})
        em_link = em[0]["src"].encode("ascii", "ignore")
        print movie_name, watch, em_link
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")
        print movie_name, watch, em_link
def tvsdaily(links, show_name, show_link, watch, watch_link, image_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()
    if soup.find_all("iframe", attrs={"src": re.compile("dailymotion")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    elif soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")
    collection = [show_name, show_link, "Dailymotion", watch, em_link, image_link]
    logging.debug(tuple(collection))
    in_file(collection)
def yearyou(links, year_name, year_link, movie_name, movie_link, watch, watch_link, image_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()
    if soup.find_all("iframe", attrs={"src": re.compile("youtube")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("youtube")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    elif soup.find_all("embed", attrs={"src": re.compile(r"youtube")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"youtube")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")
    collection = [year_name, year_link, movie_name, movie_link, "Youtube", watch, em_link, image_link]
    logging.debug(tuple(collection))
    in_file(collection)
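# --- Hedged sketch, not part of the original source -----------------------------------
# tvsdaily(), yearyou(), catodaily() and the Flipkart scrapers all hand their rows to
# in_file(collection). The original helper is not shown; the sketch below simply appends
# one CSV row per call. The output file name is a placeholder.
def _in_file_sketch(collection, path="output.csv"):
    import csv
    out = open(path, "ab")  # append in binary mode for the Python 2 csv module
    try:
        csv.writer(out).writerow(collection)
    finally:
        out.close()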
def cato(db, cursor):
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("h2", attrs={"class": "widgettitle"})
    for l in data:
        if str(l.get_text()).strip() == "Categories":
            cat_list = l.find_next("ul")
            cat_soup = BeautifulSoup(str(cat_list))
            for cat in cat_soup.find_all("a"):
                cat_name = cat.get_text().encode("ascii", "ignore")
                cat_link = cat.get("href").encode("ascii", "ignore")
                cato2(db, cursor, cat_name, cat_link)
def jwellery():
    main_link = "http://www.flipkart.com/jewellery"
    page = proxy_module.main(main_link)
    soup = BeautifulSoup(page)
    cat_available = soup.find_all("div", attrs={"id": "list-categories"})
    all_links = cat_available[0].find_all("a")
    jobs = []
    for l in all_links:
        cat_link = "http://www.flipkart.com/" + str(l.get("href"))
        cat_title = str(l.get_text()).strip()
        # one worker process per category
        p = multiprocessing.Process(target=open_page, args=(cat_link, cat_title))
        jobs.append(p)
        p.start()
def main2(num):
    num = str(num)
    link = "http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?p[]=facets.ideal_for%255B%255D%3DWomen&p[]=sort%3Dpopularity&sid=reh%2Cihu%2Cm08&start=" + num + "&ajax=true"
    page = proxy_module.main(link)
    st = page.read()
    # the ajax response comes back HTML-escaped, so unescape it before parsing
    st = st.replace("&lt;", "<")
    st = st.replace("&gt;", ">")
    st = st.replace("&quot;", '"')
    soup = BeautifulSoup(st)
    details = soup.find_all("div", attrs={"class": "pu-details lastUnit"})
    if details:
        threads = []
        for l in details:
            t = threading.Thread(target=collection, args=(l,))
            threads.append(t)
            t.start()
        t.join()  # wait on the last thread started
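# --- Hedged sketch, not part of the original source -----------------------------------
# The threaded main2() above hands each "pu-details lastUnit" block to a collection()
# worker that is not defined in this snippet. Judging from the non-threaded main2()
# further below, it most likely extracts the link, title, discount, final price and
# colours and writes them via in_file(); the column order here is a guess.
def _collection_sketch(l):
    link = "http://www.flipkart.com" + str(l.div.a.get("href"))
    title = str(l.div.a.get("title"))
    discount = l.find("div", attrs={"class": "pu-discount fk-font-11"})
    dis = discount.span.get_text() if discount else "None"
    final_price = l.find("div", attrs={"class": "pu-final"}).span.get_text()
    colours = [c.div.get_text() for c in l.find_all("div", attrs={"class": "fk-hidden cp-sizes"})]
    clr = ','.join(colours) if colours else "None"
    in_file([title, link, dis, final_price, clr])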
def actordaily(db, cursor, links, actor_name, actor_link, movie_name, movie_link, watch, watch_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()
    if soup.find_all("iframe", attrs={"src": re.compile("dailymotion")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    elif soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")
    print
    #print actor_name, actor_link, movie_name, movie_link, watch, watch_link
    #print actor_name, actor_link, movie_name, watch, em_link
    # the insert into the actor table is left disabled inside a string literal
    '''sql = """insert ignore into actor(actor_actoress,actor_actoress_link,movie,movie_link,watch,watch_link)
             values("%s","%s","%s","%s","%s","%s")""" % (actor_name, actor_link, movie_name, movie_link, watch, em_link)'''
def director4(links, director_name, director_link, movie_name, movie_link):
    page = proxy_module.main(movie_link)
    soup = BeautifulSoup(page)
    page.close()
    image_link = image_finder(soup, movie_name, movie_link)
    data = soup.find_all("strong")
    threads = []
    for l in data:
        s = l.get_text().encode("ascii", "ignore")
        if re.search(r"Dailymotion", s):
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                watch = m.get_text().encode("ascii", "ignore")
                watch_link = m.get("href").encode("ascii", "ignore")
                #directordaily(links, director_name, director_link, movie_name, movie_link, watch, watch_link, image_link)
                arg = (links, director_name, director_link, movie_name, movie_link, watch, watch_link, image_link)
                t = threading.Thread(target=directordaily, args=arg)
                threads.append(t)
                t.start()
                logging.debug("Dailymotion")
                if len(threads) > 5:
                    t.join()
                    del threads[:]
        elif re.search(r"Youtube", s):
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                watch = m.get_text().encode("ascii", "ignore")
                watch_link = m.get("href").encode("ascii", "ignore")
                #directoryou(links, director_name, director_link, movie_name, movie_link, watch, watch_link, image_link)
                arg = (links, director_name, director_link, movie_name, movie_link, watch, watch_link, image_link)
                t = threading.Thread(target=directoryou, args=arg)
                threads.append(t)
                t.start()
                logging.debug("Youtube")
                if len(threads) > 5:
                    t.join()
                    del threads[:]
        else:
            pass
def cato2(cat_name, cat_link):
    page = proxy_module.main(cat_link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div", attrs={"class": "results_content"})
    threads = []
    for l in data:
        movie_name = l.a.get_text().encode("ascii", "ignore")
        movie_link = l.a.get("href").encode("ascii", "ignore")
        #cato3(cat_name, cat_link, movie_name, movie_link)
        t = threading.Thread(target=cato3, args=(cat_name, cat_link, movie_name, movie_link))
        threads.append(t)
        t.start()
        logging.debug("cato2")
        if len(threads) > 10:
            t.join()
            del threads[:]
def catodaily(cat_name, cat_link, movie_name, movie_link, watch, watch_link, image_link):
    #page, driver = firebug_proxy.main(watch_link)
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()
    #driver.close()
    if soup.find_all("iframe", attrs={"src": re.compile("dailymotion")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    elif soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")
    collection = [cat_name, cat_link, movie_name, movie_link, "Dailymotion", watch, em_link, image_link]
    logging.debug(tuple(collection))
    in_file(collection)
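# --- Hedged sketch, not part of the original source -----------------------------------
# catodaily(), tvsdaily(), yearyou() and the other *daily/*you functions repeat the same
# iframe/embed lookup. The shared pattern could be factored into one helper like the one
# below (host is "dailymotion" or "youtube"); this is a suggestion, not code from the
# original project.
def _embed_link_sketch(soup, host, watch_link):
    for tag in ("iframe", "embed"):
        found = soup.find_all(tag, attrs={"src": re.compile(host)})
        if found:
            return found[0]["src"].encode("ascii", "ignore")
    return " on this link: " + watch_link.encode("ascii", "ignore")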
def archives3(links, archives_name, archives_link):
    page = proxy_module.main(archives_link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div", attrs={"class": "results_content"})
    threads = []
    for l in data:
        movie_link = l.a.get("href").encode("ascii", "ignore")
        movie_name = l.a.get_text().encode("ascii", "ignore")
        #archives4(links, archives_name, archives_link, movie_name, movie_link)
        arg = (links, archives_name, archives_link, movie_name, movie_link)
        t = threading.Thread(target=archives4, args=arg)
        threads.append(t)
        t.start()
        logging.debug("archives3")
        if len(threads) > 5:
            t.join()
            del threads[:]
def a2z(db, cursor):
    link = "http://www.hindilinks4u.net/hindi-movies-a-to-z"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    data = soup.find_all("div", attrs={"id": "wp_page_numbers"})
    soup = BeautifulSoup(str(data))
    data = soup.find_all("a")
    #print data   ok
    page_list = []
    for l in data:
        if l.get("href") not in page_list:
            page_list.append(l.get("href"))
    #print page_list   ok
    for link in page_list:
        link = link.encode("ascii", "ignore")
        #print type(link),
        #print link
        a2zsecond(db, cursor, link)
def archivesdaily(db, cursor, links, archives_name, archives_link, movie_name, movie_link, watch, watch_link, image_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()
    if soup.find_all("iframe", attrs={"src": re.compile("dailymotion")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    elif soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")
    print
    #print archives_name, archives_link, movie_name, movie_link, watch, watch_link
    #print archives_name, archives_link, movie_name, watch, em_link
    sql = """insert ignore into archives(archives,archives_link,movie,movie_link,watch,watch_link,image_link)
             values("%s","%s","%s","%s","%s","%s","%s")""" % (archives_name, archives_link, movie_name, movie_link, watch, em_link, image_link)
    print sql
    cursor.execute(sql)
    db.commit()
def directoryou(db, cursor, links, director_name, director_link, movie_name, movie_link, watch, watch_link, image_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()
    if soup.find_all("iframe", attrs={"src": re.compile("youtube")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("youtube")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    elif soup.find_all("embed", attrs={"src": re.compile(r"youtube")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"youtube")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")
    print
    print
    #print director_name, director_link, movie_name, movie_link, watch, watch_link
    #print director_name, movie_name, watch, em_link
    sql = """insert ignore into director(diector,director_link,movie,movie_link,watch,watch_link,image_link)
             values("%s","%s","%s","%s","%s","%s","%s")""" % (director_name, director_link, movie_name, movie_link, watch, em_link, image_link)
    print sql
    cursor.execute(sql)
    db.commit()
def tvsdaily(db, cursor, links, show_name, show_link, watch, watch_link, image_link):
    #page = firebug_proxy.main(watch_link)
    #page = page.encode("ascii","ignore")
    #soup = BeautifulSoup(page)
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()
    if soup.find_all("iframe", attrs={"src": re.compile("dailymotion")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    elif soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")
    print
    #print cat_link, show_link, watch_link
    #print show_name, show_link
    #print watch, em_link
    sql = """insert ignore into tvs2(show_name,link_link,watch,watch_link,image_link)
             values("%s","%s","%s","%s","%s")""" % (show_name, show_link, watch, em_link, image_link)
    print sql
    cursor.execute(sql)
    db.commit()
def main2(num):
    num = str(num)
    main_link = "http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?p[]=facets.ideal_for%255B%255D%3DWomen&p[]=sort%3Dpopularity&sid=reh%2Cihu%2Cm08&facetOrder[]=ideal_for&otracker=nmenu_sub_women_0_Handbags"
    link = "http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?p[]=facets.ideal_for%255B%255D%3DWomen&p[]=sort%3Dpopularity&sid=reh%2Cihu%2Cm08&start=" + num + "&ajax=true"
    page = proxy_module.main(link)
    st = page.read()
    # the ajax response comes back HTML-escaped, so unescape it before parsing
    st = st.replace("&lt;", "<")
    st = st.replace("&gt;", ">")
    st = st.replace("&quot;", '"')
    soup = BeautifulSoup(st)
    details = soup.find_all("div", attrs={"class": "pu-details lastUnit"})
    if details:
        for l in details:
            link = str(l.div.a.get("href"))
            link = "http://www.flipkart.com" + link
            title = str(l.div.a.get("title"))
            discount = l.find("div", attrs={"class": "pu-discount fk-font-11"})
            if discount:
                dis = discount.span.get_text()
            else:
                dis = "None"
            final_price = l.find("div", attrs={"class": "pu-final"})
            final_price = final_price.span.get_text()
            colour = l.find_all("div", attrs={"class": "fk-hidden cp-sizes"})
            if colour:
                clrs = [c.div.get_text() for c in colour]
                clr = ','.join(clrs)
            else:
                clr = "None"
            collection = [main_link, title, link, dis, final_price, clr]
            in_file(collection)