Code example #1
def seller_info(seller_link):
    page = proxy_module.main(seller_link)
    soup = BeautifulSoup(page)
    page.close()
    view_item = soup.find_all("td", attrs={"id": "viewItemId"})
    try:
        view_item = str(view_item[0].a.get("href"))
        view_link = view_item
        print view_link
        page2 = proxy_module.main(view_item)
        soup2 = BeautifulSoup(page2)
        page2.close()
        last_item_content = soup2.find_all("h1", attrs={"class": "vi-is1-titleH1"})
        last_item_content = str(last_item_content[0].get_text()).encode('ascii', 'ignore')
        item_condition = soup2.find_all("span", attrs={"class": "vi-is1-condText"})
        item_condition = str(item_condition[0].string).encode('ascii', 'ignore')
        ended = soup2.find_all("span", attrs={"class": "vi-is1-dt"})
        ended = str(ended[0].get_text()).encode('ascii', 'ignore')
        selling_price = soup2.find_all("span", attrs={"id": "v4-27"})
        selling_price = str(selling_price[0].get_text()).encode('ascii', 'ignore')
        shipping = soup2.find_all("span", attrs={"id": "fshippingCost"})
        shipping = str(shipping[0].get_text()).encode('ascii', 'ignore')
    except:
        view_item = "private"

    if view_item.lower() == "private":
        last_item_content = "None as it is private "
        view_link = "No link as it is private "
        item_condition = "Private"
        ended = "Not Known  "
        selling_price = "Not Known  "
        shipping = "Not Known"
    #print last_item_content, view_link, item_condition, ended, selling_price, shipping
    return last_item_content, view_link, item_condition, ended, selling_price, shipping
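Every example below fetches a page with proxy_module.main(url), feeds the result to BeautifulSoup and, in most cases, calls .close() (or .read()) on it, so the function is evidently expected to return a file-like response object. A minimal sketch of that assumed interface, using urllib2 as a stand-in (the real module presumably adds the actual proxy handling, which none of these snippets show):

# Hypothetical stand-in for proxy_module.main: the snippets only rely on it
# returning a file-like object that supports .read() and .close() and that
# BeautifulSoup can parse directly. The real proxy logic is not shown here.
import urllib2

def main(url):
    return urllib2.urlopen(url)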
Code example #2
def seller_info(link):
    page  = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("td", attrs={"id":"viewItemId"})
    try:
        view_link = str(data[0].a.get("href"))
        page  = proxy_module.main(view_link)
        print view_link
        soup = BeautifulSoup(page)
        data = soup.find_all("h1", attrs={"class":"vi-is1-titleH1"})
        last_item_content = str(data[0].get_text())
        data = soup.find_all("span", attrs={"class":"vi-is1-condText"})
        item_condition = str(data[0].string)
        try:
            data = soup.find_all("span", attrs={"class":"vi-is1-dt"})
            ended = str(data[0].get_text())
        except:
            ended ="Not yet ended"
        data = soup.find_all("span", attrs={"class":"vi-is1-prcp"})
        selling_price = str(data[0].get_text())
        data = soup.find_all("span", attrs={"id":"fshippingCost"})
        shipping  = str(data[0].get_text())
    except:
        view_link ="private"
    if view_link.lower() == "private":
        last_item_content = "None as it is private "
        view_link = "No link as it is private "
        item_condition = "Private"
        ended = "Not Known  "
        selling_price = "Not Known  "
        shipping = "Not Known"
    return last_item_content, view_link, item_condition, ended, selling_price, shipping
Code example #3
File: h_a2z.py Project: Jai-Prakash-Singh/h_link
def a2zthird(db,cursor,link,movie_name,movie_link): 

    page = proxy_module.main(movie_link)
    soup = BeautifulSoup(page)
    data = soup.find_all("strong")
    for l in data:
        s = l.get_text().encode("ascii","ignore")
        if re.search(r"Dailymotion",s):
            print "*"*10
            print movie_name,movie_link 
            print "Dailymotion"
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                #print m.get_text(),m.get("href") ok here
                watch = m.get_text().encode("ascii","ignore")
                watch_link = m.get("href").encode("ascii","ignore")
                a2zdaily(db,cursor,link,movie_name,movie_link,watch,watch_link)
            #sys.exit()
        elif re.search(r"Youtube",s):
            print "*"*10
            print movie_name,movie_link 
            print "Youtube"
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                #print m.get_text(),m.get("href") ok here
                watch = m.get_text().encode("ascii","ignore")
                watch_link = m.get("href").encode("ascii","ignore")
                a2zyou(db,cursor,link,movie_name,movie_link,watch,watch_link)
            #sys.exit() 
        else:
            pass
Code example #4
def a2z():
    
    link = "http://www.hindilinks4u.net/hindi-movies-a-to-z"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div",attrs={"id":"wp_page_numbers"})
    soup = BeautifulSoup(str(data))
    data = soup.find_all("a")
    page_list = []
    threads = []
    for l in data:
        if l.get("href") not in page_list:
            page_list.append(l.get("href"))

    for link in page_list:
        link = link.encode("ascii","ignore")
        #a2zsecond(link)
        t = threading.Thread(target=a2zsecond,args=(link,))
        threads.append(t)   
        t.start()
        logging.debug("a2z")
        if len(threads)>10:
            t.join()
            del threads[:]
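In a2z above, once more than ten threads have been started only the most recently created one is joined before the list is cleared, so earlier threads in the batch may still be running when the list is emptied. A hedged variant of the same loop that waits for the whole batch (names are reused from the original; only the join logic differs):

for link in page_list:
    link = link.encode("ascii", "ignore")
    t = threading.Thread(target=a2zsecond, args=(link,))
    threads.append(t)
    t.start()
    logging.debug("a2z")
    if len(threads) > 10:
        for t in threads:      # wait for every thread in the batch, not just the last one
            t.join()
        del threads[:]
for t in threads:              # drain any threads left over after the loop
    t.join()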
Code example #5
def year():
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div",attrs={"class":"tagcloud"})
    year2(str(data))  
Code example #6
File: archieves.py Project: Jai-Prakash-Singh/h_link
def archives4(db,cursor,links,archives_name, archives_link,movie_name,movie_link):
    page = proxy_module.main(movie_link)
    soup = BeautifulSoup(page)
    page.close()
    #print movie_name,movie_link
    image_link = image_finder(soup,movie_name,movie_link)
    data = soup.find_all("strong")
    for l in data:
        s = l.get_text().encode("ascii","ignore")
        if re.search(r"Dailymotion",s):
            print 
            print "Dailymotion"
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                #print m.get_text(),m.get("href") ok here
                watch = m.get_text().encode("ascii","ignore")
                watch_link = m.get("href").encode("ascii","ignore")
                archivesdaily(db,cursor,links,archives_name, archives_link,movie_name,movie_link,watch,watch_link,image_link)
            #sys.exit()
        elif re.search(r"Youtube",s):
            print 
            print "You tube"
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                #print m.get_text(),m.get("href") ok here
                watch = m.get_text().encode("ascii","ignore")
                watch_link = m.get("href").encode("ascii","ignore")
                archivesyou(db,cursor,links,archives_name, archives_link,movie_name,movie_link,watch,watch_link,image_link)
            #sys.exit() 
        else:
            pass
Code example #7
File: h_a2z.py Project: Jai-Prakash-Singh/h_link
def a2zsecond(db,cursor,link):
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    data = soup.find_all("div",attrs={"class":"results_content"})
    for l in data:
        movie_name = l.get_text().encode("ascii","ignore")
        movie_link = l.a.get("href").encode("ascii","ignore")
        #print  movie_name, movie_link
        a2zthird(db,cursor,link,movie_name,str(movie_link))
Code example #8
File: archieves.py Project: Jai-Prakash-Singh/h_link
def archives3(db,cursor,links,archives_name, archives_link):
    page = proxy_module.main(archives_link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div",attrs={"class":"results_content"})
    for l in data:
        movie_link = l.a.get("href").encode("ascii","ignore")
        movie_name = l.a.get_text().encode("ascii","ignore")
        archives4(db,cursor,links,archives_name, archives_link,movie_name,movie_link)
Code example #9
def cato2(db, cursor,cat_name,cat_link):
    page = proxy_module.main(cat_link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div",attrs={"class":"results_content"})
    for l in data:
        movie_name = l.a.get_text().encode("ascii","ignore")
        movie_link = l.a.get("href").encode("ascii","ignore")
        #print movie_name, movie_link  # ok till here
        cato3(db, cursor,cat_name,cat_link,movie_name,movie_link)
Code example #10
File: archieves.py Project: Jai-Prakash-Singh/h_link
def archives(db,cursor):
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("h2",attrs={"class":"widgettitle"})
    for l in data:
        if str(l.get_text()).strip() =="Archives":
            links = l.find_next("ul")
            
    archives2(db,cursor,str(links))  
Code example #11
File: byyear.py Project: Jai-Prakash-Singh/h_link
def year(db,cursor):
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div",attrs={"class":"tagcloud"})
    #for l in data:
    #    if str(l.get_text()).strip() =="Movies By years":
    #        links = l.find_next("ul")
            
    year2(db,cursor,str(data))  
Code example #12
def latest_movie():
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("h2",attrs={"class":"widgettitle"})
    for l in data:
        if str(l.get_text()).strip() =="Latest Movies":
            links = l.find_next("ul")
            
    latest_movie2(str(links))  
Code example #13
def tvs():
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("h2",attrs={"class":"widgettitle"})
    for l in data:
        if str(l.get_text()).strip() =="TV Shows and Awards":
            links = l.find_next("ul")
            
    tvs2(str(links))  
Code example #14
File: try_4.py Project: Jai-Prakash-Singh/parse_flip
def main():
   
    collection =["main_link", "title","sub_link", "actual_price","final_price","colours"]
    in_file(collection)
    link ="http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?p[]=facets.ideal_for%255B%255D%3DWomen&p[]=sort%3Dpopularity&sid=reh%2Cihu%2Cm08&facetOrder[]=ideal_for&otracker=nmenu_sub_women_0_Handbags"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    data = soup.find_all("span",attrs={"class":"items"})
    number  = data[0].get_text()
    number = number.strip()
    number = int(number)
    for  num in range(1,number,15):
        main2(num)
Code example #15
File: h_a2z.py Project: Jai-Prakash-Singh/h_link
def a2zyou(db,cursor,link,movie_name,movie_link,watch,watch_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    # attrs must be a dict mapping the attribute name to the pattern
    # (not a set) for the src filter to take effect
    if soup.find_all("iframe", attrs={"src": re.compile("youtube")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("youtube")})
        em_link = em[0]["src"].encode("ascii", "ignore")
        print movie_name, watch, em_link
    elif soup.find_all("embed", attrs={"src": re.compile(r"youtube")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"youtube")})
        em_link = em[0]["src"].encode("ascii", "ignore")
        print movie_name, watch, em_link
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")
        print movie_name, watch, em_link
Code example #16
def tvsdaily(links,show_name, show_link,watch,watch_link,image_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()
    if soup.find_all("iframe", attrs={"src": re.compile("dailymotion")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    elif soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")
    collection = [show_name, show_link, "Dailymotion", watch, em_link, image_link]
    logging.debug(tuple(collection))
    in_file(collection)
Code example #17
def yearyou(links,year_name, year_link,movie_name,movie_link,watch,watch_link,image_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close() 
    if soup.find_all("iframe",attrs={"src":re.compile("youtube")}):
        em = soup.find_all("iframe",attrs={"src":re.compile("youtube")})
	em_link = em[0]["src"].encode("ascii","ignore")
    elif soup.find_all("embed",attrs={"src":re.compile(r"youtube")}):
	em = soup.find_all("embed",attrs={"src":re.compile(r"youtube")})
	em_link = em[0]["src"].encode("ascii","ignore")	
    else:
	em_link = " on this link: " +watch_link.encode("ascii","ignore")
    collection = [year_name,year_link,movie_name,movie_link,"Youtube",watch,em_link,image_link]
    logging.debug(tuple(collection))
    in_file(collection)
Code example #18
def cato(db, cursor):
    link = "http://www.hindilinks4u.net/"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("h2",attrs={"class":"widgettitle"})
    for l in data:
        if str(l.get_text()).strip() =="Categories":
            cat_list = l.find_next("ul")

    soup = BeautifulSoup(str(cat_list))
    data = soup.find_all("a")
    for l in data:
        cat_name= l.get_text().encode("ascii","ignore")
        cat_link= l.get("href").encode("ascii","ignore")
        cato2(db, cursor,cat_name,cat_link)
Code example #19
def jwellery():

    main_link = "http://www.flipkart.com/jewellery"

    page = proxy_module.main(main_link)
    soup = BeautifulSoup(page)
    cat_available = soup.find_all("div", attrs={"id":"list-categories"})
    all_links = cat_available[0].find_all("a")

    jobs = []
    for l in all_links:
        cat_link = "http://www.flipkart.com/"+ str(l.get("href"))
        cat_title = str(l.get_text()).strip()
        p = multiprocessing.Process(target=open_page, args = (cat_link, cat_title))
        jobs.append(p)
        p.start()
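jwellery starts one worker process per category link but never waits for them, so the parent can finish while the workers are still scraping. A hedged addition for the end of the function, reusing the jobs list built in the loop above:

    # wait for every category worker before returning
    for p in jobs:
        p.join()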
Code example #20
def main2(num):
    num = str(num)
    link = "http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?p[]=facets.ideal_for%255B%255D%3DWomen&p[]=sort%3Dpopularity&sid=reh%2Cihu%2Cm08&start="+num+"&ajax=true"
    page = proxy_module.main(link)
    st = page.read()
    st = st.replace("&lt;","<")
    st = st.replace("&gt;",">")
    st = st.replace("&quot;",'"')
    soup = BeautifulSoup(st)
    details = soup.find_all("div",attrs={"class":"pu-details lastUnit"})  
    if details:
        threads = []
        for l in details:  
            t = threading.Thread(target=collection,args=(l,))
            threads.append(t)
            t.start() 
            t.join()
Code example #21
File: actor.py Project: Jai-Prakash-Singh/h_link
def actordaily(db,cursor,links,actor_name, actor_link,movie_name,movie_link,watch,watch_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()

    if soup.find_all("iframe", attrs={"src": re.compile("dailymotion")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    elif soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")
    print
    #print actor_name, actor_link,movie_name,movie_link,watch,watch_link
    #print actor_name,actor_link,movie_name,watch,em_link
    #sql = """insert ignore into actor(actor_actoress,actor_actoress_link,movie,movie_link,watch,watch_link) values("%s","%s","%s","%s","%s","%s")"""%(actor_name,actor_link,movie_name,movie_link,watch,em_link)
Code example #22
def director4(links,director_name, director_link,movie_name,movie_link):
    page = proxy_module.main(movie_link)
    soup = BeautifulSoup(page)
    page.close()
    image_link=image_finder(soup,movie_name,movie_link)
    data = soup.find_all("strong")
    threads = []
    for l in data:
        s = l.get_text().encode("ascii","ignore")
        if re.search(r"Dailymotion",s):
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                watch = m.get_text().encode("ascii","ignore")
                watch_link = m.get("href").encode("ascii","ignore")
                #directordaily(links,director_name, director_link,movie_name,movie_link,watch,watch_link,image_link)
                arg =(links,director_name, director_link,movie_name,movie_link,watch,watch_link,image_link)
                t = threading.Thread(target=directordaily,args=arg)
                threads.append(t)
                t.start()
                logging.debug("Dailymotion")
                if len(threads)>5:
                    t.join()
                    del threads[:]


        elif re.search(r"Youtube",s):
            para = l.find_next("p")
            soup2 = BeautifulSoup(str(para))
            data2 = soup2.find_all("a")
            for m in data2:
                watch = m.get_text().encode("ascii","ignore")
                watch_link = m.get("href").encode("ascii","ignore")
                #directoryou(links,director_name, director_link,movie_name,movie_link,watch,watch_link,image_link)
                arg =(links,director_name, director_link,movie_name,movie_link,watch,watch_link,image_link)
                t = threading.Thread(target=directoryou,args=arg)
                threads.append(t)
                t.start()
                logging.debug("Youtube")
                if len(threads)>5:
                    t.join()
                    del threads[:]

        else:
            pass
Code example #23
def cato2(cat_name,cat_link):
    page = proxy_module.main(cat_link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div",attrs={"class":"results_content"})
    threads = []
    for l in data:
        movie_name = l.a.get_text().encode("ascii","ignore")
        movie_link = l.a.get("href").encode("ascii","ignore")
        #cato3(cat_name,cat_link,movie_name,movie_link)
        t = threading.Thread(target=cato3,args=(cat_name,cat_link,movie_name,movie_link))
        threads.append(t)
        t.start()
        logging.debug("cato2")
        if len(threads)>10:
            t.join()
            del threads[:]
Code example #24
def catodaily(cat_name,cat_link,movie_name,movie_link,watch,watch_link,image_link):
    #page,driver= firebug_proxy.main(watch_link)
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()
    #driver.close()
    if soup.find_all("iframe", attrs={"src": re.compile("dailymotion")}):
        em = soup.find_all("iframe", attrs={"src": re.compile("dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    elif soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")}):
        em = soup.find_all("embed", attrs={"src": re.compile(r"dailymotion")})
        em_link = em[0]["src"].encode("ascii", "ignore")
    else:
        em_link = " on this link: " + watch_link.encode("ascii", "ignore")

    collection = [cat_name, cat_link, movie_name, movie_link, "Dailymotion", watch, em_link, image_link]
    logging.debug(tuple(collection))
    in_file(collection)
Code example #25
def archives3(links,archives_name, archives_link):
    page = proxy_module.main(archives_link)
    soup = BeautifulSoup(page)
    page.close()
    data = soup.find_all("div",attrs={"class":"results_content"})
    threads = []
    for l in data:
        movie_link = l.a.get("href").encode("ascii","ignore")
        movie_name = l.a.get_text().encode("ascii","ignore")
        #archives4(links,archives_name, archives_link,movie_name,movie_link)
        arg =(links,archives_name, archives_link,movie_name,movie_link)
        t = threading.Thread(target=archives4,args=arg)
        threads.append(t)
        t.start()
        logging.debug("youtube")
        if len(threads) > 5:
            t.join()
            del threads[:]
Code example #26
File: h_a2z.py Project: Jai-Prakash-Singh/h_link
def a2z(db, cursor):
    
    link = "http://www.hindilinks4u.net/hindi-movies-a-to-z"
    page = proxy_module.main(link)
    soup = BeautifulSoup(page)
    data = soup.find_all("div",attrs={"id":"wp_page_numbers"})
    soup = BeautifulSoup(str(data))
    data = soup.find_all("a")
    # print data ok 
    page_list = []
    for l in data:
        if l.get("href") not in page_list:
            page_list.append(l.get("href"))
    #print page_list ok 
    for link in page_list:
        link = link.encode("ascii","ignore")
        #print type(link),
        #print link
        a2zsecond(db,cursor,link)
Code example #27
File: archieves.py Project: Jai-Prakash-Singh/h_link
def archivesdaily(db,cursor,links,archives_name, archives_link,movie_name,movie_link,watch,watch_link,image_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()
    if soup.find_all("iframe",attrs={"src":re.compile("dailymotion")}):
        em = soup.find_all("iframe",attrs={"src":re.compile("dailymotion")})
	em_link = em[0]["src"].encode("ascii","ignore")	
    elif soup.find_all("embed",attrs={"src":re.compile(r"dailymotion")}):
	em = soup.find_all("embed",attrs={"src":re.compile(r"dailymotion")})
	em_link = em[0]["src"].encode("ascii","ignore")
    else:
	em_link = " on this link: "+watch_link.encode("ascii","ignore")
    print 
    #print archives_name, archives_link,movie_name,movie_link,watch,watch_link
    #print archives_name,archives_link,movie_name,watch,em_link
    sql = """insert ignore into archives(archives,archives_link,movie,movie_link,watch,watch_link,image_link) values("%s","%s","%s","%s","%s","%s","%s")"""%(archives_name,archives_link,movie_name,movie_link,watch,em_link,image_link)
    print sql
    cursor.execute(sql)
    db.commit()
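The insert above builds its VALUES clause with % string formatting, so any quote character in a title will break the statement (and leaves it open to injection). A hedged sketch of the same insert using driver-side placeholders, assuming cursor comes from a MySQLdb/PyMySQL-style connection (which the snippet does not confirm):

    sql = ("insert ignore into archives"
           "(archives,archives_link,movie,movie_link,watch,watch_link,image_link) "
           "values (%s,%s,%s,%s,%s,%s,%s)")
    # let the driver quote and escape each value instead of interpolating by hand
    cursor.execute(sql, (archives_name, archives_link, movie_name,
                         movie_link, watch, em_link, image_link))
    db.commit()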
Code example #28
File: director.py Project: Jai-Prakash-Singh/h_link
def directoryou(db,cursor,links,director_name, director_link,movie_name,movie_link,watch,watch_link,image_link):
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close() 
    if soup.find_all("iframe",attrs={"src":re.compile("youtube")}):
        em = soup.find_all("iframe",attrs={"src":re.compile("youtube")})
	em_link = em[0]["src"].encode("ascii","ignore")
    elif soup.find_all("embed",attrs={"src":re.compile(r"youtube")}):
	em = soup.find_all("embed",attrs={"src":re.compile(r"youtube")})
	em_link = em[0]["src"].encode("ascii","ignore")	
    else:
	em_link = " on this link: " +watch_link.encode("ascii","ignore")
    print 
    print 
    #print director_name, director_link,movie_name,movie_link,watch,watch_link
    #print director_name,movie_name,watch,em_link
    sql = """insert ignore into director(diector,director_link,movie,movie_link,watch,watch_link,image_link) values("%s","%s","%s","%s","%s","%s","%s")"""%(director_name,director_link,movie_name,movie_link,watch,em_link,image_link)
    print sql
    cursor.execute(sql)
    db.commit()
Code example #29
File: h_tv_show.py Project: Jai-Prakash-Singh/h_link
def tvsdaily(db,cursor,links,show_name, show_link,watch,watch_link,image_link):
    #page = firebug_proxy.main(watch_link)
    #page = page.encode("ascii","ignore")
    #soup = BeautifulSoup(page)
    page = proxy_module.main(watch_link)
    soup = BeautifulSoup(page)
    page.close()
    if soup.find_all("iframe",attrs={"src":re.compile("dailymotion")}):
        em = soup.find_all("iframe",attrs={"src":re.compile("dailymotion")})
	em_link = em[0]["src"].encode("ascii","ignore")	
    elif soup.find_all("embed",attrs={"src":re.compile(r"dailymotion")}):
	em = soup.find_all("embed",attrs={"src":re.compile(r"dailymotion")})
	em_link = em[0]["src"].encode("ascii","ignore")
    else:
	em_link = " on this link: " +watch_link.encode("ascii","ignore")
    print 
    #print cat_link,show_link,watch_link
    #print show_name,show_link
    #print watch,em_link
    sql = """insert ignore into tvs2(show_name,link_link,watch,watch_link,image_link) values("%s","%s","%s","%s","%s")"""%(show_name,show_link,watch,em_link,image_link)
    print sql
    cursor.execute(sql)
    db.commit()
Code example #30
File: try_4.py Project: Jai-Prakash-Singh/parse_flip
def main2(num):
    num = str(num)
 
    main_link ="http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?p[]=facets.ideal_for%255B%255D%3DWomen&p[]=sort%3Dpopularity&sid=reh%2Cihu%2Cm08&facetOrder[]=ideal_for&otracker=nmenu_sub_women_0_Handbags"
   
    link = "http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?p[]=facets.ideal_for%255B%255D%3DWomen&p[]=sort%3Dpopularity&sid=reh%2Cihu%2Cm08&start="+num+"&ajax=true"
    page = proxy_module.main(link)
    st = page.read()
    st = st.replace("&lt;","<")
    st = st.replace("&gt;",">")
    st = st.replace("&quot;",'"')
    soup = BeautifulSoup(st)
    details = soup.find_all("div",attrs={"class":"pu-details lastUnit"})  
    if details:
        for l in details:  
            link = str(l.div.a.get("href"))
            link = "http://www.flipkart.com"+link
            title = str(l.div.a.get("title"))          
            discount = l.find("div", attrs={"class":"pu-discount fk-font-11"})
            if discount:
                dis = discount.span.get_text()
            else:
                dis = "None"              
            final_price = l.find("div",attrs={"class":"pu-final"})
            final_price = final_price.span.get_text()
            colour = l.find_all("div",attrs={"class":"fk-hidden cp-sizes"})
            clrs = []
            if colour:
                # use a separate loop variable so the outer item `l` is not clobbered
                for c in colour:
                    clr = c.div.get_text()
                    clrs.append(clr)
                clr = ','.join(clrs)
            else:
                clr = "None"
            # write one row per item instead of only the last item parsed
            collection = [main_link, title, link, dis, final_price, clr]
            in_file(collection)
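The three replace() calls above undo the HTML-escaping in Flipkart's AJAX response before parsing. A hedged alternative that also covers any other named entities, using the Python 2 standard library (variable names mirror the snippet; that the response contains only named entities is an assumption, not something the original confirms):

from HTMLParser import HTMLParser

st = page.read()
st = HTMLParser().unescape(st)   # handles &lt;, &gt;, &quot; and the remaining named entities
soup = BeautifulSoup(st)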