def main(line):
    line = ast.literal_eval(line)

    page = req_proxy.main(line[8])

    f = open("myfile.txt", "w+")
    print >>f, page
    f.close()

    #driver = phan_proxy.main(line[8])
    #page = driver.page_source

    #driver.delete_all_cookies()
    #driver.quit()
    
    #tree = html.fromstring(page)
    #meta_disc = tree.xpath("/html/head/meta[3]/text()")

    soup = BeautifulSoup(page, "html.parser")
    meta_disc = soup.find("meta", attrs={"name":"description"}).get("content")

    title = soup.find("title").get_text()
    desc = soup.find("section", attrs={"id":"product-detail"})
    dte = time.strftime("%d:%m:%Y")
    status = " "
    spec = " "
    vender = "zovi.com"
    brand = "zovi"

    print map(my_strip,  [line[7], line[11], line[0], line[5], line[2], 
                         line[3], brand, line[9], line[5], line[4], 
                         line[1], line[8], vender, title, meta_disc, line[10], 
                         desc, spec, dte, status])
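
# The snippets in this file assume a few shared helpers that are defined
# elsewhere in the project: req_proxy.main(url) is expected to fetch a page
# (typically through a rotating proxy) and return its HTML as a string, or an
# empty value on failure, and my_strip / mystrip is applied to every field
# before it is written out. A minimal, hypothetical sketch of the cleaner --
# an assumption added for readability, not the project's actual implementation:
def my_strip(value):
    # flatten any scraped value (string, number, bs4 tag) into a single-line,
    # comma-free string so it can be joined into CSV rows safely
    text = value if isinstance(value, basestring) else str(value)
    return " ".join(text.replace(",", " ").replace("\n", " ").split())

mystrip = my_strip   # a few snippets use the underscore-less spelling
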
def main2(line, all_brand_link, f3):
    menu = line[0]
    ctlink = line[1]
    cttitle = line[2]
    sctlink = line[3]
    scttitle = line[4]

    page = req_proxy.main(all_brand_link)
    soup = BeautifulSoup(page)

    tag_brand_uls = soup.find_all("ul", attrs={"class": "column"})

    for ul_brand_link in tag_brand_uls:
        all_brand_link = ul_brand_link.find_all("a")
        for bl_bt in all_brand_link:
            bl = "%s%s" % ("http://www.amazon.in", bl_bt.get("href"))
            bt = str(bl_bt.span.get_text()).strip()

            filedata = [menu, ctlink, cttitle, sctlink, scttitle, bl, bt]
            filedata2 = map(mystrip, filedata)
            logging.debug(filedata2)

            f4 = open(f3, "a+")
            print >> f4, filedata2
            f4.close()
    def get_product(self, line):

        """ given a  ct sct scl  subsubcate subcatelink br and brl find product"""
        line = ast.literal_eval(line)
        link = "http://www.junglee.com%s" %(line[-1])

        line[-1] = link
        
        page = req_proxy.main(link)
        soup = BeautifulSoup(page, "html.parser")

        pro_div_list = soup.find_all("div", attrs={"id":re.compile(r"^result_[\d]{1,}$")})

        filobj_write = self.pr_pl_filobj.write
        my_strip = self.my_strip

        # write one line per product result div: the input line plus the product href
        for pro_div in pro_div_list:
            product_href = pro_div.find("img", attrs={"alt":"Product Details"}).find_parent("a").get("href")
            filobj_write(str(map(my_strip, line + [product_href])) + "\n")
def main():
    dte = "dir%s" %(time.strftime("%d%m%Y"))
   
    try:
        os.makedirs(dte)
    except:
        pass


    link = "http://www.myntra.com/"
    page  = req_proxy.main(link) 
    #page = phan_proxy.main(link)
   
    
    if not page:
        # retry on an empty response and return the retried result instead of
        # falling through with page = None
        return main()
    
    soup = BeautifulSoup(page)

    tag_mklevel2 = soup.find_all("a", attrs={"class":"mk-level2 "})
    
    #print len(filter(None,  map(menucollection, tag_mklevel2)))    
    f = open("to_extract.txt", "w+")
    f2 = open("extracted.txt", "a+")

    print >>f, dte
    print >>f2, dte

    f.close()
    f2.close()

    return   filter(None,  map(menucollection, tag_mklevel2))
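
# menucollection() is defined elsewhere in the project; main() above only relies
# on it returning something truthy per useful menu anchor and None otherwise.
# A hypothetical sketch of such a collector (names and fields are assumptions,
# not the project's actual code):
def menucollection_sketch(tag_a):
    # turn one <a class="mk-level2 "> tag into [title, absolute link], or None
    href = tag_a.get("href")
    if not href:
        return None
    title = str(tag_a.get_text()).strip()
    if href.startswith("/"):
        href = "http://www.myntra.com%s" % (href)
    return [title, href]
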
def main2(line, catlink, filename2):
    menu = line[0].strip()
    submnlink = line[1].strip()
    submntitle = line[2].strip()
    catlink = line[-2].strip()
    cattitle = line[-1].strip()

    page = req_proxy.main(catlink)

    soup = BeautifulSoup(page)

    tag_brand = soup.find("ul", attrs={"id": "brand"})

    tag_brand_a = []

    if tag_brand is not None:
        tag_brand_a = tag_brand.find_all("a")

    f = open(filename2, "a+")

    for al in tag_brand_a:
        brandlink = "%s%s" % ("http://www.flipkart.com", str(al.get("href")).strip())
        brandtitle = str(al.get_text()).replace("\n", " ").replace("\t", " ").replace("\r", " ").strip()
        print >> f, [menu, submnlink, submntitle, catlink, cattitle, brandlink, brandtitle]
        logging.debug([menu, submnlink, submntitle, catlink, cattitle, brandlink, brandtitle])

    f.close()
def main():

    directory = "diramazon%s" %(time.strftime("%d%m%y"))
    
    try:
        os.makedirs(directory)

    except:
        pass

    f = open("to_extract_amazon.txt", "w+")
    f.write(directory)
    f.close()

    f = open("extracted_amazon.txt", "a+")
    f.write(directory)
    f.close()

    filename = "%s/%s" %(directory, "f_mn_ctl_cat.txt")

    f = open(filename, "a+")

    link = "http://www.amazon.in/gp/site-directory/ref=sa_menu_top_fullstore"
    page = req_proxy.main(link)

    soup2 = BeautifulSoup(page)

    catlist = ["Beauty & Health", "Watches & Fashion Jewellery", "Handbags & Luggage"]

    map(functools.partial(main2, soup=soup2, f = f), catlist)

    f.close()
def main():
    link = "http://www.homeshop18.com/all-stores.html"
    page = req_proxy.main(link)

    soup = BeautifulSoup(page)

    tag_border = soup.find_all("div", attrs={"class":"border-box2 mar-top"})

    patter_firstpage = []

    for brend in tag_border:
        brend_title = brend.find("div", attrs={"class":"brend-title clearfix"}).get_text()
        cat = str(brend_title).strip()

        if (cat == "Books") or (cat == "Jewellery"):
            pass

        else:
            tag_img_holder = brend.find_all("div", attrs={"class":"img-holder"})

            for sub_cat in tag_img_holder:
                sub_cat_link = str(sub_cat.find("a").get("href")).strip()

                parsed = urlparse(sub_cat_link)
                sub_cat_title = filter(None, str(parsed.path).split("/"))

                patter_firstpage.append([cat, sub_cat_link, sub_cat_title[0].strip()])

    return patter_firstpage
def main2(mn_sbml_sbml, filename):
    menu = mn_sbml_sbml[0]
    submenulink = mn_sbml_sbml[1]
    submenutitle =  mn_sbml_sbml[2]
 
    page = req_proxy.main(submenulink)
    
    soup = BeautifulSoup(page)
    
    tag_cat = soup.find("div", attrs={"class":"nav-section first-section"})

    tag_cat_link = []

    if tag_cat is not None:
        tag_cat_link = tag_cat.find_all("a")

    f = open(filename, "a+")

    for l in tag_cat_link:
        cattitle = str(l.get("title")).strip()
        catlink = "%s%s" % ("http://www.flipkart.com", str(l.get("href")).strip())
        print >>f, [menu, submenulink, submenutitle, catlink, cattitle]
        logging.debug([menu, submenulink, submenutitle, catlink, cattitle])

    f.close()
def part_second(line, filename2):

    f = open(filename2, "a+")

    line2 = line.strip().split(",")
    menulink  = line2[0].strip()
    menutitle = line2[1].strip()
    catlink = line2[2].strip()
    cattitle = line2[3].replace("\n","").strip()
    subcatlink = line2[-2].strip()
    subcattitle = line2[-1].strip()

    page = req_proxy.main(subcatlink)
    soup = BeautifulSoup(page, "html.parser")

    tag_brand = soup.find("div", text=re.compile("Brand"))
    blbtbc_list = tag_brand.parent.parent.find_all("li", attrs={"class":"srch_ctgry_item"})

    for blbtbc in blbtbc_list:
        brandlink = "%s%s" %("http://www.homeshop18.com", str(blbtbc.a.get("href")).strip())
        brandtitle = str(blbtbc.a.get( "title")).replace("\n", "").strip()

        print >>f, ','.join([menulink, menutitle, catlink, cattitle, subcatlink, subcattitle, brandlink, brandtitle])
        print menulink, menutitle, catlink, cattitle, subcatlink, subcattitle, brandlink, brandtitle
	print "*"*145

    f.close()
def main2(line, all_brand_link, f3):
    menu = line[0]
    ctlink = line[1]
    cttitle = line[2]
    sctlink = line[3]
    scttitle = line[4]

    page = req_proxy.main(all_brand_link)
    soup = BeautifulSoup(page)

    tag_brand_uls = soup.find_all("ul", attrs={"class":"column"})

    for ul_brand_link in tag_brand_uls:
        all_brand_link = ul_brand_link.find_all("a")
        for bl_bt in all_brand_link:
	    bl = "%s%s" %("http://www.amazon.in", bl_bt.get("href"))
	    bt = str(bl_bt.span.get_text()).strip()

            filedata = [menu, ctlink, cttitle, sctlink, scttitle, bl, bt]
            filedata2 = map(mystrip, filedata)
	    logging.debug(filedata2)
            #print >>f3, filedata2
            f4 = open(f3, "a+")
            print >>f4, filedata2
            f4.close()
    def lnk_tl_vl_vl_prl_fun2(self, line, counter, cl = None):
        if cl is None:
            cl = line[-2]

        else:
            pass

        page = req_proxy.main(cl)
        soup =  BeautifulSoup(page)

	pro_div = soup.find("ul", attrs={"id":"listing-product-list"})
	pro_link = pro_div.find_all("li", attrs={"class":"listing-unit"})
       
        for pl in pro_link:
            pl = pl.find("a", attrs={"class":"OneLinkNoTx"})
            pl = pl.get("href")
            print line + [pl]
            self.f2.write(str(line + [pl]) + "\n")

        counter += len(pro_link)
        
        if counter > 1000:
            return  0

        next_url = pro_div.find("li", attrs={"class":"next"})

        try:
            next_url = "http://download.cnet.com%s" %(next_url.a.get("href"))
            logging.debug(next_url)
            self.lnk_tl_vl_vl_prl_fun2(line, counter,  next_url)
        except:
            pass
def main2(ml_mt_sub, filename):
    f = open(filename, "a+")

    menulink = ml_mt_sub[0]
    menutitle = ml_mt_sub[1]
    
    page = req_proxy.main(menulink)

    soup = BeautifulSoup(page, "html.parser")

    tag_box = soup.find_all("div", attrs={"class":"head clearfix"})
    
    for al in tag_box:
        cato = al.find("div")

        catolink = "%s%s" %("http://www.homeshop18.com", str(cato.a.get("href")).strip())
        catotitle = cato.a.get_text()

        sub_cato = al.find_next_sibling("div")
        
        if sub_cato:
            sub_cato2 = sub_cato.find_all("a")

            for al in sub_cato2:
                sub_catolink = "%s%s" % ("http://www.homeshop18.com", str(al.get("href")).strip())
                sub_catotext = al.get("title")

                print >>f, ','.join([menulink, menutitle, catolink, catotitle, sub_catolink, sub_catotext])

        else:
            print >>f, ','.join([menulink, menutitle, catolink, catotitle, catolink, catotitle])

    f.close()
def main5(i, q):
    for line, f in iter(q.get, None):
    
        link = line[-2].strip()

        page = req_proxy.main(link)
        soup = BeautifulSoup(page)

        tag_brand  =  soup.find("div", attrs={"id":"facet_brand"})
        
        try:
            tag_a =  tag_brand.find_all("a")

        except:
            tag_a = []
        
        for l in  tag_a:
            try:
                brandlink = str(l.get("href")).strip()
                brandname = str(l.get_text()).strip()
                print >>f, "%s,%s,%s" % (','.join(line), brandlink, brandname)

            except:
                pass

        time.sleep(i + 2)
        q.task_done()

    q.task_done()
def main3(i, q):
    for link in iter(q.get, None):
        page = req_proxy.main(link)
        soup = BeautifulSoup(page)

        tag_cat = soup.find_all("div", attrs={"class":"search-by-cat mt10 mb10 pl14 "})
    
        if tag_cat:
            cat_tag_a = tag_cat[0].find_all("a")

        else:
            cat_tag_a = []

        for cl in cat_tag_a:
            try:
                link_cl_ctxt_ccount.append([link, str(cl.get("href")).strip(), str(cl.get_text()).strip()])
                logging.debug((link, str(cl.get("href")).strip(), str(cl.get_text()).strip()))

            except:
                pass

        time.sleep(i + 2)
        q.task_done()

    q.task_done()
def part_second(line, filename2):
    f = open(filename2, "a+")

    line2 = line.strip().split(",")
    menulink = line2[0].strip()
    menutitle = line2[1].strip()
    catlink = line2[2].strip()
    cattitle = line2[3].replace("\n", "").strip()
    subcatlink = line2[-2].strip()
    subcattitle = line2[-1].strip()

    page = req_proxy.main(subcatlink)
    soup = BeautifulSoup(page)

    tag_brand = soup.find("div", text=re.compile("Brand"))
    blbtbc_list = tag_brand.parent.parent.find_all(
        "li", attrs={"class": "srch_ctgry_item"})

    for blbtbc in blbtbc_list:
        brandlink = "%s%s" % ("http://www.homeshop18.com",
                              str(blbtbc.a.get("href")).strip())
        brandtitle = str(blbtbc.a.get("title")).replace("\n", "").strip()

        print >> f, ','.join([
            menulink, menutitle, catlink, cattitle, subcatlink, subcattitle,
            brandlink, brandtitle
        ])
        print menulink, menutitle, catlink, cattitle, subcatlink, subcattitle, brandlink, brandtitle
        print "*" * 145

    f.close()
def main(link):
    page = req_proxy.main(link)
    soup = BeautifulSoup(page, "html.parser")
    
    all_page_list = soup.find_all("li", attrs={"class":"MyAccountPag"})

    threads = []

    t = threading.Thread(target=main2, args=(link,))
    threads.append(t)
    t.start()

    for page_link_tag in all_page_list:
        page_link = "http://www.yebhi.com%s" % (str(page_link_tag.a.get("href")))
        t = threading.Thread(target=main2, args=(page_link,))
        threads.append(t)
        t.start()

    main_thread = threading.currentThread()

    for t in threading.enumerate():
        if t is main_thread:
            continue
        logging.debug('joining %s', t.getName())
        t.join()
def main(directory, mainlink):
    filename = "%s/complete_link_collection.txt" %(directory)

    f = open(filename, "a+")

    page = req_proxy.main(mainlink)

    soup = BeautifulSoup(page, "html.parser")
    cat_collection_box = soup.find("div", attrs={"class":"brw_bdr"})
    
    link_list = cat_collection_box.find_all("a")

    for link in link_list:
        link  = link.get("href")

        parsed = urlparse(link)

        if len(parsed.netloc) == 0:
            link = "http://www.snapdeal.com%s" %(link)
            
        f.write(str(link) + "\n")
        #print link

    f.close()
    
    return filename
def main(line, directory):
    filename = "%s/f_cl_tr_ct_st_scl_bt_bl.txt" %(directory)
    
    f = open(filename, "a+")
   
    page = req_proxy.main(line[-1])

    soup = BeautifulSoup(page, "html.parser")
    brand_tag_list = soup.find_all("span", attrs={"class":"forspan"})

    
    for brand_tag in brand_tag_list:
        if str(brand_tag.get_text()).strip() == "Brands":
            brand_box = brand_tag.find_parent("div", attrs={"class":"divli"})

    brand_list = brand_box.find_all("a", attrs={"class":"RefineByLink"})

    for brand_tag in brand_list:
        brand = str(brand_tag.get("relmselect"))
        brand_link = "http://www.yebhi.com%s" % (str(brand_tag.get("href")))
        #f.write(str([catelink, target, cate, sub_cat, sub_cat_link]) + "\n")
        f.write(str([line[0], line[1], line[2], line[3], line[4], brand, brand_link]) + "\n")
        
        logging.debug([line[0], line[1], line[2], line[3], line[4], brand, brand_link])

    f.close()
def main(line, directory):
    direc = "%s/%s/%s/%s" %(directory, line[2], line[3], line[5])

    try:
        os.makedirs(direc)
    except:
        pass

    filename = "%s/%s.csv" %(direc, line[5])
    f = open(filename, "a+")
    
    page = req_proxy.main(line[-6])
    soup = BeautifulSoup(page, "html.parser")

    title = soup.find("title").get_text()
    meta_disc = soup.find("meta", attrs={"name":"description"}).get("content")
    seller = "yebhi.com"
    item_desc = soup.find("div", attrs={"itemprop":"description"})
    dte = time.strftime("%d:%m:%Y")
    status  = " "

    f.write(",".join(map(my_strip, [line[9], line[7], line[0], line[12], line[2],
                                    line[3], line[5], line[10],  line[11], '',  
                                    line[1], line[8], seller, title, 
                                    meta_disc, line[13], item_desc, '', dte, status] )) + "\n")
    f.close()

    logging.debug("inserted  ............")
def main(directory, link):
    page = req_proxy.main(link)
    #driver = phan_proxy.main(link)

    #try:
    #     driver.find_element_by_xpath("/html/body/div[2]/div/div/div/div/a/img").click()
    #     logging.debug("clicked..................................................................")

    #except:
    #    pass

    #driver = phan_scroller.main(driver)
    #page = driver.page_source

    soup = BeautifulSoup(page, "html.parser")

    target_cat_list = soup.find("div", attrs={"id":"breadCrumbWrapper"}).find_all("span", attrs={"itemprop":"title"})
    filename = "%s/%s.doc" %(directory, "_".join(str(target_cat_list[-1].get_text()).split()))

    f = open(filename, "a+")
    
    item_big_box_list = soup.find("div", attrs={"id":re.compile("products-main")})


    item_box_list = item_big_box_list.find_all("div", attrs={"class":"product_grid_row"})

    for item_box in item_box_list:
        item_sub_box_list  = item_box.find_all("div", attrs={"class":"product_grid_cont gridLayout3"})

        if len(item_sub_box_list) == 0:
            item_sub_box_list  = item_box.find_all("div", attrs={"class":"product_grid_cont gridLayout4"})
        
        for item_sub_box in item_sub_box_list:
            item_link = item_sub_box.find("a", attrs={"class":"hit-ss-logger somn-track prodLink"}).get("href")
            parsed = urlparse(item_link)

            if len(parsed.netloc) == 0:
                item_link = "http://www.snapdeal.com%s" %(item_link)

            size = []
            size_box_list = item_sub_box.find("div", attrs={"class":"productAttr"})
            
            if size_box_list is not None:
                size_option_box = size_box_list.find_all("option")

                for size_option in size_option_box:
                    size.append(size_option.get("value"))

            if len(size) !=0:
                size = filter(None, map(my_strip, size))
    
            info = [link, item_link, size]
            info2 = map(my_strip, info)

            f.write(str(info2) + "\n")
            logging.debug("inserted........................................................................")


    f.close()
def main2(line, filename):
    catlink = line[0]
    cattitle = line[1]

    f = open(filename, "a+")

    page = req_proxy.main(catlink)

    soup = BeautifulSoup(page)

    tag_page = soup.find("div", attrs={"id": "wp_page_numbers"})

    tag_page_a_list = []

    if tag_page:
        tag_page_a = tag_page.find_all("a")[:-1]

        for cat_page in tag_page_a:
            sub_page_link = str(cat_page.get("href")).strip()

            sub_page = req_proxy.main(sub_page_link)

            sub_soup = BeautifulSoup(sub_page)

            # parse the paginated sub-page, not the first page fetched above
            tag_content_a = sub_soup.find_all("h2", attrs={"class": "title"})

            for subcatlinktitle in tag_content_a:
                subcatlink = str(subcatlinktitle.a.get("href")).strip()
                subcattitle = subcatlinktitle.get_text().encode(
                    "ascii", "ignore")
                subcattitle = str(subcattitle).strip().replace(
                    "\n", "").replace("\t", "").replace(",", "")
                print >> f, ",".join(
                    [catlink, cattitle, subcatlink, subcattitle])

    else:
        tag_content_a = soup.find_all("h2", attrs={"class": "title"})

        for subcatlinktitle in tag_content_a:
            subcatlink = str(subcatlinktitle.a.get("href")).strip()
            subcattitle = subcatlinktitle.get_text().encode("ascii", "ignore")
            subcattitle = str(subcattitle).strip().replace("\n", "").replace(
                "\t", "").replace(",", "")
            print >> f, ",".join([catlink, cattitle, subcatlink, subcattitle])

    f.close()
    def pl_to_info_collection(self):
        tl = self.line_list[0]
        tt = self.line_list[1]
        cl = self.line_list[2]
        ct = self.line_list[3]
        pl = self.line_list[4]

        page = req_proxy.main(pl)
        soup = BeautifulSoup(page)

        pt = soup.find("h1", attrs={"itemprop": "name"})
        pt = pt.get_text().encode("ascii", "ignore")

        version = soup.find("li", attrs={"class": "qsVersion"})
        version = version.get_text().encode("ascii", "ignore")

        filesize = soup.find("li", attrs={"class": "fileSize"})
        filesize = filesize.get_text().encode("ascii", "ignore")

        dtadded = soup.find("li", attrs={"class": "qsDateAdded"})
        dtadded = dtadded.get_text().encode("ascii", "ignore")

        price = soup.find("li", attrs={"class": "qsPrice"})
        price = price.get_text().encode("ascii", "ignore")

        oosys = soup.find("li", attrs={"class": "qsOs"})
        oosys = oosys.get_text().encode("ascii", "ignore")

        tdown = soup.find("li", attrs={"class": "qsTotalDownloads"})
        tdown = tdown.get_text().encode("ascii", "ignore")

        wkdown = soup.find("li", attrs={"class": "qsWeeklyDownloads"})
        wkdown = wkdown.get_text().encode("ascii", "ignore")

        direc2 = "%s/%s/%s" % (self.direc, tt, ct)

        try:
            os.makedirs(direc2)
        except:
            pass

        filename = "%s/%s.csv" % (direc2, ct)

        f = open(filename, "a+")

        info = map(self.mystrip, [
            tl, tt, cl, ct, pl, pt, version, filesize, dtadded, price, oosys,
            tdown, wkdown
        ])
        logging.debug(info)

        infostr = ','.join(info)
        f.write(infostr + "\n")

        f.close()
def main2(line, filename):
    catlink = line[0]
    cattitle = line[1]

    f = open(filename, "a+")

    page = req_proxy.main(catlink)

    soup = BeautifulSoup(page)

    tag_page = soup.find("div", attrs={"id":"wp_page_numbers"})

    tag_page_a_list = []

    if tag_page:
        tag_page_a = tag_page.find_all("a")[:-1]

        for cat_page in tag_page_a:
            sub_page_link = str(cat_page.get("href")).strip()

            sub_page = req_proxy.main(sub_page_link)

            sub_soup = BeautifulSoup(sub_page)

            # parse the paginated sub-page, not the first page fetched above
            tag_content_a = sub_soup.find_all("h2", attrs={"class":"title"})

            for subcatlinktitle in tag_content_a:
                subcatlink = str(subcatlinktitle.a.get("href")).strip()
                subcattitle = subcatlinktitle.get_text().encode("ascii", "ignore")
                subcattitle = str(subcattitle).strip().replace("\n", "").replace("\t", "").replace(",", "")
                print >>f, ",".join([catlink, cattitle, subcatlink, subcattitle])

    else:
        tag_content_a = soup.find_all("h2", attrs={"class":"title"})

        for subcatlinktitle in tag_content_a:
            subcatlink = str(subcatlinktitle.a.get("href")).strip()
            subcattitle = subcatlinktitle.get_text().encode("ascii", "ignore")
            subcattitle = str(subcattitle).strip().replace("\n", "").replace("\t", "").replace(",", "")
            print >>f, ",".join([catlink, cattitle, subcatlink, subcattitle])
    
    f.close()
    def page1_cat_link_collect(self):
        page = req_proxy.main(self.link)

        tree = html.fromstring(page)
        cat_box = tree.xpath("/html/body/div/div/div[5]/div[2]/div[2]/div/ul/li[2]/ul/li")

        all_cat_a = []
        for cat_link in cat_box:
            all_cat_a.append(cat_link.xpath("a/@href")[0])
    
        self.all_cat_a = all_cat_a
def cat_to_subcat(fs, link):
    page = req_proxy.main(link)

    soup = BeautifulSoup(page, "html.parser")
    cat_big_box = soup.find("div", attrs={"id":"matchingCatbox"})

    cat_box_list = cat_big_box.find_all("a", attrs={"class":re.compile("somn-track")})

    for  cat_box in cat_box_list:
        cat_link = my_strip(cat_box.get("href"))
        
        fs.write(cat_link + "\n")
    def page_link_to_movie_link(self, link):
        f = self.page_link_to_mov_link

        page = req_proxy.main(link)
        soup = BeautifulSoup(page)

        movi_link_box = soup.find("div", attrs={"id":"content"})
        movi_link_list = movi_link_box.find_all("a", title=re.compile("Permanent Link"))

        for mov_link in movi_link_list:
            f.write(",".join([link, str(mov_link.get("href"))]) + "\n")
            logging.debug([link, str(mov_link.get("href"))])
    def pl_to_info_collection(self):
        tl = self.line_list[0]
        tt = self.line_list[1]
        cl = self.line_list[2]
        ct = self.line_list[3]
        pl = self.line_list[4]

        page = req_proxy.main(pl)
        soup = BeautifulSoup(page, "html.parser")

        pt = soup.find("h1", attrs={"itemprop": "name"})
        pt = pt.get_text().encode("ascii", "ignore")

        version = soup.find("li", attrs={"class": "qsVersion"})
        version = version.get_text().encode("ascii", "ignore")

        filesize = soup.find("li", attrs={"class": "fileSize"})
        filesize = filesize.get_text().encode("ascii", "ignore")

        dtadded = soup.find("li", attrs={"class": "qsDateAdded"})
        dtadded = dtadded.get_text().encode("ascii", "ignore")

        price = soup.find("li", attrs={"class": "qsPrice"})
        price = price.get_text().encode("ascii", "ignore")

        oosys = soup.find("li", attrs={"class": "qsOs"})
        oosys = oosys.get_text().encode("ascii", "ignore")

        tdown = soup.find("li", attrs={"class": "qsTotalDownloads"})
        tdown = tdown.get_text().encode("ascii", "ignore")

        wkdown = soup.find("li", attrs={"class": "qsWeeklyDownloads"})
        wkdown = wkdown.get_text().encode("ascii", "ignore")

        direc2 = "%s/%s/%s" % (self.direc, tt, ct)

        try:
            os.makedirs(direc2)
        except:
            pass

        filename = "%s/%s.csv" % (direc2, ct)

        f = open(filename, "a+")

        info = map(self.mystrip, [tl, tt, cl, ct, pl, pt, version, filesize, dtadded, price, oosys, tdown, wkdown])
        logging.debug(info)

        infostr = ",".join(info)
        f.write(infostr + "\n")

        f.close()
    def movie_link_tu_page(self, link):
        page = req_proxy.main(link)      
        soup = BeautifulSoup(page)

        try:
            page_links_div = soup.find("div", attrs={"id":"wp_page_numbers"})
            page_links_li = page_links_div.find_all("a")

            for a in page_links_li[:-1]:
                self.movie_page_link.append(a.get("href"))

        except:
            self.movie_page_link.append(link)
def main(line, f3):
#def main(line):
    line = line.strip()
    line = ast.literal_eval(line)

    menu = line[0]
    ctlink = line[1]
    cttitle = line[2]

    if cttitle == "Home & Decor":
        cttitle = unicode(r("Home & Décor"), 'utf8')
    

    #ctlink = "http://www.amazon.in/Sling-Bags/b/ref=sd_allcat_hbc_sling?ie=UTF8&node=1983351031"
    #cttitle =  "Sling & Cross-Body Bags"

    page = req_proxy.main(ctlink)
   
    soup = BeautifulSoup(page)

    tag_depatrmen = soup.find("h2", text=re.compile("Department"))
    tag_ul = tag_depatrmen.find_next("ul")

    tag_strong = tag_ul.find("strong", text = re.compile(cttitle))

    #tag_li = tag_strong.find_next("li")
    parent_li = tag_strong.find_parent("li")

    tag_narrow = None

    try:
        parent_li = parent_li.find_next_sibling()
        tag_narrow = parent_li.find("span", attrs={"class":"narrowValue"})

    except:
        print >>f3,  [menu, ctlink, cttitle, ctlink, cttitle]

         
    loop = True

    while loop is True:
        if tag_narrow is not None:
            tag_narrow = tag_narrow.find_next("a")
            sctlink = "%s%s" % ("http://www.amazon.in", tag_narrow.get("href"))
            scttitle = tag_narrow.get_text().encode("ascii", "ignore").strip()
            if scttitle != "What's this?":
                print >>f3, [menu, ctlink, cttitle, sctlink, scttitle]
            tag_narrow = tag_narrow.find("span", attrs={"class":"narrowValue"})

        else:
            loop = False
def cat_to_subcat_brand(fb, link):
    page = req_proxy.main(link)

    soup = BeautifulSoup(page, "html.parser")
    brand_big_box = soup.find("div", attrs={"name":"Brand"})

    brand_box_list = brand_big_box.find_all("input", attrs={"filtername":"Brand"})

    for brandbox in brand_box_list:
        brand = my_strip(brandbox.get("value"))
        #print "http://www.snapdeal.com/products/men-apparel-jeans/?q=Brand%3A"+ brand + "&FID=checkbox_searchable_Brand%20%3A" + brand
        brand_link =  "%s/?q=Brand:%s" %(link, brand)
        fb.write(brand_link + "\n")
        logging.debug("inserted...................................................................................")
def main(ntl_pth_cat):
    link = ntl_pth_cat[0]
    page = req_proxy.main(link)

    if not page:
        # retry on an empty response and stop, instead of continuing with no page
        return main(ntl_pth_cat)
    
    filename = "dir%s/%s" %(time.strftime("%d%m%Y"), "cl_cpth_sc_bl_bn_bc_links_extracted.txt")
     
    f = open(filename, "a+")
    print >>f, link
    f.close()

    main2(ntl_pth_cat, page)
def main():
    f = open("to_extract_jnglee.txt")
    directory = f.read().strip()
    f.close()

    filename ="%s/ml_mt_ct_cl.txt" %(directory)

    f = open(filename)
    line = f.read().strip()
    f.close()

    line_list  = ast.literal_eval(line)
 
    ml_mt_cl_ct_sl_st = []

    for line in line_list:
        ml = line[0]
        mt = line[1]
        cl = line[2]
        ct = line[3]

        parsed = urlparse(cl)
        link_pth_list = filter(None, parsed.path.split("/"))

        if link_pth_list[1] == 'b':
            page = req_proxy.main(cl)
            soup = BeautifulSoup(page, "html.parser")
            tag_department = soup.find("h3", text=re.compile("Department"))
            tag_ul = tag_department.find_next_sibling("ul")

            if tag_ul is None:
                tag_ul = tag_department.find_next_sibling("div", attrs={"class":"left_nav_section"})
                tag_ul = tag_ul.find("ul")

            tag_a = tag_ul.find_all("a")

            for sl_sl in tag_a:
                sl = "%s%s" % ("http://www.junglee.com", str(sl_sl.get("href")))
                st = str(sl_sl.get_text()).strip()
                ml_mt_cl_ct_sl_st.append([ml, mt, cl, ct, sl, st])

    f.close()

    filename ="%s/ml_mt_cl_ct_sl_st.txt" %(directory)
    f = open(filename, "w+")
    print >>f, ml_mt_cl_ct_sl_st
    f.close()

    return  ml_mt_cl_ct_sl_st
def main():
    directory = "dirjnglee%s" %(time.strftime("%d%m%Y"))

    try:
        os.makedirs(directory)
    except:
        pass

    f = open("extracted_jnglee.txt", "a+")
    f.write(directory)
    f.close()

    f = open("to_extract_jnglee.txt", "w+")
    f.write(directory)
    f.close()

    Health_beauti = {}

    Health_beauti["Health-Personal-Care"] = "http://www.junglee.com/Health-Personal-Care/b/683850031/ref=nav_menu_6_1_1_0"
    Health_beauti["Beauty"] = "http://www.junglee.com/Beauty/b/837260031/ref=nav_menu_6_2_1_0"
    Health_beauti["Clothing"] ="http://www.junglee.com/Clothing/b/683843031/ref=nav_menu_2_1_1_0"
    Health_beauti["Shoes"] = "http://www.junglee.com/Shoes/b/805169031/ref=nav_menu_2_2_1_0"
    Health_beauti["Watches"] = "http://www.junglee.com/Watches/b/683890031/ref=nav_menu_2_3_1_0"
    Health_beauti["Accessories-online"] = "http://www.junglee.com/buy/Accessories-online/1000702243/ref=nav_menu_2_4_1_0"
    Health_beauti["Jewellery"] = "http://www.junglee.com/Jewellery/b/683862031/ref=nav_menu_2_5_1_0"


    ml_mt_ct_cl = []
     
    for mt, ml in Health_beauti.items():
        page = req_proxy.main(ml)
        soup = BeautifulSoup(page, "html.parser")

        cat_div = soup.find("div", attrs={"id":"left-1"})
        catt_catl = cat_div.find_all("a")

        for ct_cl in catt_catl:
            cl = "%s%s" % ("http://www.junglee.com", str(ct_cl.get("href")))
            ct = str(ct_cl.get_text()).strip()
            ml_mt_ct_cl.append([ml, mt, cl, ct])

    filename = "%s/ml_mt_ct_cl.txt"  %(directory)

    f = open(filename, "w+")
    print >>f, ml_mt_ct_cl
    f.close()

    return ml_mt_ct_cl
def main3(link):

    page = req_proxy.main(link)
    
    soup = BeautifulSoup(page)
      
    tag_first_container  = soup.find("div", attrs= {"class":"filter-container first"})

    
    tag_div_cat = soup.find("div", text = re.compile("categories"))

    if tag_div_cat:
        return tag_first_container

    else:
        pass
    def lnk_to_cat_collecion(self, link):
        page = req_proxy.main(link)
        soup = BeautifulSoup(page)
        parsed = urlparse(link)
        link_path = filter(None, parsed.path.split("/"))
    
        tittl = link_path[1]
        cat_div = soup.find("dl", attrs={"class":"catNav"})
        cl_ct_list = cat_div.find_all("dd")
    
        for cl_ct in cl_ct_list:
            cl = cl_ct.a.get("href")
            cl = "%s%s" % ("http://download.cnet.com", str(cl))
            ct = str(cl_ct.a.get_text())
            print >>self.f, [link, tittl] + map(self.mystrip, [cl, ct])
            self.lnk_ttl_cl_ct.append([link, tittl] + map(self.mystrip, [cl, ct]))
def main(line, f3):
    line = line.strip()
    line = ast.literal_eval(line)

    menu = line[0]
    ctlink = line[1]
    cttitle = line[2]
    sctlink = line[3]
    scttitle = line[4]

    page = req_proxy.main(sctlink)

    soup = BeautifulSoup(page)

    tag_brands = soup.find("h2", text=re.compile("Brands"))

    tag_ul = tag_brands.find_next("ul")

    tag_span_see_more = tag_ul.find("span",
                                    attrs={"class": "refinementLink seeMore"})

    if tag_span_see_more is not None:
        all_brand_link = "%s%s" % (
            "http://www.amazon.in",
            tag_span_see_more.find_parent("a").get("href"))
        main2(line, all_brand_link, f3)

    else:
        tag_al = tag_ul.find_all("a")
        for al in tag_al:
            bl = "%s%s" % ("http://www.amazon.in", al.get("href"))
            bt = al.span.get_text().encode("ascii", "ignore")

            filedata = [menu, ctlink, cttitle, sctlink, scttitle, bl, bt]
            filedata2 = map(mystrip, filedata)

            logging.debug(filedata2)
            f4 = open(f3, "a+")
            print >> f4, filedata2
            f4.close()
def main2(ml_mt_sub, filename):
    f = open(filename, "a+")

    menulink = ml_mt_sub[0]
    menutitle = ml_mt_sub[1]

    page = req_proxy.main(menulink)

    soup = BeautifulSoup(page)

    tag_box = soup.find_all("div", attrs={"class": "head clearfix"})

    for al in tag_box:
        cato = al.find("div")

        catolink = "%s%s" % ("http://www.homeshop18.com",
                             str(cato.a.get("href")).strip())
        catotitle = cato.a.get_text()

        sub_cato = al.find_next_sibling("div")

        if sub_cato:
            sub_cato2 = sub_cato.find_all("a")

            for al in sub_cato2:
                sub_catolink = "%s%s" % ("http://www.homeshop18.com",
                                         str(al.get("href")).strip())
                sub_catotext = al.get("title")

                print >> f, ','.join([
                    menulink, menutitle, catolink, catotitle, sub_catolink,
                    sub_catotext
                ])

        else:
            print >> f, ','.join([
                menulink, menutitle, catolink, catotitle, catolink, catotitle
            ])

    f.close()
def main3(line, f2):
    line = line.strip()
    line2 = line.split(",")
    catlink = line2[-2]

    page = req_proxy.main(catlink)

    soup = BeautifulSoup(page)

    subcat = soup.find("select", attrs={"id": "categorySelect"})

    try:
        subcatoption = subcat.find_all("option")

        for subcatop in subcatoption:
            print >> f2, "%s,%s" % (line, str(subcatop.get_text()).strip())

            logging.debug("%s,%s" % (line, str(subcatop.get_text()).strip()))

    except:
        print >> f2, "%s,%s" % (line, "None")

        logging.debug("%s,%s" % (line, "None"))
def main():
    directory = "dir%s/categories" % (time.strftime("%d%m%Y"))

    try:
        os.makedirs(directory)

    except:
        pass

    f = open("to_extract_cat.txt", "w+")
    print >> f, directory
    f.close()

    f = open("extracted_cat.txt", "a+")
    print >> f, directory
    f.close()

    link = "http://www.filmlinks4u.net/"

    page = req_proxy.main(link)

    soup = BeautifulSoup(page)

    tag_cat = soup.find("li", attrs={"id": "categories-3"})

    tag_cat_a_list = tag_cat.find_all("a")

    catlink_cattitle = []

    for al in tag_cat_a_list:
        catlink = str(al.get("href")).strip()
        cattitle = str(catlink.split("/")[-1]).strip().replace(
            ",", "").replace("\n", "").replace("\t", "")

        catlink_cattitle.append([catlink, cattitle])

    mainthreading(catlink_cattitle)
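
# mainthreading() is not shown in these snippets; elsewhere in this file the
# usual pattern is a Queue drained by worker threads (see the iter(q.get, None)
# workers above). A hypothetical sketch of such a dispatcher -- the worker body,
# thread count and filename argument are assumptions, not the project's code:
from Queue import Queue   # Python 2 queue module, as used by the other snippets
import threading

def mainthreading_sketch(catlink_cattitle, filename, num_workers=5):
    q = Queue()

    def worker(i, q):
        # drain [catlink, cattitle] pairs until the None sentinel arrives,
        # handing each pair to a per-category handler like main2(line, filename)
        for pair in iter(q.get, None):
            main2(pair, filename)
            q.task_done()
        q.task_done()   # account for the sentinel itself

    for i in range(num_workers):
        t = threading.Thread(target=worker, args=(i, q))
        t.daemon = True
        t.start()

    for pair in catlink_cattitle:
        q.put(pair)
    for _ in range(num_workers):
        q.put(None)

    q.join()   # wait until every queued item (pairs + sentinels) is processed
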
def main(line):
    line = ast.literal_eval(line)
    line = map(str.strip, line)
    target = line[0]
    catlink = line[1]
    cattitle = line[2]
    subcatlink = line[3]
    subcattitle = line[4]
    brand = line[5]
    productlink = line[6]
    imagelink = line[7]
    producttitle = line[8]

    f = open("to_extractbagittoday")
    directory = f.read().strip()
    f.close()

    dir2 = "%s/%s/%s/%s/%s" % (directory, target, cattitle, subcattitle, brand)

    try:
        os.makedirs(dir2)
    except:
        pass

    filename = "%s/%s.csv" % (dir2, brand)

    f = open(filename, "a+")

    page = req_proxy.main(productlink)

    soup = BeautifulSoup(page)

    product_path = productlink.split("/")

    product_path = filter(None, product_path)
    sku = str(product_path[-1])

    start = sku.find("-")
    if start != -1:
        sku = sku[start + 1:]

    sp = soup.find("span", attrs={"class": "offer-price"})
    sp = str(sp.get_text())

    mrp = sp
    mrp2 = soup.find("span", attrs={"class": "mrp-price"})

    if mrp2 is not None:
        mrp = str(mrp2.get_text())

    colour = None

    colour2 = soup.find("span", attrs={"class": "colorClass"})

    if colour2 is not None:
        colour = str(colour2.get("title"))

    size = []

    size2 = soup.find("ul", attrs={"class": "attributes-ul-cont"})
    if size2 is not None:
        size2 = size2.find_all("input")
        for sz in size2:
            size.append(str(sz.get("value")))

    size = str(size)

    tree = html.fromstring(page)

    desc2 = tree.xpath(
        "/html/body/div/section/div[4]/div/section/div/div/div/div")

    desc2 = desc2[0]
    desc = str(html.tostring(desc2))

    spec = None

    date = str(time.strftime("%d%m%Y"))

    status = "None"

    vender = "bagittoday.com"

    metadesc2 = soup.find("meta", attrs={"name": "description"})
    metadesc = str(metadesc2.get("content"))

    metatitle2 = soup.find("title")
    metatitle = str(metatitle2.get_text())

    output = [
        sku, producttitle, catlink, sp, cattitle, subcattitle, brand,
        imagelink, mrp, colour, target, productlink, vender, metatitle,
        metadesc, size, desc, spec, date, status
    ]

    output = map(string_strip, output)

    print >> f, ','.join(output)

    print output
def main(filename, brandname, catname, l):

    try:

        item_link = l

        page = req_proxy.main(item_link)

        soup = BeautifulSoup(page)

        tag_dis = soup.find("div", attrs={"id":"description"})

        if tag_dis:
            tag_dis = str(tag_dis).replace("\n","")

        tag_spec = soup.find("div", attrs={"id":"specifications"})

        if tag_spec:
            tag_spec = str(tag_spec).replace("\n","")


        tag_h1 = soup.find("h1", attrs={"itemprop":"name"})
        item_title = str(tag_h1.get_text()).strip()

        try:
            tag_colour = soup.find("div", attrs={"class":"line extra_text bmargin10"})
            item_clour = str(tag_colour.get_text()).strip()
        except:
            item_clour = " No more colour"

        tag_img = soup.find("img", attrs={"id":"visible-image-small"})
        item_image = tag_img.get("src")

        
      
        try:
            tag_price = soup.find("span", attrs={"class":"fk-font-verybig pprice fk-bold"})
            item_price = str(tag_price.get_text()).strip()
        except:
            tag_price = soup.find("div", attrs={"class":"prices"})
            item_price = str(tag_price.get_text()).strip().replace("\n", " ")


        try:
            tag_mrp = soup.find("span", attrs={"id":"fk-mprod-list-id"})
            item_mrp = str(tag_mrp.get_text()).strip()
        except:
            item_mrp = item_price


        tag_seller = soup.find("a", attrs={"class":"pp-seller-badge-name fk-bold"})
        item_seller = str(tag_seller.get_text()).strip()
        
        try:
            tag_sku = soup.find("a", attrs={"class":"btn btn-orange btn-buy-big fk-buy-now fkg-pp-buy-btn"})
            sku = str(tag_sku.get("href")).split("=")[-1].strip()
        except:
            sku = "no sku defined"

        size = []
        try:
            tag_multiselect = soup.find_all("div", attrs={"class":"multiselect-item"})

            for l in tag_multiselect:
                try:
                    size.append(str(l.get_text()))
                except:
                    pass

        except:
            pass

        if not size:
            size.append("No size defined")

        size2 = ' '.join(size).replace("\n", " ")

        del size[:]
        del size

        date = str(time.strftime("%d:%m:%Y")).strip()

        f = open(filename,"a+")
        print >>f, ','.join([date, catname, brandname,  item_title, item_price,
                             item_image, item_clour, item_mrp, item_seller, item_link, sku, size2, str(tag_dis), str(tag_spec)])
        f.close()

        logging.debug([date, catname, brandname,  item_title, item_price,
                      item_image, item_clour, item_mrp, item_seller, item_link, sku, size2, str(tag_dis), str(tag_spec)])


    except:
        f = open("newerrorfile.txt", "a+")
        print >>f, l
        f.close()
enclosure_queue = Queue()

logging.basicConfig(level=logging.DEBUG,
                    format='[%(levelname)s] (%(threadName)-10s) %(message)s',
                    )


def main2(i, q):
    while True:
        filename, brandname, catname, l = q.get()
        
        item_link = l
        #page = urll_proxy.main(l)
        #assert page
        page = req_proxy.main(l)
        soup = BeautifulSoup(page)
        #page.close()
        try:

            tag_dis = soup.find("div", attrs={"id":"description"})
            if tag_dis:
                tag_dis = str(tag_dis).replace("\n","")
            
            tag_spec = soup.find("div", attrs={"id":"specifications"})
            if tag_spec:
                tag_spec = str(tag_spec).replace("\n","")

            tag_h1 = soup.find("h1", attrs={"itemprop":"name"})
            item_title = str(tag_h1.get_text()).strip()
def main2(line):
    f = open("to_extract_cat.txt")
    directory = f.read().strip()
    f.close()

    filename2 = "%s/%s" % (directory, "f_ct_cl_st_sl_vt_wp_wl_img.txt")

    f2 = open(filename2, "a+")

    line2 = line.split(",")
    catlink = str(line2[0]).strip()
    cattitle = str(line2[1]).replace("\n",
                                     " ").replace("\t",
                                                  "").replace("\r",
                                                              "").strip()

    sub_catlink = line2[2]
    sub_cattitle = line2[3].replace("\n", " ").replace("\t",
                                                       "").replace("\r",
                                                                   "").strip()

    end = sub_cattitle.find(")")

    if end != -1:
        sub_cattitle = sub_cattitle[:end + 1]

    page = req_proxy.main(sub_catlink)

    soup = BeautifulSoup(page)
    tree = html.fromstring(page)

    image = "None"

    if tree is not None:
        image = str(
            tree.xpath(
                '/html/body/div/div/div[5]/div/div[2]/div/div/div[3]/a/img/@src'
            )[0]).strip()

    tag_hotserver = soup.find_all("span", text=re.compile("Host Server"))

    for l in tag_hotserver:
        loop = True
        video_type = str(l.next_sibling).replace("\n", " ").replace(
            "\t", "").replace("\r", "").strip()
        next_a = l.find_next("a")

        while loop is True:
            try:
                tag_nm = str(next_a.name).strip()

                if tag_nm == "br":
                    pass

                if tag_nm == "a":
                    watchlink = str(next_a.get("href")).strip()
                    watchpart = str(next_a.get_text()).replace(
                        "\n", " ").replace("\t", "").replace("\r", "").strip()
                    print >> f2, ','.join([
                        cattitle, catlink, sub_cattitle, sub_catlink,
                        video_type, watchpart, watchlink, image
                    ])
                    print ','.join([
                        cattitle, catlink, sub_cattitle, sub_catlink,
                        video_type, watchpart, watchlink, image
                    ])

                if tag_nm != "a" and tag_nm != "br":
                    loop = False

                next_a = next_a.find_next_sibling()

            except:
                loop = False

    f2.close()