def main(directory, mainlink):
    filename = "%s/complete_link_collection.txt" %(directory)

    f = open(filename, "a+")

    driver = phan_proxy.main(mainlink)
   
    try:
         driver.find_element_by_xpath("/html/body/div[2]/div/div/div/div/a/img").click()
         logging.debug("clicked..................................................................")

    except:
        pass

    page = driver.page_source
    driver.delete_all_cookies()
    driver.quit()
 
    soup = BeautifulSoup(page, "html.parser")
    link_list = soup.find_all("a", attrs={"class":"somn-track"})

    for link in link_list:
        link  = link.get("href")

        parsed = urlparse(link)

        if len(parsed.netloc) == 0:
            link = "http://www.snapdeal.com%s" %(link)
            
        f.write(str(link) + "\n")
        print link

    f.close()
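# --- Hypothetical sketch (not part of the original sources) --------------------
# Every example in this listing calls phan_proxy.main(url) and gets back a live
# Selenium driver. The phan_proxy module itself is not included here; assuming
# it is a PhantomJS driver factory, it would look roughly like the sketch below
# (the service args and window size are illustrative assumptions, and in the
# real module the function would be exposed as phan_proxy.main):

from selenium import webdriver

def phan_proxy_main(link):
    # open the given URL in headless PhantomJS and hand the driver back
    driver = webdriver.PhantomJS(service_args=["--ignore-ssl-errors=true"])
    driver.set_window_size(1366, 768)
    driver.get(link)
    return driver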
def main2(bl):
    driver = phan_proxy.main(bl)

    page = driver.page_source
    #print page
    #print driver.current_url

    try:
        time.sleep(2)
        driver.find_element_by_id("jab-news").click()
        logging.debug("jab-news....")

    except:
        pass

    try:
        time.sleep(1)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        logging.debug("scrolling... first")
        return driver

    except:
        logging.debug("repeating...")
        driver.delete_all_cookies()
        driver.quit()
        return main2(bl)
def main(line):
    print line
    line2 = line.strip().split(",")

    menulink = line2[0].strip()
    menutitle = line2[1].strip()

    catlink = line2[2].strip()
    cattitle = line2[3].strip()

    subcatlink = line2[4].strip()
    subcatitle = line2[5].strip()

    brandlink = line2[6].strip()
    brandtite = line2[7].strip()

    driver = phan_proxy.main(brandlink)

    driver = driver_scroller(driver)

    driver = sub_scroller(driver)

    page = driver.page_source

    soup = BeautifulSoup(page)

    tag_srchresult = soup.find("div", attrs={"id": "searchResultsDiv"})

    tag_product = tag_srchresult.find_all("p",
                                          attrs={"class": "product_title"})

    for al in tag_product:
        print "%s%s" % ("http://www.homeshop18.com", str(
            al.a.get("href")).strip())
def main(line):
    print line
    line2 = line.strip().split(",")
    
    menulink = line2[0].strip()
    menutitle = line2[1].strip()
  
    catlink = line2[2].strip()
    cattitle = line2[3].strip()

    subcatlink = line2[4].strip()
    subcatitle = line2[5].strip()

    brandlink = line2[6].strip()
    brandtite = line2[7].strip()

    driver = phan_proxy.main(brandlink)
   
    driver = driver_scroller(driver)    

    driver = sub_scroller(driver)

    page = driver.page_source
    
    soup = BeautifulSoup(page)

    tag_srchresult = soup.find("div", attrs={"id":"searchResultsDiv"})

    tag_product = tag_srchresult.find_all("p", attrs={"class":"product_title"})

    for al in tag_product:
        print "%s%s" %("http://www.homeshop18.com", str(al.a.get("href")).strip())
def supermain():
    link = "http://www.snapdeal.com/products/mens-footwear-sports-shoes/?q=Wearability_s%3AFootball&sort=plrty&#plrty|"
    link ="http://www.snapdeal.com/products/mens-footwear-casual-shoes?q=Price:399,7899&sort=rec"
    driver = phan_proxy.main(link)
        
    try:
         driver.find_element_by_xpath("/html/body/div[2]/div/div/div/div/a/img").click()
         logging.debug("clicked..................................................................")

    except:
        pass
    
    driver = main(driver) 
    page = driver.page_source 

    soup = BeautifulSoup(page, "html.parser")

    item_big_box_list = soup.find("div", attrs={"id":"products-main4"})
    item_box_list = item_big_box_list.find_all("div", attrs={"class":"product_grid_row"})

    for item_box in item_box_list:
        item_sub_box_list  = item_box.find_all("div", attrs={"class":"product_grid_cont gridLayout3"})


        for item_sub_box in item_sub_box_list:
            item_link = item_sub_box.find("a", attrs={"class":"hit-ss-logger somn-track prodLink"}).get("href")
            print item_link

    print len(item_box_list)

    driver.delete_all_cookies()
    driver.quit()
def main():
    directory = "dir%s" % (time.strftime("%d%m%Y"))

    try:
        os.makedirs(directory)

    except:
        pass

    f = open("to_extract.txt", "w+")
    print >> f, directory
    f.close()

    f = open("extracted.txt", "a+")
    print >> f, directory
    f.close()

    link = "http://www.homeshop18.com/all-stores.html"

    driver = phan_proxy.main(link)

    try:
        WebDriverWait(driver, 1000).until(ajax_complete,
                                          "Timeout waiting for page to load")
    except WebDriverException:
        pass

    try:
        driver.find_element_by_xpath("/html/body/div[7]/div/a").click()
    except:
        pass

    try:
        WebDriverWait(driver, 1000).until(ajax_complete,
                                          "Timeout waiting for page to load")
    except WebDriverException:
        pass

    driver = driver_scroller(driver)

    page = driver.page_source

    soup = BeautifulSoup(page)

    tag_menuflyer = soup.find("div", attrs={"class": "bcMenuFlyer"})

    tag_menu_lt = tag_menuflyer.find_all("a")

    ml_mt = []

    for lt in tag_menu_lt:
        lt.get_text()
        menulink = "%s%s" % ("http://www.homeshop18.com", str(
            lt.get("href")).strip())
        menutitle = str(lt.get_text()).strip()
        ml_mt.append([menulink, menutitle])

    mainthreading(ml_mt)
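# --- Hypothetical sketch (not part of the original sources) --------------------
# ajax_complete is handed to WebDriverWait(...).until(...) above but is not part
# of this listing. WebDriverWait keeps calling it with the driver until it
# returns a truthy value, so it is assumed to be a predicate roughly like this
# (the jQuery.active check is an assumption about the target pages):

def ajax_complete(driver):
    try:
        return driver.execute_script("return jQuery.active") == 0
    except Exception:
        # page without jQuery: treat it as having no pending AJAX requests
        return True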
def main(line):
    f = open("to_extract_cat.txt")
    directory = f.read().strip()
    f.close()

    filename2 = "%s/%s" %(directory, "f_ct_cl_st_sl_tp_prt_em_im_wl.csv")

    f2 = open(filename2, "a+")

    line2 = line.strip().split(",")

    line2 = map(str.strip,  line2)
    cattitle = line2[0]
    catlink = line2[1]
    subcattitle = line2[2]
    subcatlink = line2[3]
    tpe = line2[4]
    prt = line2[5]
    wl = line2[6]
    img = line2[7]

    driver = phan_proxy.main(wl)
     
    try:
        WebDriverWait(driver, 1000).until( ajax_complete,  "Timeout waiting for page to load")

    except WebDriverException:
        pass

    page = driver.page_source
    tree = html.fromstring(page)

    embedlink =  tree.xpath("/html/body/div[2]/center/div/div/div[2]/iframe/@src")
    embedlink2 =  tree.xpath("/html/body/object/embed/@src")
    embedlink3 =  tree.xpath("/html/body/div[2]/center/div/div/div[2]/object/embed/@src")

    # tree.xpath() always returns a list, so test for emptiness rather than None
    if embedlink:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink[0]).strip(), img,  wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink[0]).strip(), img,  wl])

    elif embedlink2:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink2[0]).strip(), img,  wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink2[0]).strip(), img,  wl])

    elif embedlink3:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink3[0]).strip(), img,  wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink3[0]).strip(), img,  wl])

    else:
        f3 = open("page3_filmlink_embedlink_error2.txt", "a+")
        print >>f3, line
        f3.close()

    driver.delete_all_cookies()
    driver.quit()
    f2.close()
def main():
    directory = "dir%s" % (time.strftime("%d%m%Y"))

    try:
        os.makedirs(directory)

    except:
        pass

    f = open("to_extract.txt", "w+")
    print >> f, directory
    f.close()

    f = open("extracted.txt", "a+")
    print >> f, directory
    f.close()

    link = "http://www.homeshop18.com/all-stores.html"

    driver = phan_proxy.main(link)

    try:
        WebDriverWait(driver, 1000).until(ajax_complete, "Timeout waiting for page to load")
    except WebDriverException:
        pass

    try:
        driver.find_element_by_xpath("/html/body/div[7]/div/a").click()
    except:
        pass

    try:
        WebDriverWait(driver, 1000).until(ajax_complete, "Timeout waiting for page to load")
    except WebDriverException:
        pass

    driver = driver_scroller(driver)

    page = driver.page_source

    soup = BeautifulSoup(page)

    tag_menuflyer = soup.find("div", attrs={"class": "bcMenuFlyer"})

    tag_menu_lt = tag_menuflyer.find_all("a")

    ml_mt = []

    for lt in tag_menu_lt:
        lt.get_text()
        menulink = "%s%s" % ("http://www.homeshop18.com", str(lt.get("href")).strip())
        menutitle = str(lt.get_text()).strip()
        ml_mt.append([menulink, menutitle])

    mainthreading(ml_mt)
def main(directory, link, target, cate):

    directory2 = "%s/%s/%s" %(directory, target, cate)
    try:
        os.makedirs(directory2)
    except:
        pass

    filename = "%s/%s.doc" %(directory2, cate)
   
    f = open(filename, "a+")
   
    driver = phan_proxy.main(link)
    driver = driver_scroller(driver)
    
    page = driver.page_source

    driver.delete_all_cookies()
    driver.quit()

    soup = BeautifulSoup(page, "html.parser")

    item_box = soup.find("section", attrs={"id":"catalog"})
    items_list = item_box.find_all("div", attrs={"class":"item"})

    for item_tag in items_list:
        sub_cate = item_tag.get("data-tag")
        colour = item_tag.get("data-color")
        price = item_tag.get("data-price")
        gender = item_tag.get("data-gender")
        sku = item_tag.get("data-option")
        item_link = "http://zovi.com%s" %(item_tag.a.get("href"))
        item_image = "http:%s" %(item_tag.a.img.get("data-original"))

        item_size2 = item_tag.find("div", attrs={"class":"available-sizes"})
        try:
            item_size = item_size2.find_all("li", attrs={"class":""})
            item_size = str(map(tag_text, item_size)).replace(",", " ")
        except:
            item_size  = str(item_size2)
      
        item_title = str(item_tag.find("div", attrs={"class":"title"}).get_text()).replace(",", " ").strip()

        try:
            item_sale = item_tag.find("span", attrs={"class":"tags visible sale"}).get_text().strip()
        except:
            item_sale = " "
        
        item_info = [link, target, cate, sub_cate, colour, price, gender, 
                     sku, item_link, item_image, item_size, item_title, item_sale]

        f.write(str(map(str, item_info)) + "\n")
        logging.debug(item_info)

    f.close()
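# --- Hypothetical sketch (not part of the original sources) --------------------
# tag_text is used above as map(tag_text, item_size) over BeautifulSoup <li>
# tags; it is assumed to be a one-line text extractor:

def tag_text(tag):
    return str(tag.get_text()).strip()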
def main(line, directory):

    line = ast.literal_eval(line)
    
    line = map(str.strip, line)
    menu = line[0]
    submnlink = line[1]
    submntitle = line[2]
    catlink = line[3]
    cattitle = line[4]
    brandlink = line[-2]
    brandtitle = line[-1]
    start = brandtitle.find("(")
    brandtitle = brandtitle[:start].strip()

    dirtwo = "%s/%s/%s/%s/%s" %(directory, menu, submntitle, cattitle, brandtitle)

    try:
        os.makedirs(dirtwo)

    except:
        pass

    filedoc = "%s/%s.doc" %(dirtwo, brandtitle)
    filedocx = "%s/%s.docx" %(dirtwo, brandtitle)

    f2 = open(filedoc, "a+")
    f3 = open(filedocx, "a+")

    driver = phan_proxy.main(brandlink)
    
    driver = driver_scroller(driver)

    driver = sub_scroller(driver)

    page = driver.page_source

    soup = BeautifulSoup(page, "html.parser")
    
    tag_product = soup.find("div", attrs={"id":"products"})
    
    tag_product_a = tag_product.find_all("a", attrs={"class":"pu-image fk-product-thumb "})


    for al in tag_product_a:
        itemlink = "%s%s" %("http://www.flipkart.com", str(al.get("href").strip()))
        print >>f2, [menu, submnlink, submntitle, catlink, cattitle, brandlink, brandtitle, itemlink]
        print >>f3, itemlink
        print [menu, submnlink, submntitle, catlink, cattitle, brandlink, brandtitle, itemlink]

    driver.delete_all_cookies()
    driver.quit()
    
    f2.close()
    f3.close()
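# --- Illustrative usage only (the values below are made up, not original data) -
# The line argument of the Flipkart main() above is a stringified Python list,
# typically read back from the output of an earlier stage; the brand title
# carries an item count in parentheses, which the code strips off:

if __name__ == "__main__":
    sample = str(["Men", "http://www.flipkart.com/mens-clothing", "Clothing",
                  "http://www.flipkart.com/mens-tshirts", "T-Shirts",
                  "http://www.flipkart.com/nike-tshirts", "Nike (120)"])
    main(sample, "dirflipkart01012015")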
def main():

    directory = "dir%s" %(time.strftime("%d%m%Y"))
    
    f = open("to_extract.txt", "w+")
    print >>f, directory
    f.close()

    f = open("extracted.txt", "a+")
    print >>f, directory
    f.close()

    try:
        os.makedirs(directory)

    except:
        pass


    link = "http://www.jabong.com"
    driver = phan_proxy.main(link)

    try:
        driver.find_element_by_id("jab-news").click()

    except:
        pass

    page = driver.page_source
    driver.close()
    soup = BeautifulSoup(page)

    tag_li = soup.find("li", attrs={"id":"qa-navigation0"})
    h = HTMLParser.HTMLParser()
    tag_li = h.unescape(str(tag_li))

    tag_li = tag_li.replace("\n", " ").replace("<!--",  " ").replace("-->", " ")

    soup = BeautifulSoup(tag_li)
    tag_a = soup.find_all("a")

    menu_links = []   
    
    for l in tag_a:
        try:
            menulink = l.get("href")
            menu_links.append(menulink)

        except:
            pass

    main2(menu_links)
    def wl_to_el(self, line):
        driver = phan_proxy.main(line[-2])
        page = driver.page_source
        driver.quit()

        tree = html.fromstring(page)

        if len(tree.xpath("/html/body/center/table/tbody/tr[2]/td/iframe/@src")) != 0:
            embedlink = tree.xpath("/html/body/center/table/tbody/tr[2]/td/iframe/@src")[0]

        elif len(tree.xpath("/html/body/center/table/tbody/tr[2]/td/embed/@src")) != 0:
            embedlink = tree.xpath("/html/body/center/table/tbody/tr[2]/td/embed/@src")[0]

        elif len(tree.xpath("/html/body/div[2]/center/div/div/div[2]/object/embed/@src")) != 0:
            embedlink = tree.xpath("/html/body/div[2]/center/div/div/div[2]/object/embed/@src")[0]

        elif len(tree.xpath("/html/body/object/embed/@src")) != 0:
            embedlink = tree.xpath("/html/body/object/embed/@src")[0]

        elif len(tree.xpath("/html/body/div[2]/center/div/div/div[2]/iframe/@src")) != 0:
            embedlink = tree.xpath("/html/body/div[2]/center/div/div/div[2]/iframe/@src")[0]

        elif len(tree.xpath("/html/body/div/div[2]/div[3]/div[3]/div/input/@value")) != 0:
            embedlink = tree.xpath("/html/body/div/div[2]/div[3]/div[3]/div/input/@value")
            start = embedlink[0].find("src=")
            end = embedlink[0].find('"', start + 5)
            embedlink = embedlink[0][start + 5 : end].strip()

        else:
            embedlink = ""

        try:
            data = [
                filter(None, line[0].split("/"))[3],
                line[0],
                line[1].split("/")[-1][:-5],
                line[1],
                line[2],
                line[3],
                embedlink,
                line[-1],
                line[-2],
            ]

            self.f.write(",".join(data) + "\n")
            logging.debug(("inserted....", embedlink))

        except:
            pass
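# --- Hypothetical sketch (not part of the original sources) --------------------
# wl_to_el above is a method lifted out of its class; the only attribute it
# relies on is self.f, an open CSV handle. A minimal surrounding class is
# assumed to look roughly like this (class and file names are illustrative):

class EmbedLinkExtractor(object):
    def __init__(self, out_path="wl_to_el.csv"):
        self.f = open(out_path, "a+")

    # def wl_to_el(self, line): ...   (as shown above)

    def close(self):
        self.f.close()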
def main(line):
    filename2 = "dt_movie2.csv"

    f2 = open(filename2, "a+")

    line2 = line.strip().split(",")

    line2 = map(str.strip,  line2)

    cattitle = line2[1]
    catlink = line2[2]
    subcattitle = line2[3]
    subcatlink = line2[4]
    tpe = line2[5]
    prt = line2[6]
    wl = line2[-3]
    img = line2[-4]

    wl = wl.replace('"', " ").strip()
    driver = phan_proxy.main(wl)
     
    page = driver.page_source
    tree = html.fromstring(page)

    embedlink =  tree.xpath("/html/body/div[2]/center/div/div/div[2]/iframe/@src")
    embedlink2 =  tree.xpath("/html/body/object/embed/@src")
    embedlink3 =  tree.xpath("/html/body/div[2]/center/div/div/div[2]/object/embed/@src")

    # tree.xpath() always returns a list, so test for emptiness rather than None
    if embedlink3:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink3[0]).strip(), img,  wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink3[0]).strip(), img,  wl])

    elif embedlink2:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink2[0]).strip(), img,  wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink2[0]).strip(), img,  wl])

    elif embedlink:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink[0]).strip(), img,  wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink[0]).strip(), img,  wl])

    else:
        f3 = open("page3_filmlink_embedlink_error2.txt", "a+")
        print >>f3, line
        f3.close()

    driver.delete_all_cookies()
    driver.quit()
    f2.close()
def main4(line, filename):
    menulink = line[0]
    menutitle = line[1]

    driver = phan_proxy.main(menulink)

    height = 0
    loop = True

    while loop is True:
        logging.debug("scrolling...")
        time.sleep(1)
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        heightnow = driver.execute_script("return $(document ).height();")

        if heightnow == height:
            loop = False

        else:
            height = heightnow
            loop = True

    page = driver.page_source

    soup = BeautifulSoup(page)

    tag_main_comp_holder = soup.find(
        "div", attrs={"class": "campaignHolder mainCampaignHolder"})

    tag_a = tag_main_comp_holder.find_all("a", attrs={"class": "indulgeLink"})

    line2 = str(line).strip("[]").strip()

    f = open(filename, "a+")

    for al in tag_a:
        clink = str(al.get("href")).strip()
        catlink = "%s%s" % ("http://www.fashionandyou.com", clink)
        cattitle = clink.replace("/", "").strip()

        print >> f, ','.join([menulink, menutitle, catlink, cattitle])
        logging.debug((line2, catlink, cattitle))

    f.close()
def main():
    directory = "dirflipkart%s" %(time.strftime("%d%m%Y"))

    try:
        os.makedirs(directory)
    except:
        pass

    f = open("to_extractfilpkart", "w+")
    f.write(directory)
    f.close()

    f2 = open("extractedflipkart", "a+")
    f2.write(directory)
    f2.close()
    
    link = "http://www.flipkart.com/"
    driver = phan_proxy.main(link)
    
    page = driver.page_source
 
    #soup = BeautifulSoup(page, "html.parser")
    soup = BeautifulSoup(page)
    tag_menu = soup.find_all("li", attrs={"class":"menu-l0  "})


    menu_subl_subt = []

    for menu in tag_menu:
        menutitle = menu.get("data-key")
        submenu = menu.find_all("li", attrs={"class":"heading"})
        submenu_new = menu.find_all("li", attrs={"class":"new-heading"})

        submenu.extend(submenu_new)

        for l in submenu:
            submenutitle = str(l.get_text()).strip()
            submenulink = "%s%s" %("http://www.flipkart.com", str(l.a.get("href")).strip())

            menu_subl_subt.append([menutitle, submenulink, submenutitle])
    
    driver.delete_all_cookies()
    driver.quit()
    
    mainthread(menu_subl_subt)
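# --- Hypothetical sketch (not part of the original sources) --------------------
# mainthread / mainthreading receive a list of menu entries but are not defined
# in this listing; they are assumed to fan the entries out to worker threads
# that each run one of the per-entry main() functions, roughly like this
# (the worker argument and batch size are assumptions):

import threading

def mainthread(entries, worker, batch=5):
    for start in range(0, len(entries), batch):
        threads = [threading.Thread(target=worker, args=(entry,))
                   for entry in entries[start:start + batch]]
        for t in threads:
            t.start()
        for t in threads:
            t.join()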
def main4(line, filename):
    menulink = line[0]
    menutitle  = line[1]

    driver = phan_proxy.main(menulink)

    height = 0
    loop = True

    while loop is True:
        logging.debug("scrolling...")
        time.sleep(1)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        heightnow = driver.execute_script("return $(document ).height();")

        if heightnow == height:
            loop = False

        else:
            height = heightnow
            loop = True

    page = driver.page_source
    
    soup = BeautifulSoup(page)

    tag_main_comp_holder = soup.find("div", attrs={"class":"campaignHolder mainCampaignHolder"})

    tag_a = tag_main_comp_holder.find_all("a", attrs={"class":"indulgeLink"})
      
    line2 = str(line).strip("[]").strip()
    
    f = open(filename, "a+")
    
    for al in tag_a:
        clink = str(al.get("href")).strip()
        catlink = "%s%s" %("http://www.fashionandyou.com", clink)
        cattitle = clink.replace("/", "").strip()
        
        print >>f, ','.join([menulink, menutitle, catlink, cattitle])
        logging.debug((line2, catlink, cattitle))
        
    f.close()
def main(line):
    line2 = line.strip().split(",")
    sub_cat = line2[-1].strip()

    cattitle = line2[-2].strip()
    catlink = line2[-3].strip()

    menutitle = line2[1].strip()
    menulink =  line2[0].strip()
   
    driver = phan_proxy.main(catlink)

    try:
        WebDriverWait(driver, 1000).until( ajax_complete,  "Timeout waiting for page to load")
    except WebDriverException:
        pass

    if (sub_cat != "None") and (sub_cat != "All Categories"):
        driver = sub_cat_select(driver, sub_cat)
    
    driver = driver_scroller(driver)
        
    page = driver.page_source

    soup = BeautifulSoup(page)

    tag_ul_product = soup.find("ul", attrs={"class":"products"})

    tag_li_dis_block = tag_ul_product.find_all("li", attrs={"style" : re.compile(": block;")})

    for al in tag_li_dis_block:
        tag_al_a = al.find("a", attrs={"class":"productLink"})

        if tag_al_a:
            productlink = "%s%s" %("http://www.fashionandyou.com", str(tag_al_a.get("href")).strip())
            print ','.join([menulink, menutitle, catlink, cattitle, sub_cat,  productlink])

    print len(tag_li_dis_block)
    driver.delete_all_cookies()
    driver.quit()
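# --- Hypothetical sketch (not part of the original sources) --------------------
# sub_cat_select(driver, sub_cat) is assumed to click the sub-category filter
# whose visible text matches sub_cat and give the page a moment to re-render;
# the link-text lookup is an assumption about the fashionandyou markup:

import time
import logging

def sub_cat_select(driver, sub_cat):
    try:
        driver.find_element_by_link_text(sub_cat).click()
        time.sleep(2)
    except Exception:
        logging.debug("sub-category %r not found, keeping the full listing" % sub_cat)
    return driver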
def main():
    link = "http://www.snapdeal.com/"

    page = phan_proxy.main(link)
    #page = req_proxy.main(link)

    f = open("code1_sourcebody.txt", "a+")
    print >>f, page.encode("ascii", "ignore")
    f.close()
    

    soup = BeautifulSoup(page)

    tag_nav = soup.find_all("li",  attrs={"class":"navlink"})
    
    menu_list = {}

    for l in tag_nav:
        menu =  str(l.a.get_text()).strip()

        menu_list[menu] = {}

        tag_a = l.find_all("a", attrs={"class":"somn-track"})

        for l2 in tag_a:
            sub_menu = str(l2.get_text()).strip()
            sub_menulink = str(l2.get("href")).strip()
            menu_list[menu][sub_menu] = sub_menulink
            
    f = open("code1_menu_sub_menu_link.txt", "w+")
    print >>f,  menu_list
    f.close()

    menu_list.clear()
    print menu_list
    del menu_list
def main(line):
    filename2 = "dt_movie2.csv"

    f2 = open(filename2, "a+")

    line2 = line.strip().split(",")

    line2 = map(str.strip, line2)

    cattitle = line2[1]
    catlink = line2[2]
    subcattitle = line2[3]
    subcatlink = line2[4]
    tpe = line2[5]
    prt = line2[6]
    wl = line2[-3]
    img = line2[-4]

    wl = wl.replace('"', " ").strip()
    driver = phan_proxy.main(wl)

    page = driver.page_source
    tree = html.fromstring(page)

    embedlink = tree.xpath(
        "/html/body/div[2]/center/div/div/div[2]/iframe/@src")
    embedlink2 = tree.xpath("/html/body/object/embed/@src")
    embedlink3 = tree.xpath(
        "/html/body/div[2]/center/div/div/div[2]/object/embed/@src")

    # tree.xpath() always returns a list, so test for emptiness rather than None
    if embedlink3:
        print >> f2, ','.join([
            cattitle, catlink, subcattitle, subcatlink, tpe, prt,
            str(embedlink3[0]).strip(), img, wl
        ])
        logging.debug([
            cattitle, catlink, subcattitle, subcatlink, tpe, prt,
            str(embedlink3[0]).strip(), img, wl
        ])

    elif embedlink2:
        print >> f2, ','.join([
            cattitle, catlink, subcattitle, subcatlink, tpe, prt,
            str(embedlink2[0]).strip(), img, wl
        ])
        logging.debug([
            cattitle, catlink, subcattitle, subcatlink, tpe, prt,
            str(embedlink2[0]).strip(), img, wl
        ])

    elif embedlink:
        print >> f2, ','.join([
            cattitle, catlink, subcattitle, subcatlink, tpe, prt,
            str(embedlink[0]).strip(), img, wl
        ])
        logging.debug([
            cattitle, catlink, subcattitle, subcatlink, tpe, prt,
            str(embedlink[0]).strip(), img, wl
        ])

    else:
        f3 = open("page3_filmlink_embedlink_error2.txt", "a+")
        print >> f3, line
        f3.close()

    driver.delete_all_cookies()
    driver.quit()
    f2.close()
def main(line):
    line = ast.literal_eval(line)
    menu = line[0]
    catlink = line[1]
    cattitle = line[2]
    scatlink = line[3]
    scattitle = line[4]

    driver = phan_proxy.main(scatlink)
    driver = driver_scroller(driver)

    page = driver.page_source

    driver.delete_all_cookies()
    driver.quit()

    soup = BeautifulSoup(page)
    tag_brand = soup.find("ul", id="filter_Brands")

    tag_brand_li = tag_brand.find_all("li")

    tag_brand_li = tag_brand_li[1:]

    f = open("to_extractbagittoday", "a+")
    directory = f.read().strip()
    f.close()
    
    filename = "%s/%s" %(directory, "f_mn_ct_ct_scl_sct_bt.txt")
    
    f = open(filename, "a+")
    
    bt_list2 = []
    
    for bt  in tag_brand_li:
        bt_list2.append(str(bt.span.get_text()))

    bt_list = map(str_lower_strip, bt_list2)

    print bt_list

    tag_pro_image = soup.find_all("div", attrs={"class":"product-image"})

    for l in  tag_pro_image:
        product_link  = "%s%s" %("http://www.bagittoday.com", str(l.find("a").get("href")).strip())
        product_image = str(l.find("img").get("data-original")).strip()
        product_title = str(l.find("img").get("title")).lower().strip()
        #product_title_split = map(str_lower_strip, product_title.lower().split(" "))
   
        #brand = list((Counter(bt_list) & Counter(product_title_split)).elements())
        brand2 = map(functools.partial(re_line_match, line = product_title), bt_list)
  
        brand = filter(None, brand2)

        if len(brand) == 0 :
            brand = "private"

        else:
            brand = brand[0].strip()

        print >>f, [menu, catlink, cattitle, scatlink, scattitle, brand, product_link, product_image, product_title]
        print  [menu, catlink, cattitle, scatlink, scattitle, brand, product_link, product_image, product_title]
        
    f.close()
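# --- Hypothetical sketches (not part of the original sources) ------------------
# str_lower_strip and re_line_match drive the brand matching above but are not
# defined in this listing; from the way they are used they are assumed to be:

import re

def str_lower_strip(s):
    return str(s).lower().strip()

def re_line_match(brand, line):
    # return the brand if it occurs as a whole word in the product title,
    # otherwise None so that filter(None, ...) drops the misses
    if re.search(r"\b%s\b" % re.escape(brand), line):
        return brand
    return None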
def main(line):
    print line
    line2 = line.strip().split(",")
    
    menulink = line2[0].strip()
    menutitle = line2[1].strip()
  
    catlink = line2[2].strip()
    cattitle = line2[3].strip()

    subcatlink = line2[4].strip()
    subcatitle = line2[5].strip()

    brandlink = line2[6].strip()
    brandtite = line2[7].strip()

    driver = phan_proxy.main(brandlink)
   
    driver = driver_scroller(driver)    

    driver = sub_scroller(driver)

    page = driver.page_source
    
    soup = BeautifulSoup(page, "html.parser")

    tag_srchresult = soup.find("div", attrs={"id":"searchResultsDiv"})

    #tag_product = tag_srchresult.find_all("p", attrs={"class":"product_title"})
    tag_product = tag_srchresult.find_all("p", attrs={"class":"product_image"})



    f = open("to_extract.txt")
    directory = f.read().strip()
    f.close()

    sub_dir = "%s/%s/%s/%s/%s" %(directory, menutitle, cattitle, subcatitle, brandtite)

    try:
        os.makedirs(sub_dir)

    except:
        pass

    filename1 = "%s/%s.doc" %(sub_dir, brandtite)
    filename2 = "%s/%s.docx" %(sub_dir, brandtite)
    
    f = open(filename1, "a+")
    f2 = open(filename2, "a+")
    
    for al in tag_product:
        #prolink =  "%s%s" %("http://www.homeshop18.com", str(al.a.get("href")).strip())
        prolink =  "%s%s" %("http://www.homeshop18.com", str(al.a.get("href")).strip())
        prolimg = str(al.a.img.get("data-original")).strip()

        parsed = urlparse(prolimg)
        prolimg = "%s%s" %("http://www.homeshop18.com", parsed.path)

	print >>f, ",".join([menulink, menutitle, catlink, cattitle, subcatlink, subcatitle, brandlink,brandtite, prolink, prolimg])
        print >>f2, prolink
def main():
    directory = "dirbagittoday%s" % (time.strftime("%d%m%Y"))

    try:
        os.makedirs(directory)
    except:
        pass

    f = open("to_extractbagittoday", "w+")
    print >> f, directory
    f.close()

    f2 = open("extractedbagittoday", "a+")
    print >> f2, directory
    f2.close()

    filename = "%s/%s" % (directory, "f_mn_sl_st_scl_sct.txt")
    f = open(filename, "a+")

    link = "http://www.bagittoday.com/"
    driver = phan_proxy.main(link)
    driver.refresh()
    page = driver.page_source
    driver.delete_all_cookies()
    driver.quit()

    soup = BeautifulSoup(page)

    tag_menu = soup.find("ul", attrs={"id": "menu"})
    tag_menu_li = tag_menu.find_all("li", attrs={"class": "first"})

    req_menu = [
        "Men", "Women", "Kids", "Electronics & Mobiles", "Home & Kitchen",
        "Jewellery"
    ]

    req_manu_obj = []

    for al in tag_menu_li:
        if str(al.a.get_text()).strip() in req_menu:
            req_manu_obj.append(al)

    req_menu_subcat = {}

    for menu in req_manu_obj:
        menu_sub_cat = menu.find_all("li", id=re.compile("subCat"))
        req_menu_subcat[str(menu.a.get_text()).strip()] = menu_sub_cat

    mn_scl_sct_ccatl_ccatt = {}

    for k, v in req_menu_subcat.items():
        sub_cat_likn_title = map(sub_cat_extraction, v)

        mn_scl_sct_ccatl_ccatt[k] = sub_cat_likn_title

    #print mn_scl_sct_ccatl_ccatt

    for k, v in mn_scl_sct_ccatl_ccatt.items():
        for scl_sct_cl_ct in v:
            sl_st = scl_sct_cl_ct[0]
            sl_st = sl_st.split(",")
            sl = sl_st[0]
            st = sl_st[1]

            if scl_sct_cl_ct is None:
                print >> f, [k, sl, st, sl, st]
                print[k, sl, st, sl, st]

            for cl_ct in scl_sct_cl_ct[1]:
                cl = cl_ct.keys()[0]
                ct = cl_ct.values()[0]
                print >> f, [k, sl, st, cl, ct]
                print[k, sl, st, cl, ct]

    f.close()
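# --- Hypothetical sketch (not part of the original sources) --------------------
# sub_cat_extraction is mapped over the "subCat" <li> tags above; judging by how
# its result is unpacked (a "link,title" string first, then a list of one-entry
# {childlink: childtitle} dicts) it is assumed to look roughly like this:

def sub_cat_extraction(tag_li):
    links = tag_li.find_all("a")
    if not links:
        return None
    head = links[0]
    sub_link = str(head.get("href")).strip()
    sub_title = str(head.get_text()).strip()
    children = [{str(a.get("href")).strip(): str(a.get_text()).strip()}
                for a in links[1:]]
    return ["%s,%s" % (sub_link, sub_title), children]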
def main(line):
    line2 = line.strip().split(",")
    sub_cat = line2[-1].strip()

    cattitle = line2[-2].strip()
    catlink = line2[-3].strip()

    menutitle = line2[1].strip()
    menulink =  line2[0].strip()

    f = open("to_extract.txt")
    directory = f.read().strip()
    f.close()

    if sub_cat  ==  "None":
        sub_cat2 = cattitle
        sub_dir =  "%s/%s/%s/%s" %(directory, menutitle, cattitle, sub_cat2)
        filename1 =  "%s/%s.doc" %(sub_dir,sub_cat2)
	filename2 = "%s/%s.docx" %(sub_dir,sub_cat2)

    else:
        sub_dir =  "%s/%s/%s/%s" %(directory, menutitle, cattitle, sub_cat)
	filename1 =  "%s/%s.doc" %(sub_dir,sub_cat)
	filename2 = "%s/%s.docx" %(sub_dir,sub_cat)

    try:
        os.makedirs(sub_dir)

    except:
        pass

    f = open(filename1, "a+")
    f2 = open(filename2, "a+")
   
    driver = phan_proxy.main(catlink)

    try:
        WebDriverWait(driver, 1000).until( ajax_complete,  "Timeout waiting for page to load")
    except WebDriverException:
        pass

    if (sub_cat != "None") and (sub_cat != "All Categories") :
        driver = sub_cat_select(driver, sub_cat)
    
    driver = driver_scroller(driver)
        
    page = driver.page_source

    soup = BeautifulSoup(page)

    tag_ul_product = soup.find("ul", attrs={"class":"products"})

    tag_li_dis_block = tag_ul_product.find_all("li", attrs={"style" : re.compile(": block;")})

    for al in tag_li_dis_block:
        tag_al_a = al.find("a", attrs={"class":"productLink"})

        if tag_al_a:
            productlink = "%s%s" %("http://www.fashionandyou.com", str(tag_al_a.get("href")).strip())
            print >>f, ','.join([menulink, menutitle, catlink, cattitle, sub_cat, productlink])
            print >>f2, productlink

            logging.debug([menulink, menutitle, catlink, cattitle, sub_cat, productlink])

    print len(tag_li_dis_block)

    driver.delete_all_cookies()
    driver.quit()
    f.close()
    f2.close()
def main(line):
    line = ast.literal_eval(line)
    menu = line[0]
    catlink = line[1]
    cattitle = line[2]
    scatlink = line[3]
    scattitle = line[4]

    driver = phan_proxy.main(scatlink)
    driver = driver_scroller(driver)

    page = driver.page_source

    driver.delete_all_cookies()
    driver.quit()

    soup = BeautifulSoup(page)
    tag_brand = soup.find("ul", id="filter_Brands")

    tag_brand_li = tag_brand.find_all("li")

    tag_brand_li = tag_brand_li[1:]

    f = open("to_extractbagittoday", "a+")
    directory = f.read().strip()
    f.close()

    filename = "%s/%s" % (directory, "f_mn_ct_ct_scl_sct_bt.txt")

    f = open(filename, "a+")

    bt_list2 = []

    for bt in tag_brand_li:
        bt_list2.append(str(bt.span.get_text()))

    bt_list = map(str_lower_strip, bt_list2)

    print bt_list

    tag_pro_image = soup.find_all("div", attrs={"class": "product-image"})

    for l in tag_pro_image:
        product_link = "%s%s" % ("http://www.bagittoday.com",
                                 str(l.find("a").get("href")).strip())
        product_image = str(l.find("img").get("data-original")).strip()
        product_title = str(l.find("img").get("title")).lower().strip()
        #product_title_split = map(str_lower_strip, product_title.lower().split(" "))

        #brand = list((Counter(bt_list) & Counter(product_title_split)).elements())
        brand2 = map(functools.partial(re_line_match, line=product_title),
                     bt_list)

        brand = filter(None, brand2)

        if len(brand) == 0:
            brand = "private"

        else:
            brand = brand[0].strip()

        print >> f, [
            menu, catlink, cattitle, scatlink, scattitle, brand, product_link,
            product_image, product_title
        ]
        print[
            menu, catlink, cattitle, scatlink, scattitle, brand, product_link,
            product_image, product_title
        ]

    f.close()
def main():
    directory = "dirbagittoday%s" %(time.strftime("%d%m%Y"))

    try:
        os.makedirs(directory)
    except:
        pass

    f = open("to_extractbagittoday", "w+")
    print >>f, directory
    f.close()

    f2 = open("extractedbagittoday", "a+")
    print >>f2, directory
    f2.close()

    filename = "%s/%s" %(directory, "f_mn_sl_st_scl_sct.txt")
    f = open(filename, "a+")
    
    link = "http://www.bagittoday.com/"
    driver = phan_proxy.main(link)
    driver.refresh()
    page = driver.page_source
    driver.delete_all_cookies()
    driver.quit()
 
    soup = BeautifulSoup(page)

    tag_menu = soup.find("ul", attrs={"id":"menu"})
    tag_menu_li = tag_menu.find_all("li", attrs={"class":"first"})

    req_menu = ["Men", "Women", "Kids", "Electronics & Mobiles", "Home & Kitchen", "Jewellery"]
    
    req_manu_obj = []
    
    for al in tag_menu_li:
        if str(al.a.get_text()).strip() in req_menu:
            req_manu_obj.append(al)
 
    req_menu_subcat = {}
    
    for menu in  req_manu_obj:
        menu_sub_cat = menu.find_all("li", id = re.compile("subCat"))
        req_menu_subcat[str(menu.a.get_text()).strip()] = menu_sub_cat


    mn_scl_sct_ccatl_ccatt = {}

    for k, v in req_menu_subcat.items():
        sub_cat_likn_title = map(sub_cat_extraction, v)

        mn_scl_sct_ccatl_ccatt[k] = sub_cat_likn_title

    #print mn_scl_sct_ccatl_ccatt

    for k, v in mn_scl_sct_ccatl_ccatt.items():
        for scl_sct_cl_ct in v:
            sl_st = scl_sct_cl_ct[0]
            sl_st = sl_st.split(",")
            sl = sl_st[0]
            st = sl_st[1]

            if scl_sct_cl_ct is None:
                print >>f, [k, sl, st, sl, st]
                print [k, sl, st, sl, st]

            for cl_ct in scl_sct_cl_ct[1]:
                cl = cl_ct.keys()[0]
                ct = cl_ct.values()[0]
                print >>f, [k, sl, st, cl, ct]
                print [k, sl, st, cl, ct]

    f.close()