def main(directory, mainlink):
    # Collect every "somn-track" link on the Snapdeal page into a text file,
    # prefixing relative hrefs with the site root.
    filename = "%s/complete_link_collection.txt" % (directory)
    f = open(filename, "a+")
    driver = phan_proxy.main(mainlink)
    try:
        driver.find_element_by_xpath("/html/body/div[2]/div/div/div/div/a/img").click()
        logging.debug("clicked..................................................................")
    except:
        pass
    page = driver.page_source
    driver.delete_all_cookies()
    driver.quit()
    soup = BeautifulSoup(page, "html.parser")
    link_list = soup.find_all("a", attrs={"class": "somn-track"})
    for link in link_list:
        link = link.get("href")
        parsed = urlparse(link)
        if len(parsed.netloc) == 0:
            link = "http://www.snapdeal.com%s" % (link)
        f.write(str(link) + "\n")
        print link
    f.close()
def main2(bl):
    # Open the page, dismiss the Jabong newsletter popup and scroll to the
    # bottom; if scrolling fails, discard the driver and retry from scratch.
    driver = phan_proxy.main(bl)
    try:
        time.sleep(2)
        driver.find_element_by_id("jab-news").click()
        logging.debug("jab-news....")
    except:
        pass
    try:
        time.sleep(1)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        logging.debug("scrolling... first")
        return driver
    except:
        logging.debug("repeating...")
        driver.delete_all_cookies()
        driver.quit()
        return main2(bl)
def main(line):
    # Parse one CSV row describing a HomeShop18 brand page and print the
    # product links found in its search-results block.
    print line
    line2 = line.strip().split(",")
    menulink = line2[0].strip()
    menutitle = line2[1].strip()
    catlink = line2[2].strip()
    cattitle = line2[3].strip()
    subcatlink = line2[4].strip()
    subcatitle = line2[5].strip()
    brandlink = line2[6].strip()
    brandtite = line2[7].strip()
    driver = phan_proxy.main(brandlink)
    driver = driver_scroller(driver)
    driver = sub_scroller(driver)
    page = driver.page_source
    soup = BeautifulSoup(page, "html.parser")
    tag_srchresult = soup.find("div", attrs={"id": "searchResultsDiv"})
    tag_product = tag_srchresult.find_all("p", attrs={"class": "product_title"})
    for al in tag_product:
        print "%s%s" % ("http://www.homeshop18.com", str(al.a.get("href")).strip())
def supermain():
    # Crawl one Snapdeal listing page and print every product link in the grid,
    # then the number of grid rows found.
    link = "http://www.snapdeal.com/products/mens-footwear-casual-shoes?q=Price:399,7899&sort=rec"
    driver = phan_proxy.main(link)
    try:
        driver.find_element_by_xpath("/html/body/div[2]/div/div/div/div/a/img").click()
        logging.debug("clicked..................................................................")
    except:
        pass
    driver = main(driver)
    page = driver.page_source
    soup = BeautifulSoup(page, "html.parser")
    item_big_box_list = soup.find("div", attrs={"id": "products-main4"})
    item_box_list = item_big_box_list.find_all("div", attrs={"class": "product_grid_row"})
    for item_box in item_box_list:
        item_sub_box_list = item_box.find_all("div", attrs={"class": "product_grid_cont gridLayout3"})
        for item_sub_box in item_sub_box_list:
            item_link = item_sub_box.find("a", attrs={"class": "hit-ss-logger somn-track prodLink"}).get("href")
            print item_link
    print len(item_box_list)
    driver.delete_all_cookies()
    driver.quit()
def main():
    # Bootstrap the HomeShop18 crawl: create the dated working directory,
    # record it, load the all-stores page and hand every top-menu link/title
    # pair to the worker threads.
    directory = "dir%s" % (time.strftime("%d%m%Y"))
    try:
        os.makedirs(directory)
    except:
        pass
    f = open("to_extract.txt", "w+")
    print >>f, directory
    f.close()
    f = open("extracted.txt", "a+")
    print >>f, directory
    f.close()
    link = "http://www.homeshop18.com/all-stores.html"
    driver = phan_proxy.main(link)
    try:
        WebDriverWait(driver, 1000).until(ajax_complete, "Timeout waiting for page to load")
    except WebDriverException:
        pass
    try:
        driver.find_element_by_xpath("/html/body/div[7]/div/a").click()
    except:
        pass
    try:
        WebDriverWait(driver, 1000).until(ajax_complete, "Timeout waiting for page to load")
    except WebDriverException:
        pass
    driver = driver_scroller(driver)
    page = driver.page_source
    soup = BeautifulSoup(page, "html.parser")
    tag_menuflyer = soup.find("div", attrs={"class": "bcMenuFlyer"})
    tag_menu_lt = tag_menuflyer.find_all("a")
    ml_mt = []
    for lt in tag_menu_lt:
        menulink = "%s%s" % ("http://www.homeshop18.com", str(lt.get("href")).strip())
        menutitle = str(lt.get_text()).strip()
        ml_mt.append([menulink, menutitle])
    mainthreading(ml_mt)
def main(line):
    # Resolve the embed link for one watch-link row and append the enriched
    # record to the CSV in the current working directory; rows for which no
    # embed link is found go to an error file.
    f = open("to_extract_cat.txt")
    directory = f.read().strip()
    f.close()
    filename2 = "%s/%s" % (directory, "f_ct_cl_st_sl_tp_prt_em_im_wl.csv")
    f2 = open(filename2, "a+")
    line2 = line.strip().split(",")
    line2 = map(str.strip, line2)
    cattitle = line2[0]
    catlink = line2[1]
    subcattitle = line2[2]
    subcatlink = line2[3]
    tpe = line2[4]
    prt = line2[5]
    wl = line2[6]
    img = line2[7]
    driver = phan_proxy.main(wl)
    try:
        WebDriverWait(driver, 1000).until(ajax_complete, "Timeout waiting for page to load")
    except WebDriverException:
        pass
    page = driver.page_source
    tree = html.fromstring(page)
    embedlink = tree.xpath("/html/body/div[2]/center/div/div/div[2]/iframe/@src")
    embedlink2 = tree.xpath("/html/body/object/embed/@src")
    embedlink3 = tree.xpath("/html/body/div[2]/center/div/div/div[2]/object/embed/@src")
    # xpath() returns a list, so test for a non-empty result rather than None.
    if embedlink:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink[0]).strip(), img, wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink[0]).strip(), img, wl])
    elif embedlink2:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink2[0]).strip(), img, wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink2[0]).strip(), img, wl])
    elif embedlink3:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink3[0]).strip(), img, wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink3[0]).strip(), img, wl])
    else:
        f3 = open("page3_filmlink_embedlink_error2.txt", "a+")
        print >>f3, line
        f3.close()
    driver.delete_all_cookies()
    driver.quit()
    f2.close()
def main(directory, link, target, cate):
    # Scrape one Zovi catalog page and append one record per item (link,
    # image, sizes, title, sale tag, ...) to <category>.doc.
    directory2 = "%s/%s/%s" % (directory, target, cate)
    try:
        os.makedirs(directory2)
    except:
        pass
    filename = "%s/%s.doc" % (directory2, cate)
    f = open(filename, "a+")
    driver = phan_proxy.main(link)
    driver = driver_scroller(driver)
    page = driver.page_source
    driver.delete_all_cookies()
    driver.quit()
    soup = BeautifulSoup(page, "html.parser")
    item_box = soup.find("section", attrs={"id": "catalog"})
    items_list = item_box.find_all("div", attrs={"class": "item"})
    for item_tag in items_list:
        sub_cate = item_tag.get("data-tag")
        colour = item_tag.get("data-color")
        price = item_tag.get("data-price")
        gender = item_tag.get("data-gender")
        sku = item_tag.get("data-option")
        item_link = "http://zovi.com%s" % (item_tag.a.get("href"))
        item_image = "http:%s" % (item_tag.a.img.get("data-original"))
        item_size2 = item_tag.find("div", attrs={"class": "available-sizes"})
        try:
            item_size = item_size2.find_all("li", attrs={"class": ""})
            item_size = str(map(tag_text, item_size)).replace(",", " ")
        except:
            item_size = str(item_size2)
        item_title = str(item_tag.find("div", attrs={"class": "title"}).get_text()).replace(",", " ").strip()
        try:
            item_sale = item_tag.find("span", attrs={"class": "tags visible sale"}).get_text().strip()
        except:
            item_sale = " "
        item_info = [link, target, cate, sub_cate, colour, price, gender, sku,
                     item_link, item_image, item_size, item_title, item_sale]
        f.write(str(map(str, item_info)) + "\n")
        logging.debug(item_info)
    f.close()
def main(line, directory):
    # Scrape one Flipkart brand listing and record every product link, both as
    # a full metadata row (.doc) and as a bare link list (.docx).
    line = ast.literal_eval(line)
    line = map(str.strip, line)
    menu = line[0]
    submnlink = line[1]
    submntitle = line[2]
    catlink = line[3]
    cattitle = line[4]
    brandlink = line[-2]
    brandtitle = line[-1]
    start = brandtitle.find("(")
    brandtitle = brandtitle[:start].strip()
    dirtwo = "%s/%s/%s/%s/%s" % (directory, menu, submntitle, cattitle, brandtitle)
    try:
        os.makedirs(dirtwo)
    except:
        pass
    filedoc = "%s/%s.doc" % (dirtwo, brandtitle)
    filedocx = "%s/%s.docx" % (dirtwo, brandtitle)
    f2 = open(filedoc, "a+")
    f3 = open(filedocx, "a+")
    driver = phan_proxy.main(brandlink)
    driver = driver_scroller(driver)
    driver = sub_scroller(driver)
    page = driver.page_source
    soup = BeautifulSoup(page, "html.parser")
    tag_product = soup.find("div", attrs={"id": "products"})
    tag_product_a = tag_product.find_all("a", attrs={"class": "pu-image fk-product-thumb "})
    for al in tag_product_a:
        itemlink = "%s%s" % ("http://www.flipkart.com", str(al.get("href").strip()))
        print >>f2, [menu, submnlink, submntitle, catlink, cattitle, brandlink, brandtitle, itemlink]
        print >>f3, itemlink
        print [menu, submnlink, submntitle, catlink, cattitle, brandlink, brandtitle, itemlink]
    driver.delete_all_cookies()
    driver.quit()
    f2.close()
    f3.close()
def main(): directory = "dir%s" %(time.strftime("%d%m%Y")) f = open("to_extract.txt", "w+") print >>f, directory f.close() f = open("extracted.txt", "a+") print >>f, directory f.close() try: os.makedirs(directory) except: pass link = "http://www.jabong.com" driver = phan_proxy.main(link) try: driver.find_element_by_id("jab-news").click() except: pass page = driver.page_source driver.close() soup = BeautifulSoup(page) tag_li = soup.find("li", attrs={"id":"qa-navigation0"}) h = HTMLParser.HTMLParser() tag_li = h.unescape(str(tag_li)) tag_li = tag_li.replace("\n", " ").replace("<!--", " ").replace("-->", " ") soup = BeautifulSoup(tag_li) tag_a = soup.find_all("a") menu_links = [] for l in tag_a: try: menulink = l.get("href") menu_links.append(menulink) except: pass main2(menu_links)
def wl_to_el(self, line):
    # Convert a watch-link record into an embed-link record by trying the
    # known iframe/embed/input XPaths in order, then write it as a CSV row.
    driver = phan_proxy.main(line[-2])
    page = driver.page_source
    driver.quit()
    tree = html.fromstring(page)
    if len(tree.xpath("/html/body/center/table/tbody/tr[2]/td/iframe/@src")) != 0:
        embedlink = tree.xpath("/html/body/center/table/tbody/tr[2]/td/iframe/@src")[0]
    elif len(tree.xpath("/html/body/center/table/tbody/tr[2]/td/embed/@src")) != 0:
        embedlink = tree.xpath("/html/body/center/table/tbody/tr[2]/td/embed/@src")[0]
    elif len(tree.xpath("/html/body/div[2]/center/div/div/div[2]/object/embed/@src")) != 0:
        embedlink = tree.xpath("/html/body/div[2]/center/div/div/div[2]/object/embed/@src")[0]
    elif len(tree.xpath("/html/body/object/embed/@src")) != 0:
        embedlink = tree.xpath("/html/body/object/embed/@src")[0]
    elif len(tree.xpath("/html/body/div[2]/center/div/div/div[2]/iframe/@src")) != 0:
        embedlink = tree.xpath("/html/body/div[2]/center/div/div/div[2]/iframe/@src")[0]
    elif len(tree.xpath("/html/body/div/div[2]/div[3]/div[3]/div/input/@value")) != 0:
        # The input value holds an HTML snippet; cut the src="..." URL out of it.
        embedlink = tree.xpath("/html/body/div/div[2]/div[3]/div[3]/div/input/@value")
        start = embedlink[0].find("src=")
        end = embedlink[0].find('"', start + 5)
        embedlink = embedlink[0][start + 5:end].strip()
    else:
        embedlink = ""
    try:
        data = [
            filter(None, line[0].split("/"))[3],
            line[0],
            line[1].split("/")[-1][:-5],
            line[1],
            line[2],
            line[3],
            embedlink,
            line[-1],
            line[-2],
        ]
        self.f.write(",".join(data) + "\n")
        logging.debug(("inserted....", embedlink))
    except:
        pass
def main(line):
    # Resolve the embed link for one movie watch-link row and append the
    # enriched record to dt_movie2.csv; unresolved rows go to an error file.
    filename2 = "dt_movie2.csv"
    f2 = open(filename2, "a+")
    line2 = line.strip().split(",")
    line2 = map(str.strip, line2)
    cattitle = line2[1]
    catlink = line2[2]
    subcattitle = line2[3]
    subcatlink = line2[4]
    tpe = line2[5]
    prt = line2[6]
    wl = line2[-3]
    img = line2[-4]
    wl = wl.replace('"', " ").strip()
    driver = phan_proxy.main(wl)
    page = driver.page_source
    tree = html.fromstring(page)
    embedlink = tree.xpath("/html/body/div[2]/center/div/div/div[2]/iframe/@src")
    embedlink2 = tree.xpath("/html/body/object/embed/@src")
    embedlink3 = tree.xpath("/html/body/div[2]/center/div/div/div[2]/object/embed/@src")
    # xpath() returns a list, so test for a non-empty result rather than None.
    if embedlink3:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink3[0]).strip(), img, wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink3[0]).strip(), img, wl])
    elif embedlink2:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink2[0]).strip(), img, wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink2[0]).strip(), img, wl])
    elif embedlink:
        print >>f2, ','.join([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink[0]).strip(), img, wl])
        logging.debug([cattitle, catlink, subcattitle, subcatlink, tpe, prt, str(embedlink[0]).strip(), img, wl])
    else:
        f3 = open("page3_filmlink_embedlink_error2.txt", "a+")
        print >>f3, line
        f3.close()
    driver.delete_all_cookies()
    driver.quit()
    f2.close()
def main4(line, filename):
    # Scroll a fashionandyou menu page until its height stops growing, then
    # append every campaign (category) link it exposes to the output file.
    menulink = line[0]
    menutitle = line[1]
    driver = phan_proxy.main(menulink)
    height = 0
    loop = True
    while loop is True:
        logging.debug("scrolling...")
        time.sleep(1)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        heightnow = driver.execute_script("return $(document ).height();")
        if heightnow == height:
            loop = False
        else:
            height = heightnow
            loop = True
    page = driver.page_source
    soup = BeautifulSoup(page, "html.parser")
    tag_main_comp_holder = soup.find("div", attrs={"class": "campaignHolder mainCampaignHolder"})
    tag_a = tag_main_comp_holder.find_all("a", attrs={"class": "indulgeLink"})
    line2 = str(line).strip("[]").strip()
    f = open(filename, "a+")
    for al in tag_a:
        clink = str(al.get("href")).strip()
        catlink = "%s%s" % ("http://www.fashionandyou.com", clink)
        cattitle = clink.replace("/", "").strip()
        print >>f, ','.join([menulink, menutitle, catlink, cattitle])
        logging.debug((line2, catlink, cattitle))
    f.close()
def main(): directory = "dirflipkart%s" %(time.strftime("%d%m%Y")) try: os.makedirs(directory) except: pass f = open("to_extractfilpkart", "w+") f.write(directory) f.close() f2 = open("extractedflipkart", "a+") f2.write(directory) f2.close() link = "http://www.flipkart.com/" driver = phan_proxy.main(link) page = driver.page_source #soup = BeautifulSoup(page, "html.parser") soup = BeautifulSoup(page) tag_menu = soup.find_all("li", attrs={"class":"menu-l0 "}) menu_subl_subt = [] for menu in tag_menu: menutitle = menu.get("data-key") submenu = menu.find_all("li", attrs={"class":"heading"}) submenu_new = menu.find_all("li", attrs={"class":"new-heading"}) submenu.extend(submenu_new) for l in submenu: submenutitle = str(l.get_text()).strip() submenulink = "%s%s" %("http://www.flipkart.com", str(l.a.get("href")).strip()) menu_subl_subt.append([menutitle, submenulink, submenutitle]) driver.delete_all_cookies() driver.quit() mainthread(menu_subl_subt)
def main(line):
    # Print every product link for one fashionandyou category, narrowing the
    # page to a sub-category first when one is given.
    line2 = line.strip().split(",")
    sub_cat = line2[-1].strip()
    cattitle = line2[-2].strip()
    catlink = line2[-3].strip()
    menutitle = line2[1].strip()
    menulink = line2[0].strip()
    driver = phan_proxy.main(catlink)
    try:
        WebDriverWait(driver, 1000).until(ajax_complete, "Timeout waiting for page to load")
    except WebDriverException:
        pass
    # Compare strings with != rather than "is not", which only tests identity.
    if (sub_cat != "None") and (sub_cat != "All Categories"):
        driver = sub_cat_select(driver, sub_cat)
    driver = driver_scroller(driver)
    page = driver.page_source
    soup = BeautifulSoup(page, "html.parser")
    tag_ul_product = soup.find("ul", attrs={"class": "products"})
    tag_li_dis_block = tag_ul_product.find_all("li", attrs={"style": re.compile(": block;")})
    for al in tag_li_dis_block:
        tag_al_a = al.find("a", attrs={"class": "productLink"})
        if tag_al_a:
            productlink = "%s%s" % ("http://www.fashionandyou.com", str(tag_al_a.get("href")).strip())
            print ','.join([menulink, menutitle, catlink, cattitle, sub_cat, productlink])
    print len(tag_li_dis_block)
    driver.delete_all_cookies()
    driver.quit()
def main():
    # Dump the Snapdeal home page source to a file, then build a nested
    # {menu: {sub-menu title: link}} map from the navigation bar.
    link = "http://www.snapdeal.com/"
    page = phan_proxy.main(link)
    #page = req_proxy.main(link)
    f = open("code1_sourcebody.txt", "a+")
    print >>f, page.encode("ascii", "ignore")
    f.close()
    soup = BeautifulSoup(page, "html.parser")
    tag_nav = soup.find_all("li", attrs={"class": "navlink"})
    menu_list = {}
    for l in tag_nav:
        menu = str(l.a.get_text()).strip()
        menu_list[menu] = {}
        tag_a = l.find_all("a", attrs={"class": "somn-track"})
        for l2 in tag_a:
            sub_menu = str(l2.get_text()).strip()
            sub_menulink = str(l2.get("href")).strip()
            menu_list[menu][sub_menu] = sub_menulink
    f = open("code1_menu_sub_menu_link.txt", "w+")
    print >>f, menu_list
    f.close()
    menu_list.clear()
    print menu_list
    del menu_list
def main(line):
    # For one bagittoday sub-category page, read the brand filter list, then
    # match each product title against it, falling back to the label "private".
    line = ast.literal_eval(line)
    menu = line[0]
    catlink = line[1]
    cattitle = line[2]
    scatlink = line[3]
    scattitle = line[4]
    driver = phan_proxy.main(scatlink)
    driver = driver_scroller(driver)
    page = driver.page_source
    driver.delete_all_cookies()
    driver.quit()
    soup = BeautifulSoup(page, "html.parser")
    tag_brand = soup.find("ul", id="filter_Brands")
    tag_brand_li = tag_brand.find_all("li")
    tag_brand_li = tag_brand_li[1:]
    # Open read-only: we only need the directory name written by the bootstrap step.
    f = open("to_extractbagittoday")
    directory = f.read().strip()
    f.close()
    filename = "%s/%s" % (directory, "f_mn_ct_ct_scl_sct_bt.txt")
    f = open(filename, "a+")
    bt_list2 = []
    for bt in tag_brand_li:
        bt_list2.append(str(bt.span.get_text()))
    bt_list = map(str_lower_strip, bt_list2)
    print bt_list
    tag_pro_image = soup.find_all("div", attrs={"class": "product-image"})
    for l in tag_pro_image:
        product_link = "%s%s" % ("http://www.bagittoday.com", str(l.find("a").get("href")).strip())
        product_image = str(l.find("img").get("data-original")).strip()
        product_title = str(l.find("img").get("title")).lower().strip()
        #product_title_split = map(str_lower_strip, product_title.lower().split(" "))
        #brand = list((Counter(bt_list) & Counter(product_title_split)).elements())
        brand2 = map(functools.partial(re_line_match, line=product_title), bt_list)
        brand = filter(None, brand2)
        if len(brand) == 0:
            brand = "private"
        else:
            brand = brand[0].strip()
        print >>f, [menu, catlink, cattitle, scatlink, scattitle, brand, product_link, product_image, product_title]
        print [menu, catlink, cattitle, scatlink, scattitle, brand, product_link, product_image, product_title]
    f.close()
def main(line):
    # Scrape one HomeShop18 brand page and record every product under the
    # dated directory tree: a .doc with full metadata and a .docx with bare links.
    print line
    line2 = line.strip().split(",")
    menulink = line2[0].strip()
    menutitle = line2[1].strip()
    catlink = line2[2].strip()
    cattitle = line2[3].strip()
    subcatlink = line2[4].strip()
    subcatitle = line2[5].strip()
    brandlink = line2[6].strip()
    brandtite = line2[7].strip()
    driver = phan_proxy.main(brandlink)
    driver = driver_scroller(driver)
    driver = sub_scroller(driver)
    page = driver.page_source
    soup = BeautifulSoup(page, "html.parser")
    tag_srchresult = soup.find("div", attrs={"id": "searchResultsDiv"})
    #tag_product = tag_srchresult.find_all("p", attrs={"class": "product_title"})
    tag_product = tag_srchresult.find_all("p", attrs={"class": "product_image"})
    f = open("to_extract.txt")
    directory = f.read().strip()
    f.close()
    sub_dir = "%s/%s/%s/%s/%s" % (directory, menutitle, cattitle, subcatitle, brandtite)
    try:
        os.makedirs(sub_dir)
    except:
        pass
    filename1 = "%s/%s.doc" % (sub_dir, brandtite)
    filename2 = "%s/%s.docx" % (sub_dir, brandtite)
    f = open(filename1, "a+")
    f2 = open(filename2, "a+")
    for al in tag_product:
        prolink = "%s%s" % ("http://www.homeshop18.com", str(al.a.get("href")).strip())
        prolimg = str(al.a.img.get("data-original")).strip()
        parsed = urlparse(prolimg)
        prolimg = "%s%s" % ("http://www.homeshop18.com", parsed.path)
        print >>f, ",".join([menulink, menutitle, catlink, cattitle, subcatlink, subcatitle, brandlink, brandtite, prolink, prolimg])
        print >>f2, prolink
    f.close()
    f2.close()
def main(): directory = "dirbagittoday%s" % (time.strftime("%d%m%Y")) try: os.makedirs(directory) except: pass f = open("to_extractbagittoday", "w+") print >> f, directory f.close() f2 = open("extractedbagittoday", "a+") print >> f2, directory f2.close() filename = "%s/%s" % (directory, "f_mn_sl_st_scl_sct.txt") f = open(filename, "a+") link = "http://www.bagittoday.com/" driver = phan_proxy.main(link) driver.refresh() page = driver.page_source driver.delete_all_cookies() driver.quit() soup = BeautifulSoup(page) tag_menu = soup.find("ul", attrs={"id": "menu"}) tag_menu_li = tag_menu.find_all("li", attrs={"class": "first"}) req_menu = [ "Men", "Women", "Kids", "Electronics & Mobiles", "Home & Kitchen", "Jewellery" ] req_manu_obj = [] for al in tag_menu_li: if str(al.a.get_text()).strip() in req_menu: req_manu_obj.append(al) req_menu_subcat = {} for menu in req_manu_obj: menu_sub_cat = menu.find_all("li", id=re.compile("subCat")) req_menu_subcat[str(menu.a.get_text()).strip()] = menu_sub_cat mn_scl_sct_ccatl_ccatt = {} for k, v in req_menu_subcat.items(): sub_cat_likn_title = map(sub_cat_extraction, v) mn_scl_sct_ccatl_ccatt[k] = sub_cat_likn_title #print mn_scl_sct_ccatl_ccatt for k, v in mn_scl_sct_ccatl_ccatt.items(): for scl_sct_cl_ct in v: sl_st = scl_sct_cl_ct[0] sl_st = sl_st.split(",") sl = sl_st[0] st = sl_st[1] if scl_sct_cl_ct is None: print >> f, [k, sl, st, sl, st] print[k, sl, st, sl, st] for cl_ct in scl_sct_cl_ct[1]: cl = cl_ct.keys()[0] ct = cl_ct.values()[0] print >> f, [k, sl, st, cl, ct] print[k, sl, st, cl, ct] f.close()
def main(line):
    # Scrape one fashionandyou category (optionally narrowed to a sub-category)
    # and write the product rows under the dated directory tree: a .doc with
    # metadata and a .docx with bare links.
    line2 = line.strip().split(",")
    sub_cat = line2[-1].strip()
    cattitle = line2[-2].strip()
    catlink = line2[-3].strip()
    menutitle = line2[1].strip()
    menulink = line2[0].strip()
    f = open("to_extract.txt")
    directory = f.read().strip()
    f.close()
    if sub_cat == "None":
        sub_cat2 = cattitle
        sub_dir = "%s/%s/%s/%s" % (directory, menutitle, cattitle, sub_cat2)
        filename1 = "%s/%s.doc" % (sub_dir, sub_cat2)
        filename2 = "%s/%s.docx" % (sub_dir, sub_cat2)
    else:
        sub_dir = "%s/%s/%s/%s" % (directory, menutitle, cattitle, sub_cat)
        filename1 = "%s/%s.doc" % (sub_dir, sub_cat)
        filename2 = "%s/%s.docx" % (sub_dir, sub_cat)
    try:
        os.makedirs(sub_dir)
    except:
        pass
    f = open(filename1, "a+")
    f2 = open(filename2, "a+")
    driver = phan_proxy.main(catlink)
    try:
        WebDriverWait(driver, 1000).until(ajax_complete, "Timeout waiting for page to load")
    except WebDriverException:
        pass
    if (sub_cat != "None") and (sub_cat != "All Categories"):
        driver = sub_cat_select(driver, sub_cat)
    driver = driver_scroller(driver)
    page = driver.page_source
    soup = BeautifulSoup(page, "html.parser")
    tag_ul_product = soup.find("ul", attrs={"class": "products"})
    tag_li_dis_block = tag_ul_product.find_all("li", attrs={"style": re.compile(": block;")})
    for al in tag_li_dis_block:
        tag_al_a = al.find("a", attrs={"class": "productLink"})
        if tag_al_a:
            productlink = "%s%s" % ("http://www.fashionandyou.com", str(tag_al_a.get("href")).strip())
            print >>f, ','.join([menulink, menutitle, catlink, cattitle, sub_cat, productlink])
            print >>f2, productlink
            logging.debug([menulink, menutitle, catlink, cattitle, sub_cat, productlink])
    print len(tag_li_dis_block)
    driver.delete_all_cookies()
    driver.quit()
    f.close()
    f2.close()
def main(): directory = "dirbagittoday%s" %(time.strftime("%d%m%Y")) try: os.makedirs(directory) except: pass f = open("to_extractbagittoday", "w+") print >>f, directory f.close() f2 = open("extractedbagittoday", "a+") print >>f2, directory f2.close() filename = "%s/%s" %(directory, "f_mn_sl_st_scl_sct.txt") f = open(filename, "a+") link = "http://www.bagittoday.com/" driver = phan_proxy.main(link) driver.refresh() page = driver.page_source driver.delete_all_cookies() driver.quit() soup = BeautifulSoup(page) tag_menu = soup.find("ul", attrs={"id":"menu"}) tag_menu_li = tag_menu.find_all("li", attrs={"class":"first"}) req_menu = ["Men", "Women", "Kids", "Electronics & Mobiles", "Home & Kitchen", "Jewellery"] req_manu_obj = [] for al in tag_menu_li: if str(al.a.get_text()).strip() in req_menu: req_manu_obj.append(al) req_menu_subcat = {} for menu in req_manu_obj: menu_sub_cat = menu.find_all("li", id = re.compile("subCat")) req_menu_subcat[str(menu.a.get_text()).strip()] = menu_sub_cat mn_scl_sct_ccatl_ccatt = {} for k, v in req_menu_subcat.items(): sub_cat_likn_title = map(sub_cat_extraction, v) mn_scl_sct_ccatl_ccatt[k] = sub_cat_likn_title #print mn_scl_sct_ccatl_ccatt for k, v in mn_scl_sct_ccatl_ccatt.items(): for scl_sct_cl_ct in v: sl_st = scl_sct_cl_ct[0] sl_st = sl_st.split(",") sl = sl_st[0] st = sl_st[1] if scl_sct_cl_ct is None: print >>f, [k, sl, st, sl, st] print [k, sl, st, sl, st] for cl_ct in scl_sct_cl_ct[1]: cl = cl_ct.keys()[0] ct = cl_ct.values()[0] print >>f, [k, sl, st, cl, ct] print [k, sl, st, cl, ct] f.close()