Code Example #1
def main():
    date = time.strftime("%d%m%Y")

    link = "http://www.flipkart.com/"

    page = urll_proxy.main(link)
    html = page.read()
    soup = BeautifulSoup(html)
    page.close()

    tag_menu = soup.find_all("li", attrs={"class":"menu-l0"})
    
    dict_menu_links = {}
    
    for l in tag_menu:
        menu =  l.get("data-key")

        tag_menu_item = l.find_all("li", attrs={"class":"menu-item"})
        
        dict_menu_links[menu] = []

        for l2 in tag_menu_item:
            try:
               l2 = "http://www.flipkart.com" + l2.a.get("href")
               dict_menu_links[menu].append(l2)
            except:
                pass

    return  dict_menu_links
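
Every example on this page fetches pages through urll_proxy.main(link), a helper module from the same project that is not shown here. A minimal sketch of what such a helper might look like, assuming it simply returns a urllib2 response object (the callers use page.read(), page.geturl() and page.close() on the result); the real module presumably adds proxy rotation:

# Hypothetical stand-in for the project's urll_proxy module (not shown on this page).
import urllib2


def main(link):
    # Assumption: the real helper routes requests through rotating proxies;
    # this sketch just opens the URL directly with a browser-like User-Agent.
    request = urllib2.Request(link, headers={"User-Agent": "Mozilla/5.0"})
    return urllib2.urlopen(request)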
Code Example #2
def main():
    link = "http://www.flipkart.com/"
    
    page = urll_proxy.main(link)
    soup = BeautifulSoup(page.read())
    page.close()

    tag_menu = soup.find_all("li", attrs={"class":"menu-l0"})

    dict_menu_link = {}

    for l in tag_menu:
        menu = str(l.get("data-key")).strip()
        tag_menu_item = l.find_all("li", attrs={"class":"menu-item"})

        dict_menu_link[menu] = []

        for l in tag_menu_item:
            try:
                full_link = "http://www.flipkart.com" + l.a.get("href")
                dict_menu_link[menu].append(full_link)
            except:
                pass
     
    f = open("code1_dict_menu_link.txt", "w+")
    print >>f, dict_menu_link
    f.close()
Code Example #3
File: code2.py Project: sulaaardit/flipkaart
def main4(todatdir_menu, sub_link):

    page = urll_proxy.main(str(sub_link).strip())
    html = page.read()
    soup = BeautifulSoup(str(html))
    print page.geturl()
    page.close()

    link_split = sub_link.strip().split("/")
    #print menu + "__" + "__".join(link_split[3:-1])

    #shutil.rmtree(todatdir_menu)

    if re.search(r'[\w]*~brand', link_split[-2]):

        filename = todatdir_menu + "/sublink_alreadybrand.txt"
        f = open(filename, "a+")
        print >> f, sub_link
        f.close()

    elif soup.find_all("ul", attrs={"id": "brand"}):

        filename = todatdir_menu + "/sublink_brand_to_extract.txt"
        f = open(filename, "a+")
        print >> f, sub_link
        f.close()

    else:
        print "pass from main4"
Code Example #4
def main4(todatdir_menu, sub_link):
    
    page = urll_proxy.main(str(sub_link).strip())
    html = page.read()
    soup = BeautifulSoup(str(html))
    print page.geturl()
    page.close()

    link_split = sub_link.strip().split("/")
    #print menu + "__" + "__".join(link_split[3:-1])

    #shutil.rmtree(todatdir_menu)

    if re.search(r'[\w]*~brand', link_split[-2]):

        filename = todatdir_menu + "/sublink_alreadybrand.txt"
        f = open(filename, "a+")
        print >> f, sub_link
        f.close()

    elif soup.find_all("ul", attrs={"id":"brand"}):

        filename = todatdir_menu + "/sublink_brand_to_extract.txt"
        f = open(filename, "a+")
        print >> f, sub_link
        f.close()

    else:
        print "pass from main4"
Code Example #5
def main():

    link = "http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?sid=reh%2Cihu%2Cm08"
    page = urll_proxy.main(link)
    soup = BeautifulSoup(page)
    tag_ul = soup.find_all("ul", attrs={"id": "brand"})

    f = open("page1_bn_bl_bc.csv", "a+")
    f2 = open("page1_brandname", "a+")

    tag_a = tag_ul[0].find_all("a")

    pos = 1
    for l in tag_a:
        brand_link = "http://www.flipkart.com" + str(l.get("href")).strip()
        brand_name = str(l.span.get_text()).strip()
        brand_count = str(l.find("span", attrs={
            "class": "count"
        }).get_text()).strip("()")
        date = str(time.strftime("%d/%m/%Y"))
        print >> f, ','.join(
            [date, str(pos), brand_name, brand_count, brand_link])
        print >> f2, brand_name
        pos = pos + 1

    f.close()
    f2.close()

    shutil.copy2("page1_bn_bl_bc.csv", "page1_bn_bl_bc_new.csv")
    os.remove("page1_bn_bl_bc.csv")
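
The examples above build their CSV rows by hand with Python 2's print >> f. A small sketch of the same row written through the standard csv module instead, which handles quoting automatically; the field values here are placeholders, not data scraped from the page:

import csv
import time

# Placeholder values standing in for the fields scraped above.
pos = 1
brand_name = "SomeBrand"
brand_count = "123"
brand_link = "http://www.flipkart.com/somebrand"

f = open("page1_bn_bl_bc.csv", "ab")  # binary append mode for csv on Python 2
writer = csv.writer(f)
date = time.strftime("%d/%m/%Y")
writer.writerow([date, str(pos), brand_name, brand_count, brand_link])
f.close()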
Code Example #6
def main3(i, q):
    while True:
        dirtwo, l = q.get()
        dirtwo = dirtwo + "/" + l.split("/")[3]

        if not os.path.exists(dirtwo):
            os.makedirs(dirtwo)

        filename = dirtwo + "/" + "-xx-".join(l.split("/")[3:-1]) + "-xx-bnbcbl.csv"

        f = open(filename, "a+")

        page = urll_proxy.main(l)
        soup = BeautifulSoup(page)
        page.close()

        tag_ul = soup.find("ul", attrs={"id":"brand"})
        tag_a = tag_ul.find_all("a")

        pos = 1
        for l in tag_a:
            if l.get("href"):
                brand_link = "http://www.flipkart.com" + str(l.get("href")).strip()
                brand_name = str(l.span.get_text()).strip()
                brand_count = str(l.find("span", attrs={"class":"count"}).get_text()).strip("()")
                #print brand_link, brand_name, brand_count
                date = str(time.strftime("%d:%m:%Y"))
                print >>f, ','.join([date, str(pos), brand_name, brand_count, brand_link])
                pos = pos + 1
Code Example #7
def main3(i, q):
    while True:
        directory, link = q.get()

        page = urll_proxy.main(link)
        html = page.read()
        soup = BeautifulSoup(html)
        page.close()

        if soup.find("ul", attrs={"id": "brand"}):
            filename = directory + "/extract_brand_from_it.txt"
            f = open(filename, "a+")
            print >> f, link
            f.close()

        link_split = link.split("/")[-2]

        if re.search(".*~brand", link.split("/")[-2]):
            filename = directory + "/its_already_brand_.txt"
            f = open(filename, "a+")
            print >> f, link
            f.close()

        tag_nav = soup.find("div", attrs={"class": "nav-section-cat-list"})

        if tag_nav:
            for l in tag_nav:
                try:
                    sub_link = "http://www.flipkart.com" + l.get("href")
                    t = threading.Thread(target=main4,
                                         args=(directory, sub_link))
                    t.start()
                except:
                    pass
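
Each main3(i, q) variant loops forever pulling (directory, link) pairs off a queue, so it is presumably driven elsewhere in the project by a pool of daemon threads. A rough usage sketch under that assumption, using Python 2's Queue module and a placeholder work item (main3 here is the function defined above):

import threading
from Queue import Queue  # named queue in Python 3

q = Queue()

# Hypothetical pool size; each daemon worker runs main3(i, q) forever.
for i in range(4):
    t = threading.Thread(target=main3, args=(i, q))
    t.daemon = True
    t.start()

# Placeholder work item; the project would enqueue real category links here.
q.put(("dir01012015/mens", "http://www.flipkart.com/"))

# Note: q.join() would only return if the workers called q.task_done(),
# which this variant of main3 does not, so the caller must manage shutdown itself.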
Code Example #8
File: code1.py Project: sulaaardit/flipkaart
def main():
    link = "http://www.flipkart.com/"

    page = urll_proxy.main(link)
    soup = BeautifulSoup(page.read())
    page.close()

    tag_menu = soup.find_all("li", attrs={"class": "menu-l0"})

    dict_menu_link = {}

    for l in tag_menu:
        menu = str(l.get("data-key")).strip()
        tag_menu_item = l.find_all("li", attrs={"class": "menu-item"})

        dict_menu_link[menu] = []

        for l in tag_menu_item:
            try:
                full_link = "http://www.flipkart.com" + l.a.get("href")
                dict_menu_link[menu].append(full_link)

            except:
                pass

    f = open("code1_dict_menu_link.txt", "w+")
    print >> f, dict_menu_link
    f.close()
Code Example #9
def main3(i, q):
    while True:
        directory, link = q.get()

        page = urll_proxy.main(link)
        html = page.read()
        soup = BeautifulSoup(html)
        page.close()

        if soup.find("ul", attrs={"id":"brand"}):
            filename = directory + "/extract_brand_from_it.txt"
            f = open(filename, "a+")
            print >>f, link
            f.close()

        link_split = link.split("/")[-2]

        if re.search(".*~brand", link.split("/")[-2]):
            filename = directory + "/its_already_brand_.txt"
            f = open(filename, "a+")
            print >>f, link
            f.close()

        tag_nav = soup.find("div", attrs={"class":"nav-section-cat-list"})

        if tag_nav:
            for l in tag_nav:
                try:
                    sub_link = "http://www.flipkart.com" + l.get("href")
                    t = threading.Thread(target=main4, args=(directory, sub_link))
                    t.start()
                except:
                    pass
Code Example #10
def main():

    link = "http://www.flipkart.com/bags-wallets-belts/bags/hand-bags/pr?sid=reh%2Cihu%2Cm08"
    page = urll_proxy.main(link)
    soup = BeautifulSoup(page)
    tag_ul = soup.find_all("ul", attrs={"id":"brand"})

    f = open("page1_bn_bl_bc.csv", "a+")
    f2 = open("page1_brandname", "a+")

    tag_a = tag_ul[0].find_all("a")
     
    pos = 1
    for l in tag_a:
        brand_link = "http://www.flipkart.com"+str(l.get("href")).strip()
        brand_name = str(l.span.get_text()).strip()
        brand_count = str(l.find("span", attrs={"class":"count"}).get_text()).strip("()")
        date = str(time.strftime("%d/%m/%Y"))
        print >>f, ','.join([date, str(pos), brand_name, brand_count, brand_link])
        print >>f2, brand_name
        pos = pos+1
       

    f.close()
    f2.close()

    shutil.copy2("page1_bn_bl_bc.csv", "page1_bn_bl_bc_new.csv")
    os.remove("page1_bn_bl_bc.csv")
Code Example #11
def main3(i, q):
    while True:
        fpth, l = q.get()

        fpath_list = fpth.split("/")
        l_list = l.split("/")

        page = urll_proxy.main(l)
        html = page.read()
        soup = BeautifulSoup(html)

        tag_ul = soup.find("ul", attrs={"id": "brand"})

        try:
            tag_a = tag_ul.find_all("a")

            for l2 in tag_a:
                try:
                    if l2.get("href"):
                        brdl = "http://www.flipkart.com" + l2.get("href")
                        brdn = l2.span.get_text()
                        brdc = l2.find("span", attrs={
                            "class": "count"
                        }).get_text()

                        fdir = '/'.join(fpath_list[:-1]) + "/" + l_list[3]

                        if not os.path.exists(fdir):
                            os.makedirs(fdir)

                        fhomepage = fdir + "/" + "8".join(
                            l_list[3:-1]) + "8.csv"

                        f = open(fhomepage, "a+")
                        date = time.strftime("%H:%M:%S")
                        print >> f, ','.join([date, brdn, brdc, brdl])
                        f.close()

                        f2 = open(fpath_list[0] + "/availtoscroll", "a+")
                        print >> f2, fdir
                        f2.close()

                    else:
                        pass
                except:
                    pass
        except:
            pass
Code Example #12
File: code2.py Project: sulaaardit/flipkaart
def main2(i, q):
    while True:
        todaydir, menu, l = q.get()

        page = urll_proxy.main(str(l).strip())
        html = page.read()
        soup = BeautifulSoup(str(html))
        print page.geturl()
        page.close()

        link_split = l.strip().split("/")
        #print menu + "__" + "__".join(link_split[3:-1])

        todatdir_menu = todaydir + "/" + menu

        #shutil.rmtree(todatdir_menu)

        if not os.path.exists(todatdir_menu):
            os.makedirs(todatdir_menu)

        if re.search(r'[\w]*~brand', link_split[-2]):

            filename = todatdir_menu + "/alreadybrand.txt"

            f = open(filename, "a+")
            print >> f, l
            f.close()

        elif soup.find_all("ul", attrs={"id": "brand"}):

            filename = todatdir_menu + "/brand_to_extract.txt"
            f = open(filename, "a+")
            print >> f, l
            f.close()

        elif soup.find_all("div", attrs={"class": "nav-section-cat-list"}):

            tag_nav = soup.find_all("div",
                                    attrs={"class": "nav-section-cat-list"})

            main3(todatdir_menu, tag_nav[0])

        else:
            print "pass"

        time.sleep(2)
        q.task_done()
Code Example #13
def main2(i, q):
    while True:
        todaydir, menu, l = q.get()
 
        page = urll_proxy.main(str(l).strip())
        html = page.read()
        soup = BeautifulSoup(str(html))
        print page.geturl()
        page.close()

        link_split = l.strip().split("/")
        #print menu + "__" + "__".join(link_split[3:-1])

        todatdir_menu = todaydir + "/" + menu

        #shutil.rmtree(todatdir_menu)

        if not os.path.exists(todatdir_menu):
            os.makedirs(todatdir_menu)

        if re.search(r'[\w]*~brand', link_split[-2]):

            filename = todatdir_menu + "/alreadybrand.txt"

            f = open(filename, "a+")
            print >> f, l
            f.close()

        elif soup.find_all("ul", attrs={"id":"brand"}):

            filename = todatdir_menu + "/brand_to_extract.txt"
            f = open(filename, "a+")
            print >> f, l
            f.close()

        elif soup.find_all("div", attrs={"class":"nav-section-cat-list"}):
   
            tag_nav = soup.find_all("div", attrs={"class":"nav-section-cat-list"})

            main3(todatdir_menu, tag_nav[0])

        else:
            print "pass"

        time.sleep(2)
        q.task_done()
Code Example #14
def main3(i, q):
    while True:
        fpth, l = q.get()

        fpath_list = fpth.split("/")
        l_list = l.split("/")

        page = urll_proxy.main(l)
        html = page.read()
        soup = BeautifulSoup(html)

        tag_ul = soup.find("ul", attrs={"id":"brand"})

        try:
            tag_a = tag_ul.find_all("a")

            for l2 in tag_a:
                try:
                    if l2.get("href"):
                        brdl = "http://www.flipkart.com" + l2.get("href")
                        brdn = l2.span.get_text()
                        brdc = l2.find("span", attrs={"class":"count"}).get_text()

                        fdir = '/'.join(fpath_list[:-1]) + "/" + l_list[3]

                        if not os.path.exists(fdir):
                            os.makedirs(fdir)

                        fhomepage = fdir + "/" + "8".join(l_list[3:-1]) + "8.csv"

                        f = open(fhomepage, "a+")
                        date = time.strftime("%H:%M:%S")
                        print >>f, ','.join([date, brdn, brdc, brdl])
                        f.close()

                        f2 = open(fpath_list[0] + "/availtoscroll", "a+")
                        print >>f2, fdir
                        f2.close()

                    else:
                        pass
                except:
                    pass
        except:
            pass
Code Example #15
def main4(directory, sub_link):

    page = urll_proxy.main(sub_link)
    html = page.read()
    soup = BeautifulSoup(html)
    page.close()

    link_split = sub_link.split("/")[-2]

    if re.search(".*~brand", sub_link.split("/")[-2]):
        filename = directory + "/sub_link_its_already_brand_.txt"
        f = open(filename, "a+")
        print >> f, sub_link
        f.close()

    if soup.find("ul", attrs={"id": "brand"}):
        filename = directory + "/sublink_extract_brand_from_it.txt"
        f = open(filename, "a+")
        print >> f, sub_link
        f.close()
Code Example #16
def main4(directory, sub_link):
    
    page = urll_proxy.main(sub_link)
    html = page.read()
    soup = BeautifulSoup(html)
    page.close()
 
    link_split = sub_link.split("/")[-2]

    if re.search(".*~brand", sub_link.split("/")[-2]):
        filename = directory + "/sub_link_its_already_brand_.txt"
        f = open(filename, "a+")
        print >>f, sub_link
        f.close()

    if soup.find("ul", attrs={"id":"brand"}):
        filename = directory + "/sublink_extract_brand_from_it.txt"
        f = open(filename, "a+")
        print >>f, sub_link
        f.close()
Code Example #17
def main(link):

    page = urll_proxy.main(link)
    soup = BeautifulSoup(page)
    tag_ul = soup.find_all("ul", attrs={"id": "brand"})

    cat_name = link.split("/")[-2].strip()

    currentdir = os.getcwd()

    currentdate = time.strftime("%d%m%Y")

    branddir = currentdir + "/brand_info_by_date/" + cat_name + currentdate

    if not os.path.exists(branddir):
        subprocess.check_output(['mkdir', '-p', branddir])

    fname = branddir + "/" + cat_name

    f = open(fname + "_bn_bc_bl.csv", "a+")
    f2 = open(fname + "_brandname_brandlink.csv", "a+")

    tag_a = tag_ul[0].find_all("a")

    pos = 1
    for l in tag_a:
        brand_link = "http://www.flipkart.com" + str(l.get("href")).strip()
        brand_name = str(l.span.get_text()).strip()
        brand_count = str(l.find("span", attrs={
            "class": "count"
        }).get_text()).strip("()")
        date = str(time.strftime("%d/%m/%Y"))
        print >> f, ','.join(
            [date, str(pos), brand_name, brand_count, brand_link])
        print >> f2, ','.join([brand_name, brand_link])
        pos = pos + 1

    f.close()
    f2.close()
    print fname + "_brandname_brandlink.csv"
Code Example #18
def main(link):

    page = urll_proxy.main(link)
    soup = BeautifulSoup(page)
    tag_ul = soup.find_all("ul", attrs={"id":"brand"})

    cat_name = link.split("/")[-2].strip()
    
    currentdir = os.getcwd()

    currentdate = time.strftime("%d%m%Y")

    branddir = currentdir+"/brand_info_by_date/"+cat_name+currentdate

    if not os.path.exists(branddir):
        subprocess.check_output(['mkdir', '-p', branddir])
    
    fname = branddir+"/"+cat_name

    f = open(fname+"_bn_bc_bl.csv","a+")
    f2 = open(fname+"_brandname_brandlink.csv", "a+")

    tag_a = tag_ul[0].find_all("a")
     
    pos = 1
    for l in tag_a:
        brand_link = "http://www.flipkart.com"+str(l.get("href")).strip()
        brand_name = str(l.span.get_text()).strip()
        brand_count = str(l.find("span", attrs={"class":"count"}).get_text()).strip("()")
        date = str(time.strftime("%d/%m/%Y"))
        print >>f, ','.join([date, str(pos), brand_name, brand_count, brand_link])
        print >>f2, ','.join([brand_name, brand_link])
        pos = pos+1
       

    f.close()
    f2.close()
    print fname+"_brandname_brandlink.csv"
Code Example #19
def main():

    directory = "dir%s" % (time.strftime("%d%m%Y"))

    try:
        os.makedirs(directory)
    except:
        pass

    f = open("to_extract.txt", "w+")
    print >> f, directory
    f.close()

    f = open("extracted.txt", "a+")
    print >> f, directory
    f.close()

    link = "http://www.fashionandyou.com/"
    #page = req_proxy.main(link)
    page = urll_proxy.main(link)

    soup = BeautifulSoup(page)

    tag_menu = soup.find("ul", attrs={"id": "verticalsMenu"})

    tag_menu_li = tag_menu.find_all("a", attrs={"class": "vertical-tab"})

    menucontainer = []

    for al in tag_menu_li:
        menulink = "%s%s" % ("http://www.fashionandyou.com", str(
            al.get("href")).strip())
        menutitle = str(al.get_text())

        menucontainer.append([menulink, menutitle])

    main2(menucontainer)
Code Example #20
def main():

    directory = "dir%s" %(time.strftime("%d%m%Y"))
    
    try:
        os.makedirs(directory)
    except:
        pass

    f = open("to_extract.txt", "w+")
    print >>f, directory
    f.close()

    f = open("extracted.txt", "a+")
    print >>f, directory
    f.close()

    link = "http://www.fashionandyou.com/"
    #page = req_proxy.main(link)
    page = urll_proxy.main(link)
    
    soup = BeautifulSoup(page)
     
    tag_menu = soup.find("ul", attrs={"id":"verticalsMenu"})

    tag_menu_li = tag_menu.find_all("a", attrs={"class":"vertical-tab"})

    menucontainer  = []

    for al in tag_menu_li:
        menulink =  "%s%s" %("http://www.fashionandyou.com", str(al.get("href")).strip())
        menutitle = str(al.get_text())
 
        menucontainer.append([menulink, menutitle])

    main2(menucontainer)
Code Example #21
            date = str(time.strftime("%d:%m:%Y")).strip()

            f = open(filename,"a+")
            print >>f, ','.join([date, catname, brandname,  item_title, item_price, 
                                 item_image, item_clour, item_mrp, item_seller, item_link, sku, size2, str(tag_dis), str(tag_spec)])
            f.close()
         
            logging.debug([date, catname, brandname,  item_title, item_price,
                          item_image, item_clour, item_mrp, item_seller, item_link, sku, size2, str(tag_dis), str(tag_spec)])

        except:
            f = open("newerrorfile", "a+")
            print >>f, l
    
=======
        page = urll_proxy.main(l)
        assert page
        soup = BeautifulSoup(page)
        page.close()

        tag_h1 = soup.find("h1", attrs={"itemprop":"name"})
        item_title = str(tag_h1.get_text()).strip()

        try:
            tag_colour = soup.find("div", attrs={"class":"line extra_text bmargin10"})
            item_clour = str(tag_colour.get_text()).strip()
        except:
            item_clour = " No more colour"

        tag_img = soup.find("img", attrs={"id":"visible-image-small"})