コード例 #1
0
    def filter_links(raw_links, blacklist=None):
        """Return the category links from raw_links that are worth following.

        A link is kept when all of the following hold:
          * its text does not contain "[IMG]",
          * its text is not one of the fixed site-navigation titles below,
          * its url contains "CategoryDisplay?",
          * its text is not named in blacklist.

        If any kept link has "All" in its text, only that link is returned
        (the last such link when there are several).

        raw_links -- iterable of objects exposing .text and .url
        blacklist -- optional iterable of titles to drop; entries may be plain
                     strings or objects with a .text attribute (e.g. links).
                     None or an empty value disables blacklist filtering.

        Returns a new list; raw_links is not modified.

        progress and cfg are not defined here; they are resolved from the
        surrounding scope and are only used when an "All" link is found.
        """
        # Site-navigation links that are never product categories.
        navigation_titles = (
            "About Us",
            "Privacy",
            "Customer Agreement",
            "No Worries Guarantee",
            "Store Locator",
            "Product Recall",
        )

        # Normalise the blacklist to plain titles. getattr with a default only
        # falls back when the attribute is missing, so unrelated errors (and
        # KeyboardInterrupt) are no longer silently swallowed as they were by
        # the previous bare "except:".
        blocked_titles = [getattr(entry, "text", entry) for entry in (blacklist or ())]

        filtered_links = []
        for link in raw_links:
            title = link.text
            if "[IMG]" in title or title in navigation_titles:
                continue
            if "CategoryDisplay?" not in link.url:
                continue
            if title in blocked_titles:
                continue
            filtered_links.append(link)

        # Look for an "All ..." category, which includes every other option.
        # There should never be more than one; if there is, the last one wins.
        # NOTE(review): this is a substring match, so a title such as
        # "Allergy" would also count -- confirm that this is intended.
        all_link = None
        for link in filtered_links:
            if "All" in link.text:
                all_link = link

        if all_link is not None:
            progress(cfg, "Found embracing category ("+all_link.text+") Ignoring other categories.")
            return [all_link]

        return filtered_links
コード例 #2
0
    # Disabled call, kept for reference:
    #total_status = cfg.show_running_total()

    # Create the mechanize browser used to access the Coles website and crawl it for data.
    browser = mechanize.Browser()

    # Container for the final product list (not filled within this excerpt).
    master_products = []

    # Product counter (not incremented within this excerpt).
    num_products = 0

    # When cfg.VERBOSE is off, print a single status line here.
    # (progress() emits the progress-report messages; presumably it only prints
    # when verbose mode is on -- its definition is not visible here.)
    if not cfg.VERBOSE:
        print "Attempting to scrape colesonline.com..."

    progress(cfg, "Connecting to colesonline.com...")
    # Open the Coles Online home page; the base URL comes from cfg.base().
    browser.open(cfg.base() + "CMLForwardViewCmd?storeId=10052&catalogId=10001&viewName=HomePageDisplay&WT.tsrc=Supermarket&WT.mc_id=Supermarket_COL_Landing")

    # There is a form on this page which asks you to enter your suburb.
    # NOTE(review): an earlier comment said the value is a default from the config
    # object, but the postcode below is hard-coded as "6110" -- confirm which is intended.
    # Hopefully in the future the scraper can be expanded to gather data from ALL
    # stores; for now only one is handled.

    # Select the suburb form, fill in the postcode and submit it.
    browser.select_form(name="RetrieveSuburbsForm")
    browser["postcode"] = "6110"
    progress(cfg, "Entering store...")
    browser.submit()

    # The next page has another form which asks you to select a store from a list.

    # That page uses frames, so first isolate the right frame.