Example #1
    def download(self, url):
        ''' Download the content of a page for later use. The content is saved to a
        file with the quoted url as the filename.
        '''
        resp, content = fetch(url)
        # quote_plus turns the url into a safe filename
        write_to_file(urllib.parse.quote_plus(url), 'w', content.decode('utf-8'))
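
All of these snippets call two helpers, fetch and write_to_file, that are not shown. The (resp, content) return pair and the dict-style response match httplib2's request API, so a minimal sketch of what they might look like, assuming httplib2 (not the original implementation), is:

import urllib.parse  # used by download above for the filename

import httplib2

def fetch(url, method='GET'):
    # assumed helper: returns (response, content) exactly like
    # httplib2.Http.request; response behaves like a dict of headers
    # and also exposes a .status attribute
    http = httplib2.Http()
    return http.request(url, method)

def write_to_file(filename, mode, data):
    # assumed helper: writes data to filename
    # (mode 'w' for decoded text, 'wb' for raw bytes)
    with open(filename, mode) as f:
        f.write(data)

httplib2 follows redirects by default and records the final url under the response's content-location key, which is what the redirect check in Example #3 relies on.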
Example #2
def get_content_as_json(url):
    resp, content = fetch(url)
    assert resp.status == 200
    # the response body is raw bytes, so decode it before parsing
    content = content.decode("utf-8")
    content = json.loads(content)
    return content
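
A short usage sketch, with a hypothetical endpoint url for illustration:

import json  # required by get_content_as_json above

# hypothetical endpoint; any url that returns a JSON body will do
data = get_content_as_json('https://example.com/api/items.json')
print(data)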
Example #3
def download_image(img_url, dir_name):
    '''Downloads an image. Fails if request is redirected.
    '''

    response, img = fetch(img_url, 'GET')
    # with an httplib2-style fetch, content-location holds the final url
    # after redirects, so a mismatch means the request was redirected
    if response['content-location'] != img_url:
        write_to_file(os.path.join(dir_name, 'image_not_found.txt'), 'w',
                      "image at {0} was not found".format(img_url))
        return False

    img_name = os.path.basename(img_url)
    write_to_file(os.path.join(dir_name, img_name), 'wb', img)
    return True
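
A usage sketch with hypothetical arguments; download_image returns False when the request was redirected away from img_url:

import os  # required by download_image above

# hypothetical image url and output directory
if download_image('https://example.com/images/logo.png', 'downloads'):
    print('image saved')
else:
    print('request was redirected; see image_not_found.txt')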
Example #4
    def grab_real_url(self, search_key):
        ''' Given the homepage and a key that identifies an item, try to search the
        website for the item's "real" url.

        Keyword arguments:
        search_key -- key that uniquely identifies the item (e.g. image url)
        '''

        resp, content = fetch(self._search_url.format(search_key))
        soup = bs(content)
        # find returns the first <a> tag with class "overhand"
        result = soup.find('a', 'overhand')
        url = result['href']
        return url
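
This method assumes bs is an alias for BeautifulSoup and that self._search_url is a format string with one placeholder. A hedged sketch of that surrounding setup, with a hypothetical template url:

from bs4 import BeautifulSoup as bs

class Scraper:
    # hypothetical search template; the real one is site-specific
    _search_url = 'https://example.com/search?q={0}'

grab_real_url (and get_item_info in Example #5) would be methods of such a class.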
Example #5
    def get_item_info(self, url, image_url=None):
        ''' Given an item url and an image url, return a json representation of the
        item's information. The image url is useful when the actual url is not:
        it can be used to find the item's "real" url.
        '''
        path = re.split(self._URL_REGEX, url)[2]
        if path.strip() == '':
            if image_url is None:
                return None
            # the url posted is just the homepage
            url = self.grab_real_url(image_url)
        # get content for the item and parse it
        resp, content = fetch(url)
        item = self.scrape(content)
        item.url = url
        return item
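
self._URL_REGEX is not shown, but the split at index 2 only works if the pattern captures the scheme and host as a delimiter, leaving the path as the third element (an empty string for a bare homepage). One pattern with that behavior, offered as an assumption rather than the original, is:

import re

# re.split(_URL_REGEX, 'https://example.com/item/42')
#   -> ['', 'https://example.com', '/item/42']
# re.split(_URL_REGEX, 'https://example.com')
#   -> ['', 'https://example.com', '']
_URL_REGEX = r'(https?://[^/]+)'

In modern code, urllib.parse.urlsplit(url).path would express the same intent more directly.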