def get_items(string_to_search):
    """Build Discogs items from the markup fetched for ``string_to_search``.

    The data returned by ``get_data_from_server`` is joined into one string
    and parsed.  Every row of the ``mpitems`` table except the first
    (presumably the header row) becomes an ``Item``.  A row is skipped when
    it has no image in its first cell or no linked ``br_item_title`` span in
    its second cell.  The price is only set when the fifth cell contains a
    ``price`` span.

    Returns a list of ``Item`` objects; the list is empty when the page has
    no ``mpitems`` table or the table has no data rows.
    """
    data = get_data_from_server(string_to_search)
    soup = BeautifulSoup(''.join(data))
    items = []

    table = soup.find('table', {'class': 'mpitems'})
    if table is None:
        # No results table on the page: nothing to parse.
        return items

    # Slicing (rather than popping the first row) tolerates an empty table.
    for row in table.findAll('tr')[1:]:
        cols = row.findAll('td', recursive=False)
        if len(cols) < 2:
            # Image and title live in the first two cells.
            continue

        image = cols[0].find('img')
        if image is None:
            continue

        title_span = cols[1].find('span', {'class': 'br_item_title'})
        anchor = title_span.a if title_span is not None else None
        if anchor is None:
            continue

        new_item = Item()
        new_item.image = image['src']
        new_item.title = unicode(anchor.string)
        new_item.link = 'http://www.discogs.com' + anchor['href']
        new_item.fromPage = 'Discogs'

        # The price is optional; rows with fewer than five cells have none.
        if len(cols) > 4:
            price_span = cols[4].find('span', {'class': 'price'})
            if price_span is not None:
                new_item.price = unicode(price_span.string)

        items.append(new_item)
    return items
Example #2
0
 def get(self):
     """Index and store an item taken from the request, then render the main page.

     The ``t`` request parameter is used as the item's title and the ``l``
     parameter as its link.  After the item has been handed to
     ``index_item_and_store_item``, ``templates/main.html`` (relative to this
     file) is rendered with an empty set of template values and written to
     the response.
     """
     title, link = self.request.get('t'), self.request.get('l')

     new_item = Item()
     new_item.title = title
     new_item.link = link
     index_item_and_store_item(new_item)

     here = os.path.dirname(__file__)
     main_page = os.path.join(here, 'templates/main.html')
     rendered = template.render(main_page, {})
     self.response.out.write(rendered)
def parse_data_from_server(html_data):
    """Extract TodoColeccion items from ``html_data``.

    Every ``div`` with class ``item`` that contains an anchor with class
    ``nombre sin_subrayar`` yields one ``Item``; divs without that anchor
    are skipped.  The image is only set when the ``foto`` div holds an
    ``img`` tag with a ``src`` attribute, so a listing without a thumbnail
    does not abort the whole parse.

    Returns the list of ``Item`` objects in the order the divs were found.
    """
    soup = BeautifulSoup(html_data)
    items = []
    for entry in soup.findAll('div', {'class': 'item'}):
        name = entry.find('a', {'class': 'nombre sin_subrayar'})
        if name is None:
            continue

        new_item = Item()
        new_item.title = unicode(name.string)
        # Price extraction is disabled here; the price is left empty.
        new_item.price = ''
        new_item.link = "http://www.todocoleccion.net" + name['href']

        # The thumbnail is optional: look it up step by step instead of
        # chaining attribute access on a possibly missing tag.
        photo = entry.find('div', {'class': 'foto'})
        img = photo.img if photo is not None else None
        src = img.get('src') if img is not None else None
        if src:
            new_item.image = src

        new_item.fromPage = 'TodoColeccion'
        items.append(new_item)
    return items
Example #4
0
    def parse_item(self, response, url):
        """
        Build an `Item` from a fetched page, then pass the page on for url discovery.

        For every `<meta>` tag whose `name` is `description`, the item's
        `name_jp` is set to the tag's `content`, `name_zh` to the result of
        `trans.translate` on that value, and `link` to `response.url`.
        If several such tags exist, the last one wins; if none exist, the
        item is returned without those attributes being set here.
        Afterwards the parsed document and `url` are handed to
        `self.feed_new_urls`.

        @param response - the object returned by `requests.get()`
        @param url - the url object that has just been crawled
        @return the `Item` built from the page
        """
        document = bs4.BeautifulSoup(response.text, 'html.parser')
        item = Item()

        for meta in document.find_all('meta'):
            if meta.get('name') != 'description':
                continue
            item.name_jp = meta.get('content')
            item.name_zh = trans.translate(item.name_jp)
            item.link = response.url

        self.feed_new_urls(document, url)
        return item