Esempio n. 1
0
    def fetch(self):
        """Scrape the list page and emit ``object_found`` for every item.

        Downloads the page at ``LIST_URL``, parses it as GBK-encoded HTML and,
        for each node matched by ``LIST_XPATH``, reads:

        * the ``title`` and ``href`` attributes of the ``TITLE_PATH`` element,
        * the ``src`` attribute of the ``PREVIEW_PATH`` element,
        * the text of the ``PRICE_PATH`` element.

        Each item is logged and sent through the ``object_found`` signal with
        the keyword arguments ``time`` (today's date as ``YYYY-MM-DD``),
        ``title``, ``url``, ``preview`` and ``price``.

        List entries that lack one of the three sub-elements are logged and
        skipped so a single malformed entry does not abort the whole run.
        """
        self.logger.info("Start fetching data from www.360buy.com...")
        parser = etree.HTMLParser(encoding='gbk')

        response = urllib2.urlopen(LIST_URL)
        try:
            text = response.read(-1)
        finally:
            # Release the HTTP connection even when the read fails.
            response.close()

        tree = etree.HTML(text, parser=parser)
        if tree is None:
            # Guard for the case where the parser yields no root element.
            self.logger.info("No parsable HTML returned, nothing to fetch")
            return

        # Named ``today`` (not ``time``) so the stdlib module name is not
        # shadowed; the signal keyword below is still ``time``.
        today = datetime.datetime.now().date().strftime('%Y-%m-%d')

        for node in tree.xpath(LIST_XPATH):
            title_node = node.find(TITLE_PATH)
            preview_node = node.find(PREVIEW_PATH)
            price_node = node.find(PRICE_PATH)
            if title_node is None or preview_node is None or price_node is None:
                # find() returns None when the sub-element is absent.
                self.logger.info("Skipping list entry with a missing "
                                 "title, preview or price element")
                continue

            title = title_node.attrib['title']
            url = title_node.attrib['href']
            preview = preview_node.attrib['src']
            price = price_node.text

            self.logger.info("%s: %s - %s" % (today, title, url))
            self.logger.info("%s - %s" % (price, preview))
            object_found.send(self, time=today, title=title, url=url,
                              preview=preview, price=price)
Esempio n. 2
0
    def _get_objects_from_url(self, url):
        """Collect the entries listed on the page at *url*.

        The page is downloaded, parsed as GBK-encoded HTML and searched with
        ``LIST_XPATH``.  For every matched node:

        * ``li[1]/a`` supplies the title text,
        * ``li[2]`` supplies the date text (parsed as ``%Y-%m-%d``),
        * ``li[3]/a`` supplies the link to a detail page, resolved against
          *url*; that page is downloaded and decoded from GBK.

        The title is split on single spaces; the first part is sent as
        ``contest`` and the second as ``info``.  Each processed entry is
        printed, logged and sent through the ``object_found`` signal, with
        the decoded detail page passed as ``qipu``.

        Processing is best-effort per entry: an entry that is incomplete or
        that fails to parse or download is skipped, and the failure is logged
        instead of being silently discarded.

        Args:
            url: address of the list page to scrape.

        Returns:
            A list of dicts with the keys ``'time'``, ``'title'`` and
            ``'url'``, one per successfully processed entry.
        """
        objects = []
        parser = etree.HTMLParser(encoding='gbk')

        response = urllib2.urlopen(url)
        try:
            page = response.read(-1)
        finally:
            # Release the HTTP connection even when the read fails.
            response.close()

        tree = etree.HTML(page, parser=parser)
        nodes = tree.xpath(LIST_XPATH)

        for node in nodes:
            try:
                title_node = node.find('li[1]/a')
                time_node = node.find('li[2]')
                url_node = node.find('li[3]/a')
                if url_node is None or title_node is None or time_node is None:
                    # find() returns None for a missing element; such rows
                    # cannot be processed.
                    continue

                # Validate the cheap fields before paying for a download.
                title = title_node.text
                title_parts = title.split(' ') if title else []
                if len(title_parts) < 2:
                    self.logger.info("Skipping entry with unexpected title: %r"
                                     % (title,))
                    continue
                contest = title_parts[0]
                contest_info = title_parts[1]
                entry_time = datetime.datetime.strptime(time_node.text,
                                                        "%Y-%m-%d")

                new_url = urlparse.urljoin(
                    url, url_node.attrib['href']).replace('../', '')
                detail = urllib2.urlopen(new_url)
                try:
                    qipu = detail.read(-1).decode('gbk')
                finally:
                    detail.close()

                object_got = {
                    'time': entry_time,
                    'title': title,
                    'url': new_url,
                }

                # Parenthesised single-argument form: same output under the
                # Python 2 print statement, and valid syntax on Python 3.
                print("        Object retrieved: %s" % object_got['title'])
                self.logger.info("%s: %s - %s" % (entry_time, title, new_url))

                objects.append(object_got)
                object_found.send(self, time=entry_time, title=title,
                                  url=new_url, check=True, contest=contest,
                                  info=contest_info, qipu=qipu)

            except Exception:
                # Keep going with the remaining entries, but leave a trace.
                # Unlike a bare ``except``, this lets KeyboardInterrupt and
                # SystemExit propagate.
                self.logger.exception("Failed to process an entry from %s"
                                      % url)

        return objects