Example #1
def Querst_request():
    """通过GET网页获取网页的URL,并且更新库内的url值

    """
    session = returns_session('Sqlextend')()
    to = ot_baidu_search_info
    data = session.query(to).filter(to.id > 694).all()
    if not data:
        return
    for item in data:
        new = ot_baidu_search_info()
        new.id = item.id

        while True:
            try:
                r_url = get(item.url)
                break
            except Exception:
                time.sleep(1)
                print 'request failed, retrying in 1s'

        if r_url.ok:
            new.url = r_url.request.url

            print 'Update old_url = %s, New_url = %s' % (item.url, new.url)
            session.merge(new)
            try:
                session.commit()
            except Exception:
                # A failed commit leaves the session unusable until rollback.
                session.rollback()
                continue
        else:
            print item.id
    session.close()
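
Example #1 leans on the HTTP client following Baidu's redirect links and exposing the address it finally lands on. A minimal standalone sketch of that redirect-resolution step, assuming that get above refers to requests.get (the returns_session and ot_baidu_search_info helpers are project-specific and not reproduced here):

import time
import requests

def resolve_final_url(url, retries=3, delay=1):
    # Follow the redirect chain and return the URL the request ends up at,
    # or None if the page keeps failing or answers with an error status.
    for _ in range(retries):
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            time.sleep(delay)
            continue
        return response.url if response.ok else None
    return None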
Example #2
    def Querst_request(self):
        """搜索实例

        """

        point = insert_database('Sqlextend', tablename=ot_baidu_search_info)
        for key in self.keys:
            # self.firefox.get(self.url)
            for i in range(0, 20):
                self.set_pn(key, i)
                while True:
                    try:
                        self.firefox.get(self.url)
                        break
                    except Exception:
                        # self.firefox.quit()
                        self.reset_firefox()
                        continue

                data = self.firefox.page_source

                if data:
                    xhtml = html.document_fromstring(data)
                    content = zip(xhtml.xpath('//div[@id="content_left"]//div[@class="f13"]//div[@class="c-tools"]'),
                                  xhtml.xpath('//div[@id="content_left"]//div[@class="f13"]//span[@class="g"]'))
                    for title, url in content:
                        db = ot_baidu_search_info()
                        try:
                            db.title = json.loads(
                                title.get('data-tools'))['title'].encode('utf8')
                        except Exception:
                            try:
                                db.title = title.get(
                                    'data-tools').split(':')[1].split(',')[0].replace('"', '').encode('utf8')
                            except IndexError:
                                pass

                        db.url = url.text_content().encode('utf8')
                        db.key = key
                        insert_database(
                            'Sqlextend', tablename=ot_baidu_search_info, editor=db)
                        point.set_value(db)
                        point.insert()
                    """
                    for item in xhtml.xpath('//div[@id="content_left"]//div[@class="f13"]'):
                        #print item.get('href'), item.text_content().encode('utf8')
                        db = ot_baidu_search_info()
                        import pdb
                        pdb.set_trace()
                        db.title = item.xpath('//h3//a')[0].title
                        db.url = item.xpath('//span[@class="g"]')[0].text_content
                        
                        #db.url = item.get('href')
                        db.key = key
                        insert_database('Sqlextend', tablename = ot_baidu_search_info, editor = db)
                        point.set_value(db)
                        point.insert()
                    """
                time.sleep(2)

        self.firefox.close()
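
The scraping half of Example #2 only needs the rendered page markup; the Selenium driver is just the fetcher. A minimal sketch of the parsing step in isolation, assuming the result page still matches the XPath expressions used above (page_source could come from any HTML source, not only Firefox):

import json
from lxml import html

def parse_results(page_source):
    # Pair each result's data-tools JSON payload (title) with its green display URL.
    tree = html.document_fromstring(page_source)
    tools = tree.xpath('//div[@id="content_left"]//div[@class="f13"]//div[@class="c-tools"]')
    greens = tree.xpath('//div[@id="content_left"]//div[@class="f13"]//span[@class="g"]')
    results = []
    for tool, green in zip(tools, greens):
        try:
            title = json.loads(tool.get('data-tools'))['title']
        except (TypeError, ValueError, KeyError):
            title = None  # missing or malformed data-tools attribute
        results.append((title, green.text_content()))
    return results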