Beispiel #1
0
    def _getdetail(self):
        """Fetch every URL in ``self.links`` and record its page details.

        For each URL a ``Gethtml_and_soup`` object is built (the second
        argument is a random integer between 10 and 30; its meaning is
        defined by that class), the page is fetched and parsed, and three
        values are stored on ``self.links[url]``:

        * ``'title'``       -- text of the first ``<h1>``, CR/LF removed
        * ``'content'``     -- text of ``div.userCarPhotoMemo``, CR/LF removed
        * ``'ownerdetail'`` -- text of ``h2.car_title.car_header`` (unmodified)

        If anything fails for a URL, the error and the URL are printed and
        all three values are set to ``'=NODATA='``.
        """
        cr_pattern = re.compile(r'\r')
        lf_pattern = re.compile(r'\n')

        def strip_newlines(text):
            # Carriage returns are removed first, then line feeds.
            return lf_pattern.sub('', cr_pattern.sub('', text))

        for url in self.links:
            page = gethtml_and_soup.Gethtml_and_soup(url, random.randint(10, 30))
            try:
                page.gethtml()
                page.getsoup()
                title = strip_newlines(page.soup.find('h1').text)
                memo = page.soup.find('div', {'class': 'userCarPhotoMemo'})
                content = strip_newlines(memo.text)
                header = page.soup.find('h2', {'class': 'car_title car_header'})
                owner = header.text
            except Exception as err:
                # Any failure (fetch, parse, missing element) discards all
                # three fields for this URL, not just the one that failed.
                print(err)
                print('url:{url}'.format(url=url))
                title = content = owner = '=NODATA='

            entry = self.links[url]
            entry['title'] = title
            entry['content'] = content
            entry['ownerdetail'] = owner
Beispiel #2
0
 def getlinks(self):
     """Walk listing pages 1..49, processing and saving each one in turn.

     For every page number the URL is built from ``self.baseurl``, a
     ``Gethtml_and_soup`` object is stored on ``self.ob`` and the page is
     fetched. The first fetch error is printed and ends the walk. After a
     successful fetch the page is parsed and ``_spoitlink``, ``_getdetail``
     and ``save_contents`` are called, in that order.
     """
     for page_no in range(1, 50):
         print('page:', page_no)
         self.url = self.baseurl.format(page=page_no)
         self.ob = gethtml_and_soup.Gethtml_and_soup(
             self.url, random.randint(10, 30))

         try:
             self.ob.gethtml()
         except Exception as err:
             # A failed fetch stops the whole walk rather than skipping
             # just this page.
             print(err)
             break

         # Only the fetch is guarded; errors from the steps below propagate.
         self.ob.getsoup()
         self._spoitlink()
         self._getdetail()
         self.save_contents()
Beispiel #3
0
    def run(self):
        """Consume URLs from ``self.queue`` and emit scraped details.

        Loops until a ``None`` sentinel is taken from ``self.queue``. For
        every other URL it sleeps 10 seconds, fetches and parses the page
        through ``Gethtml_and_soup``, extracts title, content and owner,
        and puts a ``(url, title, owner, content)`` tuple on ``self.queue2``.

        Each field is extracted independently: a field that cannot be read
        is replaced by ``'=NO DATA='`` and the error is printed, so one
        tuple is always produced per URL.
        """
        rdelete = re.compile(r'\r')
        ndelete = re.compile(r'\n')

        def strip_newlines(text):
            # Remove carriage returns first, then line feeds.
            return ndelete.sub('', rdelete.sub('', text))

        while True:
            url = self.queue.get()
            logging.warning('NAME:{name}---{url}'.format(name=self.getName(),
                                                         url=url))
            # Identity check: None is the shutdown sentinel.
            if url is None:
                break
            time.sleep(10)
            ob = gethtml_and_soup.Gethtml_and_soup(url, random.randint(10, 30))
            try:
                ob.gethtml()
                ob.getsoup()
            except Exception as e:
                # Deliberately not skipping the URL: the lookups below are
                # still attempted and are expected to fall back to the
                # placeholder, so the consumer still gets one tuple per URL.
                print(e, ':row33')
            try:
                ob.title = strip_newlines(ob.soup.find('h1').text)
            except Exception as e:
                print(e, ':row39')
                ob.title = '=NO DATA='
            try:
                ob.content = strip_newlines(ob.soup.find('div', {
                    'class': 'userCarPhotoMemo'
                }).text)
            except Exception as e:
                print(e, 'NAME:{name}--:row46'.format(name=self.getName()))
                ob.content = '=NO DATA='
            try:
                # The owner text is stored as-is (newlines are not stripped).
                ob.owner = ob.soup.find('h2', {
                    'class': 'car_title car_header'
                }).text
            except Exception as e:
                print(e, 'NAME:{name}--:row51'.format(name=self.getName()))
                ob.owner = '=NO DATA='

            logging.debug('{title}({owner})--{url}--{content}'.format(
                title=ob.title, owner=ob.owner, url=ob.url,
                content=ob.content))
            self.queue2.put((ob.url, ob.title, ob.owner, ob.content))
Beispiel #4
0
 def getlinks(self):
     """Collect links from listing pages 1..49 and store them in the DB.

     Phase 1: for every page number the URL is built from ``self.baseurl``,
     the page is fetched and parsed via ``Gethtml_and_soup`` and
     ``_spoitlink`` is called. The first error is printed and ends the
     page walk; links gathered so far are still stored.

     Phase 2: every key of ``self.links`` is inserted into the ``links``
     table of ``minkara.db``. A failing insert is printed together with
     its URL and the remaining URLs are still processed.
     """
     for i in range(1, 50):
         print('page:', i)
         self.url = self.baseurl.format(page=i)
         self.ob = gethtml_and_soup.Gethtml_and_soup(
             self.url, random.randint(10, 30))
         try:
             self.ob.gethtml()
             self.ob.getsoup()
             self._spoitlink()
         except Exception as e:
             print(e)
             break

     # Nothing to store: do not touch the database at all.
     if not self.links:
         return

     insertsql = 'insert into links(url) values(?)'
     # One connection for the whole batch instead of one per link.
     # NOTE(review): ``db`` is presumably sqlite3 (or another DB-API
     # module); with sqlite3, ``with connection:`` only commits or rolls
     # back and never closes, so the connection is closed explicitly here.
     con = db.connect('minkara.db')
     try:
         try:
             con.execute(
                 'create table if not exists links(url text unique)')
             con.commit()
         except Exception as e:
             # Without the table no insert can succeed.
             print(e)
             return
         for k in self.links:
             try:
                 con.execute(insertsql, (k, ))
                 # Commit per row so earlier links survive a later failure.
                 con.commit()
             except Exception as e:
                 print(e, k)
     finally:
         con.close()