Example #1
    def __init__(self, conf):
        # lxml HTML parser decoding pages as UTF-8
        self.parser = etree.HTMLParser(encoding='utf-8')
        self.db = BaseDb()
        self.db.connectdb()
        # shared queue/set handles (the dict is built in Rules(), Example #3)
        self.category_links = conf['category_links']
        self.url_list = conf['url']
        self.url_set = conf['url_set']
Example #2
class Parse_json(BaseParse):
    def __init__(self, conf):
        self.parser = etree.HTMLParser(encoding='utf-8')
        self.db = BaseDb()
        self.db.connectdb()
        self.url_list = conf['url']
        self.url_set = conf['url_set']

    def parse(self, url, page, threadName):
        search = urlparse.parse_qs(url, True).get('q')[0]
        try:
            # the response is non-standard JSON: quote the bare keys so json.loads accepts it
            re_item = re.compile(r'(?<=[{,])\w+')
            page = re_item.sub("\"\g<0>\"", page)
            result = json.loads(page)
        except:
            print 'json parse error,search:%s,page:%s' % (search, page)
            return True
        for item in result.get('result'):
            if (item is None) or (item[0] is None) or item[0] == '':
                continue
            word = format.strip_tags(item[0])
            try:
                self.save2db(search.decode('utf-8'), word, item[1])
            except:
                print 'save to database failed,search word:%s' % search
            self.add2redis(word)
        return True
    
    def save2db(self, search, word, count):
        sql = self.db.sql_insert(search, word, count)
        if self.db.execsql(sql):
            print 'save word:【%s】 to database, success!' % word
        else:
            print 'save word:【%s】 to database, failed' % word
    
    def add2redis(self, word):
        #r = getRedis()
        try:
            # for each token extracted from the word, queue a fresh suggest url
            s = format.extract(word)
            for w in s:
                tmp = suggest_url % w
                #r.rpush('url_list', tmp)
                self.url_list.push(tmp)
        except:
            print 'put to redis error,word:%s' % word
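
The key-quoting step in Parse_json.parse is easy to miss, so here is a standalone sketch of what that regex does, using a made-up response string (the sample values below are purely illustrative and not from the original code):

import re
import json

# made-up non-standard response: the keys are not quoted
page = '{q:"aspirin",result:[["aspirin enteric",56320],["aspirin cardio",4021]]}'

# quote every word that directly follows '{' or ',' so json.loads accepts it;
# note that bare numbers after a comma get quoted too and come back as strings
re_item = re.compile(r'(?<=[{,])\w+')
page = re_item.sub("\"\g<0>\"", page)

result = json.loads(page)
print result.get('result')   # [[u'aspirin enteric', u'56320'], [u'aspirin cardio', u'4021']]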
Example #3
    def Rules(self):
        # linkbase: redis connection holding the url queue and the crawled sets
        linkbase = getRedis(2)
        #linkbase.flushdb()
        db = BaseDb()
        db.connectdb()
        db.getAllCategorys()

        category_links = Categoryids(linkbase)
        url_list = DQueue(linkbase, 'url_news')
        for store in drugstoreurl:
            url_set = Record(linkbase, store)
            if url_list.len() == 0:
                for item in base.category_ids:
                    # build one seed search url per category for this drugstore
                    if store == 'http://search.jianke.com/prod':
                        url = store + '?wd=' + item['name'] + '&catagoryid=' + str(item['id'])
                    elif store == 'http://www.jxdyf.com/search':
                        url = store + '/' + item['name'] + '.html?catagoryid=' + str(item['id'])
                    elif store == 'http://search.360kad.com':
                        url = store + '?pageText=' + item['name'] + '&catagoryid=' + str(item['id'])
                    elif store == 'http://www.ehaoyao.com/search':
                        url = store + '/search/' + item['name'] + '?catagoryid=' + str(item['id'])
                    elif store == 'http://www.yaofang.cn/n/public/search':
                        url = store + '?s_words=' + item['name'] + '&sort=interrelated&catagoryid=' + str(item['id'])

                    url_list.push(url)
            base.url_maps = get_Maps()
            # register reload_handler for signal 60
            signal.signal(60, self.reload_handler)
            conf = {
                'url': url_list,
                'url_set': url_set,
                'category_links': category_links
            }
            self.AddRules(conf, 'Parse_url', 'url', 10)
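
DQueue, Record, Categoryids, getRedis, BaseDb, and base are project helpers whose implementations are not shown in these examples. As a rough idea of the queue side only, here is a minimal sketch, assuming the redis-py client, of a DQueue-style list queue exposing the push/pop/len calls used in Rules(); the real class may differ:

import redis

def getRedis(db=0):
    # assumed: getRedis(n) returns a client bound to redis database n
    return redis.StrictRedis(host='localhost', port=6379, db=db)

class DQueue(object):
    """Sketch of a FIFO queue kept in a redis list under a fixed key."""
    def __init__(self, conn, key):
        self.conn = conn
        self.key = key

    def push(self, url):
        # append to the tail of the list
        self.conn.rpush(self.key, url)

    def pop(self):
        # take from the head of the list; returns None when empty
        return self.conn.lpop(self.key)

    def len(self):
        return self.conn.llen(self.key)

Record (the seen/crawled sets) and Categoryids would sit on the same connection, but their exact semantics are not visible in these examples, so they are left out of the sketch.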
Example #4
    def __init__(self, conf):
        self.parser = etree.HTMLParser(encoding='utf-8')
        self.db = BaseDb()
        self.db.connectdb()
        self.url_list = conf['url']
        self.url_set = conf['url_set']
Example #5
class Parse_url(BaseParse):
    def __init__(self, conf):
        self.parser = etree.HTMLParser(encoding='utf-8')
        self.db = BaseDb()
        self.db.connectdb()
        self.category_links = conf['category_links']
        self.url_list = conf['url']
        self.url_set = conf['url_set']

    def parse(self, url, page, threadName):
        # match an extraction rule for this url
        rule = self.findrules(url)
        if not rule:
            print 'No xpath rule found for: ' + url
            self.db.exception(url, 1)  # no matching rule
            return False

        # extract information with regular expressions
        if 'page_re' in rule:
            self.extract_re(rule, url, page)
            return True
        # build a DOM from the html page
        try:
            self.html = etree.HTML(page, self.parser)
        except:
            print sys.exc_info()[0],url
            return False

        self.crawled = 'crawled_set'
        if 'crawled' in rule:
            self.crawled = rule['crawled']

        # extract information with xpath
        if 'link_xpath' in rule:
            self.extract_link(rule,url)

        if 'page_xpath' in rule:
            if not self.extract_page_xpath(rule, url):
                return True

        if 'extra' in rule:
            self.doextra(rule, url)

        self.url_set.crawled(url, self.crawled)
        return True

    def doextra(self, rule, url):
        try:
            # rule['extra'] is a python expression; its result is queued as a follow-up url
            tmp = eval(rule['extra'])
            tmp += '&table=%s&url=%s' % (rule['table'], url)
            self.url_list.lpush(tmp)
        except:
            return None

    def geturlparam(self, url, param):
        result = urlparse.urlparse(url)
        return urlparse.parse_qs(result.query, True)[param][0] 
    
    def extract_link(self, rule, url):
        categoryid = self.geturlparam(url, 'catagoryid')
        pre_url = ''
        if 'pre_url' in rule:
            pre_url = rule['pre_url']
        for lx in rule['link_xpath']:
            urls = self.html.xpath(lx)
            for i in urls:
                # normalise the link, then queue it in the linkbase
                if i[0] == '/':
                    tmp = pre_url + i
                elif i.find('http://') == 0:
                    tmp = i
                else:
                    tmp = pre_url + '/' + i
                if not self.url_set.exist(tmp, self.crawled):
                    print 'push into url_list:' + tmp
                    self.url_list.push(tmp)
                    self.category_links.set(categoryid, tmp)
                    self.url_set.insert(tmp, self.crawled)

    def extract_page_xpath(self, rule, url):
        self.item.clear()
        self.item['url'] = url
        try:
            for k,v in rule['page_xpath'].items():
                self.add_xpath2(k, v)
             
            if self.item['name'] == '0':
                if 'page_xpath2' in rule:
                    self.item.clear()
                    self.item['url'] = url
                    for k,v in rule['page_xpath2'].items():
                        self.add_xpath2(k, v)
                    if self.item['name'] == '0':
                        self.db.exception(url, 2)
                        return False
                else:
                    self.db.exception(url, 2) 
                    return False
        except:
            info=sys.exc_info()
            print info[0],":---",info[1]
            self.db.exception(url, 2) 
            return False

         
        # save the extracted item into mysql
        print self.category_links.get(url)
        print url
        if 'type' in rule:
            if rule['type'] == 'product':
                self.item['drugstore'] = rule['drugstore']
                self.item['category_id'] = self.category_links.get(url)
                self.item['store'] = rule['store']
                self.item['price'] = self.search(r'\d+(\.\d*)?', str(self.item['price']))
                self.item['original_price'] = self.search(r'\d+(\.\d*)?', str(self.item['original_price']))
        print self.item
        if self.url_set.isnewpage(url, self.crawled):  # insert
            sql = self.db.sql_insert(rule['table'], self.item)
            if self.db.execsql(sql):
                return True
            else:
                self.db.exception(url, 3)
                return False

        else:  # update
            sql = self.db.sql_update(rule['table'], self.item)
            if self.db.execsql(sql):
                return True
            else:
                self.db.exception(url, 4)
                return False
        

    def extract_re(self, rule, url, page):
        self.item.clear()
        try:
            self.item['url'] = self.geturlparam(url,'url')
            table = self.geturlparam(url, 'table')
        except:
            self.db.exception(url, 5)
            return False

        for k,v in rule['page_re'].items():
            self.item[k] = self.search(v, page)

        sql = self.db.sql_update(table, self.item)
        if not self.db.execsql(sql):
            self.db.exception(url, 5)

    def search(self, pattern, text):
        result = re.search(pattern, text)
        if result:
            return result.group()
        else:
            return None
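
For reference, a tiny self-contained illustration (the sample strings are made up) of how the search helper above strips a numeric price out of raw page text; this Example #5 version returns the whole match via group(), whereas the Example #6 variant below returns group(1):

import re

def search(pattern, text):
    # same behaviour as Parse_url.search in Example #5: whole match, or None
    result = re.search(pattern, text)
    if result:
        return result.group()
    return None

# made-up price strings as they might come back from the page xpath
print search(r'\d+(\.\d*)?', '23.50 yuan / box')   # 23.50
print search(r'\d+(\.\d*)?', 'price: 18')          # 18
print search(r'\d+(\.\d*)?', 'sold out')           # None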
Example #6
class Parse_url(BaseParse):
    def __init__(self, conf):
        self.parser = etree.HTMLParser(encoding='utf-8')
        self.db = BaseDb()
        self.db.connectdb()
        self.url_list = conf['url']
        self.url_set = conf['url_set']

    def parse(self, url, page, threadName):
        # match an extraction rule for this url
        rule = self.findrules(url)
        if not rule:
            print 'No xpath rule found for: ' + url
            self.db.exception(url, 1)  # no matching rule
            return False

        # extract information with regular expressions
        if 'page_re' in rule:
            self.extract_re(rule, url, page)
            return True

        # build a DOM from the html page
        try:
            self.html = etree.HTML(page, self.parser)
        except:
            print sys.exc_info()[0],url
            return False

        self.crawled = 'crawled_set'
        if 'crawled' in rule:
            self.crawled = rule['crawled']

        # extract information with xpath
        if 'link_xpath' in rule:
            self.extract_link(rule)

        if 'page_xpath' in rule:
            if not self.extract_page_xpath(rule, url):
                return True

        if 'extra' in rule:
            self.doextra(rule, url)

        self.url_set.crawled(url, self.crawled)
        return True

    def doextra(self, rule, url):
        try:
            tmp = eval(rule['extra'])
            tmp += '&table=%s&url=%s' % (rule['table'], url)
            self.url_list.lpush(tmp)
        except:
            return None

    def geturlparam(self, url, param):
        result = urlparse.urlparse(url)
        return urlparse.parse_qs(result.query, True)[param][0] 
    
    def extract_link(self, rule):
        pre_url = ''
        if 'pre_url' in rule: pre_url = rule['pre_url']
        for lx in rule['link_xpath']:
            urls = self.html.xpath(lx)
            for i in urls:
                #save the url to the queue of linkbase
                if i[0] == '/': 
                    tmp = pre_url + i
                elif i.find('http://') == 0:
                    tmp = i
                else:
                    tmp = pre_url + '/' + i
                if not self.url_set.exist(tmp, self.crawled):
                    print 'push into url_list:' + tmp
                    self.url_list.push(tmp)
                    self.url_set.insert(tmp, self.crawled)

    def extract_page_xpath(self, rule, url):
        self.item.clear()
        self.item['url'] = url
        try:
            for k,v in rule['page_xpath'].items():
                self.add_xpath2(k, v)

            if self.item['name'] == '0':
                if 'page_xpath2' in rule:
                    self.item.clear()
                    self.item['url'] = url
                    for k,v in rule['page_xpath2'].items():
                        self.add_xpath2(k, v)
                    if self.item['name'] == '0':
                        self.db.exception(url, 2)
                        return False
                else:
                    self.db.exception(url, 2) 
                    return False
        except:
            info=sys.exc_info()
            print info[0],":---",info[1]
            self.db.exception(url, 2) 
            return False

            
        # save the extracted item into mysql
        if self.url_set.isnewpage(url, self.crawled):  # insert
            sql = self.db.sql_insert(rule['table'], self.item)
            if self.db.execsql(sql):
                return True
            else:
                self.db.exception(url, 3)
                return False

        else:  # update
            sql = self.db.sql_update(rule['table'], self.item)
            if self.db.execsql(sql):
                return True
            else:
                self.db.exception(url, 4)
                return False
        

    def extract_re(self, rule, url, page):
        self.item.clear()
        try:
            self.item['url'] = self.geturlparam(url,'url')
            table = self.geturlparam(url, 'table')
        except:
            self.db.exception(url, 5)
            return False

        for k,v in rule['page_re'].items():
            self.item[k] = self.search(v, page)

        sql = self.db.sql_update(table, self.item)
        if not self.db.execsql(sql):
            self.db.exception(url, 5)

    def search(self, pattern, text):
        result = re.search(pattern, text)
        if result:
            return result.group(1)
        else:
            return None
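
geturlparam is small enough to try on its own; below is a quick usage sketch with a made-up search URL in the shape that Rules() builds in Example #3 (the values are illustrative only):

import urlparse

def geturlparam(url, param):
    # same logic as Parse_url.geturlparam: first value of the given query parameter
    result = urlparse.urlparse(url)
    return urlparse.parse_qs(result.query, True)[param][0]

url = 'http://search.jianke.com/prod?wd=aspirin&catagoryid=12'
print geturlparam(url, 'catagoryid')   # 12
print geturlparam(url, 'wd')           # aspirin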