import urlparse
from BSXPath import BSXPathEvaluator

def getinfo(url, html):
    document = BSXPathEvaluator(html)
    setting = {}
    setting['next_xpath'] = u"//a[contains(text(),'下章') or contains(text(),'下一章') or contains(text(),'下一页') or contains(text(),'下页')]"
    setting['title_xpath'] = "//title"
    next_link = document.getFirstItem(setting['next_xpath'])['href']  # get the next-page URL
    next_url = urlparse.urljoin(url, next_link)  # resolve it to an absolute URL
    title = document.getFirstItem(setting['title_xpath']).string
    #site=root=urlparse.urlparse(url).netloc
    return title, next_url
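A minimal usage sketch for this helper, assuming BSXPath is importable and the page actually contains one of the "next" links; the URL below is only a placeholder:

# Illustrative usage only: the URL is a placeholder and the page is assumed to
# contain a matching "next" link (getFirstItem would otherwise return None and
# the ['href'] lookup above would fail).
import urllib2

page_url = 'http://example.com/book/chapter-1.html'  # placeholder
page_html = urllib2.urlopen(page_url).read()
chapter_title, next_page = getinfo(page_url, page_html)
print chapter_title, '->', next_page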
def get(self):
    from google.appengine.api import urlfetch
    from BeautifulSoup import BeautifulSoup
    from BSXPath import BSXPathEvaluator, XPathResult
    result = urlfetch.fetch(url="http://www.u17.com/comic_list/le!_th99_gr99_ca99_ss99_ob0_m0_p1.html",
                            headers={'dd': 'dd'})
    if result.status_code == 200:
        doc = BSXPathEvaluator(result.content)  # /OL[20]/DIV[1]/A[1]/IMG[1]
        r = doc.getFirstItem('/html[1]/BODY[1]/DIV[8]/DIV[3]/DIV[2]/DIV[12]')
        self.response.out.write(r)
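This get() method presumably lives on a webapp.RequestHandler subclass; a minimal sketch of how such a handler is wired up on the old App Engine Python 2 runtime follows (the class name and route are assumptions, only the get() body comes from the snippet):

# Sketch only: ComicListPage and the '/' route are assumed names, not part of
# the original example.
from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app

class ComicListPage(webapp.RequestHandler):
    def get(self):
        # ... body as in the example above ...
        pass

application = webapp.WSGIApplication([('/', ComicListPage)], debug=True)

def main():
    run_wsgi_app(application)

if __name__ == '__main__':
    main()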
def parse_document(document):
    BSXdocument = BSXPathEvaluator(document)
    XPath_table = './/*[@id="main"]/p[2]/table'
    XPath_table_body = '%s/tbody' % (XPath_table)
    XPath_table_header = '%s/tr[1]' % (XPath_table_body)
    XPath_table_lines = '%s/tr' % (XPath_table_body)
    rows = BSXdocument.getItemList(XPath_table_lines)[1:]
    for row_counter in xrange(len(rows)):
        row = rows[row_counter]
        # print row
        # print "======"
        rowDoc = BSXPathEvaluator('%s' % row)
        XPath_table_row = '/'
        XPath_table_row_cell_category = '%s/td[%d]/text()' % (XPath_table_row, 1)
        cell_category = rowDoc.getFirstItem(XPath_table_row_cell_category)
        XPath_table_row_cell_type = '%s/td[%d]/text()' % (XPath_table_row, 2)
        cell_type = rowDoc.getFirstItem(XPath_table_row_cell_type)
        XPath_table_row_cell_time = '%s/td[%d]/text()' % (XPath_table_row, 3)
        cell_time = rowDoc.getFirstItem(XPath_table_row_cell_time)
        XPath_table_row_cell_level = '%s/td[%d]/text()' % (XPath_table_row, 4)
        cell_level = rowDoc.getFirstItem(XPath_table_row_cell_level)
        XPath_table_row_cell_message = '%s/td[%d]/text()' % (XPath_table_row, 5)
        cell_message = rowDoc.getFirstItem(XPath_table_row_cell_message)
        print "======", row_counter, "======"
        print "Category:", cell_category
        print "Type:", cell_type
        print "Time:", cell_time
        print "Level:", cell_level
        print "Message:", cell_message
    return rows
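A small driver for this parser might read a saved copy of the log page and print the rows; the file name here is an assumption:

# Illustrative driver: the HTML file name is an assumption.
if __name__ == '__main__':
    f = open('log_page.html', 'r')
    html = f.read()
    f.close()
    data_rows = parse_document(html)
    print "parsed %d data rows" % len(data_rows)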
import urlparse
from BSXPath import BSXPathEvaluator

def getinfo(url, html):
    document = BSXPathEvaluator(html)
    setting = {}
    setting['next_xpath'] = u"//a[contains(text(),'下章') or contains(text(),'下一章') or contains(text(),'下一页') or contains(text(),'下页') or contains(text(),'下一节')]"
    setting['title_xpath'] = "//title"
    title = '' + document.getFirstItem(setting['title_xpath']).string
    next_link = document.getItemList(setting['next_xpath'])
    if len(next_link) == 0:
        return title, None
    next_url = urlparse.urljoin(url, next_link[0]['href'])  # resolve to an absolute URL
    #site=root=urlparse.urlparse(url).netloc
    return title, next_url
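Since this variant returns None for next_url when no matching link exists, it can drive a simple chapter-following loop; the sketch below is illustrative (the start URL and page limit are assumptions):

# Hypothetical crawl loop built on getinfo(); start_url and max_pages are
# illustrative assumptions.
import urllib2

def crawl_chapters(start_url, max_pages=10):
    url = start_url
    visited = set()
    while url and url not in visited and len(visited) < max_pages:
        visited.add(url)
        html = urllib2.urlopen(url).read()
        title, url = getinfo(url, html)
        print title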
def parse_document(document):
    BSXdocument = BSXPathEvaluator(document)
    XPath_table = './/*[@id="main"]/p[2]/table'
    XPath_table_body = '%s/tbody' % (XPath_table)
    XPath_table_header = '%s/tr[1]' % (XPath_table_body)
    XPath_table_lines = '%s/tr' % (XPath_table_body)
    rows = BSXdocument.getItemList(XPath_table_lines)[1:]
    # get cloud name
    fjson = open('panda_queues.json', 'r')
    data = fjson.read()
    dic = json.loads(data)
    fjson.close()
    records = []
    ex_record = []
    exist_records = []
    in_buf_records = []
    maxId = db.get_max_id()
    last_time = db.get_last_updated_time()
    if last_time is None:
        db.first_last_updated_time()
        last_time = db.get_last_updated_time()
    this_time = None
    skip_time = None
    set_last = None
    this_year = datetime.date.today().year
    if maxId is None:
        maxId = 0
    processed_rows = 0
    for row_counter in xrange(len(rows)):
        record = ()
        ex_rec = ()
        SHIFT = 0
        row = rows[row_counter]
        rowDoc = BSXPathEvaluator('%s' % row)
        #XPath_table_row = '%s/tr[%d]' % (XPath_table_body, row_counter+1)
        XPath_table_row = '/'
        """
        XPath_table_row_cell_category = '%s/td[%d]/text()' % (XPath_table_row, 1)
        cell_category = BSXdocument.getItemList(XPath_table_row_cell_category)
        if len(cell_category)>0:
            cell_category = cell_category[0]
        XPath_table_row_cell_type = '%s/td[%d]/text()' % (XPath_table_row, 2)
        cell_type = BSXdocument.getItemList(XPath_table_row_cell_type)
        if len(cell_type)>0:
            cell_type = cell_type[0]
        """
        XPath_table_row_cell_time = '%s/td[%d]/text()' % (XPath_table_row, 3)
        cell_time = rowDoc.getFirstItem(XPath_table_row_cell_time)
        #if len(cell_time)>0:
        #    cell_time = cell_time[0]
        """
        XPath_table_row_cell_level = '%s/td[%d]/text()' % (XPath_table_row, 4)
        cell_level = BSXdocument.getItemList(XPath_table_row_cell_level)
        if len(cell_level)>0:
            cell_level = cell_level[0]
        """
        XPath_table_row_cell_message = '%s/td[%d]/text()' % (XPath_table_row, 5)
        cell_message = rowDoc.getFirstItem(XPath_table_row_cell_message)
        #if len(cell_message)>0:
        #    cell_message = cell_message[0]
        message_category = "no.category"
        message_date = ""
        message_time = ""
        message_dn = ""
        message_jobset = "no.jobset"
        message_jobdef = "no.jobdef"
        message_action = ""
        message_site = "no.site"
        message_reason = "no.reason"
        message_weight = "no.weight"
        message_datetime = str(cell_time).split(' ')
        message_date = message_datetime[0].strip()
        message_time = message_datetime[1].strip()
        # Skip the leading uncompleted minute
        log_year = get_log_year(this_year, message_date, message_time)
        this_time = "%s-%s %s" % (log_year, message_date, message_time)
        if skip_time is None or skip_time == this_time:
            skip_time = this_time
            continue
        # set the last updated time when the skip is done (records are in time DESC order)
        if set_last is None:
            # save it when everything is done
            set_last = this_time
        # break when we reach the last_time
        if (last_time is not None) and (this_time <= last_time):
            break
        # print 'Debug:', message_date, message_time, row_counter, cell_message
        processed_rows += 1
        tmp_message = str(cell_message.replace(' ', ' ')).split(' : ')
        message_dn = tmp_message[0].split('=')[1].replace("\\\'", "").strip().replace(' ', '_')
        tmp_job = tmp_message[1].split(' ')
        if len(tmp_job) > 1:
            message_jobset = tmp_job[0].split('=')[1].strip()
            message_jobdef = tmp_job[1].split('=')[1].strip()
        else:
            if is_this_category(tmp_job[0], 'jobset'):
                message_jobset = tmp_job[0].split('=')[1].strip()
            if is_this_category(tmp_job[0], 'jobdef'):
                message_jobdef = tmp_job[0].split('=')[1].strip()
        ###print;print;print
        #print u'DEBUG: date time=', message_date, message_time
        #print u'DEBUG: dn=', message_dn
        #print u'DEBUG: jobset=', message_jobset
        #print u'DEBUG: jobdef=', message_jobdef
        #print u'DEBUG: ln113: tmp_message[1]=', tmp_message[1]
        #print u'DEBUG: ln113: tmp_message[2]=', tmp_message[2]
        ## skip
        if is_this_category(cell_message, ' action=skip '):
            # continue # try to speed up
            message_category = "D"
            message_skip = tmp_message[2].split(' ')
            message_action = message_skip[0].split('=')[1].strip()
            message_site = message_skip[1].split('=')[1].strip()
            message_reason = message_skip[2].split('=')[1].strip()
            if re.search('=', message_skip[4]):
                message_weight = message_skip[4].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_skip[3:]).strip('_')
        ## exclude: added 2011-10-26
        elif is_this_category(cell_message, ' action=exclude '):
            message_category = "E"
            message_skip = tmp_message[2].split(' ')
            message_action = message_skip[0].split('=')[1].strip()
            message_site = message_skip[1].split('=')[1].strip()
            message_reason = message_skip[2].split('=')[1].strip()
            if re.search('=', message_skip[4]):
                message_weight = message_skip[4].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_skip[3:]).strip('_')
            site_name, cloud = get_sitecloud_name(dic, message_site)
            if is_excluded(ex_record, message_dn, message_jobset, site_name):
                message_category = "D"  # skip if excluded by another jobdef of the same jobset
            else:
                ex_rec = (message_dn, message_jobset, site_name)
                ex_record.insert(0, ex_rec)
        ## choose
        elif is_this_category(cell_message, ' action=choose '):
            message_category = "C"
            message_choose = tmp_message[2].split(' ')
            message_action = message_choose[0].split('=')[1].strip()
            message_site = message_choose[1].split('=')[1].strip()
            message_reason = message_choose[2].split('=')[1].strip()
            if re.search('=', message_choose[5]):
                message_weight = message_choose[5].split('=')[1].strip()
            else:
                message_reason = '_'.join(message_choose[3:]).strip('_')
        ## action=use: added 2011-10-26
        elif is_this_category(cell_message, ' action=use '):
            #message_category = "C"
            message_choose = tmp_message[2].split(' ')
            message_action = message_choose[0].split('=')[1].strip()
            message_site = message_choose[1].split('=')[1].strip()
            # message_reason = message_choose[2].split('=')[1].strip()
            message_reason = '_'.join(message_choose[3:]).strip('_')
            if is_this_category(message_reason, 'site'):
                message_category = "A"
            if is_this_category(message_reason, 'cloud'):
                message_category = "B"
        ## use site or cloud
        elif is_this_category(cell_message, ' use '):
            message_use = tmp_message[2].split(' ')
            message_action = message_use[0].strip()
            message_site = message_use[1].strip()
            message_reason = '_'.join(message_use[3:]).strip('_')
            if is_this_category(message_reason, 'site'):
                message_category = "A"
            if is_this_category(message_reason, 'cloud'):
                message_category = "B"
        ## other actions
        elif is_this_category(cell_message, ' action='):
            message_buf = tmp_message[2].split(' ')
            message_action = message_buf[0].split('=')[1].strip()
            print "WARNING: action=%s is not processed!" % message_action
        ## append to the records the row belongs to
        if message_category in ['A', 'B', 'C', 'E']:
            logDate = str("%s-%s" % (log_year, message_date))
            rec_idx = None
            site_name, cloud = get_sitecloud_name(dic, message_site)
            dailyLogId = db.is_exist_item(logDate, message_category, site_name, message_dn)
            if dailyLogId is None:
                rec_idx = is_in_buf(records, logDate, message_category, site_name, message_dn)
            if dailyLogId is not None:
                exist_records.append([dailyLogId])
            elif rec_idx is not None:
                record = (logDate, message_category, site_name, message_dn)
                in_buf_records.append(record)
            else:
                maxId += 1
                count = 1
                record = (maxId, logDate, message_category, site_name,
                          cloud, message_dn, count)
                records.append(record)
        if DEBUG == 1:
            print "========="
            print "DEBUG:", message_category, ": ", row
            print "========="
    db.set_last_updated_time(set_last)  # set when all done
    if (this_time is not None) and not (this_time <= last_time):
        print "Error: === did NOT reach the last updated time (%s -> %s) ===" % (this_time, last_time)
    return (processed_rows, records, exist_records, in_buf_records)
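This parser relies on several helpers (is_this_category, is_in_buf, get_log_year, get_sitecloud_name, is_excluded, and the db module) defined elsewhere in the project; the sketches below are only plausible stand-ins inferred from how they are called, not the project's real implementations:

# Assumed helper sketches, inferred from the call sites above; the real
# implementations live elsewhere in the project and may differ.
def is_this_category(text, keyword):
    # used as a plain substring test on the log message / reason strings
    return keyword in str(text)

def is_in_buf(records, log_date, category, site_name, dn):
    # records holds (id, logDate, category, site, cloud, dn, count) tuples;
    # return the index of a matching buffered record, or None
    for idx, rec in enumerate(records):
        if (rec[1], rec[2], rec[3], rec[5]) == (log_date, category, site_name, dn):
            return idx
    return None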