def get_provider_info(content, element_num):
    """Count suppliers and "golden product" marks across all provider pages.

    Parameters:
        content: HTML of the first provider page (page 0), already fetched.
        element_num: element identifier used to build follow-up page URLs.

    Returns:
        [total_provider_num, total_golden_product] — total supplier tables
        and total occurrences of the '黄金产品' (golden product) label.
    """
    def _count_page(page_html):
        # One 'table.ProdGN_4' per supplier; golden products are flagged by
        # the text '黄金产品' inside <font> tags within those tables.
        tables = pQuery(page_html)('table.ProdGN_4')
        golden = pQuery(tables('font')).text().count(u'黄金产品')
        return tables.length, golden

    # The pager sits in the siblings after '#ContentPlaceHolder1_ProductSupplier';
    # each <font> there corresponds to one page.
    jq_pa = pQuery(content)
    page_num = jq_pa('#ContentPlaceHolder1_ProductSupplier').nextAll()('font').length

    # Page 0 was already fetched by the caller; count it first.
    total_provider_num, total_golden_product = _count_page(content)

    # Fetch and count the remaining pages (originally this duplicated the
    # page-0 counting code verbatim, and returned the same value from both
    # branches of a redundant if/else).
    for i_page_num in range(1, page_num):
        url = get_provider_page_uri(element_num, i_page_num)
        page_html = get_content(url, running_state)
        providers, golden = _count_page(page_html)
        total_provider_num += providers
        total_golden_product += golden

    return [total_provider_num, total_golden_product]
def itemGetResource(self, itemQuey):
    """Open the item's work page and append every image URL to its resources."""
    work_href = itemQuey.find('._work').attr('href')
    listing = self.sphinx.getQuey('http://www.pixiv.net/' + work_href)
    for image_node in listing.find('.item-container .image'):
        src = pQuery(image_node).attr('data-src')
        self.itemData['resource'].append(src)
def get_resource_id(course):
    """Return (name, resource-id) pairs collected from each course page."""
    results = []
    for name, url in course:
        logging.info('process {0}, {1}'.format(name, url))
        page = requests.get(base_url + url).content
        inputs = pQuery(page)('#vlink_1 ul li input')
        results.extend((name, node.attrib['value']) for node in inputs)
    return results
def find_page_resource(pattern, page):
    """Return (name, href) for every course link whose alt text matches pattern."""
    matches = []
    for anchor in pQuery(page)('li.gxbox ul li.l1 a'):
        course_name = anchor.attrib['alt']
        if not pattern.search(course_name):
            continue
        logging.info('find course {0}'.format(course_name))
        matches.append((course_name, anchor.attrib['href']))
    return matches
def pieces(self):
    """Yield a Piece object for each unique work link found on this page."""
    page_query = self.sphinx.getQuey(self.url)
    work_links = page_query.find('._image-items .image-item .work')
    # Deduplicate via a set first; iteration order is therefore
    # unspecified, exactly as before.
    unique_urls = {self.info['host'] + pQuery(node).attr('href')
                   for node in work_links}

    def generate(url_set):
        for piece_url in url_set:
            piece = Piece()
            piece.info = self.info
            piece.url = piece_url
            yield piece

    return generate(unique_urls)
def lookup(*words):
    """Look up *words* (space-joined) on Baidu dict and print the first
    simple-means explanation paragraph.

    Side effects: logs the words and the built URL, prints the explanation.
    """
    uri = "http://dict.baidu.com/s?wd=%(word)s"
    logger.info(words)
    # Join with single spaces; replaces the manual concatenate-then-trim
    # loop (`_word += word + ' '` followed by `_word[:-1]`), which also
    # shadowed the loop variable `word` into the format dict.
    url = uri % {"word": " ".join(words)}
    logger.info(url)
    doc = pQuery(url=url)
    explain = doc('#en-simple-means>div').eq(0).find('p').text()
    print(explain)
def auto_checknovelupdate(): url = "http://www.biquge.la/book/14/" # url = "http://www.baidu.com" # 抓取网页 page = urllib2.urlopen(url) # 解码 text = unicode(page.read(), "gbk") # print text # 转成jQuery对象 jQuery = pQuery(text) # 取出页面元素 title = jQuery("#info>h1").html() updatetime = jQuery("#info>p:eq(2)").text() chapter = jQuery("#info>p:eq(3)>a").text() # 提取更新时间 strTime = u"最后更新:" pos_start = updatetime.find(strTime) time_len = len(strTime) pos_start = pos_start + time_len updatetime = updatetime[pos_start:] # 字符串转成时间 t = datetime.datetime.strptime(updatetime, "%Y-%m-%d %H:%M") updatetime = t.strftime('%Y-%m-%d %H:%M:%S') # 更新数据库 bUpdate = update_db_row(title, updatetime, chapter) smstext = "" if bUpdate: # 拼接短信内容 smstext = u"【赢创天下科技】[%s]更新了,最后更新:%s,最新章节:%s" % (title, updatetime, chapter) write_log(LOG_FILE_NAME, smstext) send_smd(smstext) else: logtext = "--------不发送短信--------" write_log(LOG_FILE_NAME, logtext) return smstext
def analyzer(self):
    """Walk the paginated listing, yielding a generator of page objects
    for each batch of pager links discovered.

    The pager shows up to 8 links; when a full pager is visible, jump 5
    pages past the last visible link and continue; otherwise stop.
    """
    pageNext = True
    pageUrl = self._buildUrl(self.url, {'p': 1})

    def pageListGenerator(pageSet):
        # One page object per collected URL (renamed loop variable; the
        # original shadowed the enclosing `pageUrl`).
        for itemUrl in pageSet:
            pageObj = self.__class__(itemUrl)
            pageObj.info = self.info
            pageObj.url = itemUrl
            yield pageObj

    while pageNext:
        pageSet = set([])
        listQuery = self.sphinx.getQuey(pageUrl)
        pageUrlData = self._queryUrl(pageUrl)
        currentPage = listQuery.find(
            '.column-order-menu:eq(0) .page-list .current').html()
        # Include the fetched page itself when it is the current one,
        # or when the pager has no "current" marker at all.
        if (pageUrlData['query']['p'] == currentPage) or currentPage is None:
            pageSet.add(pageUrl)
        pageList = listQuery.find('.column-order-menu:eq(0) .page-list')
        pageListPages = pageList.find('a')
        for pageItem in pageListPages:
            pageSet.add(self.info['baseUrl'] + pQuery(pageItem).attr('href'))

        def pageSort(pageLink):
            # BUG FIX: sort by each link's own 'p' query parameter. The
            # original read the loop-invariant closure variable `pageUrl`,
            # returning the same key for every element — a no-op sort.
            return int(self._queryUrl(pageLink)['query']['p'])

        pageSet = sorted(pageSet, key=pageSort)
        yield pageListGenerator(pageSet)
        if len(pageListPages) >= 8:
            # Full pager visible: jump 5 pages past the last link shown.
            lastPage = self.info['baseUrl'] + listQuery.find(
                '.column-order-menu:eq(0) .page-list li:last a').attr('href')
            pageUrl = self._buildUrl(lastPage, {
                'p': int(self._queryUrl(lastPage)['query']['p']) + 5
            })
        else:
            pageNext = False
def get_main_info(content):
    """Parse the element table, fetch provider stats per row, write to xls.

    Each qualifying <tr> (at least 3 <a> cells) yields: CAS, Chinese name,
    chemical name, MF (the <span> text), provider count and golden-product
    count, written to worksheet `ws` at the row's ordinal position.
    """
    # Parse the document with pyquery
    jq = pQuery(content)
    row_num = 0
    for i_tr in jq('tr'):
        td = pQuery(i_tr)
        arr = td('td')
        # Skip header/short rows
        if len(arr) < 3:
            continue
        row_num += 1
        chemical_name = arr('a').eq(0)
        chinese_name = arr('a').eq(1)
        cas = arr('a').eq(2)
        # BUG FIX: pyquery's .attr is called, not subscripted — the
        # original `.attr['href']` raised TypeError; '.attr("href")' is
        # how the rest of this file reads attributes.
        href = str(pQuery(chemical_name).attr('href'))
        element_num = re.findall(r'\d+', href)
        # Provider stats come from a separate paginated page per element.
        provider_url = get_provider_page_uri(element_num[0], 0)
        provider_content = get_content(provider_url, running_state)
        provider_info = get_provider_info(provider_content, element_num[0])
        line_count = row_num
        print('Line Count: ' + str(line_count))
        ws.write(line_count, 0, pQuery(cas).text())
        ws.write(line_count, 1, pQuery(chinese_name).text())
        ws.write(line_count, 2, pQuery(chemical_name).text())
        ws.write(line_count, 3, pQuery(arr('span')).text())
        ws.write(line_count, 4, int(provider_info[0]))
        ws.write(line_count, 5, int(provider_info[1]))
        # (Removed dead `line_count += 1`: the value was recomputed from
        # row_num on every iteration, so the increment had no effect.)
def getQuey(self, url):
    """Fetch *url* via self.get and return the body wrapped as a pQuery doc."""
    return pQuery(self.get(url))