def _render_list_item(self, concept):
    # Wrap the concept's phrasing in an <li> element keyed by the concept id.
    concept.render_object.li = Pq('<li></li>')
    concept.render_object.li.attr('id', '%s-%s' % (concept.id, 'li'))
    concept.render_object.li.append(
        concept.render_object.phr_span.children().remove())
    concept.render_object.phr_span.append(concept.render_object.li)
    # Flag whether the parent concept renders as an ordered or unordered list.
    if AutoAttributeEngine.is_ordered_list(concept.get_parent(), None, self.document):
        concept.render_object.render_as_ordered_list = True
    elif AutoAttributeEngine.is_unordered_list(concept.get_parent(), None, self.document):
        concept.render_object.render_as_unordered_list = True
def findEachBuilding(self, html):
    doc = Pq(html)
    a_list = doc("a.e_huangse")
    for a in a_list:
        self._apartment_detail["BUILDING_NUM"] = doc(a).text()
        # The building URL is embedded in the onclick attribute between single quotes.
        href = doc(a).attr("onclick")
        href = href[href.index("'") + 1:]
        href = href[:href.index("'")]
        url = self._base_url + href
        # doc_str = self.get_page_content_str(url)
        # self._extract_data(doc_str)
        # time.sleep(1)
        self.save_building(url)
def _extract_data(self, doc_str):
    doc = Pq(doc_str)
    self._comcode_detail["province"] = doc('.content>ul>li>h1').text()
    tr_list = doc('.content>table>tr')
    for tr in tr_list:
        try:
            # time.sleep(1)
            td_list = doc(tr).find("td")
            self._comcode_detail["city"] = doc(td_list[0]).find("a").text()
            a_list = doc(td_list[1]).find("a")
            for a in a_list:
                self._comcode_detail["area"] = doc(a).text()
                url = self._base_url + doc(a).attr("href")
                # html = self.get_page_content_str(url)
                # self._extract_data2(html)
                insert_sql = ("INSERT INTO fetch_list2 (source_id, url, times, page, STATUS) "
                              "VALUES (98, '{}', 0, 0, 0)").format(url)
                print("insert sql is [" + insert_sql + "]")
                Dao.execute_dmls(insert_sql)
        except IndexError:
            print("error in " + doc(tr).text())
def extractKeywords(subpagecfg: dict, keydic: dict, websitedomain: str, name: str,
                    keywordmap: dict, othermap: dict, tagkeyword: dict, filterwords=None):
    url = str(keydic.get('website'))
    if url is not None:
        if url.startswith(websitedomain):
            html = SpiderApi.getPageSourceCode(url)
            try:
                contentselector = subpagecfg.get('contentselector')
                keyselector = subpagecfg.get('keyselector1')
                while True:
                    soup = BeautifulSoup(html, 'lxml')
                    SpiderApi.deleteNoise(soup)  # strip <style>, <script> and other noise tags
                    allcontent = soup.text
                    allcontent = allcontent.replace('\n', '').replace('\r', '')
                    # Extract the startdate and enddate fields.
                    extractDateByRegular(allcontent, othermap, keydic)
                    # Precisely extract the location field.
                    preciseExtractLocation(allcontent, othermap, keydic)
                    tablesoup = BeautifulSoup(html, 'lxml')
                    content = soup.select(contentselector)
                    if len(content) > 0:
                        tmpsoup = BeautifulSoup(str(content[0]), 'lxml')
                        content = content[0].get_text()
                        writeToFile(name + '.txt', content)
                        lines = formatReadlines(name + '.txt')
                        removeFile(name + '.txt')
                        # print(content)
                        # print('* ' * 50)
                        table = tablesoup.select(keyselector)
                        if len(table) > 0:
                            tablehtml = table[0]
                            elements = []
                            for row in tablehtml.children:
                                if not isinstance(row, NavigableString):
                                    rowcontent = str(row.get_text()).replace('\t', '') \
                                        .replace('\r\n', '').replace('\n', '')
                                    elements.append(rowcontent)
                            # for ele in elements:
                            #     print(ele)
                            matchKeywords(elements, websitedomain, keywordmap, othermap, keydic)
                        extractWebsiteField(lines, tmpsoup, websitedomain, keydic)
                        extractTagFiles(tagkeyword, keydic, filterwords)
                        break
                    else:
                        # No match yet: narrow the html to the content selector and retry.
                        html = getHtmlCore(html)
                        doc = Pq(html)
                        html = str(doc(contentselector))
            except Exception as e:
                print("method extractKeywords exec exception:\n {}".format(traceback.format_exc()))
def get_products():
    html = browser.page_source
    doc = Pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text().replace('\n', ''),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text().replace('\n', ''),
            'shop': item.find('.shopname').text(),
            'location': item.find('.location').text()
        }
        print(product)
        save_to_mongo(product)
def _extract_data(self, url):
    community_id = self._save_community()
    doc_str = self.get_page_content_str(url)
    doc = Pq(doc_str)
    tr_list = doc("table>tr")
    try:
        for tr in tr_list:
            floor_num = Pq(tr)("td:eq(0)").text()
            a_list = doc(tr).find("td.preview>a")
            for a in a_list:
                apartment_detail = {
                    'COMMUNITY_ID': community_id,
                    'FLOOR_NUM': floor_num,
                    'APARTMENT_NUM': doc(a).text(),
                    'STATUS': '2',
                    'create_time': time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime(time.time()))
                }
                self._save_apartment(apartment_detail)
        sql = "update communities set status = '2' where ORIGINAL_URL = '{}' ; ".format(url)
        Dao.execute_dmls(sql)
    except Exception as e:
        print(e)
        sql = "update communities set status = -1 where ORIGINAL_URL = '{}' ; ".format(url)
        Dao.execute_dmls(sql)
def book_from_div(div):
    """Extract a single book's information from a div."""
    e = Pq(div)  # single-character name for a small-scope variable
    b = Book()
    b.name = e('.title').text()
    b.score = e('.rating_nums').text() or '0'
    b.evaluate = e('.pl').text()
    b.meta = e('.abstract').text()
    b.url = e('.title-text').attr('href')
    # An xmlns attribute on the page can keep pyquery from resolving the selector below.
    b.cover_url = e('.cover').attr('src')
    return b
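# A minimal usage sketch for book_from_div, assuming the listing page has already been
# saved locally; the file name 'book_list.html' and the '.subject-item' container selector
# are placeholders (assumptions, not taken from the original code).
from pyquery import PyQuery as Pq

page = Pq(open('book_list.html', encoding='utf-8').read())     # hypothetical saved page
books = [book_from_div(div) for div in page('.subject-item')]  # assumed container selector
for b in books:
    print(b.name, b.score, b.url)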
def find_url_from_ul(self, ul):
    """Parse each ul element."""
    doc = Pq(ul)
    li_list = doc("li")
    for li in li_list:
        url = self._base_url + doc(li).find("div>p>a").attr("href")
        if url in self.detail_info_urls:
            continue
        else:
            self._merchant_detail["url"] = url
            self.detail_info_urls.append(url)
            html = self.get_page_content_str(url)
            self._extract_data2(html)
def __init__(self, pro, doc, group, organization=None):
    self.project = pro
    self.document = doc
    self.groups = [group.key, Group.get_worldshare().key]
    self.organization = organization
    self.user = User()
    self.user.groups = self.groups
    if organization:
        self.user.organization = organization.key
    self.html = ''
    self.body = Pq('<span></span>')
def _extract_data(self, doc_str, apartment_detail):
    try:
        doc = Pq(doc_str)
        a_list = doc("table>tr>td>table>tr>td")
        # total_item = int(doc("").text().strip())
        # count_num = int(total_item) / 12
        for a in a_list:
            apartment_detail["APARTMENT_NUM"] = doc(a).text()
            if apartment_detail["APARTMENT_NUM"].strip() != '':
                apartment_detail["create_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                self._save_community(apartment_detail)
    except Exception as err:
        print(err)
        time.sleep(1)
        # Retry the extraction after a short pause.
        self._extract_data(doc_str, apartment_detail)
def query(query_url):
    # Note: latin_name is expected from the enclosing scope; it is not defined locally.
    sleep(0.2)
    page_text = Pq(get_html(query_url))
    if 'This name is the accepted name of a species in the genus' in page_text('p:eq(0)').text():
        return latin_name
    elif 'This name is a synonym of' in page_text('p:eq(0)').text():
        sci_name = re.search('This name is a synonym of (.*)', page_text('p:eq(0)').text()).group(1)
        return sci_name
    elif 'The results are below' in page_text('p:eq(0)').text():
        lv2_url = 'http://www.theplantlist.org' + page_text('table>tbody>tr:eq(0)>td:eq(0)>a').attr('href')
        sci_name = page_text('table>tbody>tr:eq(0)>td:eq(0)').text()
        if latin_name in sci_name:
            return query(lv2_url)
        else:
            return 'check tbl manually'
    else:
        return 'check NCBI'
def render(self):
    cur_wc = 0
    concept_count = 0
    processed_concepts = {}
    for concept in self._get_next_concept():
        if concept:
            if not concept.has_permission_read(self.user):
                continue
            render = True
            if not concept.is_summary_crawlable(document=self.document, project=self.project):
                render = False
            attr = concept.get_attr_by_doc(self.document)
            if attr and attr.is_header():
                render = False
            if attr and attr.is_image():
                render = False
            if render:
                phrase = concept.get_phrasing(doc=self.document, return_text=False)
                wc = phrase.get_word_count()
                if wc + cur_wc > self.word_count:
                    break
                concept_count += 1
                cur_wc += wc
                parent = concept.get_parent()
                if not processed_concepts.get(parent.id):
                    processed_concepts[parent.id] = []
                processed_concepts[parent.id].append(concept)
    # Ceiling division: roughly one paragraph per 300 words.
    paragraph_divider = 300
    paragraph_count = cur_wc // paragraph_divider
    if cur_wc % paragraph_divider > 0:
        paragraph_count += 1
    con_pre_par = (concept_count // paragraph_count) + 1
    self.paragraph = Pq('<p></p>')
    self.body.append(self.paragraph)
    self.con_count = 0
    self._render(self.project, con_pre_par, processed_concepts)
    self.html = self.body.html(method='html')
def dl(mid: str, save_path: str):
    """
    Download a resource.

    :param mid: str, id of the resource
    :param save_path: str, directory to save files into
    :return:
    """
    url = "http://www.177pic.info/html/" + str(mid) + ".html"
    s = time.time()  # start timestamp, used to measure elapsed time
    e = query_html(url)
    if e[0] == 1:
        e = Pq(e[1])
        title = e('.entry-title').eq(0)
        if title:
            title = title.text()
        else:
            return "Resource does not exist, please check the input"
        print("Title: {}".format(title))
        match_obj = re.search(r'\[(\d+)P]', title)
        a = "unknown"
        if match_obj is not None:
            a = match_obj.group(1)
        if a != "unknown":
            counter.reset(int(a))
        page_link_list = e('.page-links').find('a')
        page_num = page_link_list.length
        print("Starting download...")
        save_path = os.path.join(save_path, title)
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        init_threads(page_num, url, save_path)
        d = time.time()
        print("Took {}s".format(round(d - s, 4)))
        if setting.zip_mode > 1:
            print(zip_file(save_path))
        if setting.zip_mode > 2:
            import shutil
            shutil.rmtree(save_path)
            print("{} deleted".format(save_path))
        return "Done"
    else:
        print(e[1])
def crawl_daili66(self, page_count=4):
    """
    Crawl proxies from 66ip.cn.
    :param page_count: number of pages to crawl
    :return: yields proxies as 'ip:port' strings
    """
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        print('Crawling', url)
        html = get_page(url)
        if html:
            doc = Pq(html)
            # Skip the header row, then read ip/port from the first two columns.
            trs = doc('.containerbox table tr:gt(0)').items()
            for tr in trs:
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                yield ':'.join([ip, port])
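# A small consumption sketch: crawl_daili66 is a generator, so a caller simply iterates it.
# The Crawler class name below is a placeholder for whatever class actually defines this
# method and the get_page helper (an assumption, not taken from the original code).
crawler = Crawler()  # hypothetical crawler class
for proxy in crawler.crawl_daili66(page_count=2):
    print('got proxy', proxy)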
def __init__(self, pro, doc, wc, group, organization=None):
    self.project = pro
    self.document = doc
    self.word_count = wc
    self.groups = [group.key, Group.get_worldshare().key]
    self.organization = organization
    self.user = User()
    self.user.groups = self.groups
    self.walker = ConceptPublishWalker(pro)
    if organization:
        self.user.organization = organization.key
    self.html = ''
    self.body = Pq('<span></span>')
    self.con_count = 0
    self.paragraph = None
def handle_td(k, td):
    # Column layout (data comes from the enclosing scope):
    # 0. number employed
    # 1. Code
    # 2. Occupation + link !IMPORTANT
    # 3. Projected growth - as image
    # 4. Projected openings
    td = Pq(td)
    if k == 0:
        data['num_employed'] = td.text()
    if k == 1:
        data['code'] = td.text()
    if k == 2:
        subdata = {'job': td.text(), 'url': td.find('a').attr('href')}
        data['occupation'] = subdata
    if k == 3:
        data['projected_growth'] = td.find('img').attr('alt')
    if k == 4:
        data['projected_openings'] = td.text()
def save_md(page, name):
    e = Pq(page)
    md = e(".blog-content-box .htmledit_views")
    pic = md("img")
    for p in pic.items():
        pic_path = p.attr("src")
        print(pic_path)
        # TODO: download the images and rewrite their paths
    folder = config.md_folder
    create_folder(folder)
    filename = "{}.md".format(name)
    path = os.path.join(folder, filename)
    if not os.path.exists(path):
        with open(path, 'w', encoding="UTF8") as f:
            f.write(str(md))
def _extract_data(self, doc_str):
    doc = Pq(doc_str)
    li_list = doc('.aside.aside-left>.category-nav.J-category-nav>li')
    for li in li_list:
        self._category_detail["shopType"] = doc(li).attr("data-key")
        self._category_detail["categoryId"] = self._category_detail["shopType"]
        self._category_detail["name"] = doc(li).find(".name>span").text()
        self._category_list.append(copy.copy(self._category_detail))
        # doc2 = Pq(doc_str)
        # div_list = doc2(".aside.aside-left>.category-nav.J-category-nav>li>.secondary-category.J-secondary-category>div>div")
        a_list = doc(li).find("div>a")
        for a in a_list:
            self._category_detail["categoryId"] = doc(a).attr("data-key")
            self._category_detail["name"] = doc(a).text()
            self._category_list.append(copy.copy(self._category_detail))
    self.save_category()
def parse_page(json):
    """Parse one page of the JSON response."""
    if json:
        items = json.get('data').get('cards')
        # Inspecting the JSON shows that only the even-numbered elements contain an mblog,
        # so check that mblog exists before extracting fields.
        for item in items:
            item = item.get('mblog')
            if item is None:
                continue
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = Pq(item.get('text')).text()
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo
def _extract_data(self, doc_str):
    doc = Pq(doc_str)
    # name
    self.__ne_detail["name"] = doc('.mainTitle >h1').text()
    # address and service area; the Chinese literals below ("详细地址:" = detailed address,
    # "服务区域:" = service area) are labels scraped from the page and must stay
    # untranslated for the comparisons to match.
    for li in doc('.newinfo >ul> li'):
        if doc(li).find(".z").text() == "详细地址:":
            s = doc(li).text()
            s = s.replace("详细地址: ", "").replace(" ", "").replace("-", "").replace(" ", " ").replace(" ", ",")
            self.__ne_detail["location"] = s
        if doc(li).find(".z").text() == "服务区域:":
            s = doc(li).text()
            s = s.replace("服务区域: ", "").replace(" ", "").replace("-", "").replace(" ", " ").replace(" ", ",")
            self.__ne_detail["area_name"] = s
    self.__ne_detail["description"] = doc('.description_con >span').text()
    print(self.__ne_detail)
    self._video_dao()
def get_latin_name(chinese_name, retry_num=0):
    retry_num += 1
    if retry_num > 3:
        return
    query_url = 'http://www.iplant.cn/info/' + chinese_name
    try:
        a = Pq(get_html(query_url))
        latin_name = a('#sptitlel.infolatin').text()
        if latin_name == '':
            try:
                return re.search(r'[a-zA-Z\s]+', a('.infomore>a').text()).group(0).strip() + \
                    '\t' + \
                    re.search(r'[\u4e00-\u9fa5]+', a('.infomore>a').text()).group(0)
            except AttributeError:
                return
        return latin_name
    except TimeoutError:
        sleep(1)
        # Retry the same lookup, passing the incremented retry counter along.
        return get_latin_name(chinese_name, retry_num=retry_num)
def fanfou_from_div(div):
    """Extract one message's information from a div."""
    e = Pq(div)  # single-character name for a small-scope variable
    m = Fanfou()
    # m.name = e('.title').text()
    m.content = e('.content').text()
    m.time = e('.time').attr('stime')
    m.device = e('.method').text()
    m.link = "fanfou.com" + e('.stamp').html().split('"', 2)[1]
    m.pic_link = e('.content a').attr('name')
    if m.pic_link is not None:
        m.pic_link = 'fanfou.com' + m.pic_link
    m.pic = e('.photo').attr('href')
    m.pic = str(m.pic).split('@', 1)[0]
    # log('piclink', m.pic_link)
    return m
def url2wordcloud(url, requests_kwargs={}, exclude_punct=True, normalized=True,
                  limit=None, size=1, min_len=None):
    """Convert the text content of a url's html to a wordcloud config.

    Args:
        url (str): The url to load.
        requests_kwargs (dict, optional): The kwargs to pass to the requests
            library (e.g. auth, headers, mimetypes).
        exclude_punct (bool, optional): exclude punctuation
        min_len (int, optional): the minimum required word length, if any
        limit (int, optional): the number of items to limit (by most common), if any
        size (int, optional): multiplier passed through as size_multiplier
        normalized (bool, optional): normalize data by lowercasing and
            stripping whitespace

    Returns:
        same value as :func:`~format_4_wordcloud`
    """
    resp = requests.get(url, **requests_kwargs)
    if not resp.status_code == 200:
        return []
    resp = Pq(resp.content).find('body').text().split(' ')
    if exclude_punct:
        resp = [re.sub(r'[^a-zA-Z0-9]+', '', w) for w in resp if w not in punctuation]
    if min_len is not None:
        resp = [w for w in resp if len(w) >= min_len]
    if normalized:
        resp = [w.lower() for w in resp]
    words = get_word_freq_distribution(resp)
    if limit is not None:
        words = words.most_common(limit)
    else:
        words = [(k, v) for k, v in words.items()]
    return format_4_wordcloud(words, size_multiplier=size)
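# A minimal call sketch, assuming url2wordcloud and its helpers (get_word_freq_distribution,
# format_4_wordcloud) live in the same module; the URL and header values below are
# placeholders, not taken from the original code.
words = url2wordcloud(
    'https://example.com',
    requests_kwargs={'headers': {'User-Agent': 'Mozilla/5.0'}},
    min_len=3,    # drop very short tokens
    limit=100,    # keep only the 100 most common words
)
print(words)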
def IMWarring(imwarringurl):
    get_ck = Login.redlogin("http://devops.lab.everhomes.com/login")
    idlist = [184]
    for id in idlist:
        warurl = imwarringurl.format(id)
        get_page = Login.get_req(warurl)
        doc = Pq(get_page)
        tr_list = doc("#content > form:nth-child(4) > div > table > tbody>tr")
        for tr in tr_list:
            topic = "#" + doc(tr)(" td.subject > a").text() + "#"
            if topic == "##":
                continue
            else:
                tourl = "http://devops.lab.everhomes.com/" + doc(tr)(" td.subject > a").attr("href")
                name = doc(tr)(" td.assigned_to > a").text()
                if name == "":
                    continue
                date = doc(tr)(" td.due_date").text()
                maintopic = "[" + name + "] key task warning for the business review meeting - " + topic
                link = ('<a style="font-family:verdana;color:#3366CC;font-size:18px;" href='
                        + tourl + '><u>' + topic + '</u></a>')
                if date == "":
                    content = ("warning!!!\n" + '<html><body><p>' + name
                               + ', your task titled ' + link
                               + ' is in warning status and has no expected due date, '
                                 'please handle it promptly</p></body></html>')
                else:
                    date = datetime.datetime.strptime(date, "%Y-%m-%d")
                    delay = (datetime.datetime.now() - date).days
                    if delay > 0:
                        content = ("warning!!!\n" + '<html><body><p>' + name
                                   + ', your task titled ' + link
                                   + ' is overdue by ' + str(delay)
                                   + ' day(s), please handle it promptly</p></body></html>')
                    else:
                        content = ("warning!!!\n" + '<html><body><p>' + name
                                   + ', your task titled ' + link
                                   + ' has ' + str(abs(delay))
                                   + ' day(s) left before its due date, please handle it promptly</p></body></html>')
                addr = [mailto_dict[name], mailto_dict["st"]]
                if send_mail(addr, maintopic, content):  # mail subject and mail body
                    print(addr)
                    print("done!")
                    print(content)
                else:
                    print("failed!")
def fanfou_from_url(url):
    """Download the page at url and parse the messages on it."""
    page = cached_page(url)
    e = Pq(page)
    items = e('.message li')
    log('messages from', items)
    # items[0]('title')
    # call fanfou_from_div for every item (PyQuery results are iterable via __iter__)
    f = [fanfou_from_div(i) for i in items]
    for i in f:
        record = i.__dict__
        save.SQLsave(record)
        if record['pic'] != 'None':
            pic = record['pic']
            log('saving photo', pic)
            save_pic(pic)
    return Fanfou
def get_products():
    try:
        wait_.until(
            ec.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
        html = browser.page_source
        doc = Pq(html)
        items = doc('#mainsrp-itemlist .items .item').items()
        for item in items:
            product = {
                'title': item.find('.title').text(),
                'price': item.find('.price').text(),
                'pay': item.find('.deal-cnt').text(),
                'shop': item.find('.shop').text(),
                'location': item.find('.location').text(),
                'image': item.find('.pic .img').attr('src'),
            }
            save_to_mongodb(product)
            print(product)
    except exceptions.TimeoutException:
        return get_products()
def search_youtube_video(title, pages):
    print("Starting the search")
    cont = 0
    lista_url = []
    lista_views = []
    for page in range(pages):
        params = urllib.parse.urlencode({
            'search_query': 'intitle:"%s", video' % title,
            'page': page
        })
        jq = Pq(
            url="http://www.youtube.com/results?%s" % params,
            headers={
                "user-agent": "Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20140129 Firefox/24.0"
            })
        jq.make_links_absolute("http://www.youtube.com")
        for video in jq("ol.item-section").children().items():
            url = video.find("a.yt-uix-tile-link").attr("href")
            lista_url.append(url)
            # 'visualizaciones' is the Spanish view-count label on the scraped page.
            views = video.find("ul.yt-lockup-meta-info li").eq(1).html()
            if views is not None:
                res = int(views.split('visualizaciones')[0].strip().replace('.', ''))
            else:
                res = 0
            lista_views.append(res)
            cont = cont + 1
            if cont == 8:
                indice = lista_views.index(max(lista_views))
                print("views: {}".format(max(lista_views)))
                print("index: {}".format(indice))
                print("url: " + lista_url[indice])
                return lista_url[indice]
    indice = lista_views.index(max(lista_views))
    return lista_url[indice]
def redlogin(loginurl):
    cj = http.cookiejar.LWPCookieJar()
    cookie_support = urllib.request.HTTPCookieProcessor(cj)
    opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    h = urllib.request.urlopen(loginurl)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }  # pose as a browser
    req = urllib.request.Request(url=loginurl, headers=headers)  # build the request
    urllib.request.install_opener(opener)
    m_fp = urllib.request.urlopen(req, timeout=500)  # fetch the page source
    html_str = m_fp.read().decode('utf-8')  # the site is encoded in utf-8
    doc = Pq(html_str)
    authenticity_token = doc("head > meta:nth-child(8)").attr("content")
    print("authenticity_token=: " + authenticity_token)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
        'Referer': "http://devops.lab.everhomes.com/login",
    }
    values = {
        "authenticity_token": authenticity_token,
        "username": "******",
        "password": "******"
    }
    data = urllib.parse.urlencode(values).encode('utf-8')
    req = urllib.request.Request(url=loginurl, headers=headers, data=data)
    urllib.request.install_opener(opener)
    m_fp = urllib.request.urlopen(req)
def _extract_data2(self, doc_str):
    doc = Pq(doc_str)
    li_list = doc(".mainListing.clearfix>.pL>.list>li")
    for li in li_list:
        self._community_detail["url"] = doc(li).find(".details>div>a").attr("href")
        self._community_detail["name"] = doc(li).find(".details>div>a").text()
        p = doc(li).find(".details>p")
        self._community_detail["location"] = doc(p[0]).text()
        # The area name is the text between the leading [ and ] of the location string.
        self._community_detail["area_name"] = self._community_detail["location"][
            self._community_detail["location"].index("[") + 1:
            self._community_detail["location"].index("]")]
        self._community_detail["location"] = self._community_detail["location"][
            self._community_detail["location"].index("]") + 1:]
        # Latitude and longitude are embedded in the map link's query string (l1=..., l2=...).
        url = doc(li).find(".details>.p_links>a").attr("href")
        self._community_detail['latitude'] = url[url.index("l1=") + 3:url.index("&l2")]
        self._community_detail['longitude'] = url[url.index("l2=") + 3:url.index("&l3")]
        self._save_community()
def cached_page(url):
    """Save the cached page."""
    page_dic = {}
    filename = '{}.html'.format(url.split('/')[-1])
    page = get_page(url, filename)
    e = Pq(page)
    # tmp = e('.article-list .article-item-box.csdn-tracking-statistics .h4')
    # print(tmp)
    items = e('.article-list .article-item-box.csdn-tracking-statistics').items()
    for i in items:
        k = i.attr("data-articleid")
        v = str(i("h4")("a").text())
        page_dic[k] = v
    for p in page_dic.items():
        url = "https://blog.csdn.net/{}/article/details/{}".format(config.author, p[0])
        print(p[0], p[1])
        page = get_page(url, "{}.html".format(p[0]))
        save_md(page, p[0])
    # items = items.children(".article-item-box.csdn-tracking-statistics")
    # print(items)
    return page