def get_detail(df, cik):
    """Fetch every filing listed in *df* and merge per-filing word counts.

    :param df: DataFrame whose rows are (date1, url, type) in column order.
    :param cik: company identifier; used in the success log message and
                returned unchanged when no filing could be fetched.
    :return: DataFrame of df.date1 left-merged with the word-count rows,
             or *cik* when nothing was retrieved.
    """
    base = df.date1.to_frame().copy()
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
    frames = []
    for idx, (date, url, type_) in enumerate(df.values):
        # `timeout` is a module-level setting shared by all fetches.
        response = requests.get(url, headers={'user-agent': user_agent},
                                timeout=timeout)
        if response.status_code != 200:
            continue
        print('{}的{}的报表获取成功!'.format(cik, date))
        # XML filings are re-serialized via lxml; HTML goes through pyquery.
        if '<?xml' in response.text[:400]:
            tree = etree.fromstring(response.content)
            text = etree.tostring(tree, encoding='unicode')
        else:
            text = str(pq.PyQuery(response.text).text())
        record = word_count(text)
        record['date1'] = date
        record['link'] = url
        record['type'] = type_
        frames.append(pd.DataFrame(record, index=[idx]))
    if frames:
        combined = pd.concat(frames, axis=0)
        return pd.merge(base, combined, how='left', on='date1')
    # Nothing fetched: hand the cik back so the caller can tell.
    return cik
def _parse_catalog(self):
    """Fetch self.catalog_url and collect every chapter-detail link.

    Two page layouts are supported; duplicate links and links pointing
    outside self.HOST are dropped via an in-memory set.

    :return: set of absolute detail-page URLs
    """
    page = CommonTool.fetch_page(self.catalog_url)
    doc = pq.PyQuery(page)
    detail_urls = set()
    selectors = (
        # layout 1, e.g. https://www.kanunu8.com/book3/8257/
        'table:nth-child(2) > tbody > tr > td > a',
        # layout 2, e.g. https://www.kanunu8.com/book2/10946/index.html
        'div.col-left > div > dl > dd > a',
    )
    for selector in selectors:
        for anchor in doc(selector).items():
            absolute = urllib.request.urljoin(self.catalog_url,
                                              anchor.attr.href)
            if absolute in detail_urls:
                continue  # already collected
            if self.HOST not in absolute:
                continue  # off-site link
            detail_urls.add(absolute)
    return detail_urls
def getSpinDetails(url, source):
    """Scrape a single Fedora spin/lab page into a metadata dict.

    :param url: spin detail-page URL
    :param source: stored verbatim in the 'source' field
    :return: dict with name/summary/description/logo/screenshots/variants
    """
    d = pyquery.PyQuery(urlread(url))
    spin = {
        'name': '',
        'summary': '',
        'description': '',
        'releaseDate': '',
        'logo': 'qrc:/logo-fedora.svg',
        'screenshots': [],
        'source': '',
        'variants': {'': dict(url='', sha256='', size=0)}
    }
    spin['source'] = source
    spin['name'] = d('title').html().strip()

    screenshot = d('img').filter('.img-responsive').attr('src')
    if screenshot:
        spin['screenshots'].append(url + "/.." + screenshot)

    # Concatenate all non-blank lines of the page body as the description.
    for raw in d('div').filter('.col-sm-8').html().split('\n'):
        stripped = raw.strip()
        if stripped:
            spin['description'] += stripped

    spin['variants'] = getDownload(url + "/.." + d('a.btn').attr('href'))

    # Later entries override earlier ones on multiple matches, mirroring
    # the original cascade of independent `if` statements.
    logo_by_keyword = (
        ('KDE Plasma', 'qrc:/kde_icon.png'),
        ('Xfce', 'qrc:/xfce_icon.png'),
        ('LXDE', 'qrc:/lxde_icon.png'),
        ('MATE', 'qrc:/mate_icon.png'),
        ('SoaS', 'qrc:/soas_icon.png'),
        ('Astronomy', 'qrc:/astronomy_icon_green.png'),
        ('Design', 'qrc:/design-suite_icon_green.png'),
        ('Games', 'qrc:/games_icon_green.png'),
        ('Jam', 'qrc:/jam_icon_green.png'),
        ('Robotics', 'qrc:/robotics-suite_icon_green.png'),
        ('Scientific', 'qrc:/scientific_icon_green.png'),
        ('Security', 'qrc:/security-lab_icon_green.png'),
    )
    for keyword, logo in logo_by_keyword:
        if keyword in spin['name']:
            spin['logo'] = logo
    return spin
def getSpins(url, source):
    """Collect metadata for every spin linked from the index page *url*.

    :param url: spins index page URL
    :param source: passed through to getSpinDetails
    :return: list of spin metadata dicts
    """
    doc = pyquery.PyQuery(urlread(url))
    collected = []
    for span in doc('div').filter('.high').items('span'):
        detail_url = url + span.siblings()('a').attr('href')
        details = getSpinDetails(detail_url, source)
        # The index page's span text is the short summary.
        details['summary'] = span.html()
        collected.append(details)
    return collected
def _parse_detail(content):
    """Extract title and body text from a chapter page.

    :param content: raw HTML of the chapter page
    :return: (title, body) tuple
    """
    doc = pq.PyQuery(content)
    raw_title = doc(
        '#wrapper > div.content_read > div > div.bookname > h1').text()
    title = CommonTool.fix_title(raw_title)
    body = doc('#content').text()
    return title, body
def _parse_detail(content):
    """Extract title and body text from a chapter page, stripping
    site-specific artifacts from both.

    :param content: raw HTML of the chapter page
    :return: (title, body) tuple
    """
    doc = pq.PyQuery(content)
    raw_title = doc('#directs > div.bookInfo > h1 > strong').text()
    title = CommonTool.fix_title(raw_title.replace("正文", "").strip())
    body = doc('#content').text()
    # The site injects leftover JS calls into the text; remove them.
    for artifact in ('style6();', 'style5();'):
        body = body.replace(artifact, '')
    return title, body
def get_kuaidaili_proxies(pages=5):
    """Yield 'ip:port' strings scraped from kuaidaili's free proxy list.

    Stops early (after printing the status code) on the first non-200
    response; sleeps 5 seconds between pages to be polite.

    :param pages: number of listing pages to crawl
    """
    for page in range(1, pages + 1):
        url = "http://www.kuaidaili.com/free/inha/{0}/".format(page)
        response = requests.get(url, headers=HEADERS)
        if response.status_code != 200:
            print(response.status_code)
            return
        # Renamed from `pq` to avoid shadowing the common pyquery alias.
        document = pyquery.PyQuery(response.text)
        for row in document("tbody > tr"):
            cells = row.findall('td')
            yield cells[0].text + ":" + cells[1].text
        time.sleep(5)
def getProductDetails(url, name):
    """Scrape a Fedora product page into a product metadata dict.

    :param url: product page URL
    :param name: product name ("Workstation", "Server", ...); also stored
                 as the 'source' field and used to pick the logo
    :return: dict with name/summary/description/logo/variants/...
    """
    d = pyquery.PyQuery(urlread(url))
    product = {
        'name': '',
        'summary': '',
        'description': '',
        'releaseDate': '',
        'logo': 'qrc:/logo-fedora.svg',
        'screenshots': [],
        'source': '',
        'variants': {'': dict(url='', sha256='', size=0)}
    }
    product['name'] = name
    product['source'] = name
    product['summary'] = d('h1').html()
    for i in d(
        'div.col-md-8, div.col-sm-8, div.col-md-5, div.col-md-6, div.col-sm-5, div.col-sm-6'
    ).items('p, h3, h2'):
        i.remove('a, br, img')
        if i.parent().parent()('blockquote'):
            # Quotes get wrapped with an attributed citation line.
            i = i.parent().parent()('blockquote')
            product['description'] += '<blockquote>'
            product['description'] += str(i('p'))
            product['description'] += '<p align=right> ― <em>' + i(
                'cite').html() + '</em></p>'
            product['description'] += '</blockquote>'
        elif i.html() and len(i.html(
        )) > 0:  # can't remove empty tags with :empty for some reason
            product['description'] += str(i)
    # BUG FIX: str.replace returns a new string; the original discarded the
    # result, so h2/h3 headings were never demoted to h4. Assign it back.
    product['description'] = (product['description']
                              .replace('h2', 'h4')
                              .replace('h3', 'h4'))
    if name == "Workstation":
        product['logo'] = 'qrc:/logo-color-workstation.png'
    if name == "Cloud":
        product['logo'] = 'qrc:/logo-color-cloud.png'
    if name == "Server":
        product['logo'] = 'qrc:/logo-color-server.png'
    product['variants'] = getDownload(url + "/download")
    return product
def getDownload(url):
    """Return {arch: {url, sha256, size}} for the ISOs on a download page.

    The primary (btn-success) link is always included; the first 32-bit
    .iso link found, if any, is added as an alternative.
    """
    page = pyquery.PyQuery(urlread(url))
    variants = dict()

    primary = page('a.btn-success').attr('href')
    size_text = page('a.btn-success').parent().parent()('h5').text()
    variants[getArch(primary)] = dict(url=primary,
                                      sha256=getSHA(primary),
                                      size=getSize(size_text))

    for anchor in page.items("a"):
        if "32-bit" in anchor.html().lower() and anchor.attr("href").endswith(".iso"):
            alt = anchor.attr("href")
            variants[getArch(alt)] = dict(url=alt,
                                          sha256=getSHA(alt),
                                          size=getSize(anchor.text()))
            break
    return variants
def translate(self, text, target_language='ru', source_language='auto'):
    """Translate *text* via the mobile Google Translate page.

    :param text: source text
    :param target_language: ISO code of the target language
    :param source_language: ISO code or 'auto' for detection
    :return: translated text scraped from the result page
    """
    query = {
        'q': text,
        'hl': target_language,
        'sl': source_language,
        'ie': 'UTF-8',
        'prev': '_m',
    }
    response = requests.get(GOOGLE_URL, params=query,
                            headers={'User-Agent': self.user_agent})
    if response.status_code != requests.codes.ok:
        # NOTE(review): aborts the whole process on any HTTP failure —
        # preserved from the original; an exception would be friendlier.
        print(response.reason)
        exit(1)
    document = pyquery.PyQuery(response.text)
    return document.find('div.t0').text()
def getSHA(url):
    """Look up the SHA256 of *url*'s file from the CHECKSUM file published
    in the same directory.

    :param url: full URL of the ISO/file
    :return: hex digest string, or '' when no matching entry is found
    """
    directory, _, filename = url.rpartition('/')
    page = pyquery.PyQuery(urlread(directory))
    checksum_text = ''
    for anchor in page.items('a'):
        if 'CHECKSUM' in anchor.attr('href'):
            checksum_text = urlread(directory + '/' + anchor.attr('href'))
            break
    entry = re.compile(r'^SHA256 \(([^)]+)\) = ([a-f0-9]+)$')
    for line in checksum_text.split('\n'):
        match = entry.match(line)
        if match and match.group(1) == filename:
            return match.group(2)
    return ''
def getProducts(url='https://getfedora.org/'):
    """Scrape the Fedora front page for product entries.

    Cloud is deliberately skipped; relative "../" links are resolved
    against *url*.

    :param url: front-page URL
    :return: list of product detail dicts
    """
    doc = pyquery.PyQuery(urlread(url))
    products = []
    for anchor in doc('div.productitem').items('a'):
        href = anchor.attr('href')
        product_url = url + (href[3:] if href.startswith("../") else href)
        product_name = anchor('h4').html()
        if product_name != "Cloud":
            products.append(getProductDetails(product_url, product_name))
    return products
def _parse_detail(content):
    """Extract title and body text from a chapter page, falling back to a
    second page layout when the first yields no title.

    :param content: raw HTML of the chapter page
    :return: (title, body) tuple
    """
    doc = pq.PyQuery(content)
    title = doc('tr:nth-child(1) > td > strong > font').text()
    body = doc('td:nth-child(2) > p').text()
    if title == '':
        # Layout 1 produced nothing — switch to layout 2.
        title = doc('#Article > h1').text().split('\n')[0]
        body = doc('#Article > div > p:not([align])').text()
    # NOTE(review): the ideographic-space -> newline substitution is
    # applied unconditionally here, per the original statement order;
    # confirm it was not meant for layout 2 only.
    body = body.replace(' ', '\n')
    return title, body
def _parse_catalog(self):
    """Fetch self.catalog_url and collect every chapter-detail link.

    :return: set of absolute detail-page URLs

    FIX: the original tested the *raw* href against a set containing
    *absolute* URLs, so the dedup check could never match relative hrefs.
    Resolving before checking makes the check meaningful and matches the
    sibling _parse_catalog implementation; the returned set is unchanged
    either way because set.add of a duplicate is a no-op.
    """
    page = CommonTool.fetch_page(self.catalog_url)
    doc = pq.PyQuery(page)
    detail_urls = set()
    for anchor in doc('#list > dl > dd > a').items():
        absolute = urllib.request.urljoin(self.HOST, anchor.attr.href)
        if absolute in detail_urls:
            continue  # already collected
        detail_urls.add(absolute)
    return detail_urls
def _parse_catalog(self):
    """Fetch self.catalog_url and collect every chapter link that belongs
    to this site.

    :return: set of detail-page URLs (hrefs used as-is, no joining)
    """
    page = CommonTool.fetch_page(self.catalog_url)
    doc = pq.PyQuery(page)
    selector = ('#chapter > div.chapterSo > div.chapterNum > ul > '
                'div.clearfix.dirconone li > a')
    detail_urls = set()
    for anchor in doc(selector).items():
        href = anchor.attr.href
        if href in detail_urls:
            continue  # already collected
        if self.HOST not in href:
            continue  # off-site link
        detail_urls.add(href)
    return detail_urls
def retrieve_vine_video_url(vine_url):
    """Resolve a vine page to its raw video stream URL.

    Reads the twitter:player:stream meta tag and strips any query string.

    :param vine_url: vine page URL
    :return: direct video URL
    """
    log('--Retrieving vine url')
    page = pyquery.PyQuery(url=vine_url)
    stream = page("meta[property=twitter\\:player\\:stream]").attr['content']
    return stream.partition("?")[0]
def fetch_content(self, url):
    """
    Fetches the content of an URL, gets app links from it and
    pushes them down the queue. Then parses the content to
    determine if it is an app and if it is, then push the parsed
    result in the `results` queue for later processing.

    This logic is getting executed inside green threads. You
    shouldn't spawn new green threads here, as this is not the
    parent and trouble may arise.
    """
    # NOTE(review): urllib.urlopen is the Python 2 API; under Python 3
    # this would be urllib.request.urlopen — confirm target version.
    resp = urllib.urlopen(url)
    # silently ignores errors, even though the script will not
    # block here.
    if resp.getcode() == 404:
        return
    elif resp.getcode() != 200:
        # this is a slight problem, it shouldn't happen but it
        # does sometimes
        self.failed += 1
        return
    try:
        content = resp.read()
        doc = pq.PyQuery(content)
        # we must do our best to ignore pages that are not
        # relevant (music, movies, other pages that don't have
        # links to apps in them)
        if not self.is_page_valid(url, doc):
            return
        # I like keeping a log of URLs processed
        sys.stderr.write(url + "\n")
        # fetches links in this page, by regular expressions.
        # we are interested in app links and publisher links,
        # while review and login links are excluded.
        all_links = [
            a.attrib['href'] for a in doc('a')
            if re.search(r'\/(details|developer)[?]', a.attrib.get('href', '')) \
            and not re.search('reviewId', a.attrib.get('href', '')) \
            and not re.search('accounts\/ServiceLogin', a.attrib.get('href', ''))
        ]
        # pushing new links down the queue for processing later
        for link in all_links:
            if not link:
                continue
            self.queue.put(self.absolute_url(link))
        # fetches app info from the fetched content, but ONLY in
        # case the URL is about an actual app
        app_info = self.fetch_app_info(url, doc)
        if app_info:
            # prevents going to already visited IDs
            self.seen_app_ids.add(app_info['uid'])
            self.results.put(app_info)
    except:
        # NOTE(review): deliberate catch-all (kept as-is) — the original
        # author documents that missing fields / slight page-format
        # changes froze a full crawl halfway through, so any parsing
        # error on a single page is intentionally swallowed.
        # we must ignore exceptions as sometimes we don't make the
        # best assumptions. Some fields may be missing, the page's
        # format can change slightly, etc... when I ran the script
        # the first time it froze halfway-through and had to start
        # all over again
        pass
if ex.code == 404: return # this is a slight problem, it shouldn't happen but it # does sometimes, so keeping tracking is useful to see how # often it does happen self.failed += 1 return except urllib2.URLError: self.failed += 1 return try: content = resp.read() doc = pq.PyQuery(content) # we must do our best to ignore pages that are not # relevant (music, movies, other pages that don't have # links to apps in them) if not self.is_page_valid(url, doc): return # I like keeping a log of URLs processed sys.stderr.write(url + "\n") # fetches links in this page, by regular expressions. # we are interested in app links and publisher links. all_links = [ a.attrib['href'] for a in doc('a')
def getCookie(self, username='******', passwd='gd19691818'):
    """Log in to Taobao with a headless Chrome, defeat the slider captcha
    by retrying the drag, and return the resulting session cookie string.

    SECURITY NOTE(review): a real password is hard-coded as the default
    argument — this credential should be rotated and moved to config.

    :param username: Taobao account name
    :param passwd: Taobao account password
    :return: cookie string "name=value;name=value;...", also stored
             on self.cookie
    """
    count = 1  # login attempt counter, used only for progress logging
    # Drive a real browser for login (selenium imported lazily here).
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    url = 'https://login.taobao.com/member/login.jhtml?'
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # hide the browser window
    browser = webdriver.Chrome(chrome_options=chrome_options)
    # Alternative driver setups kept for reference (custom user-agent, IE):
    # dcap = dict(DesiredCapabilities.CHROME)
    # dcap['chorme.page.settings.userAgent'] = self.ua
    # browser = webdriver.Chrome(desired_capabilities=dcap)
    # browser = webdriver.Chrome()
    # browser = webdriver.Ie(desired_capabilities=dcap)
    # browser = webdriver.Ie()  # IE browser
    wait = WebDriverWait(browser, 10)
    browser.get(url)
    browser.maximize_window()
    # Switch to username/password login (the input fields are not
    # scrape-able until this element has been clicked).
    element = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_Quick2Static')))
    element.click()
    # time.sleep(1)
    # Locate the username/password inputs and (later) the submit button.
    input_username = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#TPL_username_1')))
    input_passwd = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#TPL_password_1')))
    # Clear any pre-filled content in the inputs.
    input_username.clear()
    input_passwd.clear()
    # time.sleep(random.random())
    # Fill in the credentials.
    input_username.send_keys(username)
    input_passwd.send_keys(passwd)
    # time.sleep(random.random())
    # Drag the slider captcha in a loop; when the failure banner
    # (text starting with '哎呀') does not appear, verification passed.
    while True:
        # time.sleep(10)
        slider = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#nc_1_n1z')))
        action = ActionChains(browser)
        # action.click_and_hold(slider)  # press and hold
        for index in range(10):
            try:
                # action.move_by_offset(index * 50, 0).perform()  # drag 500px horizontally
                action.drag_and_drop_by_offset(slider, 500,
                                               0).perform()  # smooth drag
            except Exception:
                # Dragging past the end raises — treat as "done dragging".
                break
        # action.release().perform()
        error = pyquery.PyQuery(browser.page_source)('.nc-lang-cnt').text()
        print(error)
        if error.startswith('哎呀'):
            # Captcha failed: click the retry link and go again.
            count += 1
            print('--------------------------------------第%s次尝试' % count)
            restart = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#nocaptcha > div > span > a')))
            restart.click()
            # time.sleep(random.random())
        else:
            break
    # time.sleep(random.random())
    # Submit the login form.
    submit = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_SubmitStatic')))
    submit.click()
    # Serialize the browser's cookies into a single header-style string.
    cookie = [
        item["name"] + "=" + item["value"]
        for item in browser.get_cookies()
    ]
    cookiestr = ';'.join(item for item in cookie)
    print(cookiestr)
    self.cookie = cookiestr
    return cookiestr
from pyquery import pyquery


def main():
    """Fetch Baidu's front page and print the parsed document."""
    # Pass url= explicitly: a plain string argument relies on PyQuery's
    # heuristic that strings starting with "http" are URLs, which is easy
    # to misread as markup parsing.
    doc = pyquery.PyQuery(url='http://www.baidu.com')
    print(doc)


if __name__ == '__main__':
    main()