def parse_category_type(self, response):
    """
    Parse the business-category URL and generate the per-page URLs.
    :param response:
    :return:
    """
    city_name = response.meta.get('city_name')
    category_type = response.meta.get('category_type')
    business_district = response.meta.get('business_district')
    adname = response.meta.get('item')
    dic = {
        'item': adname,
        'business_district': business_district,
        'category_type': category_type,
        'city_name': city_name
    }
    try:
        # The second-to-last pager link points at the last page.
        all_page = PyQuery(response.body).find('.page').find('a').eq(-2).attr('href')
    except Exception:
        all_page = None
    if all_page:
        num = all_page.split('/')[-1].split('p')[-1]
        try:
            c_num, aid = num.split('?')
        except ValueError:
            c_num = num
            aid = ""
        head, head1, mid, end = all_page.split('p')
        for c_page in range(1, int(c_num) + 1):
            n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(c_page)
            if aid:
                n_page += '?' + aid
            md5_url = self.md5(n_page)
            ret = spider_service.select_url(md5_url, self.md5_table)
            if not ret:
                create_sleep()
                spider_service.update_into(md5_url, self.md5_table)
                yield scrapy.Request(url=n_page, meta=dic,
                                     callback=self.parse_page, headers=header2)
            else:
                time.sleep(3)
    else:
        md5_url = self.md5(response.url)
        spider_service.update_into(md5_url, self.md5_table)
        yield scrapy.Request(url=response.url, meta=dic,
                             callback=self.parse_page, headers=header3)
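The spider above relies on a `self.md5()` helper and a `create_sleep()` throttle that are never shown in this section. A minimal sketch of what they might look like, assuming `md5` is only a de-duplication key and `create_sleep` is a randomized delay (both hypothetical, not the original implementations):

import hashlib
import random
import time

def md5(url):
    # Hypothetical: hex MD5 digest of the URL, used purely as a dedup key.
    return hashlib.md5(url.encode('utf-8')).hexdigest()

def create_sleep():
    # Hypothetical: random 1-3 second pause to throttle requests.
    time.sleep(random.uniform(1, 3))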
def GetUSDTmarketPrice():
    '''
    url = "http://webforex.hermes.hexun.com/forex/quotelist?code=FOREXUSDCNY&column=Code,Price"
    req = requests.get(url)
    html = req.text
    # print(html)
    s = re.findall("{.*}", str(html))[0]
    sjson = json.loads(s)
    USDTmarketPrice = float(sjson["Data"][0][0][1] / 10000.00)
    return USDTmarketPrice
    '''
    list1 = ['1', '2', '3', '4', '5', '6', '7']
    pricelist = []
    # marketRes = requests.get("http://www.feixiaohao.com/currencies/bitcoin/")
    marketRes = PyQuery(url="http://www.feixiaohao.com/currencies/tether/")
    # xxx = marketRes("#markets").html()
    for data in marketRes("tr"):
        priceinfo = PyQuery(data).text().encode("utf-8")
        # Only rows numbered 1-7 of the markets table carry prices.
        if priceinfo[0] in list1:
            prices = priceinfo.split(" ")[3].replace("¥", "")
            prices = prices.replace(",", "")
            pricelist.append(float(prices))
    # Average the prices across the top markets.
    return sum(pricelist) / len(pricelist)
def calculate_concat(question_text, answer):
    query_url = define_url(question_text, answer.get_text())
    google_results, full_page = search(query_url, full_page=True)

    # Extract the number of total google results
    answer.total_results = get_google_total_results(full_page)

    for result in google_results:
        result_text = PyQuery(result).text()
        # If Google doesn't find enough results, it includes some that aren't
        # really relevant, adding "Missing words: <keywords>", where keywords
        # are words included in the search query (the answer, here).
        # In this context, such results are not useful and are excluded.
        # Count the result only if:
        # - the answer is in the result text
        # - "Mancanti:" (Italian for "Missing:") is not in the result text
        # - the answer is not in the "Must include" section
        if answer.get_text() in result_text.lower() and \
                "Mancanti:" not in result_text and \
                answer.get_text() not in result_text.split("\n")[-1].lower():
            # Yay! This is a relevant result!
            answer.results += 1

    # Calculate the score of the answer
    answer.score = answer.total_results * (answer.results if answer.results > 0 else 1)
    return f"{answer.get_text()[:40]:^40}{answer.score:<10}{answer.results:^10}{answer.total_results:<10}"
def test_user_with_permissions_follows_everyone(self):
    response = self.get_response()
    query = PyQuery(response.content)
    query = query("table#queryTable td.name").text()
    # Build a list (not a map object) so append() works on Python 3 too.
    names = [u.username for u in
             list(self.watcher.watches.all()) + [self.no_perms_user.user]]
    names.append("watcher")
    self.assertEqual(set(query.split()), set(names))
def test_followed_users_shows_correctly(self):
    response = self.get_response()
    query = PyQuery(response.content)
    query = query("table#queryTable td.name").text()
    # Build a list (not a map object) so append() works on Python 3 too.
    names = [u.username for u in self.watcher.watches.all()]
    names.append("watcher")
    self.assertEqual(set(query.split()), set(names))
def get_info(self, area):
    form = area('#buyform')
    pvid = form('#arypvid').attr('value')
    stock = form('#arystock').attr('value')
    text = form('#arysubtext').attr('value')
    price = PyQuery(form('#aryprice').attr('value') or 'None').text().replace(' ', '')
    origprice = PyQuery(form('#aryorigprice').attr('value') or 'None').text().replace(' ', '')

    # The form attributes are '|'-separated parallel lists.
    pvids = [x for x in pvid.split('|') if x]
    texts = [x for x in text.split('|') if x] or [self.cfg.DEFAULT_ONE_SIZE]
    prices = [re.search(r'(\d[\d.]*)', x.replace(',', '')).group(1)
              for x in price.split('|') if x]
    stocks = [x if int(x) else self.DEFAULT_STOCK_NUMBER
              for x in stock.split('|') if x]
    origprices = [re.search(r'(\d[\d.]*)', x.replace(',', '')).group(1)
                  for x in origprice.split('|') if x]

    sizes = [dict(zip(['sku', 'inventory', 'name', 'price', 'listPrice'], row))
             for row in zip(pvids, stocks, texts, prices, origprices)]

    price = max(prices)
    listPrice = max(origprices)

    if not sizes:
        raise ValueError('Failed to get size/price info')

    return price, listPrice, sizes
def get_content_text(content):
    # content = [s.extract() for s in content('style')]
    content_text = PyQuery(str(content)).text()
    # Normalize line endings, then drop empty lines and special characters.
    content_text = content_text.replace('\r\n', '\n').replace('\r', '\n')
    final_content_text = ''
    for each_text in content_text.split('\n'):
        each_final_text = remove_special_char(each_text).strip()
        if each_final_text != '':
            final_content_text += each_final_text + '\n'
    return final_content_text.strip()
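`remove_special_char` is not defined in this snippet. A minimal sketch of what such a helper commonly does (assumed behavior, not the original): strip zero-width characters and normalize non-breaking spaces that survive `PyQuery().text()`:

import re

def remove_special_char(text):
    # Hypothetical helper: drop zero-width characters and normalize the
    # non-breaking spaces that often remain after HTML-to-text extraction.
    text = re.sub(u'[\u200b\u200c\u200d\ufeff]', '', text)
    return text.replace(u'\xa0', ' ')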
def parse_page(self, page, link):
    doc = PyQuery(page)
    hrefs = doc.find('a[href]')
    for href in hrefs:
        href_attr = PyQuery(href).attr("href")
        # Drop the query string so the same path is not queued twice.
        href_no_qs = href_attr.split("?")[0]
        if href_no_qs not in self.queue:
            if not self.is_absolute(href_attr):
                # Relative link: resolve it against the current page first.
                self.queue.append(urljoin(link, href_no_qs))
                self.add_to_backlinks(link, urljoin(link, href_no_qs))
            else:
                self.queue.append(href_no_qs)
                self.add_to_backlinks(link, href_no_qs)
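`is_absolute` is referenced but not shown. A plausible one-liner, assuming it merely distinguishes absolute URLs (with a scheme and host) from relative ones (hypothetical implementation):

from urllib.parse import urlparse

def is_absolute(self, url):
    # Hypothetical: a URL is absolute when it has both a scheme and a host.
    parsed = urlparse(url)
    return bool(parsed.scheme and parsed.netloc)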
def parse_page(self, response):
    """
    Parse the pager and yield one request per page.
    :param response:
    :return:
    """
    city_name = response.meta.get('city_name')
    adname = response.meta.get('adname')
    # The second-to-last pager link points at the last page.
    end_page = PyQuery(response.body).find('.page').find('a').eq(-2).attr('href')
    dic = {"city_name": city_name, "adname": adname}
    if end_page:
        num = end_page.split('/')[-1].split('p')[-1]
        try:
            c_num, aid = num.split('?')
        except ValueError:
            c_num = num
            aid = ""
        head, head1, mid, end = end_page.split('p')
        for c_page in range(1, int(c_num) + 1):
            n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(c_page)
            if aid:
                n_page += '?' + aid
            create_sleep()
            yield scrapy.Request(url=n_page, meta=dic,
                                 callback=self.parse_area, headers=header1)
    else:
        create_sleep()
        yield scrapy.Request(url=response.url, meta=dic,
                             callback=self.parse_area, headers=header1)
def get_questions():
    with session() as s:
        login(s, register=True)
        r = browse(s, BASE_URL)
    pq = PyQuery(r)
    questions = []
    for question_div in pq('div.grid-question'):
        link = PyQuery(question_div).parent().attr('href')
        if '/play/' in link:
            questions.append(link.split('/play/')[-1])
    return sorted(questions, key=int)
def GetBtcMarketPrice():
    list1 = ['1', '2', '3', '4', '5', '6', '7']
    pricelist = []
    # marketRes = requests.get("http://www.feixiaohao.com/currencies/bitcoin/")
    marketRes = PyQuery(url="http://www.feixiaohao.com/currencies/bitcoin/")
    # xxx = marketRes("#markets").html()
    for data in marketRes("tr"):
        priceinfo = PyQuery(data).text().encode("utf-8")
        # Only rows numbered 1-7 of the markets table carry prices.
        if priceinfo[0] in list1:
            prices = priceinfo.split(" ")[3].replace("¥", "")
            prices = prices.replace(",", "")
            # float, not int: quoted prices usually carry decimals.
            pricelist.append(float(prices))
    return sum(pricelist) / len(pricelist)
def get_cloud_rate(scene_name):
    """Read the MTL file and return the cloud_rate of the scene."""
    sat = 'L%s' % scene_name[2]
    mtl_path = join(settings.MEDIA_ROOT, sat, scene_name, scene_name + '_MTL.txt')

    if isfile(mtl_path):
        # Local MTL file available: pull CLOUD_COVER straight from it.
        with open(mtl_path, 'r') as f:
            lines = f.readlines()
            cloud_rate = [float(line.split(' = ')[-1]) for line in lines
                          if 'CLOUD_COVER' in line][0]
        return cloud_rate
    else:
        # No local copy: scrape the value from the EarthExplorer metadata page.
        url_code = get_metadata_code(scene_name)
        metadata = PyQuery(
            'http://earthexplorer.usgs.gov/metadata/%s/%s/' % (url_code, scene_name)
        )
        metadata = metadata.text()[metadata.text().find('Cloud Cover '):]
        return float(metadata.split(' ')[2])
def verify_token(self):
    for x in range(5):
        try:
            WebDriverWait(self.driver, wait).until(
                EC.element_to_be_clickable((
                    By.CSS_SELECTOR,
                    "body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-setting > div > div.card-body.d-flex.flex-column > div.overflow-container.flex-grow-1 > ul > li")))
            HTML = self.driver.find_elements_by_css_selector(
                ".card-control")[0].get_attribute("innerHTML")
            Doc = PQ(HTML)
            Doc = Doc('.list-group-item-action').text()
            Doc = Doc.replace(" ", "\n")
            Doc = Doc.split("\n")
            # print(Doc)
            # Find the entry following the "表名" ("table name") label.
            path = Doc.index("表名")
            pathh = '//*[@id="dp_ads.' + Doc[path + 1] + '"]'
            self.driver.find_element_by_xpath(pathh).click()
            break
        except Exception:
            time.sleep(3)
    return "dp_ads." + Doc[path + 1]
def get_info_5(file_html):
    """ Extract the key details from the listing page. """
    text_pq = Pq(file_html)
    tr_list = text_pq('.normal-fir').find('.name')
    for tr_item in tr_list:
        td_list = Pq(tr_item).find('a')
        item_dict = {
            u'公司编号': td_list.attr('company_id'),  # company id
            u'公司名称': '',                          # company name
            u'公司链接': '',                          # company link
            u'职位名称': td_list.text(),              # job title
            u'职位链接': td_list.attr('href'),        # job link
            u'薪资待遇': '',                          # salary
            u'工作地点': '',                          # location
            u'工作经验': '',                          # experience
            u'最低学历': '',                          # minimum education
            u'招聘人数': '',                          # headcount
            u'公司规模': '',                          # company size
        }
        company = Pq(tr_item).find('.s-tit14.fl')
        item_dict[u'公司名称'] = company.text()
        introduce_list = Pq(tr_item).find('.s-butt.s-bb1 ul li')
        for introduce_item in introduce_list:
            item_text = Pq(introduce_item).text()
            item_list = item_text.split(': ')
            if len(item_list) < 2:
                continue
            key = item_list[0]
            value = item_list[1]
            item_dict[key] = value
        # Fetch the company's contact details.
        contact_dict = get_contact(item_dict[u'公司编号'])
        item_dict = dict(item_dict, **contact_dict)
        yield item_dict
    print('%s records on this page' % len(tr_list))
def parseDetail(url, place):
    logger.info(url)
    qDetail = PyQuery(url)
    place["pNmEng"] = qDetail(".dest_toptitle > div > div > p").remove('span').text().strip()
    try:
        place["pDesc"] = qDetail(".toggle_l:first > .text_style").html().strip()
    except Exception as ex:
        logger.error(ex)
    # mapSrc = qDetail(".s_sight_map > a > img").attr('src').split('%7C')[1].split('&')[0]
    # place["lng"] = mapSrc.split(',')[1]
    # place["lat"] = mapSrc.split(',')[0]
    place["lng"] = qDetail("#Lon").val()
    place["lat"] = qDetail("#Lat").val()
    ctypeAList = qDetail(".s_sight_con:first > a")
    place["viewTypes"] = []
    for element in ctypeAList[:3]:
        viewType = {}
        viewHref = PyQuery(element).attr("href")
        viewType["codeId"] = viewHref.split("/")[-1].split(".")[0].replace("s", "")
        viewType["codeName"] = PyQuery(element).text()
        place["viewTypes"].append(viewType)
    try:
        place["contactTel"] = PyQuery(qDetail(".s_sight_con")[2]).text().strip()
        place["website"] = PyQuery(qDetail(".s_sight_con")[3])("a").text()
    except Exception as ex:
        logger.error(ex)
    place["openHours"] = ""
    for element in qDetail("dt:contains('开放时间')").nextAll("dd"):  # '开放时间' = opening hours
        place["openHours"] += PyQuery(element).outerHtml()
    place["expense"] = ""
    for element in qDetail("dt:contains('门票信息')").nextAll("dd"):  # '门票信息' = ticket info
        place["expense"] += PyQuery(element).outerHtml()
    place["districtid"] = qDetail("#ctmdistrict").val()  # ID used to fetch images; same value as #JS_DistrictId
    place["resourceid"] = qDetail("#wentClickID").attr("dataresource-cat")  # ID used to fetch images
    place["totalImgCount"] = qDetail(".r_text").text().replace("全部", "").replace("张照片", "")  # image count, used when fetching images
    place["countryEngName"] = inputCountryJson["eName"]
    # place["countryChnName"] = PyQuery(qDetail("i.arrow")[1]).parent('a').text()  # country name in Chinese
    place["countryChnName"] = inputCountryJson["cName"]  # country name in Chinese
    place["cityEngName"] = qDetail("#EName").val()  # city name in English
def click_dataset(self, lan):
    # --- PyQuery -> XPath ---
    for x in range(5):
        try:
            HTML = self.driver.find_elements_by_css_selector(
                ".card-control")[0].get_attribute("innerHTML")
            Doc = PQ(HTML)
            Doc = Doc('.list-group-item-action').text()
            Doc = Doc.replace(" ", "\n")
            Doc = Doc.split("\n")
            # print(Doc)
            path = Doc.index(lan)
            pathh = '//*[@id="dp_ads.' + Doc[path + 1] + '"]'
            self.driver.find_element_by_xpath(pathh).click()
            break
        except Exception:
            time.sleep(3)
    # Verify against the dimension conditions shown on the page.
    WebDriverWait(self.driver, wait).until(
        EC.visibility_of_element_located((
            By.CSS_SELECTOR,
            "body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-fitler > div > div.card-body > h3:nth-child(3)")))
    check = self.driver.find_element_by_css_selector(
        "body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-fitler > div > div.card-body > h3:nth-child(3)").get_attribute("innerText")
    return check
def load_file_list(self):
    try:
        sl = self.db_id.lower()
        file_url = BASE_FILE_URL + sl + "/"
        txt = load_with_retry(file_url)
        pq = PyQuery(txt)
        pq = pq('a')
        files = []
        for a in pq[1:]:
            name = a.attrib['href'].split('/')[-1]
            if not name.lower().startswith(sl + self.file_prefix_delimiter):
                continue
            # The text node before each link holds "date time am/pm size".
            prvtxt = PyQuery(a).prev()[0].tail
            prvtxt = prvtxt.split()
            f = {
                "name": name[len(sl) + len(self.file_prefix_delimiter):],
                # %I (12-hour clock) pairs with %p; %H would silently ignore AM/PM.
                "modified": datetime.datetime.strptime(
                    prvtxt[0] + " " + prvtxt[1] + " " + prvtxt[2],
                    "%m/%d/%Y %I:%M %p"),
                "size": int(prvtxt[3])
            }
            files.append(f)
        return files
    except Exception:
        return []
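`load_with_retry` is assumed but not defined in this section. A minimal sketch of a retrying fetch (the name's behavior, retry count, and timeout are all assumptions):

import time
import requests

def load_with_retry(url, retries=3, delay=2.0):
    # Hypothetical helper: fetch a URL, retrying a few times before giving up.
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)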
def parse_transaction_tr(tr: PyQuery, domain: str) -> dict:
    """
    Parse one <tr> of the four transaction types on a platform-1 / platform-2 page.
    :param tr:
    :param domain:
    :return:
    """
    init_dict = dict()
    tds = tr.find("td")
    # Platform 1 and platform 2 are laid out differently.
    # NOTE: the original condition reads `domain == domain`, which is always
    # true; it presumably compared against the platform-1 domain constant.
    if domain == domain:
        # First td: order ticket and client account.
        first = PyQuery(tds[0])
        texts_1 = first.text().split("\n")
        ticket = int(re.search(r'\d{4,}', texts_1[0]).group())   # ticket number
        login = int(re.search(r'\d{6,}', texts_1[-1]).group())   # client account
        init_dict['ticket'] = ticket
        init_dict['login'] = login
        # Second td: English name and real name.
        second = PyQuery(tds[1])
        texts_2 = second.text().split("\n")
        nick_name = texts_2[0][4:].strip()
        real_name = texts_2[-1][5:].strip()
        init_dict['nick_name'] = nick_name
        init_dict['real_name'] = real_name
        # Third td: trade command and symbol.
        third = PyQuery(tds[2])
        texts_3 = third.text().split("\n")
        command = texts_3[0].lower()
        init_dict['command'] = command
        sys_val = domain
        print("domain = {}, command = {}, tds' length = {}".format(
            sys_val, command, len(tds)))
        init_dict['system'] = sys_val
        # print(ticket, command, texts_3)
        if command == "balance" or command == "credit":
            # Deposits/withdrawals and credits: the row has fewer tds.
            # Fourth: transaction time.
            eighth = PyQuery(tds[4]).text()
            the_time = get_datetime_from_str(eighth)
            init_dict['time'] = the_time
            # print("deposit/withdrawal time: {}".format(the_time))
            # Fifth: profit/loss.
            ninth = PyQuery(tds[5]).text()
            profit = re.search(r'[+-]?\d+\.?\d*', ninth)
            if profit is not None:
                profit = float(profit.group())
            init_dict['profit'] = profit
            # Sixth: spread.
            tenth = PyQuery(tds[6]).text()
            spread_profit = None
            try:
                spread_profit = float(tenth)
            except ValueError as e:
                print(e)
            finally:
                if spread_profit is not None:
                    init_dict['spread_profit'] = spread_profit
            # Seventh: comment.
            comment = None
            try:
                comment = PyQuery(tds[7]).text()
            except IndexError as e:
                print(e)
            finally:
                if comment is not None:
                    init_dict['comment'] = comment
            init_dict = {k: v for k, v in init_dict.items() if v is not None}
        else:
            # buy/sell case.
            symbol = ''
            if len(texts_3) > 1:
                symbol = texts_3[-1].lower()
            init_dict['symbol'] = symbol
            # Fourth td: lot size.
            fourth = PyQuery(tds[3])
            lot_find = re.search(r'\d+\.?\d*', fourth.text())
            if lot_find is None:
                lot = None
            else:
                lot = float(lot_find.group())
                if symbol == "hk50mini":
                    lot /= 10  # hk50mini lots are reported at 10x
            init_dict['lot'] = lot
            # Fifth: prices.
            fifth = PyQuery(tds[4])
            prices = fifth.text().split("\n")
            enter_price = float(re.search(r'\d+\.?\d*', prices[0]).group())   # open price
            exit_price = float(re.search(r'\d+\.?\d*', prices[-1]).group())   # close price
            init_dict['enter_price'] = enter_price
            init_dict['exit_price'] = exit_price
            # Sixth: stop-loss / take-profit.
            sixth = PyQuery(tds[5])
            stop = sixth.text().split("\n")
            stop_losses = float(re.search(r'\d+\.?\d*', stop[0]).group())     # stop loss
            take_profit = float(re.search(r'\d+\.?\d*', stop[-1]).group())    # take profit
            init_dict['stop_losses'] = stop_losses
            init_dict['take_profit'] = take_profit
            # Seventh: swap / commission.
            seventh = PyQuery(tds[6])
            seventh = seventh.text().split("\n")
            swap_match = re.search(r'[+-]?\d+\.?\d*', seventh[0])
            swap = float(swap_match.group()) if swap_match is not None else None
            commission_match = re.search(r'[+-]?\d+\.?\d*', seventh[-1])
            commission = float(commission_match.group()) if commission_match is not None else None
            init_dict['swap'] = swap
            init_dict['commission'] = commission
            # Eighth: trade times.
            eighth = PyQuery(tds[7]).text()
            eighth = eighth.split("\n")
            if command not in ["balance", "credit"]:
                open_time = get_datetime_from_str(eighth[0].split(":")[1])   # open time
                init_dict['open_time'] = open_time
                if eighth[-1].find("持仓中") != -1:
                    # "持仓中" = position still open; no close time yet.
                    pass
                else:
                    close_time_list = eighth[-1].split(":")
                    if len(close_time_list) > 1:
                        close_time = get_datetime_from_str(close_time_list[1])   # close time
                        init_dict['close_time'] = close_time
            # Ninth: profit/loss.
            ninth = PyQuery(tds[8]).text()
            profit = re.search(r'[+-]?\d+\.?\d*', ninth)
            if profit is not None:
                profit = float(profit.group())
            init_dict['profit'] = profit
            # Platform 1 has 11 columns (with spread); platform 2 has 10 (without).
            if len(tds) == 11:
                # Tenth: spread.
                tenth = PyQuery(tds[-2]).text()
                spread_profit = float(tenth)
                init_dict['spread_profit'] = spread_profit
            # Last: comment.
            init_dict['comment'] = PyQuery(tds[-1]).text()
    else:
        # Platform-2 parsing.
        # First td: order ticket and client account.
        first = PyQuery(tds[0])
        texts_1 = first.text().split("\n")
        ticket = int(re.search(r'\d{4,}', texts_1[0]).group())   # ticket number
        login = int(re.search(r'\d{6,}', texts_1[-1]).group())   # client account
        init_dict['ticket'] = ticket
        init_dict['login'] = login
        # Second td: English name and MT name.
        second = PyQuery(tds[1])
        texts_2 = second.text().split("\n")
        nick_name = texts_2[0][4:].strip()
        real_name = texts_2[-1][5:].strip()
        init_dict['nick_name'] = nick_name
        init_dict['real_name'] = real_name
        # Third td: trade command and symbol.
        third = PyQuery(tds[2])
        texts_3 = third.text().split("\n")
        command = texts_3[0].lower()
        init_dict['command'] = command
        sys_val = domain
        print("domain = {}, command = {}, tds' length = {}".format(
            sys_val, command, len(tds)))
        init_dict['system'] = sys_val
        # print(ticket, command, texts_3)
        if command == "balance" or command == "credit":
            # Deposits/withdrawals and credits: the row has fewer tds.
            # Fourth: transaction time.
            eighth = PyQuery(tds[4]).text()
            the_time = get_datetime_from_str(eighth)
            init_dict['time'] = the_time
            # print("deposit/withdrawal time: {}".format(the_time))
            # Fifth: profit/loss.
            ninth = PyQuery(tds[5]).text()
            profit = re.search(r'[+-]?\d+\.?\d*', ninth)
            if profit is not None:
                profit = float(profit.group())
            init_dict['profit'] = profit
            # Sixth: comment.
            comment = None
            try:
                comment = PyQuery(tds[-1]).text()
            except IndexError as e:
                print(e)
            finally:
                if comment is not None:
                    init_dict['comment'] = comment
            init_dict = {k: v for k, v in init_dict.items() if v is not None}
        else:
            # buy/sell case.
            symbol = ''
            if len(texts_3) > 1:
                symbol = texts_3[-1].lower()
            init_dict['symbol'] = symbol
            # Fourth td: lot size.
            fourth = PyQuery(tds[3])
            lot_find = re.search(r'\d+\.?\d*', fourth.text())
            if lot_find is None:
                lot = None
            else:
                lot = float(lot_find.group())
                if symbol == "hk50mini":
                    lot /= 10  # hk50mini lots are reported at 10x
            init_dict['lot'] = lot
            # Fifth: prices.
            fifth = PyQuery(tds[4])
            prices = fifth.text().split("\n")
            enter_price = float(re.search(r'\d+\.?\d*', prices[0]).group())   # open price
            exit_price = float(re.search(r'\d+\.?\d*', prices[-1]).group())   # close price
            init_dict['enter_price'] = enter_price
            init_dict['exit_price'] = exit_price
            # Sixth: stop-loss / take-profit.
            sixth = PyQuery(tds[5])
            stop = sixth.text().split("\n")
            stop_losses = float(re.search(r'\d+\.?\d*', stop[0]).group())     # stop loss
            take_profit = float(re.search(r'\d+\.?\d*', stop[-1]).group())    # take profit
            init_dict['stop_losses'] = stop_losses
            init_dict['take_profit'] = take_profit
            # The seventh td is empty on platform 2.
            # Eighth: trade times.
            eighth = PyQuery(tds[7]).text()
            eighth = eighth.split("\n")
            if command not in ["balance", "credit"]:
                open_time = get_datetime_from_str(eighth[0].split(":")[1])   # open time
                init_dict['open_time'] = open_time
                if eighth[-1].find("持仓中") != -1:
                    # "持仓中" = position still open; no close time yet.
                    pass
                else:
                    close_time_list = eighth[-1].split(":")
                    if len(close_time_list) > 1:
                        close_time = get_datetime_from_str(close_time_list[1])   # close time
                        init_dict['close_time'] = close_time
            # Ninth: profit/loss.
            ninth = PyQuery(tds[8]).text()
            profit = re.search(r'[+-]?\d+\.?\d*', ninth)
            if profit is not None:
                profit = float(profit.group())
            init_dict['profit'] = profit
            # Platform 1 has 11 columns (with spread); platform 2 has 10 (without).
            if len(tds) == 11:
                # Tenth: spread.
                tenth = PyQuery(tds[-2]).text()
                spread_profit = float(tenth)
                init_dict['spread_profit'] = spread_profit
            # Last: comment.
            init_dict['comment'] = PyQuery(tds[-1]).text()
    # Clean the dict: drop None values.
    init_dict = {k: v for k, v in init_dict.items() if v is not None}
    # Only keep the order types we track.
    if init_dict['command'] in ['balance', 'credit', 'buy', 'sell']:
        return init_dict
    else:
        return None
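`get_datetime_from_str` is called throughout `parse_transaction_tr` but never defined in this section. A minimal sketch, under the assumption the platform pages render timestamps like `2020.01.02 15:04:05` (both the behavior and the format string are guesses):

from datetime import datetime

def get_datetime_from_str(text, fmt='%Y.%m.%d %H:%M:%S'):
    # Hypothetical helper: parse a timestamp string from the platform pages.
    # The format is an assumption; adjust it to match the actual page output.
    return datetime.strptime(text.strip(), fmt)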
import os
import sys
from subprocess import call

import requests
from pyquery import PyQuery

url = 'http://www.vagrantup.com/downloads.html'
request = requests.get(url)
links = PyQuery(request.text)
foi = False
for link in links('a'):
    if PyQuery(link).attr('href')[-3:] == 'dmg':
        foi = True
        break
if foi:
    remotefilename = PyQuery(link).attr('href')
    filename = remotefilename.split('/')[-1]
    locfile = os.environ['HOME'] + '/Downloads/' + filename
    if os.path.isfile(locfile):
        print(locfile + ' is already downloaded')
    else:
        stream = requests.get(remotefilename, stream=True)
        with open(locfile, 'wb') as fd:
            print("getting " + filename)
            count = 0
            for chunk in stream.iter_content(4096):
                count += 1
                if count % 100 == 0:
                    sys.stdout.write('.')
                fd.write(chunk)
            print('done')
    print('Vagrant dmg file downloaded to ' + os.environ['HOME'] + '/Downloads')
def test_not_billable_projects_not_shown(self):
    response = self.test_client.get(reverse('client_projects'))
    query = PyQuery(response.content)
    query = query('table#queryTable td.name').text()
    self.assertNotIn('FakeProject3', set(query.split()))
def test_billing_types_for_each_project_shown(self):
    response = self.test_client.get(reverse('client_projects'))
    query = PyQuery(response.content)
    query = query('table#queryTable td.type').text()
    self.assertEqual(set(('HOUR', 'FIXED')), set(query.split()))
def test_all_billable_projects_shown(self):
    response = self.test_client.get(reverse('client_projects'))
    query = PyQuery(response.content)
    query = query('table#queryTable td.name').text()
    self.assertEqual(set(('FakeProject1', 'FakeProject2')), set(query.split()))
root = etree.HTML(html)
dl = root.xpath('//*[@class="overview"]//dl')
item_dict = {}
for i in dl:
    print(PyQuery(i).text())
    item = PyQuery(i).text()
    # Split on the first colon; the page mixes ASCII ':' and full-width '：'.
    if ":" in item:
        itemlist = item.split(':', 1)
    elif "：" in item:
        itemlist = item.split('：', 1)
    item_dict[itemlist[0]] = itemlist[1].strip()

select = root.xpath('//*[@id="shareholderInfo_wrapper"]//table//tbody/tr')
title = ['序号', '股东名称']  # index, shareholder name
gudong = {}
for i in select:
    gd = {}  # fresh dict per row so the entries don't alias each other
    for nom, j in enumerate(title):
        gd[j] = i.xpath(".//text()")[nom]
    gudong[gd['序号']] = gd
def scan_proxy_qiaodm():
    """
    Scan proxy resources.
    :return:
    """
    import requests
    from pyquery import PyQuery as Pq

    source_site = 'http://ip.qiaodm.com/'
    header = {
        'Host': 'ip.qiaodm.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
    }
    s = requests.session()

    # Fetch the page (pass the headers, which the original built but never used).
    file_html = s.get(source_site, headers=header).content

    # Save to a file
    # with open('test.html', 'a') as f:
    #     f.write(file_html.encode('utf-8'))
    #
    # Read the saved page back
    # with open('test.html', 'r') as f:
    #     file_html = f.read()

    text_pq = Pq(file_html)
    tr_list = text_pq('tbody').find('tr[style="text-align: center;"]')
    print('%s records on this page' % len(tr_list))
    for tr_item in tr_list:
        td_list = Pq(tr_item).find('td')
        field_list = []
        for td_item in Pq(td_list):
            field = Pq(td_item).text()
            field_list.append(field)

        # The IP cell is obfuscated with hidden <p> elements; strip them out.
        ip = Pq(td_list).eq(0).html()
        ip = html.replace_html(ip, r'<p style="display:none;"/>')
        ip = html.replace_html(ip, r'<p style="display: none;"/>')
        ip = html.replace_html(ip, r'<p style=.*?display:.*?none;.*?>.*?</p>')
        ip = html.strip_html(ip)
        # Skip malformed IP addresses.
        if len(ip.split('.')) != 4:
            continue

        # The port is encoded as a CSS class; look it up in PortDict.
        port_key = Pq(td_list).eq(1).attr('class').split()[1]
        if port_key not in PortDict:
            print('New port class found: %s' % port_key)
            continue
        port = PortDict.get(port_key, '')

        ProsyItem['Ip'] = ip.replace(' ', '')
        ProsyItem['Port'] = port
        ProsyItem['Type'] = field_list[2].strip()
        ProsyItem['AnonymousDegree'] = field_list[3].strip()
        ProsyItem['Area'] = field_list[4].strip()
        ProsyItem['Speed'] = field_list[5].strip()
        ProsyItem['ScanTime'] = field_list[6].strip()
        # print(ProsyItem)
        proxy_item = json.dumps(ProsyItem, ensure_ascii=False)
        html.save_file('proxy.json', proxy_item + '\n', 'a')
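The project-local `html` module used above (`replace_html`, `strip_html`, `save_file`) is not shown. A rough sketch of plausible regex-based equivalents (hypothetical, not the original module):

import re

def replace_html(text, pattern):
    # Hypothetical: remove every match of a tag pattern from the markup.
    return re.sub(pattern, '', text)

def strip_html(text):
    # Hypothetical: drop all remaining tags, keeping the text content.
    return re.sub(r'<[^>]+>', '', text)

def save_file(path, content, mode='a'):
    # Hypothetical: append content to a local file.
    with open(path, mode) as f:
        f.write(content)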
import sys
from pprint import pprint

from pyquery import PyQuery

import getCtripAllCity

'''
how to use
python3 getLineAllCities.py http://you.ctrip.com/sitelist/asia120001.html ./output/cities/asia
python3 getLineAllCities.py http://you.ctrip.com/sitelist/europe120002.html ./output/cities/europe
python3 getLineAllCities.py http://you.ctrip.com/sitelist/northamerica120004.html ./output/cities/northamerica
python3 getLineAllCities.py http://you.ctrip.com/sitelist/southamerica120005.html ./output/cities/southamerica
python3 getLineAllCities.py http://you.ctrip.com/sitelist/oceania120003.html ./output/cities/oceania
python3 getLineAllCities.py http://you.ctrip.com/sitelist/africa120006.html ./output/cities/africa
python3 getLineAllCities.py http://you.ctrip.com/sitelist/nanji120481.html ./output/cities/nanji

todo: Antarctic places must be fetched separately, since Antarctica has no countries
todo: some countries fall outside these 7 continents; confirm whether they should all be fetched
'''

''' Main '''
targetUrl = sys.argv[1]
outputDirectory = sys.argv[2]

qList = PyQuery(targetUrl)
for element in qList('.normalbox')('li > a'):
    countryUrl = PyQuery(element).attr('href').replace('/place', '/countrysightlist')
    targetJson = outputDirectory + "/" + countryUrl.split('/')[2].replace('.html', '') + ".json"
    countryUrl = "http://you.ctrip.com" + countryUrl
    # if (countryUrl == "http://you.ctrip.com/countrysightlist/southkorea100042.html"):
    pprint(countryUrl)
    pprint(targetJson)
    getCtripAllCity.main(countryUrl, targetJson)
    rawdata = myutils.ungzip(response)
    # print(rawdata)
    pquery = PyQuery(rawdata.decode('utf-8'))
    for li in pquery(".TreeList li"):
        self.pfolder = myutils.filenameCheck(PyQuery(li)("a").text())
        # Avoid clobbering an existing folder of the same name.
        while os.path.exists(os.path.join(self.root, self.pfolder)):
            self.pfolder = self.pfolder + "_2"
        try:
            os.mkdir(os.path.join(self.root, self.pfolder))
        except Exception as e:
            print("failed to create %s" % (os.path.join(self.root, self.pfolder)))
        else:
            # The onclick attribute looks like "fn('id','code','type','fileid')";
            # pull the arguments out of it.
            strParam = PyQuery(li)("a").attr('onclick')
            aParam = strParam.split('(')[1].strip(')').split(',')
            param = {}
            param["id"] = aParam[0].strip().strip("'")
            param["code"] = aParam[1].strip().strip("'") + "?"
            param["type"] = aParam[2].strip().strip("'")
            param["fileid"] = aParam[3].strip().strip("'")
            self.get_child_catalog(param)
            time.sleep(1)
    self.deal_error()

def get_child_catalog(self, param):
    url = "http://tongji.cnki.net/kns55/Navi/GetChildCatalog.aspx"
    req = urllib2.Request(url, urllib.urlencode(param), self.req_header)
    # print(req.get_full_url())
raw_html = r.text
pq = PQ(raw_html)

# Select just the set of elements that you want to extract
# Use CSS selectors!
ul = pq("ul")[1]
elements = PQ(ul).children()

# Once you have the elements identified, extract the text
texts = []
for el in elements:
    # extract text
    text = PQ(el).text()
    # Append to "texts" list
    texts.append(text)

# Split the text into year and name
data = []
for text in texts:
    # Split text into a tuple of the 2 data points
    split_vars = text.split()
    year = int(split_vars[0][:4])
    name = " ".join(split_vars[1:])
    tup = (year, name)
    # Append the tuple to the "data" list
    data.append(tup)

print(data)