def fetch_versions(self):
    if self.versions:
        return
    result = get(self.url)
    tables = result('.tabel95')
    self.title = tables.find('.titulo').contents()[0].strip()
    for i, table in enumerate(tables[2:-1:2]):
        trs = query(table)('tr')
        release = trs.find('.NewsTitle').text().partition(',')[0]
        release = re.sub('version ', '', release, 0, re.I)
        infos = trs.next().find('.newsDate').eq(0).text()
        infos = re.sub('(?:should)? works? with ', '', infos, 0, re.I)
        for tr in trs[2:]:
            tr = query(tr)
            language = tr('.language')
            if not language:
                continue
            completeness = language.next().text().partition(' ')[0]
            language = language.text()
            download = tr('a[href*=updated]') or tr('a[href*=original]')
            if not download:
                continue
            hearing_impaired = \
                bool(tr.next().find('img[title="Hearing Impaired"]'))
            download = download.attr.href
            self.add_version(download, language, release, infos,
                             completeness, hearing_impaired)
def run(self):
    while 1:
        print('Start crawling...')
        page_goods = []
        url = 'http://list.showjoy.com/search/?page={page}'.format(page=self.page)
        html = query(Sp(url).get('text'))
        containers = html('.brick-cover')
        # Get the maximum page number (only on the first pass)
        if self.max_count + self.max_page == 0:
            self.max_count = int(html('.highlight').text())
            self.max_page = int(self.max_count / 20) + 1
            print('Max page: ' + str(self.max_page))
        for g in containers:
            g = query(g)
            url = g('.brick-pic').eq(0).attr('href')
            img = g('.brick-pic').eq(0)('img').eq(0).attr('src')
            brand = {
                'name': g('.brand').eq(0)('img').eq(0).attr('alt'),
                'img': g('.brand').eq(0)('img').eq(0).attr('src')
            }
            title = g('.brick-title').text().strip()
            # Convert the price to a number
            price = g('.price').text().strip()
            price = float(re.findall(r'¥(\d+\.\d+)', price)[0])
            # Convert the sales count to a number
            sales = g('.sales').text().strip()
            sales = int(re.findall(r'最近成交(\d+)笔', sales)[0])
            page_goods.append({
                'url': url,
                'img': img,
                'brand': brand,
                'title': title,
                'price': price,
                'sales': sales
            })
            print('{title}, price: {price}, sales: {sales}'.format(title=title, price=price, sales=sales))
        print('Page {page}: {count} items'.format(page=self.page, count=len(page_goods)))
        self.goods += page_goods
        self.page += 1
        if self.page > self.max_page:
            self.end_time = time.time()
            break
    print('All items fetched. Total pages: {max_page}, total items: {count}, elapsed: {times}'.format(
        max_page=self.max_page,
        count=len(self.goods),
        times=self.end_time - self.start_time,
    ))
def __extractWeight(html):
    weight = {}
    try:
        query_site_info = query(html)('div.siteinfo')
        html_font = query_site_info('font')
        keys = ['baidu_weight', 'key_count', 'net_flow', 'place']
        for i in range(html_font.size()):
            weight[keys[i]] = query(html_font[i]).text()
    except Exception, e:
        print e
    return weight
def parse_next_page_href(html):
    next_page_list = []
    try:
        query_page_a_s = query(html)('#nav a.fl')
        for item in query_page_a_s:
            next_page_list.append(
                'https://www.google.com.hk' + query(item).attr('href'))
    except Exception, e:
        print 'DataParser: parse next page info failed'
        print e
    return next_page_list
def __extract_img_info(item):
    item_json = {}
    try:
        query_img_div = query(item)
        query_a_img = query_img_div('.r a')
        query_summary_span = query_img_div('.s')
        if query_a_img is not None:
            img_title = query_a_img.text()
            img_web_url = query_a_img.attr('href')
            img_thumnail_src = query_img_div('img').attr('src')
            img_original_src = query_img_div('div.s')('a').attr('href')
            # The original image URL is carried in the "imgurl" query parameter
            img_original_src = urlparse.parse_qs(
                urlparse.urlparse(img_original_src).query, True)['imgurl'][0]
            item_json['img_web_url'] = img_web_url
            item_json['img_title'] = img_title
            item_json['img_thumnail_src'] = img_thumnail_src
            item_json['img_original_src'] = img_original_src
            item_json['img_weight'] = 1
            item_json['img_datetime'] = '2015.05.01'
    except Exception, e:
        print e
    return item_json
def process_goods(self, html):
    """
    Clean up the fetched product data and return a list of products
    :param html:
    :return:
    """
    goods = []
    d_goods = html('.gl-item')
    # Parse the product data
    for d_good in d_goods:
        d_good = query(d_good)
        good = {
            'id': d_good.attr('data-sku'),
            'title': d_good('.p-name > a').text(),
            'price': d_good('.p-price > strong > i').text(),
            'url': 'https:' + d_good('div.p-img > a').attr('href')
        }
        # Product cover image
        icon = d_good('.p-img > a > img').attr('source-data-lazy-img')
        if not icon or icon == 'done':
            icon = d_good('.p-img > a > img').attr('src')
        good.setdefault('icon', 'https:' + icon)
        goods.append(good)
    return goods
def parse_item(i):
    d = {}
    q = query(i)
    d['lat'] = float(q.attr('data-latitude')) if len(q.attr('data-latitude')) > 1 else ''
    d['lng'] = float(q.attr('data-longitude')) if len(q.attr('data-longitude')) > 1 else ''
    d['postdata'] = q.find('.itemdate').text()
    d['price'] = q.find('.itemph').text().split(' ')[0]
    d['title'] = q.find('a').text()
    d['link'] = q.find('a').attr('href')
    d['district'] = ''
    district = q.find('.itempn').find('font').text()
    if district is not None and district != "":
        for dis in districts.keys():
            if dis in district:
                d['district'] = districts[dis]
                break
    # Resolve the district from lat/lng bounding boxes when coordinates are available
    for k in validationDistrict.keys():
        r = validationDistrict[k]
        if r[0] > d['lat'] and d['lat'] > r[2]:
            if r[1] < d['lng'] and d['lng'] < r[3]:
                d['district'] = districts[k]
    return d if d['district'] != '' else None
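# parse_item() reads two module-level lookup tables that the snippet does not show.
# A minimal sketch of the shapes it appears to expect; the keys and coordinates below
# are purely hypothetical and only illustrate the (lat_north, lng_west, lat_south, lng_east)
# ordering implied by the comparisons r[0] > lat > r[2] and r[1] < lng < r[3].
districts = {
    'mission': 'Mission',   # substring seen in the listing text -> canonical name
    'soma': 'SoMa',
}
validationDistrict = {
    'mission': (37.770, -122.426, 37.748, -122.405),  # illustrative bounding box
    'soma': (37.790, -122.412, 37.770, -122.387),     # illustrative bounding box
}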
def fetch_versions(self):
    if self.versions:
        return
    result = self._page or session.get(self.url)
    tables = result('.tabel95')
    self.title = tables.find('.titulo').contents()[0].strip()
    for i, table in enumerate(tables[2:-1:2]):
        trs = query(table)('tr')
        release = encode(trs.find('.NewsTitle').text().partition(',')[0])
        release = re.sub('version ', '', release, 0, re.I)
        infos = encode(trs.next().find('.newsDate').eq(0).text())
        infos = re.sub('(?:should)? works? with ', '', infos, 0, re.I)
        for tr in trs[2:]:
            tr = query(tr)
            language = tr('.language')
            if not language:
                continue
            completeness = encode(language.next().text().partition(' ')[0])
            language = encode(language.text())
            download = tr('a[href*=updated]') or tr('a[href*=original]')
            if not download:
                continue
            hearing_impaired = \
                bool(tr.next().find('img[title="Hearing Impaired"]'))
            url = encode(download.attr.href)
            favorite = tr('a[href*="saveFavorite"]')[0].attrib['href']
            id, language_id, version = \
                re.search(r'(\d+),(\d+),(\d+)', favorite).groups()
            self.add_version(
                id=id,
                language_id=language_id,
                version=version,
                url=url,
                language=language,
                release=release,
                infos=infos,
                completeness=completeness,
                hearing_impaired=hearing_impaired,
            )
def get_max_page(self):
    """
    Get the last page number of the product search results
    :return:
    """
    url = 'http://search.jd.com/Search?keyword=' + self.keyword
    max_page = query(spider(url).get('text'))('#J_topPage > span > i').text()
    print('Max page: ' + max_page)
    return max_page
def get_cards():
    cards = []
    page = 1
    start_time = time()
    while 1:
        params = {
            'resourceType': 1,  # 0 = software ranking, 1 = game ranking
            'page': page
        }
        ret = spider('http://www.wandoujia.com/api/top/more', params=params, fields='json')
        json_data = ret.get('json')
        content = unpack_dict(json_data, 'data.content')
        # content is empty once past the last page
        if not content:
            end_time = time()
            print('Crawl finished. Total pages: {page}, total games: {count}, elapsed: {times}'.format(
                page=page - 1, count=len(cards), times=end_time - start_time))
            return cards
        document = query(content)
        cards_dom = document('.card')
        for card_dom in cards_dom:
            # Game name, download count, icon and download URL
            # (the download URL requires the Wandoujia client to be installed)
            card_dom = query(card_dom)
            download_btn = card_dom('a.i-source.install-btn')
            name = download_btn.attr('data-name')
            downloads = download_btn.attr('data-install')
            icon = download_btn.attr('data-app-icon')
            url = download_btn.attr('href')
            cards.append({
                'name': name,
                'downloads': downloads,
                'icon': icon,
                'url': url
            })
        print('Finished page {page}, running total: {count}'.format(page=page, count=len(cards)))
        page += 1
def parse_img_info(html):
    print 'parse_img_info()'
    result_list = []
    try:
        query_img_div_s = query(html)('.rc')
        if query_img_div_s is None:
            return result_list
        for item in query_img_div_s:
            result_list.append(DataParser.__extract_img_info(item))
    except Exception, e:
        print 'DataParser: parse img info failed'
        print e
    return result_list
def main(sc):
    q = query(findapturl)
    print 'starting: %s' % (q('p').eq(0).find('.itemph').text())
    geo = q('*[data-latitude]').filter(lambda i: query(this).attr('data-latitude') != '')
    district = q('p').filter(lambda i: query(this).find('.itempn').each(in_district))
    results = []
    resultNodes = geo + district
    for item in resultNodes:
        obj = parse_item(item)
        if obj:
            results.append(obj)
    f = open(''.join([os.path.expanduser('~'), '/.craigslist.json']), 'r+')
    for item in results:
        if len(item['link']) > 1 and item['link'] not in data:
            print item['price']
            try:
                cost = int(item['price'].replace('$', ''))
            except:
                continue
            if cost < maxSMSRent:
                shortened = requests.get('http://is.gd/create.php?format=simple&url=%s' % (item['link']))
                # notify('%s [%s] %s %s' % (item['price'], item['district'], shortened.text, item['title']))
                page = requests.get(item['link'])
                email(page.text, item['link'], '*****@*****.**')
            item['added'] = datetime.datetime.now()
            data[item['link']] = item
    f = open(''.join(crapy_JSON_db), 'w')
    f.write(json.dumps(data, indent=4, default=date_encoder))
    f.close()
    sc.enter(sleeptime, 1, main, (sc,))
def get_page_goods(self, page):
    page = page * 2 - 1  # JD's search results are fetched in two requests per page
    url = 'http://search.jd.com/s_new.php'
    # Fetch the top half of the products
    params = {
        "keyword": self.keyword,
        "page": page,
        "click": "0",
        "enc": "utf-8",
        "qrst": "1",
        "rt": "1",
        "s": "110",
        "stop": "1",
        "vt": "2"
    }
    html = query(spider(url, params=params).get('text'))
    goods = self.process_goods(html)
    # Fetch the bottom half of the products
    params = {
        "keyword": self.keyword,
        "show_items": ','.join([g.get('id') for g in goods]),
        "enc": "utf-8",
        "page": "2",
        "log_id": "1510505434.63851",
        "qrst": "1",
        "rt": "1",
        "s": "28",
        "scrolling": "y",
        "stop": "1",
        "tpl": "2_M",
        "vt": "2"
    }
    html = query(spider(url, params=params).get('text'))
    goods.extend(self.process_goods(html))
    print('Page {page}: {count} products'.format(page=page, count=len(goods)))
    return goods
def get_recommend_goods(self, page, keyword):
    """
    Get the featured products (left column)
    :param page:
    :param keyword:
    :return:
    """
    goods = []
    url = 'http://x.jd.com/Search'
    params = {
        "page": page,
        "keyword": keyword,
        "_": time.time() * 1000,
        "adType": "7",
        "ad_ids": "291:20",
        "area": "1",
        "callback": "jQuery5618052",
        "enc": "utf-8",
        "xtest": "new_search"
    }
    ret = spider(url, fields='json', params=params, debug=True)
    if ret.get('code') == 200:
        # Parse the product data and store it in a normalized form
        json_data = ret.get('json')
        json_goods = json_data.get('291', [])
        for json_good in json_goods:
            good = {
                'id': json_good.get('sku_id'),  # reviews are fetched by this ID
                'title': query(json_good.get('ad_title')).text(),
                'icon': 'http://img1.360buyimg.com/n1/' + json_good.get('image_url'),
                'price': json_good.get('pc_price'),
                'url': json_good.get('click_url')
            }
            # Fetch product reviews
            good.setdefault('comments', self.get_comments(good.get('id')))
            goods.append(good)
            print('Fetched product: ' + good.get('title'))
        else:
            print('Finished fetching products on page {page}'.format(page=page))
    else:
        print('Error fetching products: ' + str(ret.get('err')))
    return goods
def get_one_page(url: str):
    """
    Save the article at $url as both html and md.
    Even with the cleaning step at the end, the final formatting still needs
    some manual tweaking (at least for a perfectionist like me).
    """
    assert url.strip(), "Invalid url!"
    headers = {
        "user-agent": Faker().chrome(),
    }
    if url[-1] == "/":
        url = url[:-1]
    page_id = url.split("/")[-1]
    if not os.path.exists(f"files/{page_id}.html"):
        print(f"Retrieving {url}...")
        r = requests.get(url, headers=headers)
        doc = query(r.text)
        print(f"Saving the article to files/{page_id}.html...")
        with open(f"files/{page_id}.html", "w", encoding="utf-8") as f:
            f.write(doc("article").html())
    print("Converting html to md...")
    os.system(f"pandoc -o files/{page_id}.md files/{page_id}.html")
    print("Cleaning the md...")
    with open(f"files/{page_id}.md") as f:
        lines = f.readlines()
    trash_dots_lines = []
    for idx, line in enumerate(lines):
        if line.startswith(":::"):
            trash_dots_lines.append(idx)
        if re.search(r"{#.*?}", line):
            lines[idx] = re.sub(r"{#.*?}", "", line)
    for x in trash_dots_lines[::-1]:
        lines.pop(x)
    text = re.sub(r"{\..*?}", "", "".join(lines), flags=re.S)
    while re.search(r"{\..*?}", text, flags=re.S):
        text = re.sub(r"{\..*?}", "", text, flags=re.S)
    text = re.sub(r"<div>.*?</div>", "", text, flags=re.S)
    text = text.replace("\\\n", "").replace("\n\n\n", "\n\n").replace("\n\n\n", "\n\n")
    with open(f"files/{page_id}.md", "w", encoding="utf-8") as f:
        f.write(text)
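# A minimal usage sketch for get_one_page(), assuming pandoc is on the PATH and
# that the files/ directory the function writes into must already exist.
# The article URL below is a placeholder, not one from the original project.
import os

os.makedirs("files", exist_ok=True)          # get_one_page() writes files/<page_id>.html and .md
get_one_page("https://example.com/posts/12345")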
class Users(pymodel.BaseModel):
    def __init__(self):
        self.name = UserInfo()


class Employees(pymodel.BaseModel):
    def __init__(self):
        import datetime
        self.set_model_name("employees")
        self.firstname = str
        self.lastname = str
        self.bithdate = datetime.datetime


employees = Employees()
qr = pyquery.query(employees.get_model_name())
qr.project(employees.firstname, employees.lastname)
print qr.pipeline
import pymongo
from pymongo import MongoClient

import pyfuncs
import pydocs
import pyquery

Fields = pydocs.Fields()

cnn = MongoClient(host="localhost", port=27017)
db = cnn.get_database("hrm")
db.authenticate(name="root", password="******")

qr = pyquery.query(db, "test.coll001")
qr = qr.where(pyfuncs.regex(Fields.fx, "^312313$"))

import pprint
x = list(qr.objects)
pprint.pprint(list(qr.items))
def get_page_content(page):
    items = []
    domain = 'https://www.qiushibaike.com'
    # Collect the links to the detail pages
    url = 'https://www.qiushibaike.com/8hr/page/{0}/'.format(page)
    ret = spider(url)
    text = ret.get('text')
    index_document = query(text)
    articles = index_document('.article')
    for article in articles:
        href = query(article)('a.contentHerf').attr('href')
        items.append({'url': domain + href})
    print('Page {page}: collected {count} links'.format(page=page, count=len(items)))
    if len(items) == 0:
        print(ret)
    for index, item in enumerate(items):
        for i in range(0, 2):
            # Visit the detail page to get the content and comments
            text = spider(item['url']).get('text')
            document = query(text)
            # Content
            content = document('#single-next-link > div').text()
            if not content:
                print('Fetch failed, retrying. Progress: {index}/{maxlength}'.format(
                    index=index + 1, maxlength=len(items)))
                continue
            # Content image
            img_href = document('#single-next-link > div.thumb > img').attr(
                'src') or ''
            if img_href:
                img_href = 'https:' + img_href
            # Comments
            comments = []
            comments_dom = document('.comment-block > div.replay > span.body')
            for span in comments_dom:
                comments.append(query(span).text())
            item.update({
                'content': content,
                'img_href': img_href,
                'comments': comments
            })
            print('Fetching page {page}, progress: {index}/{maxlength}'.format(
                page=page, index=index + 1, maxlength=len(items)))
            break
    print('Page {page} done'.format(page=page))
    if page == 1:
        max_page = int(
            index_document(
                '#content-left > ul > li:nth-child(7) > a > span').text())
        print('Max page: ' + str(max_page))
        return max_page, items
    return items
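# A hedged driver sketch for get_page_content(): page 1 returns (max_page, items),
# later pages return only the item list. The variable names here are illustrative.
max_page, all_items = get_page_content(1)
for p in range(2, max_page + 1):
    all_items.extend(get_page_content(p))
print('Total items: {count}'.format(count=len(all_items)))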
def get(url, raw=False, **params):
    global last_url
    url = urljoin(last_url, url)
    request = requests.get(url, headers={'Referer': last_url}, params=params)
    last_url = url
    return request.content if raw else query(request.content)
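# A minimal usage sketch for the helper above, assuming `query` is pyquery.PyQuery
# and the module-level `last_url` is seeded with the site root before the first call.
# The example URL and path are placeholders, not from the original project.
import requests
from urllib.parse import urljoin      # Python 3; on Python 2 this comes from urlparse
from pyquery import PyQuery as query

last_url = 'https://example.com/'     # base for the first urljoin() and initial Referer

doc = get('subs/recent')              # relative path resolved against last_url; returns a PyQuery doc
print(doc('a').eq(0).attr('href'))    # subsequent calls send the previous page as Referer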