def render_document(vnode, expressions, context): for expression in expressions: evaluation = evaluate_expression(expression, context) node = expression.get('node') if isinstance(expression.get('value'), basestring) and expression.get('value') == evaluation: continue expression['value'] = evaluation if expression.get('type') == 'each': if expression.get('parent'): parent = expression.get('parent') else: parent = node.parent() expression['parent'] = parent riot_id = node.attr['data-riot-id'] original_children = parent.children('[data-riot-id="%s"]' % riot_id) # 0. add placeholder placeholder = PyQuery('<text></text>') placeholder.insertBefore(original_children.eq(0)) # 1. remove children original_node = original_children.clone() original_children.remove() expression['node'] = original_node # 2. insert children loopcontext = {} loopcontext.update( context if isinstance(context, dict) else vars(context)) expressions_col = [] for loop_index, item in enumerate(evaluation): loopcontext.update( item if isinstance(item, dict) else vars(item)) loopcontext['loopindex'] = loop_index child_node = PyQuery(expression.get('impl')) child_node.attr['data-riot-loopindex'] = str(loop_index) expressions = parse_document_expressions(child_node) expressions_col.append((expressions, loopcontext)) render_document(vnode, expressions, loopcontext) child_node.insertBefore(placeholder) # 3. remove placeholder if len(evaluation) == 0: placeholder.attr['data-riot-id'] = str(riot_id) else: placeholder.remove() mark_dirty(parent) generate_widget(parent) for expressions, loopcontext in expressions_col: connect_signals(vnode, expressions, loopcontext) continue if expression.get('type') == 'markup': node.attr['markup'] = json.dumps(evaluation) node.html('') mark_dirty(node) continue if expression.get('type') == 'attribute': attribute = expression.get('attribute') node.attr[attribute] = str(evaluation) mark_dirty(node) continue
def tweetPaser(tweets_html): tweetslist = [] if tweets_html.strip() != '': scraped_tweets = PyQuery(tweets_html) scraped_tweets.remove('div.withheld-tweet') tweets = scraped_tweets('div.js-stream-tweet') if len(tweets) != 0: for tweet_html in tweets: t = {} tweetPQ = PyQuery(tweet_html) t['user'] = tweetPQ("span:first.username.u-dir b").text() txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text()) txt = txt.replace('# ', '#') txt = txt.replace('@ ', '@') t['tweet'] = txt t['id'] = tweetPQ.attr("data-tweet-id") t['retweets'] = int( tweetPQ( "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) t['favorites'] = int( tweetPQ( "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) t['link'] = 'https://twitter.com' + tweetPQ.attr( "data-permalink-path") t['mentions'] = re.compile('(@\\w*)').findall(t['tweet']) t['hashtags'] = re.compile('(#\\w*)').findall(t['tweet']) t['timestamp'] = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) tweetslist.append(t) return tweetslist
def getTweets(users, word, lastpost): try: query = '' if word.strip() != '': query += word if len(users) == 1: query += ' from:' + users[0] elif len(users) > 1: query += ' from:' + ' OR from:'.join(users) query = urllib.parse.quote_plus(query) url = 'https://twitter.com/i/search/timeline?f=tweets&q={query}&src=typd'.format( query=query) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0', 'Accept': "application/json, text/javascript, */*; q=0.01", 'Accept-Language': "de,en-US;q=0.7,en;q=0.3", 'X-Requested-With': "XMLHttpRequest", 'Referer': url, 'Connection': "keep-alive" } response = requests.get(url, headers=headers) statuscode = response.status_code tweetslist = [] new_tweets = [] res = response.json() if statuscode == 200: json_response = response.json() if json_response['items_html'].strip() != '': scraped_tweets = PyQuery(json_response['items_html']) scraped_tweets.remove('div.withheld-tweet') tweets = scraped_tweets('div.js-stream-tweet') if len(tweets) != 0: for tweet_html in tweets: t = {} tweetPQ = PyQuery(tweet_html) t['user'] = tweetPQ( "span:first.username.u-dir b").text() txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text()) txt = txt.replace('# ', '#') txt = txt.replace('@ ', '@') t['tweet'] = txt t['id'] = tweetPQ.attr("data-tweet-id") t['link'] = tweetPQ.attr("data-permalink-path") t['timestamp'] = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) tweetslist.append(t) for tw in tweetslist: if tw['id'] == lastpost['id']: break if 'timestamp' in tw.keys() and 'timestamp' in lastpost.keys(): if tw['timestamp'] < lastpost['timestamp']: break new_tweets.append(tw) return new_tweets, statuscode except: return [], -1
def getTweets(userName, csv, id, proxy=None):
    '''
    Get tweet information from twitter.com
    :param userName: the name of the twitter account
    :param csv: the file to write to
    :param proxy: web proxy
    :return: void
    '''
    e_cursor = ''
    e_cursor_previous = 'none'
    extractedFormattedTweetInfo = []
    cookieJar = http.cookiejar.CookieJar()
    while e_cursor != e_cursor_previous:
        # Pretend to be a human reading an html page and get the current page back as json
        jsonTweet = HtmlHandler.getJsonReponse(userName, e_cursor, cookieJar, proxy, id)
        if len(jsonTweet['items_html'].strip()) == 0:
            break
        # Advance the cursor through the pages
        e_cursor_previous = e_cursor
        e_cursor = jsonTweet['min_position']
        tweets = PyQuery(jsonTweet['items_html'])
        tweets.remove('div.withheld-tweet')
        tweets = tweets('div.js-stream-tweet')
        if len(tweets) == 0:
            break
        for tweetPiece in tweets:
            tweetPQ = PyQuery(tweetPiece)
            tweet = Tweet()
            # Extract the corresponding information from the html
            tweet.username = id
            tweet.date = datetime.datetime.fromtimestamp(
                int(tweetPQ("small.time span.js-short-timestamp").attr("data-time")))
            tweet.tweetid = tweetPQ.attr("data-tweet-id")
            tweet.authorid = int(tweetPQ("a.js-user-profile-link").attr("data-user-id"))
            tweet.text = re.sub(r"\s+", " ",
                                tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'))
            tweet.retweets = int(
                tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount")
                .attr("data-tweet-stat-count").replace(",", ""))
            tweet.favorites = int(
                tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount")
                .attr("data-tweet-stat-count").replace(",", ""))
            tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
            tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
            tweet.permalink = 'https://twitter.com' + tweetPQ.attr("data-permalink-path")
            if len(tweetPQ('span.Tweet-geo')) > 0:
                tweet.geo = tweetPQ('span.Tweet-geo').attr('title')
            else:
                tweet.geo = ''
            extractedFormattedTweetInfo.append(tweet)
            print(" Progress: ", end='')
            print(len(extractedFormattedTweetInfo), end='')
            print(" tweets extracted from html.", end='\r')
    # Write what was extracted from the html pages into the csv file
    HtmlHandler.writeToCSV(extractedFormattedTweetInfo, csv)
def parse_json(search_params): """ Parse the json tweet :param search_params: SearchParams object :return: void """ min_position = get_last_search_position(search_params.log_file_name) count = 0 while True: json_res = get_tweets(search_params, min_position) if len(json_res['items_html'].strip()) == 0: break min_position = json_res['min_position'] search_params.logging.info('min_pos - {}'.format(min_position)) item = json_res['items_html'] scraped_tweets = PyQuery(item) scraped_tweets.remove('div.withheld-tweet') tweets = scraped_tweets('div.js-stream-tweet') for tweet_html in tweets: print(count) tweet_py_query = PyQuery(tweet_html) name = tweet_py_query.attr("data-name") screen_name = tweet_py_query.attr("data-screen-name") tweet_id = tweet_py_query.attr("data-tweet-id") tweet_text = re.sub( r"\s+", " ", tweet_py_query("p.js-tweet-text").text().replace( '# ', '#').replace('@ ', '@')) tweet_date_time = int( tweet_py_query("small.time span.js-short-timestamp").attr( "data-time")) tweet_date_time = datetime.datetime.fromtimestamp(tweet_date_time) retweet_count = int( tweet_py_query( "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) favorites_count = int( tweet_py_query( "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) permalink = 'https://twitter.com' + tweet_py_query.attr( "data-permalink-path") tweet = Tweet(str(uuid.uuid4()), name, screen_name, tweet_id, tweet_text, tweet_date_time, retweet_count, favorites_count, permalink) # Now Write to OP or save to DB write_op(search_params.op, tweet) count += 1 # sleep(5) if 0 < search_params.max_retrieval_count <= count: break
def _compute_signature(self): with file_open("partner_communication_switzerland/static/html/signature.html")\ as tfile: template = PyQuery(tfile.read()) phone = { "fr_CH": "+41 (0)24 434 21 24", "de_DE": "+41 (0)31 552 21 21", "it_IT": "+41 (0)31 552 21 24", "en_US": "+41 (0)31 552 21 25" } phone_link = { "fr_CH": "+41244342124", "de_DE": "+41315522121", "it_IT": "+41315522124", "en_US": "+41315522125" } facebook = { "fr_CH": "https://www.facebook.com/compassionsuisse/", "de_DE": "https://www.facebook.com/compassionschweiz/", "it_IT": "https://www.facebook.com/compassionsvizzera/", "en_US": "https://www.facebook.com/compassionsuisse/" } for user in self: values = { "user": user, "name": f"{user.preferred_name} {user.lastname}" if user.firstname else _("The team of Compassion"), "email": user.email if user.firstname else "*****@*****.**", "lang": self.env.lang, "lang_short": self.env.lang[:2], "team": _("and the team of Compassion") if user.firstname else "", "office_hours": _("mo-thu: 8am-4pm<br/>fri 8am-12am"), "company_name": user.company_id.address_name, "phone_link": phone_link.get(self.env.lang), "phone": phone.get(self.env.lang), "facebook": facebook.get(self.env.lang), } if self.env.lang in ("fr_CH", "en_US"): template.remove("#bern") else: template.remove("#yverdon") user.signature = template.html().format(**values)
def _get_tweet_batch_html(config, refresh_cursor, cookie_jar, proxy): """Scraper Identifies the tweets portion from the html json file """ json = _get_json_response(config, refresh_cursor, cookie_jar, proxy) if len(json["items_html"].strip()) == 0: return refresh_cursor = json["min_position"] scraped_tweets = PyQuery(json["items_html"]) # Remove incomplete tweets withheld by Twitter Guidelines scraped_tweets.remove("div.withheld-tweet") return scraped_tweets("div.js-stream-tweet")
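# --- Usage sketch (not from the original sources) ---
# How the node list returned by a helper like _get_tweet_batch_html is consumed
# by the scrapers below. The items_html fragment is invented; real Twitter
# markup is far richer.
from pyquery import PyQuery

items_html = """
<div class="js-stream-tweet" data-tweet-id="1" data-screen-name="alice">
  <p class="js-tweet-text">hello world</p>
</div>
<div class="withheld-tweet js-stream-tweet" data-tweet-id="2"></div>
"""
scraped = PyQuery(items_html)
scraped.remove("div.withheld-tweet")   # drop tweets withheld under Twitter Guidelines
for tweet_node in scraped("div.js-stream-tweet"):
    tweet_pq = PyQuery(tweet_node)     # wrap the bare lxml element again
    print(tweet_pq.attr("data-tweet-id"), tweet_pq("p.js-tweet-text").text())
# -> 1 hello world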
def __processImageTag(self, i, e): obj = PyQuery(e) style = obj.attr('style') if style != None and style.find('display: none') != -1: obj.remove() return newObj = PyQuery("<img />") newObj.attr('src', obj.attr('rel:bf_image_src')) newObj.attr('style', obj.attr('style')) newObj.width(obj.width()) newObj.height(obj.height()) obj.replaceWith(newObj)
def pq_remove_nodes( pq: PyQuery, css_remove: Union[str, list], ) -> PyQuery: pq = pq.clone() if isinstance(css_remove, str): css_remove = [css_remove] for remove_node in css_remove: pq.remove(remove_node) return pq
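# --- Usage sketch (not from the original source) ---
# Minimal example of pq_remove_nodes; markup and selectors are invented.
# The helper clones before removing, so the original document is untouched.
from pyquery import PyQuery

page = PyQuery("<div><nav>menu</nav><p>body text</p><span class='ad'>buy now</span></div>")
cleaned = pq_remove_nodes(page, ["nav", "span.ad"])
print(cleaned.text())    # -> "body text"
print(len(page("nav")))  # -> 1: the original still has its <nav> node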
def get_tweets_for_input(tweet_criteria, query_metadata, should_query_for_city, receive_buffer=None): print("Fetching results for :: Query : {0} Language : {1} City : {2} Range : {3} From: {4} Till: {5}".format( tweet_criteria['query'], tweet_criteria['language'], tweet_criteria['near'], tweet_criteria['within'], tweet_criteria['since'], tweet_criteria['until'])) buffer_length = 100 refresh_cursor = '' results = [] results_aux = [] active = True while active: try: tweet_response_json = Tc.getJsonReponse(tweet_criteria, should_query_for_city, refresh_cursor) except Exception as e: tweet_response_json = None break if len(tweet_response_json['items_html'].strip()) == 0: break refresh_cursor = tweet_response_json['min_position'] scraped_tweets = PyQuery(tweet_response_json['items_html']) # Remove incomplete tweets withheld by Twitter Guidelines scraped_tweets.remove('div.withheld-tweet') tweets = scraped_tweets('div.js-stream-tweet') if len(tweets) == 0: break temp = Tf.parse_tweet_list_from_tweets_html(tweets) results.extend(temp) results_aux.extend(temp) print("Fetched {0} results from Twitter API. {1} / {2}, has more items: {3}".format(len(temp), len(results), tweet_criteria[ "max_tweets"], tweet_response_json[ 'has_more_items'])) # if receive_buffer and len(results_aux) >= buffer_length: # receive_buffer(results_aux, query_metadata, tweet_criteria["output_file_name"]) # results_aux = [] # if len(results) >= tweet_criteria["max_tweets"] >= 0 or tweet_response_json['has_more_items'] is False: if len(results) >= tweet_criteria["max_tweets"]: active = False if receive_buffer and len(results_aux) > 0: receive_buffer(results_aux, query_metadata, tweet_criteria["output_file_name"]) return results
def parse_content(): exclude_classes = [ '.article-metaline', '.article-metaline-right', '.push'] exclude_text_spans = ['發信站: 批踢踢實業坊(ptt.cc)', '文章網址:'] for exclude_text in exclude_text_spans: ele = main.lxml.xpath( f'//span[contains(text(),"{exclude_text}")]')[0] ele.getparent().remove(ele) cleaned_html = etree.tostring(main.lxml) cleaned_pq = PyQuery(cleaned_html) for exclude_cls in exclude_classes: cleaned_pq.remove(exclude_cls) return cleaned_pq.text()
def filter_html(self, html, is_body: bool = True):
    """Filter the HTML: strip comments, scripts and styles, then return the body (or head) element."""
    try:
        text = re.sub(r"<!-[\s\S]*?-->", "", html)
        doc = PyQuery(text)
        doc.remove("script")
        doc.remove("style")
        if is_body:
            return list(doc("body"))[0]
        else:
            return list(doc("head"))[0]
    except Exception as e:
        self.logger.info(e)
        return None
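# --- Usage sketch (not from the original source) ---
# The same steps filter_html performs, run inline on a tiny invented document
# (the class context and its logger are omitted here).
import re
from pyquery import PyQuery

html = ("<html><head><title>t</title></head>"
        "<body><!-- tracking --><p>hello</p><script>track()</script></body></html>")
doc = PyQuery(re.sub(r"<!-[\s\S]*?-->", "", html))
doc.remove("script")
doc.remove("style")
body_element = list(doc("body"))[0]   # an lxml element, as filter_html returns
print(PyQuery(body_element).text())   # -> "hello"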
def getTweets(criteria): refreshCursor = '' results = [] resultsAux = [] cookieJar = cookielib.CookieJar() active = True while active: json = TweetManager.getJsonReponse(criteria, refreshCursor, cookieJar) if len(json['items_html'].strip()) == 0: break refreshCursor = json['min_position'] scrapedTweets = PyQuery(json['items_html']) #Remove incomplete tweets withheld by Twitter Guidelines scrapedTweets.remove('div.withheld-tweet') tweets = scrapedTweets('div.js-stream-tweet') if len(tweets) == 0: break for tweetHTML in tweets: tweetPQ = PyQuery(tweetHTML) accountId = int(tweetPQ.attr("data-user-id")) accountFullname = tweetPQ.attr("data-name") accountHref = "/" + tweetPQ.attr("data-screen-name") account = TwitterAccount(accountId, accountFullname, accountHref) epoch = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time")) date = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(epoch)) likes = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")) replies = int(tweetPQ("span.ProfileTweet-action--reply span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")) retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")) txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@')) tweet = Tweet(account, date, likes, replies, retweets, txt) results.append(tweet) if criteria['limit'] > 0 and len(results) >= criteria['limit']: active = False break return results
def sanitize_description(value): cleaned = PyQuery(value) cleaned = cleaned.remove('span.playMetaText') cleaned.remove('span.playMetaText') cleaned.remove('span.playCount') cleaned.remove('time') cleaned.remove('strong') desc = cleaned.html() if desc is None: return "" return desc.split('<span>')[-1:][0].replace('</span>', '').strip()
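# --- Usage sketch (not from the original source) ---
# Invented player-page markup showing what sanitize_description keeps.
value = ('<p><span class="playCount">1 234 plays</span>'
         '<time>2014-01-01</time><span>Actual programme description.</span></p>')
print(sanitize_description(value))
# -> "Actual programme description."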
def get_tweet_data(coin_name, max_position):
    cookieJar = http.cookiejar.CookieJar()
    active = True
    proxy = None
    receiveBuffer = None
    refresh_cursor = max_position
    while active:
        json = getJsonReponse(refresh_cursor, cookieJar, proxy)
        if len(json['items_html'].strip()) == 0:
            break
        # id cursor for the next page; each page holds 20 tweets
        refresh_cursor = json['min_position']
        print(refresh_cursor)
        scrapedTweets = PyQuery(json['items_html'])
        # Remove incomplete tweets withheld by Twitter Guidelines
        scrapedTweets.remove('div.withheld-tweet')
        tweets = scrapedTweets('div.js-stream-tweet')('div.content')(
            'div.stream-item-footer')('div.ProfileTweet-actionCountList')
        # print("tweets", tweets)
        if len(tweets) == 0:
            break
        for tweetHTML in tweets:
            print(tweetHTML)
            tweetPQ = PyQuery(tweetHTML)
            tweet_id = tweetPQ(
                "span.ProfileTweet-action--reply span.ProfileTweet-actionCount span.ProfileTweet-actionCountForAria"
            ).attr("id").split("-")[6]
            print("tweet_id: ", tweet_id)
            reply_num = tweetPQ("span.ProfileTweet-action--reply span.ProfileTweet-actionCount").attr(
                "data-tweet-stat-count")
            print("reply_num: ", reply_num)
            retweet_num = tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr(
                "data-tweet-stat-count")
            print("retweet_num: ", retweet_num)
            favorite_num = tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr(
                "data-tweet-stat-count")
            print("favorite_num: ", favorite_num)
def parseNextPageUrl(self, category_page_content): doc = PyQuery(category_page_content) nodeAList = doc("span#view_47 > a") for nodeA in nodeAList: nodeAQ = PyQuery(nodeA) if nodeAQ.remove('span').text().strip().lower() == 'next': return nodeAQ.attr('href').strip() return None
def extract(self): self.html = re.sub('<!--.*?-->', '', self.html) doc = PyQuery(self.html) content_node = doc('div.kb_zw') if not content_node: # content_node = doc('div.zw_text') content_node = PyQuery(self.hxs.select("//div[@class = 'zw_text']").extract()[0]) content_node.remove('script') content_node.remove('style') content_node.remove('iframe') content_node.remove('div[style = "float:left; width:303px; height:250px; display:inline; margin:10px 10px 10px 10px;"]') content_node.remove('input') item = ContentItem() item['title'] = self.title = doc('td[align = "center"]')('b').text() if item['title'] == None: item['title'] = self.title = doc('div.zw_bt').text() if item['title'] == None: item['title'] = self.title = doc('h1.zw_title').text() item['release_time'] = '' item['source'] = u"新浪" item['author'] = '' item['pic_url'] = '' imgs = content_node('img') image_urls = [] for img in imgs: if ".gif" in img.get('src'): continue if not img.get('src'): continue else: imgs.eq(imgs.index(img)).before('<br>') imgs.eq(imgs.index(img)).append('<br>') image_urls.append(self.getRealURI(img.get('src'))) item['image_urls'] = image_urls content = content_node.__unicode__() item['content'] = self.content = content return item
def sanitize_description(value): cleaned = PyQuery(value) cleaned = cleaned.remove('span.playMetaText') cleaned.remove('span.playMetaText') cleaned.remove('time') cleaned.remove('strong') return cleaned.html().split('<span>')[-1:][0].replace('</span>', '')
def sanitize_html2(value): soup = PyQuery(value) soup = soup.remove("span.playMetaText") soup.remove("span.playMetaText") soup.remove("time") soup.remove("strong") return soup.html().split("<span>")[-1:]
def amazon_general_descr(centerCol, pqhtml): # print centerCol.outerHtml() # print pqhtml.outerHtml() descr = centerCol('#featurebullets_feature_div').remove( 'script').text() or '' descr += (pqhtml('#productDescription').remove('script').text() or '') if not descr: for ele in pqhtml('script[type="text/javascript"]').items(): if 'ProductDescriptionIframeResize' in ele.text(): descr = re.search(r'var iframeContent = "(.*)";\n', ele.text()).groups()[0] descr = PyQuery(urllib.unquote(descr)) descr.remove('script') descr = descr('#productDescription').text() break else: raise ValueError, 'Get Descr Fail' return descr
def feed(request, get_feed=get_feed): with shows_db() as shows: show_list = shows.values() d = PyQuery(get_feed(), parser="xml") for item in d("item"): ditem = PyQuery(item) title = ditem.find("title").text() match = detect_show(show_list, title) if match: name, episode = match # TODO: Record episode in the feed so that future versions of this episod will be ignored else: ditem.remove() response = Response() response.content_type = "application/rss+xml" response.ubody = unicode(d) response.cache_control = "no-cache" return response
def test_KernelResult_repr_html(): method = "foo" alternatives = ["a", "b", "c"] rank = [True, False, True] extra = {"alfa": 1} result = PyQuery( KernelResult(method=method, alternatives=alternatives, values=rank, extra=extra)._repr_html_()) expected = PyQuery(""" <div class='rankresult'> <table id="T_cc7f5_" > <thead> <tr> <th class="blank level0" ></th> <th class="col_heading level0 col0" >a</th> <th class="col_heading level0 col1" >b</th> <th class="col_heading level0 col2" >c</th> </tr> </thead> <tbody> <tr> <th id="T_cc7f5_level0_row0" class="row_heading level0 row0" > Kernel </th> <td id="T_cc7f5_row0_col0" class="data row0 col0" >True</td> <td id="T_cc7f5_row0_col1" class="data row0 col1" >False</td> <td id="T_cc7f5_row0_col2" class="data row0 col2" >True</td> </tr> </tbody> </table> <em class='rankresult-method'>Method: foo</em> </div> """) assert result.remove("style").text() == expected.remove("style").text()
async def handle_summary(summary: str, rss: rss_class.Rss) -> str:
    # Normalize the summary's HTML tags so they are easier to process
    try:
        summary_html = Pq(summary)
    except Exception as e:
        logger.info(f"{rss.name} has no body content! E: {e}")
        return ""
    # Initialize the final message
    res_msg = ""
    # Decide whether to keep quoted content; if kept, strip only the blockquote tags and keep what is inside
    if config.blockquote:
        blockquote_html = summary_html("blockquote")
        for blockquote in blockquote_html.items():
            blockquote.replace_with(blockquote.html())
    else:
        summary_html.remove("blockquote")
    # Check whether "only push items that contain images" is enabled
    if not rss.only_pic:
        # Process tags and translation
        summary_text = await handle_html_tag(html=summary_html)
        # Remove configured content
        if rss.content_to_remove:
            for pattern in rss.content_to_remove:
                summary_text = re.sub(pattern, "", summary_text)
        res_msg += summary_text
        # Translate the processed body
        if rss.translation:
            res_msg += await handle_translation(content=summary_text)
    # Process images
    res_msg += await handle_img(html=summary_html,
                                img_proxy=rss.img_proxy,
                                img_num=rss.max_image_number)
    return res_msg + "\n"
def extract(self): self.html = re.sub('<!--.*?-->', '', self.html) doc = PyQuery(self.html) content_node = doc('.firstTopic')('div') content_node.remove('script') content_node.remove('.rate') content_node.remove('.affixContent') content_node.remove('.thread_gold') item = ContentItem() imgs = content_node('.p14')('img') img_all = [] for img in imgs: if".gif" in img.get('src'): continue else: imgs.eq(imgs.index(img)).append('<br>') imgs.eq(imgs.index(img)).before('<br>') img_all.append(self.getRealURI(img.get('src'))) item['image_urls'] = img_all item['title'] = self.title = doc('#thread_title').text() content = content_node('.p14').__unicode__() content = PyQuery(content) del_style = content('div') for d in del_style: if d.get('style'): del_style.eq(del_style.index(d)).attr['style'] = '' content.remove('dl.rate_list') content.remove('span[style = "font-size:12px"]') content.remove('dl.rate') item['content'] = self.content = content.__unicode__() release_time=doc('.firstTopic')('.postTime').text() ob=re.compile(u'20\d\d.*\d\d') release_time=ob.findall(release_time) item['release_time'] = release_time[0] # item['release_switch_time'] = self.release_switch_time = time.mktime(time.strptime(release_time[0],u'%Y-%m-%d %H:%M:%S')) item['source'] = u"17173论坛" item['author'] = doc('.th1').eq(0).text() item['pic_url'] = '' return item
def get_beer_detail(self, url): """ Follow the link to beer page to get detailed review information. """ r = requests.get(BASE_URL + url) pq = PyQuery(r.text) pq = pq('#rating_fullview_content_2:first') # user ratings section self.rating = self.clean_xml(pq('.BAscore_norm:first').text()) # comment is the look/smell/taste/feel/overall appended to any # other comments. we remove the other sections so text() # return only the comments self.comment = pq('.muted:first').text() pq.remove('br') pq.remove('.muted') pq.remove('.BAscore_norm') self.comment += "\n" + pq.text()
def get_tweets(tweet_criteria, receive_buffer=None, location_search=False, buffer_length=100, proxy=None): refresh_cursor = '' results = [] results_aux = [] cookiejar = cookielib.CookieJar() if hasattr(tweet_criteria, 'username') and ( tweet_criteria.username.startswith("\'") or tweet_criteria.username.startswith("\"")) and ( tweet_criteria.username.endswith("\'") or tweet_criteria.username.endswith("\"")): tweet_criteria.username = tweet_criteria.username[1:-1] active = True while active: try: json = TweetManager.get_json_response(tweet_criteria, refresh_cursor, cookiejar, proxy) if len(json['items_html'].strip()) == 0: break refresh_cursor = json['min_position'] scraped_tweets = PyQuery(json['items_html']) # Remove incomplete tweets withheld by Twitter Guidelines scraped_tweets.remove('div.withheld-tweet') tweets = scraped_tweets('div.js-stream-tweet') if len(tweets) == 0: break for tweet_html in tweets: tweetPQ = PyQuery(tweet_html) tweet = model.Tweet() username_tweet = tweetPQ( "span:first.username.u-dir b").text() txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text()) txt = txt.replace('# ', '#') txt = txt.replace('@ ', '@') print( colored("@" + username_tweet + ": ", "red") + colored(txt, "green") + "\n") retweets = int( tweetPQ( "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) favorites = int( tweetPQ( "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) dateSec = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) id = tweetPQ.attr("data-tweet-id") permalink = tweetPQ.attr("data-permalink-path") user_id = int( tweetPQ("a.js-user-profile-link").attr("data-user-id")) if location_search == True: page = requests.get( 'https://twitter.com/tubiity/status/' + id) script_geo = html.fromstring(page.content) location = script_geo.xpath( '//a[@class="u-textUserColor js-nav js-geo-pivot-link"]/text()' ) sp_location = ','.join(location) tweet.geo = sp_location else: geo = '' tweet.geo = geo # user-information ''' If this code block is uncommented, application will be slower due to response time''' '''result = requests.get("https://twitter.com/" + username_tweet) c = result.content soup = BeautifulSoup(c, "html.parser") liste = [] samples = soup.find_all("a", "ProfileNav-stat ProfileNav-stat--link u-borderUserColor u-textCenter js-tooltip js-openSignupDialog js-nonNavigable u-textUserColor") # Follower, Follow and number of likes in list for a in samples: liste.append(a.attrs['title']) ''' tweet.id = id tweet.permalink = 'https://twitter.com' + permalink tweet.username = username_tweet tweet.text = txt tweet.date = datetime.datetime.fromtimestamp(dateSec) tweet.retweets = retweets tweet.favorites = favorites tweet.mentions = " ".join( re.compile('(@\\w*)').findall(tweet.text)) tweet.hashtags = " ".join( re.compile('(#\\w*)').findall(tweet.text)) tweet.user_id = user_id results.append(tweet) results_aux.append(tweet) if receive_buffer and len(results_aux) >= buffer_length: receive_buffer(results_aux) results_aux = [] if tweet_criteria.maxTweets > 0 and len( results) >= tweet_criteria.maxTweets: active = False break except: receive_buffer(results_aux) return if receive_buffer and len(results_aux) > 0: receive_buffer(results_aux) return results
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None): refreshCursor = '' results = [] resultsAux = [] cookieJar = http.cookiejar.CookieJar() if hasattr(tweetCriteria, 'username') and ( tweetCriteria.username.startswith("\'") or tweetCriteria.username.startswith("\"")) and ( tweetCriteria.username.endswith("\'") or tweetCriteria.username.endswith("\"")): tweetCriteria.username = tweetCriteria.username[1:-1] active = True totalNumTweets = 0 while active: json, fullurl = TweetManager.getJsonReponse( tweetCriteria, refreshCursor, cookieJar, proxy) if len(json['items_html'].strip()) == 0: # print("break") break refreshCursor = json['min_position'] # print ("refreshCursor is {}".format(refreshCursor)) scrapedTweets = PyQuery(json['items_html']) #Remove incomplete tweets withheld by Twitter Guidelines scrapedTweets.remove('div.withheld-tweet') tweets = scrapedTweets('div.js-stream-tweet') totalNumTweets += len(tweets) if len(tweets) == 0: break for tweetHTML in tweets: tweetPQ = PyQuery(tweetHTML) tweet = models.Tweet() try: usernameTweet = tweetPQ( "span:first.username.u-dir b").text() except Exception as e: usernameTweet = "" print("can not get username") try: # get text in different tag seperated by \n tweet_text = tweetPQ("p.js-tweet-text").text( squash_space=False) tweet_text_list = tweet_text.split("\n") # replace the "" with " ", for the \n\n situation for i, v in enumerate(tweet_text_list): if v == "": tweet_text_list[i] = " " txt = "".join(tweet_text_list) # print(" ".join(re.compile('(#\\w*)').findall(txt))) except Exception as e: txt = "" print("can not get txt") traceback.print_exc() try: reply = int( tweetPQ( "span.ProfileTweet-action--reply span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) except Exception as e: reply = 0 print("can not get retweets.") traceback.print_exc() try: retweets = int( tweetPQ( "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) except Exception as e: retweets = 0 print("can not get retweets") traceback.print_exc() try: favorites = int( tweetPQ( "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) except Exception as e: favorites = 0 print("can not get retweets.") traceback.print_exc() try: dateSec = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) except Exception as e: dateSec = 0 print("can not get dateSec") traceback.print_exc() try: idx = tweetPQ.attr("data-tweet-id") except Exception as e: idx = "" print("can not get id") traceback.print_exc() try: permalink = tweetPQ.attr("data-permalink-path") except Exception as e: permalink = "" print("can not get permalink") traceback.print_exc() try: url = tweetPQ('a.twitter-timeline-link').attr( 'data-expanded-url') except Exception as e: url = "" print("can not get url") traceback.print_exc() tweet.url = url # hashtag try: hashtags = tweetPQ( 'a.twitter-hashtag.pretty-link.js-nav').text().replace( "# ", "#") except Exception as e: hashtags = "" traceback.print_exc() tweet.hashtags = hashtags.replace('\n', '') geo = '' try: geoSpan = tweetPQ('span.Tweet-geo') if len(geoSpan) > 0: geo = geoSpan.attr('title') except Exception as e: geo = '' tweet.id = idx tweet.permalink = 'https://twitter.com' + permalink tweet.username = usernameTweet tweet.text = txt tweet.date = datetime.datetime.fromtimestamp(dateSec) tweet.reply = reply tweet.retweets = retweets tweet.favorites = favorites tweet.mentions = " ".join( 
re.compile('(@\\w*)').findall(tweet.text)) # tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text)) tweet.geo = geo results.append(tweet) resultsAux.append(tweet) if receiveBuffer and len(resultsAux) >= bufferLength: receiveBuffer(resultsAux) resultsAux = [] if tweetCriteria.maxTweets > 0 and len( results) >= tweetCriteria.maxTweets: active = False break print("url: {}".format(fullurl)) if receiveBuffer and len(resultsAux) > 0: receiveBuffer(resultsAux) return results, totalNumTweets
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None): refreshCursor = '' results = [] resultsAux = [] cookieJar = http.cookiejar.CookieJar() active = True while active: json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy) if len(json['items_html'].strip()) == 0: break refreshCursor = json['min_position'] scrapedTweets = PyQuery(json['items_html']) #Remove incomplete tweets withheld by Twitter Guidelines scrapedTweets.remove('div.withheld-tweet') tweets = scrapedTweets('div.js-stream-tweet') if len(tweets) == 0: break for tweetHTML in tweets: tweetPQ = PyQuery(tweetHTML) tweet = models.Tweet() usernameTweet = tweetPQ("span.username.js-action-profile-name b").text() txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@')) retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")) favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")) dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time")) id = tweetPQ.attr("data-tweet-id") permalink = tweetPQ.attr("data-permalink-path") user_id = int(tweetPQ("a.js-user-profile-link").attr("data-user-id")) geo = '' geoSpan = tweetPQ('span.Tweet-geo') if len(geoSpan) > 0: geo = geoSpan.attr('title') urls = [] for link in tweetPQ("a"): try: urls.append((link.attrib["data-expanded-url"])) except KeyError: pass tweet.id = id tweet.permalink = 'https://twitter.com' + permalink tweet.username = usernameTweet tweet.text = txt tweet.date = datetime.datetime.fromtimestamp(dateSec) tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec).strftime("%a %b %d %X +0000 %Y") tweet.retweets = retweets tweet.favorites = favorites tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text)) tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text)) tweet.geo = geo tweet.urls = ",".join(urls) tweet.author_id = user_id results.append(tweet) resultsAux.append(tweet) if receiveBuffer and len(resultsAux) >= bufferLength: receiveBuffer(resultsAux) resultsAux = [] if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets: active = False break if receiveBuffer and len(resultsAux) > 0: receiveBuffer(resultsAux) return results
def extract_content(self, html): html = re.sub(r'xmlns="[^"]+"', "", html) doc = PyQuery(html) content_node = doc.find(self.content_css_selector) self.should_remove_css_selector and doc.remove(self.should_remove_css_selector) return content_node.outer_html()
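# --- Usage sketch (not from the original source) ---
# extract_content above only reads two attributes from self, so a SimpleNamespace
# can stand in for the instance. Selectors and markup are invented. Note that
# removing nodes from `doc` also affects `content_node`, because both selections
# share the same underlying lxml tree.
from types import SimpleNamespace

cfg = SimpleNamespace(content_css_selector="div.article",
                      should_remove_css_selector="div.ads")
html = ('<div id="page"><div class="article">'
        '<p>keep me</p><div class="ads">buy now</div></div></div>')
print(extract_content(cfg, html))
# -> roughly '<div class="article"><p>keep me</p></div>'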
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None): refreshCursor = '' results = [] resultsAux = [] cookieJar = cookielib.CookieJar() if hasattr(tweetCriteria, 'username') and ( tweetCriteria.username.startswith("\'") or tweetCriteria.username.startswith("\"")) and ( tweetCriteria.username.endswith("\'") or tweetCriteria.username.endswith("\"")): tweetCriteria.username = tweetCriteria.username[1:-1] active = True while active: json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy) if len(json['items_html'].strip()) == 0: break refreshCursor = json['min_position'] scrapedTweets = PyQuery(json['items_html']) #Remove incomplete tweets withheld by Twitter Guidelines scrapedTweets.remove('div.withheld-tweet') tweets = scrapedTweets('div.js-stream-tweet') if len(tweets) == 0: break for tweetHTML in tweets: tweetPQ = PyQuery(tweetHTML) tweet = models.Tweet() usernameTweet = tweetPQ("span:first.username.u-dir b").text() txt = re.sub( r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace( '@ ', '@')) retweets = int( tweetPQ( "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) favorites = int( tweetPQ( "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) dateSec = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) id = tweetPQ.attr("data-tweet-id") permalink = tweetPQ.attr("data-permalink-path") geo = '' geoSpan = tweetPQ('span.Tweet-geo') if len(geoSpan) > 0: geo = geoSpan.attr('title') tweet.id = id tweet.permalink = 'https://twitter.com' + permalink tweet.username = usernameTweet tweet.text = txt tweet.date = datetime.datetime.fromtimestamp(dateSec) tweet.retweets = retweets tweet.favorites = favorites tweet.mentions = " ".join( re.compile('(@\\w*)').findall(tweet.text)) tweet.hashtags = " ".join( re.compile('(#\\w*)').findall(tweet.text)) tweet.geo = geo results.append(tweet) resultsAux.append(tweet) if receiveBuffer and len(resultsAux) >= bufferLength: receiveBuffer(resultsAux) resultsAux = [] if tweetCriteria.maxTweets > 0 and len( results) >= tweetCriteria.maxTweets: active = False break if receiveBuffer and len(resultsAux) > 0: receiveBuffer(resultsAux) return results
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None, debug=False): """Get tweets that match the tweetCriteria parameter A static method. Parameters ---------- tweetCriteria : tweetCriteria, an object that specifies a match criteria receiveBuffer : callable, a function that will be called upon a getting next `bufferLength' tweets bufferLength: int, the number of tweets to pass to `receiveBuffer' function proxy: str, a proxy server to use debug: bool, output debug information """ results = [] resultsAux = [] cookieJar = http.cookiejar.CookieJar() user_agent = random.choice(TweetManager.user_agents) all_usernames = [] usernames_per_batch = 20 if hasattr(tweetCriteria, 'username'): if type(tweetCriteria.username) == str or not hasattr( tweetCriteria.username, '__iter__'): tweetCriteria.username = [tweetCriteria.username] usernames_ = [u.lstrip('@') for u in tweetCriteria.username if u] all_usernames = sorted({u.lower() for u in usernames_ if u}) n_usernames = len(all_usernames) n_batches = n_usernames // usernames_per_batch + ( n_usernames % usernames_per_batch > 0) else: n_batches = 1 for batch in range(n_batches): # process all_usernames by batches refreshCursor = '' batch_cnt_results = 0 sleep(1) if all_usernames: # a username in the criteria? tweetCriteria.username = all_usernames[ batch * usernames_per_batch:batch * usernames_per_batch + usernames_per_batch] active = True while active: json = TweetManager.getJsonResponse(tweetCriteria, refreshCursor, cookieJar, proxy, user_agent, debug=debug) if len(json['items_html'].strip()) == 0: break refreshCursor = json['min_position'] scrapedTweets = PyQuery(json['items_html']) #Remove incomplete tweets withheld by Twitter Guidelines scrapedTweets.remove('div.withheld-tweet') tweets = scrapedTweets('div.js-stream-tweet') if len(tweets) == 0: break for tweetHTML in tweets: tweetPQ = PyQuery(tweetHTML) tweet = models.Tweet() usernames = tweetPQ("span.username.u-dir b").text().split() if not len(usernames): # fix for issue #13 continue tweet.username = usernames[0] tweet.to = usernames[1] if len( usernames ) >= 2 else None # take the first recipient if many rawtext = TweetManager.textify( tweetPQ("p.js-tweet-text").html(), tweetCriteria.emoji) tweet.text = re.sub(r"\s+", " ", rawtext)\ .replace('# ', '#').replace('@ ', '@').replace('$ ', '$') tweet.retweets = int( tweetPQ( "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) tweet.favorites = int( tweetPQ( "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) tweet.replies = int( tweetPQ( "span.ProfileTweet-action--reply span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) tweet.id = tweetPQ.attr("data-tweet-id") tweet.permalink = 'https://twitter.com' + tweetPQ.attr( "data-permalink-path") tweet.author_id = int( tweetPQ("a.js-user-profile-link").attr("data-user-id")) dateSec = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) tweet.date = datetime.datetime.fromtimestamp( dateSec, tz=datetime.timezone.utc) tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec, tz=datetime.timezone.utc)\ .strftime("%a %b %d %X +0000 %Y") tweet.hashtags, tweet.mentions = TweetManager.getHashtagsAndMentions( tweetPQ) geoSpan = tweetPQ('span.Tweet-geo') if len(geoSpan) > 0: tweet.geo = geoSpan.attr('title') else: tweet.geo = '' urls = [] for link in tweetPQ("a"): try: urls.append((link.attrib["data-expanded-url"])) 
except KeyError: pass tweet.urls = ",".join(urls) results.append(tweet) resultsAux.append(tweet) if receiveBuffer and len(resultsAux) >= bufferLength: receiveBuffer(resultsAux) resultsAux = [] batch_cnt_results += 1 if tweetCriteria.maxTweets > 0 and batch_cnt_results >= tweetCriteria.maxTweets: active = False break if receiveBuffer and len(resultsAux) > 0: receiveBuffer(resultsAux) resultsAux = [] return results
class MagpieAgentThread( threading.Thread ): def __init__( self, thread_id ): threading.Thread.__init__( self ) self.thread_id = thread_id self.mongo = MongoClient( ).katipo.pages def next_job( self ): data = stored_procedure_as_dict( "next_job" ) if len( data ) > 0: self.current_job = data return True else: self.current_job = None return False def process( self ): self.discovered_urls = set() self.basic_content_type = "text/html" self.message_stack = [ "-" * 80 ] # resolve the address uri = urlparse( self.current_job[ 'url' ] ) answers = dns.resolver.query( uri.hostname, 'A' ) for answer in answers: self.message_stack.append( "DNS) %s" % answer ) try: self.current_response = requests.get( self.current_job[ 'url' ], stream=True ) self.basic_content_type = self.current_response.headers[ 'content-type' ].split( ";" )[ 0 ] except: self.current_response = None self.basic_content_type = None if self.current_response: for r in self.current_response.history: self.message_stack.append( "-URL (%s) %s" % ( r.status_code, r.url ) ) self.message_stack.append( "+URL (%s) %s" % ( self.current_response.status_code, self.current_response.url ) ) self.message_stack.append( "BASIC CONTENT-TYPE) %s" % self.basic_content_type ) self.message_stack.append( "CONTENT TYPE) %s" % self.current_response.headers['content-type'] ) self.message_stack.append( "ENCODING) %s" % self.current_response.encoding ) if self.basic_content_type in ACCEPTABLE_CONTENT_TYPES: # we need to handle the odd, but real case of the mystery <? palantir_blog_list('sidebar') ?> tag # tidy_response_text = re.sub( "<\?.*?\?>", "", self.current_response.text ) tidy_response_text = re.sub( "<\?.*?\?>", "", self.current_response.text ) tidy_response_text = re.sub( "<!--.*?-->", "", tidy_response_text ) self.dom = PyQuery( tidy_response_text, parser='html' ) self.titles = [ safe_str( title.text ) for title in self.dom("title") ] for a in self.dom('a'): a = PyQuery(a) new_url = PyQuery(a).attr.href if new_url != None: new_url = urldefrag( urljoin( self.current_response.url, new_url ) )[0] self.discovered_urls.add( new_url ) self.message_stack.append( "DISCOVERED) %s" % len( self.discovered_urls ) ) # BOILERPIPE for excluded_tag in BOILERPIPE_REMOVE_TAGS: self.dom( excluded_tag ).after( "\n" ) self.dom.remove( excluded_tag ) # remove tags with style="display:none" # http://www.microsoft.com/en-us/legal/intellectualproperty/copyright/default.aspx display_none_pattern = re.compile( "display: ?none" ) for x in self.dom("*"): try: tag = PyQuery(x) if not tag.attr("style") == None: if re.match( display_none_pattern, tag.attr("style") ): tag.remove() except Exception as inst: print type(inst) print inst.args print inst self.save() else: self.message_stack.append( "DISCARDED" ) else: self.message_stack.append( "NO RESPONSE" ) def save( self ): try: # if domain( self.current_response.url ) == domain( self.current_job['url'] ): scraper_name = SCRAPER_NAME if 'scraper_name' in self.current_job: scraper_name = self.current_job[ 'scraper_name' ].lower() scraper_version = SCRAPER_VERSION if 'scraper_version' in self.current_job: scraper_version = self.current_job[ 'scraper_version' ] qid_base = "%s:%s:%s" % ( self.current_job[ 'starting_point' ], self.current_job[ 'url' ], scraper_name ) qid_hash = hashlib.md5() qid_hash.update( qid_base ) qid = qid_hash.hexdigest() headers = self.current_response.headers scraped_at = datetime.utcnow().strftime( r'%Y-%m-%dT%H:%M:%SZ' ) last_modified = scraped_at if 'date' in headers: last_modified = _parse_http_datetime( 
headers['date'] ).strftime( r'%Y-%m-%dT%H:%M:%SZ' ) quid_orgid = None if 'org_id' in self.current_job: quid_orgid = self.current_job[ 'org_id' ] alternative_urls = [] if self.current_response.history != None: for x in self.current_response.history: alternative_urls.append( x.url ) content_type = "text/html" if "content-type" in headers: content_type = headers['content-type'] content_type = content_type.split( ";" )[ 0 ] url_selection_rule = "(page_count < %s) && (depth <= %s)" % ( self.current_job[ 'page_limit' ], self.current_job[ 'depth_limit' ] ) if len( alternative_urls ) > 0: print qid # sample tokenize tokens = [] for token in re.compile( "\W", re.UNICODE).split( safe_unicode( self.dom.text().lstrip().rstrip() ) ): if token.lstrip().rstrip() != '': tokens.append( token ) # find any docs which already exist with this url matches = [] for d in self.mongo.find( { "meta.data.source_urls" : self.current_response.url } ): matches.append( d["_id"] ) print( "matches for %s\n%s" % ( self.current_response.url, matches ) ) self.mongo.insert( { u"meta" : { u"data" : { u"qid" : qid, u"qid_base" : qid_base, u"content_encoding" : u"UTF-8", u"content_type" : content_type, u"source_url" : alternative_urls + [ self.current_response.url ], u"doc_type_name" : u"unstructured/web/WEBPAGE", u"doc_type_version" : u"1.0.0", u"scraper_name" : scraper_name, u"scraper_version" : scraper_version, u"scraped_at" : scraped_at, u"date_publication" : last_modified, u"quid_orgid" : quid_orgid, u"katipo" : { u"starting_url" : self.current_job[ 'starting_point' ], u"last_modified" : last_modified, u"domain" : self.current_job[ 'domain' ] } }, u"v" : 2, u"id" : qid }, u"raw" : { u"data" : self.current_response.text, u"v" : 2, u"id" : qid }, u"structured" : { u"data" : { u"http_headers" : headers, u"page_depth" : self.current_job[ 'depth' ], u"job_count" : self.current_job[ 'job_count' ], u"url_selection_rule" : url_selection_rule, u"meta_tags" : u'', u"body_text" : self.dom.text().lstrip().rstrip(), u"tokens" : tokens }, u"v" : 2, u"id" : qid } } ) except Exception as inst: self.message_stack.append( inst ) # self.message_stack.append( self.dom.text() ) print string.join( self.message_stack, "\n" ) def acknowledge( self ): stored_procedure( "acknowledge_job", self.current_job['domain'], self.current_job['url'] ) for url in self.discovered_urls: new_url_scheme = urlsplit( url )[0] if domain(url) == domain( self.current_job['url'] ) and new_url_scheme in ACCEPTABLE_SCHEMES: self.message_stack.append( "ACCEPT) %s" % url ) self.queue( url ) else: self.message_stack.append( "REJECT) %s" % url ) self.message_stack.append( "ACK'd) %s" % self.current_job['url'] ) def fail( self ): self.message_stack.append( "FAIL" ) # self.acknowledge( ) pass def queue( self, url ): j, d = self.current_job, ( int( self.current_job['depth'] ) + 1 ) stored_procedure( "queue_job", j['domain'], j['page_limit'], j['depth_limit'], url, d, j['url'], j['starting_point'], j['batch'], j['org_id'] ) def run( self ): for x in range( 3600 ): while self.next_job(): try: self.process() self.acknowledge( ) except Exception as inst: print inst self.fail() # print string.join( self.message_stack, "\n" ) sleep( 1 )
def getTweets(tweet_criteria, receive_buffer=None, buffer_length=100, proxy=None):
    refresh_cursor = ''
    results = []
    results_aux = []
    cookie_jar = http.cookiejar.CookieJar()
    active = True
    while active:
        json = TweetManager.getJsonResponse(tweet_criteria, refresh_cursor, cookie_jar, proxy)
        if len(json['items_html'].strip()) == 0:
            break
        refresh_cursor = json['min_position']
        scraped_tweets = PyQuery(json['items_html'])
        scraped_tweets.remove('div.withheld-tweet')
        tweets = scraped_tweets('div.js-stream-tweet')
        if len(tweets) == 0:
            break
        for tweet_HTML in tweets:
            tweet_PQ = PyQuery(tweet_HTML)
            tweet = TweetModel()
            username = tweet_PQ.attr("data-screen-name")
            text = re.sub(r"\s+", " ",
                          tweet_PQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'))
            retweets = int(
                tweet_PQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount")
                .attr("data-tweet-stat-count").replace(",", ""))
            likes = int(
                tweet_PQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount")
                .attr("data-tweet-stat-count").replace(",", ""))
            date = int(tweet_PQ("small.time span.js-short-timestamp").attr("data-time"))
            id = tweet_PQ.attr("data-tweet-id")
            permalink = tweet_PQ.attr("data-permalink-path")
            user_id = int(tweet_PQ("a.js-user-profile-link").attr("data-user-id"))
            media = tweet_PQ("div.AdaptiveMedia-photoContainer").attr("data-image-url")
            geo = ''
            geo_span = tweet_PQ('span.Tweet-geo')
            if len(geo_span) > 0:
                geo = geo_span.attr('title')
            urls = []
            for link in tweet_PQ("a"):
                try:
                    urls.append(link.attrib["data-expanded-url"])
                except KeyError:
                    pass
            tweet.id = id
            tweet.source = 'https://twitter.com' + permalink
            tweet.username = username
            tweet.text = text
            tweet.created_at = date
            tweet.retweet_count = retweets
            tweet.favorite_count = likes
            tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
            tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
            tweet.geo = geo
            tweet.urls = ",".join(urls)
            tweet.author_id = user_id
            tweet.media = media
            results.append(tweet)
            results_aux.append(tweet)
            if receive_buffer and len(results_aux) >= buffer_length:
                receive_buffer(results_aux)
                results_aux = []
            if tweet_criteria.max_tweets > 0 and len(results) > tweet_criteria.max_tweets:
                active = False
                break
    if receive_buffer and len(results_aux) > 0:
        receive_buffer(results_aux)
    return results
def faltantes(): bajados = {int(l.split('.')[0]) - 1 for l in glob.glob('*.md')} links = get_all_links() faltan = set(range(len(links))) - bajados return [links[i] if i in faltan else None for i in range(len(links))] for did, url in enumerate(faltantes()): if not url: continue try: d = PyQuery(url=url, headers=headers) # cleanups d.remove('ul.actions, #fb-root, script, div[style="clear:both"]') for cf in d('.clearfix'): if d(cf).text() == "": d(cf).remove() fecha = d('dd.published').text() d('.article-info').before(u'<p>[{}]</p>'.format(fecha)) d.remove('.article-info') # no link in the title titulo = d('.item-page h2 a').text().decode('utf8') d('.item-page h2').text(titulo) # clean html content discurso = d('.item-page').html() import ipdb;ipdb.set_trace()
def strip_tags(text): html = PyQuery(text) return html.remove('code').remove('a').text()
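# --- Usage sketch (not from the original source) ---
# strip_tags drops <a> and <code> subtrees before extracting the plain text.
print(strip_tags("<p>See <a href='#'>this link</a> and <code>foo()</code> for details.</p>"))
# -> "See and for details." (internal whitespace may vary slightly)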
def extract(self): self.html = re.sub('<!--.*?-->', '', self.html) doc = PyQuery(self.html) doc.remove('div#tipswindow') content_node = doc('div#Cnt-Main-Article-QQ') if not content_node: content_node = doc('div#ArticleCnt') if not content_node: content_node = doc('div#textContent') if not content_node: content_node = doc('#content') if not content_node: content_node = doc('div[id = "qnews-content"]') content_node.remove('script') content_node.remove('style') content_node.remove('iframe') content_node.remove('div.adpip_Aritcle_QQ') content_node.remove('table#picInPic') content_node.remove('div.dayuw_ad') content_node.remove('div.tJieHot_') content_node.remove('div.b_new_mod') content_node.remove('div#awh_sports') content_node.remove('div[id = "photo-warp"]') content_node.remove('div#MorePic') content_node.remove('div#cmenu') content_node.remove('div#flashCff') content_node.remove('div#contTxt') content_node.remove('div#PGViframe') content_node.remove('div#Reading') content_node.remove('span[style = "BACKGROUND-COLOR: navy; COLOR: white"]') content_node.remove('img[width="592"][height="100"]') content = content_node.__unicode__() item = ContentItem() item['title'] = self.title = doc('h1').text() if not item['title']: item['title'] = self.title = doc('div#ArticleTit').text() if not item['title']: item['title'] = self.title = doc('h2').text() item['content'] = self.content = content item['release_time'] = self.release_time = doc('span.pubTime').text() p = re.compile(u"(20\d\d.*\d\d:\d\d)") if not self.release_time: self.release_time = doc('div[class = "info"]').text() if self.release_time == None: self.release_time = doc('div[id = "ArtFrom"]').text() if self.release_time == None: self.release_time = doc('div[class = "pubtime"]').text() if self.release_time == None: self.release_time = doc('span[id= "Freleasetime"]').text() if self.release_time == None: self.release_time = doc('td.xborderb1').eq(1).text() p = re.compile(u"(20.*-\d\d)") item['release_time'] = self.release_time = p.search(self.release_time).group() #item['release_switch_time'] = time.mktime(time.strptime(self.release_time,time_s)) item['source'] = u"腾讯" item['author'] = '' item['pic_url'] = '' imgs = content_node('img') image_urls = [] for img in imgs: if ".gif" in img.get('src'): continue if not img.get('src'): continue else: imgs.eq(imgs.index(img)).before('<br>') image_urls.append(self.getRealURI(img.get('src'))) item['image_urls'] = image_urls return item
def detail(self, url): try: resp = self.session.get(url, verify=False) status_code = resp.status_code pqhtml = PyQuery(resp.text or 'nothing') if status_code == 404: log_info = json.dumps( dict(time=time.time(), title=pqhtml('title').text(), url=url)) self.logger.info(log_info) data = tool.get_off_shelf(code=status_code, message=self.cfg.GET_ERR.get( 'SCERR', 'ERROR'), backUrl=resp.url, html=pqhtml.outerHtml()) return tool.return_data(successful=False, data=data) if status_code != 200: log_info = json.dumps( dict(time=time.time(), title=pqhtml('title').text(), url=url)) self.logger.info(log_info) data = tool.get_error(code=status_code, message=self.cfg.GET_ERR.get( 'SCERR', 'ERROR'), backUrl=resp.url, html=pqhtml.outerHtml()) return tool.return_data(successful=False, data=data) JscriptTxt = pqhtml('script').text() pqhtml.remove('script').remove('style') area = pqhtml('div#product-summary') # print area.outerHtml().encode('utf-8') buttonTxt = area('#product-form .add-button').text() if u'售罄' in buttonTxt.lower() or u'sold out' in buttonTxt.lower(): log_info = json.dumps( dict(time=time.time(), title=pqhtml('title').text(), url=url)) self.logger.info(log_info) data = tool.get_off_shelf(code=status_code, message=self.cfg.GET_ERR.get( 'SCERR', 'ERROR'), backUrl=resp.url, html=pqhtml.outerHtml()) return tool.return_data(successful=False, data=data) detail = dict() #所有图片 imgs = self.get_imgs(pqhtml) detail['imgs'] = imgs detail['img'] = imgs[0] #名称 detail['name'] = area('h1.brand').text() + ' ' + area( '.name').text() #货币 currency = area('span.regular-price').text().split()[0] detail['currency'] = currency detail['currencySymbol'] = tool.get_unit(currency) #价格 price, listPrice = self.get_all_price(area) detail['price'] = price detail['listPrice'] = listPrice color, sizes = self.get_sizes(area) #颜色 detail['color'] = color #sizes detail['sizes'] = sizes #下架: if isinstance(detail['sizes'], basestring) and detail['sizes'] == 'sold out': log_info = json.dumps( dict(time=time.time(), title=pqhtml('title').text(), url=url)) self.logger.info(log_info) data = tool.get_off_shelf(code=status_code, message=self.cfg.SOLD_OUT, backUrl=resp.url, html=pqhtml.outerHtml()) return tool.return_data(successful=False, data=data) #描述 detail['descr'] = area('div#description').text() or pqhtml( '#product-details .product-details-section').text() #品牌 detail['brand'] = area('h1.brand').text() #产品ID prodId = area.attr('data-id') detail['productId'] = prodId detail['colorId'] = prodId #HTTP状态码 detail['status_code'] = status_code #状态 detail['status'] = self.cfg.STATUS_SALE #返回链接 detail['backUrl'] = resp.url #返回的IP和端口 if resp.raw._original_response.peer: detail['ip_port'] = ':'.join( map(lambda x: str(x), resp.raw._original_response.peer)) log_info = json.dumps( dict(time=time.time(), productId=detail['productId'], name=detail['name'], currency=detail['currency'], price=detail['price'], listPrice=detail['listPrice'], url=url)) self.logger.info(log_info) return tool.return_data(successful=True, data=detail) except Exception, e: raise
def process(self):
    self.discovered_urls = set()
    self.basic_content_type = "text/html"
    self.message_stack = ["-" * 80]

    # Resolve the address and log every A record.
    uri = urlparse(self.current_job['url'])
    answers = dns.resolver.query(uri.hostname, 'A')
    for answer in answers:
        self.message_stack.append("DNS) %s" % answer)

    try:
        self.current_response = requests.get(self.current_job['url'], stream=True)
        self.basic_content_type = self.current_response.headers['content-type'].split(";")[0]
    except:
        self.current_response = None
        self.basic_content_type = None

    if self.current_response:
        for r in self.current_response.history:
            self.message_stack.append("-URL (%s) %s" % (r.status_code, r.url))
        self.message_stack.append("+URL (%s) %s" % (self.current_response.status_code,
                                                    self.current_response.url))
        self.message_stack.append("BASIC CONTENT-TYPE) %s" % self.basic_content_type)
        self.message_stack.append("CONTENT TYPE) %s" % self.current_response.headers['content-type'])
        self.message_stack.append("ENCODING) %s" % self.current_response.encoding)

        if self.basic_content_type in ACCEPTABLE_CONTENT_TYPES:
            # Handle the odd, but real, case of stray processing instructions such as
            # <? palantir_blog_list('sidebar') ?>, then strip HTML comments.
            tidy_response_text = re.sub(r"<\?.*?\?>", "", self.current_response.text)
            tidy_response_text = re.sub(r"<!--.*?-->", "", tidy_response_text)
            self.dom = PyQuery(tidy_response_text, parser='html')
            self.titles = [safe_str(title.text) for title in self.dom("title")]

            # Collect absolute, fragment-free links.
            for a in self.dom('a'):
                new_url = PyQuery(a).attr.href
                if new_url is not None:
                    new_url = urldefrag(urljoin(self.current_response.url, new_url))[0]
                    self.discovered_urls.add(new_url)
            self.message_stack.append("DISCOVERED) %s" % len(self.discovered_urls))

            # Boilerpipe-style cleanup: drop layout tags entirely.
            for excluded_tag in BOILERPIPE_REMOVE_TAGS:
                self.dom(excluded_tag).after("\n")
                self.dom.remove(excluded_tag)

            # Remove tags with style="display:none"
            # (e.g. http://www.microsoft.com/en-us/legal/intellectualproperty/copyright/default.aspx).
            # Use search(), not match(), since the declaration can appear anywhere in the style value.
            display_none_pattern = re.compile("display: ?none")
            for x in self.dom("*"):
                try:
                    tag = PyQuery(x)
                    if tag.attr("style") is not None:
                        if display_none_pattern.search(tag.attr("style")):
                            tag.remove()
                except Exception as inst:
                    print type(inst)
                    print inst.args
                    print inst
            self.save()
        else:
            self.message_stack.append("DISCARDED")
    else:
        self.message_stack.append("NO RESPONSE")
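# The crawler above normalizes each discovered link by resolving it against the response URL
# and dropping the #fragment. A minimal, standalone sketch of that step (shown with Python 3's
# urllib.parse, while the snippet itself targets Python 2's urlparse module):

from urllib.parse import urldefrag, urljoin

base = "http://example.com/blog/post.html"
for raw in ("../about.html#team", "notes.html", "http://other.org/x?page=2#top"):
    # Resolve relative links against the page URL, then strip the fragment.
    absolute, _fragment = urldefrag(urljoin(base, raw))
    print(absolute)
# http://example.com/about.html
# http://example.com/blog/notes.html
# http://other.org/x?page=2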
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None): refreshCursor = '' results = [] resultsAux = [] cookieJar = cookielib.CookieJar() if hasattr(tweetCriteria, 'username') and ( tweetCriteria.username.startswith("\'") or tweetCriteria.username.startswith("\"")) and ( tweetCriteria.username.endswith("\'") or tweetCriteria.username.endswith("\"")): tweetCriteria.username = tweetCriteria.username[1:-1] active = True while active: json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy) if len(json['items_html'].strip()) == 0: break refreshCursor = json['min_position'] scrapedTweets = PyQuery(json['items_html']) #Remove incomplete tweets withheld by Twitter Guidelines scrapedTweets.remove('div.withheld-tweet') tweets = scrapedTweets('div.js-stream-tweet') if len(tweets) == 0: break for tweetHTML in tweets: tweetPQ = PyQuery(tweetHTML) tweet = models.Tweet() txt = re.sub( r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace( '@ ', '@')) try: dateSec = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) except: dateSec = int('1575158200') tweet.text = txt tweet.date = datetime.datetime.fromtimestamp(dateSec) results.append(tweet) resultsAux.append(tweet) if receiveBuffer and len(resultsAux) >= bufferLength: receiveBuffer(resultsAux) resultsAux = [] print len(results) if tweetCriteria.maxTweets > 0 and len( results) >= tweetCriteria.maxTweets: active = False break if receiveBuffer and len(resultsAux) > 0: receiveBuffer(resultsAux) return results
def simple_package(package_name):
    '''
    Given a package name, return all the versions available for downloading
    that package.

    If the package doesn't exist locally, then it will call PyPI (the
    CheeseShop). But if the package exists in the local path, then it will
    get all the versions for the local package.

    This will take into account whether the egg is private or a normal egg
    that was uploaded to PyPI. This matters for the versions of the eggs.
    For example, one project requires request==1.0.4 and another package
    uses request==1.0.3. The installation of the second package would fail
    because that version wasn't downloaded and the **request** folder only
    has the 1.0.4 version.

    To solve this problem, the system uses 2 different kinds of eggs:

    * private eggs: the eggs that you uploaded to the private repo.
    * normal eggs: the eggs that are downloaded from PyPI.

    So the normal eggs will always get the simple page from the PyPI repo,
    while the private eggs will always be read from the filesystem.

    :param package_name: the name of the egg package. This is only the
        name of the package, without the version or anything else.

    :return: a template with all the links to download the packages.
    '''
    app.logger.debug('Requesting index for: %s', package_name)
    package_folder = get_package_path(package_name)
    if (is_private(package_name) or
            (exists(package_name) and app.config['SHOULD_USE_EXISTING'])):
        app.logger.debug('Found information of package: %s in local repository',
                         package_name)
        package_versions = []
        template_data = dict(
            source_letter=package_name[0],
            package_name=package_name,
            versions=package_versions
        )
        for filename in listdir(package_folder):
            if not filename.endswith('.md5'):
                # Only .md5 files are read here, so skip the egg (or tar,
                # or zip) file itself.
                continue
            with open(join(package_folder, filename)) as md5_file:
                md5 = md5_file.read()
            # Remove the .md5 extension.
            name = filename[:-4]
            data = VersionData(name, md5)
            package_versions.append(data)
        return render_template('simple_package.html', **template_data)
    else:
        app.logger.debug('Did not find package: %s in local repository. '
                         'Using proxy.', package_name)
        url = app.config['PYPI_URL'] + 'simple/%s' % package_name
        response = get(url)
        if response.status_code != 200:
            app.logger.warning('Error while getting proxy info for: %s. '
                               'Error details: %s', package_name, response.text)
            abort(response.status_code)
        content = response.content
        p = PyQuery(content)
        external_links = set()
        for anchor in p("a"):
            panchor = PyQuery(anchor)
            href = panchor.attr('href')
            # robin-jarry: modified the href to ../../packages/ so that it
            # also works for non-source packages (.egg, .exe and .msi).
            parsed = urlparse.urlparse(href)
            if parsed.hostname:
                # The link points to an external server.
                if parsed.hostname == 'pypi.python.org':
                    # Remove the hostname to make the URL relative.
                    panchor.attr('href', parsed.path)
                else:
                    if panchor.attr('rel') == 'download':
                        if url_is_egg_file(parsed.path):
                            # href points directly to a package file.
                            external_links.add('<a href="%s">%s</a>'
                                               % (href, basename(parsed.path)))
                        else:
                            # href points to an external page where links to
                            # package files will be found.
                            external_links.update(find_external_links(href))
                    # Whatever happens, remove the link for now; the collected
                    # external_links are re-inserted below.
                    panchor.remove()
            else:
                # Local link to pypi.python.org.
                if not href.startswith('../../packages/'):
                    # Ignore anything other than package links.
                    panchor.remove()
        # After collecting all external links, insert them into the HTML page.
        for link in external_links:
            plink = PyQuery(link)
            href = plink.attr('href')
            plink.attr('href', convert_to_internal_url(href, package_name,
                                                       basename(href)))
            p('a').after(plink)
        content = p.outerHtml()
        return content
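# The view above relies on helpers that are not shown (get_package_path, is_private,
# exists, VersionData, find_external_links, ...). Purely as an assumption, here is a
# minimal sketch of what the filesystem helpers might look like, using a hypothetical
# EGGS_REPOSITORY config key and a .private marker file; the real project may differ.

import os

def get_package_path(package_name):
    # Hypothetical layout: one folder per package under app.config['EGGS_REPOSITORY'].
    return os.path.join(app.config['EGGS_REPOSITORY'], package_name)

def exists(package_name):
    # The package counts as "local" when its folder exists and is non-empty.
    path = get_package_path(package_name)
    return os.path.isdir(path) and bool(os.listdir(path))

def is_private(package_name):
    # Hypothetical marker: private uploads leave a .private flag file in the folder.
    return os.path.isfile(os.path.join(get_package_path(package_name), '.private'))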
def detail(self, url): try: resp = self.session.get(url, timeout=self.cfg.REQUEST_TIME_OUT) # resp = requests.get(url,headers=self.session.headers,timeout=self.cfg.REQUEST_TIME_OUT) # print self.session.headers # resp = requests.get(url,headers=self.session.headers,timeout=20) status_code = resp.status_code pqhtml = PyQuery(resp.text or 'nothing') # print resp.headers #下架 if status_code == 404: log_info = json.dumps( dict(time=time.time(), title=pqhtml('title').text(), url=url)) self.logger.info(log_info) data = tool.get_off_shelf(code=status_code, message=self.cfg.SOLD_OUT, backUrl=resp.url, html=pqhtml.outerHtml()) return tool.return_data(successful=False, data=data) if status_code != 200: log_info = json.dumps( dict(time=time.time(), title=pqhtml('title').text(), url=url)) self.logger.info(log_info) data = tool.get_error(code=status_code, message=self.cfg.GET_ERR.get( 'SCERR', 'ERROR'), backUrl=resp.url, html=pqhtml.outerHtml()) return tool.return_data(successful=False, data=data) Jtxt = pqhtml('script').text() #下架 if 'productDetails' not in Jtxt: log_info = json.dumps( dict(time=time.time(), title=pqhtml('title').text(), url=url)) self.logger.info(log_info) data = tool.get_off_shelf(code=status_code, message=self.cfg.SOLD_OUT, backUrl=resp.url, html=pqhtml.outerHtml()) return tool.return_data(successful=False, data=data) pdata = self.get_pdata(Jtxt) #前期准备 product = pdata['product'] allLooks = product['allLooks'] skuJournal = self.get_skuJournal(Jtxt) sizeAttribute = product['sizeAttribute'] if product.has_key( 'sizeAttribute') else { 'values': [{ 'id': 0, 'value': self.cfg.DEFAULT_ONE_SIZE }] } colorAttribute = product['colorAttribute'] if product.has_key( 'colorAttribute') else { 'values': [{ 'id': 0, 'value': self.cfg.DEFAULT_ONE_COLOR }] } #lookId 和 SkuArr 映射 # lookId2SkuArr = dict([(look['productLookId'],[Id['skuId'] for Id in look['skus']]) for look in allLooks]) #lookId 和 ImgArr 映射 lookId2ImgArr = dict([(look['productLookId'], [ 'http:' + img['retinaQuickViewLookUrl'] for img in look['images'] ]) for look in allLooks]) #lookId 和 现价 映射, 多颜色多价格 lookId2Price = dict([(look['productLookId'], look['pricing']['maxSkuSalePrice']['raw']) for look in allLooks]) #lookId 和 原价 映射,多颜色多价格 lookId2ListPrice = dict([ (look['productLookId'], look['pricing']['maxSkuMsrpPrice']['raw']) for look in allLooks ]) #lookId 和 skuArr 映射 lookId2SkuArr = dict([(look['productLookId'], [Id['skuId'] for Id in look['skus']]) for look in allLooks]) #sizeId 和 名称 映射 #{2000: u's', 2001: u'm', 2002: u'l', 2003: u'xl', 2004: u'xxl'} sizeId2Name = dict([(size['id'], size['value']) for size in sizeAttribute['values']]) #colorId 和 名称 映射 #{1000: u'dark red', 1001: u'true navy'} colorId2Name = dict([(color['id'], color['value']) for color in colorAttribute['values']]) #sku 和 有库存 映射 sku2Inventory = self.get_sku2Inventory(skuJournal) #sku 和 无库存 映射 sku2NoInventory = dict([ (sku['skuId'], sku['numberUnitsForSale']) for sku in skuJournal['entries'] if sku['type'] == 'inventory' and sku['status'] == ['X', 'U'] ]) #更新 库存 字典 sku2Inventory.update(sku2NoInventory) #sku 和 现价 映射, 多size多价格. sku2Price = dict([(sku['skuId'], str(sku['salePrice']['raw'])) for sku in skuJournal['entries'] if sku['type'] == 'pricing']) #sku 和 原价 映射, 多size多价格. 
sku2ListPrice = dict([(sku['skuId'], str(sku['msrpPrice']['raw'])) for sku in skuJournal['entries'] if sku['type'] == 'pricing']) #skuId 和 sizeId 映射 skuId2SizeId = dict([ (sku['skuId'], sku['savId']) for sku in skuJournal['entries'] if sku['type'] == 'associate' and sku['attribute'] == 'Size' ]) #skuId 和 colorId 映射 skuId2ColorId = dict([ (sku['skuId'], sku['savId']) for sku in skuJournal['entries'] if sku['type'] == 'associate' and sku['attribute'] == 'Color' ]) #sku 和 sizeName 映射 sku2SizeName = self.get_sku2SizeName(product, skuId2SizeId, sizeId2Name) #sku 和 colorName 映射 sku2ColorName = self.get_sku2ColorName(product, skuId2ColorId, colorId2Name) #lookId 和 colorId 映射 lookId2ColorId = self.get_lookIe2ColorId(lookId2SkuArr, skuId2ColorId) #lookId 和 colorName 映射 lookId2ColorName = self.get_lookIe2ColorName( lookId2SkuArr, sku2ColorName) #lookId 和 size集合 映射 lookId2Sizes = self.get_lookId2Sizes(lookId2SkuArr, sku2SizeName, sku2Inventory, sku2Price, sku2ListPrice) # print(json.dumps(sku2Price)) # print(json.dumps(sku2ListPrice)) # print(json.dumps(lookId2SkuArr)) # print(json.dumps(sku2ColorName)) # print(json.dumps(lookId2ColorName)) # print(json.dumps(sku2SizeName)) detail = dict() #只获取当前连接中的sku值 try: lookId = None if '-' in url[url.rindex('/'):]: lookId = url[url.rindex('/') + 1:].split('-')[0] lookIds = [int(lookId)] except Exception, e: pass #钥匙 detail['keys'] = lookId2SkuArr.keys() #只获取链接中lookId # detail['keys'] = lookIds or lookId2SkuArr.keys() #颜色 detail['color'] = lookId2ColorName detail['colorId'] = lookId2ColorId #产品ID detail['productId'] = product['productId'] #图片 detail['img'] = dict([(lookId, imgArr[0]) for lookId, imgArr in lookId2ImgArr.items()]) detail['imgs'] = lookId2ImgArr #规格 detail['sizes'] = lookId2Sizes #价格 detail['price'] = lookId2Price detail['listPrice'] = lookId2ListPrice #品牌 brand = pdata['brand']['name'] detail['brand'] = brand #名称 detail['name'] = brand + ' ' + pdata['product']['name'] #货币符号 currency = pdata['defaultLook']['pricing']['currencyCode'] detail['currency'] = currency detail['currencySymbol'] = tool.get_unit(currency) #退换货 detail['returns'] = pdata['returnPolicy']['description'] #描述 dtxt = PyQuery(pdata['product']['description']) dtxt.remove('strong') detail['descr'] = dtxt.text() #HTTP状态码 detail['status_code'] = status_code #状态 detail['status'] = self.cfg.STATUS_SALE #返回链接 detail['backUrl'] = resp.url log_info = json.dumps( dict(time=time.time(), productId=detail['productId'], name=detail['name'], currency=detail['currency'], price=detail['price'], listPrice=detail['listPrice'], url=url)) self.logger.info(log_info) return tool.return_data(successful=True, data=detail)
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None): refreshCursor = "" # results = [] # ORIGINAL CODE LINE # MY MODIFICATION START results = {} # MY MODIFICATION END resultsAux = [] cookieJar = http.cookiejar.CookieJar() active = True while active: json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy) if len(json["items_html"].strip()) == 0: break refreshCursor = json["min_position"] scrapedTweets = PyQuery(json["items_html"]) # Remove incomplete tweets withheld by Twitter Guidelines scrapedTweets.remove("div.withheld-tweet") tweets = scrapedTweets("div.js-stream-tweet") if len(tweets) == 0: break for tweetHTML in tweets: tweetPQ = PyQuery(tweetHTML) tweet = models.Tweet() # usernameTweet = tweetPQ("span.username.js-action-profile-name b").text() usernameTweet = tweetPQ("span:first.username.u-dir b").text() txt = re.sub( r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace("# ", "#").replace( "@ ", "@"), ) # NEW CODE START txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text()) txt = re.sub(r"#\s*", "#", txt) txt = re.sub(r"@\s*", "@", txt) # NEW CODE END retweets = int( tweetPQ( "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) favorites = int( tweetPQ( "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) dateSec = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) id = tweetPQ.attr("data-tweet-id") permalink = tweetPQ.attr("data-permalink-path") user_id = int( tweetPQ("a.js-user-profile-link").attr("data-user-id")) geo = "" geoSpan = tweetPQ("span.Tweet-geo") if len(geoSpan) > 0: geo = geoSpan.attr("title") urls = [] for link in tweetPQ("a"): try: urls.append((link.attrib["data-expanded-url"])) except KeyError: pass tweet.id = id tweet.permalink = "https://twitter.com" + permalink tweet.username = usernameTweet tweet.text = txt tweet.date = datetime.datetime.fromtimestamp(dateSec) tweet.formatted_date = datetime.datetime.fromtimestamp( dateSec).strftime("%a %b %d %X +0000 %Y") tweet.retweets = retweets tweet.favorites = favorites # tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text)) #OLD tweet.mentions = " ".join( re.compile(r"(@\s\w*)").findall(tweet.text)) # NEW # tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text)) # OLD tweet.hashtags = " ".join( re.compile(r"(#\s\w*)").findall(tweet.text)) # NEW tweet.geo = geo tweet.urls = ",".join(urls) tweet.author_id = user_id # tweet.replies = int(tweetPQ("span.ProfileTweet-action--reply span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "").strip()) replies = int( tweetPQ( "span.ProfileTweet-action--reply span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) tweet.reply = replies # NEW CODE START try: tweet.isReply = tweetPQ( "div.ReplyingToContextBelowAuthor").is_("div") if tweet.isReply: tweet.replyTo = tweetPQ( "div.ReplyingToContextBelowAuthor span.username b" ).contents()[0] else: tweet.replyTo = "" except: pass # NEW CODE END # results.append(tweet) # ORIGINAL CODE LINE results[id] = [ tweet.geo, tweet.author_id, tweet.date, tweet.text, tweet.retweets, tweet.favorites, tweet.mentions, tweet.hashtags, tweet.permalink, tweet.reply, tweet.isReply, tweet.replyTo, ] resultsAux.append(tweet) if receiveBuffer and len(resultsAux) >= bufferLength: receiveBuffer(resultsAux) resultsAux = [] if (tweetCriteria.maxTweets > 0 and len(results) >= 
tweetCriteria.maxTweets): active = False break if receiveBuffer and len(resultsAux) > 0: receiveBuffer(resultsAux) return results
def extract(self):
    item = ContentItem()
    # Strip HTML comments.
    self.html = re.sub('<!--.*?-->', '', self.html)

    content_node = self.hxs.select("//div[@class = 'art_con']").extract()
    content_node = PyQuery(content_node[0])
    # Drop pagination, mobile and navigation blocks from the article body.
    content_node.remove('div[class="pconline_page"]')
    content_node.remove('div[class="pc3g"]')
    content_node.remove('div[class="pageTips"]')
    content_node.remove('div[class="art_nav_box mt10"]')
    content_node.remove('div[class="art_bottom"]')
    content_node.remove('div[class="art_con_top"]')

    # Keep only images that have a src and are not .gif banners.
    item['image_urls'] = [self.getRealURI(img.get('src'))
                          for img in content_node('img')
                          if img.get('src') and not img.get('src').endswith('.gif')]

    item['title'] = self.title = self.hxs.select("//h1/text()").extract()[0]
    if not item['title']:
        item['title'] = self.title = self.hxs.select(
            "//div[@id = 'UC_newsInfoDetail_lbl_newsTitle']/text()").extract()[0]
    item['content'] = self.content = content_node.__unicode__()

    release_time = self.hxs.select("//div[@class = 'art_con_top']").extract()[0]
    doc_t = PyQuery(release_time)
    release_time = doc_t('span').text()
    p = re.compile(u'20\d\d年\d\d月\d\d日')
    #item['release_time'] = self.release_time = doc('div[class="art_con_top"]').find('span').eq(0).text()
    match = p.search(release_time)
    item['release_time'] = self.release_time = match.group() if match else ''
    item['source'] = u'pconline'
    item['author'] = ''
    item['pic_url'] = ''
    return item
class HTMLGenerator(object): """HTML Generator """ def __init__(self): self.MAX_WORKERS = 4 self.MULTIPROCESS_BOUND = 20 def load_tree_template(self): """Load tree HTML templates """ with open(os.path.join(os.path.dirname(__file__), 'template', 'tree_template.html')) as f: self.template = PyQuery(f.read(), parser='html') with open(os.path.join(os.path.dirname(__file__), 'template', 'tree_node_template.html')) as f: self.node_template = PyQuery(f.read(), parser='html') self.node_template_html = self.node_template.html() def import_js(self, js_ids): """Import JS to HTML :param js_ids: dict type, {script_id with #: js_file_name} exmaple: {"#script_jquery": "jquery.min.js"} """ _path = os.path.dirname(__file__) for _id in js_ids.iterkeys(): self.template(_id).attr("src", "%s/bin/js/%s" % (_path, js_ids[_id])) # In case that lxml change <script></script> to <script/> self.template(_id).html("var _lxml = 0;") def generate_tree_structure_HTML(self, root_node, output): """Generate a html file with tree structure. :param root_node: RDirNode root of the module :param output: Output html file """ # Init self.load_tree_template() self.tree_nodes = [] self.max_layer = 0 self.import_js({ # script_id : js_file_name "#script_jquery": "jquery.min.js", "#script_rdir_tree": "rdir_tree.js" }) self.template('#header_name').html(root_node.name) self.template('#header_type').html(" <%s>" % root_node.type) header_doc = root_node.doc.replace('\t', ' ' * 4) \ .replace(' ', ' ').replace('\n', '<br/>').strip() if len(header_doc) > 0: self.template('#header_doc').html(header_doc + '<br/>') else: self.template.remove('#header_doc') self.template('title').html(root_node.name) # Recur if len(root_node.list_children()) == 0: # self._add_node_to_HTML("No visible children methods or members.", # "If you see this, that means this object has nothing else to show.", # "404", # 0) pass else: self.render_tree_html(root_node) # Render html for i in xrange(self.max_layer + 1): self.template("#choose_layer").append( "<option value='%d'>%d</option>" % (i, i) ) self.template('#wrapper').append("\n".join(self.tree_nodes)) # Write to file with open(output, 'w') as f: f.write(self.template.html()) def render_tree_html(self, root_node): """ Render the node html. Use multiprocessing to speed up if needed. :param root_node: RDirNode root of the module """ job_list = self.get_job_list(root_node) job_size = len(job_list) if job_size > self.MULTIPROCESS_BOUND: jobs_list = Util.split_jobs(job_list, self.MAX_WORKERS) else: jobs_list = [job_list] pool = multiprocessing.Pool(processes=self.MAX_WORKERS) result = [] html = self.node_template.html() for jobs in jobs_list: if len(jobs) > 0: result.append(pool.apply_async(parse_tree_node_worker, (html, jobs))) # pool.close() # pool.join() self.tree_nodes = [None] * job_size for res in result: res = res.get() for tpl in res: index, node_html = tpl self.tree_nodes[index] = node_html def get_job_list(self, root_node): """Generate the job list :param root_node: RdirNode type, root of rdir_node :return: list type, [(index, rdir_node, depth)] """ job_list = [] for key in root_node.list_children(): job_list += self.recur_node_to_list(root_node.get_children(key), 0) return [(index, job[0], job[1]) for index, job in enumerate(job_list)] def recur_node_to_list(self, rdir_node, depth): """Recursively traverse all the nodes into a sequential list. 
:param rdir_node: :param depth: :return: list type, [(rdir_node, depth)] """ self.max_layer = (self.max_layer < depth) and depth or self.max_layer _list = [(rdir_node, depth)] for key in rdir_node.list_children(): _list += self.recur_node_to_list(rdir_node.get_children(key), depth + 1) return _list
import json
import os
import sys
from subprocess import call

# Assumption: a Template class whose render(**kwargs) matches this call, e.g. jinja2.Template.
from jinja2 import Template
from pyquery import PyQuery

assert len(sys.argv) == 2, "Pass the notebook name as the only argument!"
NOTEBOOK = sys.argv[1]
parts = NOTEBOOK.split('.')
parts[-1] = "html"
HTML_FILE = ".".join(parts)

# Gather the information from the first cell.
with open(NOTEBOOK) as f:
    res = json.load(f)
blocks = json.loads("".join(res['cells'][0]['source']))

# Convert the notebook.
call(['ipython', 'nbconvert', NOTEBOOK, '--to', 'html', '--template', 'basic'])

# Remove input cells.
with open(HTML_FILE) as f:
    doc = PyQuery(f.read(), parser='html')
doc.remove('.input')
blocks['body'] = doc.html()

# Insert into simple template.
BASE_DIR = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(BASE_DIR, 'my_template.html')) as f:
    tmpl = f.read()
template = Template(tmpl)
with open(HTML_FILE, 'w') as f:
    f.write(template.render(**blocks))
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None): refreshCursor = '' results = [] resultsAux = [] cookieJar = http.cookiejar.CookieJar() active = True while active: json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy) if len(json['items_html'].strip()) == 0: break refreshCursor = json['min_position'] scrapedTweets = PyQuery(json['items_html']) #Remove incomplete tweets withheld by Twitter Guidelines scrapedTweets.remove('div.withheld-tweet') tweets = scrapedTweets('div.js-stream-tweet') if len(tweets) == 0: break for tweetHTML in tweets: tweetPQ = PyQuery(tweetHTML) tweet = models.Tweet() usernameTweet = tweetPQ( "span.username.js-action-profile-name b").text() raw_txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text()) txt = raw_txt.replace('#', '# ').replace('@', '@ ') retweets = int( tweetPQ( "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) favorites = int( tweetPQ( "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) dateSec = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) id = tweetPQ.attr("data-tweet-id") permalink = tweetPQ.attr("data-permalink-path") user_id = int( tweetPQ("a.js-user-profile-link").attr("data-user-id")) geo = '' geoSpan = tweetPQ('span.Tweet-geo') if len(geoSpan) > 0: geo = geoSpan.attr('title') urls = [] for link in tweetPQ("a"): try: urls.append((link.attrib["data-expanded-url"])) except KeyError: pass tweet.id = id tweet.permalink = 'https://twitter.com' + permalink tweet.username = usernameTweet tweet.raw_txt = raw_txt tweet.text = txt tweet.date = datetime.datetime.fromtimestamp(dateSec) tweet.formatted_date = datetime.datetime.fromtimestamp( dateSec).strftime("%a %b %d %X +0000 %Y") tweet.retweets = retweets tweet.favorites = favorites tweet.mentions = " ".join( re.compile('@ \\S+').findall(raw_txt)) tweet.hashtags = " ".join( re.compile('# \\S+').findall(raw_txt)) tweet.geo = geo tweet.urls = ",".join(urls) tweet.author_id = user_id results.append(tweet) resultsAux.append(tweet) if receiveBuffer and len(resultsAux) >= bufferLength: receiveBuffer(resultsAux) resultsAux = [] if tweetCriteria.maxTweets > 0 and len( results) >= tweetCriteria.maxTweets: active = False break if receiveBuffer and len(resultsAux) > 0: receiveBuffer(resultsAux) return results
import os

from pyquery import PyQuery

in_dir = '/home/jental/dev/eda_ru/eda.ru/'
# in_dir = '/home/jental/dev/eda_ru/tmp/'

for cdir, dirs, files in os.walk(in_dir):
    for file in files:
        # Round-trip through surrogateescape so badly encoded names don't abort the walk.
        full_filename_b = os.path.join(cdir, file).encode("utf-8", "surrogateescape")
        try:
            full_filename = full_filename_b.decode("utf-8")
            print(full_filename)
            if full_filename.endswith(('.html', '.htm')):
                with open(full_filename, 'r') as fh:
                    html = fh.read()
                jQuery = PyQuery(html)
                # Strip ad links in place, then write the cleaned markup back.
                jQuery.remove('.ad-link')
                with open(full_filename, 'w') as fh:
                    fh.write(jQuery("html").html())
        except Exception:
            # Skip files that cannot be decoded, parsed or rewritten.
            pass
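# Every snippet in this collection follows the same pattern: parse markup with PyQuery,
# prune unwanted nodes with .remove(selector), then read the cleaned text or HTML back out.
# A minimal, self-contained illustration of that pattern (only pyquery required):

from pyquery import PyQuery

html = (
    '<div id="page">'
    '<p class="keep">Useful text.</p>'
    '<div class="ad-link">An ad to strip.</div>'
    '<script>var x = 1;</script>'
    '</div>'
)

doc = PyQuery(html)
# remove(selector) deletes every matching descendant from the parsed tree in place.
doc.remove('div.ad-link')
doc.remove('script')

print(doc.text())        # -> Useful text.
print(doc.outer_html())  # serialized markup with the removed nodes gone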