def get_articles(grab_object, title_path, link_path, source, site_url="",
                 summary_path=''):
    """Extract article dicts (title, link, source, summary) from a fetched page."""
    posts = []
    post_links = grab_object.doc.tree.cssselect(link_path)
    post_titles = grab_object.doc.tree.cssselect(title_path)
    if summary_path:
        summary = grab_object.doc.tree.cssselect(summary_path)
        # Strip <script> and <style> nodes from the summary elements.
        for i in summary:
            for j in i.cssselect('script') + i.cssselect('style'):
                j.drop_tree()
    else:
        summary = []
    # Pad summaries so zip() does not drop articles without one.
    while len(summary) < len(post_links):
        summary.append(u'')
    zip_object = zip(post_links, post_titles, summary)
    for (link, title, summary_text) in zip_object:
        title = unicode_(title.text_content()).strip()
        link = grab_object.make_url_absolute(link.get("href"))
        posts.append(
            {"title": escape_title(title),
             "link": unicode_(link),
             "source": source,
             "summary": summary_text})
    return posts

def get_articles(grab_object, title_path, link_path, source, site_url="",
                 summary_path=''):
    """Extract article dicts (title, link, source, summary) from a fetched page."""
    posts = []
    post_links = grab_object.css_list(link_path)
    post_titles = grab_object.css_list(title_path)
    if summary_path:
        summary = grab_object.css_list(summary_path)
        # Strip <script> and <style> nodes from the summary elements.
        for i in summary:
            for j in i.cssselect('script') + i.cssselect('style'):
                j.drop_tree()
    else:
        summary = []
    # Pad summaries so zip() does not drop articles without one.
    while len(summary) < len(post_links):
        summary.append('')
    zip_object = zip(post_links, post_titles, summary)
    for (link, title, summary_text) in zip_object:
        title = unicode_(title.text_content()).strip()
        link = grab_object.make_url_absolute(link.get("href"))
        posts.append({
            "title": escape_title(title),
            "link": unicode_(link),
            "source": source,
            "summary": summary_text
        })
    return posts

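# unicode_, escape_title and remove_tags are helpers defined elsewhere in the
# project and are not part of this excerpt. The definitions below are only a
# guess at their intent (text coercion, HTML escaping, tag stripping) so the
# snippets in this file can be read and run in isolation; they are not the
# project's actual implementations.
import re


def unicode_(value):
    # Coerce bytes (or any other object) to text; a py2/py3 compatibility shim.
    if isinstance(value, bytes):
        return value.decode('utf8')
    return str(value)


def escape_title(title):
    # Escape HTML-sensitive characters before the title is stored or rendered.
    return (title.replace('&', '&amp;')
                 .replace('<', '&lt;')
                 .replace('>', '&gt;'))


def remove_tags(text):
    # Drop anything that looks like an HTML tag, keeping the inner text.
    return re.sub(r'<[^>]+>', '', text)
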
def has_words(qs, article):
    """Return True if every word in qs occurs in the article's title or summary."""
    text = remove_tags(unicode_(article['title'])).lower() \
        + remove_tags(unicode_(article['summary'])).lower()
    for i in qs:
        if i not in text:
            return False
    return True

def has_words(qs, article):
    """Return True if every word in qs occurs in the article's title or summary."""
    title = unicode_(article['title']).lower()
    summary = unicode_(article['summary']).lower()
    for i in qs:
        if i not in title and i not in summary:
            return False
    return True

def has_words(qs, article):
    """Return True if every word in qs occurs in the article's title or summary."""
    title = remove_tags(unicode_(article['title']).lower())
    summary = remove_tags(unicode_(article['summary']).lower())
    for i in qs:
        if i not in title and i not in summary:
            return False
    return True

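# Small illustration of has_words: every query word must appear (as a
# substring, since plain `in` is used) in either the lowercased title or the
# lowercased summary. The caller lowercases and splits q beforehand. Runs with
# the stand-in helpers sketched above.
_article = {'title': 'Static typing in Python',
            'summary': 'An overview of <em>mypy</em> and type hints'}
assert has_words(['python', 'mypy'], _article)
assert not has_words(['python', 'rust'], _article)
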
def show_blacklist(page_number=1):
    history_page = mylookup.get_template('blacklist.html')
    q = unicode_(request.GET.get('q', ''))
    articles = recommend.get_blacklist(db=get_conf.config.db)
    try:
        page_number = int(page_number)
    except ValueError:
        page_number = 1
    if q:
        qs = q.lower().split()
        articles = filter(lambda x: has_words(qs, x), articles)
    articles = map(lambda x: replace_newlines(escape_link(x)), articles)
    all_articles = articles
    articles = split_into_pages(articles, 30)
    try:
        requested_page = articles[page_number - 1]
    except IndexError:
        requested_page = []
    return history_page.render(articles=requested_page,
                               num_pages=len(articles),
                               page_num=page_number,
                               q=q,
                               all_articles=all_articles)

def show_blacklist(page_number=1):
    blacklist_page = mylookup.get_template('blacklist.html')
    q = unicode_(request.GET.get('q', ''))
    articles = recommend.get_blacklist()
    try:
        page_number = int(page_number)
    except ValueError:
        page_number = 1
    if q:
        qs = q.lower().split()
        articles = filter(lambda x: has_words(qs, x), articles)
    articles = map(lambda x: escape_link(x), articles)
    all_articles = articles
    articles = split_into_pages(articles, 30)
    try:
        requested_page = articles[page_number - 1]
    except IndexError:
        requested_page = []
    return blacklist_page.render(articles=requested_page,
                                 num_pages=len(articles),
                                 page_num=page_number,
                                 q=q,
                                 page='blacklist',
                                 config=get_conf.config)

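# split_into_pages is a pagination helper defined elsewhere in the project: the
# callers above expect it to return a list of fixed-size pages, so that len()
# gives the page count and indexing with page_number - 1 gives one page. A
# minimal sketch under that assumption, not the project's implementation:
def split_into_pages(items, per_page):
    # Materialise the iterable and chop it into chunks of per_page articles.
    items = list(items)
    return [items[i:i + per_page] for i in range(0, len(items), per_page)]
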
def show_blacklist(page_number=1):
    try:
        page_number = int(page_number)
    except ValueError:
        page_number = 1
    q = unicode_(request.GET.getunicode('q', ''))
    # Cache key combines the page number with the hex-encoded query string.
    html = get_cache('cached_blacklist_{0}_{1}'.format(
        page_number, hexlify(q.encode('utf8')).decode('utf8')))
    if html:
        return html
    blacklist_page = cache['templates'].get('blacklist.html')
    if not blacklist_page:
        blacklist_page = mylookup.get_template('blacklist.html')
        cache['templates']['blacklist.html'] = blacklist_page
    # TODO cache data
    articles = recommend.get_blacklist()
    if q:
        qs = q.lower().split()
        articles = iter(filter(lambda x: has_words(qs, x), articles))
    articles = iter(map(lambda x: escape_link(x), articles))
    requested_page = list(get_page(articles, 30, page_num=page_number))
    # Peek one item past the requested page: if it exists, at least one more
    # page follows.
    num_pages = page_number
    try:
        next(articles)
        num_pages += 1
    except StopIteration:
        pass
    html = blacklist_page.render(articles=requested_page,
                                 num_pages=num_pages,
                                 page_num=page_number,
                                 q=q,
                                 page='blacklist',
                                 config=get_conf.config).decode('utf8')
    if len(requested_page):
        cache_data('cached_blacklist_{0}_{1}'.format(
            page_number, hexlify(q.encode('utf8')).decode('utf8')), html)
    return html

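# get_page, get_cache and cache_data are also project helpers this excerpt does
# not define. The stand-ins below only illustrate the interface the views above
# rely on: get_page lazily slices one page out of an iterator (leaving the
# iterator positioned so a further next() reveals whether another page exists),
# and get_cache/cache_data read and write rendered HTML by key. The real
# project may well back the cache with a database or an external store.
from itertools import islice

_html_cache = {}


def get_page(items, size, page_num=1):
    # Skip the first (page_num - 1) * size items, then yield up to size items.
    start = (page_num - 1) * size
    return islice(items, start, start + size)


def get_cache(key):
    # Return previously cached HTML for this key, or None when absent.
    return _html_cache.get(key)


def cache_data(key, value):
    # Store rendered HTML under the given key.
    _html_cache[key] = value
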
def get_articles():
    g = grab.Grab()
    parser.setup_grab(g)
    g.go('http://planet.clojure.in')
    css_path = '.entry .article > h2 a'
    summary_texts = []
    for elem in g.css_list(".entry .article"):
        text = ''
        for children in elem.getchildren()[1:-1]:
            text += unicode_(children.text_content()).strip()
        summary_texts.append(parser.cut_text(text))
    posts = parser.get_articles(g, css_path, css_path, 'planetclojure',
                                'planet.clojure.in')
    for (post, summary_text) in zip(posts, summary_texts):
        post['summary'] = summary_text
    return posts

def article_list(page_number=1):
    """Show list of articles | Search for articles"""
    main_page = mylookup.get_template("articles.html")
    q = unicode_(request.GET.get('q', ''))
    try:
        page_number = int(page_number)
    except ValueError:
        page_number = 1
    try:
        articles = load_articles()
    except IOError:
        dump_articles()
        articles = load_articles()
    articles = filter_articles(articles)
    if q:
        qs = q.lower().split()
        articles = filter(lambda x: has_words(qs, x), articles.values())
        articles = map(lambda x: escape_link(x), articles)
    else:
        articles = map(lambda x: escape_link(x), articles.values())
    all_articles = articles
    articles = split_into_pages(articles, 30)
    try:
        requested_page = articles[page_number - 1]
        set_liked(requested_page)
    except IndexError:
        requested_page = []
    return main_page.render(articles=requested_page,
                            num_pages=len(articles),
                            page_num=page_number,
                            q=q,
                            page='main',
                            config=get_conf.config,
                            is_parsing=get_var('parsing', '0') == '1')

def unescape(text):
    """Replace HTML character references (&#...; and named entities) in text."""
    try:
        text = unicode_(text)
    except TypeError:
        pass

    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # Numeric character reference, decimal or hexadecimal.
            try:
                if text[:3] == "&#x":
                    return chr(int(text[3:-1], 16))
                else:
                    return chr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # Named entity such as &amp; or &gt;.
            try:
                text = chr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text

    return TextClassifier.unicode_chr_regex.sub(fixup, text)

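# Behaviour sketch for unescape, assuming TextClassifier.unicode_chr_regex
# matches both numeric and named character references (something like
# re.compile(r'&#?\w+;')) and that htmlentitydefs is the stdlib module
# (html.entities on Python 3). With those assumptions:
#
#     unescape('5 &gt; 3 &amp;&amp; &#x41; == &#65;')  # -> '5 > 3 && A == A'
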
def article_list(page_number=1):
    """Show list of articles | Search for articles"""
    main_page = mylookup.get_template("articles.html")
    q = unicode_(request.GET.get('q', ''))
    try:
        page_number = int(page_number)
    except ValueError:
        page_number = 1
    try:
        articles = load_articles()
    except IOError:
        dump_articles()
        articles = load_articles()
    articles = filter_articles(articles)
    if q:
        qs = q.lower().split()
        articles = filter(lambda x: has_words(qs, x[0]), articles)
    articles = map(lambda x: replace_newlines(escape_link(x[0])), articles)
    all_articles = articles
    articles = split_into_pages(articles, 30)
    try:
        requested_page = articles[page_number - 1]
        set_liked(requested_page)
    except IndexError:
        requested_page = []
    return main_page.render(articles=requested_page,
                            num_pages=len(articles),
                            page_num=page_number,
                            q=q,
                            all_articles=all_articles)

def article_list(page_number=1):
    """Show list of articles | Search for articles"""
    try:
        page_number = int(page_number)
    except ValueError:
        page_number = 1
    q = unicode_(request.GET.getunicode('q', ''))
    if not get_conf.config.enable_random or q:
        # Cache key combines the page number with the hex-encoded query string.
        html = get_cache('cached_main_{0}_{1}'.format(
            page_number, hexlify(q.encode('utf8')).decode('utf8')))
        if html:
            return html
    main_page = cache['templates'].get('articles.html')
    if not main_page:
        main_page = mylookup.get_template("articles.html")
        cache['templates']['articles.html'] = main_page
    cache_page = False
    if get_conf.config.data_format == 'db':
        if q:
            articles = select_all_articles()
            qs = q.lower().split()
            requested_page = []
            j = 0                        # matching articles seen so far
            k = (page_number - 1) * 30   # matches belonging to earlier pages
            n = 0                        # matches on or after this page
            append = requested_page.append
            for article in articles:
                if has_words(qs, article):
                    j += 1
                    if j > k:
                        n += 1
                        if n == 31:
                            # A 31st match means at least one more page exists.
                            break
                        else:
                            append(escape_link(article))
            num_pages = page_number - 1 + math.ceil(n / 30.0)
            cache_page = True
        else:
            requested_page = select_articles_from_page(page_number)
            requested_page = list(map(lambda x: escape_link(x), requested_page))
            num_pages = int(get_var('num_pages', 0))
            if num_pages == 0:
                num_pages += 1
                cache_page = False
            else:
                cache_page = True
    else:
        # TODO reduce memory usage
        try:
            articles = load_articles()
        except IOError:
            dump_articles()
            articles = load_articles()
        if q:
            qs = q.lower().split()
            articles = iter(filter(lambda x: has_words(qs, x), articles.values()))
            articles = iter(map(lambda x: escape_link(x), articles))
            cache_page = True
        else:
            articles = iter(map(lambda x: escape_link(x), articles.values()))
        articles = split_into_pages(articles, 30)
        num_pages = len(articles)
        try:
            requested_page = articles[page_number - 1]
        except IndexError:
            requested_page = []
    if get_conf.config.enable_random and not q:
        random_articles = [escape_link(x) for x in
                           articles_from_list(getRandomArticles(page_number))]
        set_liked(random_articles)
    else:
        random_articles = []
    set_liked(requested_page)
    html = main_page.render(articles=requested_page,
                            random_articles=random_articles,
                            num_pages=num_pages,
                            page_num=page_number,
                            q=q,
                            page='main',
                            config=get_conf.config,
                            is_parsing=get_var('parsing', '0') == '1').decode('utf8')
    if not cache_page:
        cache_page = bool(len(requested_page))
    if cache_page and not get_conf.config.enable_random and not q:
        cache_data('cached_main_{0}_{1}'.format(
            page_number, hexlify(q.encode('utf8')).decode('utf8')), html)
    return html