def ichinese():
    urls = get_test_urls()
    res = defaultdict(int)
    for url in urls:
        html = get_or_cache(url)
        chs = get_chinese(html)
        for ch in chs:
            res[ch] += 1
    res = '|'.join([
        a for a, b in sorted(
            filter(lambda x: x[1] >= 40, res.iteritems()),
            key=lambda x: -x[1])
    ])
    save_json('chineses.json', res)
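# get_chinese() above is assumed to return the Chinese characters found in a
# page, which ichinese() then counts and keeps when they occur at least 40
# times. A minimal sketch of such a helper, matching the CJK Unified
# Ideographs range (hypothetical -- not necessarily this project's
# implementation; assumes html has already been decoded to unicode):
import re

CHINESE_RE = re.compile(u'[\u4e00-\u9fa5]')

def get_chinese_sketch(html):
    # Return every CJK character occurrence, duplicates included,
    # so the caller can build a frequency table.
    return CHINESE_RE.findall(html)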
def article():
    url = request.args.get('url')
    article = mongo.article.find_one({'_id': url})
    if not article:
        try:
            html = get_or_cache(url)
            article = html2article(html, url, selector=True, merge=True)
            if article and not article['src_name']:
                article['src_name'] = get_domain(url)
            tpl = url2tpl(url)
            urls = html2urls(html, url)
            texts = dict(map(lambda x: (x[0], max(x[1], key=lambda y: len(y))), urls.iteritems()))
            tmp = dict(map(lambda x: (x, url2tpl(x)), texts.iterkeys()))
            urls = {}
            for u, t in tmp.iteritems():
                if u != url and t == tpl:
                    urls[u] = texts[u]
                    if len(urls) >= 10:
                        break
            if article:
                article['urls'] = urls
                article['_id'] = url
                article['view'] = 1
                article['last'] = time.time()
                copy = article.copy()
                copy['urls'] = json.dumps(copy['urls'])
                mongo.article.save(copy)
        except:
            pass
    else:
        article['urls'] = json.loads(article['urls'])
        mongo.article.update({'_id': url}, {'$set': {'view': article['view'] + 1}})
    if article:
        article['pubtime'] = article['pubtime'][:10]
    return render_template('extract/article.html', article=article, url=url)
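# url2tpl() is used above to decide whether a linked URL belongs to the same
# template as the current article, so related links can be collected. An
# illustrative sketch of that kind of normalisation (collapse digit runs in
# the path so /news/2014/123.html and /news/2014/456.html compare equal);
# this is an assumption for illustration, not the project's actual url2tpl:
import re
from urlparse import urlsplit   # Python 2, matching the code in this file

def url2tpl_sketch(url):
    parts = urlsplit(url)
    # Keep the host, replace every digit run in the path with a placeholder.
    return parts.netloc + re.sub(r'\d+', '{d}', parts.path)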
def test_article():
    debug = True if request.args.get('debug') == 'true' else False
    url = request.args.get('url', '')
    if not url.startswith('http://'):
        return 'url is not startswith http://'
    add_test_url(url)
    html = get_or_cache(url, print_path=True)
    extractor = ArticleExtractor(html, url, debug=debug)
    article = extractor.article
    selector = extractor.selector
    if extractor.pages and article:
        article['content'] = ArticleMerger(
            url,
            extractor.title,
            fetch_urls(extractor.pages, handle=get_or_cache),
            debug=debug,
            **selector
        ).content
    return json.dumps({'url': url, 'article': article, 'selector': selector})
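# add_test_url()/get_test_urls() are used throughout these test views and are
# assumed to maintain a simple persisted list of sample URLs. A hypothetical
# sketch of such helpers (the file name and storage mechanism are assumptions;
# the real implementation is not shown in this file):
import json
import os

TEST_URLS_FILE = 'test_urls.json'   # hypothetical path

def get_test_urls_sketch():
    if not os.path.exists(TEST_URLS_FILE):
        return []
    with open(TEST_URLS_FILE) as f:
        return json.load(f)

def add_test_url_sketch(url):
    urls = get_test_urls_sketch()
    if url not in urls:
        urls.append(url)
        with open(TEST_URLS_FILE, 'w') as f:
            json.dump(urls, f)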
def run(self):
    html = get_or_cache(self.url)
    doc = clean_html(html, self.url, return_doc=True)
    urls = html2urls(html, self.url, name=False)
    urls = sorted(urls, key=self.score)
    name = tag2text(doc, 'meta', property="og:site_name")
    if name:
        self.name = self.text = name
    else:
        cnt = 10
        while cnt <= 100:
            if self.get_name(urls[:cnt]):
                print self.domain, cnt
                break
            cnt += 10
    if self.name is not None:
        self.get_sub(urls)
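# tag2text(doc, 'meta', property="og:site_name") above reads the site name
# from the Open Graph meta tag before falling back to guessing it from common
# title words. Assuming doc is an lxml HTML element (clean_html is called with
# return_doc=True), the equivalent direct lookup would be this sketch:
def og_site_name(doc):
    values = doc.xpath('//meta[@property="og:site_name"]/@content')
    return values[0].strip() if values else None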
def test_article_all():
    urls = get_test_urls()
    res = []
    for url in urls:
        try:
            html = get_or_cache(url)
        except:
            continue
        try:
            extractor = ArticleExtractor(html, url)
            res.append({
                'url': url,
                'article': extractor.article,
                'selector': extractor.selector,
            })
        except ArticleNotFound, e:
            print url
            print str(e)
            continue
def test_segment(url):
    url = url.split('#')[0].split('?')[0]
    if not url.startswith('http://'):
        return 'url is not startswith http://'
    add_test_url(url)
    html = get_or_cache(url)
    extractor = ArticleExtractor(html, url)
    content = extractor.content
    if extractor.pages:
        content = ArticleMerger(
            url,
            extractor.title,
            fetch_urls(extractor.pages, handle=get_or_cache),
            **extractor.selector
        ).content
    return json.dumps({
        'url': url,
        'title': extractor.title,
        'words': segmentor.seg(extractor.title, html2text(content))
    })
def test_segment_all():
    urls = get_test_urls()
    res = []
    for url in urls:
        html = get_or_cache(url)
        extractor = ArticleExtractor(html, url)
        content = extractor.content
        if extractor.pages:
            content = ArticleMerger(
                url,
                extractor.title,
                fetch_urls(extractor.pages, handle=get_or_cache),
                **extractor.selector
            ).content
        res.append({
            'url': url,
            'title': extractor.title,
            'words': segmentor.seg(extractor.title, html2text(content, code=False))
        })
    return json.dumps(res)
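# The same pagination-merge pattern appears in test_article, test_segment and
# test_segment_all above: when the extractor reports extra pages, fetch them
# and merge their content with ArticleMerger. A possible shared helper, a
# sketch built only from the calls shown above (the name merged_content is an
# assumption, not an existing function in this project):
def merged_content(url, extractor):
    """Return the article content, merging paginated pages when present."""
    if extractor.pages:
        return ArticleMerger(
            url,
            extractor.title,
            fetch_urls(extractor.pages, handle=get_or_cache),
            **extractor.selector
        ).content
    return extractor.content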
def test_article_all():
    urls = get_test_urls()
    res = []
    for url in urls:
        try:
            html = get_or_cache(url)
            extractor = Article(html, url)
            res.append({
                'url': url,
                'article': extractor.article,
                'selector': extractor.selector,
            })
            from utils import print_dict
            print_dict(extractor.article)
            print_dict(extractor.selector)
        except:
            print 'error', url
        print '-' * 80
    print len(urls)
    return json.dumps(res)
def test(url):
    html = get_or_cache(url)
    urls = html2urls(html, url, name=False)
    words = defaultdict(int)
    u = set()
    for i in urls:
        if i.startswith(url) and len(get_path(i).split('/')) <= 2:
            u.add(i)
    if len(u) < 10:
        for i in urls:
            if i.startswith(url) and len(get_path(i).split('/')) <= 3:
                u.add(i)
    if len(u) < 20:
        for i in urls:
            if i.startswith(url) and len(get_path(i).split('/')) <= 4:
                u.add(i)
    urls = list(u)[:10]
    for i in urls:
        res = url2meta(i, get=get_or_cache)
        if res is not None:
            if '_' in res['title']:
                for word in res['title'].split('_'):
                    if word.strip():
                        words[word.strip()] += 1
            elif '|' in res['title']:
                for word in res['title'].split('|'):
                    if word.strip():
                        words[word.strip()] += 1
            elif '-' in res['title']:
                for word in res['title'].split('-'):
                    if word.strip():
                        words[word.strip()] += 1
    print_dict(words, cmp_key=lambda x: -x[1], limit=5)
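# The three nearly identical separator branches in test() could be collapsed
# with a single regex split. A sketch of that alternative (note the behaviour
# differs slightly: it splits on any of the separators at once instead of
# preferring '_' over '|' over '-'); split_title is an illustrative name, not
# an existing helper in this project:
import re

def split_title(title):
    return [w.strip() for w in re.split(r'[_|-]', title) if w.strip()]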
def test_article(url):
    debug = True if request.args.get('debug') == 'true' else False
    url = url.split('#')[0].split('?')[0]
    if not url.startswith('http://'):
        return 'url is not startswith http://'
    add_test_url(url)
    html = get_or_cache(url)
    extractor = Article(html, url, debug=debug)
    article = extractor.article
    selector = extractor.selector
    from utils import print_dict
    print_dict(article)
    print_dict(selector)
    # if extractor.pages:
    #     article['content'] = ArticleMerger(
    #         url,
    #         extractor.title,
    #         fetch_urls(extractor.pages, handle=get_or_cache),
    #         debug=debug,
    #         **selector
    #     ).content
    return json.dumps({'url': url, 'article': article, 'selector': selector})