def url_matcher(event, url, *args, **kwargs):
    html = requests.get(url).text
    readable_article = Document(html).summary().encode("utf-8")
    readable_article = TAG_RE.sub('', readable_article)
    readable_article = WHITESPACE_RE.sub(' ', readable_article)
    readable_article = readable_article.replace('\n', ' ')
    # Drop non-breaking-space entities (the entity string was decoded to a
    # bare space when this snippet was extracted; '&nbsp;' is the likely original)
    readable_article = readable_article.replace('&nbsp;', '')
    if len(readable_article) > 75:
        readable_article = readable_article[:75] + '...'
    readable_title = Document(html).short_title().encode("utf-8")
    return "> " + url + " > " + readable_title + " > " + readable_article
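Both url_matcher variants in this section rely on module-level setup that was not part of the extracted functions. A minimal sketch, assuming TAG_RE strips leftover tags and WHITESPACE_RE collapses whitespace runs (the exact patterns are assumptions):

import re
import requests
from readability.readability import Document

TAG_RE = re.compile(r'<[^>]+>')     # strip any HTML tags left in the summary
WHITESPACE_RE = re.compile(r'\s+')  # collapse runs of whitespace to one space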
def getTextFromHTML(self, url_id):
    """ Runs Readability (Document) on the HTML text """
    html_row = get_html(self.pg_conn, url_id)
    if not html_row or 'html' not in html_row:
        return False
    if html_row['readabletext'] and html_row['readabletext'] != '':
        return html_row['readabletext']
    html = html_row['html']
    try:
        html_summary = Document(html).summary(html_partial=True)
        html_summary = html_summary.replace('\n', '').replace('\t', '')
        if (len(html_summary) < 150
                or "Something's wrong here..." in html_summary
                or "<h1>Not Found</h1><p>The requested URL" in html_summary
                or html_summary == "<html><head/></html>"
                or "403 Forbidden" in html_summary):
            return False
        raw_text = lxml.html.document_fromstring(html_summary).text_content()
    except Exception:
        raw_text = False
    if raw_text:
        save_readabletext(self.pg_conn, url_id, raw_text, 'meta')
    else:
        save_readabletext(self.pg_conn, url_id, '', 'meta')
    return raw_text
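getTextFromHTML leans on two storage helpers that are not shown. A hypothetical sketch of their shape, with invented table and column names, just to make the contract explicit:

def get_html(pg_conn, url_id):
    # Return the stored row for url_id as a dict with 'html' and
    # 'readabletext' keys, or None if the row is missing.
    with pg_conn.cursor() as cur:
        cur.execute("SELECT html, readabletext FROM pages WHERE url_id = %s",
                    (url_id,))
        row = cur.fetchone()
    return {'html': row[0], 'readabletext': row[1]} if row else None


def save_readabletext(pg_conn, url_id, text, source):
    # Cache the extracted text next to the original HTML.
    with pg_conn.cursor() as cur:
        cur.execute("UPDATE pages SET readabletext = %s, text_source = %s "
                    "WHERE url_id = %s", (text, source, url_id))
    pg_conn.commit()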
def getText():
    dataList = []
    for f in os.listdir('unsupervised\\documents'):
        filePath = 'unsupervised\\documents\\' + f
        fileName, fileExtension = os.path.splitext(filePath)
        if fileExtension.lower() == '.docx':
            # Word document
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text)
        elif fileExtension.lower() == '.pdf':
            # PDF document -- TODO
            pass
        elif fileExtension.lower() in ('.html', '.htm'):
            # HTML file
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            article = Document(source).summary()
            title = Document(source).title()
            soup = BeautifulSoup(article, 'lxml')
            final = replaceTwoOrMore(
                title.replace('\n', ' ').replace('\r', '') + '.' +
                soup.text.replace('\n', ' ').replace('\r', ''))
            dataList.append(final)
        else:
            # Undetected document type
            pass
    return dataList
def fetch_article_contents(self):
    """
    Uses Readability.js + BS4 methods to parse raw html list
    and outputs list of text in an article
    """
    for article in self.raw_html:
        article = Document(article).summary()
        article = BeautifulSoup(article)
        for tag in article.find_all('img'):
            tag.extract()
        for tag in article.find_all('embed'):
            tag.extract()
        article = article.get_text()
        article = unicode(article)
        article = article.replace('\t', '')
        article = article.replace('\n', ' ')
        self.article_html.append(article)
    return self.article_html
def main():
    # Command line args are in sys.argv[1], sys.argv[2] ...
    # sys.argv[0] is the script name itself and can be ignored
    dataList = []
    for f in os.listdir('documents'):
        filePath = 'documents\\' + f
        fileName, fileExtension = os.path.splitext(filePath)
        if fileExtension.lower() == '.docx':
            # Word document
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text)
        elif fileExtension.lower() == '.pdf':
            # PDF document -- TODO
            # with open(filePath) as f:
            #     doc = slate.PDF(f)
            #     print doc[1]
            pass
        elif fileExtension.lower() in ('.html', '.htm'):
            # HTML file
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            article = Document(source).summary()
            title = Document(source).title()
            soup = BeautifulSoup(article, 'lxml')
            final = replaceTwoOrMore(
                title.replace('\n', ' ').replace('\r', '') + '.' +
                soup.text.replace('\n', ' ').replace('\r', ''))
            dataList.append(final)
        else:
            # Undetected document type
            pass

    cachedStopWords = stopwords.words("english")
    combined = ' '.join(dataList)
    bloblist = [tb(combined)]
    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        # Use the cached stopword list rather than re-reading the NLTK corpus
        # for every word
        scores = {word: tfidf(word, blob, bloblist)
                  for word in blob.words if word not in cachedStopWords}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
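main() relies on tb and tfidf helpers defined elsewhere. They match the shape of the classic TextBlob tf-idf tutorial; a sketch under that assumption:

from __future__ import division  # float division under Python 2

import math
from textblob import TextBlob as tb


def tf(word, blob):
    # Term frequency: how often `word` occurs in this document.
    return blob.words.count(word) / len(blob.words)


def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)


def idf(word, bloblist):
    # Inverse document frequency, smoothed to avoid division by zero.
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))


def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)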
def get_article(url, referrer=None):
    """Fetch the html found at url and use the readability
    algorithm to return just the text content"""
    html = load_url(url, referrer)
    if html is not None:
        doc_html = Document(html).summary(html_partial=True)
        # Un-escape ampersands and turn non-breaking-space entities into
        # newlines (the entity strings were decoded to bare characters when
        # this snippet was extracted; '&amp;' and '&nbsp;' are likely originals)
        clean_html = doc_html.replace('&amp;', u'&').replace(u'&nbsp;', u'\n')
        return BeautifulSoup(clean_html).getText(separator=u' ').replace(u'\xa0', u' ')
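get_article assumes a load_url helper that returns the page body or None on failure; a minimal sketch using requests (the timeout and header choice are assumptions):

import requests


def load_url(url, referrer=None):
    headers = {'Referer': referrer} if referrer else {}
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None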
def url_matcher(event, url, *args, **kwargs):
    r = requests.head(url)
    # Files that are too big cause trouble. Let's just ignore them.
    if 'content-length' in r.headers and \
            int(r.headers['content-length']) > 5e6:
        return
    html = requests.get(url).text
    readable_article = Document(html).summary().encode("utf-8")
    readable_article = TAG_RE.sub('', readable_article)
    readable_article = WHITESPACE_RE.sub(' ', readable_article)
    readable_article = readable_article.replace('\n', ' ')
    # Drop non-breaking-space entities, as in the variant above
    readable_article = readable_article.replace('&nbsp;', '')
    if len(readable_article) > 75:
        readable_article = readable_article[:75] + '...'
    readable_title = Document(html).short_title().encode("utf-8")
    return "> " + url + " > " + readable_title + " > " + readable_article
def get_main_text(html):
    main_text = Document(html).summary()
    main_text = BeautifulSoup(main_text).getText()
    # Collapse runs of blank lines
    r = re.compile(r'\n+', re.M | re.S)
    main_text = r.sub('\n', main_text)
    # Strip a leading newline, if present
    if main_text.find('\n') == 0:
        main_text = main_text.replace('\n', '', 1)
    return main_text
def getTextFromHTML(self, html):
    """ Runs Readability (Document) on the HTML text """
    try:
        html_summary = Document(html).summary(html_partial=True)
        html_summary = html_summary.replace('\n', '').replace('\t', '')
        if ("Something's wrong here..." in html_summary
                or "<h1>Not Found</h1><p>The requested URL" in html_summary
                or html_summary == "<html><head/></html>"
                or "403 Forbidden" in html_summary):
            return False
        raw_text = lxml.html.document_fromstring(html_summary).text_content()
    except Exception:
        raw_text = False
    return raw_text
def get_content(self, url):
    rt_result = []
    dr = re.compile(r'<[^>]+>', re.S)
    html = urllib.urlopen(url).read()
    cur_title = Document(html).short_title().replace(' ', '')
    readable_article = Document(html).summary()
    print readable_article.encode('utf8')
    readable_article = readable_article.replace(' ', '')
    cur_list = readable_article.replace('</p>', '\n').split('\n')
    for item in cur_list:
        if '<img' in item and 'src=' in item:
            dom = soupparser.fromstring(item)
            if len(dom) > 0:
                img_path = dom[0].xpath('.//img')
                for img in img_path:
                    # '0' marks an image URL
                    rt_result.append(['0', img.get('src')])
        else:
            use_item = dr.sub('', item).replace(' ', '')
            if len(use_item) > 10:
                # '1' marks a text paragraph
                rt_result.append(['1', use_item])
    return cur_title, rt_result
def extrat_html_document(url):
    try:
        print "extrat_html_document"
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        r = urllib2.Request(url, headers=headers)
        socket = urllib2.urlopen(r, timeout=1)
        url = socket.geturl()
        html = socket.read()
        # Skip blocked URLs
        for bl_url in block_url:
            if len(url.split(bl_url)) > 1:
                summary = "block"
                return summary
        # For exception URLs, return only the title
        for ext_url in exception_url:
            if len(url.split(ext_url)) > 1:
                readable_title = Document(html).short_title()
                summary = readable_title.encode('utf-8')
                _file.write(summary + '\n')
                return summary
        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        summary = readable_title.encode('utf-8') + readable_title.encode('utf-8')
        print "soup start"
        soup = BeautifulSoup(readable_article.replace("br/", "p"), "html.parser")
        print "summary:"
        for s in soup("p"):
            summary += str(s.encode('utf-8'))
        # summary += readable_article.encode('utf-8')
    except Exception:
        _file.write('extrat_html_document Failed URL : ' + url + '\n')
        summary = "Failed Get data"
    return summary
def parser_content(url):
    rt_result = []
    dr = re.compile(r'<[^>]+>', re.S)
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary().encode('utf8')
    readable_article = readable_article.replace(' ', '')
    cur_list = readable_article.split('\n')
    for item in cur_list:
        if '<img' in item and 'src=' in item:
            dom = soupparser.fromstring(item)
            if len(dom) > 0:
                img_path = dom[0].xpath('.//img')
                for img in img_path:
                    rt_result.append(['0', img.get('src')])
        else:
            use_item = dr.sub('', item).replace(' ', '')
            if len(use_item) > 10:
                rt_result.append(['1', use_item])
    return rt_result
def parse_item(self, response):
    sel = Selector(response)
    try:
        print(response.url)
        # region title
        if sel.css("h1#title::text").extract_first().strip():
            title = sel.css("h1#title::text").extract_first().strip()
        elif sel.xpath("//title/text()").extract_first().strip():
            title = sel.xpath("//title/text()").extract_first().strip()
        else:
            title = ""
        # endregion
        # region publish_date
        if sel.css("span#pubtime::text").re_first(r"\d{4}年\d{2}月\d{2}日"):
            publish_date = sel.css("span#pubtime::text").re_first(r"\d{4}年\d{2}月\d{2}日")
        else:
            publish_date = ""
        # endregion
        # region reference
        if sel.css("span#pubtime::text").re_first(r"来源:(.*)"):
            reference = sel.css("span#pubtime::text").re_first(r"来源:(.*)")
        else:
            reference = ""
        # endregion
        # region keywords
        if sel.xpath("//div[@class='zuoyou0']/div[5]/font/text()").extract():
            keywords = ",".join(
                sel.xpath("//div[@class='zuoyou0']/div[5]/font/text()").extract())
        elif sel.xpath("//div[@class='zuoyou0']/div[4]/font/text()").extract():
            keywords = ",".join(
                sel.xpath("//div[@class='zuoyou0']/div[4]/font/text()").extract())
        elif sel.xpath('//meta[@name="keywords"]/@content').extract_first():
            # extract_first() returns a single string; joining it would insert
            # commas between characters, so use it directly
            keywords = sel.xpath('//meta[@name="keywords"]/@content').extract_first()
        else:
            keywords = ''
        # endregion
        # region html_content
        if sel.xpath("//div[@class='duiqi']/p/font/text()").extract():
            html_content = "".join(
                sel.xpath("//div[@class='duiqi'][2]/p").extract())
        else:
            html_content = Document(response.text).summary()
            # Strip the <html><body> wrapper readability adds around the summary
            html_content = html_content.replace('<html><body>', '').replace('</body></html>', '')
        content = "".join(Selector(text=html_content).css("::text").extract())
        # endregion
        # region img_url
        if Selector(text=html_content).css("img::attr(src)").extract():
            img = Selector(text=html_content).css("img::attr(src)").extract_first()
        else:
            img = ''
        # endregion
        # region item
        i = ItemLoader(item=NewsItem(), response=response)
        if title:
            i.add_value(field_name='title', value=title)
            i.add_value(field_name='publish_date', value=publish_date)
            i.add_value(field_name="reference", value=reference)
            # i.add_value(field_name="author", value=author)
            i.add_value(field_name="keywords", value=keywords)
            i.add_value(field_name="html_content", value=html_content)
            i.add_value(field_name="image_url", value=img)
            yield i.load_item()
        # endregion
    except Exception as e:
        print(e.args)
def get_cleaned_html_from_url(url):
    readable_article = Document(get_html(url)).summary()
    # Normalize curly quotes to straight ASCII quotes
    readable_article = (readable_article
                        .replace(u"\u2018", "'")
                        .replace(u"\u2019", "'")
                        .replace(u"\u201c", "\"")
                        .replace(u"\u201d", "\""))
    string_out = ("<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" "
                  "content=\"text/html;charset=utf-8\" /></head>")
    # Skip the leading '<html>' (6 characters) of the summary
    string_out += readable_article[6:]
    return string_out
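A possible driver for get_cleaned_html_from_url, assuming the same get_html helper it already depends on is available; the URL and output path are placeholders:

if __name__ == '__main__':
    cleaned = get_cleaned_html_from_url('https://example.com/article')
    with open('article.html', 'w') as fh:
        fh.write(cleaned)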
def write_readable_text_from_url(url, out_file):
    readable_article = Document(get_html(url)).summary()
    # Normalize curly quotes to straight ASCII quotes
    readable_article = (readable_article
                        .replace(u"\u2018", "'")
                        .replace(u"\u2019", "'")
                        .replace(u"\u201c", "\"")
                        .replace(u"\u201d", "\""))
    out_file.write("<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" "
                   "content=\"text/html;charset=utf-8\" /></head>")
    # Skip the leading '<html>' (6 characters) of the summary
    out_file.write(readable_article[6:])
def update(offset):
    offset = int(offset)
    if offset == 0:
        db.session.query(Entry).delete()
        db.session.commit()
        return ''

    # Obtain bearer token from Twitter
    url = "https://api.twitter.com/oauth2/token"
    consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
    consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET')
    auth = base64.b64encode(consumer_key + ':' + consumer_secret)
    request = urllib2.Request(url, "grant_type=client_credentials",
                              {"Authorization": "Basic " + auth})
    response = urllib2.urlopen(request).read()
    json_response = json.loads(response)
    access_token = json_response['access_token']

    # Obtain HN posts >100 pts
    url = ("https://api.twitter.com/1.1/statuses/user_timeline.json"
           "?screen_name=newsyc100&count=40")
    request = urllib2.Request(url,
                              headers={"Authorization": "Bearer " + access_token})
    response = urllib2.urlopen(request).read()
    tweets = json.loads(response)

    increment = 2
    start_at = (offset - 1) * increment
    tweets = tweets[start_at:start_at + increment]

    for tweet in tweets:
        # Tweet format: "<title> <article link> (<comments link>)"
        title = tweet['text']
        start_link = title.rfind("(http")
        end_link = title.find(")", start_link)
        comment_link = title[start_link + 1:end_link]
        title = title[0:start_link]
        start_link = title.rfind("http")
        end_link = title.find(" ", start_link)
        link = title[start_link:end_link]
        title = title[0:start_link]
        try:
            response = urllib2.urlopen(link)
        except urllib2.HTTPError:
            continue
        encoding = response.headers['content-type'].split('charset=')[-1]
        if encoding == 'text/html':
            encoding = 'utf-8'
        if encoding == 'application/pdf':
            continue
        html = response.read().decode(encoding, 'ignore')
        if 'readability.readability' in sys.modules:
            body = Document(html).summary()
        else:
            body = html
        body = body.replace('<html><body>',
                            '<html><body><a href="' + comment_link + '">HN Comments</a><br>')
        body = body.replace('<body id="readabilityBody">', '')
        entry = Entry(link, title, body)
        db.session.add(entry)
        db.session.commit()
    return ''
import requests
from readability.readability import Document

url_in = ""  # put url here

r = requests.get(url_in)
html = r.text

with open('out.html', 'w') as out_file:
    readable_article = Document(html).summary()
    # Normalize curly quotes to straight ASCII quotes
    readable_article = (readable_article
                        .replace(u"\u2018", "'")
                        .replace(u"\u2019", "'")
                        .replace(u"\u201c", "\"")
                        .replace(u"\u201d", "\""))
    out_file.write("<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" "
                   "content=\"text/html;charset=utf-8\" /></head>")
    # Skip the leading '<html>' (6 characters) of the summary
    out_file.write(readable_article[6:])