class ReadabilityToEpub: def __init__(self, parser_token=None): if not parser_token: raise Exception("Get a Readability parser token at: https://www.readability.com/developers/api") self.parser_client = ParserClient(token=parser_token) def convert_url(self, url): parser_resp = self.parser_client.get_article(url).json() epub_book = epub.EpubBook() epub_book.set_title(parser_resp["title"]) epub_book.add_author(parser_resp["author"]) content_html = epub.EpubHtml( title=parser_resp["title"], file_name="content.xhtml", content="<h1>{}</h1>\n{}".format(parser_resp["title"], parser_resp["content"]), ) epub_book.add_item(content_html) epub_book.add_item(epub.EpubNcx()) epub_book.add_item(epub.EpubNav()) # A spine determines the order in which content will be shown epub_book.spine = [content_html] epub.write_epub( "{}.epub".format(slugify(parser_resp["title"])), epub_book, dict(plugins=[DownloadImagesPlugin()]) )
class ReadabilityToEpub:
    def __init__(self, parser_token=None):
        if not parser_token:
            raise Exception(
                "Get a Readability parser token at: https://www.readability.com/developers/api"
            )
        self.parser_client = ParserClient(token=parser_token)

    def convert_url(self, url):
        parser_resp = self.parser_client.get_article(url).json()

        epub_book = epub.EpubBook()
        epub_book.set_title(parser_resp['title'])
        epub_book.add_author(parser_resp['author'])

        content_html = epub.EpubHtml(
            title=parser_resp['title'],
            file_name='content.xhtml',
            content="<h1>{}</h1>\n{}".format(parser_resp['title'],
                                             parser_resp['content']))
        epub_book.add_item(content_html)
        epub_book.add_item(epub.EpubNcx())
        epub_book.add_item(epub.EpubNav())

        # A spine determines the order in which content will be shown
        epub_book.spine = [content_html]

        epub.write_epub("{}.epub".format(slugify(parser_resp['title'])),
                        epub_book,
                        dict(plugins=[DownloadImagesPlugin()]))
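A minimal usage sketch for the class above, assuming `ebooklib` and its `DownloadImagesPlugin` are available; the token string is a placeholder, not a real credential.

# Hypothetical usage; "your-parser-token" is a placeholder value.
converter = ReadabilityToEpub(parser_token="your-parser-token")
converter.convert_url("http://paulgraham.com/altair.html")  # writes <slugified-title>.epub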
def generate_content(url,category): parser_client = ParserClient(token='7c8daeedd7726bf0c7d6b042098ee320ae336d87') parser_response = parser_client.get_article(str(url)) article = parser_response.json() str_article_title=article['title'] strarticle = article['content'] final_article = re.sub('<.*?>', '', strarticle) final_article2 = re.sub('&.*?;', '', final_article) line = re.sub('["]', '', final_article2) final_article3=line.encode('utf-8').strip() final_article3=os.linesep.join([s for s in final_article3.splitlines() if s]) final_article4=re.sub(' +',' ',final_article3) linet=re.sub('["]', '', str_article_title) final_article_title = linet.encode('utf-8').strip() intcategory=int(category) db = MySQLdb.connect("localhost", 'root', '', "inswipes") cursor = db.cursor() try: sql='INSERT INTO meta_content(article_content,link,main_category_id,article_title)VALUES("%s","%s","%d","%s")'%(final_article4,url,intcategory,final_article_title) cursor.execute(sql) db.commit() db.close() except: db.rollback() db.close() summarization()
def extracting_content(url):
    parser_client = ParserClient(token='#########################')  # token redacted in the original
    parser_response = parser_client.get_article(str(url))
    article = parser_response.json()
    str_article_title = article['title']
    strarticle = article['content']
    print(str_article_title)
    print(strarticle)
def get_page_metadata(url):
    token = os.environ.get('READABILITY_PARSER_KEY')
    if not token:
        return {}
    try:
        parser_client = ParserClient(token=token)
        return parser_client.get_article(url).json()
    except Exception:
        logger.exception('Readability parse failed for url %s', url)
        return {}
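A short usage sketch for the helper above; the key value is a placeholder, and the helper degrades to an empty dict when the key is unset or parsing fails.

os.environ.setdefault('READABILITY_PARSER_KEY', 'your-parser-key')  # placeholder, not a real key
metadata = get_page_metadata('https://en.wikipedia.org/wiki/Mark_Twain')
print(metadata.get('title', 'no metadata available'))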
def post(self, request, *args, **kwargs):
    form = LinkForm(request.POST)
    if form.is_valid():
        link = form.save(commit=False)
        link.group = Group.objects.get(pk=self.kwargs['group_id'])

        # extract data from readability
        parser_client = ParserClient(token=settings.READABILITY_TOKEN)
        parser_response = parser_client.get_article(link.url)
        article = parser_response.json()
        link.title = article.get('title', '')
        link.content = article.get('content', '')
        link.description = article.get('excerpt', '')
        link.save()

        tags = extract_tags(link.title + ' ' + link.content)
        link.tags.add(*tags)

    url = reverse('groups:list_links', kwargs={'group_id': self.kwargs['group_id']})
    return redirect(url)
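The method above presumably belongs to a Django class-based view; a hypothetical URL wiring sketch, in which AddLinkView, ListLinksView, and the pattern paths are assumed names that match the reverse('groups:list_links') call, not names taken from the original project.

# Hypothetical urls.py; view and pattern names are assumptions.
from django.urls import path
from .views import AddLinkView, ListLinksView

app_name = 'groups'
urlpatterns = [
    path('<int:group_id>/links/add/', AddLinkView.as_view(), name='add_link'),
    path('<int:group_id>/links/', ListLinksView.as_view(), name='list_links'),  # target of reverse()
]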
def get(self):
    client = ParserClient(token='64c0f2ae58811bc3d09104e8d22abb3e3b328971')
    feeds = RSSinfo.query()
    for feed in feeds:
        if feed.get_full_article:
            items = RSS.query(ancestor=feed.key)
            for item in items:
                if item.content == 'no content':
                    parser_response = client.get_article(url=item.link)
                    sleep(1)  # throttle requests to the parser API
                    article = parser_response.json()
                    item.content = article['content']
                    item.put()
def generate_content(url,category): parser_client = ParserClient(token='7c8daeedd7726bf0c7d6b042098ee320ae336d87') parser_response = parser_client.get_article(str(url)) article = parser_response.json() str_article_title=article['title'] strarticle = article['content'] final_article = re.sub('<.*?>', '', strarticle) final_article2 = re.sub('&.*?;', '', final_article) line = re.sub('["]', '', final_article2) final_article3=line.encode('utf-8').strip() final_article3=os.linesep.join([s for s in final_article3.splitlines() if s]) final_article4=re.sub(' +',' ',final_article3) linet=re.sub('["]', '', str_article_title) final_article_title = linet.encode('utf-8').strip() print(url) print(final_article4) insertion(category,url,final_article4)
class ParserClientTest(unittest.TestCase):
    """
    Test case for the Parser Client
    """
    def setUp(self):
        self.parser_token = required_from_env('READABILITY_PARSER_TOKEN')
        self.parser_client = ParserClient(token=self.parser_token)
        self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'

    def test_generate_url(self):
        """
        Test the client's ability to generate urls to endpoints.
        """
        # Test root resource
        expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('')
        expected_url = '{}?token={}'.format(expected_url, self.parser_token)
        generated_url = self.parser_client._generate_url('')
        self.assertEqual(generated_url, expected_url)

        # Test parser resource
        expected_url = '{base_url}?token={token}&url=http%3A%2F%2Fwww.google.biz%2Fblog.html'.format(
            base_url=DEFAULT_PARSER_URL_TEMPLATE.format('parser'),
            token=self.parser_token)
        params = {'url': 'http://www.google.biz/blog.html'}
        generated_url = self.parser_client._generate_url(
            'parser', query_params=params)
        self.assertEqual(generated_url, expected_url)

    def test_get_root(self):
        """
        Test the client's ability to hit the root endpoint.
        """
        response = self.parser_client.get_root()
        expected_keys = set(['resources', ])
        self.assertEqual(set(response.json().keys()), expected_keys)

    def test_get_confidence(self):
        """
        Test the client's ability to hit the confidence endpoint.
        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status_code, 400)

        expected_keys = set(['url', 'confidence'])
        response = self.parser_client.get_confidence(url=self.test_url)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(set(response.json().keys()), expected_keys)
        # confidence for wikipedia should be over .5
        self.assertTrue(response.json()['confidence'] >= .5)

    def test_get_article_status(self):
        """
        Test the client's ability to hit the parser endpoint with a HEAD
        request.
        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status_code, 400)

        response = self.parser_client.get_article_status(url=self.test_url)
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.headers.get('x-article-status') is not None)
        self.assertTrue(response.headers.get('x-article-id') is not None)

    def test_get_article(self):
        """
        Test the client's ability to hit the parser endpoint with a GET
        request.
        """
        # test with incorrect params
        response = self.parser_client.get_article()
        self.assertEqual(response.status_code, 400)

        response = self.parser_client.get_article(url=self.test_url)
        self.assertEqual(response.status_code, 200)

        some_expected_keys = set(['content', 'domain', 'author',
                                  'word_count', 'title', 'total_pages'])
        self.assertTrue(
            some_expected_keys.issubset(set(response.json().keys())))

    def test_post_article_content(self):
        """
        Test the client's ability to hit the parser endpoint with a POST
        request.
        """
        content = load_test_content('content/test_post_content.html')
        url = 'http://thisisaurlthatdoesntmatterbutmustbepassedanyway.com/article.html'
        response = self.parser_client.post_article_content(content, url)
        self.assertEqual(response.status_code, 200)
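The test case above relies on a `required_from_env` helper that is not shown here; a minimal sketch, under the assumption that it reads an environment variable and fails loudly when the variable is missing (the real project's version may differ).

import os

def required_from_env(key):
    # Hypothetical helper: fetch a required environment variable,
    # raising immediately if it is not set.
    value = os.environ.get(key)
    if not value:
        raise KeyError('Required environment variable {} is not set'.format(key))
    return value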
# print tag
import os

from readability import ParserClient

os.environ['READABILITY_PARSER_TOKEN'] = 'c4e591e3f00ed1512c8194ab6616cf826d155294'
token = "c4e591e3f00ed1512c8194ab6616cf826d155294"

client = ParserClient(token=token)
parser_client = ParserClient(token)  # the token can also be passed positionally

parser_response = client.get_article('http://paulgraham.com/altair.html')
article = parser_response.json()
print(article['title'])
print(article['content'])

parser_response = client.get_article(
    "http://www.politico.com/story/2016/03/rubio-wins-dc-caucuses-220681")
article = parser_response.json()
print(article['title'])
print(article['content'])

parser_response = client.get_article(
    "http://www.vox.com/2016/4/6/11376948/babymetal-us-debut-colbert")
article = parser_response.json()
print(article['title'])
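Because the snippet sets READABILITY_PARSER_TOKEN in the environment, the client can also be constructed without an explicit token, which is how the SavedRetriever script below uses it. A minimal sketch; the token value is a placeholder.

import os
from readability import ParserClient

os.environ['READABILITY_PARSER_TOKEN'] = 'your-parser-token'  # placeholder value
client = ParserClient()  # picks the token up from READABILITY_PARSER_TOKEN
response = client.get_article('http://paulgraham.com/altair.html')
print(response.json()['title'])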
def main():
    if not os.path.isfile('credentials.config'):
        # if credentials file does not exist, start the first run function
        first_run()  # Authenticate and generate the credentials file.

    # command line switches function
    args = read_command_args()
    use_evernote = args.e
    debug_mode = args.debug
    delete_files = args.t if use_evernote is True else False
    path = args.p
    info_mode = args.i

    if debug_mode:
        logger = create_logger(log_to_console=True)
        logger.setLevel(logging.DEBUG)
        logger.info('Warning - Debug mode active. Files will be downloaded, but not added to index')
    elif info_mode:
        warnings.warn("Suppressed Resource warning", ResourceWarning)  # suppresses all unclosed socket warnings.
        logger = create_logger(log_to_console=True)
    else:
        warnings.warn("Suppressed Resource warning", ResourceWarning)  # suppresses all unclosed socket warnings.
        logger = create_logger()

    logger.info("\n###########\nStarting SR\n###########")

    try:
        with open('credentials.config', 'r') as json_file:
            credentials = json.load(json_file)  # get various OAuth tokens
    except OSError:
        logger.error('Unable to open credentials file')
        raise SystemExit

    # Create the downloads folder on the specified path, or in the dir where file is stored.
    if path != "":
        path = path[0]
    else:
        path = os.getcwd()
    path += "/SRDownloads"

    if not os.path.exists(path):
        os.makedirs(path)

    # Authenticate with Reddit
    logger.info('Authenticating with Reddit')
    client_id = credentials['reddit']['client_id']
    client_secret = credentials['reddit']['client_secret']
    redirect_uri = credentials['reddit']['redirect_uri']
    refresh_token = credentials['reddit']['refresh_token']
    user_agent = "SavedRetriever 0.9 by /u/fuzzycut"

    try:
        r = praw.Reddit(user_agent=user_agent,
                        oauth_client_id=client_id,
                        oauth_client_secret=client_secret,
                        oauth_redirect_uri=redirect_uri)
        access_information = r.refresh_access_information(refresh_token)
        r.set_access_credentials(**access_information)
    except Exception as e:
        logger.error(e)
        raise SystemExit

    time_since_accesstoken = time.time()

    index = set()
    if os.path.isfile('index.txt'):  # checking for index file, which contains index of downloaded files.
        try:
            with open('index.txt', 'r') as ind:
                for line in ind:
                    index.add(line[:-1])  # -1 truncates the newline in the index file.
        except OSError:
            logger.error("Unable to open index file for reading")
            raise SystemExit

    if use_evernote is True:
        enclient = evernoteWrapper.Client(credentials['evernote']['dev_token'], 'Saved from Reddit')

    html_index_file = None
    if delete_files is False:  # only create index if we're going to use it.
        html_index_file = html_index.index(r.get_me().name, path)

    try:
        ind = open('index.txt', 'a')  # open index file for appending
    except OSError:
        logger.error("Unable to open index file for writing")
        raise SystemExit

    logger.info("Beginning to save files...")
    for i in r.get_me().get_saved(limit=None):
        if (time.time() - time_since_accesstoken) / 60 > 55:  # Refresh the access token before it runs out.
            logger.debug('Refreshing Reddit token')
            r.refresh_access_information(access_information['refresh_token'])
            time_since_accesstoken = time.time()

        name = i.name
        file_name = name  # to stop ide complaining.
        note = None
        evernote_tags = ('Reddit', 'SavedRetriever', '/r/' + i.subreddit.display_name)  # add config for this later

        if name not in index:  # file has not been downloaded
            permalink = i.permalink
            author = i.author
            title = i.link_title if hasattr(i, 'link_title') else i.title

            # ========== #
            # IS COMMENT #
            # ========== #
            if hasattr(i, 'body_html'):
                logger.debug("{} is comment".format(name))
                body = i.body_html

                # html output
                body = subreddit_linker(body)
                output = html_output_string(permalink, author, body, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)

                # en api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_html(output)
                    enclient.add_tag(*evernote_tags)  # the * is very important. It unpacks the tags tuple properly
                    note = enclient.create_note()

            # ============ #
            # IS SELF-POST #
            # ============ #
            elif hasattr(i, 'is_self') and i.is_self is True:
                logger.debug('{} is self-post'.format(name))
                text = i.selftext_html if i.selftext_html is not None else ""

                # html output
                text = subreddit_linker(text)
                output = html_output_string(permalink, author, text, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)

                # en api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    enclient.add_html(output)
                    note = enclient.create_note()

            # ====================== #
            # IS DIRECT LINKED IMAGE #
            # ====================== #
            elif hasattr(i, 'url') and re.sub(r"([^A-z0-9])\w+", "", i.url.split('.')[-1]) in ['jpg', 'png', 'gif', 'gifv', 'pdf']:
                """
                Need to check file types and test pdf. How does this handle gfycat and webm?
                Can EN display that inline?
                The regex in the if is to strip out non-valid filetype chars.
                """
                logger.debug('{} is direct linked image'.format(name))
                url = i.url
                base_filename = "{}_image.{}".format(
                    name, re.sub(r"([^A-z0-9])\w+", "", url.split('.')[-1]))  # filename for image. regex same as above.
                filename = path + "/" + base_filename

                # image downloader section
                if os.path.exists(filename) and (os.path.getsize(filename) > 0):  # If image exists and is valid
                    image_downloaded = True
                    logger.info("Image already exists - {}".format(base_filename))
                else:
                    image_downloaded = image_saver(url, filename)
                    logger.info('Downloaded image - {}'.format(base_filename))

                if image_downloaded:
                    # write image as <img> or link to local pdf downloaded in html file
                    if filename.split('.')[-1] == 'pdf':
                        img = '<a href="{}">Click here for link to downloaded pdf</a>'.format(base_filename)
                    else:
                        img = '<br><a href="{0}"><img src="{0}"></a>'.format(base_filename)  # html for embedding in html file
                else:
                    img = "Image failed to download - It may be temporarily or permanently unavailable"

                # Evernote api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    enclient.add_html(html_output_string_image(permalink, author, "", title))  # should add body="" in the function
                    if image_downloaded:
                        enclient.add_resource(filename)
                    note = enclient.create_note()

                if delete_files is False:
                    file_name = html_writer(path, name, html_output_string_image(permalink, author, img, title))
                else:
                    os.remove(filename)

            # ============== #
            # IS IMGUR ALBUM #
            # ============== #
            elif hasattr(i, 'url') and 'imgur' in i.url:  # Add option to download images to folder.
                logger.debug('{} is Imgur album'.format(name))
                url = i.url
                body = "<h2>{}</h2>".format(title)

                # imgur api section
                client = ImgurClient(credentials['imgur']['client_id'], credentials['imgur']['client_secret'])
                pattern = r'\/([A-z0-9]{5,7})'  # matches any 5-7 long word that comes after a forward slash (/).
                match = re.findall(pattern, url)
                gallery_id = match[-1].replace('/', '')  # removes any forward slashes for processing
                gallery = []
                filename = None
                try:
                    gallery = client.get_album_images(gallery_id)
                except imgurpython.helpers.error.ImgurClientError:  # if 'gallery' is actually just a lone image
                    try:
                        gallery = [client.get_image(gallery_id)]
                    except imgurpython.helpers.error.ImgurClientError as error:  # if gallery does not exist. Is this the best way to do this?
                        if debug_mode is True or error.status_code != 404:
                            print("**{} - {}**".format(error.status_code, error.error_message))

                img_path = path + "/" + gallery_id
                if not os.path.exists(img_path):
                    os.makedirs(img_path)

                for image in gallery:  # add if gallery > 10, then just add a link (would be too large for the note)
                    image_name = image.title if image.title is not None else ""
                    image_description = image.description if image.description is not None else ""
                    image_filetype = image.type.split('/')[1]
                    image_id = image.id
                    image_link = image.link

                    # sets up downloaded filename and html for embedding image
                    base_filename = "{}_image.{}".format(image_id, image_filetype)
                    img = '<p><h3>{0}</h3><a href="{1}/{2}"><img src="{1}/{2}"></a><br/>{3}</p>'.format(
                        image_name, gallery_id, base_filename, image_description)
                    filename = img_path + "/" + base_filename
                    if os.path.exists(filename) and (os.path.getsize(filename) > 0):  # only download if file doesn't already exist
                        logger.info('Image already exists - {}'.format(base_filename))
                    else:
                        image_saver(image_link, filename)
                        logger.info('Image downloaded - {}'.format(base_filename))
                    body += img

                # Evernote api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    if len(gallery) == 1 and filename is not None:
                        enclient.add_html(html_output_string_image(permalink, author, "", title))
                        enclient.add_resource(filename)
                    else:
                        enclient.add_html(html_output_string_image(
                            permalink, author,
                            'This album is too large to embed; please see <a href="{}">here</a> for the original link.'.format(url),
                            title))
                    note = enclient.create_note()

                if delete_files is False:
                    file_name = html_writer(path, name, html_output_string_image(permalink, author, body, title))
                else:
                    shutil.rmtree(img_path)

            # ========== #
            # IS ARTICLE #
            # ========== #
            elif hasattr(i, 'title') and i.is_self is False:
                # This section needs work. It is semi-complete. Ultimately, adding in the full article is the goal.
                logger.debug('{} is article/webpage'.format(name))
                url = i.url

                # readability api section
                os.environ["READABILITY_PARSER_TOKEN"] = credentials['readability']['parser_key']  # set the environment variable as the parser key
                logger.info('Initializing Readability Client')
                parse = ParserClient()  # readability api doesn't take the token directly
                parse_response = parse.get_article(url)
                article = parse_response.json()
                if 'content' not in article:  # if unable to parse document, manually set an error message
                    article['content'] = 'Unable to parse page - See <a href="{}">here</a> for the original link'.format(url)
                article = article['content']
                article = "<a href='{}'>{}</a><br/>{}<br/>".format(url, title, article)  # source of article

                # html output section.
                output = html_output_string(permalink, author, article, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)

                # Evernote section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    output = html_output_string(permalink, author, article, title)
                    enclient.add_html(output)  # Add html file to note
                    note = enclient.create_note()

            # end of checking for saved items
            failed_upload = False
            if use_evernote is True:
                if note is not None:
                    logger.info('Saved {:9} - GUID: {}'.format(name, note.guid))
                else:  # Upload failed
                    logger.info('Saved {:9} - Note failed to upload'.format(name))
                    failed_upload = True
            elif use_evernote is False:
                logger.info('Saved ' + name)

            if not debug_mode and not failed_upload:
                ind.write(name + "\n")
                ind.flush()  # this fixes python not writing the file if it terminates before .close() can be called
                if delete_files is False:
                    html_index_file.add_link(title, file_name, permalink)

    # end of for loop
    ind.close()
    logger.info("All items downloaded")
    if delete_files is False:
        html_index_file.save_and_close()
    else:
        # try to remove downloads if -t is set, but don't force it if directory has things in it already.
        try:
            os.rmdir('Downloads')
        except OSError:
            logger.error("Unable to remove files")