Example #1
# Imports assumed by this snippet; slugify is presumably python-slugify, and
# DownloadImagesPlugin is defined elsewhere in the original project.
from ebooklib import epub
from readability import ParserClient
from slugify import slugify

class ReadabilityToEpub:
    def __init__(self, parser_token=None):
        if not parser_token:
            raise Exception("Get a Readability parser token at: https://www.readability.com/developers/api")
        self.parser_client = ParserClient(token=parser_token)

    def convert_url(self, url):
        parser_resp = self.parser_client.get_article(url).json()

        epub_book = epub.EpubBook()
        epub_book.set_title(parser_resp["title"])
        epub_book.add_author(parser_resp["author"])

        content_html = epub.EpubHtml(
            title=parser_resp["title"],
            file_name="content.xhtml",
            content="<h1>{}</h1>\n{}".format(parser_resp["title"], parser_resp["content"]),
        )

        epub_book.add_item(content_html)
        epub_book.add_item(epub.EpubNcx())
        epub_book.add_item(epub.EpubNav())
        # A spine determines the order in which content will be shown
        epub_book.spine = [content_html]

        epub.write_epub(
            "{}.epub".format(slugify(parser_resp["title"])), epub_book, dict(plugins=[DownloadImagesPlugin()])
        )
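A minimal usage sketch for the class above; the token string is a placeholder, and the output file name comes from the slugified article title:

converter = ReadabilityToEpub(parser_token="your-parser-token")  # placeholder token
converter.convert_url("http://paulgraham.com/altair.html")  # writes <slugified-title>.epub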
Example #2
class ReadabilityToEpub:
    def __init__(self, parser_token=None):
        if not parser_token:
            raise Exception(
                "Get a Readability parser token at: https://www.readability.com/developers/api"
            )
        self.parser_client = ParserClient(token=parser_token)

    def convert_url(self, url):
        parser_resp = self.parser_client.get_article(url).json()

        epub_book = epub.EpubBook()
        epub_book.set_title(parser_resp['title'])
        epub_book.add_author(parser_resp['author'])

        content_html = epub.EpubHtml(title=parser_resp['title'],
                                     file_name='content.xhtml',
                                     content="<h1>{}</h1>\n{}".format(
                                         parser_resp['title'],
                                         parser_resp['content']))

        epub_book.add_item(content_html)
        epub_book.add_item(epub.EpubNcx())
        epub_book.add_item(epub.EpubNav())
        # A spine determines the order in which content will be shown
        epub_book.spine = [content_html]

        epub.write_epub("{}.epub".format(slugify(parser_resp['title'])),
                        epub_book, dict(plugins=[DownloadImagesPlugin()]))
Example #3
def generate_content(url, category):
    # NOTE: hardcoding tokens in source is bad practice; shown here as in the original.
    parser_client = ParserClient(token='7c8daeedd7726bf0c7d6b042098ee320ae336d87')
    parser_response = parser_client.get_article(str(url))
    article = parser_response.json()
    str_article_title = article['title']
    strarticle = article['content']
    # Crude cleanup: strip tags, entity references, and double quotes with regexes.
    final_article = re.sub('<.*?>', '', strarticle)
    final_article2 = re.sub('&.*?;', '', final_article)

    line = re.sub('["]', '', final_article2)
    # The original called .encode('utf-8') here, which yields bytes and breaks
    # the str operations below under Python 3; the text is kept as str instead.
    final_article3 = line.strip()
    final_article3 = os.linesep.join([s for s in final_article3.splitlines() if s])
    final_article4 = re.sub(' +', ' ', final_article3)
    linet = re.sub('["]', '', str_article_title)
    final_article_title = linet.strip()

    intcategory = int(category)
    db = MySQLdb.connect("localhost", 'root', '', "inswipes")
    cursor = db.cursor()
    try:
        # Parameterized query avoids SQL injection and the quoting bugs of the
        # original string-formatted INSERT (which wrapped %d in quotes).
        sql = ("INSERT INTO meta_content (article_content, link, main_category_id, article_title) "
               "VALUES (%s, %s, %s, %s)")
        cursor.execute(sql, (final_article4, url, intcategory, final_article_title))
        db.commit()
    except MySQLdb.Error:
        db.rollback()
    finally:
        db.close()

    summarization()
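The regex-based tag stripping above is brittle against malformed markup and nested angle brackets. A sketch of the same cleanup with the standard library's html.parser, assuming plain text is the goal:

from html.parser import HTMLParser

class TextExtractor(HTMLParser):
    """Collects the text nodes of an HTML document, decoding entities."""
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.chunks = []

    def handle_data(self, data):
        self.chunks.append(data)

    def text(self):
        # Collapse runs of whitespace, mirroring the re.sub(' +', ' ') step above.
        return ' '.join(' '.join(self.chunks).split())

extractor = TextExtractor()
extractor.feed(article['content'])  # 'article' as returned by get_article(...).json()
clean_text = extractor.text()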
Example #4
def extracting_content(url):
    parser_client = ParserClient(token='#########################')
    parser_response = parser_client.get_article(str(url))
    article = parser_response.json()
    str_article_title = article['title']
    strarticle = article['content']

    print(str_article_title)
    print(strarticle)
Example #5
def get_page_metadata(url):
    token = os.environ.get('READABILITY_PARSER_KEY')
    if not token:
        return {}
    try:
        parser_client = ParserClient(token=token)
        return parser_client.get_article(url).json()
    except Exception:
        logger.exception('Readability parse failed for url %s', url)
        return {}
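A usage sketch; the token value is a placeholder and would normally be exported outside the code rather than set inline:

os.environ['READABILITY_PARSER_KEY'] = 'your-parser-token'  # placeholder
metadata = get_page_metadata('https://en.wikipedia.org/wiki/Mark_Twain')
print(metadata.get('title', 'no title'))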
Example #6
 def post(self, request, *args, **kwargs):
     form = LinkForm(request.POST)
     if form.is_valid():
         link = form.save(commit=False)
         link.group = Group.objects.get(pk=self.kwargs['group_id'])
         # extract data from readability
         parser_client = ParserClient(token=settings.READABILITY_TOKEN)
         parser_response = parser_client.get_article(link.url)
         article = parser_response.json()
         link.title = article.get('title', '')
         link.content = article.get('content', '')
         link.description = article.get('excerpt', '')
         link.save()
         tags = extract_tags(link.title + ' ' + link.content)
         link.tags.add(*tags)
     url = reverse('groups:list_links', kwargs={'group_id': self.kwargs['group_id']})
     return redirect(url)
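This view assumes a READABILITY_TOKEN entry in Django settings; a minimal sketch of that entry, reading the value from the environment instead of hardcoding it:

# settings.py (sketch)
import os

READABILITY_TOKEN = os.environ.get('READABILITY_TOKEN', '')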
Example #7
 def get(self):
     client = ParserClient(token='64c0f2ae58811bc3d09104e8d22abb3e3b328971')
     feeds = RSSinfo.query()
     for feed in feeds:
         if not feed.get_full_article:
             continue
         items = RSS.query(ancestor=feed.key)
         for item in items:
             if item.content != 'no content':
                 continue
             parser_response = client.get_article(url=item.link)
             sleep(1)  # throttle calls to the Parser API
             article = parser_response.json()
             item.content = article['content']
             item.put()
Example #8
def generate_content(url, category):
    parser_client = ParserClient(token='7c8daeedd7726bf0c7d6b042098ee320ae336d87')
    parser_response = parser_client.get_article(str(url))
    article = parser_response.json()
    str_article_title = article['title']
    strarticle = article['content']
    # Same crude regex cleanup as in Example #3.
    final_article = re.sub('<.*?>', '', strarticle)
    final_article2 = re.sub('&.*?;', '', final_article)

    line = re.sub('["]', '', final_article2)
    # As in Example #3, the values are kept as str (not encoded to bytes) so
    # the Python 3 string operations below work.
    final_article3 = line.strip()
    final_article3 = os.linesep.join([s for s in final_article3.splitlines() if s])
    final_article4 = re.sub(' +', ' ', final_article3)
    linet = re.sub('["]', '', str_article_title)
    final_article_title = linet.strip()
    print(url)
    print(final_article4)
    insertion(category, url, final_article4)
Example #9
import os

from readability import ParserClient

# The token can be supplied via the READABILITY_PARSER_TOKEN environment
# variable or passed to ParserClient directly; this script does both.
token = "c4e591e3f00ed1512c8194ab6616cf826d155294"
os.environ['READABILITY_PARSER_TOKEN'] = token

client = ParserClient(token=token)

parser_response = client.get_article('http://paulgraham.com/altair.html')
article = parser_response.json()

print(article['title'])
print(article['content'])


parser_response = client.get_article("http://www.politico.com/story/2016/03/rubio-wins-dc-caucuses-220681")
article = parser_response.json()
print(article['title'])
print(article['content'])


parser_response = client.get_article("http://www.vox.com/2016/4/6/11376948/babymetal-us-debut-colbert")
article = parser_response.json()
Example #10
class ParserClientTest(unittest.TestCase):
    """
    Test case for the Parser Client
    """
    def setUp(self):
        self.parser_token = required_from_env('READABILITY_PARSER_TOKEN')
        self.parser_client = ParserClient(token=self.parser_token)
        self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'

    def test_generate_url(self):
        """
        Test the client's ability to generate URLs to endpoints.
        """
        # Test root resource
        expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('')
        expected_url = '{}?token={}'.format(expected_url, self.parser_token)
        generated_url = self.parser_client._generate_url('')
        self.assertEqual(generated_url, expected_url)

        # Test parser resource
        expected_url = '{base_url}?token={token}&url=http%3A%2F%2Fwww.google.biz%2Fblog.html'.format(
            base_url=DEFAULT_PARSER_URL_TEMPLATE.format('parser'),
            token=self.parser_token)
        params = {'url': 'http://www.google.biz/blog.html'}
        generated_url = self.parser_client._generate_url(
            'parser', query_params=params)

        self.assertEqual(generated_url, expected_url)

    def test_get_root(self):
        """
        Test the client's ability to hit the root endpoint.
        """
        response = self.parser_client.get_root()

        expected_keys = set(['resources', ])
        self.assertEqual(set(response.json().keys()), expected_keys)

    def test_get_confidence(self):
        """
        Test the client's ability to hit the confidence endpoint.
        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status_code, 400)

        expected_keys = set(['url', 'confidence'])

        response = self.parser_client.get_confidence(url=self.test_url)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(set(response.json().keys()), expected_keys)
        # confidence for wikipedia should be over .5
        self.assertTrue(response.json()['confidence'] >= .5)

    def test_get_article_status(self):
        """
        Test the client's ability to hit the parser endpoint with a HEAD
        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status_code, 400)

        response = self.parser_client.get_article_status(url=self.test_url)
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.headers.get('x-article-status') is not None)
        self.assertTrue(response.headers.get('x-article-id') is not None)

    def test_get_article(self):
        """
        Test the client's ability to hit the parser endpoint with a GET
        """
        # test with incorrect params
        response = self.parser_client.get_article()
        self.assertEqual(response.status_code, 400)

        response = self.parser_client.get_article(url=self.test_url)
        self.assertEqual(response.status_code, 200)

        some_expected_keys = set(['content', 'domain', 'author', 'word_count',
                                  'title', 'total_pages'])
        self.assertTrue(
            some_expected_keys.issubset(set(response.json().keys())))

    def test_post_article_content(self):
        """
        Test the client's ability to hit the parser endpoint with a POST
        request.
        """
        content = load_test_content('content/test_post_content.html')
        url = 'http://thisisaurlthatdoesntmatterbutmustbepassedanyway.com/article.html'
        response = self.parser_client.post_article_content(content, url)
        self.assertEqual(response.status_code, 200)
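The suite assumes READABILITY_PARSER_TOKEN is exported in the environment, since setUp() reads it through required_from_env(). A sketch of a standard entry point for running the module directly:

if __name__ == '__main__':
    unittest.main()  # requires READABILITY_PARSER_TOKEN in the environment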
Example #11

import os

from readability import ParserClient

# As in Example #9, the token can go in the environment variable or be
# passed to ParserClient directly.
token = "c4e591e3f00ed1512c8194ab6616cf826d155294"
os.environ['READABILITY_PARSER_TOKEN'] = token

client = ParserClient(token=token)

parser_response = client.get_article('http://paulgraham.com/altair.html')
article = parser_response.json()

print(article['title'])
print(article['content'])

parser_response = client.get_article(
    "http://www.politico.com/story/2016/03/rubio-wins-dc-caucuses-220681")
article = parser_response.json()
print(article['title'])
print(article['content'])

parser_response = client.get_article(
    "http://www.vox.com/2016/4/6/11376948/babymetal-us-debut-colbert")
article = parser_response.json()
print(article['title'])
Example #12
def main():
    if not os.path.isfile('credentials.config'):  # if credentials file does not exist, start the first run function
        first_run()  # Authenticate and generate the credentials file.

    # command line switches function
    args = read_command_args()
    use_evernote = args.e
    debug_mode = args.debug
    delete_files = args.t if use_evernote is True else False
    path = args.p
    info_mode = args.i

    if debug_mode:
        # print("Warning - Debug mode active. Files will be downloaded, but not added to index")
        logger = create_logger(log_to_console=True)
        logger.setLevel(logging.DEBUG)
        logger.info('Warning - Debug mode active. Files will be downloaded, but not added to index')
    elif info_mode:
        warnings.warn("Suppressed Resource warning", ResourceWarning)  # suppresses all unclosed socket warnings.
        logger = create_logger(log_to_console=True)
    else:
        warnings.warn("Suppressed Resource warning", ResourceWarning)  # suppresses all unclosed socket warnings.
        logger = create_logger()

    logger.info("\n###########\nStarting SR\n###########")

    try:
        with open('credentials.config', 'r') as json_file:
            credentials = json.load(json_file)  # get various OAuth tokens
    except OSError:
        logger.error('Unable to open credentials file')
        raise SystemExit

    # Create the downloads folder on the specified path, or in the dir where file is stored.
    if path != "":
        path = path[0]
    else:
        path = os.getcwd()
    path += "/SRDownloads"

    if not os.path.exists(path):
        os.makedirs(path)

    # Authenticate with Reddit
    logger.info('Authenticating with Reddit')
    client_id = credentials['reddit']['client_id']
    client_secret = credentials['reddit']['client_secret']
    redirect_uri = credentials['reddit']['redirect_uri']
    refresh_token = credentials['reddit']['refresh_token']
    user_agent = "SavedRetriever 0.9 by /u/fuzzycut"

    try:
        r = praw.Reddit(user_agent=user_agent,
                        oauth_client_id=client_id,
                        oauth_client_secret=client_secret,
                        oauth_redirect_uri=redirect_uri)

        access_information = r.refresh_access_information(refresh_token)
        r.set_access_credentials(**access_information)
    except Exception as e:
        logger.error(e)
        raise SystemExit
    time_since_accesstoken = time.time()

    index = set()
    if os.path.isfile('index.txt'):  # check for the index file, which lists already-downloaded items.
        try:
            with open('index.txt', 'r') as ind:
                for line in ind:
                    index.add(line[:-1])  # -1 truncates the newline in the index file.
        except OSError:
            logger.error("Unable to open index file for reading")
            raise SystemExit

    if use_evernote is True:
        enclient = evernoteWrapper.Client(credentials['evernote']['dev_token'], 'Saved from Reddit')

    html_index_file = None
    if delete_files is False:  # only create index if we're going to use it.
        html_index_file = html_index.index(r.get_me().name, path)

    try:
        ind = open('index.txt', 'a')  # open index file for appending
    except OSError:
        logger.error("Unable to open index file for writing")
        raise SystemExit

    logger.info("Beginning to save files...")
    for i in r.get_me().get_saved(limit=None):
        if (time.time() - time_since_accesstoken) / 60 > 55:  # Refresh the access token before it runs out.
            logger.debug('Refreshing Reddit token')
            r.refresh_access_information(access_information['refresh_token'])
            time_since_accesstoken = time.time()

        name = i.name
        file_name = name  # to stop ide complaining.
        note = None
        evernote_tags = ('Reddit', 'SavedRetriever', '/r/' + i.subreddit.display_name)  # add config for this later

        # logger.info('Saving post - {}'.format(name))

        if name not in index:  # file has not been downloaded
            permalink = i.permalink
            author = i.author
            title = i.link_title if hasattr(i, 'link_title') else i.title
            # ========== #
            # IS COMMENT #
            # ========== #
            if hasattr(i, 'body_html'):
                logger.debug("{} is comment".format(name))
                body = i.body_html

                # html output
                body = subreddit_linker(body)
                output = html_output_string(permalink, author, body, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)

                # en api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_html(output)
                    enclient.add_tag(*evernote_tags)  # the * is very important. It unpacks the tags tuple properly
                    note = enclient.create_note()
            # ============ #
            # IS SELF-POST #
            # ============ #
            elif hasattr(i, 'is_self') and i.is_self is True:
                logger.debug('{} is self-post'.format(name))
                text = i.selftext_html if i.selftext_html is not None else ""

                # html output
                text = subreddit_linker(text)
                output = html_output_string(permalink, author, text, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)

                # en api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    enclient.add_html(output)
                    note = enclient.create_note()
            # ====================== #
            # IS DIRECT LINKED IMAGE #
            # ====================== #
            elif hasattr(i, 'url') and re.sub(r"([^A-Za-z0-9])\w+", "", i.url.split('.')[-1]) in ['jpg', 'png', 'gif', 'gifv', 'pdf']:
                """
                Need to check file types and test pdf. How does this handle gfycat and webm? Can EN display that inline?
                The regex in the if is to strip out non-valid filetype chars.
                """
                logger.debug('{} is direct linked image'.format(name))
                url = i.url
                base_filename = "{}_image.{}".format(name, re.sub(r"([^A-Za-z0-9])\w+", "", url.split('.')[
                    -1]))  # filename for image. regex same as above.
                filename = path + "/" + base_filename

                # image downloader section
                if os.path.exists(filename) and (os.path.getsize(filename) > 0):  # If image exists and is valid
                    image_downloaded = True
                    logger.info("Image already exists - {}".format(base_filename))
                else:
                    image_downloaded = image_saver(url, filename)
                    logger.info('Downloaded image - {}'.format(base_filename))

                if image_downloaded:
                    # write image as <img> or link to local pdf downloaded in html file
                    if filename.split('.')[-1] == 'pdf':
                        img = '<a href="{}">Click here for link to downloaded pdf</a>'.format(base_filename)
                    else:
                        img = '<br><a href="{0}"><img src="{0}"></a>'.format(
                            base_filename)  # html for embedding in html file
                else:
                    img = "Image failed to download - It may be temporarily or permanently unavailable"

                # Evernote api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    enclient.add_html(html_output_string_image(permalink, author, "", title))  # should add body="" in the function
                    if image_downloaded:
                        enclient.add_resource(filename)
                    note = enclient.create_note()

                if delete_files is False:
                    file_name = html_writer(path, name, html_output_string_image(permalink, author, img, title))
                else:
                    os.remove(filename)
            # ============== #
            # IS IMGUR ALBUM #
            # ============== #
            elif hasattr(i, 'url') and 'imgur' in i.url:  # Add option to download images to folder.
                logger.debug('{} is Imgur album'.format(name))
                url = i.url
                body = "<h2>{}</h2>".format(title)

                # imgur api section
                client = ImgurClient(credentials['imgur']['client_id'], credentials['imgur']['client_secret'])
                pattern = r'/([A-Za-z0-9]{5,7})'  # matches any 5-7 character run that comes after a forward slash (/).
                match = re.findall(pattern, url)
                gallery_id = match[-1].replace('/', '')  # removes any forward slashes for processing
                gallery = []
                filename = None
                try:
                    gallery = client.get_album_images(gallery_id)
                except imgurpython.helpers.error.ImgurClientError:  # if 'gallery' is actually just a lone image
                    try:
                        gallery = [client.get_image(gallery_id)]
                    except imgurpython.helpers.error.ImgurClientError as error:  # if gallery does not exist. Is this the best way to do this?
                        if debug_mode is True or error.status_code != 404:
                            print("**{} - {}**".format(error.status_code, error.error_message))

                # img_path = 'Downloads/{}'.format(gallery_id)
                img_path = path + "/" + gallery_id
                if not os.path.exists(img_path):
                    os.makedirs(img_path)
                for image in gallery:  # add if gallery > 10, then just add a link (would be too large for the note)
                    image_name = image.title if image.title is not None else ""
                    image_description = image.description if image.description is not None else ""
                    image_filetype = image.type.split('/')[1]
                    image_id = image.id
                    image_link = image.link
                    # sets up downloaded filename and html for embedding image
                    base_filename = "{}_image.{}".format(image_id, image_filetype)
                    img = '<p><h3>{0}</h3><a href="{1}/{2}"><img src="{1}/{2}"></a><br/>{3}</p>'.format(image_name,
                                                                                                        gallery_id,
                                                                                                        base_filename,
                                                                                                        image_description)
                    filename = img_path + "/" + base_filename
                    if os.path.exists(filename) and (os.path.getsize(filename) > 0):  # only download if file doesn't already exist
                        logger.info('Image already exists - {}'.format(base_filename))
                    else:
                        image_saver(image_link, filename)
                        logger.info('Image downloaded - {}'.format(base_filename))
                    body += img

                # Evernote api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    if len(gallery) == 1 and filename is not None:
                        enclient.add_html(html_output_string_image(permalink, author, "", title))
                        enclient.add_resource(filename)
                    else:
                        enclient.add_html(html_output_string_image(
                            permalink, author,
                            'This album is too large to embed; please see <a href="{}">here</a> for the original link.'.format(url),
                            title))
                    note = enclient.create_note()

                if delete_files is False:
                    file_name = html_writer(path, name, html_output_string_image(permalink, author, body, title))
                else:
                    shutil.rmtree(img_path)
            # ========== #
            # IS ARTICLE #
            # ========== #
            elif hasattr(i, 'title') and i.is_self is False:
                # This section needs work. It is semi-complete. Ultimately, adding in the full article is the goal.
                logger.debug('{} is article/webpage'.format(name))
                url = i.url

                # readability api section
                os.environ["READABILITY_PARSER_TOKEN"] = credentials['readability'][
                    'parser_key']  # set the environment variable as the parser key
                logger.info('Initializing Readability Client')
                parse = ParserClient()  # readability api doesn't take the token directly
                parse_response = parse.get_article(url)
                article = parse_response.json()
                if 'content' not in article:  # if unable to parse document, manually set an error message
                    article['content'] = 'Unable to parse page - See <a href="{}">here</a> for the original link'.format(url)
                article = article['content']
                article = "<a href='{}'>{}</a><br/>{}<br/>".format(url, title, article)  # source of article

                # html output section.
                output = html_output_string(permalink, author, article, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)

                # Evernote section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    output = html_output_string(permalink, author, article, title)
                    enclient.add_html(output)

                    # Add html file to note
                    # enclient.add_resource("Downloads/{}.html".format(name))
                    note = enclient.create_note()

            # end of checking for saved items #
            failed_upload = False
            if use_evernote is True:
                if note is not None:
                    # print("Saved {:9} - GUID: {}".format(name, note.guid))
                    logger.info('Saved {:9} - GUID: {}'.format(name, note.guid))
                else:  # Upload failed
                    # print("Saved {:9} - Note failed to upload".format(name))
                    logger.info('Saved {:9} - Note failed to upload'.format(name))
                    failed_upload = True
            elif use_evernote is False:
                # print("Saved " + name)
                logger.info('Saved ' + name)
            if not debug_mode and not failed_upload:
                ind.write(name + "\n")
                ind.flush()  # this fixes python not writing the file if it terminates before .close() can be called
                if delete_files is False:
                    html_index_file.add_link(title, file_name, permalink)

    # end of for loop
    ind.close()
    logger.info("All items downloaded")
    if delete_files is False:
        html_index_file.save_and_close()
    else:  # try to remove the downloads folder if -t is set, but don't force it if it already has other things in it.
        try:
            os.rmdir(path)  # 'path' is the SRDownloads folder created above; rmdir only removes it if empty
        except OSError:
            logger.error("Unable to remove files")
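read_command_args() is defined elsewhere in the project. Based on the attributes used above (args.e, args.debug, args.t, args.p, args.i), a plausible reconstruction looks like the sketch below; the help text and defaults are assumptions.

import argparse

def read_command_args():
    # Sketch only: flag names are inferred from the attribute accesses in main().
    parser = argparse.ArgumentParser(description='SavedRetriever')
    parser.add_argument('-e', action='store_true', help='upload saved items to Evernote')
    parser.add_argument('--debug', action='store_true', help='download items but do not update the index')
    parser.add_argument('-t', action='store_true', help='delete local files after uploading (only with -e)')
    parser.add_argument('-p', nargs=1, default='', help='download path')
    parser.add_argument('-i', action='store_true', help='log progress to the console')
    return parser.parse_args()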