Ejemplo n.º 1
0
 def test_no_content_type(self):
     # This URL is expected to come back without a usable content type, so
     # none of the type predicates may claim it.
     fetcher = URLFetcher("http://www.1011ltd.com/web/blog/post/evolving_pid")
     for predicate in (fetcher.is_image, fetcher.is_PDF, fetcher.is_HTML):
         self.assertFalse(predicate())
Ejemplo n.º 2
0
 def test_image_content_type(self):
     # An HTML resource is not an image, so it must not report an image
     # content type either.
     resource = URLFetcher(self.html_url)
     self.assertFalse(resource.is_image())
     # assertIsNone replaces the deprecated assertEquals(..., None) alias.
     self.assertIsNone(resource.image_content_type())
Ejemplo n.º 3
0
 def test_size_limit(self):
     # FIXME: this test opens a real network connection.
     # Regression guard: fetch() raises when the 'content-length' header is
     # compared against the size limit without first being cast to int,
     # e.g. '104' > 100 * 2**10 * 2**10.
     URLFetcher(self.pdf_url).fetch()
Ejemplo n.º 4
0
 def test_html_fetch(self):
     # FIXME: this test opens a real network connection.
     # The HTML fixture URL must be classified as HTML and not as PDF.
     fetched = URLFetcher(self.html_url)
     self.assertFalse(fetched.is_PDF())
     self.assertTrue(fetched.is_HTML())
Ejemplo n.º 5
0
 def test_pdf_fetch(self):
     # FIXME: this test opens a real network connection.
     # The PDF fixture URL must be recognised as a PDF.
     self.assertTrue(URLFetcher(self.pdf_url).is_PDF())
Ejemplo n.º 6
0
def _load_diffbot_json(diffbot, item, bookmark):
    """Run Diffbot on ``item.url`` and return the decoded JSON, or None.

    A response that is not valid JSON is logged against ``bookmark.url`` and
    reported as None so the caller can skip the bookmark.
    """
    json_result = diffbot.extract(item.url, html=True)
    try:
        return json.loads(json_result)
    except ValueError:
        # ValueError is the base class of the decode errors raised by both
        # the stdlib json module and simplejson, so this works with either.
        logging.error("Unable to decode JSON for resource at : {}".format(bookmark.url))
        return None


def _plain_text_to_html(text):
    """Wrap plain text in <div> markup.

    Paragraphs are separated by blank lines; every line becomes its own
    <div> and each paragraph is followed by a <br /> spacer.
    """
    parts = ["<div>"]
    for paragraph in text.split('\n\n'):
        parts.extend('<div>' + line + '</div>' for line in paragraph.split('\n'))
        parts.append("<div><br /></div>")
    parts.append("</div>")
    return ''.join(parts)


def _apply_diffbot_tags(item, json_object):
    """Replace missing/default tags on ``item`` with Diffbot's suggested tags."""
    # Check for default tags
    # FIXME seemingly random criteria for checking tags
    if not item.tags or (item.tags.lower() == 'unread' and len(item.tags.split()) == 1):
        # Diffbot will not contain tags key even if explicitly told to return
        # tags if it does not find any
        if 'tags' in json_object:
            # 'autotag' marks the note as automatically tagged.
            # Evernote cannot handle tags with commas, and whitespace inside
            # a tag is collapsed to underscores.
            tags = 'autotag ' + ' '.join(
                '_'.join(x.replace(',', '').split()) for x in json_object['tags'])
            # NOTE(review): encode() yields a byte string; confirm that is
            # what the Evernote sink expects.
            item.tags = tags.encode('utf-8', 'xmlcharrefreplace')


def _build_html_item(bookmark, diffbot):
    """Build an HTMLItem from Diffbot's extraction, or return None on failure."""
    item = HTMLItem.from_pinboard_item(bookmark)
    json_object = _load_diffbot_json(diffbot, item, bookmark)
    if json_object is None:
        return None

    if 'error' in json_object:
        logging.error("Failed to fetch resource at {}".format(item.url))
        logging.error(u"Reason: {}".format(json_object['error']))
        return None

    if json_object.get('statusCode') == 500:
        logging.error("Failed to fetch resource at {}".format(item.url))
        # .get() so that a 500 payload without 'message' is still reported
        # instead of raising KeyError.
        logging.error(u"Reason: {}".format(json_object.get('message')))
        return None

    if 'html' in json_object:
        item.content = html2enml(json_object['html'])
    elif 'text' in json_object:
        # No HTML came back: fall back to the plain-text summary.
        logging.warning("Failed to fetch HTML document for {}".format(item.url))
        logging.warning("Degrading to using text summary")
        item.content = html2enml(json_object['text'])
    else:
        logging.error("Failed to fetch HTML document at all: {}".format(item.url))
        return None

    _apply_diffbot_tags(item, json_object)
    return item


def _build_text_item(bookmark, resource, diffbot):
    """Build a TextItem from the raw resource body, or return None on failure."""
    item = TextItem.from_pinboard_item(bookmark)
    # Diffbot is consulted only for tag suggestions here; the note body is
    # the fetched plain text itself.
    json_object = _load_diffbot_json(diffbot, item, bookmark)
    if json_object is None:
        return None

    item.content = html2enml(_plain_text_to_html(resource.fetch()))
    _apply_diffbot_tags(item, json_object)
    return item


def _build_item(bookmark, resource, diffbot):
    """Turn a bookmark into the item matching its resource type, or None."""
    if resource.is_PDF():
        item = PDFItem.from_pinboard_item(bookmark)
        item.content = resource.fetch()  #FIXME this could take very long. Need a way to address this problem.
        return item
    if resource.is_image():
        item = ImageItem.from_pinboard_item(bookmark)
        item.content_type = resource.image_content_type()
        item.content = resource.fetch()
        return item
    if resource.is_HTML():
        return _build_html_item(bookmark, diffbot)
    if resource.is_text():
        return _build_text_item(bookmark, resource, diffbot)

    logging.error("Unknown content-type of {}".format(resource.content_type))
    return None


def main():
    """Push Pinboard bookmarks saved since the last recorded update to Evernote.

    Each bookmark is fetched, converted to an item according to its resource
    type (PDF, image, HTML or plain text) and pushed to Evernote. A bookmark
    that fails at any step is logged and skipped. After every successful push
    the database's ``last_updated`` is set to that item's time. The database
    is closed even when an unexpected exception aborts the run.
    """
    pinboard_db = PinboardDatabase()
    try:
        datestr = pinboard_db.last_updated

        pinboard = PinboardSource(PINBOARD_API_TOKEN)
        diffbot = DiffbotTransformer(DIFFBOT_TOKEN)
        evernote = EvernoteSink(EVERNOTE_DEVELOPER_TOKEN)

        logging.info("Fetching data from {}".format(datestr))

        # To debug a single bookmark, swap this for pinboard.fetch_from_url(<url>).
        bookmarks = pinboard.fetch_from_date(datestr)

        # Iterated in reverse, presumably so that older bookmarks are pushed
        # first and last_updated only moves forward -- TODO confirm ordering.
        for bookmark in reversed(bookmarks):
            logging.info("Handling : {}".format(bookmark.url))
            try:
                resource = URLFetcher(bookmark.url)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.TooManyRedirects) as e:
                logging.error("Failed to fetch resource at {}".format(bookmark.url))
                logging.error("Reason: {}".format(e))
                continue

            item = _build_item(bookmark, resource, diffbot)
            if item is None:
                # The failure has already been logged by the builder.
                continue

            try:
                evernote.push(item)
            except socket.error as e:
                logging.error("Socket error: {}".format(e))
                continue
            except EDAMUserException as e:
                logging.error("Unrecognized evernote type: {}".format(e))
                continue

            pinboard_db.last_updated = item.time
    finally:
        pinboard_db.close()