class TestPinboardSource(unittest.TestCase): def setUp(self): self.PINBOARD_API_TOKEN = PINBOARD_API_TOKEN self.pin = PinboardSource(self.PINBOARD_API_TOKEN) self.xml = """<?xml version="1.0" encoding="UTF-8" ?>\n<posts user="******" dt="2013-04-25T08:42:48Z">\n <post href="http://estima.wordpress.com/2013/04/25/axon" time="2013-04-25T08:42:48Z" description="\xeb\xaa\xa8\xeb\x93\xa0 \xea\xb2\x83\xec\x9d\x84 \xeb\x8b\xa4 \xec\xb0\x8d\xeb\x8a\x94 \xea\xb2\xbd\xec\xb0\xb0\xec\x9d\x98 \xec\x86\x8c\xed\x98\x95\xeb\xb9\x84\xeb\x94\x94\xec\x98\xa4\xec\xb9\xb4\xeb\xa9\x94\xeb\x9d\xbc-\xec\x97\x91\xec\x86\x90 \xed\x94\x8c\xeb\xa0\x89\xec\x8a\xa4 | \xec\x97\x90\xec\x8a\xa4\xed\x8b\xb0\xeb\xa7\x88\xec\x9d\x98 \xec\x9d\xb8\xed\x84\xb0\xeb\x84\xb7\xec\x9d\xb4\xec\x95\xbc\xea\xb8\xb0" extended="" tag="police camera" hash="6c0309ae794a2727a048e5e1f9e1876b" shared="no" />\n <post href="http://theory.snu.ac.kr/mediawiki/images/d/d5/%ED%95%9C%EA%B8%80_edit_distance.pdf" time="2013-04-25T07:16:33Z" description="\xed\x95\x9c\xea\xb8\x80\xec\x97\x90 \xeb\x8c\x80\xed\x95\x9c \xed\x8e\xb8\xec\xa7\x91\xea\xb1\xb0\xeb\xa6\xac \xeb\xac\xb8\xec\xa0\x9c" extended="" tag="algorithm distance profanity_filtering" hash="fa9322de8bdcb82e6cadcd4b0dec4bdb" shared="no" />\n <post href="https://github.com/kevinschaul/binify" time="2013-04-25T06:02:07Z" description="kevinschaul/binify \xc2\xb7 GitHub" extended="" tag="repo visualization hexagon_binning" hash="f378fa02b0a9965bfdea858e0985b220" shared="no" />\n <post href="http://techcrunch.com/2013/04/22/want-to-raise-a-million-bucks-heres-what-youll-need/" time="2013-04-25T05:43:50Z" description="Want To Raise A Million Bucks? Here\xe2\x80\x99s What You\xe2\x80\x99ll Need | TechCrunch" extended="" tag="startup investment" hash="d76e2079690d7c1c1020805a761e7a76" shared="no" />\n <post href="http://mobile.reuters.com/article/idUSBRE93N06E20130424?irpc=932" time="2013-04-25T01:35:21Z" description="Analysis: Sleeping ad giant Amazon finally stirs" extended="" tag="amazon advertisement" hash="1c4b82b4331a246a278445fc6f7b4bb1" shared="no" />\n <post href="http://qiao.github.io/PathFinding.js/visual/" time="2013-04-25T01:32:24Z" description="PathFinding.js" extended="" tag="path_finding ai" hash="388e9fca89029606b761a602359224e7" shared="no" />\n <post href="http://kevinschaul.com/2013/04/18/introducing-binify?buffer_share=55493" time="2013-04-25T01:30:15Z" description="Introducing: Binify | Kevin Schaul" extended="" tag="visualization hexagon_binning" hash="f5c2d58817398126a21cfa3d8c3cd115" shared="no" />\n <post href="http://www.valvesoftware.com/publications/2009/ai_systems_of_l4d_mike_booth.pdf" time="2013-04-25T00:30:05Z" description="The AI Systems of Left 4 Dead" extended="" tag="pdf slides game ai" hash="3004f5a94d870a0ddfcfc639981262a6" shared="no" />\n <post href="http://blogs.hbr.org/bregman/2010/05/how-and-why-to-stop-multitaski.html?buffer_share=7f1b8" time="2013-04-24T16:31:06Z" description="How (and Why) to Stop Multitasking - Peter Bregman - Harvard Business Revie" extended="" tag="productivity multitasking" hash="9afea76d5d6e758375522b6c323d2959" shared="no" />\n <post href="http://pyvideo.org/video/1717/effective-django-0" time="2013-04-24T16:28:59Z" description="pyvideo.org - Effective Django" extended="" tag="video pycon_2013 django" hash="97397c0b2fdc9fd2867508c1a4597f12" shared="no" />\n <post href="http://pyvideo.org/video/1654/a-beginners-introduction-to-pydata-how-to-build" time="2013-04-24T16:13:39Z" description="pyvideo.org - A beginner\'s introduction to Pydata: how to build a minimal r" extended="" tag="video pycon_2013" hash="f5f02efb0d02c7221693b86fce05331c" shared="no" />\n <post href="https://developer.nvidia.com/content/cuda-pro-tip-write-flexible-kernels-grid-stride-loops" time="2013-04-24T14:56:08Z" description="CUDA Pro Tip: Write Flexible Kernels with Grid-Stride Loops" extended="" tag="cuda gpgpu" hash="8b1b715837eba081166c23261a8fddbf" shared="no" />\n <post href="http://arxiv.org/abs/1304.6257" time="2013-04-24T14:55:18Z" description="An Evolutionary Algorithm Approach to Link Prediction in Dynamic Social Net" extended="" tag="paper link_prediction" hash="dfd9cd529351e8916f0915bb692ce728" shared="no" />\n <post href="http://arxiv.org/abs/1304.6181" time="2013-04-24T14:54:19Z" description="Evaluating Web Content Quality via Multi-scale Features. (arXiv:1304.6181v1" extended="" tag="paper" hash="3626f3ae8ab2a8fa85df66f7673c993a" shared="no" />\n <post href="https://github.com/bponsler/pysiriproxy" time="2013-04-24T14:44:57Z" description="bponsler/pysiriproxy \xc2\xb7 GitHub" extended="Port of SiriProxy from Ruby to Python." tag="repo" hash="a5ba89a718d125fbd69dbf7fb5a2ef20" shared="no" />\n</posts>\n\t""" def test_connection(self): url = self.pin.URL + 'posts/recent?auth_token=' + self.PINBOARD_API_TOKEN with mock.patch('requests.get') as requests_get: self.pin.grab_xml(url) self.assertTrue(requests_get.called) def test_xml_parsing(self): result = self.pin.parse_xml(self.xml) self.assertEquals(str(result[0]), "http://estima.wordpress.com/2013/04/25/axon") def test_fetch_from_date(self): with mock.patch('requests.get') as requests_get: requests_get.return_value.content = self.xml datestr = '2013-04-25T00:00:00Z' self.pin.fetch_from_date(datestr) requests_get.assert_called_with( 'https://api.pinboard.in/v1/posts/all?auth_token={}&fromdt={}'.format(self.PINBOARD_API_TOKEN, datestr)) def test_fetch_all(self): with mock.patch('requests.get') as requests_get: requests_get.return_value.content = self.xml bookmarks = self.pin.fetch_all() requests_get.assert_called_with( 'https://api.pinboard.in/v1/posts/all?auth_token={}'.format(self.PINBOARD_API_TOKEN)) self.assertEquals(bookmarks[0].url, "http://estima.wordpress.com/2013/04/25/axon")
def main(): pinboard_db = PinboardDatabase() datestr = pinboard_db.last_updated pinboard = PinboardSource(PINBOARD_API_TOKEN) diffbot = DiffbotTransformer(DIFFBOT_TOKEN) evernote = EvernoteSink(EVERNOTE_DEVELOPER_TOKEN) logging.info("Fetching data from {}".format(datestr)) bookmarks = pinboard.fetch_from_date(datestr) # bookmarks = pinboard.fetch_from_url("http://i.imgur.com/4n92M.jpg") # bookmarks = pinboard.fetch_from_url("http://neoocean.net/blog/i/entry/%EB%B2%94%EC%A3%84%EC%97%90-%EB%8C%80%ED%95%9C-%ED%8B%80%EB%A6%B0-%EC%98%88%EC%B8%A1#_post_2057") # bookmarks = pinboard.fetch_from_url("http://nullmodel.egloos.com/3425248") # bookmarks = pinboard.fetch_from_url("http://www.daniel-lemire.com/blog/archives/2010/11/02/how-do-search-engines-handle-special-characters-should-you-care/") # bookmarks = pinboard.fetch_from_url("http://www.1011ltd.com/web/blog/post/evolving_pid") # no content type returned items = [] for bookmark in reversed(bookmarks): logging.info("Handling : {}".format(bookmark.url)) try: resource = URLFetcher(bookmark.url) except requests.exceptions.ConnectionError as e: logging.error("Failed to fetch resource at {}".format(bookmark.url)) logging.error("Reason: {}".format(e)) continue except requests.exceptions.TooManyRedirects as e: logging.error("Failed to fetch resource at {}".format(bookmark.url)) logging.error("Reason: {}".format(e)) continue item = Item() if resource.is_PDF(): item = PDFItem.from_pinboard_item(bookmark) item.content = resource.fetch() #FIXME this could take very long. Need a way to address this problem. elif resource.is_image(): item = ImageItem.from_pinboard_item(bookmark) item.content_type = resource.image_content_type() item.content = resource.fetch() elif resource.is_HTML() or resource.is_text(): if resource.is_HTML(): item = HTMLItem.from_pinboard_item(bookmark) json_result = diffbot.extract(item.url, html=True) try: json_object = json.loads(json_result) except json.scanner.JSONDecodeError: logging.error("Unable to decode JSON for resource at : {}".format(bookmark.url)) continue if 'error' in json_object: logging.error("Failed to fetch resource at {}".format(item.url)) logging.error(u"Reason: {}".format(json_object['error'])) continue if 'statusCode' in json_object: if json_object['statusCode'] == 500: logging.error("Failed to fetch resource at {}".format(item.url)) logging.error(u"Reason: {}".format(json_object['message'])) continue if 'html' in json_object: item.content = html2enml(json_object['html']) else: # try plaintext if 'text' not in json_object: logging.error("Failed to fetch HTML document at all: {}".format(item.url)) continue logging.warn("Failed to fetch HTML document for {}".format(item.url)) logging.warn("Degrading to using text summary") item.content = html2enml(json_object['text']) else: item = TextItem.from_pinboard_item(bookmark) json_result = diffbot.extract(item.url, html=True) try: json_object = json.loads(json_result) except json.scanner.JSONDecodeError: logging.error("Unable to decode JSON for resource at : {}".format(bookmark.url)) continue # resource is plain text contents = resource.fetch().split('\n\n') data = "<div>" for content in contents: data += ''.join(['<div>' + body + '</div>' for body in content.split('\n')]) data += "<div><br /></div>" data += "</div>" item.content = html2enml(data) # Check for default tags # FIXME seemingly random criteria for checking tags if not item.tags or (item.tags.lower() == 'unread' and len(item.tags.split()) == 1): # Diffbot will not contain tags key even if explicitly told to return tags if it does not find any if 'tags' in json_object: # autotag tells that this was autotagged. # Evernote cannot handle tags with commas. tags = 'autotag ' + ' '.join(('_'.join(x.replace(',','').split()) for x in json_object['tags'])) # diffbot tags item.tags = tags.encode('utf-8', 'xmlcharrefreplace') else: logging.error("Unknown content-type of {}".format(resource.content_type)) continue try: evernote.push(item) except socket.error as e: logging.error("Socket error: {}".format(e)) continue except EDAMUserException as e: logging.error("Unrecognized evernote type: {}".format(e)) continue pinboard_db.last_updated = item.time pinboard_db.close()