def api_worker(worker_number):
    """Consume URLs from ``api_q`` and forward usable articles to ``file_q``.

    Loops until the sentinel string ``'kill'`` is read from ``api_q``. Each
    URL is sent to the Diffbot ``article`` API. The first entry of the
    response's ``objects`` list is put on ``file_q`` only when it has
    non-empty ``text`` and its ``pageURL`` equals its ``resolvedPageURL``.

    Args:
        worker_number: Identifier used only in the start/kill log lines.
    """
    print('Start API {0} worker.'.format(worker_number))
    diffbot = DiffbotClient()
    while True:
        url = api_q.get()
        if url == 'kill':
            print('Kill {0} api_worker'.format(worker_number))
            break

        try:
            response = diffbot.request(url, API_TOKEN, 'article')
        except requests.exceptions.HTTPError:
            # Best effort: a failed request simply drops this URL.
            continue

        if 'objects' not in response:
            continue
        objects = response['objects']
        if not objects:
            # An empty result list would otherwise raise IndexError and
            # terminate the worker.
            continue

        obj = objects[0]
        # .get() so that an object without a 'text' key is skipped instead of
        # raising KeyError.
        if obj.get('text') and obj.get('pageURL') == obj.get('resolvedPageURL'):
            file_q.put(obj)
class DiffbotClientUnitTest(unittest.TestCase):
    """Checks ``DiffbotClient.request`` per API with ``client.requests.get`` patched by ``fake_get``."""

    def setUp(self):
        """Start the ``requests.get`` patch and create the client under test."""
        self.patcher = patch('client.requests.get', fake_get)
        self.patcher.start()
        self.client = DiffbotClient()

    def tearDown(self):
        """Stop the patch started in ``setUp``."""
        self.patcher.stop()

    def _call(self, url, api):
        """Issue a version-2 request for ``url`` against ``api`` with the dummy token."""
        return self.client.request(url, "SOME_TOKEN", api, version=2)

    def test_article_api(self):
        result = self._call(
            "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/",
            "article",
        )
        self.assertIn('title', result)
        self.assertEqual(
            result['title'],
            "Diffbot Is Using Computer Vision to Reinvent the Semantic Web",
        )

    def test_frontpage_api(self):
        result = self._call("http://www.huffingtonpost.com/", "frontpage")
        self.assertEqual(result['tagName'], "dml")
        self.assertIn('childNodes', result)

    def test_product_api(self):
        result = self._call(
            "http://www.overstock.com/Home-Garden/iRobot-650-Roomba-Vacuuming-Robot/7886009/product.html",
            "product",
        )
        self.assertEqual(result['type'], "product")
        first_product = result['products'][0]
        self.assertEqual(first_product['title'],
                         "iRobot 650 Roomba Vacuuming Robot")

    def test_image_api(self):
        result = self._call("http://www.google.com/", "image")
        self.assertEqual(result['title'], "Google")
        first_image = result['images'][0]
        self.assertEqual(first_image['url'],
                         "https://www.google.com/images/srpr/logo9w.png")

    def test_analyze_api(self):
        result = self._call("http://www.twitter.com/", "analyze")
        self.assertEqual(result['type'], "image")
        self.assertEqual(result['title'], "Welcome to Twitter.")
def main():
    """Request one URL through the Diffbot ``product`` API and print the result.

    Prints the response, its type, and the ``title`` of the first entry in
    ``response["objects"]``.
    """
    diffbot = DiffbotClient()
    token = API_TOKEN
    url = "https://newtonfreelibrary.libcal.com/event/4924168"
    api = "product"
    response = diffbot.request(url, token, api)

    # Single-argument print(...) produces the same output under Python 2 and
    # Python 3, whereas the bare print statement is a SyntaxError on Python 3.
    print(response)
    print(type(response))
    print(response["objects"][0]["title"])
class DiffbotClientUnitTest(unittest.TestCase):
    """Tests for ``DiffbotClient.request`` with ``client.requests.get`` replaced by ``fake_get``."""

    def setUp(self):
        """Patch ``client.requests.get`` and create the client under test.

        The patch is undone via ``addCleanup`` rather than ``tearDown`` so it
        is still removed when a later step of ``setUp`` raises (``tearDown``
        is not run in that case, which would leave the patch active for
        subsequent tests).
        """
        self.patcher = patch('client.requests.get', fake_get)
        self.patcher.start()
        self.addCleanup(self.patcher.stop)
        self.client = DiffbotClient()

    def test_article_api(self):
        """The article API response carries the expected title."""
        url = "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/"
        token = "SOME_TOKEN"
        api = "article"
        version = 2
        response = self.client.request(url, token, api, version=version)
        self.assertIn('title', response)
        self.assertEqual(
            response['title'],
            "Diffbot Is Using Computer Vision to Reinvent the Semantic Web")

    def test_frontpage_api(self):
        """The frontpage API response is a ``dml`` node with child nodes."""
        url = "http://www.huffingtonpost.com/"
        token = "SOME_TOKEN"
        api = "frontpage"
        version = 2
        response = self.client.request(url, token, api, version=version)
        self.assertEqual(response['tagName'], "dml")
        self.assertIn('childNodes', response)

    def test_product_api(self):
        """The product API response has type ``product`` and the expected first title."""
        url = "http://www.overstock.com/Home-Garden/iRobot-650-Roomba-Vacuuming-Robot/7886009/product.html"
        token = "SOME_TOKEN"
        api = "product"
        version = 2
        response = self.client.request(url, token, api, version=version)
        self.assertEqual(response['type'], "product")
        self.assertEqual(response['products'][0]['title'],
                         "iRobot 650 Roomba Vacuuming Robot")

    def test_image_api(self):
        """The image API response has the expected title and first image URL."""
        url = "http://www.google.com/"
        token = "SOME_TOKEN"
        api = "image"
        version = 2
        response = self.client.request(url, token, api, version=version)
        self.assertEqual(response['title'], "Google")
        self.assertEqual(response['images'][0]['url'],
                         "https://www.google.com/images/srpr/logo9w.png")

    def test_analyze_api(self):
        """The analyze API response has the expected type and title."""
        url = "http://www.twitter.com/"
        token = "SOME_TOKEN"
        api = "analyze"
        version = 2
        response = self.client.request(url, token, api, version=version)
        self.assertEqual(response['type'], "image")
        self.assertEqual(response['title'], "Welcome to Twitter.")
def swallowURL(url):
    """Fetch an article via Diffbot and append its paragraphs to ``pdfs/data.csv``.

    The response's ``html`` is split on ``<p>``; each piece is normalised
    with ``clean`` and has its tags replaced by spaces. A piece is kept when
    its tidied form is longer than 10 characters. If more than two pieces
    are kept and the alphanumeric-only form of ``url`` contains ``http``,
    one row ``[alphanumeric url, str(kept pieces)]`` is appended to the CSV.

    Args:
        url: Page address sent to the Diffbot ``article`` API (version 2).

    Returns:
        None. Returns early, writing nothing, when the response has no
        ``html`` key.
    """
    diffbot = DiffbotClient()
    token = API_TOKEN
    api = "article"
    response = diffbot.request(url, token, api, version=2)
    if 'html' not in response:
        return

    subtexts = response['html'].split("<p>")
    res = []
    for t in subtexts:
        outgt = clean(
            t,
            fix_unicode=True,  # fix various unicode errors
            to_ascii=True,  # transliterate to closest ASCII representation
            lower=False,  # lowercase text
            no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
            no_urls=False,  # replace all URLs with a special token
            no_emails=False,  # replace all email addresses with a special token
            no_phone_numbers=False,  # replace all phone numbers with a special token
            no_numbers=False,  # replace all numbers with a special token
            no_digits=False,  # replace all digits with a special token
            no_currency_symbols=False,  # replace all currency symbols with a special token
            no_punct=False,  # fully remove punctuation
            replace_with_email="<EMAIL>",
            replace_with_phone_number="<PHONE>",
            replace_with_currency_symbol="<CUR>",
            lang="en"  # set to 'de' for German special handling
        )
        outgt = re.sub("<.*?>", " ", outgt)

        # Tidied copy used only to decide whether the piece is long enough.
        ogt = outgt.strip()
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        ogt = re.sub(r"\s\s+", " ", ogt)
        ogt = re.sub('</?[a-z]+>', '', ogt)
        ogt = ogt.replace("’", "")
        ogt = ogt.replace("“", "")
        ogt = ogt.replace("\n", "")
        ogt = ogt.replace("\r", "")
        ogt = ogt.replace("\\n", "")

        if len(ogt) > 10:
            # NOTE(review): the tag-stripped text `outgt` is stored, not the
            # tidied `ogt` -- confirm this is intended.
            res.append(outgt)

    if len(res) > 2:
        fields = [''.join(e for e in url if e.isalnum()), str(res)]
        if 'http' in fields[0]:
            # newline='' lets the csv module control line endings; without it
            # rows can gain extra blank lines on some platforms.
            with open('pdfs/data.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(fields)
def diffbotScrape(my_url, batch_size=15):
    """Fetch title/type for ``my_url`` via Diffbot and batch the results to JSON files.

    Each call stores the response in the module-level ``my_dict`` under
    ``my_url`` and increments the module-level ``count``. Once ``count``
    reaches ``batch_size``, ``my_dict`` is dumped to
    ``diffbot<json_count>.json``, after which ``count`` and ``my_dict`` are
    reset and ``json_count`` is incremented.

    Args:
        my_url: Page address sent to the Diffbot ``article`` API.
        batch_size: Number of responses collected before a file is written.
            Defaults to 15, the value previously hard-coded.
    """
    global count
    global json_count
    global my_dict

    count += 1
    diffbot = DiffbotClient()
    # NOTE(review): the API token is hard-coded in source; consider loading it
    # from configuration instead.
    token = "2587daf076cad7bba4e58fd272780b2d"
    api = "article"
    response = diffbot.request(my_url, token, api, fields=['title', 'type'])
    print("\nPrinting response:\n")
    print(count)

    # The response must be recorded before the batch is dumped below.
    my_dict[my_url] = response
    print("Writing to my_dict...\n")

    # >= rather than == so a smaller batch_size on a later call still flushes.
    if count >= batch_size:
        out_path = "diffbot" + str(json_count) + ".json"
        # NOTE(review): 'a+' appends; if the file already exists the result
        # is several concatenated JSON documents -- confirm 'w' is not wanted.
        with open(out_path, 'a+') as f:
            json.dump(my_dict, f, sort_keys=True, indent=4)
        count = 0
        json_count += 1
        my_dict = {}
import pprint

from client import DiffbotClient
from config import API_TOKEN

# Demo script: call the Diffbot article API (version 2) on one URL, first
# with default fields and then with an explicit field list, pretty-printing
# each response. Single-argument print(...) is used so the output is the
# same under Python 2 and Python 3.

print("Calling article API endpoint on the url: http://shichuan.github.io/javascript-patterns/...\n")
diffbot = DiffbotClient()
token = API_TOKEN
version = 2
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api, version=version)
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
# pprint() writes to stdout itself and returns None; wrapping it in print
# emitted a stray "None" line after the response.
pp.pprint(response)

print("")
print("Calling article API endpoint with fields specified on the url: http://shichuan.github.io/javascript-patterns/...\n")
diffbot = DiffbotClient()
token = API_TOKEN
version = 2
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api, fields=['title', 'type'], version=version)
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(response)
def setUp(self):
    """Replace ``client.requests.get`` with ``fake_get`` and build the client under test."""
    target = 'client.requests.get'
    # Keep a handle to the patcher on the instance.
    self.patcher = patch(target, fake_get)
    self.patcher.start()

    self.client = DiffbotClient()
from client import DiffbotClient, DiffbotCrawl
from config import API_TOKEN
import pprint
import time

# Demo script: call the Diffbot article API on one URL, first with default
# fields and then with an explicit field list, pretty-printing each response.

print(
    "Calling article API endpoint on the url: http://shichuan.github.io/javascript-patterns/...\n"
)
diffbot = DiffbotClient()
token = API_TOKEN
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api)
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
# pprint() writes to stdout itself and returns None; wrapping it in print()
# emitted a stray "None" line after the response.
pp.pprint(response)

print()
print(
    "Calling article API endpoint with fields specified on the url: http://shichuan.github.io/javascript-patterns/...\n"
)
diffbot = DiffbotClient()
token = API_TOKEN
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api, fields=['title', 'type'])
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(response)

print()
from client import DiffbotClient,DiffbotCrawl
from config import API_TOKEN
import pprint
import time

# Demo script: call the Diffbot article API on one URL (default fields, then
# an explicit field list), pretty-printing each response, then begin setting
# up a frontpage API call. Single-argument print(...) is used so the output
# is the same under Python 2 and Python 3.

print("Calling article API endpoint on the url: http://shichuan.github.io/javascript-patterns/...\n")
diffbot = DiffbotClient()
token = API_TOKEN
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api)
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
# pprint() writes to stdout itself and returns None; wrapping it in print
# emitted a stray "None" line after the response.
pp.pprint(response)

print("")
print("Calling article API endpoint with fields specified on the url: http://shichuan.github.io/javascript-patterns/...\n")
diffbot = DiffbotClient()
token = API_TOKEN
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api, fields=['title', 'type'])
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(response)

print("")
print("Calling frontpage API endpoint on the url: http://www.huffingtonpost.com/...\n")
diffbot = DiffbotClient()
token = API_TOKEN
url = "http://www.huffingtonpost.com/"