def api_worker(worker_number):
    """Consume URLs from ``api_q`` and forward extracted articles to ``file_q``.

    Loops until the sentinel string ``'kill'`` is read from ``api_q``. Every
    other item is sent to the Diffbot ``article`` API. The first object of the
    response is put on ``file_q`` when it has non-empty ``text`` and its
    ``pageURL`` equals its ``resolvedPageURL``.

    Args:
        worker_number: Identifier used only in the start/kill log messages.
    """
    print('Start API {0} worker.'.format(worker_number))
    # api_q / file_q are module-level queues; they are only read here, so no
    # ``global`` declaration is required.
    diffbot = DiffbotClient()
    while True:
        url = api_q.get()
        if url == 'kill':
            print('Kill {0} api_worker'.format(worker_number))
            break

        try:
            response = diffbot.request(url, API_TOKEN, 'article')
        except requests.exceptions.HTTPError:
            # Best effort: a failed request simply drops this URL.
            continue

        if 'objects' not in response:
            continue
        objects = response['objects']
        if not objects:
            # An empty list would raise IndexError below and end the loop.
            continue

        obj = objects[0]
        # .get() so a result without a 'text' key is skipped instead of
        # raising KeyError.
        if obj.get('text') and obj.get('pageURL') == obj.get('resolvedPageURL'):
            file_q.put(obj)
def main():
    """Query the Diffbot ``product`` API for a fixed URL and print the result.

    Prints the raw response, the response's type, and the ``title`` of the
    first entry in the response's ``objects`` list.
    """
    diffbot = DiffbotClient()
    token = API_TOKEN
    url = "https://newtonfreelibrary.libcal.com/event/4924168"
    api = "product"
    response = diffbot.request(url, token, api)
    # Single-argument print(...) parses and behaves identically on Python 2
    # and Python 3; the bare print statement is a SyntaxError on Python 3.
    print(response)
    print(type(response))
    print(response["objects"][0]["title"])
def swallowURL(url):
    """Fetch *url* through the Diffbot article API and append its text to a CSV.

    The response's ``html`` field is split on ``<p>``. Each piece is passed
    through ``clean`` and has remaining tags replaced by spaces. A piece is
    kept when its normalized form is longer than 10 characters. When more
    than two pieces are kept, one row ``[alphanumeric-only url, str(pieces)]``
    is appended to ``pdfs/data.csv``, provided the reduced URL still contains
    ``http``.

    Args:
        url: Address of the page to extract.

    Returns:
        None. Nothing is written when the response has no ``html`` field.
    """
    diffbot = DiffbotClient()
    response = diffbot.request(url, API_TOKEN, "article", version=2)
    if 'html' not in response:
        return

    res = []
    for fragment in response['html'].split("<p>"):
        outgt = clean(
            fragment,
            fix_unicode=True,  # fix various unicode errors
            to_ascii=True,  # transliterate to closest ASCII representation
            lower=False,  # lowercase text
            no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
            no_urls=False,  # replace all URLs with a special token
            no_emails=False,  # replace all email addresses with a special token
            no_phone_numbers=False,  # replace all phone numbers with a special token
            no_numbers=False,  # replace all numbers with a special token
            no_digits=False,  # replace all digits with a special token
            no_currency_symbols=False,  # replace all currency symbols with a special token
            no_punct=False,  # fully remove punctuation
            replace_with_email="<EMAIL>",
            replace_with_phone_number="<PHONE>",
            replace_with_currency_symbol="<CUR>",
            lang="en"  # set to 'de' for German special handling
        )
        outgt = re.sub("<.*?>", " ", outgt)

        # Normalized copy, used only to decide whether the piece is long
        # enough to keep.
        ogt = outgt.strip()
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        ogt = re.sub(r"\s\s+", " ", ogt)
        ogt = re.sub('</?[a-z]+>', '', ogt)
        ogt = ogt.replace("’", "")
        ogt = ogt.replace("“", "")
        ogt = ogt.replace("\n", "")
        ogt = ogt.replace("\r", "")
        ogt = ogt.replace("\\n", "")

        if len(ogt) > 10:
            # NOTE(review): the un-normalized ``outgt`` is stored, not ``ogt``.
            # Kept as-is to preserve the CSV contents; confirm this is intended.
            res.append(outgt)

    if len(res) > 2:
        fields = [''.join(e for e in url if e.isalnum()), str(res)]
        if 'http' in fields[0]:
            with open('pdfs/data.csv', 'a') as f:
                writer = csv.writer(f)
                writer.writerow(fields)
def diffbotScrape(my_url):
    """Request title/type for *my_url* from Diffbot and batch results to JSON.

    Increments the module-level ``count``, stores the API response in the
    module-level ``my_dict`` under ``my_url``, and prints progress messages.
    When ``count`` reaches 15 the accumulated dict is dumped to
    ``diffbot<json_count>.json``; then ``count`` is reset to 0, ``json_count``
    is incremented and ``my_dict`` is replaced by an empty dict.

    Args:
        my_url: Address of the page to query; also the key used in ``my_dict``.
    """
    global count, json_count, my_dict

    count += 1
    client = DiffbotClient()
    # NOTE(review): the API token is hard-coded here; consider loading it from
    # configuration instead.
    response = client.request(
        my_url,
        "2587daf076cad7bba4e58fd272780b2d",
        "article",
        fields=['title', 'type'],
    )
    print("\nPrinting response:\n")
    print(count)

    # The response must be recorded before the batch is dumped below.
    my_dict[my_url] = response
    print("Writing to my_dict...\n")

    if count != 15:
        return

    out_name = "diffbot" + str(json_count) + ".json"
    # NOTE(review): 'a+' appends, so an already existing file of the same name
    # would end up holding more than one JSON document - confirm intent.
    with open(out_name, 'a+') as out:
        json.dump(my_dict, out, sort_keys=True, indent=4)

    # Start a fresh batch.
    count = 0
    json_count += 1
    my_dict = {}
import pprint

from client import DiffbotClient
from config import API_TOKEN

# Demo script: call the Diffbot article API twice for the same URL, first
# with default fields and then with an explicit ``fields`` list, and
# pretty-print each response.

print("Calling article API endpoint on the url: http://shichuan.github.io/javascript-patterns/...\n")
diffbot = DiffbotClient()
token = API_TOKEN
version = 2
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api, version=version)
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
# pprint() writes the output itself and returns None; wrapping it in print
# emitted a stray "None" line after the response.
pp.pprint(response)

# print("") yields one blank line on both Python 2 and Python 3.
print("")
print("Calling article API endpoint with fields specified on the url: http://shichuan.github.io/javascript-patterns/...\n")
diffbot = DiffbotClient()
response = diffbot.request(url, token, api, fields=['title', 'type'], version=version)
print("\nPrinting response:\n")
pp.pprint(response)
def setUp(self):
    """Replace ``client.requests.get`` with ``fake_get`` and build the client.

    The started patcher is kept on ``self.patcher`` and the client under
    test on ``self.client``.
    """
    # NOTE(review): the patcher is only started here; presumably it is
    # stopped in tearDown - confirm.
    self.patcher = get_stub = patch('client.requests.get', fake_get)
    get_stub.start()
    self.client = DiffbotClient()