Ejemplo n.º 1
0
def api_worker(worker_number):
    """Consume URLs from ``api_q`` and forward usable articles to ``file_q``.

    Loops until the sentinel string ``'kill'`` is read from the module-level
    queue ``api_q``. Every other item is sent to the Diffbot ``article`` API.
    The first entry of the response's ``objects`` list is put on the
    module-level queue ``file_q`` when it has non-empty ``text`` and its
    ``pageURL`` equals its ``resolvedPageURL``.

    Args:
        worker_number: Identifier used only in the start/kill log lines.
    """
    print('Start API {0} worker.'.format(worker_number))

    diffbot = DiffbotClient()

    while True:
        url = api_q.get()

        if url == 'kill':
            print('Kill {0} api_worker'.format(worker_number))
            break

        try:
            response = diffbot.request(url, API_TOKEN, 'article')
        except requests.exceptions.HTTPError:
            # Best effort: a URL that fails with an HTTP error is dropped.
            continue

        # A missing or empty ``objects`` list yields nothing to forward.
        # Indexing [0] unconditionally would raise IndexError and end the
        # worker loop.
        if 'objects' not in response or not response['objects']:
            continue

        obj = response['objects'][0]

        # ``.get`` so an entry without a ``text`` key is skipped instead of
        # raising KeyError.
        if obj.get('text') and obj.get('pageURL') == obj.get(
                'resolvedPageURL'):
            file_q.put(obj)
Ejemplo n.º 2
0
class DiffbotClientUnitTest(unittest.TestCase):
    """Checks on ``DiffbotClient.request`` with ``client.requests.get`` patched.

    Every test runs with ``client.requests.get`` replaced by ``fake_get`` and
    asserts on fields of the response returned for one API name.
    """

    def setUp(self):
        """Start the ``requests.get`` patch and build the client under test."""
        self.patcher = patch('client.requests.get', fake_get)
        self.patcher.start()
        self.client = DiffbotClient()

    def tearDown(self):
        """Stop the patch started in ``setUp``."""
        self.patcher.stop()

    def _call(self, api, url):
        """Issue a version-2 request for *url* against *api* with a dummy token."""
        return self.client.request(url, "SOME_TOKEN", api, version=2)

    def test_article_api(self):
        # Article API: response must contain the expected title.
        result = self._call(
            "article",
            "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/",
        )
        self.assertIn('title', result)
        self.assertEqual(
            result['title'],
            "Diffbot Is Using Computer Vision to Reinvent the Semantic Web")

    def test_frontpage_api(self):
        # Frontpage API: response is a "dml" node with child nodes.
        result = self._call("frontpage", "http://www.huffingtonpost.com/")
        self.assertEqual(result['tagName'], "dml")
        self.assertIn('childNodes', result)

    def test_product_api(self):
        # Product API: type and first product title are checked.
        result = self._call(
            "product",
            "http://www.overstock.com/Home-Garden/iRobot-650-Roomba-Vacuuming-Robot/7886009/product.html",
        )
        self.assertEqual(result['type'], "product")
        first_product = result['products'][0]
        self.assertEqual(first_product['title'],
                         "iRobot 650 Roomba Vacuuming Robot")

    def test_image_api(self):
        # Image API: page title and first image URL are checked.
        result = self._call("image", "http://www.google.com/")
        self.assertEqual(result['title'], "Google")
        first_image = result['images'][0]
        self.assertEqual(first_image['url'],
                         "https://www.google.com/images/srpr/logo9w.png")

    def test_analyze_api(self):
        # Analyze API: reported type and title are checked.
        result = self._call("analyze", "http://www.twitter.com/")
        self.assertEqual(result['type'], "image")
        self.assertEqual(result['title'], "Welcome to Twitter.")
Ejemplo n.º 3
0
def main():
    """Call the Diffbot ``product`` API for one fixed URL and print the result.

    Prints the whole response, then its type, then the ``title`` of the first
    entry in the response's ``objects`` list.
    """
    diffbot = DiffbotClient()
    token = API_TOKEN
    url = "https://newtonfreelibrary.libcal.com/event/4924168"
    api = "product"
    response = diffbot.request(url, token, api)
    # Single-argument parenthesised print produces the same output as a
    # Python 2 print statement and is also valid Python 3.
    print(response)
    print(type(response))
    print(response["objects"][0]["title"])
Ejemplo n.º 4
0
class DiffbotClientUnitTest(unittest.TestCase):
    """Per-API assertions on ``DiffbotClient.request``.

    ``client.requests.get`` is patched with ``fake_get`` around every test.
    """

    def setUp(self):
        """Install the ``requests.get`` patch, then create the client."""
        self.patcher = patch('client.requests.get', fake_get)
        self.patcher.start()
        self.client = DiffbotClient()

    def tearDown(self):
        """Remove the patch installed by ``setUp``."""
        self.patcher.stop()

    def test_article_api(self):
        # Expect the article title to be present and to match exactly.
        payload = self.client.request(
            "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/",
            "SOME_TOKEN",
            "article",
            version=2,
        )
        self.assertIn('title', payload)
        self.assertEqual(payload['title'], "Diffbot Is Using Computer Vision to Reinvent the Semantic Web")

    def test_frontpage_api(self):
        # Expect a "dml" root tag that carries child nodes.
        payload = self.client.request(
            "http://www.huffingtonpost.com/",
            "SOME_TOKEN",
            "frontpage",
            version=2,
        )
        self.assertEqual(payload['tagName'], "dml")
        self.assertIn('childNodes', payload)

    def test_product_api(self):
        # Expect a product-typed response whose first product has this title.
        payload = self.client.request(
            "http://www.overstock.com/Home-Garden/iRobot-650-Roomba-Vacuuming-Robot/7886009/product.html",
            "SOME_TOKEN",
            "product",
            version=2,
        )
        self.assertEqual(payload['type'], "product")
        self.assertEqual(payload['products'][0]['title'], "iRobot 650 Roomba Vacuuming Robot")

    def test_image_api(self):
        # Expect the page title and the URL of the first image.
        payload = self.client.request(
            "http://www.google.com/",
            "SOME_TOKEN",
            "image",
            version=2,
        )
        self.assertEqual(payload['title'], "Google")
        self.assertEqual(payload['images'][0]['url'], "https://www.google.com/images/srpr/logo9w.png")

    def test_analyze_api(self):
        # Expect the reported type and title for the analyzed page.
        payload = self.client.request(
            "http://www.twitter.com/",
            "SOME_TOKEN",
            "analyze",
            version=2,
        )
        self.assertEqual(payload['type'], "image")
        self.assertEqual(payload['title'], "Welcome to Twitter.")
Ejemplo n.º 5
0
def swallowURL(url):
    """Fetch *url* via the Diffbot article API and append its text to a CSV.

    The response's ``html`` field is split on ``<p>``. Each piece is passed
    through ``clean`` and has its remaining tags replaced by spaces. A piece
    is kept when its further-normalised form is longer than 10 characters.
    When more than two pieces are kept, one row made of the URL reduced to
    its alphanumeric characters and ``str()`` of the kept pieces is appended
    to ``pdfs/data.csv``.

    Args:
        url: Page address passed to the Diffbot ``article`` API.

    Returns:
        None. Returns early when the response has no ``html`` field.
    """
    diffbot = DiffbotClient()
    token = API_TOKEN
    api = "article"
    response = diffbot.request(url, token, api, version=2)
    if 'html' not in response:
        return

    res = []
    for t in response['html'].split("<p>"):
        outgt = clean(
            t,
            fix_unicode=True,  # fix various unicode errors
            to_ascii=True,  # transliterate to closest ASCII representation
            lower=False,  # lowercase text
            no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
            no_urls=False,  # replace all URLs with a special token
            no_emails=False,  # replace all email addresses with a special token
            no_phone_numbers=False,  # replace all phone numbers with a special token
            no_numbers=False,  # replace all numbers with a special token
            no_digits=False,  # replace all digits with a special token
            no_currency_symbols=False,  # replace all currency symbols with a special token
            no_punct=False,  # fully remove punctuation
            replace_with_email="<EMAIL>",
            replace_with_phone_number="<PHONE>",
            replace_with_currency_symbol="<CUR>",
            lang="en"  # set to 'de' for German special handling
        )
        # Raw strings keep the regex escapes away from Python's own
        # string-escape processing ("\s" is an invalid string escape).
        outgt = re.sub(r"<.*?>", " ", outgt)
        ogt = outgt.strip()
        ogt = re.sub(r"\s\s+", " ", ogt)
        ogt = re.sub(r'&lt;/?[a-z]+&gt;', '', ogt)
        ogt = ogt.replace("&rsquo;", "")
        ogt = ogt.replace("&ldquo;", "")
        ogt = ogt.replace("\n", "")
        ogt = ogt.replace("\r", "")
        ogt = ogt.replace("\\n", "")
        if len(ogt) > 10:
            # NOTE(review): the tag-stripped text (outgt) is stored, not the
            # normalised text (ogt) used for the length check. Confirm this
            # is intended.
            res.append(outgt)

    if len(res) > 2:
        fields = [''.join(e for e in url if e.isalnum()), str(res)]
        if 'http' in fields[0]:
            # newline='' lets the csv writer control line endings itself, as
            # the csv module documentation requires for file objects.
            with open('pdfs/data.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(fields)
Ejemplo n.º 6
0
def diffbotScrape(my_url):
    """Request the Diffbot article API for *my_url* and buffer the response.

    The response, requested with ``fields=['title', 'type']``, is stored in
    the global ``my_dict`` under *my_url*. When the global call counter
    ``count`` reaches 15, the buffer is dumped as JSON to
    ``diffbot<json_count>.json``; then ``count`` and ``my_dict`` are reset and
    ``json_count`` is incremented.

    Args:
        my_url: Page address to send to the API; also the key in ``my_dict``.
    """
    global count, json_count, my_dict

    count += 1
    # NOTE(review): the API token is hard-coded in source. Consider loading
    # it from configuration.
    response = DiffbotClient().request(
        my_url,
        "2587daf076cad7bba4e58fd272780b2d",
        "article",
        fields=['title', 'type'],
    )
    print("\nPrinting response:\n")
    print(count)

    # Record the response in the buffer before any dump to JSON.
    my_dict[my_url] = response
    print("Writing to my_dict...\n")

    if count != 15:
        return

    out_path = "diffbot" + str(json_count) + ".json"
    # NOTE(review): 'a+' appends. If the file already exists it will hold
    # more than one JSON document. Confirm that is acceptable.
    with open(out_path, 'a+') as f:
        json.dump(my_dict, f, sort_keys=True, indent=4)
    count = 0
    json_count += 1
    my_dict = {}
Ejemplo n.º 7
0
from client import DiffbotClient
from config import API_TOKEN
import pprint

# Demo 1: article API with the default field set.
# Single-argument parenthesised prints give the same output under Python 2
# and are valid Python 3.
print("Calling article API endpoint on the url: http://shichuan.github.io/javascript-patterns/...\n")
diffbot = DiffbotClient()
token = API_TOKEN
version = 2
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api, version=version)
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
# pprint() writes the formatted value itself and returns None; wrapping it
# in print would emit an extra "None" line.
pp.pprint(response)

# Blank separator line; print("") behaves the same on Python 2 and 3.
print("")
# Demo 2: same request, restricted to the 'title' and 'type' fields.
print("Calling article API endpoint with fields specified on the url: http://shichuan.github.io/javascript-patterns/...\n")
diffbot = DiffbotClient()
token = API_TOKEN
version = 2
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url,
                           token,
                           api,
                           fields=['title', 'type'],
                           version=version)
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(response)
Ejemplo n.º 8
0
 def setUp(self):
     """Patch ``client.requests.get`` with ``fake_get`` and create the client."""
     # Keep the patcher on the instance so it can be stopped later.
     self.patcher = patcher = patch('client.requests.get', fake_get)
     patcher.start()
     self.client = DiffbotClient()
Ejemplo n.º 9
0
from client import DiffbotClient, DiffbotCrawl
from config import API_TOKEN
import pprint
import time

# Demo 1: article API with the default field set.
print(
    "Calling article API endpoint on the url: http://shichuan.github.io/javascript-patterns/...\n"
)
diffbot = DiffbotClient()
token = API_TOKEN
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api)
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
# pprint() writes the formatted value itself and returns None; wrapping it
# in print() would emit an extra "None" line.
pp.pprint(response)

print()
# Demo 2: same request, restricted to the 'title' and 'type' fields.
print(
    "Calling article API endpoint with fields specified on the url: http://shichuan.github.io/javascript-patterns/...\n"
)
diffbot = DiffbotClient()
token = API_TOKEN
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api, fields=['title', 'type'])
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(response)

print()
Ejemplo n.º 10
0
 def setUp(self):
     """Replace ``client.requests.get`` with ``fake_get``, then build the client."""
     get_patch = patch('client.requests.get', fake_get)
     # Stored on the instance so the patch can be stopped later.
     self.patcher = get_patch
     get_patch.start()
     self.client = DiffbotClient()
Ejemplo n.º 11
0
from client import DiffbotClient,DiffbotCrawl
from config import API_TOKEN
import pprint
import time

# Demo 1: article API with the default field set.
# Single-argument parenthesised prints give the same output under Python 2
# and are valid Python 3.
print("Calling article API endpoint on the url: http://shichuan.github.io/javascript-patterns/...\n")
diffbot = DiffbotClient()
token = API_TOKEN
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api)
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
# pprint() writes the formatted value itself and returns None; wrapping it
# in print would emit an extra "None" line.
pp.pprint(response)

# Blank separator line; print("") behaves the same on Python 2 and 3.
print("")
# Demo 2: same request, restricted to the 'title' and 'type' fields.
print("Calling article API endpoint with fields specified on the url: http://shichuan.github.io/javascript-patterns/...\n")
diffbot = DiffbotClient()
token = API_TOKEN
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api, fields=['title', 'type'])
print("\nPrinting response:\n")
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(response)

print("")
# Demo 3: frontpage API setup.
print("Calling frontpage API endpoint on the url: http://www.huffingtonpost.com/...\n")
diffbot = DiffbotClient()
token = API_TOKEN
url = "http://www.huffingtonpost.com/"