def get_query(self, search_term):
    """
    Build a Query object for querying Webhose using the SDK.

    :param search_term: search term from the app home page
    :return: webhose Query object
    """
    site_types = ["news"]
    webhose.config(token="d72a236b-f6d9-4c52-b2d1-cceb9734579a")
    q = webhose.Query()
    # plain attribute assignment is the idiomatic form of the original __setattr__ calls
    q.body_text = search_term
    q.language = "english"
    q.site_type = site_types
    print q.query_string()
    return q
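# Usage sketch (illustrative, not part of the original source): the Query
# object returned above can be passed straight to webhose.search().
# 'view' is a hypothetical instance of the class that owns get_query().
query = view.get_query("machine learning")
for post in webhose.search(query).posts:
    print post.title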
def test_next(self):
    """
    Check that if we use the 'since' parameter from one query,
    we don't get any overlap.
    """
    # run a "regular" query
    webhose.config(os.environ['WEBHOSE_TOKEN'])
    query = webhose.Query()
    query.some_terms = ('boston', 'red sox')
    query.language = 'english'
    query.site_type = 'news'
    response = webhose.search(query)

    # grab some values that we need for testing
    next_ts = response.next_ts
    last_post_crawled = response.posts[99].crawled_parsed

    # now run our second query, resuming from the saved timestamp
    response = webhose.search(query, since=next_ts)
    self.assertGreater(response.posts[99].crawled_parsed, last_post_crawled)
import webhose

def fullretrievew(terms, stage1):
    # Initiate a Webhose search and write every result to the stage-1 file
    webhose.config(token='b88b78c1-0dac-4793-913e-7d20e0559144')
    response = webhose.search(terms)
    open(stage1, 'w').close()  # truncate any previous contents
    s1 = open(stage1, "a")
    i = 0
    for post in response:
        i = i + 1
        try:
            s1.write("-=-=-ARTICLE " + str(i) + ": " + post.thread.url.encode('utf-8') + "\n")
        except Exception:
            s1.write("-=-=-ARTICLE " + str(i) + "\n")
        try:
            s1.write("-=-=-TITLE: " + post.thread.title_full.encode('utf-8') + "\n")
        except Exception:
            s1.write("-=-=-TITLE: \n")
        s1.write("-=-=-SNIPPET: \n")
        try:
            s1.write("-=-=-TEXT: " + post.text.encode('utf-8') + "\n")
        except Exception:
            s1.write("-=-=-TEXT: \n")
    return
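# Usage sketch (illustrative; the query phrase and filename are assumptions,
# not from the original source): fetch articles and stage them to disk.
fullretrievew('"climate change"', 'stage1.txt')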
import unirest
import time
import webhose
from elasticsearch import Elasticsearch
import json
import cnfg
import pandas as pd
from nltk.tokenize import sent_tokenize
from vaderSentiment.vaderSentiment import sentiment as vaderSentiment
import datetime

config = cnfg.load("/home/ubuntu/dfsharp/.webhoser_config")
tok = config["token"]
webhose.config(token=tok)

# In[3]:

def get_archive():
    yesterday = datetime.date.today() - datetime.timedelta(hours=24)
    unix_time = yesterday.strftime("%s")  # note: "%s" is platform-dependent (works on Linux)
    # get response from webhose query
    # response = unirest.get("https://webhose.io/search?token=" + tok + "&format=json&q=NBA+DFS",
    response = unirest.get(
        "https://webhose.io/search?token=" + tok + "&format=json&q=MLB%20DFS",
        # response = unirest.get("https://webhose.io/search?token=" + tok + "&format=json&q=MLB+DFS+ts=" + unix_time,
        headers={"Accept": "application/json"})
    return (response)
from __future__ import division
import os
import re
import time
import webhose

# Initializing webhose SDK with our private TOKEN
API_TOKEN = 'YOUR_WEBHOSE_API_TOKEN'
webhose.config(API_TOKEN)

# Setting the relative location of the train/test files
resources_dir = './src/main/resources'

def collect(filename, query, limit, sentiment, partition):
    lines = set()
    # Collect the data from webhose.io with the given query up to the given limit
    response = webhose.search(query)
    while len(response.posts) > 0 and len(lines) < limit:
        # Go over the list of posts returned from the response
        for post in response.posts:
            # Verify that the length of the text is not too short nor too long
            if 1000 > len(post.text) > 50:
                # Extracting the text from the post object and clean it
                text = re.sub(r'(\([^\)]+\)|(stars|rating)\s*:\s*\S+)\s*$', '',
                              post.text.replace('\n', '').replace('\t', ''), 0, re.I)
                # add the post-text to the lines we are going to save in the train/test file
                lines.add(text)
        # reconstructed from the truncated original: fetch the next page so the loop advances
        response = response.get_next()
!pip install webhose
import os
import webhose

webhose.config(token=os.environ["WEBHOSE_KEY"])

## Just make a call
# posts = webhose.search("Obama performance_score:>8")
q = webhose.Query()
# q.some_terms = ['"big data"', '"machine learning"']
# q.title = '"big data" OR "machine learning"'
q.phrase = '"data science" OR "machine learning"'
print q.query_string()

results = webhose.search(q.query_string() + ' performance_score:>1')
for post in results.posts:
    score = post.thread.performance_score
    if (score > 0):
        print(post.title + ' by ' + post.thread.site + ' with score ' + str(score))
        # print(post.thread.main_image)
import webhose webhose.config(token="ce676c6c-02c7-47f4-a4e3-6f332774a976") for post in webhose.search("github"): print(post.title)
import unirest
import time
import webhose
from elasticsearch import Elasticsearch
import json
import cnfg
import pandas as pd
from nltk.tokenize import sent_tokenize
from vaderSentiment.vaderSentiment import sentiment as vaderSentiment

config = cnfg.load("/home/ubuntu/dfsharp/.webhoser_config")
tok = config["token"]
webhose.config(token=tok)

# In[3]:

def get_archive():
    # get response from webhose query "NBA DFS"
    response = unirest.get("https://webhose.io/search?token=" + tok + "&format=json&q=NBA+DFS",
                           headers={"Accept": "application/json"})
    return (response)

# In[4]:
import time, os
import constants as ct

"""
import unirest
response = unirest.get("https://webhose.io/search?token=f9cf7cbd-5c93-4672-8cb0-f6da249d1808&format=json&q=amazon%20OR%20AMZN&sort=relevancy&ts=1478463348741",
    headers={"Accept": "text/plain"})
print response.body
"""

import webhose

webhose.config(token='f9cf7cbd-5c93-4672-8cb0-f6da249d1808')

company_list = ["PayPal"]
news_content = {}

for org in company_list:
    r = webhose.search(org)
    news_content[org] = {}
    articleNo = 1
    while True:
        for post in r:
            news_content[org][articleNo] = {}
            if post.language == 'english' and post.published[:4] == '2016':
                timestamp = post.published[:10] + post.published[11:19]
                news_content[org][articleNo][timestamp] = [post.title, post.text]
def news_search(keyword):
    webhose.config(token="97029546-2c7f-4116-a16d-e88dd66f09c2")
    r = webhose.search(keyword)
    for i in range(1):  # print only the first result
        print(r.posts[i].title)
# Loading Developer Libraries
import webhose
import requests
import os
import json
from flask import Flask, request

app = Flask(__name__)

# Loading Developer Key for Webhose
api_key = os.environ.get('KEY')
webhose.config(token=api_key)

def search_engine():
    search_input = input("Enter what you are searching for:\n")
    num_searches = input("How many search results would you like:\n")
    response = requests.get("https://webhose.io/search?token=" + api_key +
                            "&format=json&q=" + search_input)
    query = response.json()
    for i in range(0, int(num_searches)):
        params = {
            'site': query['posts'][i]['thread']['site_full'],
            'text': query['posts'][i]['text'],
            'title': query['posts'][i]['thread']['title'],
            'author': query['posts'][i]['author'],
        }
import sys
import proxypy
from urlparse import urlparse
import datetime
import jwt
import random
import gspread
from oauth2client.client import SignedJwtAssertionCredentials
import re
from os.path import splitext
from flask import Blueprint  # needed for the blueprint definitions below
import webhose
# import ast

reload(sys)
sys.setdefaultencoding('utf8')

webhose.config(token="b307005e-a773-4710-9aa8-98db353af657")

mod_search = Blueprint('search', __name__, url_prefix='')
mod_search_res = Blueprint('search_res', __name__, url_prefix='')
mod_annotate = Blueprint('annotate', __name__, url_prefix='')
mod_annotate2 = Blueprint('annotate2', __name__, url_prefix='')
mod_annotate3 = Blueprint('annotate3', __name__, url_prefix='')
mod_proxy = Blueprint('proxy', __name__, url_prefix='')
mod_js = Blueprint('js', __name__, url_prefix='')
mod_img = Blueprint('img', __name__, url_prefix='')
mod_login = Blueprint('login', __name__, url_prefix='')
mod_create_profile = Blueprint('create_profile', __name__, url_prefix='')
mod_profile = Blueprint('profile', __name__, url_prefix='')
mod_logout = Blueprint('logout', __name__, url_prefix='')
mod_index = Blueprint('index', __name__, url_prefix='')
mod_generate_token = Blueprint('generate_token', __name__, url_prefix='')
from flask import Flask, render_template
import webhose
from datetime import datetime

app = Flask(__name__)

@app.route('/')
def index(name=None):
    return render_template('index.html', name=name)

webhose.config(token='46e002b6-73c5-4281-8bde-c1791802ff5f')

query = '(transgender OR "trans woman" OR "trans man") AND (killed OR murdered)'

def runQuery():
    queryResults = webhose.search(str(query))
    return queryResults

def find_most_recent(results):
    recent_time = datetime.strptime(results[0].published[0:10], "%Y-%m-%d")
    recent_post = results[0]
    for r in results:
        r_date = datetime.strptime(r.published[0:10], "%Y-%m-%d")
        # reconstructed from the truncated original: keep the newest post
        if r_date > recent_time:
            recent_time = r_date
            recent_post = r
    return recent_post

# moved to the end so the definitions above execute before the server starts
if __name__ == '__main__':
    app.run()
import webhose
import access_tokens
from datetime import datetime, timedelta
import ml_model
import geocoder
import collections
import risk_constants

webhose.config(token=access_tokens.webhose_access_token)

def fetch_news_by_location(location):
    geocode = geocoder.google(location)
    query = {
        'location': location,
        'language': 'english',
        'site_type': 'news',
        'thread.country': geocode.country,
    }
    headlines = [x.title for x in webhose.search(
        query=query,
        since=int((datetime.now() - timedelta(days=3)).strftime("%s")))]
    return headlines

def get_crisis_risk_from_news_by_location(location):
    print('STATUS : Analyzing local news headlines ...')
    headlines = fetch_news_by_location(location)
    if len(headlines):
        informative_news_clf = ml_model.tweet_clf_extra.predict(headlines)
        informative_news = [headlines[x] for x in range(len(headlines))
                            if informative_news_clf[x] == 'Related and informative']
        crisis_news_clf = ml_model.tweet_clf.predict(informative_news)
        crisis_news = [informative_news[x] for x in range(len(informative_news))
                       if crisis_news_clf[x] == 'on-topic']
import webhose

webhose.config(token="35699326-6aec-4b1e-8aa4-a0794ba56819")
r = webhose.search("python java")
for i in xrange(0, 20):
    print r.posts[i].title
    # print "\n"
import webhose
import time
from datetime import datetime, timedelta
from lxml import html
import requests
import unirest

webhose.config(token='c6052904-f312-436b-a6d8-d915084ac866')

days_back = 30
date_days_ago = datetime.now() - timedelta(days=days_back)

organization = 'honeywell'
lang = 'english'
country = 'US'

# set API Token
apiToken = 'c6052904-f312-436b-a6d8-d915084ac866'

# Build URL
# queryURL = 'https://webhose.io/search?token=' + apiToken + '&format=json&q=' + sentiment + '%3A%22' + organization + '%22&ts=1478565932339'

### UPDATE YOUR END POINT HERE - Amazon Positive
response = unirest.get("https://webhose.io/search?token=c6052904-f312-436b-a6d8-d915084ac866&format=json&q=language%3A(english)%20thread.country%3AUS%20organization.positive%3A%22Honeywell%22&ts=1478579458386",
                       headers={"Accept": "text/plain"})

count = 1
results = response.body["totalResults"]
import webhose, operator, time, Queue, json, httplib, urllib, urllib2
from difflib import SequenceMatcher
from datetime import datetime, timedelta

# get your free access token from Webhose.io
webhose.config(token="XXXX-XXXXX-XXX-XXXX-XXXX")

dup_check = set()
image_cache = {}
person_queue = Queue.Queue(100)

# Return a score about how similar two strings are
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

# Query Bing Image Search API for the facial image of a person
def get_image(search_string):
    if search_string in image_cache:
        return image_cache[search_string]
    headers = {
        # Request headers
        'Content-Type': 'multipart/form-data',
        'Ocp-Apim-Subscription-Key': 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',  # get your key from Bing API
    }
    try:
        params = urllib.urlencode({"q": '"' + search_string + '"', "count": 10, "offset": 0,
                                   "mkt": "en-us", "size": "small",
                                   "imageType": "Photo", "imageContent": "Face"})
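# Dedup sketch (illustrative; the 0.8 threshold and 'titles' list are
# assumptions, not from the original source): similar() combined with the
# dup_check set can drop near-duplicate headlines before further processing.
titles = ["Boston wins big", "Boston Wins Big!"]
for title in titles:
    if any(similar(title, seen) > 0.8 for seen in dup_check):
        continue  # skip near-duplicates of something already seen
    dup_check.add(title)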