def get_query(self, search_term):
    """
    Build a query object for querying Webhose using the SDK.
    :param search_term: search term from the app home page
    :return: Webhose Query object
    """
    site_types = ['news']
    webhose.config(token='d72a236b-f6d9-4c52-b2d1-cceb9734579a')
    q = webhose.Query()
    # Plain attribute assignment is equivalent to calling __setattr__ directly
    # and reads more idiomatically.
    q.body_text = search_term
    q.language = "english"
    q.site_type = site_types
    print(q.query_string())
    return q
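
For context, a minimal sketch of how the returned Query object would be consumed, assuming the classic webhose SDK used throughout these examples (the call site and search term are hypothetical):

q = self.get_query('bitcoin')    # hypothetical search term
response = webhose.search(q)     # the SDK accepts Query objects directly
for post in response.posts:
    print(post.title)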
    def test_next(self):
        """
        check that if we use the 'since' parameter from one query, that we
        don't get any overlap
        """

        # run a "regular" query
        webhose.config(os.environ['WEBHOSE_TOKEN'])
        query = webhose.Query()
        query.some_terms = ('boston', 'red sox')
        query.language = 'english'
        query.site_type = 'news'

        response = webhose.search(query)

        # grab the pagination cursor and the last post's crawl time for comparison
        next_ts = response.next_ts
        last_post_crawled = response.posts[99].crawled_parsed

        # now run our second query, resuming from the saved cursor
        response = webhose.search(query, since=next_ts)

        self.assertGreater(response.posts[99].crawled_parsed,
                           last_post_crawled)
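
The same since/next_ts handshake generalizes to a polling loop. A minimal sketch, assuming the same SDK and reusing the query object from above (handle() is a hypothetical processing hook):

import time

since = None
while True:
    response = webhose.search(query, since=since)
    for post in response.posts:
        handle(post)                 # hypothetical per-post processing
    since = response.next_ts         # resume strictly after the last crawl
    time.sleep(300)                  # e.g. poll every five minutes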
def fullretrievew(terms, stage1):
    # Initiate a Webhose search and dump every post to the stage-1 file.
    webhose.config(token='b88b78c1-0dac-4793-913e-7d20e0559144')
    response = webhose.search(terms)  # renamed from `re`, which shadows the re module

    with open(stage1, 'w') as s1:
        i = 0
        for post in response:
            i += 1
            # Each field can be missing or fail to encode, so fall back to an
            # empty marker rather than aborting the whole dump.
            try:
                s1.write("-=-=-ARTICLE " + str(i) + ": " + post.thread.url.encode('utf-8') + "\n")
            except Exception:
                s1.write("-=-=-ARTICLE " + str(i) + "\n")
            try:
                s1.write("-=-=-TITLE: " + post.thread.title_full.encode('utf-8') + "\n")
            except Exception:
                s1.write("-=-=-TITLE: \n")
            s1.write("-=-=-SNIPPET: \n")
            try:
                s1.write("-=-=-TEXT: " + post.text.encode('utf-8') + "\n")
            except Exception:
                s1.write("-=-=-TEXT: \n")
    return
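
A matching reader for the stage-1 dump can split on the ARTICLE marker. A minimal sketch, assuming the delimiters written above (read_stage1 is a hypothetical helper):

def read_stage1(stage1):
    with open(stage1) as s1:
        raw = s1.read()
    # drop the empty leading chunk produced by the first marker
    return [chunk for chunk in raw.split("-=-=-ARTICLE ") if chunk]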
Example #6
import unirest
import time
import webhose
from elasticsearch import Elasticsearch
import json
import cnfg
import pandas as pd
from nltk.tokenize import sent_tokenize
from vaderSentiment.vaderSentiment import sentiment as vaderSentiment
import datetime

config = cnfg.load("/home/ubuntu/dfsharp/.webhoser_config")
tok = config["token"]
webhose.config(token=tok)

# In[3]:


def get_archive():
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    unix_time = yesterday.strftime("%s")  # "%s" (epoch seconds) is platform-specific
    # get response from the webhose query
    # response = unirest.get("https://webhose.io/search?token=" + tok + "&format=json&q=NBA+DFS",
    response = unirest.get(
        "https://webhose.io/search?token=" + tok + "&format=json&q=MLB%20DFS",
        # response = unirest.get("https://webhose.io/search?token=" + tok + "&format=json&q=MLB+DFS+ts="+unix_time,
        headers={"Accept": "application/json"})

    return response

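A sketch of unpacking the JSON body that get_archive returns, assuming the standard Webhose REST response shape with a top-level "posts" array (unirest parses JSON bodies into response.body):

archive = get_archive()
for p in archive.body.get("posts", []):
    print(p["thread"]["site_full"], p["thread"]["title"])
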
Example #7
from __future__ import division

import os
import re
import time
import webhose

# Initializing webhose SDK with our private TOKEN
API_TOKEN = 'YOUR_WEBHOSE_API_TOKEN'
webhose.config(API_TOKEN)

# Setting the relative location of the train/test files
resources_dir = './src/main/resources'


def collect(filename, query, limit, sentiment, partition):
    lines = set()

    # Collect the data from webhose.io with the given query up to the given limit
    response = webhose.search(query)

    while len(response.posts) > 0 and len(lines) < limit:
        # Go over the list of posts returned from the response
        for post in response.posts:
            # Verify that the text is neither too short nor too long
            if 1000 > len(post.text) > 50:
                # Extract the text from the post object and clean it
                text = re.sub(r'(\([^\)]+\)|(stars|rating)\s*:\s*\S+)\s*$', '',
                              post.text.replace('\n', '').replace('\t', ''), 0,
                              re.I)
                # add the post text to the lines we will save in the train/test file
                lines.add(text)
Example #8
!pip install webhose

import os
import webhose

webhose.config(token=os.environ["WEBHOSE_KEY"])

## Just make a call
#posts = webhose.search("Obama performance_score:>8")


q = webhose.Query()
#q.some_terms = ['"big data"', '"machine learning"']
#q.title = '"big data" OR "machine learning"'
q.phrase = '"data science" OR "machine learning"'
print(q.query_string())

results = webhose.search(q.query_string() + ' performance_score:>1')

for post in results.posts:
    score = post.thread.performance_score
    if score > 0:
        print(post.title + ' by ' + post.thread.site + ' with score ' + str(score))
        #print(post.thread.main_image)
Example #9
import webhose

webhose.config(token="ce676c6c-02c7-47f4-a4e3-6f332774a976")
for post in webhose.search("github"):
    print(post.title)
Example #10
import unirest
import time
import webhose
from elasticsearch import Elasticsearch
import json
import cnfg
import pandas as pd
from nltk.tokenize import sent_tokenize
from vaderSentiment.vaderSentiment import sentiment as vaderSentiment


config = cnfg.load("/home/ubuntu/dfsharp/.webhoser_config")
tok = config["token"]
webhose.config(token=tok)

# In[3]:


def get_archive():
    # get response from the webhose query "NBA DFS"
    response = unirest.get(
        "https://webhose.io/search?token=" + tok + "&format=json&q=NBA+DFS",
        headers={"Accept": "application/json"})

    return response


# In[4]:
import time, os
import constants as ct
"""
import unirest

response = unirest.get("https://webhose.io/search?token=f9cf7cbd-5c93-4672-8cb0-f6da249d1808&format=json&q=amazon%20OR%20AMZN&sort=relevancy&ts=1478463348741",
    headers={
    "Accept": "text/plain"
    }
)

print response.body
"""

import webhose
webhose.config(token='f9cf7cbd-5c93-4672-8cb0-f6da249d1808')

company_list = ["PayPal"]
news_content = {}
for org in company_list:
    r = webhose.search(org)
    news_content[org] = {}
    articleNo = 1
    while True:
        for post in r:
            # only keep English posts published in 2016; create the dict entry
            # and advance the counter only for posts that pass the filter
            if post.language == 'english' and post.published[:4] == '2016':
                timestamp = post.published[:10] + post.published[11:19]
                news_content[org][articleNo] = {
                    timestamp: [post.title, post.text]
                }
                articleNo += 1
Example #12
def news_search(keyword):
    webhose.config(token="97029546-2c7f-4116-a16d-e88dd66f09c2")
    r = webhose.search(keyword)
    # print the first matching post's title
    print(r.posts[0].title)
Example #13
import sys
import datetime
import re
import random
import jwt
import gspread
import webhose
from urlparse import urlparse
from os.path import splitext
from oauth2client.client import SignedJwtAssertionCredentials
from flask import Blueprint
#import ast


reload(sys)
sys.setdefaultencoding('utf8')

webhose.config(token="b307005e-a773-4710-9aa8-98db353af657")

mod_search = Blueprint('search', __name__, url_prefix='')
mod_search_res = Blueprint('search_res', __name__, url_prefix='')
mod_annotate = Blueprint('annotate', __name__, url_prefix='')
mod_annotate2 = Blueprint('annotate2', __name__, url_prefix='')
mod_annotate3 = Blueprint('annotate3', __name__, url_prefix='')
mod_proxy = Blueprint('proxy', __name__, url_prefix='')
mod_js = Blueprint('js', __name__, url_prefix='')
mod_img = Blueprint('img', __name__, url_prefix='')
mod_login = Blueprint('login', __name__, url_prefix='')
mod_create_profile = Blueprint('create_profile', __name__, url_prefix='')
mod_profile = Blueprint('profile', __name__, url_prefix='')
mod_logout = Blueprint('logout', __name__, url_prefix='')
mod_index = Blueprint('index', __name__, url_prefix='')
mod_generate_token = Blueprint('generate_token', __name__, url_prefix='')
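
These blueprints still have to be registered on an application object. A minimal sketch, assuming a standard Flask app (the app variable is hypothetical here):

from flask import Flask

app = Flask(__name__)
app.register_blueprint(mod_search)
app.register_blueprint(mod_login)
# ...and likewise for the remaining blueprints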
Example #14
# Loading Developer Libraries
import webhose
import requests
import os
import json
from flask import Flask, request

app = Flask(__name__)

# Loading Developer Key for Webhose

api_key = os.environ.get('KEY')
webhose.config(token=api_key)


def search_engine():

    search_input = input("Enter what you are searching for:\n")
    num_searches = input("How many search results would you like:\n")
    # Let requests URL-encode the query string instead of concatenating it raw.
    response = requests.get("https://webhose.io/search",
                            params={"token": api_key,
                                    "format": "json",
                                    "q": search_input})

    query = response.json()
    for i in range(0, int(num_searches)):
        params = {
            'site': query['posts'][i]['thread']['site_full'],
            'text': query['posts'][i]['text'],
            'title': query['posts'][i]['thread']['title'],
            'author': query['posts'][i]['author']
        }
Example #16
from flask import Flask, render_template
import webhose
from datetime import datetime

app = Flask(__name__)


@app.route('/')
def index(name=None):
    return render_template('index.html', name=name)


webhose.config(token='46e002b6-73c5-4281-8bde-c1791802ff5f')

query = '(transgender OR "trans woman" OR "trans man") AND (killed OR murdered)'


def runQuery():
    queryResults = webhose.search(query)
    return queryResults


def find_most_recent(results):
    recent_time = datetime.strptime(results[0].published[0:10], "%Y-%m-%d")
    recent_post = results[0]

    for r in results:
        r_date = datetime.strptime(r.published[0:10], "%Y-%m-%d")
        if r_date > recent_time:
            recent_time = r_date
            recent_post = r

    return recent_post


if __name__ == '__main__':
    app.run()
Example #17
import webhose
import access_tokens
from datetime import datetime, timedelta
import ml_model
import geocoder
import collections
import risk_constants

webhose.config(token=access_tokens.webhose_access_token)

def fetch_news_by_location(location):
    geocode = geocoder.google(location)
    query = {
        'location': location,
        'language': 'english',
        'site_type': 'news',
        'thread.country': geocode.country,
    }

    # Look back three days; "%s" (epoch seconds) is a platform-specific
    # strftime extension.
    since = int((datetime.now() - timedelta(days=3)).strftime("%s"))
    headlines = [x.title for x in webhose.search(query=query, since=since)]
    return headlines


def get_crisis_risk_from_news_by_location(location):
    print('STATUS : Analyzing local news headlines ...')
    headlines = fetch_news_by_location(location)
    if len(headlines):
        informative_news_clf = ml_model.tweet_clf_extra.predict(headlines)
        informative_news = [headlines[x] for x in range(len(headlines))
                            if informative_news_clf[x] == 'Related and informative']
        crisis_news_clf = ml_model.tweet_clf.predict(informative_news)
        crisis_news = [informative_news[x] for x in range(len(informative_news))
                       if crisis_news_clf[x] == 'on-topic']
Example #18
import webhose

webhose.config(token="35699326-6aec-4b1e-8aa4-a0794ba56819")
r = webhose.search("python java")
# print the titles of the first 20 results
for i in range(20):
    print(r.posts[i].title)
Example #19
import webhose
import time
from datetime import datetime, timedelta
from lxml import html
import requests
import unirest

webhose.config(token='c6052904-f312-436b-a6d8-d915084ac866')

days_back = 30
date_days_ago = datetime.now() - timedelta(days=days_back)

organization = 'honeywell'
lang = 'english'
country = 'US'

#set API Token
apiToken = 'c6052904-f312-436b-a6d8-d915084ac866'

# Build URL
#queryURL = 'https://webhose.io/search?token=' + apiToken + '&format=json&q=' + sentiment + '%3A%22' + organization + '%22&ts=1478565932339'

### UPDATE YOUR END POINT HERE - Honeywell Positive
response = unirest.get(
    "https://webhose.io/search?token=c6052904-f312-436b-a6d8-d915084ac866&format=json&q=language%3A(english)%20thread.country%3AUS%20organization.positive%3A%22Honeywell%22&ts=1478579458386",
    headers={"Accept": "text/plain"})

count = 1
results = response.body["totalResults"]

import webhose, operator, time, Queue, json, httplib, urllib, urllib2
from difflib import SequenceMatcher
from datetime import datetime, timedelta

# get your free access token from Webhose.io
webhose.config(token="XXXX-XXXXX-XXX-XXXX-XXXX")

dup_check = set()
image_cache = {}
person_queue = Queue.Queue(100)


# Return a score about how similar two strings are
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()
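
As a usage note, SequenceMatcher ratios fall in [0, 1], and near-identical names score close to 1 (values are illustrative):

similar("Barack Obama", "Barak Obama")    # roughly 0.96
similar("Barack Obama", "Tim Cook")       # far lower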


# Query Bing Image Search API for the facial image of a person
def get_image(search_string):

    if search_string in image_cache:
        return image_cache[search_string]

    headers = {
        # Request headers
        'Content-Type': 'multipart/form-data',
        'Ocp-Apim-Subscription-Key': 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX', # get your key from Bing API
    }

    try:
        params = urllib.urlencode({"q":'"' + search_string + '"', "count":10,"offset":0,"mkt":"en-us", "size":"small", "imageType":"Photo","imageContent":"Face"})