def get_data(place, country, lat, lng):
    print(place, country)
    query_params = {
        "q": "language:english site_type:news thread.country:{} location:{}".format(
            country, place),
        "sort": "relevancy",
        "size": 10
    }
    output = webhoseio.query("filterWebContent", query_params)
    output["extra"] = {
        "place": place,
        "country": country,
        "lat": lat,
        "lng": lng
    }
    print("posts retrieved:", len(output['posts']))
    f = open("./outs/" + place + "_" + country, "w")
    f.write(json.dumps(output))
    # print(output['posts'][0]['text'])       # Text of the first post
    # print(output['posts'][0]['published'])  # Publication date of the first post
    # Get the next batch of posts:
    # output = webhoseio.get_next()
    # print(output['posts'][0]['thread']['site'])  # Site of the first post
    time.sleep(10)
    f.close()
def index(request):
    # Create your views here.
    webhoseio.config(token="8ebd85ed-94da-4ae1-9bd2-def0554ceb64")
    time_now = datetime.datetime.now()
    time_30_days_before = time_now - datetime.timedelta(days=30)
    ts_30_days_before = time_30_days_before.timestamp()
    query_params = {
        "q": "(site:bloomberg.com OR site:reuters.com) AND ('mapletree' OR 'capitaland' OR 'Keppel')",
        "ts": ts_30_days_before,
        "sort": "published"
    }
    output = webhoseio.query("filterWebContent", query_params)
    context = {'output': output}
    return render(request, 'news/index.html', context)

# def index(request):
#     webhoseio.config(token="8ebd85ed-94da-4ae1-9bd2-def0554ceb64")
#     query_params = {
#         "q": "(site:bloomberg.com OR site:reuters.com) AND ('mapletree' OR 'capitaland' OR 'Keppel' OR 'AIMS AMP Capital' OR 'Sabana REIT')",
#         "ts": "1516537944411",
#         "sort": "crawled"
#     }
#     output = webhoseio.query("filterWebContent", query_params)
#     return JsonResponse(output)
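# A minimal wiring sketch (not from the original project) showing how a Django view
# such as index() above is typically exposed via the app's urls.py. The app and
# module names ('news', 'views') are assumptions for illustration only.
from django.urls import path
from . import views

urlpatterns = [
    path('', views.index, name='index'),  # renders news/index.html with the Webhose output
]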
def get_pages_into_json(domain, n=1):
    # Note: the 'domain' parameter is not currently used in the query below.
    num_pages = n
    webhoseio.config(token="a64af0cc-bb64-44dd-a56d-d1d1e06b287e")
    query_params = {
        "q": "language:english",
        "ts": "1512637551646",
        "sort": "crawled"
    }
    output = webhoseio.query("filterWebContent", query_params)
    newpath = file_path + '/{}'.format('20171214')
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    with open(newpath + '/data_1.json', 'w') as outfile:
        json.dump(output, outfile)
    for p in range(2, num_pages + 1):
        output = webhoseio.get_next()
        with open(newpath + '/data_{}.json'.format(p), 'w') as outfile:
            json.dump(output, outfile)
def query(start_time, end_time, keywords, entities):
    q_str = " OR ".join(keywords)
    # "published:>" + dt_to_posix(start_time) + " published:<" + dt_to_posix(end_time) + \
    q = " domain_rank:<10000 site_type:news language:english title:(" + q_str + ")" + \
        " site_category:(business OR jobs OR financial_news OR international_news OR internet_technology OR investing OR investors_and_patents OR law_government_and_politics OR legal_issues OR national_news OR finance OR stocks OR tech)"
    params = {
        "q": q,
        "format": "json",
        "ts": str(start_time)
    }
    output = webhoseio.query("filterWebContent", params)
    n = output['totalResults']
    print("TOTAL RESULTS: " + str(n))
    print("REQUESTS REMAINING: " + str(output['requestsLeft']))
    """
    if not os.path.isdir("data/articles/" + dirname):
        os.mkdir("data/articles/" + dirname)
    json.dump(output, open("data/articles/" + dirname + "/0.json", "w"))
    """
    articles = parse_and_update(entities, output)
    for i in range(1, ceil(n / 100.0)):
        output = webhoseio.get_next()
        articles += parse_and_update(entities, output)
        # json.dump(output, open("data/articles/" + dirname + "/" + str(i) + ".json", "w"))
    return articles
def get_meta_info(self):
    if not self.meta_info:
        if self.uuid:
            resp = webhoseio.query('images', {'q': 'uuid:{}'.format(self.uuid)})
            self.meta_info = resp['imageDocs'][0]
    return self.meta_info
def get_webhose_news(ticker): webhoseio.config(token="517a4e21-e484-4eac-aa8c-f50916e8db85") query_params = { "q": """language:english thread.country:US published:>1483232461 (site_category:stocks) (WFC OR wells fargo) (site_type:blogs OR site_type:discussions) (thread.title:'WFC' OR thread.title:'wells fargo')""", "ts": "1533754757303", "sort": "published" } output = webhoseio.query("filterWebContent", query_params) lst = [ x for x in output['posts'][20]['text'].split('. ') for var in names if var in x.lower() ] if len(company_name) >= 2: var1 = company_name[0].lower() var2 = company_name[1].lower() tick = ticker.lower() names = [var1, var2, tick] else: var1 = company_name[0].lower() tick = ticker.lower() names = [var1, tick] barrons_news = [[date, text] for date, text in lst for var in names if var in text]
def retrieve_topmost_article_new(self, stock_name, num_sentences):
    """
    Retrieves the topmost article about the stock, but solely through the usage of
    the webhose API. This does not involve checking the database for an already
    existing article for the stock.

    :param stock_name: stock name.
    :param num_sentences: number of sentences to return for summary.
    :return: a StockArticle object.
    """
    webhoseio.config(token=webhose_api_key)
    filters = {
        'language': 'english',
        'text': stock_name,
        'site_type': 'news',
        'site_category': 'finance',
        'thread.title': stock_name
    }
    query_result = webhoseio.query('filterWebContent', filters)
    stock_posts = query_result['posts']
    if len(stock_posts) == 0:
        return None
    article_url = stock_posts[0].get('url')
    article_text = stock_posts[0].get('text')
    article_summary = self.summarize_article(article_text, num_sentences)
    return StockArticle(stock_name, article_url, article_summary)
def get_results(self):
    # Other parameters to try: domain_rank:<1000, "ts": ts
    # ts = self.calculate_timestamp()
    self.output = webhoseio.query("filterWebData",
                                  {"q": self.webhoseio_query, "latest": "true"})
    # If fewer articles were retrieved than requested, lower the requested count.
    if len(self.output['posts']) < self.number_of_articles_to_extract:
        self.number_of_articles_to_extract = len(self.output['posts'])
def __init__(self):
    webhoseio.config(token='')
    self.params = {'q': 'site.type:news', 'from': 0}
    results = webhoseio.query('nseFilter', self.params)
    self.total_results = results['totalResults']
    self.page = 0
    self.batches = max(self.total_results // 10, 10)
    self.news_batch = results['docs']
def get_results(query):
    config()
    final_output = []
    output = webhoseio.query("filterWebContent", {"q": query})
    for post in output['posts']:
        final_output.append({
            'title': post['title'],
            'text': post['text'][0:200],
            'url': post['url']
        })
    return final_output
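# A hypothetical usage sketch for get_results() above, assuming the snippet's
# config() helper sets a valid Webhose token; the query string is illustrative only.
if __name__ == '__main__':
    for item in get_results("bitcoin language:english"):
        print(item['title'], '->', item['url'])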
def getWebHoseData(location):
    webhoseio.config(token="b99dbdf5-caac-4a2c-886a-fb8f37f365a0")
    query_params = {
        "q": "performance_score:>7 location:" + location,
        "ts": "1506110156153",
        "sort": "crawled"
    }
    output = webhoseio.query("filterWebContent", query_params)
    totalWebHose = len(output['posts'])
    return totalWebHose
def product_page(request, product_id, product_brand):
    if 'user_id' not in request.session:
        return redirect(reverse("userLG:login"))
    product_list = {}
    if 'product_id' not in request.session:
        print("product_id initialized <<<<<<<-------")
        request.session['product_id'] = None
    if request.session['product_id'] != str(product_id) or 'product' not in request.session:
        request.session['product_id'] = product_id
        print("data from request <<<<<-------")
        query_params = {"q": "product_id: " + product_id, 'size': '1'}
        output = webhoseio.query("productFilter", query_params)
        product_list = {
            'product_name': output['products'][0]['name'],
            'product_brand': output['products'][0]['brand'],
            'product_price': output['products'][0]['price'],
            'product_image': output['products'][0]['images'][0],
            'product_description': output['products'][0]['description']
        }
        request.session['product'] = product_list
        # Get the next batch of products
        output = webhoseio.get_next()
        # Change the brand filter in the session
        request.session['productInfo']['brand'] = product_brand
        request.session.modified = True
        suggestion_list = sendingRequest(
            request,
            catergories=request.session['productInfo']['categories'],
            brand=product_brand,
            product_id=product_id)
        request.session["suggested_product"] = suggestion_list
    else:
        print("data from session <<<<<-------")
        product_list = request.session['product']
        suggestion_list = request.session["suggested_product"]
    itemsInCart = Cart.objects.all().count()
    return render(
        request, "ecommerce/productPage.html", {
            'product_list': product_list,
            'suggested_product': suggestion_list,
            'itemsInCart': itemsInCart
        })
def sendingRequest(request, brand='nike', catergories="sport shirt", price_range=50, product_id=None): product_list = [] if product_id is not None: print("There is a product Id <<<<<<----------") query_params = { "q": "name:(" + catergories + ") brand:" + brand + " ", 'size': '5' } else: print( "Products more diverse, because, not requesting by product_id <<<<<<----------" ) query_params = { "q": "name:(" + catergories + ") price: <" + str(price_range) + " brand:" + brand + " ", "size": "25" } try: output = webhoseio.query("productFilter", query_params) except IndexError: print("Not found <<<<<<<<<<----------") for key, value in output.items(): if key == 'products': for index in value: if len(index['images']) < 1: continue else: product_list.append({ 'product_price': index['price'], 'product_image': index['images'][0], 'product_id': index['product_id'], 'product_brand': index['brand'] }) # Get the next batch of products output = webhoseio.get_next() if len(product_list) < 1: return HttpResponse( "<h4 class='text-center text-white bg-dark p-3 mt-5 shadow'>Items Not Found!!</h4>" ) return (product_list)
def request(self, category):
    # yesterday = datetime.date.today() - datetime.timedelta(days=1)
    # yesterday_string = yesterday.strftime("%s")
    # print(yesterday_string)
    q = "language:(english) thread.country:US performance_score:>4 (site_type:news) site_category:" + category
    # self.output = webhoseio.query("filterWebData",
    #     {"q": q, "sort": "performance_score", "latest": "true"})  # "sort": "relevancy",
    self.output = webhoseio.query("filterWebData", {
        "q": q,
        "sort": "social.facebook.likes",
        "latest": "true"
    })
    return self.output
def undependant():
    query = input('what are you searching for ?')
    threads = input('how many results you want ?')
    with open('tango_django/search.key', 'r') as f:
        key = f.readline().strip()  # drop the trailing newline from the key file
    try:
        webhoseio.config(token=key)
        results = webhoseio.query("filterWebContent", {"q": query})
        for i in range(int(threads)):  # 'threads' is read as a string, so convert it
            print(results['posts'][i]['text'])
        # count = 0
        # for post in results['posts'][:10]:
        #     count += 1
        #     print(f'result number {count} \n {post["text"]}')
    except KeyError as err:
        print(f'ooopsie :{err}')
def get_headlines(search_term, site):
    query_params = {
        "q": search_term + " site:" + site + ".com language:english",
        "sort": "published"
    }
    output = webhoseio.query("filterWebContent", query_params)
    print('[-] creating ' + site + '_output.txt')
    file = open(site + '_output.txt', 'w')
    try:
        for x in range(100):
            file.write(output['posts'][x]['text'])
    except IndexError:
        print('[-] Warning: less than 100 results')
    file.close()
    print('[+] operation complete')
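# A hedged usage sketch for get_headlines() above: this would write the text of up to
# 100 matching posts to bbc_output.txt. It assumes webhoseio.config() has already been
# called with a valid token; the search term, site, and placeholder key are illustrative.
import webhoseio

webhoseio.config(token="YOUR_API_KEY")  # placeholder, substitute a real token
get_headlines("climate change", "bbc")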
def find_related_articles(df):
    n_keywords = 5
    articles = []
    # Guard on n_keywords so the loop cannot run forever once every keyword count has been tried.
    while len(articles) < 10 and n_keywords > 0:
        query = _build_query(df.content.iloc[:n_keywords]) + " language:english"
        query_params = {"q": query, "ts": "1580463415153", "sort": "relevancy"}
        search_results = webhoseio.query("filterWebContent", query_params)
        # search_results = newsapi.get_everything(q=query, language='en', sort_by='relevancy')
        print(len(search_results['posts']))
        for article in search_results['posts']:
            if len(articles) < 10:
                articles.append(article)
        n_keywords -= 1
    return articles
def webhoseio_search(query):
    key = read_webhoseio_key()
    results = []
    webhoseio.config(token=key)
    query_params = {'q': query + ' language:english', 'sort': 'relevancy'}
    output = webhoseio.query('filterWebContent', query_params)
    for result in output['posts']:
        results.append({
            'name': result['title'],
            'url': result['url'],
            'summary': result['published']
        })
    return results[:10]
def get_webhose_news(ticker): webhoseio.config(token="517a4e21-e484-4eac-aa8c-f50916e8db85") names = get_names(ticker) query_params = { "q": f"""language:english thread.country:US published:>1483232461 (site_category:stocks) ({ticker} OR {names[0]} OR {names[-1]}) (site_type:blogs OR site_type:discussions) (thread.title:'{ticker}' OR thread.title:'{names[0]}' OR thread.title:'{names[-1]}')""", "ts": "1533754757303", "sort": "published" } output = [x for x in webhoseio.query("filterWebContent", query_params)['posts']] lst = [[y['published'], y['text'].replace('\n',' ').lower().split('. ')] for y in output] webhose_new = [[datetime.strptime(date.split('T')[0],'%Y-%m-%d').date(), re.sub('// no comments|posted by','',text)] for date,y in lst for text in y if len(text) < 200 for var in names if var in text] return webhose_new
def getContent(query_params):
    output = webhoseio.query("filterWebContent", query_params)
    print(output)
    with open("./webhose_results.json", 'w') as outfile:
        json.dump(output, outfile, sort_keys=True)
    insertToDB(output["posts"])
    ReqNumber = 1
    while output["moreResultsAvailable"]:
        # Fetch and persist each subsequent page of results.
        output = webhoseio.get_next()
        with open("./webhose_results_" + str(ReqNumber) + ".json", 'w') as outfile:
            json.dump(output, outfile, sort_keys=True)
        insertToDB(output["posts"])
        ReqNumber = ReqNumber + 1
        if ReqNumber >= 5:
            break
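# A hypothetical invocation of getContent() above; the query string and placeholder
# token are illustrative, and insertToDB() is assumed to be defined elsewhere in the
# original project.
import webhoseio

webhoseio.config(token="YOUR_API_KEY")  # placeholder, substitute a real token
getContent({
    "q": "language:english site_type:news",
    "sort": "crawled"
})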
def main():
    global output
    qn = input('What do you want to ask?')
    tokens = word_tokenize(qn)
    Tokens = []
    for token in tokens:
        if token.lower() not in sw:
            Tokens.append(token)
    qnF = ' '.join(Tokens)
    typeSort()
    query_params = {
        "q": qnF + " language:english site_type:" + sorttype,
        "ts": "1526543100240",
        "sort": "crawled"
    }
    output = webhoseio.query("filterWebContent", query_params)
    firstPost = []
    if sorttype == "blogs":  # compare strings with ==, not 'is'
        for h in output['posts']:
            if curse in h['text']:  # skip posts containing the filtered word
                continue
            firstPost.append(h)
            printArticle()
            break
    else:
        printArticle()
    output = webhoseio.get_next()
    again = input("Do you want to hear about something else?")
    for x in again.split():
        if x.lower() in agree:
            main()
        else:
            print("Good day! See you!")
            break
def __next__(self):
    if len(self.news_batch) == 0 and self.page == self.batches:
        raise StopIteration
    if len(self.news_batch) == 0:
        self.page += 1
        self.params['from'] = self.page * 10
        results = webhoseio.query('nseFilter', self.params)
        self.news_batch = results['docs']
    news_instance = self.news_batch.pop()
    article_raw = news_instance['article']
    site_raw = news_instance['site']
    # Map the article's first recognisable category onto one of our buckets;
    # default to 'uncategorized' if no category matches (or none are present).
    category = 'uncategorized'
    for i in range(len(article_raw['categories'])):
        category_name = article_raw['categories'][i]['name'].lower()
        if 'politics' in category_name or 'election' in category_name:
            category = 'politics'
            break
        elif 'business' in category_name or 'economics' in category_name or 'money' in category_name:
            category = 'business'
            break
        elif 'entertainment' in category_name or 'culture' in category_name or 'movies' in category_name or 'games' in category_name:
            category = 'entertainment'
            break
        elif 'health' in category_name or 'covid' in category_name or 'medicine' in category_name:
            category = 'health'
            break
        elif 'technology' in category_name or 'science' in category_name or 'electronics' in category_name:
            category = 'technology'
            break
    article = Article(article_raw['text'],
                      category=category,
                      source=site_raw['name'],
                      author=article_raw['author'],
                      title=article_raw['title'],
                      url=article_raw['url'],
                      urlToImage=article_raw['media']['main_image'],
                      publishedAt=article_raw['published'],
                      viewCount=article_raw['social']['facebook']['likes'])
    article.set_summary(article_raw['summary'])
    return article
def extract_data_from_webhose():
    webhoseio.config(token="7ad89131-980e-48c3-b588-e68adb7c1be0")
    s = int(time.time()) - 500
    query_params = {
        "q": "language:english site:amazon.com site_category:shopping spam_score:>0.7",
        "ts": "{}".format(s),
        "sort": "crawled"
    }
    output = webhoseio.query("filterWebContent", query_params)
    logging.info(output)
    key = []
    reviewname = []
    productname = []
    reviewdate = []
    rating = []
    spam_score = []
    text = []
    for i in range(0, 1):
        logging.info(i)
        logging.info(output)
        key.append(i)
        reviewname.append(output['posts'][i]['author'])
        productname.append(output['posts'][i]['thread']['title'])
        reviewdate.append(output['posts'][i]['thread']['published'])
        rating.append(output['posts'][i]['thread']['rating'])
        text.append(output['posts'][i]['text'])
        spam_score.append(output['posts'][i]['thread']['spam_score'])
    df = pd.DataFrame()
    df['key'] = key
    df['reviewname'] = reviewname
    df['productname'] = productname
    df['reviewdate'] = reviewdate
    df['rating'] = rating
    df['label'] = 'fake'
    df['sourcetype'] = 'amazon'
    df['runtype'] = 'near_real_time'
    df['text'] = text
    df['snapshot_time'] = s
    df.to_gbq('webhoseDB.staging_table', 'wehosestream', if_exists='append', verbose=False)
def webhose_func():
    YOUR_API_KEY = "a161f6e5-ab51-40a1-afaf-ba13e67baefa"
    webhoseio.config(token=YOUR_API_KEY)
    print("\n")
    print("WELCOME TO WEBHOSE\n")
    search = input(
        "Input the string that you want to search for! It can be something like ipod OR ipad.\n"
        "Type in a list of strings like cow,chicken,pig to plot sentiment for those words "
        "against the stock price.\n3 TERMS ARE ENOUGH!\n"
    )
    search_terms = search.split(",")
    search_df_arr = []
    for search in search_terms:
        search += " language:english"
        sort = input(
            "\nType crawled, relevancy, rating or published for your sorting option\n"
        )
        timestamp = 1541348859918
        size = input(
            "\nWhat is the number of posts returned per request? 1 is the smallest and 100 is the biggest!\n"
        )
        query_params = {
            "accuracy_confidence": "high",
            "q": search,
            "sort": sort,
            "ts": timestamp,
            "size": size,
        }
        output = webhoseio.query("filterWebContent", query_params)
        number_of_posts = len(output['posts'])
        dates = []
        for a in range(number_of_posts):
            dates.append(output['posts'][a]['published'])
        df = pd.DataFrame(index=dates, columns=["Title"])
        for i in range(number_of_posts):
            df.iloc[i] = [output['posts'][i]['title']]
        search_df_arr.append(df)
    search_df_arr = search_df_arr + search_terms
    return search_df_arr
def main(): webhoseio.config(token="XXXXXXXXXXXXXXXXX" ) # needs to be substituted by real webhoseio token query_params = { "q": "language:english has_video:false is_first:true site_type:news site:(cnn.com OR bbc.com OR reuters.com OR nbcnews.com OR foxnews.com OR washingtonpost.com OR espn.com OR tmz.com OR sportingnews.com OR usnews.com OR wsj.com OR latimes.com OR time.com OR nydailynews.com OR economist.com OR technewsworld.com OR computerworld.com OR newsmax.com OR theatlantic.com OR hollywoodreporter.com) spam_score:0.0", "ts": "1510212713819", "sort": "crawled" } #get 1st set of articles output = webhoseio.query("filterWebContent", query_params) fl_counter = 1 while fl_counter <= 1000: fl_name = "file" + "_" + str(fl_counter) opfile = open('C:/Users/Heena/News3/' + fl_name, 'w', encoding='utf-8') #specify path to corpus folder here for post in output['posts']: uuid = post['uuid'] url = post['url'] site_full = post['thread']['site_full'] site_categories = post['thread']['site_categories'] title_full = post['thread']['title_full'] title = post['title'] published = post['published'] author = post['author'] text = post['text'] doc = document(uuid, url, site_full, site_categories, title, title_full, published, author, text) jsondata = json.dumps(doc.__dict__, sort_keys=True) opfile.write(jsondata + '\n') opfile.close() time.sleep(30) print("fl_counter = ", fl_counter) output = webhoseio.get_next() print("next = ", output['next']) fl_counter += 1
def api_df(token, site_lists, time_delta, filename):
    """
    A pipeline from the Webhose API to a DataFrame (optionally written to CSV).

    :param token: api token for Webhose API.
    :param site_lists: list of sites we need to crawl.
    :param time_delta: time window. Ex: -3 means the most recent 3 days. Can only be from -1 to -30.
    :param filename: filename of CSV.
    :return: DataFrame of the collected posts.
    """
    webhoseio.config(token=token)
    query_params = get_query(site_lists, time_delta)
    output_init = webhoseio.query("filterWebContent", query_params)
    output_flat = pd.io.json.json_normalize(output_init['posts'])
    df = output_flat[[
        'thread.uuid', 'author', 'external_links', 'published', 'text',
        'thread.site_full', 'thread.site_categories', 'thread.site_section',
        'thread.section_title', 'thread.main_image',
        'thread.social.facebook.comments', 'thread.social.facebook.likes',
        'thread.social.facebook.shares', 'title', 'url'
    ]]
    output = webhoseio.get_next()
    while len(output['posts']) > 0:
        df = output_to_df(output, df)
        try:
            output = webhoseio.get_next()
        except HTTPError:
            return df
        # df.to_csv(filename, index=False)
        if len(df) % 1000 == 0:
            print(str(len(df)) + ' has finished')
    return df
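# A hedged usage sketch for api_df() above; the token, site list, and filename are
# placeholders, and get_query()/output_to_df() are assumed to be defined elsewhere in
# the original project. The DataFrame is written to CSV explicitly here because the
# to_csv call inside api_df() is commented out.
df = api_df("YOUR_API_KEY", ["reuters.com", "bloomberg.com"], -3, "webhose_posts.csv")
df.to_csv("webhose_posts.csv", index=False)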
def related_news(keywords):
    """
    Search for related news by keywords using the Webhose.io API.
    """
    if len(keywords) >= 4:
        keywords = keywords[0:3]
    keyword_str = " ".join(keywords)
    # API key
    webhoseio.config(token="0e3f95f5-2fc7-494f-881e-e29915cc3e9a")
    query_params = {
        "q": keyword_str + " language:english site_type:news",
        "ts": "1528948373304",
        "sort": "relevancy"
    }
    resp = webhoseio.query("filterWebContent", query_params)
    posts = resp['posts']
    if len(posts) < 2:
        return None, None, True
    MAX_ARTICLES = 5  # take the first 5
    related_articles = []
    related_urls = []
    for i in range(min(MAX_ARTICLES, len(posts))):
        post = posts[i]['thread']
        related_url = {'url': post['url'], 'title': post['title']}
        related_urls.append(related_url)
        related_articles.append(post['site_full'])  # currently a redirected link
    return related_articles, related_urls, False
def scrape(query, category, start_time_str, time_diff):
    print('Start scraping data from ' + start_time_str)
    query_params = {"q": query, "sort": "crawled"}
    news_list = []
    while True:
        output = webhoseio.query("filterWebContent", query_params)
        news_list = news_list + output['posts']
        output = webhoseio.get_next()
        if len(news_list) > output['totalResults'] or len(news_list) == 0:
            break
    filename = (DATA_PATH + 'News_{0}_'.format(category) +
                str(datetime.datetime.utcnow() + time_diff).replace(' ', '_').replace(':', '_') +
                '.json')
    with open(filename, 'w') as outfile:
        json.dump(news_list, outfile)
    print('Persisted News Article at the following location: ' + filename)
    print('{0} news articles were collected.'.format(len(news_list)))
def update(self):
    crawledFrom = self.last_updated.timestamp()
    if abs(self.last_updated - self.last_modified) < timedelta(seconds=1):
        crawledFrom = (timezone.now() - timedelta(days=3)).timestamp()
    crawledFrom = int(crawledFrom * 1000)
    webhoseio.config(token='e187b1d6-59c5-4b3b-9614-1c42b3e3658e')
    output = webhoseio.query(
        "filterWebContent", {
            "q": self.query,
            "ts": crawledFrom,
            "language": "english",
            "site_type": "news",
        })
    output = output['posts']
    while True:
        temp = webhoseio.get_next()
        output += temp['posts']
        if temp['moreResultsAvailable'] <= 0:
            break
    previous_posts_uuid = []
    previous_posts_title = []
    if len(output) > 0:
        previous_posts_uuid = [post.uuid for post in Post.objects.all()]
        previous_posts_title = [post.title.lower() for post in Post.objects.all()]
    for post in output:
        if post['thread']['uuid'] in previous_posts_uuid:
            old_post = Post.objects.get(uuid=post['thread']['uuid'])
            if self not in old_post.trackers.all():
                old_post.trackers.add(self)
        elif post['thread']['title'].lower() in previous_posts_title:
            old_post = Post.objects.get(title__iexact=post['thread']['title'])
            if self not in old_post.trackers.all():
                old_post.trackers.add(self)
        else:
            try:
                new_post = Post(
                    uuid=post['thread']['uuid'],
                    url=post['thread']['url'],
                    site_full=post['thread']['site_full'],
                    site_categories=post['thread']['site_categories'],
                    title=post['thread']['title'][:1024],
                    published=post['thread']['published'],
                    site_type=post['thread']['site_type'],
                    country=post['thread']['country'],
                    main_image=post['thread']['main_image'],
                    performance_score=post['thread']['performance_score'],
                    domain_rank=post['thread']['domain_rank'],
                    author=post['author'],
                    text=post['text'],
                    language=post['language'],
                    entities=post['entities'],
                    social=post['thread']['social'],
                )
                new_post.save()
                new_post.trackers.add(self)
                previous_posts_uuid.append(post['thread']['uuid'])
                previous_posts_title.append(post['thread']['title'].lower())
            except DataError as err:
                print("Error: %s" % err)
                print(post)
    self.last_updated = timezone.now()
    self.save()
    return True
# List with urls for the diffbot API
urlList = []
if action == 'Y':
    # Configure the webhose request
    webhoseio.config(token="4057ff96-3ff1-4982-8c99-41d708f980ef")
    # query = "politics language:english thread.country:GB performance_score:>5"
    query = "Climate Change"
    query_params = {
        "q": "Climate Change",
        "ts": "1518278227788",
        "sort": "crawled"
    }
    output = webhoseio.query("filterWebContent", query_params)
    # Get the urls of the websites that matched our query/params
    # and save them to a file for verification
    outputFilename = input("Enter the name of the file which will contain the webhose urls: ")
    with open(outputFilename, 'w') as urlsOut:
        urlsOut.write("Query used: " + query + "\n\n")
        j = 0
        while output['posts']:
            i = 0
            for var in output['posts']:
                urlsOut.write(str(j) + ".\n" + output['posts'][i]['url'] + "\n")
                urlList.append(output['posts'][i]['url'])
                i += 1
                j += 1
            output = webhoseio.get_next()
#! /usr/bin/env python3
# from tinydb import TinyDB, Query
import json
import webhoseio

webhoseio.config(token='11a5bf53-12f6-440d-a84f-e42c18c7c38d')
output = webhoseio.query("filterWebContent", {
    "q": "Global Warming",
    "sort": "relevancy"
})

print("URL: " + output['posts'][0]['url'])                # URL of the first post
print("title: " + output['posts'][0]['title'])            # Title of the first post
print("published: " + output['posts'][0]['published'])    # Publication date of the first post

# Get the next batch of posts
output = webhoseio.get_next()
print("URL: " + output['posts'][0]['url'])
print("title: " + output['posts'][0]['title'])
print("published: " + output['posts'][0]['published'])

# try:
#     response = urlopen(request)
#     data = response.read()
#     parsed_json = json.loads(data)
# except URLError as e:
#     print('API call not working. Got an error code:', e)