Code Example #1
def get_data(place, country, lat, lng):
    print(place, country)
    query_params = {
        "q": "language:english site_type:news thread.country:{} location:{}".format(country, place),
        "sort": "relevancy",
        "size": 10
    }

    output = webhoseio.query("filterWebContent", query_params)
    output["extra"] = {
        "place": place,
        "country": country,
        "lat": lat,
        "lng": lng
    }
    print("LLLLLLLLLLLLLLL", len(output['posts']))
    f = open("./outs/" + place + "_" + country, "w")
    f.write(json.dumps(output))
    #print(json.dumps(output))
    #print(output['posts'][0]['text']) # Print the text of the first post
    #print output['posts'][0]['published'] # Print the text of the first post publication date

    # Get the next batch of posts
    #output = webhoseio.get_next()
    #print output['posts'][0]['thread']['site'] # Print the site of the first post

    time.sleep(10)
    f.close()
Code Example #2
def index(request):
    # Create your views here.
    webhoseio.config(token="8ebd85ed-94da-4ae1-9bd2-def0554ceb64")
    time_now = datetime.datetime.now()
    time_30_days_before = time_now - datetime.timedelta(days=30)
    ts_30_days_before = time_30_days_before.timestamp()
    query_params = {
        "q": "(site:bloomberg.com OR site:reuters.com) AND ('mapletree' OR 'capitaland' OR 'Keppel')",
        "ts": ts_30_days_before,
        "sort": "published"
    }
    output = webhoseio.query("filterWebContent", query_params)
    context = {'output': output}
    return render(request, 'news/index.html', context)

# def index(request):
#     # Create your views here.
#     webhoseio.config(token="8ebd85ed-94da-4ae1-9bd2-def0554ceb64")
#     query_params = {
#     "q": "(site:bloomberg.com OR site:reuters.com) AND ('mapletree' OR 'capitaland' OR 'Keppel' OR 'AIMS AMP Capital' OR 'Sabana REIT')",
#     "ts": "1516537944411",
#     "sort": "crawled"
#     }
#     output = webhoseio.query("filterWebContent", query_params)
#     return JsonResponse(output)
Code Example #3
def get_pages_into_json(domain, n=1):
    domain = domain
    num_pages = n

    webhoseio.config(token="a64af0cc-bb64-44dd-a56d-d1d1e06b287e")
    query_params = {
        "q": "language:english",
        "ts": "1512637551646",
        "sort": "crawled"
    }

    output = webhoseio.query("filterWebContent", query_params)

    newpath = file_path + '/{}'.format('20171214')

    if not os.path.exists(newpath):
        os.makedirs(newpath)

    with open(newpath + '/data_1.json', 'w') as outfile:
        json.dump(output, outfile)

    for p in range(2, num_pages + 1):
        output = webhoseio.get_next()
        with open(newpath + '/data_{}.json'.format(p), 'w') as outfile:
            json.dump(output, outfile)
Code Example #4
def query(start_time, end_time, keywords, entities):
    q_str = " OR ".join(keywords)
    #"published:>" + dt_to_posix(start_time) + " published:<" + dt_to_posix(end_time) + \
    q = " domain_rank:<10000 site_type:news language:english title:(" + q_str + ")" + \
        " site_category:(business OR jobs OR financial_news OR international_news OR internet_technology OR investing OR investors_and_patents OR law_government_and_politics OR legal_issues OR national_news OR finance OR stocks OR tech)"

    params = {
        "q":q,
        "format":"json",
        "ts": str(start_time)
    }

    output = webhoseio.query("filterWebContent", params)

    n = output['totalResults']
    print("TOTAL RESULTS: " + str(n))
    print("REQUESTS REMAINING: " + str(output['requestsLeft']))

    """
    if not os.path.isdir("data/articles/" + dirname):
        os.mkdir("data/articles/" + dirname)

    json.dump(output, open("data/articles/" + dirname + "/0.json", "w"))
    """

    articles = parse_and_update(entities, output)

    for i in range(1, ceil(n/100.0)):
        output = webhoseio.get_next()
        articles += parse_and_update(entities, output)
        #json.dump(output, open("data/articles/" + dirname + "/" + str(i) + ".json", "w"))

    return articles
Code Example #5
 def get_meta_info(self):
     if not self.meta_info:
         if self.uuid:
             resp = webhoseio.query('images',
                                    {'q': 'uuid:{}'.format(self.uuid)})
             self.meta_info = resp['imageDocs'][0]
     return self.meta_info
Code Example #6
def get_webhose_news(ticker):
    webhoseio.config(token="517a4e21-e484-4eac-aa8c-f50916e8db85")
    query_params = {
        "q":
        """language:english thread.country:US published:>1483232461 (site_category:stocks) (WFC OR wells fargo)
                (site_type:blogs OR site_type:discussions) (thread.title:'WFC' OR thread.title:'wells fargo')""",
        "ts": "1533754757303",
        "sort": "published"
    }
    output = webhoseio.query("filterWebContent", query_params)

    # Build the list of company-name variants first, so it exists before it is
    # used to filter sentences below.
    if len(company_name) >= 2:
        var1 = company_name[0].lower()
        var2 = company_name[1].lower()
        tick = ticker.lower()
        names = [var1, var2, tick]
    else:
        var1 = company_name[0].lower()
        tick = ticker.lower()
        names = [var1, tick]

    # Keep only sentences of the 21st post that mention one of the name variants.
    lst = [
        x for x in output['posts'][20]['text'].split('. ') for var in names
        if var in x.lower()
    ]
    barrons_news = [[date, text] for date, text in lst for var in names
                    if var in text]
Code Example #7
    def retrieve_topmost_article_new(self, stock_name, num_sentences):
        """
        Retrieves the topmost article about the stock, but solely through the usage of the webhose API. This
        does not involve checking the database for an already existing article for the stock.
        :param stock_name:    stock name.
        :param num_sentences: number of sentences to return for summary.
        :return:              a StockArticle object.
        """
        webhoseio.config(token=webhose_api_key)
        filters = {
            'language': 'english',
            'text': stock_name,
            'site_type': 'news',
            'site_category': 'finance',
            'thread.title': stock_name
        }
        query_result = webhoseio.query('filterWebContent', filters)
        stock_posts = query_result['posts']
        if len(stock_posts) == 0:
            return None
        article_url = stock_posts[0].get('url')
        article_text = stock_posts[0].get('text')
        article_summary = self.summarize_article(article_text, num_sentences)

        return StockArticle(stock_name, article_url, article_summary)
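
A minimal, hypothetical usage sketch for the method above: only the method and the StockArticle return type appear in the example, so the enclosing class name (here StockArticleRetriever) and the stock name are assumptions for illustration.

# Hypothetical usage; the class name and stock name are placeholders.
retriever = StockArticleRetriever()
article = retriever.retrieve_topmost_article_new("Tesla", num_sentences=3)
if article is None:
    print("No matching news posts were returned.")
else:
    print(article)  # a StockArticle object, per the docstring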
Code Example #8
    def get_results(self):
        # try parameters  domain_rank:<1000  , "ts":ts,
        # ts = self.calculate_timestamp()
        self.output = webhoseio.query("filterWebData", {"q": self.webhoseio_query, "latest": "true"})

        '''If fewer articles were retrieved than requested, lower the target count accordingly.'''
        if (len(self.output['posts']) < self.number_of_articles_to_extract):
            self.number_of_articles_to_extract = len(self.output['posts'])
Code Example #9
 def __init__(self):
     webhoseio.config(token='')
     self.params = {'q': 'site.type:news', 'from': 0}
     results = webhoseio.query('nseFilter', self.params)
     self.total_results = results['totalResults']
     self.page = 0
     self.batches = max(self.total_results // 10, 10)
     self.news_batch = results['docs']
Code Example #10
def get_results(query):
    config()
    final_output = []
    output = webhoseio.query("filterWebContent", {"q": query})
    for post in output['posts']:
        final_output.append({
            'title': post['title'],
            'text': post['text'][0:200],
            'url': post['url']
        })
    return final_output
Code Example #11
def getWebHoseData(location):
    webhoseio.config(token="b99dbdf5-caac-4a2c-886a-fb8f37f365a0")

    query_params = {
        "q": "performance_score:>7 location:" + location,
        "ts": "1506110156153",
        "sort": "crawled"
    }

    output = webhoseio.query("filterWebContent", query_params)
    totalWebHose = len(output['posts'])
    return totalWebHose
Code Example #12
def product_page(request, product_id, product_brand):
    if 'user_id' not in request.session:
        return redirect(reverse("userLG:login"))

    product_list = {}

    if 'product_id' not in request.session:
        print("product_id Initialized <<<<<<<-------")
        request.session['product_id'] = None

    if request.session['product_id'] != str(
            product_id) or 'product' not in request.session:
        request.session['product_id'] = product_id

        print("data from request <<<<<-------")
        query_params = {"q": "product_id: " + product_id + "", 'size': '1'}

        output = webhoseio.query("productFilter", query_params)
        product_list = {
            'product_name': output['products'][0]['name'],
            'product_brand': output['products'][0]['brand'],
            'product_price': output['products'][0]['price'],
            'product_image': output['products'][0]['images'][0],
            'product_description': output['products'][0]['description']
        }
        request.session['product'] = product_list
        # Get the next batch of products
        output = webhoseio.get_next()

        # changing the brand filter in the session
        request.session['productInfo']['brand'] = product_brand
        request.session.modified = True

        suggestion_list = sendingRequest(
            request,
            catergories=request.session['productInfo']['categories'],
            brand=product_brand,
            product_id=product_id)
        request.session["suggested_product"] = suggestion_list
    else:
        print("data from session <<<<<-------")
        product_list = request.session['product']
        suggestion_list = request.session["suggested_product"]

    itemsInCart = Cart.objects.all().count()

    return render(
        request, "ecommerce/productPage.html", {
            'product_list': product_list,
            'suggested_product': suggestion_list,
            'itemsInCart': itemsInCart
        })
Code Example #13
def sendingRequest(request,
                   brand='nike',
                   catergories="sport shirt",
                   price_range=50,
                   product_id=None):
    product_list = []

    if product_id is not None:
        print("There is a product Id <<<<<<----------")
        query_params = {
            "q": "name:(" + catergories + ") brand:" + brand + " ",
            'size': '5'
        }
    else:
        print(
            "Products more diverse, because, not requesting by product_id <<<<<<----------"
        )
        query_params = {
            "q": "name:(" + catergories + ") price: <" + str(price_range) + " brand:" + brand + " ",
            "size": "25"
        }

    try:
        output = webhoseio.query("productFilter", query_params)
    except IndexError:
        print("Not found <<<<<<<<<<----------")
        output = {}  # keep 'output' defined so the loop below is skipped and the "Items Not Found" branch responds

    for key, value in output.items():
        if key == 'products':
            for index in value:
                if len(index['images']) < 1:
                    continue
                else:
                    product_list.append({
                        'product_price': index['price'],
                        'product_image': index['images'][0],
                        'product_id': index['product_id'],
                        'product_brand': index['brand']
                    })

    # Get the next batch of products
    output = webhoseio.get_next()

    if len(product_list) < 1:

        return HttpResponse(
            "<h4 class='text-center text-white bg-dark p-3 mt-5 shadow'>Items Not Found!!</h4>"
        )
    return (product_list)
Code Example #14
 def request(self, category):
     #yesterday = datetime.date.today() - datetime.timedelta(days=1)
     #yesterday_string = yesterday.strftime("%s")
     #print(yesterday_string)
     q = "language:(english) thread.country:US performance_score:>4 (site_type:news) site_category:" + category
     #self.output = webhoseio.query("filterWebData", {"q":q, "sort":"performance_score", "latest":"true"})
     #"sort":"relevancy",
     self.output = webhoseio.query("filterWebData", {
         "q": q,
         "sort": "social.facebook.likes",
         "latest": "true"
     })
     return self.output
Code Example #15
File: webhose_search.py  Project: tsarboot/rango
def undependant():
    query = input('what are you searching for ?')
    threads = input('how many results you want ?')
    with open('tango_django/search.key','r') as f:
        key = f.readline().strip()  # drop the trailing newline from the key file
    try:
        webhoseio.config(token=key)
        results = webhoseio.query("filterWebContent",{"q":query})
        for i in range(int(threads)):  # 'threads' is read as a string, so convert it to an int
            print(results['posts'][i]['text'])
        # for post in results['posts'][:10]['text']:
        #     count += 1
        #     print(f'result number {count} \n {post}')
    except KeyError as err:
        print(f'ooopsie :{err}')
Code Example #16
def get_headlines(search_term, site):
	query_params = {
		"q": search_term + " site:" + site + ".com language:english",
		"sort": "published"
	    }
	output = webhoseio.query("filterWebContent", query_params)
	print('[-] creating ' + site + '_output.txt')
	file = open(site + '_output.txt','w') 
	try:
		for x in range(100):
			file.write(output['posts'][x]['text'])
	except IndexError:
		print('[-] Warning: less than 100 results')
	file.close()
	print('[+] operation complete')
Code Example #17
def find_related_articles(df):
    n_keywords = 5
    articles = []
    while len(articles) < 10:
        query = _build_query(
            df.content.iloc[:n_keywords]) + " language:english"
        query_params = {"q": query, "ts": "1580463415153", "sort": "relevancy"}
        search_results = webhoseio.query("filterWebContent", query_params)
        # search_results = newsapi.get_everything(q=query, language='en', sort_by='relevancy')
        print(len(search_results['posts']))
        for article in search_results['posts']:
            if len(articles) < 10:
                articles.append(article)

        n_keywords -= 1

    return articles
Code Example #18
def webhoseio_search(query):

    key = read_webhoseio_key()
    results = []

    webhoseio.config(token=key)
    query_params = {'q': query + ' language:english', 'sort': 'relevancy'}
    output = webhoseio.query('filterWebContent', query_params)

    for result in output['posts']:
        results.append({
            'name': result['title'],
            'url': result['url'],
            'summary': result['published']
        })

    return results[:10]
Code Example #19
def get_webhose_news(ticker):
    webhoseio.config(token="517a4e21-e484-4eac-aa8c-f50916e8db85")
    names = get_names(ticker)
    query_params = {
        "q": f"""language:english thread.country:US published:>1483232461 (site_category:stocks) ({ticker} OR {names[0]} OR {names[-1]})
                (site_type:blogs OR site_type:discussions) (thread.title:'{ticker}' OR thread.title:'{names[0]}' OR thread.title:'{names[-1]}')""",
        "ts": "1533754757303",
        "sort": "published"
        }
    output = [x for x in webhoseio.query("filterWebContent", query_params)['posts']]
    lst = [[y['published'],
            y['text'].replace('\n',' ').lower().split('. ')] for y in output]
    webhose_new = [[datetime.strptime(date.split('T')[0],'%Y-%m-%d').date(),
                    re.sub('// no comments|posted by','',text)] 
                    for date,y in lst for text in y if len(text) < 200
                    for var in names if var in text]
    return webhose_new
Code Example #20
def getContent(query_params):
    output = webhoseio.query("filterWebContent", query_params)
    print(output)
    with open("./webhose_results.json", 'w') as outfile:
        json.dump(output, outfile, sort_keys=True)

    insertToDB(output["posts"])
    ReqNumber = 1
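    # Page through up to four more batches with get_next(); the counter below caps extra requests at ReqNumber >= 5.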
    while (output["moreResultsAvailable"]):
        output = webhoseio.get_next()
        # do something for subsequent query results
        with open("./webhose_results_" + str(ReqNumber) + ".json",
                  'w') as outfile:
            json.dump(output, outfile, sort_keys=True)
        insertToDB(output["posts"])
        ReqNumber = ReqNumber + 1
        if (ReqNumber >= 5):
            break
Code Example #21
File: NewsSearch.py  Project: ddx-510/Splash-2018
def main():
    global output
    qn = input('What do you want to ask?')

    tokens = word_tokenize(qn)
    Tokens = []
    for token in tokens:
        if token.lower() not in sw:
            Tokens.append(token)
    qnF = ' '.join(Tokens)

    typeSort()

    query_params = {
        "q": qnF + " language:english site_type:"+ sorttype,
        "ts": "1526543100240",
        "sort": "crawled"
    }

    output = webhoseio.query("filterWebContent", query_params)

    firstPost = []
    if sorttype == "blogs":
        for h in output['posts']:
            if curse in h['text']:  # skip posts whose text contains the banned word
                continue
            else:
                firstPost.append(h)
                printArticle()
                break
    else:
        printArticle()
        output = webhoseio.get_next()

    again = input("Do you want to hear about something else?")
    for x in again.split():
        if x.lower() in agree:
            main()
        else:
            print("Good day! See you!")
            break
Code Example #22
 def __next__(self):
     if len(self.news_batch) == 0 and self.page == self.batches:
         raise StopIteration
     if len(self.news_batch) == 0:
         self.page += 1
         self.params['from'] = self.page * 10
         results = webhoseio.query('nseFilter', self.params)
         self.news_batch = results['docs']
     news_instance = self.news_batch.pop()
     article_raw = news_instance['article']
     site_raw = news_instance['site']
     category = 'uncategorized'  # default in case no category below matches
     for i in range(len(article_raw['categories'])):
         category_name = article_raw['categories'][i]['name'].lower()
         if 'politics' in category_name or 'election' in category_name:
             category = 'politics'
             break
         elif 'business' in category_name or 'economics' in category_name or 'money' in category_name:
             category = 'business'
             break
         elif 'entertainment' in category_name or 'culture' in category_name or 'movies' in category_name or 'games' in category_name:
             category = 'entertaiment'
             break
         elif 'health' in category_name or 'covid' in category_name or 'medicine' in category_name:
             category = 'health'
             break
         elif 'technology' in category_name or 'science' in category_name or 'electronics' in category_name:
             category = 'technology'
             break
         else:
             category = 'uncategorized'
     article = Article(article_raw['text'],
                       category=category,
                       source=site_raw['name'],
                       author=article_raw['author'],
                       title=article_raw['title'],
                       url=article_raw['url'],
                       urlToImage=article_raw['media']['main_image'],
                       publishedAt=article_raw['published'],
                       viewCount=article_raw['social']['facebook']['likes'])
     article.set_summary(article_raw['summary'])
     return article
Code Example #23
    def extract_data_from_webhose():
        webhoseio.config(token="7ad89131-980e-48c3-b588-e68adb7c1be0")
        s = int(time.time()) - 500
        query_params = {"q": "language:english site:amazon.com site_category:shopping spam_score:>0.7", "ts": "{}".format(s), "sort": "crawled"}
        output = webhoseio.query("filterWebContent", query_params)
        logging.info(output)
        key = []
        reviewname = []
        productname = []
        reviewdate = []
        rating = []
        label = []
        sourcetype = []
        runtype = []
        spam_score = []
        text = []
        for i in range(0,1):
            logging.info(i)
            logging.info(output)
            key.append(i)
            reviewname.append(output['posts'][i]['author'])
            productname.append(output['posts'][i]['thread']['title'])
            reviewdate.append(output['posts'][i]['thread']['published'])
            rating.append(output['posts'][i]['thread']['rating'])
            tt = output['posts'][i]['text']
            text.append(tt)
            ss = output['posts'][i]['thread']['spam_score']
            spam_score.append(ss)
        df= pd.DataFrame()
        df['key'] = key
        df['reviewname'] = reviewname
        df['productname'] = productname
        df['reviewdate'] = reviewdate
        df['rating'] = rating
        df['label'] = 'fake'
        df['sourcetype'] = 'amazon'
        df['runtype'] = 'near_real_time'
        df['text'] = text  
        df['snapshot_time'] = s
		
        df.to_gbq('webhoseDB.staging_table', 'wehosestream', if_exists='append', verbose=False)
Code Example #24
def webhose_func():
    YOUR_API_KEY = "a161f6e5-ab51-40a1-afaf-ba13e67baefa"
    webhoseio.config(token=YOUR_API_KEY)
    print("\n")
    print("WELCOME TO WEBHOSE\n")
    search = input(
        "Input the string that you want to search for! It can be something like ipod OR ipad\nType in a list of strings like cow,chicken,pig to plot sentiment for those words against the stock price.\n3 TERMS ARE ENOUGH!\n"
    )
    search_terms = search.split(",")
    search_df_arr = []
    for search in search_terms:
        search += " language:english"
        sort = input(
            "\nType crawled, relevancy, rating or published for your sorting option\n"
        )
        timestamp = 1541348859918
        size = input(
            "\nWhat is the number of posts returned per request? 1 is the smallest and 100 is the biggest!\n"
        )
        query_params = {
            "accuracy_confidence": "high",
            "q": search,
            "sort": sort,
            "ts": timestamp,
            "size": size,
        }
        output = webhoseio.query("filterWebContent", query_params)
        number_of_posts = len(output['posts'])
        dates = []

        for a in range(number_of_posts):
            dates.append(output['posts'][a]['published'])

        df = pd.DataFrame(index=dates, columns=["Title"])
        for i in range(number_of_posts):
            df.iloc[i] = [output['posts'][i]['title']]
        search_df_arr.append(df)

    search_df_arr = search_df_arr + search_terms
    return search_df_arr
Code Example #25
def main():
    webhoseio.config(token="XXXXXXXXXXXXXXXXX"
                     )  # needs to be substituted by real webhoseio token
    query_params = {
        "q":
        "language:english has_video:false is_first:true site_type:news site:(cnn.com OR bbc.com OR reuters.com OR nbcnews.com OR foxnews.com OR washingtonpost.com OR espn.com OR tmz.com OR sportingnews.com OR usnews.com OR wsj.com OR latimes.com OR time.com OR nydailynews.com OR economist.com OR technewsworld.com OR computerworld.com OR newsmax.com OR theatlantic.com OR hollywoodreporter.com) spam_score:0.0",
        "ts": "1510212713819",
        "sort": "crawled"
    }
    #get 1st set of articles
    output = webhoseio.query("filterWebContent", query_params)

    fl_counter = 1
    while fl_counter <= 1000:
        fl_name = "file" + "_" + str(fl_counter)
        opfile = open('C:/Users/Heena/News3/' + fl_name, 'w',
                      encoding='utf-8')  #specify path to corpus folder here
        for post in output['posts']:
            uuid = post['uuid']
            url = post['url']
            site_full = post['thread']['site_full']
            site_categories = post['thread']['site_categories']
            title_full = post['thread']['title_full']
            title = post['title']
            published = post['published']
            author = post['author']
            text = post['text']

            doc = document(uuid, url, site_full, site_categories, title,
                           title_full, published, author, text)
            jsondata = json.dumps(doc.__dict__, sort_keys=True)
            opfile.write(jsondata + '\n')

        opfile.close()
        time.sleep(30)
        print("fl_counter = ", fl_counter)
        output = webhoseio.get_next()
        print("next = ", output['next'])
        fl_counter += 1
Code Example #26
File: api_to_df.py  Project: ypk22/NewsPhi
def api_df(token, site_lists, time_delta, filename):
    """
    A pipeline from Webhose API to CSV.

    :param token:
        api token for Webhose API.
    :param site_lists:
        list of sites we need to crawl.
    :param time_delta:
        time window. Ex: -3 means the most recent 3 days. Can only be from -1 to -30.
    :param filename:
        filename of CSV.
    :return:
        None
    """
    webhoseio.config(token=token)
    query_params = get_query(site_lists, time_delta)
    output_init = webhoseio.query("filterWebContent", query_params)
    output_flat = pd.io.json.json_normalize(output_init['posts'])
    df = output_flat[[
        'thread.uuid', 'author', 'external_links', 'published', 'text',
        'thread.site_full', 'thread.site_categories', 'thread.site_section',
        'thread.section_title', 'thread.main_image',
        'thread.social.facebook.comments', 'thread.social.facebook.likes',
        'thread.social.facebook.shares', 'title', 'url'
    ]]
    output = webhoseio.get_next()
    while len(output['posts']) > 0:
        df = output_to_df(output, df)
        try:
            output = webhoseio.get_next()
        except HTTPError:
            return df
            # df.to_csv(filename, index=False)
        if len(df) % 1000 == 0:
            print(str(len(df)) + ' has finished')
    return df
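
A hypothetical call to api_df, included only to illustrate the parameters documented in the docstring; the token, site list, and filename are placeholders, and get_query / output_to_df are helpers defined elsewhere in the same project.

# Hypothetical usage; every argument value is a placeholder.
df = api_df(token="YOUR_WEBHOSE_TOKEN",
            site_lists=["nytimes.com", "bbc.com"],
            time_delta=-3,
            filename="news.csv")
df.to_csv("news.csv", index=False)  # api_df itself leaves the CSV write commented out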
Code Example #27
def related_news(keywords):
    """
    search for related news by keywords
    use Webhose.io API
    """

    if len(keywords) >= 4:
        keywords = keywords[0:3]

    keyword_str = " ".join(keywords)

    #API key
    webhoseio.config(token="0e3f95f5-2fc7-494f-881e-e29915cc3e9a")
    query_params = {
        "q": keyword_str + " language:english site_type:news",
        "ts": "1528948373304",
        "sort": "relevancy"
    }

    resp = webhoseio.query("filterWebContent", query_params)
    posts = resp['posts']

    if len(posts) < 2:
        return None, None, True

    MAX_ARTICLES = 5  # take first 5

    related_articles = []
    related_urls = []

    for i in range(min(MAX_ARTICLES, len(posts))):
        post = posts[i]['thread']
        related_url = {'url': post['url'], 'title': post['title']}
        related_urls.append(related_url)
        related_articles.append(post['site_full'])  # currently redirected link

    return related_articles, related_urls, False
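
A short, hypothetical usage sketch for related_news; the keyword list is a placeholder standing in for whatever keyword extraction feeds this function upstream.

# Hypothetical usage; the keywords are placeholders.
related_articles, related_urls, failed = related_news(["electric", "vehicles", "battery"])
if failed:
    print("Fewer than two related posts were found.")
else:
    for link in related_urls:
        print(link['title'], link['url'])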
Code Example #28
def scrape(query, category, start_time_str, time_diff):
    print('Start scraping data from ' + start_time_str)

    query_params = {"q": query, "sort": "crawled"}

    news_list = []

    while True:
        output = webhoseio.query("filterWebContent", query_params)
        news_list = news_list + output['posts']
        output = webhoseio.get_next()

        if len(news_list) > output['totalResults'] or len(news_list) == 0:
            break

    filename = (DATA_PATH + 'News_{0}_'.format(category) +
                str(datetime.datetime.utcnow() + time_diff).replace(
                    ' ', '_').replace(':', '_') + '.json')

    with open(filename, 'w') as outfile:
        json.dump(news_list, outfile)

    print('Persisted News Article at the following location: ' + filename)
    print('{0} news articles were collected.'.format(len(news_list)))
Code Example #29
File: models.py  Project: pushpreet/mediatracker
    def update(self):
        crawledFrom = self.last_updated.timestamp()
        if abs(self.last_updated - self.last_modified) < timedelta(seconds=1):
            crawledFrom = (timezone.now() - timedelta(days=3)).timestamp()
        crawledFrom = int(crawledFrom*1000)
        
        webhoseio.config(token='e187b1d6-59c5-4b3b-9614-1c42b3e3658e')
        output = webhoseio.query(
            "filterWebContent", 
            {
                "q": self.query,
                "ts": crawledFrom,
                "language": "english",
                "site_type": "news",
            })
        
        output = output['posts']
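        # Keep calling get_next() until the API reports no further results.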
        while True:
            temp = webhoseio.get_next()
            output += temp['posts']
            if temp['moreResultsAvailable'] <= 0:
                break

        previous_posts_uuid = []
        previous_posts_title = []
        
        if len(output) > 0:
            previous_posts_uuid = [post.uuid for post in Post.objects.all()]
            previous_posts_title = [post.title.lower() for post in Post.objects.all()]

        for post in output:
            if post['thread']['uuid'] in previous_posts_uuid:
                old_post = Post.objects.get(uuid = post['thread']['uuid'])
                if self not in old_post.trackers.all():
                    old_post.trackers.add(self)
            
            elif post['thread']['title'].lower() in previous_posts_title:
                old_post = Post.objects.get(title__iexact = post['thread']['title'])
                if self not in old_post.trackers.all():
                    old_post.trackers.add(self)

            else:
                try:
                    new_post = Post(
                        uuid = post['thread']['uuid'],
                        url = post['thread']['url'],
                        site_full = post['thread']['site_full'],
                        site_categories = post['thread']['site_categories'],
                        title = post['thread']['title'][:1024],
                        published = post['thread']['published'],
                        site_type = post['thread']['site_type'],
                        country = post['thread']['country'],
                        main_image = post['thread']['main_image'],
                        performance_score = post['thread']['performance_score'],
                        domain_rank = post['thread']['domain_rank'],
                        author = post['author'],
                        text = post['text'],
                        language = post['language'],
                        entities = post['entities'],
                        social = post['thread']['social'],
                    )

                    new_post.save()
                    new_post.trackers.add(self)
                    
                    previous_posts_uuid.append(post['thread']['uuid'])
                    previous_posts_title.append(post['thread']['title'].lower())
                
                except DataError as err:
                    print("Error: %s"%(err))
                    print(post)

        self.last_updated = timezone.now()
        self.save()
        
        return True
Code Example #30
# List with urls for the diffbot API
urlList = []

if action == 'Y':
    # configuring webhose request
    webhoseio.config(token="4057ff96-3ff1-4982-8c99-41d708f980ef")
    # query = "politics language:english thread.country:GB performance_score:>5"
    query = "Climate Change"
    query_params = {
        "q": "Climate Change",
        "ts": "1518278227788",
        "sort": "crawled"
    }

    output = webhoseio.query("filterWebContent", query_params)

    # getting the urls of the websites that matched our query/params
    # saving the urls to a file for verification
    outputFilename = input("Enter the name of the file which will contain the webhose urls: ")
    with open(outputFilename, 'w') as urlsOut:
        urlsOut.write("Query used: "+query+"\n\n")
        j = 0
        while output['posts']:
            i = 0
            for var in output['posts']:
                urlsOut.write(str(j)+".\n"+output['posts'][i]['url']+"\n")
                urlList.append(output['posts'][i]['url'])
                i += 1
                j += 1
            output = webhoseio.get_next()
Code Example #31
#! /usr/bin/env python3

#from tinydb import TinyDB, Query
import json
import webhoseio

webhoseio.config(token='11a5bf53-12f6-440d-a84f-e42c18c7c38d')
output = webhoseio.query("filterWebContent", {
    "q": "Global Warming",
    "sort": "relevancy"
})
print("URL: " + output['posts'][0]['url'])  # Print the text of the first post
print("title: " +
      output['posts'][0]['title'])  # Print the text of the first post
print("published: " + output['posts'][0]['published']
      )  # Print the text of the first post publication date

output = webhoseio.get_next()
print("URL: " + output['posts'][0]['url'])  # Print the text of the first post
print("title: " +
      output['posts'][0]['title'])  # Print the text of the first post
print("published: " + output['posts'][0]['published']
      )  # Print the text of the first post publication date

# try:
# 	response = urlopen(request)
# 	data = response.read()
# 	parsed_json = json.loads(data)
# except URLError, e:
#    print 'API call not working. Got an error code:', e
# else: