def connect_to_er(api_key, max_retries=3):
    """Establish a connection to Event Registry."""
    if api_key is not None:
        er = ER.EventRegistry(apiKey=api_key, repeatFailedRequestCount=max_retries)
    else:
        er = ER.EventRegistry(repeatFailedRequestCount=max_retries)
    return er
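# A minimal usage sketch for connect_to_er; assumes `import eventregistry as ER`
# (the alias used throughout these snippets) and a hypothetical placeholder key:
import eventregistry as ER

er = connect_to_er(api_key="YOUR_API_KEY", max_retries=5)
analytics = ER.Analytics(er)  # reuse the single connection for analytics calls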
def DMOZ(results):
    """Map prediction texts to DMOZ category labels via Event Registry."""
    final_Dmoz = {}
    t0 = time.time()
    # Create the client once instead of once per prediction.
    er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
    analytics = ER.Analytics(er)

    def categorize_into(text, out):
        """Append the third path segment of the top returned label to `out`."""
        cat = analytics.categorize(text)
        try:
            categories = cat.get('categories', [])
            if categories:
                label = categories[0].get('label')
                if label:
                    out.append(label.split('/')[2])
        except Exception:
            # Ignore malformed or empty categorization responses.
            pass

    for key, value in results.items():
        dmozResults = []
        for j in value:
            if isinstance(j, list):
                for predictions in j:
                    categorize_into(predictions[1], dmozResults)
            else:
                categorize_into(j[1], dmozResults)
        final_Dmoz.setdefault(key, []).append(dmozResults)
    print("### Executed time:", round(time.time() - t0, 3), "s ###")
    return final_Dmoz
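# Illustration of the response shape the parsing above assumes: a 'categories'
# list whose first entry carries a slash-separated 'label', of which the code
# keeps the third path segment. The sample label is hypothetical, not a live
# API response.
sample_response = {'categories': [{'label': 'dmoz/Computers/Internet/Web Design'}]}
label = sample_response['categories'][0]['label']
print(label.split('/')[2])  # -> 'Internet'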
def connect_to_service(self):
    """Establish connection with EventRegistry service"""
    if self._access_token is not None:
        self._er = ER.EventRegistry(self._access_token)
    else:
        raise Exception("[ERROR] No access_token has been specified")
    return self._er
def __init__(self, max_repeat_request=-1):
    """Initializes the Event Registry collector.

    Args:
        max_repeat_request (int): The maximum number of requests that can be
            repeated if something goes wrong. If -1, repeat indefinitely
            (Default: -1).
    """
    # initialize the event registry instance
    self._er = ER.EventRegistry(
        apiKey=API_KEY, repeatFailedRequestCount=max_repeat_request)
    self.MAX_EVENT_REQUESTS = 50
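# A minimal usage sketch; `EventRegistryCollector` is a hypothetical name for
# the class this __init__ belongs to, and API_KEY must be defined at module level:
# collector = EventRegistryCollector(max_repeat_request=5)
# collector.MAX_EVENT_REQUESTS  # -> 50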
def Dmoz(pred):
    """Categorize predictions into DMOZ labels; log results and timings to CSV."""
    final_Dmoz = {}
    timestamps = []
    counter = 1
    t0 = time.time()
    # Create the client once instead of once per prediction.
    er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
    analytics = ER.Analytics(er)
    try:
        for key, value in pred.items():
            start_time = time.time()
            dmozResults = []
            for j in value:
                for k, v in j.items():
                    cat = analytics.categorize(v[0])
                    try:
                        categories = cat.get('categories', [])
                        if categories:
                            label = categories[0].get('label')
                            if label:
                                dmozResults.append(label.split('/')[2])
                    except Exception:
                        # Ignore malformed or empty categorization responses.
                        pass
            timestamps.append((counter, time.time() - start_time))
            counter += 1
            with open('/data/s1931628/latinumbigDatafile.csv', 'a') as file:
                csv_writer = csv.writer(file)
                csv_writer.writerow((key, dmozResults))
            # if key in final_Dmoz:
            #     final_Dmoz[key].append(dmozResults)
            # else:
            #     final_Dmoz[key] = dmozResults
    except Exception:
        pass
    with open('latinumtimeOneParseDmoz.csv', 'a') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(timestamps)
    with open('latinumtimeOneChunkParseDmozOnly.csv', 'a') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow([time.time() - t0])
    print("### Executed time:", round(time.time() - t0, 3), "s ###")
def creating_dataframe(self, dictionary):
    final_words = []
    final_words1 = []
    l = []
    z = []
    docs = {}
    keys = dictionary.keys()
    # Collect the 8-digit user ids found in the dictionary keys.
    for key in keys:
        kk = str(key)
        k = re.findall(r'\d{8}', kk)
        l.append(k)
    for i in l:
        for j in i:
            z.append(j)
    for key in z:
        # if key == '19234329':
        print("###################### Generating topic labels for {} ############################".format(key))
        df = pd.DataFrame(dictionary[key])
        df.columns = ['Text']
        df_ = df['Text'].apply(lambda x: ''.join(x))
        df_ = df_.str.lower()
        df_ = df_.apply(self.tokenize)
        df_ = df_.apply(self.replace)
        df_ = df_.apply(self.split)
        df_ = df_.apply(self.terms_only)
        df_ = df_.apply(lambda x: ' '.join(x))
        df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))
        for i in df_:
            final_words.append("".join(i).strip().split())
        for i in final_words:
            if len(i) >= 5:
                final_words1.append(i)
        for i in final_words1:
            self.userTweets.append(re.sub(r' +', " ", ' '.join(i)))
        if key in docs:
            docs[key].append(self.userTweets)
        else:
            docs[key] = self.userTweets
        print(key, ":", self.userTweets)
        currentWordsByUser = []
        for i in range(len(self.userTweets)):
            tweetWords = self.userTweets[i].strip("'")
            tweetWords = tweetWords.strip('"')
            tweetWords = tweetWords.strip(",")
            currentWordsByUser.append(list(set(str(tweetWords).split())))
        uniqueWordsByUser = list(
            set(itertools.chain.from_iterable(currentWordsByUser)))
        print("uniqueWordsByUser:", uniqueWordsByUser)
        print("len(uniqueWordsByUser):", len(uniqueWordsByUser))
        # append all unique words from each user to the global word vector
        self.allWordsFromUsers.append(uniqueWordsByUser)
        ###
        mm = Models(50, 10, **docs)  # 50, 10
        terms_to_wiki = mm.calling_methods('LDA')
        ll = Labels(terms_to_wiki)
        wiki_titles = ll.get_titles_wiki()
        equal_length = ll.remove_all_null_dicts_returned_from_wiki(**wiki_titles)
        frq = ll.calculating_word_frequency(**equal_length)
        # print(equal_length)
        # print("------")
        # print(frq)
        results = ll.predicting_label(**frq)
        labels = []
        # Create the client once per user instead of once per result.
        er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
        analytics = ER.Analytics(er)
        for i in range(len(results)):
            cat = analytics.categorize(results[i][1])
            categories = cat.get('categories', [])
            if categories:
                label = categories[0].get('label')
                if label:
                    labels.append(label.split('/')[2])
        self.userTopicLabels.append(labels)
    print('########### FINAL FILE EXECUTED ##################')
    self.allWordsFromUsersJoined = list(
        itertools.chain.from_iterable(self.allWordsFromUsers))  # joined
    self.noneDuplicateWordsUsedFromAllUsers = list(set(self.allWordsFromUsersJoined))
    self.allUsersIndexing()
    self.savePreprocessedData()
import json
import eventregistry as ER

er = ER.EventRegistry('569a0bbd-eb92-4249-9434-c401f4d2c4cc')
analytics = ER.Analytics(er)

sums = []  # renamed from `sum` to avoid shadowing the built-in
for i in range(30):
    with open('JsonFile{}.json'.format(i), 'r') as fp:
        jsonObj = json.load(fp)
    print(jsonObj)
    # Keep only the articles whose title mentions Google.
    newList = [article for article in jsonObj['articles']
               if 'Google' in article['title']]
    print(newList)
    sum1 = 0
    for article in newList:
        text = article['description']
        print(text)
        if text is not None:
            sentiment = analytics.sentiment(text=text)
            sum1 += sentiment['avgSent']
    sums.append(sum1)
def creating_dataframe(self, dictionary):
    final_words = []
    final_words1 = []
    documents = []
    l = []
    z = []
    docs = {}
    keys = dictionary.keys()
    # Collect the 8-digit user ids found in the dictionary keys.
    for key in keys:
        kk = str(key)
        k = re.findall(r'\d{8}', kk)
        l.append(k)
    for i in l:
        for j in i:
            z.append(j)
    for key in z:
        # if key == '19234329':
        print("###################### Generating topic labels for {} ############################".format(key))
        df = pd.DataFrame(dictionary[key])
        df.columns = ['Text']
        df_ = df['Text'].apply(lambda x: ''.join(x))
        df_ = df_.str.lower()
        df_ = df_.apply(self.tokenize)
        df_ = df_.apply(self.replace)
        df_ = df_.apply(self.split)
        df_ = df_.apply(self.terms_only)
        df_ = df_.apply(lambda x: ' '.join(x))
        df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))
        for i in df_:
            final_words.append("".join(i).strip().split())
        for i in final_words:
            if len(i) >= 5:
                final_words1.append(i)
        for i in final_words1:
            documents.append(re.sub(r' +', " ", ' '.join(i)))
        if key in docs:
            docs[key].append(documents)
        else:
            docs[key] = documents
        mm = Models(50, 10, **docs)
        terms_to_wiki = mm.calling_methods('LDA')
        ll = Labels(terms_to_wiki)
        wiki_titles = ll.get_titles_wiki()
        equal_length = ll.remove_all_null_dicts_returned_from_wiki(**wiki_titles)
        frq = ll.calculating_word_frequency(**equal_length)
        results = ll.predicting_label(**frq)
        labels = []
        # Create the client once per user instead of once per result.
        er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
        analytics = ER.Analytics(er)
        for i in range(len(results)):
            cat = analytics.categorize(results[i][1])
            categories = cat.get('categories', [])
            if categories:
                label = categories[0].get('label')
                if label:
                    labels.append(label.split('/')[2])
        print('\n')
        print(key, labels)
    print('########### FINAL FILE EXECUTED ##################')
import eventregistry as evr
import asyncio
import json
from watson_developer_cloud import ToneAnalyzerV3
from flask import Flask, render_template, send_from_directory
from flask_sockets import Sockets
import threading
import time

tone_analyzer = ToneAnalyzerV3(
    username='******',
    password='******',
    version='2016-05-19')
er = evr.EventRegistry(apiKey='3a705c62-c9ae-4c4f-9b94-a0963352b8b3')


def searchTopic(topic):
    async def search(websocket):
        evq = evr.QueryEventsIter(conceptUri=er.getConceptUri(topic))
        evq.addRequestedResult(evr.RequestEventsInfo(sortBy='date'))
        for event in evq.execQuery(er):
            evUri = event['uri']
            if event['location'] is None:
                continue
            # Join label and country into one string (was a tuple by mistake).
            location = (event['location']['label']['eng'] + ', ' +
                        event['location']['country']['label']['eng'])
            articles = 0
            article_content = []
            arq = evr.QueryEventArticlesIter(evUri)
            avg_sentiments = {}
            for article in arq.execQuery(er):
                analysis = tone_analyzer.tone(article['body'], 'emotion')
import datetime
import eventregistry

from .ArticleProto_pb2 import ArticleDetail, ArticleList

er = eventregistry.EventRegistry(apiKey='9a66d7d3-b8e3-4fc0-ab52-ed70d71fb121')

source_uri_dict = {
    "National Geographic": "news.nationalgeographic.com",
    "Nature": "nature.com",
    "The Economist": "economist.com",
    "TIME": "time.com",
    "The New York Times": "nytimes.com",
    "Bloomberg Business": "bloomberg.com",
    "CNN": "edition.cnn.com",
    "Fox News": "foxnews.com",
    "Forbes": "forbes.com",
    "Washington Post": "washingtonpost.com",
    "The Guardian": "theguardian.com",
    "The Times": "thetimes.co.uk",
    "Mail Online": "dailymail.co.uk",
    "BBC": "bbc.com",
    "PEOPLE": "people.com",
}


def get_source_uri(source_title):
    if source_title in source_uri_dict:
        return source_uri_dict[source_title]
    else:
        return None  # assumed fallback; the original snippet is truncated here
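# A minimal usage sketch of the title-to-URI mapping above:
print(get_source_uri("BBC"))      # -> "bbc.com"
print(get_source_uri("Unknown"))  # -> None (assumed fallback)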
import eventregistry as ER
import datetime
import pandas as pd
import time
from eventregistry import *

er = ER.EventRegistry(apiKey="5ba73408-ea81-459b-abf4-6fedd8cb8ec6")  # dany
# er = ER.EventRegistry(apiKey="5fed3642-762a-4abc-aabf-ac6213c1bcea")  # philipp
# er = ER.EventRegistry(apiKey="7571801b-6710-4166-90cc-9c5352ddeedd")  # andi
# er = ER.EventRegistry(apiKey="1b673182-c9e4-4554-90cf-d082a0bd6b53")  # Hendrik?
analytics = ER.Analytics(er)

# DEFINE companies
companies = ['Samsung', 'BASF', 'Apple', 'Tesla', 'Airbus', 'Bayer',
             'BMW', 'Telefonica', 'Google', 'Allianz', 'Total']

# DEFINE start and end date
startDate = datetime.date(2018, 7, 18)
endDate = datetime.date(2018, 7, 18)

# Get all business days in the period
time_frame = pd.bdate_range(startDate, endDate)

# Set maximum number of articles per day
number_of_articles = 50

# DEFINE df results columns
result = dict()

for company in companies:
    print("- Starting article processing for company :", company)
    # Dictionary
def __init__(self, eventregistry_keys):
    self.KEY = eventregistry_keys["KEY"]
    self.api = evr.EventRegistry(apiKey=self.KEY)
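# A minimal usage sketch; `NewsClient` is a hypothetical name for the class this
# __init__ belongs to, and assumes `import eventregistry as evr`:
# client = NewsClient({"KEY": "YOUR_API_KEY"})
# client.api  # a ready-to-use eventregistry.EventRegistry instance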
# -*- coding: utf-8 -*-
from flask import Flask, request

app = Flask(__name__)

import eventregistry
er = eventregistry.EventRegistry(apiKey="23760d8a-beec-49ae-be16-250ff16e2e1f")

# from flask import Flask, request, jsonify
import json
from eventregistry import *


@app.route('/', methods=['GET'])
def search():
    if request.method == 'GET':
        key = request.args.get("keyword")
        lang0 = request.args.get("language")
        typ = request.args.get("type")
        if typ == "event":
            q = QueryEvents(lang=lang0, keywords=key
                            # sourceLocationUri != None,
                            # dataType = ["news"]
                            )
            q.setRequestedResult(
                RequestEventsInfo(returnInfo=ReturnInfo(
                    # articleInfo = ArticleInfoFlags(location = True),
                    locationInfo=LocationInfoFlags(geoLocation=True))))
            res = er.execQuery(q)
            # datalist = [[]*3]*100
def authenticate():
    '''authenticate event registry session'''
    with open(ER_KEY, 'r') as key:
        auth_key = key.read().splitlines()[0]
    return er.EventRegistry(auth_key)
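# A minimal usage sketch; assumes ER_KEY names a file whose first line holds the
# API key, and that the module was imported as `import eventregistry as er`:
# session = authenticate()
# analytics = er.Analytics(session)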
def fetch_event_articles(api_key, min_articles=500, force=False,
                         save_on_api_fail=True, csv_file=None):
    event_registry = er.EventRegistry(apiKey=api_key, repeatFailedRequestCount=2)

    # Single query to collect event ids
    all_events_gzip_file = op.join('csv', 'events_min%d.csv' % min_articles) + '.gz'
    if not force and op.exists(all_events_gzip_file):
        df_events = pd.read_csv(all_events_gzip_file, compression='gzip')
    else:
        event_data = []
        qei = er.QueryEventsIter(lang='eng',
                                 minArticlesInEvent=min_articles,
                                 maxArticlesInEvent=min_articles * 10)
        for event in qei.execQuery(event_registry, maxItems=1001):
            event_data.append(event)
        df_events = pd.DataFrame(event_data)
        df_events.to_csv(all_events_gzip_file, encoding='utf-8', compression='gzip')
        del event_data

    # Uncache csv file.
    if not force and csv_file and op.exists(csv_file):
        print("Loading articles from disk...")
        df_articles = pd.read_csv(csv_file)
    else:
        event_uris = df_events.uri.tolist()
        event_uris = [ev for ev in event_uris if ev[:3] == 'eng']
        print("Downloading articles for %d events..." % len(event_uris))

        # Loop to retrieve all articles for an event.
        return_info = er.ReturnInfo(articleInfo=er.ArticleInfoFlags(
            bodyLen=-1, concepts=True, categories=True, originalArticle=True))
        all_articles = []
        api_failed = False
        for uri in event_uris:
            print("current uri: ", uri)
            current_event_data = []
            event_gzip_file = op.join('csv', 'event-%s.csv.gz' % uri)
            if not force and op.exists(event_gzip_file):
                tmp_df = pd.read_csv(event_gzip_file, compression='gzip')
            elif api_failed:
                print("\tSkipping; API failed.")
                continue  # do not reuse the previous event's dataframe
            else:
                try:
                    query_iter = er.QueryEventArticlesIter(uri)
                    for article in query_iter.execQuery(
                            event_registry, lang="eng", returnInfo=return_info):
                        current_event_data.append(article)
                except TypeError:  # This is how API errors come through.
                    if save_on_api_fail:
                        print("\tWARNING: API failed. Skipping.")
                        api_failed = True  # end loop; we can't continue.
                        continue
                    else:
                        raise

                # Specify columns, so that we skip any empty events.
                tmp_df = pd.DataFrame(current_event_data, columns=[
                    'body', 'categories', 'concepts', 'date', 'dateTime',
                    'eventUri', 'id', 'isDuplicate', 'lang', 'originalArticle',
                    'sim', 'source', 'time', 'title', 'uri', 'url'])
                tmp_df.to_csv(event_gzip_file, encoding='utf-8', compression='gzip')

            if len(tmp_df) == 0:
                print("WARNING: event contains no articles.")
            # print("shape of df: {}".format(tmp_df.shape))
            # print("unique url: {}".format(len(set(tmp_df['url']))))
            all_articles.append(tmp_df)

        # Combine all news articles into a single dataframe.
        df_articles = pd.concat(all_articles)
        csv_file = csv_file or 'articles-min%d.csv' % min_articles
        df_articles.to_csv(csv_file, encoding='utf-8')

    return df_events, df_articles
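# A minimal usage sketch (hypothetical key and cache path; assumes the aliases
# `import eventregistry as er` and `import os.path as op` used above):
# df_events, df_articles = fetch_event_articles(
#     api_key="YOUR_API_KEY", min_articles=500,
#     csv_file=op.join('csv', 'articles.csv'))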