def testTrainTopic(self):
    """Create a topic, train it on three news snippets and validate the result."""
    analytics = ER.Analytics(self.er)
    created = analytics.trainTopicCreateTopic("my topic")
    assert created and "uri" in created
    topicUri = created["uri"]
    sampleDocs = [
        "Facebook has removed 18 accounts and 52 pages associated with the Myanmar military, including the page of its commander-in-chief, after a UN report accused the armed forces of genocide and war crimes.",
        "Emmanuel Macron’s climate commitment to “make this planet great again” has come under attack after his environment minister dramatically quit, saying the French president was not doing enough on climate and other environmental goals.",
        "Theresa May claimed that a no-deal Brexit “wouldn’t be the end of the world” as she sought to downplay a controversial warning made by Philip Hammond last week that it would cost £80bn in extra borrowing and inhibit long-term economic growth.",
    ]
    for doc in sampleDocs:
        analytics.trainTopicAddDocument(topicUri, doc)
    # finish training of the topic, excluding wiki-type concepts
    response = analytics.trainTopicGetTrainedTopic(topicUri, ignoreConceptTypes="wiki")
    assert response and "topic" in response
    trained = response["topic"]
    assert "concepts" in trained and len(trained["concepts"]) > 0
    assert "categories" in trained and len(trained["categories"]) > 0
    for concept in trained["concepts"]:
        assert concept["type"] != "wiki"
    # check that we can also get the topic later on
    response = analytics.trainTopicGetTrainedTopic(topicUri)
    assert response and "topic" in response
    trained = response["topic"]
    assert "concepts" in trained and len(trained["concepts"]) > 0
    assert "categories" in trained and len(trained["categories"]) > 0
def testLanguage(self):
    """Detect the language of a short English sentence and verify the payload."""
    analytics = ER.Analytics(self.er)
    result = analytics.detectLanguage(
        "Microsoft released a new version of Windows OS.")
    self.assertTrue("languages" in result)
    topLanguage = result["languages"][0]
    for field in ("code", "name", "percent"):
        self.assertTrue(field in topLanguage)
def testCategories(self):
    """Categorize a sentence and check the shape of every category entry."""
    analytics = ER.Analytics(self.er)
    response = analytics.categorize(
        "Microsoft released a new version of Windows OS.")
    self.assertTrue("categories" in response)
    for category in response["categories"]:
        self.assertTrue("label" in category)
        self.assertTrue("score" in category)
def testSentiment(self):
    """Run sentiment analysis on a longer news passage and check result keys."""
    analytics = ER.Analytics(self.er)
    passage = """Residents and tourists enjoy holiday weekend even as waves start to pound; beaches remain closed due to dangerous rip currents. Despite a state of emergency declared by the governor and warnings about dangerous surf and the possibility of significant coastal flooding, residents and visitors to the Jersey Shore spent Saturday making the most of the calm before the storm. Cloudy skies in the morning gave way to sunshine in the afternoon, and despite winds that already were kicking up sand and carving the beach, people flocked to the boardwalk in both Seaside Heights and Point Pleasant Beach, where children rode amusement rides and teens enjoyed ice cream cones. """
    result = analytics.sentiment(passage)
    for field in ("avgSent", "sentimentPerSent"):
        self.assertTrue(field in result)
def DMOZ(results):
    """Look up a top-level DMOZ category for every predicted text.

    ``results`` maps keys to lists whose items are either ``(…, text)``
    pairs or lists of such pairs.  Each text is categorized through Event
    Registry and the third path segment of the first DMOZ label is kept.

    Returns a dict mapping each input key to a list of label lists.
    """
    # Create the API client once — the original rebuilt it per prediction,
    # which is wasteful and slow.
    er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
    analytics = ER.Analytics(er)

    def _top_level_label(text):
        """Return the third path segment of the first DMOZ label, or None."""
        try:
            cat = analytics.categorize(text)
            categories = cat.get('categories') if isinstance(cat, dict) else None
            if categories:
                label = categories[0].get('label')
                if label:
                    return label.split('/')[2]
        except (KeyError, IndexError, TypeError, AttributeError):
            # Malformed API response — skip this prediction (best effort),
            # but no longer swallow *every* exception with a bare except.
            pass
        return None

    final_Dmoz = {}
    t0 = time.time()
    for key, value in results.items():
        dmozResults = []
        for item in value:
            # An item is either a list of predictions or a single prediction.
            predictions = item if isinstance(item, list) else [item]
            for prediction in predictions:
                label = _top_level_label(prediction[1])
                if label is not None:
                    dmozResults.append(label)
        final_Dmoz.setdefault(key, []).append(dmozResults)
    print("### Executed time:", round(time.time() - t0, 3), "s ###")
    return final_Dmoz
def testExtractArticleInfo(self):
    """Extract article metadata from a Guardian URL and verify the fields."""
    analytics = ER.Analytics(self.er)
    extracted = analytics.extractArticleInfo(
        "https://www.theguardian.com/world/2018/jan/31/this-is-over-puigdemonts-catalan-independence-doubts-caught-on-camera"
    )
    for field in ("title", "body", "date", "datetime", "image"):
        self.assertTrue(field in extracted)
def testCategories(self):
    """Categorize text and verify the dmoz categories/keywords payload."""
    analytics = ER.Analytics(self.er)
    response = analytics.categorize("Microsoft released a new version of Windows OS.")
    self.assertTrue("dmoz" in response)
    dmoz = response.get("dmoz")
    self.assertTrue("categories" in dmoz)
    self.assertTrue("keywords" in dmoz)
    firstCategory = dmoz.get("categories")[0]
    self.assertTrue("label" in firstCategory)
    self.assertTrue("score" in firstCategory)
    firstKeyword = dmoz.get("keywords")[0]
    self.assertTrue("keyword" in firstKeyword)
    self.assertTrue("wgt" in firstKeyword)
def testTrainTopicOnTwitter(self):
    """Train a topic from a Twitter account and fetch the trained result.

    Training runs asynchronously server-side, so the test waits before
    requesting the trained topic.
    """
    analytics = ER.Analytics(self.er)
    ret = analytics.trainTopicOnTweets("@SeanEllis",
                                       maxConcepts=50,
                                       maxCategories=20,
                                       maxTweets=400,
                                       maxUsedLinks=400,
                                       ignoreConceptTypes=["wiki", "loc"])
    assert ret and "uri" in ret
    uri = ret["uri"]
    # The original comment said to sleep *more than* 5 seconds but the code
    # slept exactly 5 — wait with a safer margin before polling.
    time.sleep(10)
    ret = analytics.trainTopicGetTrainedTopic(uri)
    assert ret and "topic" in ret
def Dmoz(pred):
    """Look up DMOZ top-level categories for predicted texts and log to CSV.

    For every key in ``pred`` the predicted texts are categorized through
    Event Registry; the resulting labels are written per key to a data CSV,
    and per-key timings plus the total runtime are appended to timing CSVs.
    Nothing is returned.
    """
    timestamps = []
    t0 = time.time()
    # Create the API client once — the original rebuilt it for every single
    # prediction inside the innermost loop.
    er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
    analytics = ER.Analytics(er)
    # enumerate replaces the hand-rolled `counter` variable.
    for counter, (key, value) in enumerate(pred.items(), start=1):
        start_time = time.time()
        dmozResults = []
        for item in value:
            for _, v in item.items():
                try:
                    cat = analytics.categorize(v[0])
                    categories = cat.get('categories') if isinstance(cat, dict) else None
                    if categories:
                        label = categories[0].get('label')
                        if label:
                            dmozResults.append(label.split('/')[2])
                except Exception:
                    # Best effort: one failed lookup must not abort the run.
                    # (Narrowed from the original bare `except:` that also
                    # swallowed KeyboardInterrupt/SystemExit.)
                    pass
        timestamps.append((counter, time.time() - start_time))
        with open('/data/s1931628/latinumbigDatafile.csv', 'a') as file:
            csv.writer(file).writerow((key, dmozResults))
    with open('latinumtimeOneParseDmoz.csv', 'a') as file:
        csv.writer(file).writerow(timestamps)
    with open('latinumtimeOneChunkParseDmozOnly.csv', 'a') as file:
        csv.writer(file).writerow([time.time() - t0])
    print("### Executed time:", round(time.time() - t0, 3), "s ###")
def testConcepts(self):
    """Annotate a sentence and verify the structure of the annotation payload."""
    analytics = ER.Analytics(self.er)
    annInfo = analytics.annotate("Microsoft released a new version of Windows OS.")
    self.assertTrue("annotations" in annInfo, "Annotations were not provided for the given text")
    anns = annInfo["annotations"]
    self.assertTrue(len(anns) == 2)
    firstAnnotation = anns[0]
    for field in ("url", "title", "lang", "secLang",
                  "secUrl", "secTitle", "wgt", "wikiDataItemId"):
        self.assertTrue(field in firstAnnotation)
    for field in ("adverbs", "adjectives", "verbs",
                  "nouns", "ranges", "language"):
        self.assertTrue(field in annInfo)
def creating_dataframe(self, dictionary):
    """Build per-user documents from raw tweets, train a topic model and
    derive DMOZ topic labels for the users.

    ``dictionary`` maps keys containing 8-digit user ids to lists of tweets.
    Intermediate and final results are accumulated on ``self`` (userTweets,
    allWordsFromUsers, userTopicLabels, …) and persisted at the end via
    ``savePreprocessedData``.

    NOTE(review): the original source of this block was corrupted (a print
    statement was replaced by ``******``, a syntax error) and collapsed onto
    one line; the structure below is a faithful reconstruction — confirm
    loop nesting against version control.
    """
    final_words = []
    final_words1 = []
    docs = {}
    # Extract the 8-digit user ids embedded in the dictionary keys.
    userIds = []
    for key in dictionary.keys():
        for match in re.findall(r'\d{8}', str(key)):
            userIds.append(match)
    for key in userIds:
        print(
            "###################### Generating topic labels for {} ############################"
            .format(key))
        df = pd.DataFrame(dictionary[key])
        df.columns = ['Text']
        # Normalization pipeline: join -> lowercase -> tokenize -> cleanup.
        df_ = df['Text'].apply(lambda x: ''.join(x))
        df_ = df_.str.lower()
        df_ = df_.apply(self.tokenize)
        df_ = df_.apply(self.replace)
        df_ = df_.apply(self.split)
        df_ = df_.apply(self.terms_only)
        df_ = df_.apply(lambda x: ' '.join(x))
        df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))
        for entry in df_:
            final_words.append("".join(entry).strip().split())
        # Keep only documents with at least 5 tokens.
        for words in final_words:
            if len(words) >= 5:
                final_words1.append(words)
        for words in final_words1:
            self.userTweets.append(re.sub(r' +', " ", ' '.join(words)))
        if key in docs:
            docs[key].append(self.userTweets)
        else:
            docs[key] = self.userTweets
        print(key, ":", self.userTweets)
        currentWordsByUser = []
        for tweet in self.userTweets:
            stripped = str(tweet.strip("'").strip('"').strip(","))
            currentWordsByUser.append(list(set(stripped.split())))
        uniqueWordsByUser = list(
            set(itertools.chain.from_iterable(currentWordsByUser)))
        # Restored print (the original statement was redacted to "******").
        print("uniqueWordsByUser:", uniqueWordsByUser,
              "len(uniqueWordsByUser):", len(uniqueWordsByUser))
        # append all unique words from each user to global word vector
        self.allWordsFromUsers.append(uniqueWordsByUser)
    # Train topic models on all collected documents and label the topics.
    mm = Models(50, 10, **docs)  # 50 topics, 10 terms
    terms_to_wiki = mm.calling_methods('LDA')
    ll = Labels(terms_to_wiki)
    wiki_titles = ll.get_titles_wiki()
    equal_length = ll.remove_all_null_dicts_returned_from_wiki(**wiki_titles)
    frq = ll.calculating_word_frequency(**equal_length)
    results = ll.predicting_label(**frq)
    labels = []
    # Create the Event Registry client once for all predictions.
    er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
    analytics = ER.Analytics(er)
    for result in results:
        cat = analytics.categorize(result[1])
        categories = cat.get('categories') if isinstance(cat, dict) else None
        # Guard against an empty category list before indexing [0].
        if categories:
            label = categories[0].get('label')
            if label:
                labels.append(label.split('/')[2])
    self.userTopicLabels.append(labels)
    print('########### FINAL FILE EXECUTED ##################')
    self.allWordsFromUsersJoined = list(
        itertools.chain.from_iterable(self.allWordsFromUsers))
    self.noneDuplicateWordsUsedFromAllUsers = list(
        set(self.allWordsFromUsersJoined))
    self.allUsersIndexing()
    self.savePreprocessedData()
def testSemanticSimilarity(self):
    """Compare an English and a German article about the same story."""
    englishDoc = "The editor, Carrie Gracie, who joined the network 30 years ago, said she quit her position as China editor last week to protest pay inequality within the company. In the letter posted on her website, she said that she and other women had long suspected their male counterparts drew larger salaries and that BBC management had refused to acknowledge the problem."
    germanDoc = "Paukenschlag bei der britischen BBC: Die China-Expertin Carrie Gracie hat aus Protest gegen die illegale Gehaltskultur und damit verbundene Heimlichtuerei ihren Job bei dem öffentlich-rechtlichen Sender hingeworfen. Zwei ihrer männlichen Kollegen in vergleichbaren Positionen würden nachweislich wesentlich besser bezahlt."
    analytics = ER.Analytics(self.er)
    result = analytics.semanticSimilarity(englishDoc, germanDoc)
    self.assertTrue("similarity" in result)
"""Sum the average sentiment of Google-related articles across 30 JSON files."""
import json

import eventregistry as ER

er = ER.EventRegistry('569a0bbd-eb92-4249-9434-c401f4d2c4cc')
analytics = ER.Analytics(er)

# Per-file sums of average sentiment; renamed from `sum`, which shadowed
# the builtin of the same name.
sentiment_sums = []
for i in range(30):
    with open('JsonFile{}.json'.format(i), 'r') as fp:
        jsonObj = json.load(fp)
    print(jsonObj)
    # Keep only articles whose title mentions Google.
    googleArticles = []
    for article in jsonObj['articles']:
        if article['title'].find('Google') != -1:
            googleArticles.append(article)
    print(googleArticles)
    # The original round-tripped the list through json.dumps/json.loads,
    # which is just an expensive deep copy — iterate the list directly.
    fileTotal = 0
    for article in googleArticles:
        print(article['description'])
        text = article['description']
        if text is not None:  # `is not None`, not `!= None`
            sentiment = analytics.sentiment(text=text)
            fileTotal = fileTotal + sentiment['avgSent']
    sentiment_sums.append(fileTotal)
def creating_dataframe(self, dictionary):
    """Build per-user documents from raw tweets, train a topic model and
    print a DMOZ topic label list for each user.

    ``dictionary`` maps keys containing 8-digit user ids to lists of tweets.
    Unlike the sibling variant, this version only prints its results.
    """
    final_words = []
    final_words1 = []
    documents = []
    docs = {}
    # Extract the 8-digit user ids embedded in the dictionary keys.
    userIds = []
    for key in dictionary.keys():
        for match in re.findall(r'\d{8}', str(key)):
            userIds.append(match)
    for key in userIds:
        print(
            "###################### Generating topic labels for {} ############################"
            .format(key))
        df = pd.DataFrame(dictionary[key])
        df.columns = ['Text']
        # Normalization pipeline: join -> lowercase -> tokenize -> cleanup.
        df_ = df['Text'].apply(lambda x: ''.join(x))
        df_ = df_.str.lower()
        df_ = df_.apply(self.tokenize)
        df_ = df_.apply(self.replace)
        df_ = df_.apply(self.split)
        df_ = df_.apply(self.terms_only)
        df_ = df_.apply(lambda x: ' '.join(x))
        df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))
        for entry in df_:
            final_words.append("".join(entry).strip().split())
        # Keep only documents with at least 5 tokens.
        for words in final_words:
            if len(words) >= 5:
                final_words1.append(words)
        for words in final_words1:
            documents.append(re.sub(r' +', " ", ' '.join(words)))
        if key in docs:
            docs[key].append(documents)
        else:
            docs[key] = documents
    # Train topic models on all collected documents and label the topics.
    mm = Models(50, 10, **docs)
    terms_to_wiki = mm.calling_methods('LDA')
    ll = Labels(terms_to_wiki)
    wiki_titles = ll.get_titles_wiki()
    equal_length = ll.remove_all_null_dicts_returned_from_wiki(**wiki_titles)
    frq = ll.calculating_word_frequency(**equal_length)
    results = ll.predicting_label(**frq)
    labels = []
    # Create the Event Registry client once — the original rebuilt it for
    # every prediction inside the loop.
    er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
    analytics = ER.Analytics(er)
    for result in results:
        cat = analytics.categorize(result[1])
        categories = cat.get('categories') if isinstance(cat, dict) else None
        # Guard against an empty category list before indexing [0].
        if categories:
            label = categories[0].get('label')
            if label:
                labels.append(label.split('/')[2])
    print('\n')
    print(key, labels)
    print('########### FINAL FILE EXECUTED ##################')
def testLanguage(self):
    """Detect the language of an English sentence and print the raw response."""
    analytics = ER.Analytics(self.er)
    langInfo = analytics.detectLanguage("Microsoft released a new version of Windows OS.")
    # Fixed: the original used the Python 2 `print langInfo` statement,
    # which is a syntax error under Python 3.
    print(langInfo)