Example no. 1
 def testTrainTopic(self):
     analytics = ER.Analytics(self.er)
     ret = analytics.trainTopicCreateTopic("my topic")
     assert ret and "uri" in ret
     uri = ret["uri"]
     analytics.trainTopicAddDocument(
         uri,
         "Facebook has removed 18 accounts and 52 pages associated with the Myanmar military, including the page of its commander-in-chief, after a UN report accused the armed forces of genocide and war crimes."
     )
     analytics.trainTopicAddDocument(
         uri,
         "Emmanuel Macron’s climate commitment to “make this planet great again” has come under attack after his environment minister dramatically quit, saying the French president was not doing enough on climate and other environmental goals."
     )
     analytics.trainTopicAddDocument(
         uri,
         "Theresa May claimed that a no-deal Brexit “wouldn’t be the end of the world” as she sought to downplay a controversial warning made by Philip Hammond last week that it would cost £80bn in extra borrowing and inhibit long-term economic growth."
     )
     # finish training of the topic
     ret = analytics.trainTopicGetTrainedTopic(uri,
                                               ignoreConceptTypes="wiki")
     assert ret and "topic" in ret
     topic = ret["topic"]
     assert "concepts" in topic and len(topic["concepts"]) > 0
     assert "categories" in topic and len(topic["categories"]) > 0
     for concept in topic["concepts"]:
         assert concept["type"] != "wiki"
     # check that we can also get the topic later on
     ret = analytics.trainTopicGetTrainedTopic(uri)
     assert ret and "topic" in ret
     topic = ret["topic"]
     assert "concepts" in topic and len(topic["concepts"]) > 0
     assert "categories" in topic and len(topic["categories"]) > 0
Example no. 2
 def testLanguage(self):
     analytics = ER.Analytics(self.er)
     langInfo = analytics.detectLanguage(
         "Microsoft released a new version of Windows OS.")
     self.assertTrue("languages" in langInfo)
     self.assertTrue("code" in langInfo["languages"][0])
     self.assertTrue("name" in langInfo["languages"][0])
     self.assertTrue("percent" in langInfo["languages"][0])
Example no. 3
 def testCategories(self):
     analytics = ER.Analytics(self.er)
     res = analytics.categorize(
         "Microsoft released a new version of Windows OS.")
     self.assertTrue("categories" in res)
     for catInfo in res["categories"]:
         self.assertTrue("label" in catInfo)
         self.assertTrue("score" in catInfo)
Example no. 4
 def testSentiment(self):
     analytics = ER.Analytics(self.er)
     res = analytics.sentiment(
         """Residents and tourists enjoy holiday weekend even as waves start to pound; beaches remain closed due to dangerous rip currents.
         Despite a state of emergency declared by the governor and warnings about dangerous surf and the possibility of significant coastal flooding, residents and visitors to the Jersey Shore spent Saturday making the most of the calm before the storm.
         Cloudy skies in the morning gave way to sunshine in the afternoon, and despite winds that already were kicking up sand and carving the beach, people flocked to the boardwalk in both Seaside Heights and Point Pleasant Beach, where children rode amusement rides and teens enjoyed ice cream cones. """
     )
     self.assertTrue("avgSent" in res)
     self.assertTrue("sentimentPerSent" in res)
Example no. 5
def DMOZ(results):
    final_Dmoz = {}
    t0 = time.time()
    # create the Event Registry client once and reuse it for every request
    er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
    analytics = ER.Analytics(er)
    for key, value in results.items():
        dmozResults = []
        for j in value:
            # j is either a single prediction or a list of predictions;
            # prediction[1] holds the text to categorize
            predictions = j if isinstance(j, list) else [j]
            for prediction in predictions:
                cat = analytics.categorize(prediction[1])
                try:
                    for k, v in cat.items():
                        if k == 'categories' and len(v) != 0:
                            # keep the second segment of the top DMOZ label
                            for y, val in v[0].items():
                                if y == 'label':
                                    dmozResults.append(val.split('/')[2])
                except Exception:
                    pass

        if key in final_Dmoz:
            final_Dmoz[key].append(dmozResults)
        else:
            final_Dmoz[key] = [dmozResults]

    print("### Executed time:", round(time.time() - t0, 3), "s ###")
    return final_Dmoz
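The nested items() loops above (and in the later Dmoz variants) only ever read the top category's label; a direct-access equivalent is shown below as a minimal sketch, with the response shape ({'categories': [{'label': 'dmoz/...'}]}) inferred from those loops rather than taken from the API documentation:

def topDmozLabel(cat):
    # return the second segment of the top DMOZ label, or None if absent
    categories = (cat or {}).get('categories') or []
    if categories and 'label' in categories[0]:
        parts = categories[0]['label'].split('/')
        return parts[2] if len(parts) > 2 else None
    return None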
Example no. 6
 def testExtractArticleInfo(self):
     analytics = ER.Analytics(self.er)
     info = analytics.extractArticleInfo(
         "https://www.theguardian.com/world/2018/jan/31/this-is-over-puigdemonts-catalan-independence-doubts-caught-on-camera"
     )
     self.assertTrue("title" in info)
     self.assertTrue("body" in info)
     self.assertTrue("date" in info)
     self.assertTrue("datetime" in info)
     self.assertTrue("image" in info)
Example no. 7
 def testCategories(self):
     analytics = ER.Analytics(self.er)
     cats = analytics.categorize("Microsoft released a new version of Windows OS.")
     self.assertTrue("dmoz" in cats)
     self.assertTrue("categories" in cats.get("dmoz"))
     self.assertTrue("keywords" in cats.get("dmoz"))
     cat = cats.get("dmoz").get("categories")[0]
     self.assertTrue("label" in cat)
     self.assertTrue("score" in cat)
     kw = cats.get("dmoz").get("keywords")[0]
     self.assertTrue("keyword" in kw)
     self.assertTrue("wgt" in kw)
Example no. 8
 def testTrainTopicOnTwitter(self):
     analytics = ER.Analytics(self.er)
     ret = analytics.trainTopicOnTweets("@SeanEllis",
                                        maxConcepts=50,
                                        maxCategories=20,
                                        maxTweets=400,
                                        maxUsedLinks=400,
                                        ignoreConceptTypes=["wiki", "loc"])
     assert ret and "uri" in ret
     uri = ret["uri"]
     # training runs asynchronously; in practice we should wait more than 5 seconds
     time.sleep(5)
     ret = analytics.trainTopicGetTrainedTopic(uri)
     assert ret and "topic" in ret
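Rather than guessing at a fixed sleep, one might poll until the trained topic is actually returned. A minimal sketch; the assumption that the response lacks a "topic" key until training completes is ours, not documented behavior:

import time

def waitForTrainedTopic(analytics, uri, attempts=10, delay=5):
    # poll trainTopicGetTrainedTopic until "topic" appears or we give up
    # (the "no 'topic' key until ready" condition is an assumption)
    for _ in range(attempts):
        ret = analytics.trainTopicGetTrainedTopic(uri)
        if ret and "topic" in ret:
            return ret["topic"]
        time.sleep(delay)
    return None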
Example no. 9
def Dmoz(pred):
    timestamps = []
    counter = 1
    t0 = time.time()
    # create the Event Registry client once and reuse it for every request
    er = ER.EventRegistry(apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
    analytics = ER.Analytics(er)

    for key, value in pred.items():
        start_time = time.time()
        dmozResults = []
        for j in value:
            for k, v in j.items():
                cat = analytics.categorize(v[0])
                try:
                    for ck, cv in cat.items():
                        if ck == 'categories' and len(cv) != 0:
                            # keep the second segment of the top DMOZ label
                            for y, val in cv[0].items():
                                if y == 'label':
                                    dmozResults.append(val.split('/')[2])
                except Exception:
                    pass

        # record per-key timing and append this key's labels to the output file
        timestamps.append((counter, time.time() - start_time))
        counter += 1

        with open('/data/s1931628/latinumbigDatafile.csv', 'a') as file:
            csv_writer = csv.writer(file)
            csv_writer.writerow((key, dmozResults))

    with open('latinumtimeOneParseDmoz.csv', 'a') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(timestamps)

    with open('latinumtimeOneChunkParseDmozOnly.csv', 'a') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow([time.time() - t0])

    print("### Executed time:", round(time.time() - t0, 3), "s ###")
Example no. 10
 def testConcepts(self):
     analytics = ER.Analytics(self.er)
     annInfo = analytics.annotate("Microsoft released a new version of Windows OS.")
     self.assertTrue("annotations" in annInfo, "Annotations were not provided for the given text")
     anns = annInfo["annotations"]
     self.assertTrue(len(anns) == 2)
     self.assertTrue("url" in anns[0])
     self.assertTrue("title" in anns[0])
     self.assertTrue("lang" in anns[0])
     self.assertTrue("secLang" in anns[0])
     self.assertTrue("secUrl" in anns[0])
     self.assertTrue("secTitle" in anns[0])
     self.assertTrue("wgt" in anns[0])
     self.assertTrue("wikiDataItemId" in anns[0])
     self.assertTrue("adverbs" in annInfo)
     self.assertTrue("adjectives" in annInfo)
     self.assertTrue("verbs" in annInfo)
     self.assertTrue("nouns" in annInfo)
     self.assertTrue("ranges" in annInfo)
     self.assertTrue("language" in annInfo)
Example no. 11
    def creating_dataframe(self, dictionary):
        final_words = []
        final_words1 = []

        l = []
        z = []
        docs = {}
        keys = dictionary.keys()
        for key in keys:
            kk = str(key)
            k = re.findall(r'\d{8}', kk)
            l.append(k)
        for i in l:
            for j in i:
                z.append(j)
        for key in z:
            # if key == '19234329':
            print(
                "###################### Generating topic labels for {} ############################"
                .format(key))
            df = pd.DataFrame(dictionary[key])
            df.columns = ['Text']
            df_ = df['Text'].apply(lambda x: ''.join(x))
            df_ = df_.str.lower()
            df_ = df_.apply(self.tokenize)
            df_ = df_.apply(self.replace)
            df_ = df_.apply(self.split)
            df_ = df_.apply(self.terms_only)
            df_ = df_.apply(lambda x: ' '.join(x))
            df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))
            for i in df_:
                final_words.append("".join(i).strip().split())
            for i in final_words:
                if len(i) >= 5:
                    final_words1.append(i)
            for i in final_words1:
                self.userTweets.append(re.sub(r' +', " ", ' '.join(i)))

            if key in docs:
                docs[key].append(self.userTweets)
            else:
                docs[key] = self.userTweets

            print(key, ":", self.userTweets)
            currentWordsByUser = []
            for i in range(len(self.userTweets)):
                tweetWords = self.userTweets[i].strip("'")
                tweetWords = tweetWords.strip('"')
                tweetWords = tweetWords.strip(",")

                currentWordsByUser.append(list(set(str(tweetWords).split())))

            uniqueWordsByUser = list(
                set(list(itertools.chain.from_iterable(currentWordsByUser))))
            print("uniqueWordsByUser:"******"len(uniqueWordsByUser):", len(uniqueWordsByUser))
            #append all unique words from each user to global word vector
            self.allWordsFromUsers.append(uniqueWordsByUser)

            ###

            mm = Models(50, 10, **docs)  #50,10
            terms_to_wiki = mm.calling_methods('LDA')
            ll = Labels(terms_to_wiki)
            wiki_titles = ll.get_titles_wiki()
            equal_length = ll.remove_all_null_dicts_returned_from_wiki(
                **wiki_titles)
            frq = ll.calculating_word_frequency(**equal_length)
            #print(equal_length)
            #print("------")
            #print(frq)

            results = ll.predicting_label(**frq)
            l = []
            # create the client once for all label lookups in this batch
            er = ER.EventRegistry(
                apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
            analytics = ER.Analytics(er)
            for i in range(len(results)):
                cat = analytics.categorize(results[i][1])
                for k, v in cat.items():
                    if k == 'categories':
                        for y, val in v[0].items():
                            if y == 'label':
                                l.append(val.split('/')[2])

            self.userTopicLabels.append(l)

        print('########### FINAL FILE EXECUTED ##################')
        self.allWordsFromUsersJoined = list(
            itertools.chain.from_iterable(self.allWordsFromUsers))  #joined
        self.noneDuplicateWordsUsedFromAllUsers = list(
            set(self.allWordsFromUsersJoined))
        self.allUsersIndexing()
        self.savePreprocessedData()
Example no. 12
 def testSemanticSimilarity(self):
     doc1 = "The editor, Carrie Gracie, who joined the network 30 years ago, said she quit her position as China editor last week to protest pay inequality within the company. In the letter posted on her website, she said that she and other women had long suspected their male counterparts drew larger salaries and that BBC management had refused to acknowledge the problem."
     doc2 = "Paukenschlag bei der britischen BBC: Die China-Expertin Carrie Gracie hat aus Protest gegen die illegale Gehaltskultur und damit verbundene Heimlichtuerei ihren Job bei dem öffentlich-rechtlichen Sender hingeworfen. Zwei ihrer männlichen Kollegen in vergleichbaren Positionen würden nachweislich wesentlich besser bezahlt."
     analytics = ER.Analytics(self.er)
     ret = analytics.semanticSimilarity(doc1, doc2)
     self.assertTrue("similarity" in ret)
Example no. 13
import json
import eventregistry as ER

er = ER.EventRegistry('569a0bbd-eb92-4249-9434-c401f4d2c4cc')
analytics = ER.Analytics(er)

sums = []  # one summed sentiment score per input file

for i in range(30):
    with open('JsonFile{}.json'.format(i), 'r') as fp:
        jsonObj = json.load(fp)
        print(jsonObj)
        # keep only the articles whose title mentions Google
        newList = [article for article in jsonObj['articles']
                   if 'Google' in article['title']]
        print(newList)
        sum1 = 0
        for article in newList:
            print(article['description'])
            text = article['description']
            if text is not None:
                sentiment = analytics.sentiment(text=text)
                sum1 += sentiment['avgSent']

        sums.append(sum1)
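The list above holds a per-file sum of avgSent values; a per-file mean also needs the number of scored articles. A minimal sketch, assuming a hypothetical scores list is collected inside the loop (e.g. scores.append(sentiment['avgSent'])):

# hypothetical per-file averaging; `scores` is collected inside the loop above
average = sum(scores) / len(scores) if scores else 0.0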
Example no. 14
    def creating_dataframe(self, dictionary):
        final_words = []
        final_words1 = []
        documents = []
        l = []
        z = []
        docs = {}
        keys = dictionary.keys()
        for key in keys:
            kk = str(key)
            k = re.findall(r'\d{8}', kk)
            l.append(k)
        for i in l:
            for j in i:
                z.append(j)
        for key in z:
            # if key == '19234329':
            print(
                "###################### Generating topic labels for {} ############################"
                .format(key))
            df = pd.DataFrame(dictionary[key])
            df.columns = ['Text']
            df_ = df['Text'].apply(lambda x: ''.join(x))
            df_ = df_.str.lower()
            df_ = df_.apply(self.tokenize)
            df_ = df_.apply(self.replace)
            df_ = df_.apply(self.split)
            df_ = df_.apply(self.terms_only)
            df_ = df_.apply(lambda x: ' '.join(x))
            df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))
            for i in df_:
                final_words.append("".join(i).strip().split())
            for i in final_words:
                if len(i) >= 5:
                    final_words1.append(i)
            for i in final_words1:
                documents.append(re.sub(r' +', " ", ' '.join(i)))

            if key in docs:
                docs[key].append(documents)
            else:
                docs[key] = documents

            mm = Models(50, 10, **docs)
            terms_to_wiki = mm.calling_methods('LDA')
            ll = Labels(terms_to_wiki)
            wiki_titles = ll.get_titles_wiki()
            equal_length = ll.remove_all_null_dicts_returned_from_wiki(
                **wiki_titles)
            frq = ll.calculating_word_frequency(**equal_length)
            results = ll.predicting_label(**frq)
            l = []
            # create the client once for all label lookups in this batch
            er = ER.EventRegistry(
                apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
            analytics = ER.Analytics(er)
            for i in range(len(results)):
                cat = analytics.categorize(results[i][1])
                for k, v in cat.items():
                    if k == 'categories':
                        for y, val in v[0].items():
                            if y == 'label':
                                l.append(val.split('/')[2])

            print('\n')
            print(key, l)
        print('########### FINAL FILE EXECUTED ##################')
Example no. 15
 def testLanguage(self):
     analytics = ER.Analytics(self.er)
     langInfo = analytics.detectLanguage("Microsoft released a new version of Windows OS.")
     print(langInfo)