Example #1
0
    def creating_dataframe(self, dictionary):
        """Clean each user's texts, run the LDA-labelling pipeline, and print results.

        Parameters
        ----------
        dictionary : dict
            Maps keys containing an 8-digit user id (e.g. numeric strings)
            to an iterable of raw text entries for that user.

        Side effects: prints progress banners and ``(key, results)`` per user.
        """
        # NOTE(review): these accumulators are never reset between users, so
        # earlier users' documents leak into later `docs` entries — confirm
        # this accumulation is intentional before changing it.
        final_words = []    # tokenized word lists, accumulated across all users
        final_words1 = []   # word lists with at least 5 tokens
        documents = []      # cleaned document strings fed to the topic model
        docs = {}           # user id -> documents passed to Models(**docs)

        # Extract every 8-digit id embedded in the dictionary keys.
        id_pattern = re.compile(r'\d{8}')  # compiled once instead of per key
        user_ids = []
        for key in dictionary.keys():
            user_ids.extend(id_pattern.findall(str(key)))

        for key in user_ids:
            print(
                "###################### Generating topic labels for {} ############################"
                .format(key))
            # Normalisation pipeline: join, lowercase, tokenize, clean,
            # then re-join with single spaces.
            df = pd.DataFrame(dictionary[key])
            df.columns = ['Text']
            df_ = df['Text'].apply(lambda x: ''.join(x))
            df_ = df_.str.lower()
            df_ = df_.apply(self.tokenize)
            df_ = df_.apply(self.replace)
            df_ = df_.apply(self.split)
            df_ = df_.apply(self.terms_only)
            df_ = df_.apply(lambda x: ' '.join(x))
            df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))

            # Plain loops instead of list comprehensions used for side effects.
            for entry in df_:
                final_words.append("".join(entry).strip().split())
            for words in final_words:
                if len(words) >= 5:
                    final_words1.append(words)
            for words in final_words1:
                documents.append(re.sub(r' +', " ", ' '.join(words)))

            # NOTE(review): on a repeated key this appends the list itself,
            # making docs[key] heterogeneous — verify keys cannot repeat.
            if key in docs:
                docs[key].append(documents)
            else:
                docs[key] = documents

            # Topic modelling + Wikipedia-based label prediction.
            mm = Models(5, 10, **docs)
            terms_to_wiki = mm.calling_methods('LDA')
            ll = Labels(terms_to_wiki)
            wiki_titles = ll.get_titles_wiki()
            equal_length = ll.remove_all_null_dicts_returned_from_wiki(
                **wiki_titles)
            frq = ll.calculating_word_frequency(**equal_length)
            results = ll.predicting_label(**frq)

            print(key, results)
        print('########### FINAL FILE EXECUTED ##################')
Example #2
0
    def creating_dataframe(self, dictionary):
        """Clean each user's texts, run topic labelling, and categorize the labels.

        Parameters
        ----------
        dictionary : dict
            Maps keys containing an 8-digit user id (e.g. numeric strings)
            to an iterable of raw text entries for that user.

        Side effects: appends to ``self.userTweets``, ``self.allWordsFromUsers``
        and ``self.userTopicLabels``; sets ``self.allWordsFromUsersJoined`` and
        ``self.noneDuplicateWordsUsedFromAllUsers``; finally calls
        ``self.allUsersIndexing()`` and ``self.savePreprocessedData()``.
        """
        # NOTE(review): final_words/final_words1 and self.userTweets are never
        # reset between users, so earlier users' words accumulate into later
        # users' data — confirm this is intentional.
        final_words = []    # tokenized word lists, accumulated across all users
        final_words1 = []   # word lists with at least 5 tokens
        docs = {}           # user id -> documents passed to Models(**docs)

        # Extract every 8-digit id embedded in the dictionary keys.
        id_pattern = re.compile(r'\d{8}')  # compiled once instead of per key
        user_ids = []
        for key in dictionary.keys():
            user_ids.extend(id_pattern.findall(str(key)))

        for key in user_ids:
            print(
                "###################### Generating topic labels for {} ############################"
                .format(key))
            # Normalisation pipeline: join, lowercase, tokenize, clean,
            # then re-join with single spaces.
            df = pd.DataFrame(dictionary[key])
            df.columns = ['Text']
            df_ = df['Text'].apply(lambda x: ''.join(x))
            df_ = df_.str.lower()
            df_ = df_.apply(self.tokenize)
            df_ = df_.apply(self.replace)
            df_ = df_.apply(self.split)
            df_ = df_.apply(self.terms_only)
            df_ = df_.apply(lambda x: ' '.join(x))
            df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))

            # Plain loops instead of list comprehensions used for side effects.
            for entry in df_:
                final_words.append("".join(entry).strip().split())
            for words in final_words:
                if len(words) >= 5:
                    final_words1.append(words)
            for words in final_words1:
                self.userTweets.append(re.sub(r' +', " ", ' '.join(words)))

            # NOTE(review): on a repeated key this appends the list itself,
            # making docs[key] heterogeneous — verify keys cannot repeat.
            if key in docs:
                docs[key].append(self.userTweets)
            else:
                docs[key] = self.userTweets

            print(key, ":", self.userTweets)
            # Collect the distinct words used in each tweet.
            currentWordsByUser = []
            for tweet in self.userTweets:
                cleaned = tweet.strip("'").strip('"').strip(",")
                currentWordsByUser.append(list(set(str(cleaned).split())))

            uniqueWordsByUser = list(
                set(itertools.chain.from_iterable(currentWordsByUser)))
            # BUG FIX: the original line was corrupted ('"..."******"..."')
            # and not valid Python; reconstructed as printing the value and
            # its length.
            print("uniqueWordsByUser:", uniqueWordsByUser,
                  "len(uniqueWordsByUser):", len(uniqueWordsByUser))
            # Append all unique words from each user to the global word vector.
            self.allWordsFromUsers.append(uniqueWordsByUser)

            # Topic modelling + Wikipedia-based label prediction.
            mm = Models(50, 10, **docs)  # 50 topics, 10 terms
            terms_to_wiki = mm.calling_methods('LDA')
            ll = Labels(terms_to_wiki)
            wiki_titles = ll.get_titles_wiki()
            equal_length = ll.remove_all_null_dicts_returned_from_wiki(
                **wiki_titles)
            frq = ll.calculating_word_frequency(**equal_length)
            results = ll.predicting_label(**frq)

            # SECURITY: hard-coded API key checked into source — move to an
            # environment variable or config file and rotate the key.
            # Client is created once instead of once per result.
            er = ER.EventRegistry(
                apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
            analytics = ER.Analytics(er)
            topic_labels = []
            for result in results:
                cat = analytics.categorize(result[1])
                for k, v in cat.items():
                    if k == 'categories':
                        for y, value in v[0].items():
                            if y == 'label':
                                topic_labels.append(value.split('/')[2])

            self.userTopicLabels.append(topic_labels)

        print('########### FINAL FILE EXECUTED ##################')
        self.allWordsFromUsersJoined = list(
            itertools.chain.from_iterable(self.allWordsFromUsers))  # joined
        self.noneDuplicateWordsUsedFromAllUsers = list(
            set(self.allWordsFromUsersJoined))
        self.allUsersIndexing()
        self.savePreprocessedData()
    def creating_dataframe(self, dictionary):
        """Clean each user's texts, predict topic labels, and print the categories.

        Parameters
        ----------
        dictionary : dict
            Maps keys containing an 8-digit user id (e.g. numeric strings)
            to an iterable of raw text entries for that user.

        Side effects: prints progress banners and the categorized labels per user.
        """
        # NOTE(review): these accumulators are never reset between users, so
        # earlier users' documents leak into later `docs` entries — confirm
        # this accumulation is intentional.
        final_words = []    # tokenized word lists, accumulated across all users
        final_words1 = []   # word lists with at least 5 tokens
        documents = []      # cleaned document strings fed to the topic model
        docs = {}           # user id -> documents passed to Models(**docs)

        # Extract every 8-digit id embedded in the dictionary keys.
        id_pattern = re.compile(r'\d{8}')  # compiled once instead of per key
        user_ids = []
        for key in dictionary.keys():
            user_ids.extend(id_pattern.findall(str(key)))

        for key in user_ids:
            print(
                "###################### Generating topic labels for {} ############################"
                .format(key))
            # Normalisation pipeline: join, lowercase, tokenize, clean,
            # then re-join with single spaces.
            df = pd.DataFrame(dictionary[key])
            df.columns = ['Text']
            df_ = df['Text'].apply(lambda x: ''.join(x))
            df_ = df_.str.lower()
            df_ = df_.apply(self.tokenize)
            df_ = df_.apply(self.replace)
            df_ = df_.apply(self.split)
            df_ = df_.apply(self.terms_only)
            df_ = df_.apply(lambda x: ' '.join(x))
            df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))

            # Plain loops instead of list comprehensions used for side effects.
            for entry in df_:
                final_words.append("".join(entry).strip().split())
            for words in final_words:
                if len(words) >= 5:
                    final_words1.append(words)
            for words in final_words1:
                documents.append(re.sub(r' +', " ", ' '.join(words)))

            # NOTE(review): on a repeated key this appends the list itself,
            # making docs[key] heterogeneous — verify keys cannot repeat.
            if key in docs:
                docs[key].append(documents)
            else:
                docs[key] = documents

            # Topic modelling + Wikipedia-based label prediction.
            mm = Models(50, 10, **docs)
            terms_to_wiki = mm.calling_methods('LDA')
            ll = Labels(terms_to_wiki)
            wiki_titles = ll.get_titles_wiki()
            equal_length = ll.remove_all_null_dicts_returned_from_wiki(
                **wiki_titles)
            frq = ll.calculating_word_frequency(**equal_length)
            results = ll.predicting_label(**frq)

            # SECURITY: hard-coded API key checked into source — move to an
            # environment variable or config file and rotate the key.
            # Client is created once instead of once per result.
            er = ER.EventRegistry(
                apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
            analytics = ER.Analytics(er)
            topic_labels = []
            for result in results:
                cat = analytics.categorize(result[1])
                for k, v in cat.items():
                    if k == 'categories':
                        for y, value in v[0].items():
                            if y == 'label':
                                topic_labels.append(value.split('/')[2])

            print('\n')
            print(key, topic_labels)
        print('########### FINAL FILE EXECUTED ##################')