def creating_dataframe(self, dictionary):
    """Generate topic labels for every user found in *dictionary*.

    Keys of *dictionary* are expected to embed an 8-digit user id
    (extracted with ``\\d{8}``); each value is an iterable of texts that
    pandas can wrap in a single-column DataFrame.  For each user id the
    texts are normalised (tokenize -> replace -> split -> terms_only),
    fed through LDA via ``Models``/``Labels``, and the predicted labels
    are printed.

    NOTE(review): the accumulators ``final_words``/``final_words1``/
    ``documents`` are intentionally NOT reset between users here, so each
    user's document list also contains all previous users' documents —
    preserved as-is, but confirm this is the intended behaviour.
    """
    final_words = []
    final_words1 = []
    documents = []
    docs = {}

    # Collect every 8-digit user id embedded in the dictionary keys.
    user_ids = []
    for key in dictionary.keys():
        user_ids.extend(re.findall(r'\d{8}', str(key)))

    for key in user_ids:
        # if key == '19234329':
        print(
            "###################### Generating topic labels for {} ############################"
            .format(key))

        # Normalisation pipeline over the user's texts.
        df = pd.DataFrame(dictionary[key])
        df.columns = ['Text']
        df_ = df['Text'].apply(lambda x: ''.join(x))
        df_ = df_.str.lower()
        df_ = df_.apply(self.tokenize)
        df_ = df_.apply(self.replace)
        df_ = df_.apply(self.split)
        df_ = df_.apply(self.terms_only)
        df_ = df_.apply(lambda x: ' '.join(x))
        df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))

        # Plain loops instead of side-effect list comprehensions.
        for text in df_:
            final_words.append("".join(text).strip().split())
        for words in final_words:
            if len(words) >= 5:  # drop texts with fewer than 5 tokens
                final_words1.append(words)
        for words in final_words1:
            documents.append(re.sub(r' +', " ", ' '.join(words)))

        # NOTE(review): every key ends up sharing the same ``documents``
        # list object (the ``append`` branch is effectively dead on the
        # first visit of a key) — preserved verbatim; verify intent.
        if key in docs:
            docs[key].append(documents)
        else:
            docs[key] = documents

        # Topic modelling + label prediction for the accumulated docs.
        mm = Models(5, 10, **docs)
        terms_to_wiki = mm.calling_methods('LDA')
        ll = Labels(terms_to_wiki)
        wiki_titles = ll.get_titles_wiki()
        equal_length = ll.remove_all_null_dicts_returned_from_wiki(
            **wiki_titles)
        frq = ll.calculating_word_frequency(**equal_length)
        results = ll.predicting_label(**frq)
        print(key, results)

    # NOTE(review): original indentation was lost; this end-of-run banner
    # is placed after the per-user loop — confirm placement.
    print('########### FINAL FILE EXECUTED ##################')
def creating_dataframe(self, dictionary):
    """Preprocess per-user tweets, build a global vocabulary, and label
    each user's topics via LDA + Wikipedia + EventRegistry.

    For every 8-digit user id found in the keys of *dictionary*:
    normalise the user's texts, append the resulting documents to
    ``self.userTweets``, record the user's unique words in
    ``self.allWordsFromUsers``, run topic modelling, and categorise the
    predicted labels through the EventRegistry API into
    ``self.userTopicLabels``.  Finally joins/dedupes the global word
    lists and calls ``self.allUsersIndexing()`` and
    ``self.savePreprocessedData()``.

    NOTE(review): ``final_words``/``final_words1`` are not reset per
    user, so later users re-process earlier users' words — preserved
    as-is; confirm this is intended.
    """
    final_words = []
    final_words1 = []
    docs = {}

    # Collect every 8-digit user id embedded in the dictionary keys.
    user_ids = []
    for key in dictionary.keys():
        user_ids.extend(re.findall(r'\d{8}', str(key)))

    for key in user_ids:
        # if key == '19234329':
        print(
            "###################### Generating topic labels for {} ############################"
            .format(key))

        # Normalisation pipeline over the user's texts.
        df = pd.DataFrame(dictionary[key])
        df.columns = ['Text']
        df_ = df['Text'].apply(lambda x: ''.join(x))
        df_ = df_.str.lower()
        df_ = df_.apply(self.tokenize)
        df_ = df_.apply(self.replace)
        df_ = df_.apply(self.split)
        df_ = df_.apply(self.terms_only)
        df_ = df_.apply(lambda x: ' '.join(x))
        df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))

        # Plain loops instead of side-effect list comprehensions.
        for text in df_:
            final_words.append("".join(text).strip().split())
        for words in final_words:
            if len(words) >= 5:  # drop texts with fewer than 5 tokens
                final_words1.append(words)
        for words in final_words1:
            self.userTweets.append(re.sub(r' +', " ", ' '.join(words)))

        # NOTE(review): every key aliases the same ``self.userTweets``
        # list — preserved verbatim; verify intent.
        if key in docs:
            docs[key].append(self.userTweets)
        else:
            docs[key] = self.userTweets
        print(key, ":", self.userTweets)

        # Unique words used by the current user.
        currentWordsByUser = []
        for tweet in self.userTweets:
            tweet_words = tweet.strip("'").strip('"').strip(",")
            currentWordsByUser.append(list(set(str(tweet_words).split())))
        uniqueWordsByUser = list(
            set(itertools.chain.from_iterable(currentWordsByUser)))
        # NOTE(review): this print was corrupted by secret-redaction
        # ("******" in the checked-in file); reconstructed as two prints.
        print("uniqueWordsByUser:", uniqueWordsByUser)
        print("len(uniqueWordsByUser):", len(uniqueWordsByUser))
        # Append all unique words from each user to the global word vector.
        self.allWordsFromUsers.append(uniqueWordsByUser)

        # Topic modelling + label prediction.
        mm = Models(50, 10, **docs)  # 50 topics, 10 terms (was 5, 10)
        terms_to_wiki = mm.calling_methods('LDA')
        ll = Labels(terms_to_wiki)
        wiki_titles = ll.get_titles_wiki()
        equal_length = ll.remove_all_null_dicts_returned_from_wiki(
            **wiki_titles)
        frq = ll.calculating_word_frequency(**equal_length)
        results = ll.predicting_label(**frq)

        # SECURITY(review): API key is hard-coded in source — it has been
        # committed and should be rotated and read from an environment
        # variable or config instead.
        # Build the client once, not once per result (was re-created in
        # the loop body).
        er = ER.EventRegistry(
            apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
        analytics = ER.Analytics(er)
        labels = []
        for result in results:
            cat = analytics.categorize(result[1])
            for cat_key, cat_value in cat.items():
                if cat_key == 'categories':
                    for field, value in cat_value[0].items():
                        if field == 'label':
                            # e.g. 'dmoz/Top/Sports/...' -> 'Sports'
                            labels.append(value.split('/')[2])
        self.userTopicLabels.append(labels)

    # NOTE(review): original indentation was lost; the banner and the
    # global aggregation are placed after the per-user loop — confirm.
    print('########### FINAL FILE EXECUTED ##################')
    self.allWordsFromUsersJoined = list(
        itertools.chain.from_iterable(self.allWordsFromUsers))  # joined
    self.noneDuplicateWordsUsedFromAllUsers = list(
        set(self.allWordsFromUsersJoined))
    self.allUsersIndexing()
    self.savePreprocessedData()
def creating_dataframe(self, dictionary):
    """Generate and print an EventRegistry category label per user.

    For every 8-digit user id found in the keys of *dictionary*:
    normalise the user's texts, accumulate them as documents, run LDA
    topic modelling (``Models(50, 10)``), predict Wikipedia-based
    labels, and map each predicted label to a top-level EventRegistry
    category, printing the result per user.

    NOTE(review): ``final_words``/``final_words1``/``documents`` are not
    reset per user, so each user's document list also contains previous
    users' documents — preserved as-is; confirm this is intended.
    """
    final_words = []
    final_words1 = []
    documents = []
    docs = {}

    # Collect every 8-digit user id embedded in the dictionary keys.
    user_ids = []
    for key in dictionary.keys():
        user_ids.extend(re.findall(r'\d{8}', str(key)))

    for key in user_ids:
        # if key == '19234329':
        print(
            "###################### Generating topic labels for {} ############################"
            .format(key))

        # Normalisation pipeline over the user's texts.
        df = pd.DataFrame(dictionary[key])
        df.columns = ['Text']
        df_ = df['Text'].apply(lambda x: ''.join(x))
        df_ = df_.str.lower()
        df_ = df_.apply(self.tokenize)
        df_ = df_.apply(self.replace)
        df_ = df_.apply(self.split)
        df_ = df_.apply(self.terms_only)
        df_ = df_.apply(lambda x: ' '.join(x))
        df_ = df_.apply(lambda x: re.sub(r' +', ' ', x))

        # Plain loops instead of side-effect list comprehensions.
        for text in df_:
            final_words.append("".join(text).strip().split())
        for words in final_words:
            if len(words) >= 5:  # drop texts with fewer than 5 tokens
                final_words1.append(words)
        for words in final_words1:
            documents.append(re.sub(r' +', " ", ' '.join(words)))

        # NOTE(review): every key aliases the same ``documents`` list —
        # preserved verbatim; verify intent.
        if key in docs:
            docs[key].append(documents)
        else:
            docs[key] = documents

        # Topic modelling + label prediction.
        mm = Models(50, 10, **docs)
        terms_to_wiki = mm.calling_methods('LDA')
        ll = Labels(terms_to_wiki)
        wiki_titles = ll.get_titles_wiki()
        equal_length = ll.remove_all_null_dicts_returned_from_wiki(
            **wiki_titles)
        frq = ll.calculating_word_frequency(**equal_length)
        results = ll.predicting_label(**frq)

        # SECURITY(review): API key is hard-coded in source — it has been
        # committed and should be rotated and read from an environment
        # variable or config instead.
        # Build the client once, not once per result (was re-created in
        # the loop body).
        er = ER.EventRegistry(
            apiKey='32db7607-6c90-40bd-b653-e167da1462c9')
        analytics = ER.Analytics(er)
        labels = []
        for result in results:
            cat = analytics.categorize(result[1])
            for cat_key, cat_value in cat.items():
                if cat_key == 'categories':
                    for field, value in cat_value[0].items():
                        if field == 'label':
                            # e.g. 'dmoz/Top/Sports/...' -> 'Sports'
                            labels.append(value.split('/')[2])
        print('\n')
        print(key, labels)

    # NOTE(review): original indentation was lost; this end-of-run banner
    # is placed after the per-user loop — confirm placement.
    print('########### FINAL FILE EXECUTED ##################')