def main_new_dataset():
    """Build a symbolic-word corpus from the abs-speed/diff-yaw dataset and fit an HDP model.

    Reads the global sensor dataset, converts five selected signal columns into
    "words" via WordData, serializes the gensim dictionary and MatrixMarket
    corpus to disk, then prints the discovered topics and the per-document
    topic mixtures.

    Side effects: writes files under data_topic_modeling/new_dataset/ and
    prints to stdout. Returns None.
    """
    newData = pd.read_csv('../xsense_data/global_dataset_abs_speed_diff_yaw.txt', sep=';')
    # .loc replaces DataFrame.ix, which was deprecated in pandas 0.20 and
    # removed in pandas 1.0; label-based column selection is equivalent.
    newDataToWord = newData.loc[:, ['Acc_X', 'Acc_Y', 'Speed_X', 'Speed_Y', 'Diff_Yaw']]
    worder = WordData(newDataToWord)
    words = worder.create_words(worder.dataset)
    colWords = pd.Series(words, name='Word')
    wordDataset = pd.concat([newData, colWords], axis=1)
    # wordDataset.to_csv('../xsense_data/word_global_dataset.txt', sep=';')
    docs = worder.create_text_corpus(wordDataset)
    # Tokenize each document; the original [i for i in ...] wrapper was a
    # redundant copy of the split() list.
    texts = [doc.lower().split() for doc in docs]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('data_topic_modeling/new_dataset/doc_dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('data_topic_modeling/new_dataset/documents.mm', corpus)
    # T = max corpus-level topics, K = max per-document topics (HDP truncation).
    hdp = models.HdpModel(corpus, dictionary, T=50, K=10)
    print(hdp.show_topics(topics=20, topn=5))
    for doc_topics in hdp[corpus]:
        print(doc_topics)
def main_new_dataset():
    """Build a symbolic-word corpus from the abs-speed/diff-yaw dataset and fit an HDP model.

    Reads the global sensor dataset, converts five selected signal columns into
    "words" via WordData, serializes the gensim dictionary and MatrixMarket
    corpus to disk, then prints the discovered topics and the per-document
    topic mixtures.

    Side effects: writes files under data_topic_modeling/new_dataset/ and
    prints to stdout. Returns None.
    """
    newData = pd.read_csv('../xsense_data/global_dataset_abs_speed_diff_yaw.txt', sep=';')
    # .loc replaces DataFrame.ix, which was deprecated in pandas 0.20 and
    # removed in pandas 1.0; label-based column selection is equivalent.
    newDataToWord = newData.loc[:, ['Acc_X', 'Acc_Y', 'Speed_X', 'Speed_Y', 'Diff_Yaw']]
    worder = WordData(newDataToWord)
    words = worder.create_words(worder.dataset)
    colWords = pd.Series(words, name='Word')
    wordDataset = pd.concat([newData, colWords], axis=1)
    # wordDataset.to_csv('../xsense_data/word_global_dataset.txt', sep=';')
    docs = worder.create_text_corpus(wordDataset)
    # Tokenize each document; the [i for i in ...] wrapper was a redundant copy.
    texts = [doc.lower().split() for doc in docs]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('data_topic_modeling/new_dataset/doc_dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('data_topic_modeling/new_dataset/documents.mm', corpus)
    # T = max corpus-level topics, K = max per-document topics (HDP truncation).
    hdp = models.HdpModel(corpus, dictionary, T=50, K=10)
    print(hdp.show_topics(topics=20, topn=5))
    for doc_topics in hdp[corpus]:
        print(doc_topics)
def main():
    """Build a symbolic-word corpus from the global sensor dataset, fit an HDP
    topic model, and derive an equivalent fixed-size LDA model from it.

    Side effects: writes the word-augmented dataset CSV, the gensim dictionary
    and the MatrixMarket corpus to disk, and prints topics, per-document topic
    mixtures, the derived alpha vector and one LDA topic. Returns None.
    """
    newData = pd.read_csv('../xsense_data/global_dataset.txt', sep=';')
    # --- 15-signal variant (all signals except altitude), kept for reference ---
    # dataPartOne = newData.loc[:, 'Acc_X':'Pitch']
    # dataPartTwo = newData.loc[:, 'Speed_X':'Speed_Z']
    # newDataToWord = pd.concat([dataPartOne, dataPartTwo], axis=1)
    # --- reduced 5-signal variant actually in use ---
    # .loc replaces DataFrame.ix, which was deprecated in pandas 0.20 and
    # removed in pandas 1.0; label-based column selection is equivalent.
    newDataToWord = newData.loc[:, ['Acc_X', 'Acc_Y', 'Acc_Z', 'Speed_X', 'Roll']]
    worder = WordData(newDataToWord)
    words = worder.create_words(worder.dataset)
    colWords = pd.Series(words, name='Word')
    wordDataset = pd.concat([newData, colWords], axis=1)
    wordDataset.to_csv('../xsense_data/word_global_dataset.txt', sep=';')
    docs = worder.create_text_corpus(wordDataset)
    # Tokenize each document; the [i for i in ...] wrapper was a redundant copy.
    texts = [doc.lower().split() for doc in docs]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('data_topic_modeling/doc_dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('data_topic_modeling/documents.mm', corpus)
    # T = max corpus-level topics, K = max per-document topics (HDP truncation).
    hdp = models.HdpModel(corpus, dictionary, T=50, K=10)
    print(hdp.show_topics(topics=20, topn=5))
    for doc_topics in hdp[corpus]:
        print(doc_topics)
    # Convert the nonparametric HDP posterior into a fixed-topic LDA model.
    alpha, beta = hdp.hdp_to_lda()
    print(alpha)
    lda_model = models.LdaModel(id2word=hdp.id2word, num_topics=len(alpha),
                                alpha=alpha, eta=hdp.m_eta)
    lda_model.expElogbeta = np.array(beta, dtype=np.float32)
    print(lda_model.show_topic(1))
def main():
    """Build a symbolic-word corpus from the global sensor dataset, fit an HDP
    topic model, and derive an equivalent fixed-size LDA model from it.

    Side effects: writes the word-augmented dataset CSV, the gensim dictionary
    and the MatrixMarket corpus to disk, and prints topics, per-document topic
    mixtures, the derived alpha vector and one LDA topic. Returns None.
    """
    newData = pd.read_csv('../xsense_data/global_dataset.txt', sep=';')
    # --- 15-signal variant (all signals except altitude), kept for reference ---
    # dataPartOne = newData.loc[:, 'Acc_X':'Pitch']
    # dataPartTwo = newData.loc[:, 'Speed_X':'Speed_Z']
    # newDataToWord = pd.concat([dataPartOne, dataPartTwo], axis=1)
    # --- reduced 5-signal variant actually in use ---
    # .loc replaces DataFrame.ix, which was deprecated in pandas 0.20 and
    # removed in pandas 1.0; label-based column selection is equivalent.
    newDataToWord = newData.loc[:, ['Acc_X', 'Acc_Y', 'Acc_Z', 'Speed_X', 'Roll']]
    worder = WordData(newDataToWord)
    words = worder.create_words(worder.dataset)
    colWords = pd.Series(words, name='Word')
    wordDataset = pd.concat([newData, colWords], axis=1)
    wordDataset.to_csv('../xsense_data/word_global_dataset.txt', sep=';')
    docs = worder.create_text_corpus(wordDataset)
    # Tokenize each document; the [i for i in ...] wrapper was a redundant copy.
    texts = [doc.lower().split() for doc in docs]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('data_topic_modeling/doc_dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('data_topic_modeling/documents.mm', corpus)
    # T = max corpus-level topics, K = max per-document topics (HDP truncation).
    hdp = models.HdpModel(corpus, dictionary, T=50, K=10)
    print(hdp.show_topics(topics=20, topn=5))
    for doc_topics in hdp[corpus]:
        print(doc_topics)
    # Convert the nonparametric HDP posterior into a fixed-topic LDA model.
    alpha, beta = hdp.hdp_to_lda()
    print(alpha)
    lda_model = models.LdaModel(id2word=hdp.id2word, num_topics=len(alpha),
                                alpha=alpha, eta=hdp.m_eta)
    lda_model.expElogbeta = np.array(beta, dtype=np.float32)
    print(lda_model.show_topic(1))
def setUp(self):
    """Collect crawled actor/score data and review word summaries used by the tests."""
    # NOTE(review): the class object itself is passed where an instance is
    # expected (e.g. Crowling.get_actors(Crowling, ...)) — presumably those
    # methods never touch instance state; confirm before refactoring.
    point_url = 'https://movie.naver.com/movie/bi/mi/point.nhn?code=145162#tab'
    self.actors = Crowling.get_actors(Crowling, point_url)
    self.scores = Crowling.get_score(Crowling, point_url)
    self.data1 = WordData.getWord(WordData, 'master.info')
    self.biggestWords, self.reivew = WordData.getReview(WordData, 'master.p')
    self.maxWords, self.reivews = buttons.setReviewData('master')
    # Keep only the word part of the first "word : count" line.
    first_word_line = self.maxWords.split('\n')[0]
    self.maxWords = first_word_line.split(':')[0].strip()
    # Keep only the first review sentence line.
    self.reivews = self.reivews.split('\n')[0]
def setData(self):
    """Populate the text widgets with info and review data for the selected movie."""
    title = self.choiceMovie.currentText()
    self.tx1.setText(title)
    # Movie metadata comes from the '<title>.info' file.
    info = WordData.getWord(WordData, title + '.info')
    self.tx2.setText(info[0])
    self.tx3.setText(info[1])
    # Word-frequency listing and review sentences for the same movie.
    word_counts, review_text = setReviewData(title)
    self.tx4.setText(word_counts)
    self.tx5.setText(review_text)
def setReviewData(text):
    """Return the word-frequency listing and review sentences for a movie.

    Loads '<text>.p' via WordData.getReview, which is expected to yield a
    (word->count dict, sentence list) pair — TODO confirm against WordData.

    Parameters:
        text: movie identifier used as the pickle-file basename.

    Returns:
        (words, sentences): two newline-separated strings — one
        'word : count' line per word (most frequent first, bookkeeping keys
        'actors'/'score' excluded), and one '<sentence>.' line per review.
    """
    reviewData = WordData.getReview(WordData, text + '.p')
    # Sort by descending frequency.
    reviewWords = sorted(reviewData[0].items(), key=lambda kv: kv[1], reverse=True)
    # str.join instead of repeated += — the original loop was quadratic.
    words = ''.join(
        word + ' : ' + str(nums) + '\n'
        for word, nums in reviewWords
        if word not in ('actors', 'score')
    )
    sentences = ''.join(str(s) + '.' + '\n' for s in reviewData[1])
    return words, sentences
def get_synset_tokens(self, tagged):
    """Map POS-tagged tokens to their first WordNet synset.

    For each (word, penn_tag) pair whose tag maps to a WordNet POS, the word
    is lemmatized and looked up; hits are appended to
    self.ConversionData["synsets"] as WordData objects, misses (no synset —
    most likely a spelling error) to self.ConversionData["spellingErrors"].

    Parameters:
        tagged: iterable of (word, penn_tag) pairs.

    Returns:
        self.ConversionData, mutated in place as described above.
    """
    lemmatzr = WordNetLemmatizer()
    for token in tagged:
        wordnet_tag = self.penn_to_wordnet(token[1])
        if not wordnet_tag:
            # Tag has no WordNet equivalent; skip the token entirely.
            continue
        lemma = lemmatzr.lemmatize(token[0], pos=wordnet_tag)
        # Narrowed from a bare `except:` — only the empty-synset-list lookup
        # should be treated as a spelling error, not arbitrary failures
        # (a bare except also swallowed KeyboardInterrupt/SystemExit).
        try:
            synset = wordnet.synsets(lemma, pos=wordnet_tag)[0]
        except IndexError:
            self.ConversionData["spellingErrors"].append(token[0])
        else:
            self.ConversionData["synsets"].append(WordData(token[0], synset))
    return self.ConversionData