except Exception, e: logging.warn('Error: ' + str(e)) #logging.info(topicsNMF.) ###########################Entity Extraction ################################################################################### print('Starting Entity Extraction') tagToken = Miner.ExtractTags(data[:50],'lp_text') Entities = Miner.ExtractEntity(tagToken) logging.info('compute time: {0}'.format(time.time() - preTime)) Entities[1].draw() RegexEntities = Miner.grammarEntity(tagToken) #takes long RegexEntities[1].draw() logging.info('Entities') for entity in Entities: logging.info(entity) logging.info(' ') #'C:\\Users\\fhokhold\\Documents\\Projects\\Vulcan\\vulcan-data\\entity_test.csv' logging.info('write to path: ' + writePath) pd.DataFrame(Entities).to_csv(writePath, index=False) # w2vec = Miner.getTopicWord2VecNeuralNet(data, 'lp_text') # logging.info('word2vec features') # logging.info(w2vec.accuracy())
def main():
    """Run the hotel text-mining pipeline.

    Steps: read paths from MinerConfig.yml, load hotel info and review
    CSVs, compute a frequency distribution and conditional word
    predictions over review text, merge hotel tags with review bodies,
    extract entities from the merged data, and log word2vec accuracy.

    Side effects: writes logging_Miner.txt, combineData_text.csv, and
    the entity CSV at the configured write path; draws entity trees.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        preTime = time.time()
        LogFile = 'logging_Miner.txt'
        logging.basicConfig(filename=LogFile, level=logging.DEBUG, filemode='w',
                            format=('%(filename)s: '
                                    '%(levelname)s: '
                                    '%(funcName)s(): '
                                    '%(lineno)d:\t'
                                    '%(message)s'))
        logging.info('Start Miner')

        # Remember: two spaces for YAML indentation, no tabs.
        readPath = ''
        writePath = ''
        with open("MinerConfig.yml", 'r') as ymlFile:
            # safe_load: the config is plain data; avoid yaml.load's
            # arbitrary-object construction (deprecated/unsafe).
            config = yaml.safe_load(ymlFile)
        readPath = config['folder']['readpath']
        writePath = config['folder']['writepath']
        logging.info('reading from path: ' + readPath)

        data = pd.read_csv(readPath)
        colnames = list(data.columns.values)
        # Rename the first column to hotel_id; keep the remaining names.
        data.columns = Miner.MyFlattten([['hotel_id'], colnames[1:]])
        reviewData = pd.read_csv('C:\\Users\\fhokhold\\Documents\\Projects\\Vulcan\\vulcan-data\\hotel_review.csv')

        ############################ Freq Dist ################################
        reviewText = getDocuments(reviewData, 'body', True)
        # Concatenate the first 1/20th of the review documents. join() is
        # linear, unlike the original quadratic `totalText += ...` loop.
        totalText = ''.join(str(reviewText[k])
                            for k in xrange(int(len(reviewText) / 20)))
        reviewText0 = nltk.word_tokenize(totalText)
        # NOTE: Miner.tokenStemmer was dropped here — stemmer not working well.
        # Temporarily treat the domain words as stopwords for this pass.
        Miner.stopWords.update(['hotel', 'hotels', 'near'])
        reviewText = [i.lower() for i in reviewText0
                      if i.lower() not in Miner.stopWords]
        # BUG FIX: set.remove() takes a single (hashable) element, so
        # passing a list raised at runtime; difference_update removes
        # each of the temporary stopwords.
        Miner.stopWords.difference_update(['hotel', 'hotels', 'near'])

        freqDistText = Miner.getFreqDist(reviewText, True)
        logging.info('Top words: ')
        logging.info(freqDistText[1][3:10])
        logging.info('cfDist predictions: ')

        ########################## word predictions ###########################
        print('top words')
        print([i for i, j in freqDistText[1][3:10]])
        # Slices start at 3 to skip the most frequent (least informative) words.
        topWords = [i for i, j in freqDistText[1][3:100]]
        wordsPred = [i for i, j in freqDistText[1][3:10]]
        print('topWords')
        print(topWords)
        print('wordsPred')
        print(wordsPred)
        wordsPredictions = Miner.getConditionalDist(reviewText, topWords, wordsPred)
        logging.info(wordsPredictions)

        # .loc replaces the deprecated .ix indexer (label-based selection,
        # identical result for column-label lists).
        combineData = pd.merge(data.loc[:, ['hotel_id', 'tags']],
                               reviewData.loc[:, ['hotel_id', 'title', 'body']],
                               on=['hotel_id'], how='inner')
        combineData.to_csv('C:\\Users\\fhokhold\\Documents\\Projects\\Vulcan\\vulcan-data\\combineData_text.csv')

        ######################### Similiarity testing #########################
        # Commented-out topic-modeling (LDA/LSA/NMF) and pairwise-similarity
        # experiments were removed from here; recover them from VCS if needed.
        logging.info('Pairwise similiarity')
        logging.info(' ')

        ######################### Entity Extraction ###########################
        tagToken = Miner.ExtractTags(combineData[:10], 'body')
        Entities = Miner.ExtractEntity(tagToken)
        logging.info('compute time: {0}'.format(time.time() - preTime))
        Entities[0].draw()
        RegexEntities = Miner.grammarEntity(tagToken)  # takes long
        RegexEntities[0].draw()
        logging.info('Entities')
        for entity in Entities:
            logging.info(entity)
        logging.info(' ')
        logging.info('write to path: ' + writePath)
        pd.DataFrame(Entities).to_csv(writePath, index=False)

        w2vec = Miner.getTopicWord2VecNeuralNet(data, 'tags')
        logging.info('word2vec features')
        logging.info(w2vec.accuracy())