def main():
    """Run the end-to-end ET text-mining pipeline.

    Steps (all driven by the project-local ``Miner`` module):
      1. Configure logging and read input/output paths from ``ET_MinerConfig.yml``.
      2. Load the ad/landing-page CSV into a DataFrame and normalize its columns.
      3. Topic modeling on one landing-page text row (LDA, LSA, NMF via tf-idf).
      4. Pairwise phrase/document cosine-style similarity, logged per pair.
      5. Entity extraction over the first 50 rows.

    No parameters and no return value; results go to stdout and to the
    ``logging_Miner.txt`` log file (overwritten each run).
    """
    with warnings.catch_warnings():
        # Silence everything (incl. DeprecationWarning) for the duration of the run;
        # the second filter is redundant with simplefilter but kept for clarity.
        warnings.simplefilter("ignore")
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        preTime = time.time()

        LogFile = 'logging_Miner.txt'
        logging.basicConfig(filename=LogFile, level=logging.DEBUG, filemode='w',
                            format=('%(filename)s: '
                                    '%(levelname)s: '
                                    '%(funcName)s(): '
                                    '%(lineno)d:\t'
                                    '%(message)s'))
        logging.info('Start Miner')

        readPath = ''
        writePath = ''
        # Remember: two spaces for yaml file indentation and no tabs.
        with open("ET_MinerConfig.yml", 'r') as ymlFile:
            # safe_load: the config is plain scalars/mappings; never construct
            # arbitrary Python objects from a YAML file (yaml.load is unsafe).
            config = yaml.safe_load(ymlFile)

        readPath = config['folder']['readpath']
        writePath = config['folder']['writepath']
        logging.info('reading from path: ' + readPath)

        data = pd.read_csv(readPath)
        colnames = list(data.columns.values)
        # First column is renamed to hotel_id; the rest keep their CSV names.
        data.columns = Miner.MyFlattten([['hotel_id'], colnames[1:]])
        data = Miner.ReIndex(data)

        # ---------------- Topic Modeling ----------------
        # NOTE(review): .ix is deprecated/removed in modern pandas; selection of
        # row 133 from the first 5000 rows — confirm label vs. position intent
        # before migrating to .loc/.iloc.
        topicData = (data[:5000].ix[133, :], 'lp_text')
        print('topicData')
        print(topicData[0]['lp_text'])

        lda = Miner.getTopicsLDAandLSA(topicData[0], topicData[1], 'lda', True)
        logging.info('lda topics')
        logging.info(lda[0].print_topics(10))
        logging.info('LDA perplexity: ' + str(lda[1]))

        lsa = Miner.getTopicsLDAandLSA(topicData[0], topicData[1], 'lsa', True)
        logging.info('lsa topics')
        logging.info(lsa.print_topics(10))

        textBlockFlag = True
        dataText = Miner.getDocuments(topicData[0], topicData[1], True, textBlockFlag)
        print('dataText')
        print(dataText)

        tfidf_searchTerms, modelTfIdf = Miner.tf_idf(dataText)
        print('tfidf_searchTerms.T.toarray()')
        # Word-by-doc after transpose; before transpose it is doc-by-word
        # (row-by-column format).
        print(tfidf_searchTerms.T.toarray())
        logging.info('tfidf_searchTerms transposed')
        logging.info(tfidf_searchTerms.T.toarray())

        tfidf_review, reviewModelTfIdf = Miner.tf_idf(
            Miner.getDocuments(data[:5000].ix[133, :], 'lp_text', True, textBlockFlag))

        topicsNMF = Miner.getTopicNMF(tfidf_searchTerms, modelTfIdf)
        logging.info('NMF topics')

        # ---------------- Similarity testing ----------------
        colNames = ['ad_text', 'lp_text']
        topicData = (data[:5000], 'lp_text')

        # Interleaved phrase/document corpus; building it takes a long time.
        combinePhraseDoc = Miner.CombineDocumentPhrase(data, colNames, True)
        tfidf_review, combineModelTfIdf = Miner.tf_idf(combinePhraseDoc)

        dimReview = tfidf_review.toarray().shape
        tfidf_review_matrix = tfidf_review.toarray()

        # Rows alternate phrase/document, so step by twos and compare each
        # adjacent (phrase, document) pair; only the first tenth is sampled.
        for i in range(0, int(dimReview[0] / 10), 2):
            try:
                logging.info('similiarity: ' + str(
                    Miner.similiarity(tfidf_review_matrix[i, :],
                                      tfidf_review_matrix[i + 1, :],
                                      None, None)))
            except Exception as e:
                # Best-effort: a bad pair shouldn't abort the whole sweep.
                logging.warning('Error: ' + str(e))

        logging.info(' ')
        logging.info('Pairwise similiarity')
        logging.info(' ')

        dimCombineData = data.shape
        phrasesText = Miner.getDocuments(data, colNames[0], True)
        documentText = Miner.getDocuments(data, colNames[1], True)

        # Per-pair tf-idf: each (phrase, document) pair gets its own model so
        # the two vectors share a vocabulary.
        for j in range(int(dimCombineData[0] / 10)):
            try:
                tfidf_pair, pairTfIdf = Miner.tf_idf([phrasesText[j], documentText[j]])
                tfidf_pair_matrix = tfidf_pair.toarray()
                logging.info('similiarity: ' + str(
                    Miner.similiarity(tfidf_pair_matrix[0, :],
                                      tfidf_pair_matrix[1, :],
                                      None, None)))
            except Exception as e:
                logging.warning('Error: ' + str(e))

        # ---------------- Entity Extraction ----------------
        print('Starting Entity Extraction')
        tagToken = Miner.ExtractTags(data[:50], 'lp_text')
        Entities = Miner.ExtractEntity(tagToken)

        logging.info('compute time: {0}'.format(time.time() - preTime))