# Example #1
# 0
def main():
    """Run the ET Miner pipeline end-to-end.

    Steps: read paths from ET_MinerConfig.yml, load the ad/landing-page
    dataset, run topic modeling (LDA, LSA, NMF), tf-idf similarity scoring
    over phrase/document pairs, and log results and total compute time.

    Relies on module-level imports (warnings, time, logging, yaml, pd,
    Miner; `xrange` under Python 2) being available.
    """
    # Silence all warnings for the entire run.  The original wrapped these
    # calls in `with warnings.catch_warnings():` with an empty body, which
    # restored the previous filters as soon as the block exited, so the
    # filters never took effect.  Applying them unscoped matches the intent.
    warnings.simplefilter("ignore")
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    preTime = time.time()  # wall-clock start; reported at the end

    LogFile = 'logging_Miner.txt'
    logging.basicConfig(filename=LogFile, level=logging.DEBUG,
                        filemode='w',
                        format='%(filename)s: %(levelname)s: %(funcName)s(): %(lineno)d:\t%(message)s')

    logging.info('Start Miner')

    readPath = ''
    writePath = ''
    # remember two spaces for yaml file and no tabs
    with open("ET_MinerConfig.yml", 'r') as ymlFile:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # can execute arbitrary tags; the config is plain key/value data.
        config = yaml.safe_load(ymlFile)
        readPath = config['folder']['readpath']
        writePath = config['folder']['writepath']
        logging.info('reading from path: ' + readPath)

    data = pd.read_csv(readPath)
    colnames = list(data.columns.values)

    # First column is the hotel id; keep the remaining original column names.
    data.columns = Miner.MyFlattten([['hotel_id'], colnames[1:]])
    data = Miner.ReIndex(data)

    ############################### Topic Modeling ###############################
    # NOTE(review): `.ix` is removed in modern pandas and is label/positional
    # ambiguous; kept as-is because the row semantics after Miner.ReIndex are
    # not visible here -- confirm before porting to .loc/.iloc.
    topicData = (data[:5000].ix[133, :], 'lp_text')
    print('topicData')
    print(topicData[0]['lp_text'])

    lda = Miner.getTopicsLDAandLSA(topicData[0], topicData[1], 'lda', True)
    logging.info('lda topics')
    logging.info(lda[0].print_topics(10))
    logging.info('LDA perplexity: ' + str(lda[1]))

    lsa = Miner.getTopicsLDAandLSA(topicData[0], topicData[1], 'lsa', True)
    logging.info('lsa topics')
    logging.info(lsa.print_topics(10))

    textBlockFlag = True

    dataText = Miner.getDocuments(topicData[0], topicData[1], True, textBlockFlag)
    print('dataText')
    print(dataText)
    tfidf_searchTerms, modelTfIdf = Miner.tf_idf(dataText)

    print('tfidf_searchTerms.T.toarray()')
    # word-by-doc after transpose; doc-by-word (row by col) before
    print(tfidf_searchTerms.T.toarray())

    logging.info('tfidf_searchTerms transposed')
    logging.info(tfidf_searchTerms.T.toarray())

    tfidf_review, reviewModelTfIdf = Miner.tf_idf(
        Miner.getDocuments(data[:5000].ix[133, :], 'lp_text', True, textBlockFlag))

    topicsNMF = Miner.getTopicNMF(tfidf_searchTerms, modelTfIdf)
    logging.info('NMF topics')

    ############################# Similiarity testing ############################
    colNames = ['ad_text', 'lp_text']

    topicData = (data[:5000], 'lp_text')

    combinePhraseDoc = Miner.CombineDocumentPhrase(data, colNames, True)  # takes a long time to compute

    tfidf_review, combineModelTfIdf = Miner.tf_idf(combinePhraseDoc)

    dimReview = tfidf_review.toarray().shape

    tfidf_review_matrix = tfidf_review.toarray()

    # combinePhraseDoc interleaves phrase/document rows, so step by twos and
    # score each adjacent (phrase, document) pair on the first tenth of rows.
    for i in xrange(0, int(dimReview[0] / 10), 2):
        try:
            logging.info('similiarity: ' + str(Miner.similiarity(
                tfidf_review_matrix[i, :], tfidf_review_matrix[i + 1, :], None, None)))
        except Exception as e:  # py2.6+/py3-compatible form of `except Exception, e`
            # Best-effort loop: log the failure and continue with the next pair.
            logging.warning('Error: ' + str(e))
# Example #2
# 0
      # NOTE(review): tail of a function whose `def`, loop header, and the
      # `try` matching this `except` are outside this view; errors here are
      # logged and the enclosing loop continues (best-effort processing).
      except Exception, e:  # Python 2 except syntax
          logging.warn('Error: ' + str(e))
   
  logging.info(' ')            
  logging.info('Pairwise similiarity')
  logging.info(' ')
  # `data` presumably a pandas DataFrame built earlier in this function -- TODO confirm
  dimCombineData = data.shape
   
  # Pull the phrase and document text columns; colNames is presumably
  # ['ad_text', 'lp_text'] as in the sibling pipeline -- verify against caller.
  phrasesText = Miner.getDocuments(data,colNames[0],True)
  documentText = Miner.getDocuments(data,colNames[1], True)
   
  # For a sample (first tenth of the rows): build a two-row tf-idf matrix per
  # (phrase, document) pair and log their pairwise similarity score.
  for j in xrange(int(dimCombineData[0]/10)):
      try:                
          #logging.info('phrase: ' + str(phrasesText[j]))
          #logging.info('document: ' + str(documentText[j]))
          tfidf_pair, pairTfIdf = Miner.tf_idf([phrasesText[j],documentText[j]]) 
          tfidf_pair_matrix = tfidf_pair.toarray()
          logging.info('similiarity: ' + str(Miner.similiarity(tfidf_pair_matrix[0,:], tfidf_pair_matrix[1,:], None, None)))
      except Exception, e:
          # Best-effort: log and move on to the next pair.
          logging.warn('Error: ' + str(e))    
   
  #logging.info(topicsNMF.)
  
  ###########################Entity Extraction ###################################################################################
  print('Starting Entity Extraction')
 
  # Tag tokens in the first 50 landing pages, then extract named entities
  # from those tags (both delegated to the project Miner module).
  tagToken = Miner.ExtractTags(data[:50],'lp_text')
  Entities = Miner.ExtractEntity(tagToken)
  
  # `preTime` presumably set at function start -- total wall-clock runtime.
  logging.info('compute time: {0}'.format(time.time() - preTime))