def _BuildWordRelationQueries(title, outPut):
    """Build the queries that refresh ``similar_word_relation``.

    Every word of the global ``embedding_model`` vocabulary for which
    ``NLP.DoNLP(word, ['VA', 'VV'])`` returns something is compared with
    every other kept word.  A pair that already has a row gets an UPDATE of
    its ``Similar_Value``; any other pair gets an INSERT.

    Args:
        title: Title forwarded to ``main.ShowTitle``.
        outPut: Prefix for every progress message.

    Returns:
        A ``(insertQuery, updateQuery)`` pair of SQL string lists.
    """
    # Word -> {related word: Similar_Relation_ID} for the rows already stored.
    relationDict = {}
    relationList = DataBaseManager.DoSQL(
        " SELECT Normal_Word, Target_Word, Similar_Relation_ID"
        " FROM similar_word_relation ")
    for relation in relationList:
        # setdefault() makes this independent of row order; looking the word
        # up with .get() failed with AttributeError whenever a pair row was
        # read before (or without) the word's own self-relation row.
        knownRelation = relationDict.setdefault(relation[0], {})
        if relation[0] != relation[1]:
            knownRelation[relation[1]] = relation[2]

    main.ShowTitle(title, outPut + 'Getting latest calculated similar data')
    vocabulary = list(embedding_model.wv.index2word)
    wordList = []
    removedCount = 0
    updateTime = 0
    # A plain for loop also copes with an empty vocabulary.
    for index, word in enumerate(vocabulary):
        # Refresh the progress line at most once per second.
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Removing not verb and adjective (' + str(index) +
                '/' + str(len(vocabulary)) + ' removed: ' +
                str(removedCount) + ')')
        if len(NLP.DoNLP(word, ['VA', 'VV'])) > 0:
            wordList.append(word)
        else:
            removedCount += 1

    # Used both as the iteration order and as a fast membership test.
    wordDict = dict.fromkeys(wordList)

    insertQuery = []
    updateQuery = []
    # Ask for every other word of the vocabulary.
    topn = len(embedding_model.wv.index2word) - 1
    updateTime = 0
    for index, word in enumerate(wordDict):
        result = embedding_model.most_similar(positive=[word], topn=topn)
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Append query (' + str(index) + '/' +
                str(len(wordList)) + ')')
        existData = relationDict.get(word) or {}
        for similar in result:
            if similar[0] not in wordDict:
                # The neighbour was filtered out above.
                continue
            # NOTE(review): words are concatenated into the SQL text
            # unescaped; a word containing a quote breaks the statement.
            if similar[0] in existData:
                updateQuery.append(
                    " UPDATE similar_word_relation SET Similar_Value = " +
                    str(similar[1]) + " WHERE Similar_Relation_ID = " +
                    str(existData[similar[0]]))
            else:
                insertQuery.append(
                    " INSERT INTO similar_word_relation"
                    " (Normal_Word, Target_Word, Similar_Value)"
                    " VALUES ('" + word + "', '" + similar[0] + "', " +
                    str(similar[1]) + ")")
    return insertQuery, updateQuery


def _BuildProductFeatureQueries(target, title, outPut):
    """Build the UPDATE queries for one product's relation table.

    The product's ``Relation_Table_Name`` and the ``Feature_Name`` values of
    its category are each used as a column name of the relation table: every
    word returned by ``most_similar`` for that name gets its similarity
    written into the column, plus its sentiment value when
    ``BindSentiWords`` reports one.

    Args:
        target: ``Product_ID`` looked up in ``product_dic``.
        title: Title forwarded to ``main.ShowTitle``.
        outPut: Prefix for every progress message.

    Returns:
        The list of UPDATE statements.

    Raises:
        IndexError: If no ``product_dic`` row matches ``target``.
    """
    sqlResult = DataBaseManager.DoSQL(
        " SELECT Category_ID, Relation_Table_Name FROM product_dic"
        " WHERE Product_ID = " + str(target) + " ")
    productInfo = sqlResult[0]
    tableName = productInfo[1]

    featureList = [tableName]
    sqlResult = DataBaseManager.DoSQL(
        " SELECT Feature_Name FROM feature_dic WHERE Category_ID = " +
        str(productInfo[0]) + " ")
    featureList.extend(result[0] for result in sqlResult)

    updateQuery = []
    SentiWordBinder = BindSentiWords.BindSentiWords()
    topn = len(embedding_model.wv.index2word) - 1
    updateTime = 0
    for index, feature in enumerate(featureList):
        result = embedding_model.most_similar(positive=[feature], topn=topn)
        # Refresh the progress line at most once per second.
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Append query (' + str(index) + '/' +
                str(len(featureList)) + ')')
        for similar in result:
            if feature == similar[0] or tableName == similar[0]:
                continue
            updateQuery.append(
                " UPDATE `" + tableName + "` SET `" + feature + "` = " +
                str(similar[1]) + " WHERE Word = '" + similar[0] + "' ")
            sentiValueDict = SentiWordBinder.BindSentiWords([similar[0]])
            # The binder reports a missing sentiment as the string 'None'.
            if sentiValueDict[similar[0]] != 'None':
                updateQuery.append(
                    " UPDATE `" + tableName + "` SET Sentiment_Value = " +
                    sentiValueDict[similar[0]] + " WHERE Word = '" +
                    similar[0] + "' ")
        # Blank the column again for words seen five times or fewer.
        updateQuery.append(
            " UPDATE `" + tableName + "` SET `" + feature +
            "` = null WHERE Word_Count <= " + str(5) + " ")
    return updateQuery


def UpdateSimilarityDatabase(target=None, title=None, outPut=None):
    """Store similarities taken from the global ``embedding_model``.

    Without ``target`` the word-to-word table ``similar_word_relation`` is
    refreshed and the queries run against ``db_capstone``.  With ``target``
    the relation table of that product is refreshed and the queries run
    against ``db_capstone_similarity``.

    Args:
        target: ``Product_ID`` to process, or ``None`` for the whole
            vocabulary.
        title: Optional heading; this step's own heading is appended to it.
        outPut: Optional name shown as ``'Process data of <outPut>'`` in
            front of every progress message.

    Raises:
        IndexError: If ``target`` is given but matches no product.
    """
    global embedding_model

    if title is None:
        title = 'Append Similar word relation'
    else:
        title += '\nAppend Similar word relation'
    if outPut is None:
        outPut = ''
    else:
        outPut = 'Process data of ' + outPut + '\n'

    main.ShowTitle(title, outPut + 'Getting exist data')
    if target is None:
        db = 'db_capstone'
        insertQuery, updateQuery = _BuildWordRelationQueries(title, outPut)
    else:
        db = 'db_capstone_similarity'
        insertQuery = []
        updateQuery = _BuildProductFeatureQueries(target, title, outPut)

    DataBaseManager.DoManyQuery(insertQuery, db=db, title=title,
                                outPut=outPut, queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery, db=db, title=title,
                                outPut=outPut, queryType='UPDATE')
def UpdateSimilarWordDictionary(title=None, outPut=None):
    """Rebuild ``similar_word_dic`` from strongly similar word pairs.

    Steps:
      1. Read the ``similar_word_relation`` rows with ``Similar_Value``
         above 0.95 and group them per ``Normal_Word``.
      2. Drop words that have at most one relation.
      3. For each pair whose sentiment values (from ``BindSentiWords``) are
         equal, map the word with the smaller own ``Word_Count`` (sub word)
         to the one with the larger count (super word).
      4. Collapse chains so a sub word points at its super word's super word.
      5. INSERT new sub words and UPDATE the ones already in
         ``similar_word_dic``.

    Args:
        title: Optional heading; this step's own heading is appended to it.
        outPut: Optional name shown as ``'Process data of <outPut>'`` in
            front of every progress message.

    Raises:
        KeyError: If a word lacks the self-relation row its count is read
            from, or a related word is missing from the grouped data.
    """
    if title is None:
        title = 'Update Similar word Dictionary'
    else:
        title += '\nUpdate Similar word Dictionary'
    if outPut is None:
        outPut = ''
    else:
        outPut = 'Process data of ' + outPut + '\n'

    main.ShowTitle(title, outPut + 'Getting similarity data')
    sqlResult = DataBaseManager.DoSQL(
        " SELECT Normal_Word, Target_Word, Word_Count"
        " FROM similar_word_relation"
        " WHERE Similar_Value > 0.95"
        " ORDER BY Similar_Value DESC ")

    # wordDict: word -> {related word: Word_Count of that row}.
    wordDict = {}
    updateTime = 0
    for index, result in enumerate(sqlResult):
        # Refresh the progress line at most once per second.
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Building relation dictionary (' + str(index) +
                '/' + str(len(sqlResult)) + ')')
        wordDict.setdefault(result[0], {})[result[1]] = result[2]

    # Collect first, pop afterwards: a dict must not shrink while iterated.
    initialLength = len(wordDict)
    removeList = []
    updateTime = 0
    for index, (key, relation) in enumerate(wordDict.items()):
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Removing unnecessary word (' + str(index) + '/' +
                str(initialLength) + ' removed: ' + str(len(removeList)) +
                ')')
        if len(relation) <= 1:
            removeList.append(key)
    for key in removeList:
        wordDict.pop(key)

    # relatedWordDict: sub word -> super word.
    relatedWordDict = {}
    SentiWordBinder = BindSentiWords.BindSentiWords()
    updateTime = 0
    for index, (key, value) in enumerate(wordDict.items()):
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Calculating sentimental value (' + str(index) +
                '/' + str(len(wordDict)) + ')')
        sentiValueDict = SentiWordBinder.BindSentiWords(list(value.keys()))
        keySentiValue = sentiValueDict[key]
        # The binder reports a missing sentiment as the string 'None'.
        if keySentiValue == 'None':
            continue
        sentiValueDict.pop(key)
        for word, targetSentiValue in sentiValueDict.items():
            if targetSentiValue == 'None':
                continue
            if int(keySentiValue) != int(targetSentiValue):
                continue
            keyCount = value[key]
            wordCount = wordDict[word][word]
            if keyCount > wordCount:
                relatedWordDict[word] = key
            elif keyCount < wordCount:
                relatedWordDict[key] = word
            # Equal counts: neither word ranks above the other, so nothing
            # is recorded.  (A leftover relation from an earlier iteration
            # used to be written again here.)

    updateTime = 0
    for index, (subWord, superWord) in enumerate(relatedWordDict.items()):
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Building similar word dictionary (' + str(index) +
                '/' + str(len(relatedWordDict)) + ')')
        if superWord not in relatedWordDict:
            continue
        upperWord = relatedWordDict[superWord]
        # Only values of existing keys are reassigned, so the dict keeps
        # its size and may be changed while it is being iterated.
        relatedWordDict[subWord] = upperWord
        for otherSub, otherSuper in relatedWordDict.items():
            if otherSuper == superWord:
                relatedWordDict[otherSub] = upperWord

    main.ShowTitle(title, outPut + 'Getting exist similar word dictionary')
    sqlResult = DataBaseManager.DoSQL(
        " SELECT Sub_Word, Similar_ID FROM similar_word_dic ")
    existRelatedWordDict = dict(sqlResult)

    insertQuery = []
    updateQuery = []
    updateTime = 0
    for index, (subWord, superWord) in enumerate(relatedWordDict.items()):
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Appending Query (' + str(index) + '/' +
                str(len(relatedWordDict)) + ')')
        # NOTE(review): words are concatenated into the SQL text unescaped.
        if subWord in existRelatedWordDict:
            # The statement used to read "SET Super_Word WHERE ...", which
            # assigns nothing and is not valid SQL.
            updateQuery.append(
                " UPDATE similar_word_dic SET Super_Word = '" + superWord +
                "' WHERE Similar_ID = " +
                str(existRelatedWordDict[subWord]) + " ")
        else:
            insertQuery.append(
                " INSERT INTO similar_word_dic (Sub_Word, Super_Word)"
                " VALUES ('" + subWord + "', '" + superWord + "') ")

    DataBaseManager.DoManyQuery(insertQuery, title=title, outPut=outPut,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery, title=title, outPut=outPut,
                                queryType='UPDATE')
def AppendWordDicQuery(title=None, outPut=None):
    """Queue the word-count queries for every product's relation table.

    For each entry of the global ``productSimilarDic`` (product name ->
    {word: count}):

    * If the product has no relation table yet, a CREATE TABLE statement
      for ``db_capstone_similarity`` and an UPDATE of
      ``product_dic.Relation_Table_Name`` are built, and the global
      ``productDic`` entry is marked with the new table name.
    * Otherwise the words already stored in the table are read.
    * Each word is then queued on the global ``similarInsertQuery`` (new
      word) or ``similarUpdateQuery`` (count added to the existing row).

    Only the CREATE and ``product_dic`` UPDATE statements are executed
    here; the two global queues are left for the caller to run.

    Args:
        title: Title forwarded to ``DataBaseManager.DoManyQuery``.
        outPut: Output text forwarded to ``DataBaseManager.DoManyQuery``.
    """
    global productDic
    global productSimilarDic
    global similarInsertQuery
    global similarUpdateQuery

    if title is None:
        title = ''
    if outPut is None:
        outPut = ''

    # Category_ID -> list of Feature_Name.
    featureDic = {}
    sqlResult = DataBaseManager.DoSQL(
        " SELECT Feature_Name, Category_ID FROM feature_dic ")
    for result in sqlResult:
        featureDic.setdefault(result[1], []).append(result[0])

    createQuery = []
    updateRelateTableQuery = []
    for name, wordDict in productSimilarDic.items():
        if productDic[name]['relationTable'] is None:
            # One FLOAT column for the product itself plus one per feature
            # of its category; a product without a known category only
            # gets its own column.
            try:
                categoryFeatures = featureDic[productDic[name]['categoryID']]
            except KeyError:
                categoryFeatures = []
            featureList = [name] + list(categoryFeatures)
            createQuery.append(
                " CREATE TABLE `db_capstone_similarity`.`" + name + "` ("
                " `Word_ID` INT(11) NOT NULL AUTO_INCREMENT,"
                " `Word` TEXT NOT NULL,"
                " `Word_Count` INT(11) NOT NULL DEFAULT '0',"
                " `Sentiment_Value` INT(11) NULL DEFAULT NULL,`" +
                '`FLOAT NULL DEFAULT NULL,`'.join(featureList) +
                "` FLOAT NULL DEFAULT NULL,"
                " PRIMARY KEY (`Word_ID`) ) ENGINE = InnoDB ")
            updateRelateTableQuery.append(
                " UPDATE product_dic SET Relation_Table_Name = '" + name +
                "' WHERE Product_ID = " +
                str(productDic[name]['productID']) + " ")
            productDic[name]['relationTable'] = name
            # A table that does not exist yet holds no words.
            existWord = {}
        else:
            existWord = dict(DataBaseManager.DoSQL(
                " SELECT `Word`, `Word_ID` FROM `" + name + "`",
                'db_capstone_similarity'))

        # NOTE(review): words are concatenated into the SQL text unescaped.
        for word, count in wordDict.items():
            if word in existWord:
                similarUpdateQuery.append(
                    " UPDATE `" + name +
                    "` SET `Word_Count` = `Word_Count` + " + str(count) +
                    " WHERE `Word_ID` = " + str(existWord[word]) + " ")
            else:
                similarInsertQuery.append(
                    " INSERT INTO `" + name +
                    "` (`Word` ,`Word_Count`) VALUES ('" + word + "', " +
                    str(count) + ") ")

    DataBaseManager.DoManyQuery(createQuery, title=title, outPut=outPut,
                                queryType='CREATE')
    DataBaseManager.DoManyQuery(updateRelateTableQuery, title=title,
                                outPut=outPut, queryType='UPDATE')
def _ReviewCountSuffix(skippedIndex, noProductIndex):
    """Return ``' (skipped: N / not product: M)'``, or ``''`` if both are 0."""
    parts = []
    if skippedIndex > 0:
        parts.append('skipped: ' + str(skippedIndex))
    if noProductIndex > 0:
        parts.append('not product: ' + str(noProductIndex))
    if not parts:
        return ''
    return ' (' + ' / '.join(parts) + ')'


def Dividing(reviewData, fileName, title=None, outPut=None):
    """Turn the reviews of one file into dictionary rows and word counts.

    Each item of ``reviewData`` is split on ``','`` into a review number, a
    title and the remaining text.  Reviews whose number is already in
    ``review_dic`` are skipped, and reviews whose title is ``'!e'`` are
    ignored.  For every product ``GetProductName`` finds in a review:

    * the product's ``Count`` is incremented,
    * a ``review_dic`` row is inserted,
    * the review's normalised words are counted in the global
      ``productSimilarDic``.

    All queued queries are executed before returning.

    Args:
        reviewData: Iterable of raw ``"number,title,text..."`` strings.
        fileName: Source file name; prefixes each review number.
        title: Title forwarded to the progress display.
        outPut: Prefix for every progress message.

    Returns:
        A summary string.  If an item has fewer than two comma separated
        fields, ``'No data in <fileName>'`` is returned immediately and
        nothing queued so far is executed.
    """
    global similarInsertQuery
    global similarUpdateQuery
    global insertQuery
    global updateQuery

    if title is None:
        title = ''
    if outPut is None:
        outPut = ''

    similarInsertQuery = []
    similarUpdateQuery = []
    insertQuery = []
    updateQuery = []
    completeIndex = 0
    noProductIndex = 0
    skippedIndex = 0

    sqlResult = DataBaseManager.DoSQL(
        " SELECT Review_ID, Review_Number FROM review_dic ")
    # Built once as a set: testing against dict.values() is a linear scan
    # for every single review.
    completedNumbers = set(dict(sqlResult).values())

    updateTime = 0
    for data in reviewData:
        # Refresh the progress line at most once per second.
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Building dictionary for ' + fileName + ' (' +
                str(completeIndex + skippedIndex + noProductIndex) + '/' +
                str(len(reviewData)) + ')' +
                _ReviewCountSuffix(skippedIndex, noProductIndex))

        splitData = data.split(',')
        if len(splitData) < 2:
            return 'No data in ' + fileName

        reviewNumber = fileName + '-' + splitData.pop(0)
        reviewTitleString = splitData.pop(0)
        reviewTitleString = reviewTitleString.replace('\n', '')
        reviewTitleString = reviewTitleString.replace(';', ',')
        # The text's own commas were consumed by split(); the source data
        # uses ';' where a literal comma is meant.
        reviewString = ''.join(splitData)
        reviewString = reviewString.replace('\n', '')
        reviewString = reviewString.replace(';', ',')

        if reviewNumber in completedNumbers:
            skippedIndex += 1
            continue
        if reviewTitleString == '!e':
            continue

        reviewTitleStringList = NLP.DoNLP(reviewTitleString, None, 'Review')
        reviewStringList = NLP.DoNLP(reviewString, None, 'Review')
        resultList = GetProductName(
            ' '.join(reviewTitleStringList),
            ' '.join(reviewStringList)).get('product_Name')
        if len(resultList) == 0:
            noProductIndex += 1
            continue

        resultStringList = DictionaryBuilder.ConvertNormalWord(
            mainStringList=reviewTitleStringList,
            subStringList=reviewStringList,
            mode='Review')
        resultString = '#'.join(resultStringList)
        # NOTE(review): names and review text are concatenated into the
        # SQL text unescaped.
        for name in resultList:
            updateQuery.append(
                " UPDATE product_dic SET Count = Count + 1"
                " WHERE Product_Name = '" + name + "' ")
            insertQuery.append(
                " INSERT INTO review_dic"
                " (Review_Number, Review, Product_ID)"
                " VALUES ('" + reviewNumber + "', '" + resultString +
                "', " + str(productDic[name].get('productID')) + ") ")
            for word in resultStringList:
                # Created per word so a review without words does not
                # register an empty entry for the product.
                wordDict = productSimilarDic.setdefault(name, {})
                wordDict[word] = wordDict.get(word, 0) + 1
        completeIndex += 1

    returnString = "Complete building dictionary for " + fileName
    returnString += _ReviewCountSuffix(skippedIndex, noProductIndex)
    returnString += '\n'

    AppendWordDicQuery(title=title, outPut=outPut + returnString)
    DataBaseManager.DoManyQuery(insertQuery, title=title,
                                outPut=outPut + returnString,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery, title=title,
                                outPut=outPut + returnString,
                                queryType='UPDATE')
    DataBaseManager.DoManyQuery(similarInsertQuery,
                                'db_capstone_similarity', title=title,
                                outPut=outPut + returnString,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(similarUpdateQuery,
                                'db_capstone_similarity', title=title,
                                outPut=outPut + returnString,
                                queryType='UPDATE')
    return returnString
def AppendArticleDic(reviewData, fileName, title=None, outPut=None):
    """Insert the articles of one file into ``article_dic``.

    Each item of ``reviewData`` is split on ``','`` into an article number
    and its text.  Articles whose number is already stored are skipped, and
    articles whose first text field is ``'!e'`` are ignored.  Every other
    article is normalised with ``ConvertNormalWord`` and queued for
    insertion.

    Args:
        reviewData: Iterable of raw ``"number,text..."`` strings.
        fileName: Source file name; prefixes each article number.
        title: Title forwarded to the progress display.
        outPut: Prefix for every progress message.

    Returns:
        A summary string.  If an item has fewer than two comma separated
        fields, ``'No data in <fileName>'`` is returned immediately and
        nothing queued so far is executed.
    """
    global insertQuery
    global updateQuery

    if title is None:
        title = ''
    if outPut is None:
        outPut = ''

    insertQuery = []
    updateQuery = []
    stackUnit = DataBaseManager.maximumQueryStactUnit
    completeIndex = 0
    skippedIndex = 0

    articleLastID = DataBaseManager.DoSQL(
        " SELECT `AUTO_INCREMENT` FROM INFORMATION_SCHEMA.TABLES"
        " WHERE TABLE_SCHEMA = 'db_capstone'"
        " AND TABLE_NAME = 'article_dic'; ")[0][0]

    # Read the stored article numbers in slices of stackUnit IDs; the first
    # slice is always fetched, as before.
    articleNumberList = []
    index = 0
    while True:
        articleNumberList.extend(DataBaseManager.DoSQL(
            " SELECT Article_ID, Article_Number FROM article_dic"
            " WHERE Article_ID > " + str(index) +
            " AND Article_ID <= " + str(index + stackUnit)))
        index += stackUnit
        if index > articleLastID:
            break
    # Built once as a set: testing against dict.values() is a linear scan
    # for every single article.
    completedNumbers = set(dict(articleNumberList).values())

    wordDic = {}
    main.ShowTitle(
        title,
        outPut + 'Building dictionary for ' + fileName + ' (' +
        str(completeIndex) + '/' + str(len(reviewData)) + ')')
    updateTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
    for data in reviewData:
        # Refresh the progress line at most once per second.
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Building dictionary for ' + fileName + ' (' +
                str(completeIndex + skippedIndex) + '/' +
                str(len(reviewData)) + ')')

        splitData = data.split(',')
        if len(splitData) < 2:
            return 'No data in ' + fileName

        reviewNumber = fileName + '-' + splitData.pop(0)
        firstField = splitData.pop(0)
        reviewString = firstField + ',' + ''.join(splitData)
        reviewString = reviewString.replace('\n', '')
        reviewString = reviewString.replace(';', ',')

        if reviewNumber in completedNumbers:
            skippedIndex += 1
            continue
        # The '!e' marker is tested on the first text field.  It used to be
        # compared with reviewString, which always contains the ',' added
        # above and therefore never matched.
        if firstField.replace('\n', '') == '!e':
            continue

        resultStringList = ConvertNormalWord(reviewString, reviewString)
        resultString = '#'.join(resultStringList)
        # NOTE(review): article text is concatenated into the SQL text
        # unescaped.
        insertQuery.append(
            " INSERT INTO article_dic (Article_Number, Article)"
            " VALUES ('" + reviewNumber + "', '" + resultString + "')")
        for word in resultStringList:
            wordDic[word] = wordDic.get(word, 0) + 1
        completeIndex += 1

    # NOTE(review): AppendWordDicQuery is declared as (title, outPut), so
    # wordDic arrives there as ``title`` and the counts gathered above are
    # not used by it.  Left unchanged pending confirmation of the intent.
    AppendWordDicQuery(wordDic)

    returnString = "Complete building dictionary for " + fileName
    if skippedIndex > 0:
        returnString += ' (skipped ' + str(skippedIndex) + ' of ' + str(
            len(reviewData)) + ' review)'
    returnString += '\n'

    DataBaseManager.DoManyQuery(insertQuery, title=title,
                                outPut=outPut + returnString,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery, title=title,
                                outPut=outPut + returnString,
                                queryType='UPDATE')
    return returnString