def GetReviewData(workDir):
    """Divide every ``.txt`` review file in *workDir*, then process all products.

    Sets the module-level ``sourceName`` to the last backslash-separated
    component of *workDir*. Each readable file is passed through
    ``FileManager.FileReader`` and ``Dividing``; files that cannot be opened
    are reported in the accumulated output instead of aborting the run.

    Args:
        workDir: Backslash-separated directory path containing review files.

    Returns:
        Whatever ``WordSimilarity.ProcessAllProduct`` returns for the
        accumulated output text.
    """
    global sourceName
    sourceName = workDir.split('\\')[-1]

    # Only files that have an extension and whose extension is exactly 'txt'.
    targetFileList = []
    for file in os.listdir(workDir):
        nameSplit = file.split('.')
        if len(nameSplit) >= 2 and nameSplit[-1] == 'txt':
            targetFileList.append(file)

    outPut = ''
    for number, file in enumerate(targetFileList, start=1):
        title = ('Now dividing reviews (' + str(number) + '/'
                 + str(len(targetFileList)) + ')')
        try:
            fileDes = open(workDir + "\\" + file, 'r', encoding="utf-8")
        except OSError:
            outPut += 'fail to open ' + file + '\n' + '\n'
            main.ShowTitle(title, outPut)
            continue
        # Close the handle once the file has been consumed.
        # NOTE(review): assumes FileReader fully materialises the data
        # (Dividing calls len() on it) - confirm it is not lazy.
        with fileDes:
            outPut += Dividing(FileManager.FileReader(fileDes), file, title,
                               outPut)

    WordSimilarity.baseDir = baseDir
    return WordSimilarity.ProcessAllProduct(title=None, outPut=outPut)
def BuildWordDic(workDir):
    """Build the article word dictionary from every ``.txt`` file in *workDir*.

    Each readable file is passed through ``FileManager.FileReader`` and
    ``AppendArticleDic`` (named by the part of the file name before the first
    dot). Files that cannot be opened are reported in the accumulated output
    and skipped.

    Args:
        workDir: Backslash-separated directory path containing article files.

    Returns:
        Whatever ``WordSimilarity.ProcessArticle`` returns for the
        accumulated output text.
    """
    # Only files that have an extension and whose extension is exactly 'txt'.
    targetFileList = []
    for file in os.listdir(workDir):
        nameSplit = file.split('.')
        if len(nameSplit) >= 2 and nameSplit[-1] == 'txt':
            targetFileList.append(file)

    outPut = ''
    for number, file in enumerate(targetFileList, start=1):
        try:
            fileDes = open(workDir + "\\" + file, 'r', encoding="utf-8")
        except OSError:
            outPut += 'fail to open ' + file + '\n' + '\n'
            continue
        title = ('Now processing articles (' + str(number) + '/'
                 + str(len(targetFileList)) + ')')
        # Close the handle once the file has been consumed.
        # NOTE(review): assumes FileReader fully materialises the data
        # (AppendArticleDic calls len() on it) - confirm it is not lazy.
        with fileDes:
            outPut += AppendArticleDic(FileManager.FileReader(fileDes),
                                       file.split('.')[0], title, outPut)
        main.ShowTitle(title, outPut)

    WordSimilarity.baseDir = baseDir
    return WordSimilarity.ProcessArticle(outPut=outPut, title=None)
def DoManyQuery(queryList, db=None, title=None, outPut=None, queryType=None):
    """Send *queryList* to the database in ';'-joined batches.

    Queries are grouped into chunks of ``maximumQueryStactUnit`` and each
    chunk is executed with a single ``DoSQL`` call. A progress title is
    shown at most once per wall-clock second.

    Args:
        queryList: SQL statements to execute; an empty list is a no-op.
        db: Database name forwarded to ``DoSQL``.
        title: Title text for the progress display (defaults to '').
        outPut: Text shown before the progress line (defaults to '').
        queryType: Label inserted into the progress line (defaults to '').
    """
    title = '' if title is None else title
    outPut = '' if outPut is None else outPut
    queryType = '' if queryType is None else queryType

    if queryList == []:
        return

    batchCount = math.ceil(len(queryList) / maximumQueryStactUnit)
    lastShown = 0
    for batchIndex in range(batchCount):
        # Second-resolution timestamp used to throttle screen refreshes.
        now = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if lastShown < now:
            lastShown = now
            progress = ('Sending ' + queryType + ' query (' + str(batchIndex)
                        + '/' + str(batchCount) + ')')
            main.ShowTitle(title, outPut + progress)
        start = batchIndex * maximumQueryStactUnit
        batch = queryList[start:start + maximumQueryStactUnit]
        DoSQL(';'.join(batch), db)
def GetSimilarity(target=None, title=None, outPut=None):
    """Train (or incrementally update) the FastText model and save it.

    When no model is loaded, a new one is built from the module-level
    ``contents``. When a model is already loaded it is updated only if
    ``contents`` is non-empty; otherwise nothing is trained or saved.
    The model file is named after the current timestamp and stored under
    ``baseDir\\WordVectorData\\Normal`` or ``baseDir\\WordVectorData\\<target>``.

    Args:
        target: Product identifier selecting the save directory, or None
            for the 'Normal' directory.
        title: Optional title prefix for the progress display.
        outPut: Optional name shown as 'Process data of <name>'.
    """
    global contents
    global baseDir
    global embedding_model

    if title is None:
        title = 'Building similarity data'
    else:
        title = title + '\nBuilding similarity data'
    outPut = '' if outPut is None else 'Process data of ' + outPut + '\n'

    stamp = datetime.datetime.now().strftime('%Y#%m#%d&%H#%M#%S')
    if target is None:
        folder = baseDir + '\\WordVectorData\\Normal\\'
    else:
        folder = baseDir + '\\WordVectorData\\' + str(target) + '\\'
    saveDir = folder + stamp + '.fasttext'

    main.ShowTitle(title, outPut)

    if embedding_model is None:
        # No usable saved model: start from scratch on the loaded contents.
        embedding_model = FastText(size=30, window=3, min_count=5, workers=4,
                                   sg=1)
        embedding_model.build_vocab(contents)
    elif len(contents) > 0:
        # Existing model plus new data: extend the vocabulary in place.
        embedding_model.build_vocab(contents, update=True)
    else:
        # Existing model and nothing new to learn.
        return

    embedding_model.train(contents,
                          total_examples=embedding_model.corpus_count,
                          epochs=embedding_model.epochs)
    embedding_model.save(saveDir)
def SelectProduct(Mode, outPut=None, title=None):
    """Interactive menu for querying related words.

    In 'Product' mode the user searches for a product, picks one from the
    numbered result list, the similarity data for that product is refreshed,
    and related words can then be queried. In any other mode the article
    ("Normal") similarity data is processed and queried instead.

    Args:
        Mode: 'Product' for the per-product flow; anything else selects the
            article flow.
        outPut: Initial message shown under the title (defaults to '').
        title: Title text for the display (defaults to '').

    Returns:
        '' when the user backs out, 'There is no data' when there are no
        articles, or the result of ``ProcessAllProduct()`` for '%a'.
    """
    if title is None:
        title = ''
    if outPut is None:
        outPut = ''
    invalidInput = 'Please enter correct number or charactor'

    if Mode != 'Product':
        articleCount = DataBaseManager.DoSQL("""
            SELECT COUNT(*) FROM article_dic
        """)[0][0]
        if articleCount <= 0:
            return 'There is no data'
        ProcessArticle()
        _AskRelatedWords('Normal')
        # Return a string like every other exit path (previously None).
        return ''

    while True:
        main.ShowTitle(title, outPut)
        inputValue = input(
            'Enter product Name (%q to back %a to process all): ')
        if inputValue == '%q':
            return ''
        if inputValue == '%a':
            return ProcessAllProduct()

        productList = ReviewDivider.GetProductName(inputValue).get(
            'product_Name')
        main.ShowTitle(title, 'Data for ' + inputValue)
        if len(productList) > 0:
            for number, product in enumerate(productList, start=1):
                print(str(number) + '. ' + product)
        else:
            print('No result')
        print('')
        print('r. Re-enter name')
        print('b. Back')
        inputValue = input("=> ")
        if inputValue == 'r':
            outPut = ''
            continue
        elif inputValue == 'b':
            return ''

        # Validate the menu number explicitly: '0' or a negative number must
        # not silently select an entry from the end of the list.
        try:
            selectedIndex = int(inputValue) - 1
        except ValueError:
            selectedIndex = -1
        if not 0 <= selectedIndex < len(productList):
            outPut = invalidInput
            continue
        productName = productList[selectedIndex]

        # NOTE(review): the product name is interpolated into the SQL text
        # unescaped; switch to a parameterised query if DoSQL supports one.
        try:
            targetID = DataBaseManager.DoSQL("""
                SELECT Product_ID
                FROM product_dic
                WHERE Product_Name = '""" + productName + """'
            """)[0][0]
        except Exception:
            # Best effort, as before: any lookup failure re-prompts the user.
            outPut = invalidInput
            continue

        tableName = DataBaseManager.DoSQL("""
            SELECT Relation_Table_Name
            FROM product_dic
            WHERE Product_ID = """ + str(targetID) + """
            LIMIT 1
        """)[0][0]
        if tableName is None:
            # Report the product name; concatenating None raised TypeError.
            outPut = 'No review for ' + productName
            continue

        GetContent(targetID, title=title, outPut=productName)
        GetSimilarity(targetID, title=title, outPut=productName)
        if len(contents) > 0:
            UpdateSimilarityDatabase(targetID, title=title, outPut=productName)
        _AskRelatedWords(tableName)
        outPut = ''


def _AskRelatedWords(tableName):
    """Prompt for target words until '%q', showing related words each time."""
    outPut = ''
    while True:
        main.ShowTitle('', outPut)
        inputValue = input('Enter target word (%q to back): ')
        if inputValue == '%q':
            return
        outPut = GetRelatedWord(tableName, inputValue)
def UpdateSimilarityDatabase(target=None, title=None, outPut=None):
    """Write similarity values from the loaded FastText model to the database.

    With ``target=None`` the global ``similar_word_relation`` table is
    refreshed: vocabulary words for which ``NLP.DoNLP(word, ['VA', 'VV'])``
    returns nothing are dropped, and every remaining word pair is queued as
    an INSERT (new pair) or UPDATE (known pair). With a product *target*, the
    product's relation table gets one similarity column update per
    feature/word pair plus a sentiment value where one is available.

    Args:
        target: Product_ID, or None for the global word relation table.
        title: Optional title prefix for the progress display.
        outPut: Optional name shown as 'Process data of <name>'.

    NOTE(review): words and table/column names are interpolated into the
    SQL text unescaped; a word containing a quote will break the statement.
    """
    global embedding_model
    if title is None:
        title = 'Append Similar word relation'
    else:
        title += '\nAppend Similar word relation'
    if outPut is None:
        outPut = ''
    else:
        outPut = 'Process data of ' + outPut + '\n'

    def currentSecond():
        # Second-resolution stamp used to throttle screen refreshes.
        return int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))

    main.ShowTitle(title, outPut + 'Getting exist data')
    insertQuery = []
    updateQuery = []
    vocabulary = list(embedding_model.wv.index2word)

    if target is None:
        # Normal_Word -> {Target_Word: Similar_Relation_ID}. setdefault keeps
        # this working when a word's self-row is missing or arrives after
        # its other rows (previously an AttributeError / silent reset).
        relationDict = {}
        relationList = DataBaseManager.DoSQL("""
            SELECT Normal_Word, Target_Word, Similar_Relation_ID
            FROM similar_word_relation
        """)
        for relation in relationList:
            knownRelations = relationDict.setdefault(relation[0], {})
            if relation[0] != relation[1]:
                knownRelations[relation[1]] = relation[2]

        main.ShowTitle(title,
                       outPut + 'Getting latest calculated similar data')
        # Keep only words that NLP tags as VA/VV. Iterating directly also
        # handles an empty vocabulary.
        # NOTE(review): this filter is scoped to the global branch because
        # its result (wordDict) is only read there - confirm against the
        # original layout.
        wordList = []
        removedCount = 0
        updateTime = 0
        for index, word in enumerate(vocabulary):
            now = currentSecond()
            if updateTime < now:
                updateTime = now
                main.ShowTitle(
                    title,
                    outPut + 'Removing not verb and adjective (' + str(index)
                    + '/' + str(len(vocabulary)) + ' removed: '
                    + str(removedCount) + ')')
            if len(NLP.DoNLP(word, ['VA', 'VV'])) <= 0:
                removedCount += 1
            else:
                wordList.append(word)
        wordDict = dict.fromkeys(wordList)

        updateTime = 0
        for index, word in enumerate(wordDict):
            result = embedding_model.most_similar(
                positive=[word], topn=len(vocabulary) - 1)
            now = currentSecond()
            if updateTime < now:
                updateTime = now
                main.ShowTitle(
                    title,
                    outPut + 'Append query (' + str(index) + '/'
                    + str(len(wordList)) + ')')
            knownRelations = relationDict.get(word, {})
            for similar in result:
                if similar[0] not in wordDict:
                    continue
                if similar[0] in knownRelations:
                    updateQuery.append("""
                        UPDATE similar_word_relation
                        SET Similar_Value = """ + str(similar[1]) + """
                        WHERE Similar_Relation_ID = """
                                       + str(knownRelations[similar[0]]))
                else:
                    insertQuery.append("""
                        INSERT INTO similar_word_relation
                        (Normal_Word, Target_Word, Similar_Value)
                        VALUES ('""" + word + """', '""" + similar[0]
                                       + """', """ + str(similar[1]) + """)""")
        db = 'db_capstone'
    else:
        sqlResult = DataBaseManager.DoSQL("""
            SELECT Category_ID, Relation_Table_Name
            FROM product_dic
            WHERE Product_ID = """ + str(target) + """
        """)
        productInfo = sqlResult[0]
        # The product's own relation-table name is treated as the first
        # feature, followed by the category's feature names.
        featureList = [productInfo[1]]
        sqlResult = DataBaseManager.DoSQL("""
            SELECT Feature_Name
            FROM feature_dic
            WHERE Category_ID = """ + str(productInfo[0]) + """
        """)
        for result in sqlResult:
            featureList.append(result[0])

        SentiWordBinder = BindSentiWords.BindSentiWords()
        updateTime = 0
        for index, feature in enumerate(featureList):
            result = embedding_model.most_similar(
                positive=[feature], topn=len(vocabulary) - 1)
            now = currentSecond()
            if updateTime < now:
                updateTime = now
                main.ShowTitle(
                    title,
                    outPut + 'Append query (' + str(index) + '/'
                    + str(len(featureList)) + ')')
            for similar in result:
                if feature == similar[0] or productInfo[1] == similar[0]:
                    continue
                updateQuery.append("""
                    UPDATE `""" + productInfo[1] + """`
                    SET `""" + feature + """` = """ + str(similar[1]) + """
                    WHERE Word = '""" + similar[0] + """'
                """)
                sentiValueDict = SentiWordBinder.BindSentiWords([similar[0]])
                if sentiValueDict[similar[0]] != 'None':
                    updateQuery.append("""
                        UPDATE `""" + productInfo[1] + """`
                        SET Sentiment_Value = """
                                       + sentiValueDict[similar[0]] + """
                        WHERE Word = '""" + similar[0] + """'
                    """)
            # Clear the feature value for words that occur too rarely.
            updateQuery.append("""
                UPDATE `""" + productInfo[1] + """`
                SET `""" + feature + """` = null
                WHERE Word_Count <= """ + str(5) + """
            """)
        db = 'db_capstone_similarity'

    DataBaseManager.DoManyQuery(insertQuery, db=db, title=title,
                                outPut=outPut, queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery, db=db, title=title,
                                outPut=outPut, queryType='UPDATE')
def UpdateSimilarWordDictionary(title=None, outPut=None):
    """Rebuild ``similar_word_dic`` from highly similar word relations.

    Steps:
      1. Load relations with ``Similar_Value > 0.95`` and group them as
         ``{Normal_Word: {Target_Word: Word_Count}}``.
      2. Drop words that have at most one relation.
      3. For each pair with the same sentiment value, map the less frequent
         word (sub word) to the more frequent one (super word).
      4. Flatten chains so a sub word points at the top-most super word.
      5. Queue INSERTs for new sub words and UPDATEs for existing ones.

    Args:
        title: Optional title prefix for the progress display.
        outPut: Optional name shown as 'Process data of <name>'.

    NOTE(review): words are interpolated into the SQL text unescaped.
    """
    if title is None:
        title = 'Update Similar word Dictionary'
    else:
        title += '\nUpdate Similar word Dictionary'
    if outPut is None:
        outPut = ''
    else:
        outPut = 'Process data of ' + outPut + '\n'

    updateTime = 0

    def showProgress(message):
        # Refresh the screen at most once per wall-clock second.
        nonlocal updateTime
        now = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < now:
            updateTime = now
            main.ShowTitle(title, outPut + message)

    main.ShowTitle(title, 'Getting similarity data')
    sqlResult = DataBaseManager.DoSQL("""
        SELECT Normal_Word, Target_Word, Word_Count
        FROM similar_word_relation
        WHERE Similar_Value > 0.95
        ORDER BY Similar_Value DESC
    """)

    # Step 1: group relations by their normal word.
    wordDict = {}
    for index, result in enumerate(sqlResult):
        showProgress('Building relation dictionary (' + str(index) + '/'
                     + str(len(sqlResult)) + ')')
        wordDict.setdefault(result[0], {})[result[1]] = result[2]

    # Step 2: a word related to at most one word carries no information.
    initialLength = len(wordDict)
    removeList = [key for key, relation in wordDict.items()
                  if len(relation) <= 1]
    main.ShowTitle(
        title,
        outPut + 'Removing unnecessary word (' + str(initialLength) + '/'
        + str(initialLength) + ' removed: ' + str(len(removeList)) + ')')
    for key in removeList:
        wordDict.pop(key)

    # Step 3: pair words with equal sentiment, ordered by word count.
    relatedWordDict = {}
    SentiWordBinder = BindSentiWords.BindSentiWords()
    updateTime = 0
    for index, (key, value) in enumerate(wordDict.items()):
        showProgress('Calculating sentimental value (' + str(index) + '/'
                     + str(len(wordDict)) + ')')
        if key not in value:
            # No self relation, so there is no word count to compare with.
            continue
        sentiValueDict = SentiWordBinder.BindSentiWords(list(value.keys()))
        keySentiValue = sentiValueDict[key]
        if keySentiValue == 'None':
            continue
        for word, targetSentiValue in sentiValueDict.items():
            if word == key or targetSentiValue == 'None':
                continue
            if int(keySentiValue) != int(targetSentiValue):
                continue
            otherRelation = wordDict.get(word)
            if otherRelation is None or word not in otherRelation:
                # The other word was filtered out or has no own count.
                continue
            # Equal counts give no ordering, so no relation is recorded
            # (previously a stale relation from an earlier pair was reused).
            if value[key] > otherRelation[word]:
                relatedWordDict[word] = key
            elif value[key] < otherRelation[word]:
                relatedWordDict[key] = word

    # Step 4: collapse sub -> super -> upper chains. Only values of existing
    # keys are reassigned, so the dict size never changes while iterating.
    updateTime = 0
    for index, (subWord, superWord) in enumerate(relatedWordDict.items()):
        showProgress('Building similar word dictionary (' + str(index) + '/'
                     + str(len(relatedWordDict)) + ')')
        if superWord not in relatedWordDict:
            continue
        upperWord = relatedWordDict[superWord]
        relatedWordDict[subWord] = upperWord
        for otherSub, otherSuper in relatedWordDict.items():
            if otherSuper == superWord:
                relatedWordDict[otherSub] = upperWord

    main.ShowTitle(title, outPut + 'Getting exist similar word dictionary')
    sqlResult = DataBaseManager.DoSQL("""
        SELECT Sub_Word, Similar_ID
        FROM similar_word_dic
    """)
    existRelatedWordDict = dict(sqlResult)

    # Step 5: queue the database changes.
    insertQuery = []
    updateQuery = []
    updateTime = 0
    for index, (subWord, superWord) in enumerate(relatedWordDict.items()):
        showProgress('Appending Query (' + str(index) + '/'
                     + str(len(relatedWordDict)) + ')')
        if subWord in existRelatedWordDict:
            # The original statement omitted the assigned value
            # ("SET Super_Word WHERE ..."), which is not valid SQL.
            updateQuery.append("""
                UPDATE similar_word_dic
                SET Super_Word = '""" + superWord + """'
                WHERE Similar_ID = """ + str(existRelatedWordDict[subWord])
                               + """
            """)
        else:
            insertQuery.append("""
                INSERT INTO similar_word_dic (Sub_Word, Super_Word)
                VALUES ('""" + subWord + """', '""" + superWord + """')
            """)

    DataBaseManager.DoManyQuery(insertQuery, title=title, outPut=outPut,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery, title=title, outPut=outPut,
                                queryType='UPDATE')
def GetContent(target=None, title=None, outPut=None):
    """Load the newest saved model (if current) and the texts to train on.

    The newest ``*.fasttext`` file in the target's ``WordVectorData``
    directory determines the "last process date"; the directory is created
    when missing. If that file is newer than the most recent database entry
    it is loaded into the module-level ``embedding_model``, otherwise the
    model stays ``None``. Articles (``target=None``) or the product's reviews
    dated after the last process date are then split on '#' and appended to
    the module-level ``contents``.

    Args:
        target: Product_ID, or None to work on articles.
        title: Optional title prefix for the progress display.
        outPut: Optional name shown as 'Process data of <name>'.

    Returns:
        Always ''.

    NOTE(review): ``contents`` is only appended to here, so it accumulates
    across calls unless it is reset elsewhere - confirm that is intended.
    """
    global contents
    global embedding_model
    global baseDir
    embedding_model = None
    if title is None:
        title = 'Get data for Analyze similarity'
    else:
        title += '\nGet data for Analyze similarity'
    if outPut is None:
        outPut = ''
    else:
        outPut = 'Process data of ' + outPut + '\n'

    main.ShowTitle(title, outPut + "Find exist data")
    if target is None:
        targetDir = baseDir + '\\WordVectorData\\Normal'
        sqlResult = DataBaseManager.DoSQL("""
            SELECT Date
            FROM article_dic
            ORDER BY Article_ID DESC
            LIMIT 1
        """)
    else:
        targetDir = baseDir + '\\WordVectorData\\' + str(target)
        sqlResult = DataBaseManager.DoSQL("""
            SELECT Date
            FROM review_dic
            WHERE Product_ID = """ + str(target) + """
            ORDER BY Review_ID DESC
            LIMIT 1
        """)
    # An empty table (or a product without reviews) has no last update date.
    lastUpdateDate = sqlResult[0][0] if sqlResult else None

    # Collect the base names of the saved model files.
    targetFasttextList = []
    if os.path.isdir(targetDir):
        for file in os.listdir(targetDir):
            fileName, _, extension = file.rpartition('.')
            if extension == 'fasttext':
                targetFasttextList.append(fileName)
    else:
        os.makedirs(targetDir)

    lastProcessDate = '0000-00-00 00:00:00'
    if len(targetFasttextList) > 0:
        # File names are timestamps ('%Y#%m#%d&%H#%M#%S'), so the
        # lexicographic maximum is the newest model.
        targetFileName = max(targetFasttextList)
        datePart, timePart = targetFileName.split('&')[0:2]
        lastProcessDate = ('-'.join(datePart.split('#')) + ' '
                           + ':'.join(timePart.split('#')))
        if (lastUpdateDate is not None and targetFileName
                > lastUpdateDate.strftime('%Y#%m#%d&%H#%M#%S')):
            embedding_model = FastText.load(targetDir + '\\' + targetFileName
                                            + '.fasttext')
            outPut = 'There is existing similarity data'

    main.ShowTitle(title, outPut + "Reading data")
    if target is None:
        sqlResult = DataBaseManager.DoSQL("""
            SELECT Article
            FROM article_dic
            WHERE Date > '""" + lastProcessDate + """'
            AND Article != ''
        """)
    else:
        # Restrict to this product; the original query read the reviews of
        # every product into a per-product model.
        sqlResult = DataBaseManager.DoSQL("""
            SELECT Review
            FROM review_dic
            WHERE Date > '""" + lastProcessDate + """'
            AND Review != ''
            AND Product_ID = """ + str(target) + """
        """)

    # Each stored text is a '#'-joined token list.
    for result in sqlResult:
        for data in result:
            contents.append(data.split('#'))
    return ''
outPut += "{:10}\t".format(dataList[i][0]) outPut += "{:1.4}\t\t".format(dataList[i][1]) if i < len(adjectiveList): outPut += "{:10}\t".format(adjectiveList[i][0]) outPut += "{:1.4}".format(adjectiveList[i][1]) outPut += '\n' return outPut def Proceed(Mode, title='', outPut=''): return SelectProduct(Mode, title, outPut) if __name__ == '__main__': baseDir = os.getcwd() outPut = '' while True: main.ShowTitle(outPut) print('1. Product\n2. Normal word') inputValue = input('=> ') if inputValue == '1': outPut = Proceed('Product') elif inputValue == '2': outPut = Proceed('Normal') elif inputValue == 'q': exit() else: outPut = 'Please enter correct value or charactor\n'
def ManageProduct(title=None, outPut=None):
    """Interactive menu to add, modify or delete product dictionary entries.

    The user searches for a product name, then either adds a new product
    with that name or picks a listed product to modify or delete. For add
    and modify the flow asks for name (modify only), carrier, category, main
    descriptions and sub descriptions, then calls
    ``ProductDictionaryAppend`` or ``ProductDictionaryModify``.

    Args:
        title: Title text for the display (defaults to '').
        outPut: Initial message shown under the title (defaults to '').

    Returns:
        '' when the user backs out.
    """
    if title is None:
        title = ''
    if outPut is None:
        outPut = ''
    invalidInput = 'Please enter correct number or charactor'
    GetProductDic()
    while True:
        main.ShowTitle(title, outPut)
        productName = input('Enter product name (%q to back): ')
        if productName == '%q':
            return ''
        while True:
            productList = GetProductName(productName).get('product_Name')
            main.ShowTitle('Data for ' + productName + '\n' + outPut, title)
            for number, product in enumerate(productList, start=1):
                print(str(number) + '. Modify ' + product)
            print('')
            print('a. Add new product (' + productName + ')')
            print('r. Re-enter name')
            print('b. Back')
            inputValue = input('=> ')
            targetIndex = -1
            if inputValue == 'r':
                outPut = ''
                break
            elif inputValue == 'b':
                return ''
            elif inputValue == 'a':
                outPut = ''
                outPutWork = 'Add new product (' + productName + ')'
            else:
                try:
                    targetIndex = int(inputValue)
                except ValueError:
                    outPut = invalidInput
                    continue
                # Reject 0, negatives and numbers beyond the list instead of
                # indexing from the end or raising IndexError.
                if not 1 <= targetIndex <= len(productList):
                    outPut = invalidInput
                    continue
                productPureName = productList[targetIndex - 1]
                deleted = False
                while True:
                    main.ShowTitle(title, outPut)
                    print('1. Modify\n2. Delete')
                    modeInput = input('=> ')
                    if modeInput == '1':
                        outPut = ''
                        outPutWork = 'Modify ' + productPureName
                        break
                    elif modeInput == '2':
                        ProductDictionaryRemove(productPureName)
                        deleted = True
                        break
                if deleted:
                    # Nothing left to edit; previously the flow fell into
                    # the modify prompts with outPutWork unset.
                    outPut = ''
                    continue

            outPutState = ''
            proceed = True
            name = ''
            propertyList = ['', []]
            DiscriptionList = []

            # --- Name (modify only; a new product uses the searched name) ---
            if inputValue != 'a':
                outPutState = '/ Name'
                main.ShowTitle(outPutWork + outPutState + '\n' + outPut, title)
                nameInputValue = input(
                    'Enter product name (%q to cancel add / %s to skip):')
                if nameInputValue == '%q':
                    # Was 'proceed == False', a comparison with no effect.
                    proceed = False
                    outPut = ''
                elif nameInputValue == '%s':
                    name = ''
                    outPut = ''
                else:
                    name = nameInputValue
                    outPut = ''
            else:
                name = productName
                outPut = ''
            if not proceed:
                continue

            # --- Carrier (i == 0) and category (i == 1) ---
            for i in range(0, 2):
                stateString = [' / Carrier', ' / Category']
                targetString = ['carrier', 'category']
                if i == 0:
                    dataList = GetProductCarrierList(0)
                else:
                    dataList = GetProductCategoryList()
                while True:
                    main.ShowTitle(outPutWork + stateString[i] + '\n' + outPut,
                                   title)
                    for ID, propertyName in dataList.items():
                        print(str(ID) + '. ', end='')
                        print(propertyName)
                    print('')
                    if inputValue != 'a':
                        print('s. Skip')
                    if i == 1:
                        print('a. Add new ' + targetString[i])
                    print('b. Cancel add')
                    propertyInputValue = input('=> ')
                    if propertyInputValue == 'b':
                        proceed = False
                        outPut = ''
                        break
                    elif propertyInputValue == 's':
                        if inputValue != 'a':
                            propertyList[i] = ''
                            outPut = ''
                            break
                        else:
                            outPut = 'Please input without %'
                            continue
                    elif propertyInputValue == 'a':
                        if i == 0:
                            # Only categories can be added from this menu.
                            outPut = invalidInput
                            continue
                        main.ShowTitle(
                            outPutWork + outPutState + '\n' + outPut, title)
                        print('Enter' + targetString[i]
                              + '!NO SPACE ENTER! (%q to cancel add', end='')
                        if inputValue != 'a':
                            print(' / %s to skip', end='')
                        propertyInputValue = input('): ')
                        if propertyInputValue == '%q':
                            outPut = ''
                            proceed = False
                            break
                        elif propertyInputValue == '%s':
                            if inputValue != 'a':
                                outPut = ''
                                propertyList[i] = ''
                                break
                            else:
                                outPut = 'Please input without %'
                                continue
                        propertyList[i] = propertyInputValue.replace(' ', '')
                    else:
                        # Match the typed text against the displayed IDs.
                        # The original compared the raw string with the keys
                        # and then indexed with int(input) - 1, which cannot
                        # agree for a dict keyed by the printed ID.
                        # NOTE(review): assumes the stored value is the
                        # property name shown in the menu - confirm.
                        matched = [propertyName
                                   for ID, propertyName in dataList.items()
                                   if str(ID) == propertyInputValue]
                        if not matched:
                            outPut = invalidInput
                            continue
                        propertyList[i] = matched[0]
                    outPut = ''
                    break
                if not proceed:
                    # Cancelled: do not ask for the remaining property
                    # (was 'continue', which still prompted the category).
                    break
            if not proceed:
                continue

            # --- Main (i == 0) and sub (i == 1) descriptions ---
            for i in range(0, 2):
                stateString = [' / Main discription', ' / Sub discription']
                targetString = ['main discription', 'sub discription']
                newList = []
                while True:
                    listString = ' data: ' + ', '.join(newList)
                    if inputValue != 'a':
                        # NOTE(review): productDic is indexed by the menu
                        # number here - verify against how it is keyed.
                        listString += '(prev data: ' + ', '.join(
                            productDic[targetIndex][i + 3]) + ')'
                    main.ShowTitle(
                        outPutWork + stateString[i] + listString + '\n'
                        + outPut, title)
                    print(
                        'Enter ' + targetString[i]
                        + ' !NO SPACE ENTER! (%q to cancel add / %f to finish add',
                        end='')
                    if inputValue != 'a':
                        print(' / %s to skip', end='')
                    discriptionInputValue = input('): ')
                    if discriptionInputValue == '%q':
                        # Was 'proceed == False', so a cancel was still saved.
                        proceed = False
                        outPut = ''
                        break
                    elif discriptionInputValue == '%f':
                        outPut = ''
                        break
                    elif discriptionInputValue == '%s':
                        if inputValue != 'a':
                            # None marks "keep the previous data".
                            newList = None
                            outPut = ''
                            break
                        else:
                            outPut = 'Please input without %'
                            continue
                    newList.append(discriptionInputValue)
                    outPut = ''
                if not proceed:
                    break
                DiscriptionList.append(newList)

            if proceed:
                if inputValue == 'a':
                    ProductDictionaryAppend(name, propertyList[0],
                                            propertyList[1],
                                            DiscriptionList[0],
                                            DiscriptionList[1])
                else:
                    ProductDictionaryModify(targetIndex, name,
                                            propertyList[0], propertyList[1],
                                            DiscriptionList[0],
                                            DiscriptionList[1])
def Dividing(reviewData, fileName, title=None, outPut=None):
    """Assign the reviews of one file to products and queue the DB changes.

    Each entry of *reviewData* is a comma-separated record
    ``<number>,<title>,<body...>``. Reviews already stored (matched by
    ``<fileName>-<number>``) are skipped, as are records whose title is the
    marker ``'!e'``. For every product named in a review, a count update and
    a review insert are queued and the per-product word counts in the
    module-level ``productSimilarDic`` are increased. Finally the queued
    queries are sent to the database.

    Args:
        reviewData: Sized iterable of raw review records (strings).
        fileName: Source file name, used as prefix of the review number.
        title: Title text for the progress display (defaults to '').
        outPut: Text shown before the progress line (defaults to '').

    Returns:
        A completion summary line, or ``'No data in <fileName>'`` as soon as
        a record with fewer than two fields is met (nothing is sent to the
        database in that case).

    NOTE(review): product names and review text are interpolated into the
    SQL text unescaped.
    """
    global similarInsertQuery
    global similarUpdateQuery
    global insertQuery
    global updateQuery
    if title is None:
        title = ''
    if outPut is None:
        outPut = ''
    similarInsertQuery = []
    similarUpdateQuery = []
    insertQuery = []
    updateQuery = []
    completeIndex = 0
    noProductIndex = 0
    skippedIndex = 0

    def countSuffix():
        # ' (skipped: N / not product: M)', or '' when both counts are zero.
        parts = []
        if skippedIndex > 0:
            parts.append('skipped: ' + str(skippedIndex))
        if noProductIndex > 0:
            parts.append('not product: ' + str(noProductIndex))
        return ' (' + ' / '.join(parts) + ')' if parts else ''

    sqlResult = DataBaseManager.DoSQL("""
        SELECT Review_ID, Review_Number
        FROM review_dic
    """)
    # A set gives O(1) membership tests; scanning dict.values() for every
    # review made the loop quadratic.
    completedNumbers = {row[1] for row in sqlResult}

    updateTime = 0
    for data in reviewData:
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Building dictionary for ' + fileName + ' ('
                + str(completeIndex + skippedIndex + noProductIndex) + '/'
                + str(len(reviewData)) + ')' + countSuffix())

        splitData = data.split(',')
        if len(splitData) < 2:
            return 'No data in ' + fileName
        reviewNumber = fileName + '-' + splitData.pop(0)
        reviewTitleString = splitData.pop(0).replace('\n', '').replace(
            ';', ',')
        # ';' would terminate a statement in the ';'-joined query batches.
        reviewString = ''.join(splitData).replace('\n', '').replace(';', ',')

        if reviewNumber in completedNumbers:
            skippedIndex += 1
            continue
        if reviewTitleString == '!e':
            continue

        reviewTitleStringList = NLP.DoNLP(reviewTitleString, None, 'Review')
        reviewStringList = NLP.DoNLP(reviewString, None, 'Review')
        resultList = GetProductName(
            ' '.join(reviewTitleStringList),
            ' '.join(reviewStringList)).get('product_Name')
        if len(resultList) <= 0:
            noProductIndex += 1
            continue

        resultStringList = DictionaryBuilder.ConvertNormalWord(
            mainStringList=reviewTitleStringList,
            subStringList=reviewStringList, mode='Review')
        resultString = '#'.join(resultStringList)
        for name in resultList:
            updateQuery.append("""
                UPDATE product_dic
                SET Count = Count + 1
                WHERE Product_Name = '""" + name + """'
            """)
            insertQuery.append("""
                INSERT INTO review_dic (Review_Number, Review, Product_ID)
                VALUES ('""" + reviewNumber + """', '""" + resultString
                               + """', """
                               + str(productDic[name].get('productID')) + """)
            """)
            for word in resultStringList:
                wordCounts = productSimilarDic.setdefault(name, {})
                wordCounts[word] = wordCounts.get(word, 0) + 1
        completeIndex += 1

    returnString = ("Complete building dictionary for " + fileName
                    + countSuffix() + '\n')

    AppendWordDicQuery(title=title, outPut=outPut + returnString)
    DataBaseManager.DoManyQuery(insertQuery, title=title,
                                outPut=outPut + returnString,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery, title=title,
                                outPut=outPut + returnString,
                                queryType='UPDATE')
    DataBaseManager.DoManyQuery(similarInsertQuery, 'db_capstone_similarity',
                                title=title, outPut=outPut + returnString,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(similarUpdateQuery, 'db_capstone_similarity',
                                title=title, outPut=outPut + returnString,
                                queryType='UPDATE')
    return returnString
def AppendArticleDic(reviewData, fileName, title=None, outPut=None):
    """Queue the articles of one file for insertion and count their words.

    Each entry of *reviewData* is a comma-separated record
    ``<number>,<text...>``. Articles already stored (matched by
    ``<fileName>-<number>``) are skipped, as are records whose first text
    field is the marker ``'!e'``. The remaining articles are normalised with
    ``ConvertNormalWord``, queued as INSERTs and their words are counted;
    the counts are handed to ``AppendWordDicQuery`` before the queued
    queries are sent.

    Args:
        reviewData: Sized iterable of raw article records (strings).
        fileName: Source name, used as prefix of the article number.
        title: Title text for the progress display (defaults to '').
        outPut: Text shown before the progress line (defaults to '').

    Returns:
        A completion summary line, or ``'No data in <fileName>'`` as soon as
        a record with fewer than two fields is met (nothing is sent to the
        database in that case).

    NOTE(review): article text is interpolated into the SQL text unescaped.
    """
    global insertQuery
    global updateQuery
    if title is None:
        title = ''
    if outPut is None:
        outPut = ''
    insertQuery = []
    updateQuery = []
    stackUnit = DataBaseManager.maximumQueryStactUnit
    completeIndex = 0
    skippedIndex = 0

    articleLastID = DataBaseManager.DoSQL("""
        SELECT `AUTO_INCREMENT`
        FROM INFORMATION_SCHEMA.TABLES
        WHERE TABLE_SCHEMA = 'db_capstone'
        AND TABLE_NAME = 'article_dic';
    """)[0][0]

    # Read the existing article numbers in ID windows of stackUnit rows.
    articleNumberList = []
    index = 0
    while True:
        articleNumberList.extend(
            DataBaseManager.DoSQL("""
                SELECT Article_ID, Article_Number
                FROM article_dic
                WHERE Article_ID > """ + str(index) + """
                AND Article_ID <= """ + str(index + stackUnit)))
        index += stackUnit
        if index > articleLastID:
            break
    # A set gives O(1) membership tests; scanning dict.values() for every
    # article made the loop quadratic.
    completedNumbers = {row[1] for row in articleNumberList}

    wordDic = {}
    main.ShowTitle(
        title,
        outPut + 'Building dictionary for ' + fileName + ' ('
        + str(completeIndex) + '/' + str(len(reviewData)) + ')')
    updateTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
    for data in reviewData:
        currentTime = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if updateTime < currentTime:
            updateTime = currentTime
            main.ShowTitle(
                title,
                outPut + 'Building dictionary for ' + fileName + ' ('
                + str(completeIndex + skippedIndex) + '/'
                + str(len(reviewData)) + ')')

        splitData = data.split(',')
        if len(splitData) < 2:
            return 'No data in ' + fileName
        reviewNumber = fileName + '-' + splitData.pop(0)
        firstField = splitData.pop(0)
        reviewString = firstField + ',' + ''.join(splitData)
        # ';' would terminate a statement in the ';'-joined query batches.
        reviewString = reviewString.replace('\n', '').replace(';', ',')

        if reviewNumber in completedNumbers:
            skippedIndex += 1
            continue
        # Test the marker on the first text field: the joined string always
        # has ',' appended, so comparing it with '!e' could never match.
        if firstField.replace('\n', '') == '!e':
            continue

        resultStringList = ConvertNormalWord(reviewString, reviewString)
        resultString = '#'.join(resultStringList)
        insertQuery.append("""
            INSERT INTO article_dic (Article_Number, Article)
            VALUES ('""" + reviewNumber + """', '""" + resultString
                           + """')""")
        for word in resultStringList:
            wordDic[word] = wordDic.get(word, 0) + 1
        completeIndex += 1

    AppendWordDicQuery(wordDic)

    returnString = "Complete building dictionary for " + fileName
    if skippedIndex > 0:
        returnString += (' (skipped ' + str(skippedIndex) + ' of '
                         + str(len(reviewData)) + ' review)')
    returnString += '\n'

    DataBaseManager.DoManyQuery(insertQuery, title=title,
                                outPut=outPut + returnString,
                                queryType='INSERT')
    DataBaseManager.DoManyQuery(updateQuery, title=title,
                                outPut=outPut + returnString,
                                queryType='UPDATE')
    return returnString