def getIDFVector(self, documentList):
    """ Build a vector of IDF weights, one per unique keyword in the corpus """
    # Merge all documents into one string so the vocabulary can be extracted in one pass
    vocabularyString = " ".join(documentList)
    wordList = self.parser.tokenise(vocabularyString)
    wordList = self.parser.removeStopWords(wordList)
    uniqWordList = util.removeDuplicates(wordList)
    IDFvector = [tfidf.idf(word, documentList) for word in uniqWordList]
    return IDFvector
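# For reference, a minimal sketch of what the tfidf.idf helper above conventionally
# computes; this is an assumption about its behaviour, not the module's actual source.
import math

def idf(word, documentList):
    # log of corpus size over the number of documents containing the word
    # (0.0 if it appears nowhere; whitespace split is a rough stand-in for the parser)
    docCount = sum(1 for doc in documentList if word in doc.split())
    return math.log(len(documentList) / float(docCount)) if docCount else 0.0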
def wordProcess(lst):
    parser = Parser()
    termString = parser.clean(lst)
    termLst = parser.tokenize(termString)
    termLst = parser.removeStopWords(termLst)
    termLst = util.removeDuplicates(termLst)
    return termLst
def getVectorKeywordIndexSeprated(self, documentList):
    """ Map each document to its own list of unique keywords """
    vectorIndex = []
    for document in documentList:
        vocabularyList = self.parser.tokenise(document)
        # Remove common words which have no search value
        vocabularyList = self.parser.removeStopWords(vocabularyList)
        uniqueVocabularyList = util.removeDuplicates(vocabularyList)
        vectorIndex.append(uniqueVocabularyList)
    return vectorIndex  # one list of unique keywords per document
def getVectorKeywordIndex(self, documentList):
    """ Map each unique keyword to its position (dimension) within the document vectors """
    # Merge documents into a single string
    vocabularyString = " ".join(documentList)
    vocabularyList = self.parser.tokenise(vocabularyString)
    # Remove common words which have no search value
    vocabularyList = self.parser.removeStopWords(vocabularyList)
    uniqueVocabularyList = util.removeDuplicates(vocabularyList)
    vectorIndex = {}
    offset = 0
    # Associate a position with each keyword; it maps to the dimension of the vector used to represent this word
    for word in uniqueVocabularyList:
        vectorIndex[word] = offset
        offset += 1
    return vectorIndex  # (keyword: position)
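# A self-contained illustration of the mapping returned above, on a toy corpus;
# the real method also tokenises and removes stop words, which this sketch skips.
documents = ["cats like milk", "dogs like bones"]
uniqueWords = []
for doc in documents:
    for w in doc.split():
        if w not in uniqueWords:
            uniqueWords.append(w)
vectorIndex = {word: offset for offset, word in enumerate(uniqueWords)}
print(vectorIndex)  # {'cats': 0, 'like': 1, 'milk': 2, 'dogs': 3, 'bones': 4}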
# requires "import re" at module level
def getVectorKeywordIndex(self, documentList):
    """ Map each unique keyword to its position within the document vectors """
    # Merge documents into a single string; this variant splits on whitespace
    # directly instead of using the parser, so no stop words are removed
    vocabularyString = " ".join(documentList)
    vocabularyString = re.sub(r"\s+", " ", vocabularyString)
    vocabularyList = vocabularyString.split(" ")
    uniqueVocabularyList = util.removeDuplicates(vocabularyList)
    vectorIndex = {}
    offset = 0
    # Associate a position with each keyword; it maps to the dimension of the vector used to represent this word
    for word in uniqueVocabularyList:
        vectorIndex[word] = offset
        offset += 1
    return vectorIndex  # (keyword: position)
def makeIDFVector(self, documentList):
    """ Build the IDF vector over the full keyword index """
    outputVector = [0] * len(self.vectorKeywordIndex)
    docNumber = len(documentList)
    for doc in documentList:
        docTemp = self.parser.tokenise(doc)
        docTemp = self.parser.removeStopWords(docTemp)
        uniqueDocTemp = util.removeDuplicates(docTemp)
        for key in uniqueDocTemp:
            outputVector[self.vectorKeywordIndex[key]] += 1  # DF: number of documents containing the word
    # Convert DF to IDF: log(N / DF); assumes every indexed word occurs in at least one document
    for i in range(len(outputVector)):
        outputVector[i] = math.log(docNumber / outputVector[i])  # IDF
    return outputVector
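# A toy run of the DF-to-IDF step above, assuming a hypothetical
# three-document corpus (natural logarithm, N = 3).
import math

docs = ["a b", "a c", "a d"]  # document frequencies: a -> 3, b/c/d -> 1 each
print(math.log(3 / 3))  # idf("a") = 0.0    (in every doc, carries no information)
print(math.log(3 / 1))  # idf("b") ~ 1.0986 (rare words are weighted up)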
t1 = datetime.now()
for file in os.listdir("../lyrics_process"):
    top5 = [0] * 5
    top5word = [""] * 5
    f = open("../lyrics_process/" + file, 'r')
    content = f.read()
    content = content.split(' ')
    f.close()
    if content[0] == "":
        content = content[1:]
    DiffTerm = util.removeDuplicates(content)
    print("=== Counting TF-IDF ===")
    for word in DiffTerm:
        print(word)
        # TF-IDF over the 81975-document corpus; +1 avoids division by zero
        tfidf = content.count(word) * math.log(81975 / (1 + WordFreInDoc[WordIndex[word]]))
        print("----- TOP5 Word -----")
        for index in range(0, 5):
            if tfidf > top5[index]:
                # The original snippet ends here; an assumed completion that keeps
                # the five highest-scoring words in descending order:
                top5.insert(index, tfidf)
                top5word.insert(index, word)
                top5.pop()
                top5word.pop()
                break
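# The score computed in the loop above, factored out for clarity; a sketch only,
# with illustrative parameter names (tf = raw term count, df = document frequency).
import math

def tfidf_score(tf, df, N=81975):
    # raw term frequency times smoothed inverse document frequency
    return tf * math.log(N / (1 + df))

print(tfidf_score(tf=3, df=40))  # e.g. a word used 3 times that appears in 40 of the 81975 docs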
import json
import os
from datetime import datetime

word = json.load(open("./allword.json"))
words = json.loads(word)  # the file stores a JSON-encoded string, so it is decoded twice
WordIDF = [0] * len(words)  # holds raw document frequencies; the log is applied downstream
t1 = datetime.now()
for file in os.listdir("../lyrics_tfidf"):
    f = open("../lyrics_tfidf/" + file, 'r')
    content = f.read()
    data = json.loads(content)
    d = json.loads(data)  # doubly encoded, as above
    diffword = util.removeDuplicates(d)
    print("Counting all DOCs")
    for w in diffword:
        WordIDF[words[w]] += 1  # one count per document containing the word
    f.close()
t2 = datetime.now()
t3 = t2 - t1
print("Process Time: " + str(t3.seconds / 60))
print("Dump json file")
f = open("./IDF_final.json", "w")
json_array = json.dumps(WordIDF)
f.write(json_array)  # assumed completion: the snippet ends before the write
f.close()
data = json.loads(d)
#print(len(data))
WordInDoc = [0] * len(data)
print("Start counting one word in how many docs...")
t1 = datetime.now()
for file in os.listdir("../lyrics"):
    f = open("../lyrics/" + file, 'r')
    content = f.read()
    content = parser.tokenise(content)
    content = parser.removeStopWords(content)
    content = util.removeDuplicates(content)
    f.close()
    print("===== Add in Vector ======")
    for word in content:
        WordInDoc[data[word]] += 1  # one count per document containing the word
    print("...Next round...")
t2 = datetime.now()
t3 = t2 - t1
print("Process Time: " + str(t3.seconds / 60))
print("Dump json file")
f = open("./IDF.json", "w")
f.write(json.dumps(WordInDoc))  # assumed completion: the snippet ends after the open
f.close()