def getIDFVector(self, documentList):
     """Return the IDF value of every unique keyword found across documentList.

     The documents are merged into one string, tokenised, stripped of stop
     words, and de-duplicated; tfidf.idf is then evaluated per keyword.
     """
     combinedText = " ".join(documentList)
     tokens = self.parser.tokenise(combinedText)
     tokens = self.parser.removeStopWords(tokens)
     uniqueTokens = util.removeDuplicates(tokens)
     return [tfidf.idf(term, documentList) for term in uniqueTokens]
Ejemplo n.º 2
0
def wordProcess(lst):
	"""Normalise raw text into a de-duplicated list of content terms.

	Pipeline: clean -> tokenize -> remove stop words -> drop duplicates.
	"""
	textParser = Parser()
	cleaned = textParser.clean(lst)
	tokens = textParser.tokenize(cleaned)
	tokens = textParser.removeStopWords(tokens)
	return util.removeDuplicates(tokens)
    def getVectorKeywordIndexSeprated(self, documentList):
        """Return, for each document, its list of unique keywords.

        Unlike the joined variant, each document is tokenised separately,
        so the result is one keyword list per input document.
        """
        perDocumentKeywords = []
        for doc in documentList:
            tokens = self.parser.tokenise(doc)
            # Drop common words that carry no search value.
            tokens = self.parser.removeStopWords(tokens)
            perDocumentKeywords.append(util.removeDuplicates(tokens))
        return perDocumentKeywords  #set[keywords]
Ejemplo n.º 4
0
    def getVectorKeywordIndex(self, documentList):
        """Map each unique keyword to its dimension in the document vectors.

        Returns a dict of keyword -> position, where position is the index
        of the keyword's axis in the vector space model.
        """
        # Collapse every document into one string so tokenisation runs once.
        allText = " ".join(documentList)
        tokens = self.parser.tokenise(allText)
        # Drop common words that carry no search value.
        tokens = self.parser.removeStopWords(tokens)
        uniqueTokens = util.removeDuplicates(tokens)
        # The position in the de-duplicated list becomes the vector dimension.
        return {keyword: position for position, keyword in enumerate(uniqueTokens)}
Ejemplo n.º 5
0
	def getVectorKeywordIndex(self, documentList):
		"""Associate every unique keyword with a dimension of the document vectors.

		Returns a dict of keyword -> position (the keyword's axis index).
		"""
		# Merge all documents so the whole vocabulary is tokenised in one pass.
		merged = " ".join(documentList)
		terms = self.parser.tokenise(merged)
		# Drop common words that carry no search value.
		terms = self.parser.removeStopWords(terms)
		uniqueTerms = util.removeDuplicates(terms)
		# First occurrence order fixes each keyword's dimension.
		keywordIndex = {}
		for position, term in enumerate(uniqueTerms):
			keywordIndex[term] = position
		return keywordIndex  #(keyword:position)
Ejemplo n.º 6
0
    def getVectorKeywordIndex(self, documentList):
        """Map each unique whitespace-delimited word to a vector dimension.

        This variant does no stop-word removal: it simply normalises
        whitespace, splits on single spaces, and indexes unique words.
        Returns a dict of word -> position.
        """
        # Join all documents, then squeeze any whitespace run to one space
        # so a plain split(" ") yields clean tokens.
        joined = re.sub(r"\s+", " ", " ".join(documentList))
        uniqueWords = util.removeDuplicates(joined.split(" "))
        # First-occurrence order determines each word's dimension.
        return {word: dimension for dimension, word in enumerate(uniqueWords)}
    def makeIDFVector(self, documentList):
        """Compute the IDF of every keyword in self.vectorKeywordIndex.

        Returns a list aligned with self.vectorKeywordIndex where each slot
        holds log(N / df): N is len(documentList) and df is the number of
        documents containing that keyword at least once.

        NOTE(review): a keyword with zero document frequency raises
        ZeroDivisionError here — identical to the original behaviour.
        """
        documentFrequency = [0] * len(self.vectorKeywordIndex)
        totalDocs = len(documentList)

        # Count, per keyword, in how many documents it appears (DF).
        for document in documentList:
            terms = self.parser.tokenise(document)
            terms = self.parser.removeStopWords(terms)
            for term in util.removeDuplicates(terms):
                documentFrequency[self.vectorKeywordIndex[term]] += 1

        # Convert each document frequency into an IDF score.
        return [math.log(totalDocs / df) for df in documentFrequency]
Ejemplo n.º 8
0
# Script fragment: for every file under ../lyrics_process, score each of its
# unique words by TF-IDF and track the five best.
# NOTE(review): this snippet is truncated — the body of the final
# `if tfidf > top5[index]:` branch is missing from this chunk, and
# WordFreInDoc / WordIndex / util are defined elsewhere in the file.
t1 = datetime.now()  # wall-clock start for a timing printout elsewhere

for file in os.listdir("../lyrics_process"):

    # Running top-5 scores and the words that achieved them, for this file.
    top5 = [0] * 5
    top5word = [""] * 5

    f = open("../lyrics_process/" + file, 'r')
    content = f.read()
    content = content.split(' ')
    f.close()

    # A leading space in the file produces an empty first token; drop it.
    if content[0] == "":
        content = content[1::]

    DiffTerm = util.removeDuplicates(content)

    print("=== Counting TF-IDF ===")

    for word in DiffTerm:

        print(word)

        # TF (raw count in this file) times a smoothed IDF.
        # 81975 is presumably the corpus document count — TODO confirm.
        tfidf = content.count(word) * math.log(
            81975 / (1 + WordFreInDoc[WordIndex[word]]))

        print("----- TOP5 Word -----")

        for index in range(0, 5):

            if tfidf > top5[index]:
Ejemplo n.º 9
0
from datetime import datetime

# Script fragment: count, for every known word, how many files under
# ../lyrics_tfidf contain it (document frequency), then dump the counts
# to IDF_final.json.
# NOTE(review): truncated — the f.write / f.close for IDF_final.json is
# not visible in this chunk; util and json/os imports are defined elsewhere.

# allword.json holds a JSON-encoded string that itself contains JSON,
# hence the double decode (json.load, then json.loads of the result).
word = json.load(open("./allword.json"))
words = json.loads(word)

# One counter slot per known word; `words` presumably maps word -> index
# (it is used that way below) — TODO confirm against allword.json.
WordIDF = [0] * len(words)

t1 = datetime.now()  # start of the timed counting pass

for file in os.listdir("../lyrics_tfidf"):

    f = open("../lyrics_tfidf/" + file, 'r')
    content = f.read()
    # Each file is also double-encoded JSON: decode twice to get the data.
    data = json.loads(content)
    d = json.loads(data)
    # Count each word at most once per file (document frequency, not TF).
    diffword = util.removeDuplicates(d)

    print("Counting all DOCs")

    for w in diffword:
        WordIDF[words[w]] += 1

    f.close()

t2 = datetime.now()
t3 = t2 - t1
print("Process Time: " + str(t3.seconds / 60))

print("Dump json file")
f = open("./IDF_final.json", "w")
json_array = json.dumps(WordIDF)
Ejemplo n.º 10
0
# Script fragment: count in how many files under ../lyrics each word
# appears (document frequency), then dump the counts to IDF.json.
# NOTE(review): `d` and `parser` are defined earlier in the file, outside
# this chunk; the f.write / f.close for IDF.json is also not visible here.
data = json.loads(d)

#print(len(data))

# One counter slot per known word; `data` presumably maps word -> index
# (it is used that way below) — TODO confirm.
WordInDoc = [0] * len(data)

print("Start counting one word in how many docs...")

t1 = datetime.now()  # start of the timed counting pass

for file in os.listdir("../lyrics"):
    f = open("../lyrics/" + file, 'r')
    content = f.read()
    content = parser.tokenise(content)
    content = parser.removeStopWords(content)
    # Count each word at most once per file (document frequency, not TF).
    content = util.removeDuplicates(content)
    f.close()

    print("===== Add in Vector ======")
    for word in content:
        WordInDoc[data[word]] += 1
    
    print("...Next round...")


t2 = datetime.now()
t3 = t2 - t1
print("Process Time: " + str(t3.seconds/60))

print("Dump json file")
f = open("./IDF.json", "w")