def processTrainData(filename):
	"""Load and preprocess the training corpus from *filename*.

	Each row of the data set becomes a Comment holding its raw content,
	label, and a term-frequency dict of its normalized/stemmed tokens.

	Returns a tuple of:
	  - listOfTrainComments: list[Comment] with token lists attached
	  - listOfUniqueTokens: vocabulary — tokens appearing in >= 5 documents
	  - invertedDocumentFrequencyOfTokens: token -> 1 + log2(N / df)
	  - documentFrequencyOfTokens: token -> number of documents containing it
	"""
	listOfTrainComments = []
	documentFrequencyOfTokens = {}
	xVal, yVal = rd.loadDataSet(filename)
	for i in range(xVal.shape[0]):
		comment = Comment(i)
		comment.setContent(xVal[i])
		comment.setStatus(yVal[i])
		listOfTrainComments.append(comment)

	for comment in listOfTrainComments:
		# normalize -> tokenize -> drop stop words -> stem
		content = fext.commentNormalizer(comment.getContent())
		tokenList = fext.commentTokenizer(content)
		tokenList = fext.removeStopWords(tokenList)
		tokenList = fext.commentStemmer(tokenList)

		dicTokens = calculateTermFrequency(tokenList)
		comment.setTokenList(dicTokens)
		# dicTokens keys are the document's distinct tokens, so each
		# token contributes at most 1 per document to document frequency
		for token in dicTokens:
			documentFrequencyOfTokens[token] = documentFrequencyOfTokens.get(token, 0) + 1

	# BUG FIX: the original first concatenated EVERY raw token (duplicates
	# included) into listOfUniqueTokens and then appended the df>=5 keys on
	# top, so the "unique token" list was neither unique nor the intended
	# vocabulary. Keep only tokens seen in at least 5 documents.
	listOfUniqueTokens = [token for token, df in documentFrequencyOfTokens.items() if df >= 5]

	totalNumberOfDoc = len(listOfTrainComments)
	invertedDocumentFrequencyOfTokens = {
		token: 1 + np.log2(totalNumberOfDoc / df)
		for token, df in documentFrequencyOfTokens.items()
	}

	return (listOfTrainComments, listOfUniqueTokens, invertedDocumentFrequencyOfTokens, documentFrequencyOfTokens)
def processTestData(filename):
	"""Load and preprocess the test corpus from *filename*.

	Mirrors the training pipeline (normalize -> tokenize -> remove stop
	words -> stem -> term frequency) but computes no corpus statistics.

	Returns list[Comment] with content, status, and term-frequency
	token dict set on each comment.
	"""
	listOfTestComments = []
	xVal, yVal = rd.loadDataSet(filename)
	for i in range(xVal.shape[0]):
		comment = Comment(i)
		comment.setContent(xVal[i])
		comment.setStatus(yVal[i])
		listOfTestComments.append(comment)

	for comment in listOfTestComments:
		# same normalization chain as processTrainData so test tokens
		# match the training vocabulary
		content = fext.commentNormalizer(comment.getContent())
		tokenList = fext.commentTokenizer(content)
		tokenList = fext.removeStopWords(tokenList)
		tokenList = fext.commentStemmer(tokenList)
		comment.setTokenList(calculateTermFrequency(tokenList))

	return listOfTestComments
Example #3
0
def processTrainData(filename):
    """Load and preprocess the training corpus from *filename*.

    Each row of the data set becomes a Comment holding its raw content,
    label, and a term-frequency dict of its normalized/stemmed tokens.

    Returns a tuple of:
      - listOfTrainComments: list[Comment] with token lists attached
      - listOfUniqueTokens: vocabulary — tokens appearing in >= 5 documents
      - invertedDocumentFrequencyOfTokens: token -> 1 + log2(N / df)
      - documentFrequencyOfTokens: token -> number of documents containing it
    """
    listOfTrainComments = []
    documentFrequencyOfTokens = {}
    xVal, yVal = rd.loadDataSet(filename)
    for i in range(xVal.shape[0]):
        comment = Comment(i)
        comment.setContent(xVal[i])
        comment.setStatus(yVal[i])
        listOfTrainComments.append(comment)

    for comment in listOfTrainComments:
        # normalize -> tokenize -> drop stop words -> stem
        content = fext.commentNormalizer(comment.getContent())
        tokenList = fext.commentTokenizer(content)
        tokenList = fext.removeStopWords(tokenList)
        tokenList = fext.commentStemmer(tokenList)

        dicTokens = calculateTermFrequency(tokenList)
        comment.setTokenList(dicTokens)
        # dicTokens keys are the document's distinct tokens, so each
        # token contributes at most 1 per document to document frequency
        for token in dicTokens:
            documentFrequencyOfTokens[token] = documentFrequencyOfTokens.get(token, 0) + 1

    # BUG FIX: the original first concatenated EVERY raw token (duplicates
    # included) into listOfUniqueTokens and then appended the df>=5 keys on
    # top, so the "unique token" list was neither unique nor the intended
    # vocabulary. Keep only tokens seen in at least 5 documents.
    listOfUniqueTokens = [token for token, df in documentFrequencyOfTokens.items() if df >= 5]

    totalNumberOfDoc = len(listOfTrainComments)
    invertedDocumentFrequencyOfTokens = {
        token: 1 + np.log2(totalNumberOfDoc / df)
        for token, df in documentFrequencyOfTokens.items()
    }

    return (listOfTrainComments, listOfUniqueTokens,
            invertedDocumentFrequencyOfTokens, documentFrequencyOfTokens)
Example #4
0
def processTestData(filename):
    """Load and preprocess the test corpus from *filename*.

    Mirrors the training pipeline (normalize -> tokenize -> remove stop
    words -> stem -> term frequency) but computes no corpus statistics.

    Returns list[Comment] with content, status, and term-frequency
    token dict set on each comment.
    """
    listOfTestComments = []
    xVal, yVal = rd.loadDataSet(filename)
    for i in range(xVal.shape[0]):
        comment = Comment(i)
        comment.setContent(xVal[i])
        comment.setStatus(yVal[i])
        listOfTestComments.append(comment)

    for comment in listOfTestComments:
        # same normalization chain as processTrainData so test tokens
        # match the training vocabulary
        content = fext.commentNormalizer(comment.getContent())
        tokenList = fext.commentTokenizer(content)
        tokenList = fext.removeStopWords(tokenList)
        tokenList = fext.commentStemmer(tokenList)
        comment.setTokenList(calculateTermFrequency(tokenList))

    return listOfTestComments