def tfidf_knn(train_file):
    """Fit a 10-nearest-neighbour classifier on averaged TF-IDF rows and score it.

    Every line of ``train_file`` whose first token is one of the column names
    returned by ``buildwd.buildWD`` becomes one feature row: the mean of the
    ``idf[0]`` rows belonging to its in-vocabulary words (tokens from index 2
    onward, after ``buildwd.processWord``).  The first 70% of the rows, in
    file order and without shuffling, train the classifier; the remaining
    rows are used for scoring.

    Args:
        train_file: Path of the whitespace-tokenised training file.

    Returns:
        Whatever ``knn.score`` returns for the held-out 30% of rows
        (mean accuracy for scikit-learn classifiers).
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    num_features = wd[0].shape[1]
    trainMat = np.zeros((len(colnames), num_features))

    # One-off word -> first position map; replaces the per-word
    # `in rownames` + `rownames.index(...)` double linear scan.
    # NOTE(review): assumes vocabulary entries are hashable (strings) - confirm.
    row_of = {}
    for position, name in enumerate(rownames):
        row_of.setdefault(name, position)

    matCol = 0
    # `with` closes the file even if parsing raises part-way through.
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no identifier token; skip instead of crashing.
            if not words or words[0] not in colnames:
                continue
            trainRow = np.zeros(num_features)
            numWords = 0
            # words[1] is not used as a feature (presumably the subject label).
            for word in words[2:]:
                position = row_of.get(buildwd.processWord(word))
                if position is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][position]
            # Leave the row at zero when no word is in the vocabulary; dividing
            # by zero would fill it with NaN and break the classifier.
            if numWords > 0:
                trainRow = trainRow / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)
    # Slice bounds must be integers; float bounds raise TypeError.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
def tfidf_knn(train_file):
    """Fit a 10-nearest-neighbour classifier on averaged TF-IDF rows and score it.

    Every line of ``train_file`` whose first token is one of the column names
    returned by ``buildwd.buildWD`` becomes one feature row: the mean of the
    ``idf[0]`` rows belonging to its in-vocabulary words (tokens from index 2
    onward, after ``buildwd.processWord``).  The first 70% of the rows, in
    file order and without shuffling, train the classifier; the remaining
    rows are used for scoring.

    Args:
        train_file: Path of the whitespace-tokenised training file.

    Returns:
        Whatever ``knn.score`` returns for the held-out 30% of rows
        (mean accuracy for scikit-learn classifiers).
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    num_features = wd[0].shape[1]
    trainMat = np.zeros((len(colnames), num_features))

    # One-off word -> first position map; replaces the per-word
    # `in rownames` + `rownames.index(...)` double linear scan.
    # NOTE(review): assumes vocabulary entries are hashable (strings) - confirm.
    row_of = {}
    for position, name in enumerate(rownames):
        row_of.setdefault(name, position)

    matCol = 0
    # `with` closes the file even if parsing raises part-way through.
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no identifier token; skip instead of crashing.
            if not words or words[0] not in colnames:
                continue
            trainRow = np.zeros(num_features)
            numWords = 0
            # words[1] is not used as a feature (presumably the subject label).
            for word in words[2:]:
                position = row_of.get(buildwd.processWord(word))
                if position is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][position]
            # Leave the row at zero when no word is in the vocabulary; dividing
            # by zero would fill it with NaN and break the classifier.
            if numWords > 0:
                trainRow = trainRow / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)
    # Slice bounds must be integers; float bounds raise TypeError.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
def get_tfidf_logreg(train_file):
    """Fit a logistic-regression model on shuffled, averaged TF-IDF rows.

    Every line of ``train_file`` whose first token is one of the column names
    returned by ``buildwd.buildWD`` becomes one feature row: the mean of the
    ``idf[0]`` rows belonging to its in-vocabulary words (tokens from index 2
    onward, after ``buildwd.processWord``).  Rows and labels are then shuffled
    with a fixed seed and the model is fitted on the first 70% of them.

    Args:
        train_file: Path of the whitespace-tokenised training file.

    Returns:
        A tuple ``(logreg, train, labels, cutoff)``: the fitted model, the
        shuffled list of feature rows, the matching list of labels, and the
        index separating the training part (``[:cutoff]``) from the held-out
        part (``[cutoff:]``).
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    num_features = wd[0].shape[1]
    trainMat = np.zeros((len(colnames), num_features))

    # One-off word -> first position map; replaces the per-word
    # `in rownames` + `rownames.index(...)` double linear scan.
    # NOTE(review): assumes vocabulary entries are hashable (strings) - confirm.
    row_of = {}
    for position, name in enumerate(rownames):
        row_of.setdefault(name, position)

    matCol = 0
    # `with` closes the file even if parsing raises part-way through.
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no identifier token; skip instead of crashing.
            if not words or words[0] not in colnames:
                continue
            trainRow = np.zeros(num_features)
            numWords = 0
            # words[1] is not used as a feature (presumably the subject label).
            for word in words[2:]:
                position = row_of.get(buildwd.processWord(word))
                if position is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][position]
            # A line without vocabulary words keeps an all-zero row.
            if numWords > 0:
                trainRow = trainRow / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # Deterministic shuffle.  The module-level generator is seeded on purpose
    # so the ordering (and the global random state afterwards) matches what
    # callers got before.  `range` must be materialised as a list because
    # random.shuffle needs a mutable sequence on Python 3.
    random.seed(17)
    order = list(range(len(subjects)))
    random.shuffle(order)
    train = [trainMat[i] for i in order]
    labels = [trainVals[i] for i in order]

    cutoff = int(len(order) * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(train[:cutoff], labels[:cutoff])
    return logreg, train, labels, cutoff
def get_tfidf_logreg(train_file):
    """Fit a logistic-regression model on shuffled, averaged TF-IDF rows.

    Every line of ``train_file`` whose first token is one of the column names
    returned by ``buildwd.buildWD`` becomes one feature row: the mean of the
    ``idf[0]`` rows belonging to its in-vocabulary words (tokens from index 2
    onward, after ``buildwd.processWord``).  Rows and labels are then shuffled
    with a fixed seed and the model is fitted on the first 70% of them.

    Args:
        train_file: Path of the whitespace-tokenised training file.

    Returns:
        A tuple ``(logreg, train, labels, cutoff)``: the fitted model, the
        shuffled list of feature rows, the matching list of labels, and the
        index separating the training part (``[:cutoff]``) from the held-out
        part (``[cutoff:]``).
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    num_features = wd[0].shape[1]
    trainMat = np.zeros((len(colnames), num_features))

    # One-off word -> first position map; replaces the per-word
    # `in rownames` + `rownames.index(...)` double linear scan.
    # NOTE(review): assumes vocabulary entries are hashable (strings) - confirm.
    row_of = {}
    for position, name in enumerate(rownames):
        row_of.setdefault(name, position)

    matCol = 0
    # `with` closes the file even if parsing raises part-way through.
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no identifier token; skip instead of crashing.
            if not words or words[0] not in colnames:
                continue
            trainRow = np.zeros(num_features)
            numWords = 0
            # words[1] is not used as a feature (presumably the subject label).
            for word in words[2:]:
                position = row_of.get(buildwd.processWord(word))
                if position is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][position]
            # A line without vocabulary words keeps an all-zero row.
            if numWords > 0:
                trainRow = trainRow / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # Deterministic shuffle.  The module-level generator is seeded on purpose
    # so the ordering (and the global random state afterwards) matches what
    # callers got before.  `range` must be materialised as a list because
    # random.shuffle needs a mutable sequence on Python 3.
    random.seed(17)
    order = list(range(len(subjects)))
    random.shuffle(order)
    train = [trainMat[i] for i in order]
    labels = [trainVals[i] for i in order]

    cutoff = int(len(order) * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(train[:cutoff], labels[:cutoff])
    return logreg, train, labels, cutoff
def tfidf_shallownn(train_file):
    """Train a shallow neural network on averaged TF-IDF rows and score it.

    Every line of ``train_file`` whose first token is one of the column names
    returned by ``buildwd.buildWD`` becomes one feature row: the mean of the
    ``idf[0]`` rows belonging to its in-vocabulary words (tokens from index 2
    onward, after ``buildwd.processWord``).  Labels are one-hot encoded in two
    columns (column 0 for 'Sports', column 1 for 'Politics'; any other
    subject leaves its row all zero).  The first 70% of the rows, in file
    order, train the network; the remaining rows are used for scoring.

    Args:
        train_file: Path of the whitespace-tokenised training file.

    Returns:
        Whatever ``snn.score`` returns for the held-out 30% of rows.
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    num_features = wd[0].shape[1]
    trainMat = np.zeros((len(colnames), num_features))

    # One-off word -> first position map; replaces the per-word
    # `in rownames` + `rownames.index(...)` double linear scan.
    # NOTE(review): assumes vocabulary entries are hashable (strings) - confirm.
    row_of = {}
    for position, name in enumerate(rownames):
        row_of.setdefault(name, position)

    matCol = 0
    # `with` closes the file even if parsing raises part-way through.
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no identifier token; skip instead of crashing.
            if not words or words[0] not in colnames:
                continue
            trainRow = np.zeros(num_features)
            numWords = 0
            # words[1] is not used as a feature (presumably the subject label).
            for word in words[2:]:
                position = row_of.get(buildwd.processWord(word))
                if position is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][position]
            # Leave the row at zero when no word is in the vocabulary; dividing
            # by zero would fill it with NaN and poison training.
            if numWords > 0:
                trainRow = trainRow / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    # One-hot targets: column 0 marks 'Sports', column 1 marks 'Politics'.
    trainVals = np.zeros((len(subjects), 2))
    for row, subject in enumerate(subjects):
        if subject == 'Sports':
            trainVals[row, 0] = 1
        elif subject == 'Politics':
            trainVals[row, 1] = 1

    # Slice bounds must be integers; float bounds raise TypeError.
    cutoff = int(trainMat.shape[0] * 0.7)
    snn = shallownn.ShallowNeuralNetwork(input_dim=trainMat.shape[1], hidden_dim=5, output_dim=2)
    snn.train(trainMat[:cutoff, :], trainVals[:cutoff, :], display_progress=True, maxiter=10)
    return snn.score(trainMat[cutoff:, :], trainVals[cutoff:, :])
def tfidf_shallownn(train_file):
    """Train a shallow neural network on averaged TF-IDF rows and score it.

    Every line of ``train_file`` whose first token is one of the column names
    returned by ``buildwd.buildWD`` becomes one feature row: the mean of the
    ``idf[0]`` rows belonging to its in-vocabulary words (tokens from index 2
    onward, after ``buildwd.processWord``).  Labels are one-hot encoded in two
    columns (column 0 for "Sports", column 1 for "Politics"; any other
    subject leaves its row all zero).  The first 70% of the rows, in file
    order, train the network; the remaining rows are used for scoring.

    Args:
        train_file: Path of the whitespace-tokenised training file.

    Returns:
        Whatever ``snn.score`` returns for the held-out 30% of rows.
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    num_features = wd[0].shape[1]
    trainMat = np.zeros((len(colnames), num_features))

    # One-off word -> first position map; replaces the per-word
    # `in rownames` + `rownames.index(...)` double linear scan.
    # NOTE(review): assumes vocabulary entries are hashable (strings) - confirm.
    row_of = {}
    for position, name in enumerate(rownames):
        row_of.setdefault(name, position)

    matCol = 0
    # `with` closes the file even if parsing raises part-way through.
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no identifier token; skip instead of crashing.
            if not words or words[0] not in colnames:
                continue
            trainRow = np.zeros(num_features)
            numWords = 0
            # words[1] is not used as a feature (presumably the subject label).
            for word in words[2:]:
                position = row_of.get(buildwd.processWord(word))
                if position is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][position]
            # Leave the row at zero when no word is in the vocabulary; dividing
            # by zero would fill it with NaN and poison training.
            if numWords > 0:
                trainRow = trainRow / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    # One-hot targets: column 0 marks "Sports", column 1 marks "Politics".
    trainVals = np.zeros((len(subjects), 2))
    for row, subject in enumerate(subjects):
        if subject == "Sports":
            trainVals[row, 0] = 1
        elif subject == "Politics":
            trainVals[row, 1] = 1

    # Slice bounds must be integers; float bounds raise TypeError.
    cutoff = int(trainMat.shape[0] * 0.7)
    snn = shallownn.ShallowNeuralNetwork(input_dim=trainMat.shape[1], hidden_dim=5, output_dim=2)
    snn.train(
        trainMat[:cutoff, :],
        trainVals[:cutoff, :],
        display_progress=True,
        maxiter=10,
    )
    return snn.score(trainMat[cutoff:, :], trainVals[cutoff:, :])