def tfidf_knn(train_file):
    """Fit a 10-nearest-neighbour classifier on averaged tf-idf rows and score it.

    ``buildwd.buildWD(train_file)`` supplies the word/document matrix
    (``wd[0]``), the column names (``wd[1]``), the row names (``wd[2]``) and
    the subjects (``wd[3]``).  Every line of ``train_file`` whose first token
    is a known column name becomes one row of the training matrix: the mean
    of ``idf[0][i]`` over the line's recognised words (tokens from index 2
    onwards, after ``buildwd.processWord``).  A line with no recognised words
    keeps an all-zero row, as ``get_tfidf_logreg`` does, instead of dividing
    by zero.

    The first 70% of the rows are used for fitting, the rest for scoring.

    Args:
        train_file: Path of the training file; it is read line by line.

    Returns:
        The value of ``knn.score`` on the held-out 30% of the rows.
    """
    # NOTE(review): a second definition of tfidf_knn follows this one in the
    # file and overrides it at import time -- consider removing one of them.
    wd = buildwd.buildWD(train_file)
    matrix, colnames, rownames, subjects = wd[0], wd[1], wd[2], wd[3]
    idf = tfidf(matrix, rownames)
    width = matrix.shape[1]
    trainMat = np.zeros((len(colnames), width))

    # First-occurrence position of every row name: the same answer as
    # rownames.index(), without a linear scan for every word.
    # Assumes row/column names are hashable (e.g. strings) -- TODO confirm.
    row_index = {}
    for position, name in enumerate(rownames):
        row_index.setdefault(name, position)
    known_cols = set(colnames)

    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no first token to look up.
            if not words or words[0] not in known_cols:
                continue
            trainRow = np.zeros(width)
            numWords = 0
            for word in words[2:]:
                position = row_index.get(buildwd.processWord(word))
                if position is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][position]
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)
    # Slice bounds must be integers; a float bound is rejected by lists and
    # by current NumPy releases.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
def tfidf_knn(train_file):
    """Train and evaluate a 10-NN classifier on per-line mean tf-idf vectors.

    The data returned by ``buildwd.buildWD(train_file)`` is used as follows:
    ``wd[0]`` is the matrix handed to ``tfidf``, ``wd[1]`` the column names,
    ``wd[2]`` the row names and ``wd[3]`` the subjects that
    ``buildwd.trainValsFromSubjects`` turns into labels.

    For each line of ``train_file`` whose first token is one of the column
    names, the tokens from index 2 onwards are passed through
    ``buildwd.processWord``; the ``idf[0]`` entries of those that are row
    names are averaged into one training row.  Lines without any recognised
    word keep a zero row (no division by zero).

    Args:
        train_file: Path of the training file.

    Returns:
        ``knn.score`` evaluated on the last 30% of the rows, after fitting on
        the first 70%.
    """
    # NOTE(review): an earlier definition of tfidf_knn precedes this one in
    # the file; this later definition is the one the name ends up bound to.
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    n_features = wd[0].shape[1]
    trainMat = np.zeros((len(colnames), n_features))

    # Map each row name to its first position once, instead of calling
    # rownames.index() (a linear scan) for every word of every line.
    # Assumes the names are hashable (e.g. strings) -- TODO confirm.
    first_position = {}
    for idx, name in enumerate(rownames):
        if name not in first_position:
            first_position[name] = idx
    column_set = set(colnames)

    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            if not words:
                # Blank line: nothing to match against the column names.
                continue
            if words[0] in column_set:
                trainRow = np.zeros(n_features)
                numWords = 0
                for word in words[2:]:
                    pword = buildwd.processWord(word)
                    if pword in first_position:
                        numWords += 1
                        trainRow = trainRow + idf[0][first_position[pword]]
                if numWords > 0:
                    trainRow = (trainRow * 1.0) / numWords
                trainMat[matCol, :] = trainRow
                matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)
    # An integer split point: float slice bounds raise TypeError on lists and
    # on current NumPy arrays.
    split = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[0:split, :], trainVals[0:split])
    return knn.score(trainMat[split:, :], trainVals[split:])
def get_glove_logreg(train_file, trainMat=None):
    """Fit a logistic-regression model on the first 70% of the training matrix.

    Args:
        train_file: Training file, passed to ``buildGloveTrainMat`` (only when
            no matrix is supplied) and to ``buildwd.buildWD``.
        trainMat: Optional precomputed training matrix.  When ``None`` it is
            built with ``buildGloveTrainMat(train_file)``.

    Returns:
        Tuple ``(logreg, trainMat, trainVals)``: the fitted model, the full
        matrix and the full label sequence.  Rows from
        ``int(trainMat.shape[0] * 0.7)`` onwards were not used for fitting.
    """
    # Identity test on purpose: ``== None`` on a NumPy array compares
    # element-wise, and the truth value of the result is ambiguous.
    if trainMat is None:
        trainMat = buildGloveTrainMat(train_file)

    # NOTE(review): labels come from a separate buildWD(randomize=True) call;
    # this presumably yields the same row order as trainMat -- confirm.
    wd = buildwd.buildWD(train_file, randomize=True)
    trainVals = buildwd.trainValsFromSubjects(wd[3])

    # Slice bounds must be integers, not floats.
    cutoff = int(trainMat.shape[0] * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return logreg, trainMat, trainVals
def glove_knn(train_file, trainMat=None):
    """Fit a 5-nearest-neighbour classifier on 70% of the rows and score the rest.

    Args:
        train_file: Training file, passed to ``buildGloveTrainMat`` (only when
            no matrix is supplied) and to ``buildwd.buildWD``.
        trainMat: Optional precomputed training matrix.  When ``None`` it is
            built with ``buildGloveTrainMat(train_file)``.

    Returns:
        The value of ``knn.score`` on the last 30% of the rows.
    """
    # ``is None`` rather than ``== None``: comparing a NumPy array with ``==``
    # is element-wise and cannot be used directly as an ``if`` condition.
    if trainMat is None:
        trainMat = buildGloveTrainMat(train_file)

    # NOTE(review): labels come from a separate buildWD(randomize=True) call;
    # this presumably yields the same row order as trainMat -- confirm.
    wd = buildwd.buildWD(train_file, randomize=True)
    trainVals = buildwd.trainValsFromSubjects(wd[3])

    # Integer split point; float slice bounds are rejected by lists and by
    # current NumPy releases.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
def get_bag_logreg(train_file):
    """Fit logistic regression on row-normalised bag-of-words vectors.

    The matrix from ``buildwd.buildWD(train_file, randomize=True,
    sentiment=True)`` is transposed and every row is divided by its sum.  The
    model is fitted on the first 70% of the rows.

    Args:
        train_file: Training file handed to ``buildwd.buildWD``.

    Returns:
        Tuple ``(logreg, trainMat, trainVals)``: the fitted model, the full
        normalised matrix and the full label sequence.  Rows from
        ``int(trainMat.shape[0] * 0.7)`` onwards were not used for fitting.
    """
    wd = buildwd.buildWD(train_file, randomize=True, sentiment=True)
    subjects = wd[3]

    counts = np.transpose(wd[0])
    row_sums = counts.sum(axis=1)
    # A row that sums to zero is divided by 1 instead, so it stays as it is
    # rather than turning into NaN (assumes non-negative entries, so such a
    # row is all zeros -- TODO confirm).
    safe_sums = np.where(row_sums == 0, 1, row_sums)
    # true_divide: never floor-divides, whatever the dtype or Python version.
    trainMat = np.true_divide(counts, safe_sums[:, np.newaxis])

    trainVals = buildwd.trainValsFromSubjects(subjects)
    print('Training bag_logreg...')

    # Slice bounds must be integers, not floats.
    cutoff = int(trainMat.shape[0] * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return logreg, trainMat, trainVals
def get_bag_knn(train_file):
    """Fit a 5-nearest-neighbour classifier on row-normalised bag-of-words vectors.

    The matrix from ``buildwd.buildWD(train_file)`` is transposed and every
    row is divided by its sum.  The classifier is fitted on the first 70% of
    the rows.

    Args:
        train_file: Training file handed to ``buildwd.buildWD``.

    Returns:
        Tuple ``(knn, trainMat, trainVals)``: the fitted classifier, the full
        normalised matrix and the full label sequence.  Rows from
        ``int(trainMat.shape[0] * 0.7)`` onwards were not used for fitting.
    """
    wd = buildwd.buildWD(train_file)
    subjects = wd[3]

    counts = np.transpose(wd[0])
    row_sums = counts.sum(axis=1)
    # A row that sums to zero is divided by 1 instead, so it stays as it is
    # rather than turning into NaN (assumes non-negative entries, so such a
    # row is all zeros -- TODO confirm).
    safe_sums = np.where(row_sums == 0, 1, row_sums)
    # true_divide: never floor-divides, whatever the dtype or Python version.
    trainMat = np.true_divide(counts, safe_sums[:, np.newaxis])

    trainVals = buildwd.trainValsFromSubjects(subjects)
    print('Training bag_knn...')

    # Slice bounds must be integers, not floats.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn, trainMat, trainVals
def get_tfidf_logreg(train_file):
    """Fit logistic regression on shuffled, per-line mean tf-idf vectors.

    ``buildwd.buildWD(train_file)`` supplies the matrix handed to ``tfidf``
    (``wd[0]``), the column names (``wd[1]``), the row names (``wd[2]``) and
    the subjects (``wd[3]``).  Every line of ``train_file`` whose first token
    is a column name becomes one row: the mean of ``idf[0][i]`` over the
    line's recognised words (tokens from index 2 onwards, after
    ``buildwd.processWord``); a line with no recognised words keeps a zero
    row.

    Rows and labels are then reordered with a permutation drawn after
    ``random.seed(17)`` -- note that this reseeds the module-level ``random``
    generator -- and the model is fitted on the first 70%.

    Args:
        train_file: Path of the training file; it is read line by line.

    Returns:
        Tuple ``(logreg, train, labels, cutoff)``: the fitted model, the
        shuffled rows (a list), the shuffled labels (a list) and the index at
        which the unused 30% starts.
    """
    # NOTE(review): a second definition of get_tfidf_logreg follows this one
    # in the file and overrides it at import time -- consider removing one.
    wd = buildwd.buildWD(train_file)
    matrix, colnames, rownames, subjects = wd[0], wd[1], wd[2], wd[3]
    idf = tfidf(matrix, rownames)
    width = matrix.shape[1]
    trainMat = np.zeros((len(colnames), width))

    # First-occurrence position of every row name: the same answer as
    # rownames.index(), without a linear scan for every word.
    # Assumes row/column names are hashable (e.g. strings) -- TODO confirm.
    row_index = {}
    for position, name in enumerate(rownames):
        row_index.setdefault(name, position)
    known_cols = set(colnames)

    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no first token to look up.
            if not words or words[0] not in known_cols:
                continue
            trainRow = np.zeros(width)
            numWords = 0
            for word in words[2:]:
                position = row_index.get(buildwd.processWord(word))
                if position is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][position]
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE -- the fixed seed makes the split reproducible.
    random.seed(17)
    # list(...) so the sequence is mutable wherever range() is lazy.
    shuffle = list(range(len(subjects)))
    random.shuffle(shuffle)
    train = [trainMat[i] for i in shuffle]
    labels = [trainVals[i] for i in shuffle]

    cutoff = int(len(shuffle) * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, train, labels, cutoff
def get_tfidf_logreg(train_file):
    """Build mean tf-idf rows, shuffle them and fit logistic regression.

    The data returned by ``buildwd.buildWD(train_file)`` is used as follows:
    ``wd[0]`` is the matrix handed to ``tfidf``, ``wd[1]`` the column names,
    ``wd[2]`` the row names and ``wd[3]`` the subjects that
    ``buildwd.trainValsFromSubjects`` turns into labels.

    For each line of ``train_file`` whose first token is one of the column
    names, the tokens from index 2 onwards are passed through
    ``buildwd.processWord``; the ``idf[0]`` entries of those that are row
    names are averaged into one row.  Lines without any recognised word keep
    a zero row.

    The rows and labels are permuted with ``random.shuffle`` after
    ``random.seed(17)`` (this reseeds the module-level ``random`` generator),
    and the model is fitted on the first 70% of the permuted data.

    Args:
        train_file: Path of the training file.

    Returns:
        Tuple ``(logreg, train, labels, cutoff)``: the fitted model, the list
        of shuffled rows, the list of shuffled labels and the first index not
        used for fitting.
    """
    # NOTE(review): an earlier definition of get_tfidf_logreg precedes this
    # one in the file; this later definition is the one the name is bound to.
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    n_features = wd[0].shape[1]
    trainMat = np.zeros((len(colnames), n_features))

    # Map each row name to its first position once, instead of calling
    # rownames.index() (a linear scan) for every word of every line.
    # Assumes the names are hashable (e.g. strings) -- TODO confirm.
    first_position = {}
    for idx, name in enumerate(rownames):
        if name not in first_position:
            first_position[name] = idx
    column_set = set(colnames)

    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            if not words:
                # Blank line: nothing to match against the column names.
                continue
            if words[0] in column_set:
                trainRow = np.zeros(n_features)
                numWords = 0
                for word in words[2:]:
                    pword = buildwd.processWord(word)
                    if pword in first_position:
                        numWords += 1
                        trainRow = trainRow + idf[0][first_position[pword]]
                if numWords > 0:
                    trainRow = (trainRow * 1.0) / numWords
                trainMat[matCol, :] = trainRow
                matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE -- seeded so that every run produces the same permutation.
    random.seed(17)
    # A real list is required: random.shuffle works in place.
    order = list(range(len(subjects)))
    random.shuffle(order)
    train = [trainMat[i] for i in order]
    labels = [trainVals[i] for i in order]

    cutoff = int(len(order) * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, train, labels, cutoff