def tfidf_knn(train_file):
    """Fit a 10-nearest-neighbour classifier on averaged TF-IDF rows and score it.

    For every line of ``train_file`` whose first token is one of the column
    names returned by ``buildwd.buildWD``, the TF-IDF rows of the known words
    (tokens from index 2 onward) are averaged into one feature row.  The
    first 70% of the rows are used for fitting, the rest for scoring.

    Args:
        train_file: path of the training file; it is passed to
            ``buildwd.buildWD`` and also read here line by line.

    Returns:
        Whatever ``knn.score`` returns for the held-out 30% of the rows
        (presumably mean accuracy, if ``neighbors`` is ``sklearn.neighbors``
        -- TODO confirm).
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    numFeatures = wd[0].shape[1]

    # Resolve each word to its first position in rownames once, instead of
    # doing an `in` scan plus an `.index` scan for every token.
    rowOf = {}
    for pos, name in enumerate(rownames):
        rowOf.setdefault(name, pos)
    knownIds = set(colnames)

    trainMat = np.zeros((len(colnames), numFeatures))
    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no ID token; skip them instead of raising.
            if not words or words[0] not in knownIds:
                continue
            trainRow = np.zeros(numFeatures)
            numWords = 0
            for word in words[2:]:
                pos = rowOf.get(buildwd.processWord(word))
                if pos is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][pos]
            # A line with no known words keeps an all-zero row; dividing by
            # zero would put NaNs into the training matrix.
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)
    # Slice bounds must be integers; a float bound is rejected by NumPy.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
def tfidf_knn(train_file):
    """Fit a 10-nearest-neighbour classifier on averaged TF-IDF rows and score it.

    For every line of ``train_file`` whose first token is one of the column
    names returned by ``buildwd.buildWD``, the TF-IDF rows of the known words
    (tokens from index 2 onward) are averaged into one feature row.  The
    first 70% of the rows are used for fitting, the rest for scoring.

    Args:
        train_file: path of the training file; it is passed to
            ``buildwd.buildWD`` and also read here line by line.

    Returns:
        Whatever ``knn.score`` returns for the held-out 30% of the rows
        (presumably mean accuracy, if ``neighbors`` is ``sklearn.neighbors``
        -- TODO confirm).
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    numFeatures = wd[0].shape[1]

    # Resolve each word to its first position in rownames once, instead of
    # doing an `in` scan plus an `.index` scan for every token.
    rowOf = {}
    for pos, name in enumerate(rownames):
        rowOf.setdefault(name, pos)
    knownIds = set(colnames)

    trainMat = np.zeros((len(colnames), numFeatures))
    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no ID token; skip them instead of raising.
            if not words or words[0] not in knownIds:
                continue
            trainRow = np.zeros(numFeatures)
            numWords = 0
            for word in words[2:]:
                pos = rowOf.get(buildwd.processWord(word))
                if pos is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][pos]
            # A line with no known words keeps an all-zero row; dividing by
            # zero would put NaNs into the training matrix.
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)
    # Slice bounds must be integers; a float bound is rejected by NumPy.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=10)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
def get_glove_logreg(train_file, trainMat=None):
    """Fit a logistic regression on the first 70% of the GloVe feature rows.

    Args:
        train_file: path handed to ``buildwd.buildWD`` (and to
            ``buildGloveTrainMat`` when no matrix is supplied).
        trainMat: optional precomputed feature matrix; built with
            ``buildGloveTrainMat(train_file)`` when omitted.

    Returns:
        A ``(logreg, trainMat, trainVals)`` tuple: the fitted model, the full
        feature matrix and the full label values, so the caller can evaluate
        on the rows that were not used for fitting.
    """
    # Identity test: `== None` on an ndarray compares element-wise and its
    # truth value is ambiguous.
    if trainMat is None:
        trainMat = buildGloveTrainMat(train_file)

    # NOTE(review): labels come from a separate buildWD(randomize=True) call;
    # this assumes its ordering matches the one behind trainMat -- confirm.
    wd = buildwd.buildWD(train_file, randomize=True)
    trainVals = buildwd.trainValsFromSubjects(wd[3])

    # Slice bounds must be integers; a float bound is rejected by NumPy.
    cutoff = int(trainMat.shape[0] * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return logreg, trainMat, trainVals
def glove_knn(train_file, trainMat=None):
    """Fit a 5-nearest-neighbour classifier on GloVe features and score it.

    Args:
        train_file: path handed to ``buildwd.buildWD`` (and to
            ``buildGloveTrainMat`` when no matrix is supplied).
        trainMat: optional precomputed feature matrix; built with
            ``buildGloveTrainMat(train_file)`` when omitted.

    Returns:
        Whatever ``knn.score`` returns for the last 30% of the rows, after
        fitting on the first 70%.
    """
    # Identity test: `== None` on an ndarray compares element-wise and its
    # truth value is ambiguous.
    if trainMat is None:
        trainMat = buildGloveTrainMat(train_file)

    # NOTE(review): labels come from a separate buildWD(randomize=True) call;
    # this assumes its ordering matches the one behind trainMat -- confirm.
    wd = buildwd.buildWD(train_file, randomize=True)
    trainVals = buildwd.trainValsFromSubjects(wd[3])

    # Slice bounds must be integers; a float bound is rejected by NumPy.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn.score(trainMat[cutoff:, :], trainVals[cutoff:])
def buildGloveTrainMat(train_file):
    """Build the GloVe feature matrix, one row per tweet ID.

    The matrix from ``buildwd.buildWD`` is transposed so that row ``i``
    corresponds to ``tweetIDs[i]``; each such row is passed, together with
    the word list, to ``glove_features``.

    Args:
        train_file: path handed to ``buildwd.buildWD`` (with
            ``randomize=True``).

    Returns:
        ``np.array`` built from the per-tweet ``glove_features`` results.
    """
    wd = buildwd.buildWD(train_file, randomize=True)
    tweetIDs = wd[1]
    words = wd[2]

    # Must run before glove_features is called for the rows below.
    buildGloveCache(words)
    mat = np.transpose(wd[0])

    # Parenthesised single argument prints the same text under Python 2 and 3.
    print('Building GLOVE train matrix...')
    featureRows = [glove_features(mat[i, :], words) for i in range(len(tweetIDs))]
    return np.array(featureRows)
def get_bag_logreg(train_file):
    """Fit a logistic regression on row-normalised bag-of-words counts.

    The matrix from ``buildwd.buildWD`` is transposed and every row is
    divided by its own sum; the model is fitted on the first 70% of the rows.

    Args:
        train_file: path handed to ``buildwd.buildWD`` (with
            ``randomize=True, sentiment=True``).

    Returns:
        A ``(logreg, trainMat, trainVals)`` tuple: the fitted model, the full
        normalised matrix and the full label values.
    """
    wd = buildwd.buildWD(train_file, randomize=True, sentiment=True)
    subjects = wd[3]

    trainMat = np.transpose(wd[0])
    row_sums = trainMat.sum(axis=1)
    # Rows that sum to zero are divided by 1 so they stay all-zero instead of
    # turning into NaN; the float divisor also forces true division.
    # Assumes a plain 2-D ndarray (row_sums is 1-D) -- TODO confirm.
    divisors = np.where(row_sums == 0, 1.0, row_sums)
    trainMat = trainMat / divisors[:, np.newaxis]

    trainVals = buildwd.trainValsFromSubjects(subjects)
    # Parenthesised single argument prints the same text under Python 2 and 3.
    print('Training bag_logreg...')

    # Slice bounds must be integers; a float bound is rejected by NumPy.
    cutoff = int(trainMat.shape[0] * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return logreg, trainMat, trainVals
def get_bag_knn(train_file):
    """Fit a 5-nearest-neighbour classifier on row-normalised bag-of-words counts.

    The matrix from ``buildwd.buildWD`` is transposed and every row is
    divided by its own sum; the model is fitted on the first 70% of the rows.

    Args:
        train_file: path handed to ``buildwd.buildWD``.

    Returns:
        A ``(knn, trainMat, trainVals)`` tuple: the fitted model, the full
        normalised matrix and the full label values.
    """
    wd = buildwd.buildWD(train_file)
    subjects = wd[3]

    trainMat = np.transpose(wd[0])
    row_sums = trainMat.sum(axis=1)
    # Rows that sum to zero are divided by 1 so they stay all-zero instead of
    # turning into NaN; the float divisor also forces true division.
    # Assumes a plain 2-D ndarray (row_sums is 1-D) -- TODO confirm.
    divisors = np.where(row_sums == 0, 1.0, row_sums)
    trainMat = trainMat / divisors[:, np.newaxis]

    trainVals = buildwd.trainValsFromSubjects(subjects)
    # Parenthesised single argument prints the same text under Python 2 and 3.
    print('Training bag_knn...')

    # Slice bounds must be integers; a float bound is rejected by NumPy.
    cutoff = int(trainMat.shape[0] * 0.7)
    knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    knn.fit(trainMat[:cutoff, :], trainVals[:cutoff])
    return knn, trainMat, trainVals
def get_tfidf_logreg(train_file):
    """Fit a logistic regression on shuffled, averaged TF-IDF rows.

    For every line of ``train_file`` whose first token is one of the column
    names returned by ``buildwd.buildWD``, the TF-IDF rows of the known words
    (tokens from index 2 onward) are averaged into one feature row.  Rows and
    labels are then shuffled together with a fixed seed and the model is
    fitted on the first 70% of the shuffled data.

    Args:
        train_file: path of the training file; it is passed to
            ``buildwd.buildWD`` and also read here line by line.

    Returns:
        A ``(logreg, train, labels, cutoff)`` tuple: the fitted model, the
        shuffled feature rows (list), the matching label values (list) and
        the number of leading entries that were used for fitting.

    Side effects:
        Reseeds the global ``random`` generator with 17.
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    numFeatures = wd[0].shape[1]

    # Resolve each word to its first position in rownames once, instead of
    # doing an `in` scan plus an `.index` scan for every token.
    rowOf = {}
    for pos, name in enumerate(rownames):
        rowOf.setdefault(name, pos)
    knownIds = set(colnames)

    trainMat = np.zeros((len(colnames), numFeatures))
    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no ID token; skip them instead of raising.
            if not words or words[0] not in knownIds:
                continue
            trainRow = np.zeros(numFeatures)
            numWords = 0
            for word in words[2:]:
                pos = rowOf.get(buildwd.processWord(word))
                if pos is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][pos]
            # Lines without any known word keep an all-zero row.
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE
    random.seed(17)
    # list(...) so the sequence is mutable even where range() is lazy.
    shuffle = list(range(len(subjects)))
    random.shuffle(shuffle)
    train = [trainMat[i] for i in shuffle]
    labels = [trainVals[i] for i in shuffle]
    cutoff = int(len(shuffle) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, train, labels, cutoff
def get_tfidf_logreg(train_file):
    """Fit a logistic regression on shuffled, averaged TF-IDF rows.

    For every line of ``train_file`` whose first token is one of the column
    names returned by ``buildwd.buildWD``, the TF-IDF rows of the known words
    (tokens from index 2 onward) are averaged into one feature row.  Rows and
    labels are then shuffled together with a fixed seed and the model is
    fitted on the first 70% of the shuffled data.

    Args:
        train_file: path of the training file; it is passed to
            ``buildwd.buildWD`` and also read here line by line.

    Returns:
        A ``(logreg, train, labels, cutoff)`` tuple: the fitted model, the
        shuffled feature rows (list), the matching label values (list) and
        the number of leading entries that were used for fitting.

    Side effects:
        Reseeds the global ``random`` generator with 17.
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    numFeatures = wd[0].shape[1]

    # Resolve each word to its first position in rownames once, instead of
    # doing an `in` scan plus an `.index` scan for every token.
    rowOf = {}
    for pos, name in enumerate(rownames):
        rowOf.setdefault(name, pos)
    knownIds = set(colnames)

    trainMat = np.zeros((len(colnames), numFeatures))
    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no ID token; skip them instead of raising.
            if not words or words[0] not in knownIds:
                continue
            trainRow = np.zeros(numFeatures)
            numWords = 0
            for word in words[2:]:
                pos = rowOf.get(buildwd.processWord(word))
                if pos is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][pos]
            # Lines without any known word keep an all-zero row.
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE
    random.seed(17)
    # list(...) so the sequence is mutable even where range() is lazy.
    shuffle = list(range(len(subjects)))
    random.shuffle(shuffle)
    train = [trainMat[i] for i in shuffle]
    labels = [trainVals[i] for i in shuffle]
    cutoff = int(len(shuffle) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    return logreg, train, labels, cutoff
def tfidf_shallownn(train_file):
    """Train a shallow neural network on averaged TF-IDF rows and score it.

    For every line of ``train_file`` whose first token is one of the column
    names returned by ``buildwd.buildWD``, the TF-IDF rows of the known words
    (tokens from index 2 onward) are averaged into one feature row.  Targets
    are two-column indicator rows: column 0 is set for ``'Sports'``, column 1
    for ``'Politics'``; any other subject leaves the row all-zero.  The first
    70% of the rows are used for training, the rest for scoring.

    Args:
        train_file: path of the training file; it is passed to
            ``buildwd.buildWD`` and also read here line by line.

    Returns:
        Whatever ``snn.score`` returns for the held-out 30% of the rows.
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    numFeatures = wd[0].shape[1]

    # Resolve each word to its first position in rownames once, instead of
    # doing an `in` scan plus an `.index` scan for every token.
    rowOf = {}
    for pos, name in enumerate(rownames):
        rowOf.setdefault(name, pos)
    knownIds = set(colnames)

    trainMat = np.zeros((len(colnames), numFeatures))
    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no ID token; skip them instead of raising.
            if not words or words[0] not in knownIds:
                continue
            trainRow = np.zeros(numFeatures)
            numWords = 0
            for word in words[2:]:
                pos = rowOf.get(buildwd.processWord(word))
                if pos is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][pos]
            # A line with no known words keeps an all-zero row; dividing by
            # zero would put NaNs into the training matrix.
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = np.zeros((len(subjects), 2))
    for row, subject in enumerate(subjects):
        if subject == 'Sports':
            trainVals[row, 0] = 1
        elif subject == 'Politics':
            trainVals[row, 1] = 1

    # Slice bounds must be integers; a float bound is rejected by NumPy.
    cutoff = int(trainMat.shape[0] * 0.7)
    snn = shallownn.ShallowNeuralNetwork(input_dim=trainMat.shape[1], hidden_dim=5, output_dim=2)
    snn.train(trainMat[:cutoff, :], trainVals[:cutoff, :], display_progress=True, maxiter=10)
    return snn.score(trainMat[cutoff:, :], trainVals[cutoff:, :])
def tfidf_shallownn(train_file):
    """Train a shallow neural network on averaged TF-IDF rows and score it.

    For every line of ``train_file`` whose first token is one of the column
    names returned by ``buildwd.buildWD``, the TF-IDF rows of the known words
    (tokens from index 2 onward) are averaged into one feature row.  Targets
    are two-column indicator rows: column 0 is set for ``"Sports"``, column 1
    for ``"Politics"``; any other subject leaves the row all-zero.  The first
    70% of the rows are used for training, the rest for scoring.

    Args:
        train_file: path of the training file; it is passed to
            ``buildwd.buildWD`` and also read here line by line.

    Returns:
        Whatever ``snn.score`` returns for the held-out 30% of the rows.
    """
    wd = buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    numFeatures = wd[0].shape[1]

    # Resolve each word to its first position in rownames once, instead of
    # doing an `in` scan plus an `.index` scan for every token.
    rowOf = {}
    for pos, name in enumerate(rownames):
        rowOf.setdefault(name, pos)
    knownIds = set(colnames)

    trainMat = np.zeros((len(colnames), numFeatures))
    matCol = 0
    with open(train_file) as f:
        for line in f:
            words = line.split()
            # Blank lines have no ID token; skip them instead of raising.
            if not words or words[0] not in knownIds:
                continue
            trainRow = np.zeros(numFeatures)
            numWords = 0
            for word in words[2:]:
                pos = rowOf.get(buildwd.processWord(word))
                if pos is not None:
                    numWords += 1
                    trainRow = trainRow + idf[0][pos]
            # A line with no known words keeps an all-zero row; dividing by
            # zero would put NaNs into the training matrix.
            if numWords > 0:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = np.zeros((len(subjects), 2))
    for row, subject in enumerate(subjects):
        if subject == "Sports":
            trainVals[row, 0] = 1
        elif subject == "Politics":
            trainVals[row, 1] = 1

    # Slice bounds must be integers; a float bound is rejected by NumPy.
    cutoff = int(trainMat.shape[0] * 0.7)
    snn = shallownn.ShallowNeuralNetwork(input_dim=trainMat.shape[1], hidden_dim=5, output_dim=2)
    snn.train(
        trainMat[:cutoff, :],
        trainVals[:cutoff, :],
        display_progress=True,
        maxiter=10,
    )
    return snn.score(trainMat[cutoff:, :], trainVals[cutoff:, :])
""" Format is word then glove vector values with spaces used as delimiters """ def writeToFile(mat, rownames): f = open(WRITE_FILE, "w") for i in range(len(rownames)): toWrite = rownames[i] + " " for j in range(mat.shape[1]): toWrite += str(mat[i, j]) if j != mat.shape[1] - 1: toWrite += " " toWrite += "\n" f.write(toWrite) f.close() def buildGloveFile(mat, rownames): glv = glove(mat=mat, rownames=rownames) writeToFile(glv[0], glv[1]) if __name__ == "__main__": wd = buildwd.buildWD(TRAIN_FILE) mat = wd[0] rownames = wd[2] buildGloveFile(mat, rownames)
# Return the sum of the word and context matrices, per the advice # in section 4.2: return (W + C, rownames)
# NOTE(review): the line above is the tail of a definition that begins
# outside this view; it is left exactly as found.


def writeToFile(mat, rownames):
    """Write one line per row name to ``WRITE_FILE``.

    Format is word then glove vector values with spaces used as delimiters:
    the row name, a space, the values of the matching matrix row joined by
    single spaces, then a newline.  The file is opened in ``'w'`` mode, so
    any existing content is replaced.

    Args:
        mat: 2-D matrix indexed as ``mat[i, j]``; row ``i`` belongs to
            ``rownames[i]`` and ``mat.shape[1]`` values are written per line.
        rownames: sequence of strings, one per line written.
    """
    numCols = mat.shape[1]
    # `with` closes the file even when formatting or writing a row fails.
    with open(WRITE_FILE, 'w') as f:
        for i, name in enumerate(rownames):
            values = " ".join(str(mat[i, j]) for j in range(numCols))
            f.write(name + " " + values + "\n")


def buildGloveFile(mat, rownames):
    """Run ``glove`` on the matrix and write its result with ``writeToFile``.

    Args:
        mat: matrix passed to ``glove`` as ``mat``.
        rownames: row names passed to ``glove`` as ``rownames``.
    """
    # glove returns a pair: element 0 is written as the matrix, element 1 as
    # the row names.
    glv = glove(mat=mat, rownames=rownames)
    writeToFile(glv[0], glv[1])


if __name__ == '__main__':
    wd = buildwd.buildWD(TRAIN_FILE)
    mat = wd[0]
    rownames = wd[2]
    buildGloveFile(mat, rownames)