def tfidf_logreg(train_file):
    """Fit a logistic-regression model on row-normalised document vectors.

    The matrix returned by ``sentiment_buildwd.buildWD`` is transposed and
    every resulting row is divided by its own sum. The rows are then shuffled
    with a fixed seed and the first 70% are used for fitting; the remaining
    30% are not used by this function.

    Args:
        train_file: Passed unchanged to ``sentiment_buildwd.buildWD``
            (presumably the path of the training data file -- confirm there).

    Returns:
        A ``(logreg, rownames)`` tuple: the fitted
        ``linear_model.LogisticRegression`` instance and element 2 of the
        ``buildWD`` result.
    """
    wd = sentiment_buildwd.buildWD(train_file)
    rownames = wd[2]
    subjects = wd[3]

    trainMat = np.transpose(wd[0])
    row_sums = trainMat.sum(axis=1)
    # A row that sums to zero would be divided 0/0 and turn into NaN features;
    # dividing by 1 instead leaves such a row all-zero.
    safe_sums = np.where(row_sums == 0, 1, row_sums)
    trainMat = trainMat / safe_sums[:, np.newaxis]

    trainVals = sentiment_buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE
    # Fixed seed keeps the split reproducible between runs.
    random.seed(17)
    # random.shuffle needs a mutable sequence; a bare range() is immutable on
    # Python 3, so materialise it as a list first.
    order = list(range(len(subjects)))
    random.shuffle(order)

    # Only the first 70% of the shuffled rows are used for training.
    cutoff = int(len(order) * 0.7)
    picked = order[:cutoff]
    train = [trainMat[i] for i in picked]
    labels = [trainVals[i] for i in picked]

    logreg = linear_model.LogisticRegression()
    logreg.fit(train, labels)
    return logreg, rownames
def tfidf_logreg(train_file):
    """Fit a logistic-regression model on row-normalised document vectors.

    The matrix returned by ``sentiment_buildwd.buildWD`` is transposed and
    every resulting row is divided by its own sum. The rows are then shuffled
    with a fixed seed and the first 70% are used for fitting; the remaining
    30% are not used by this function.

    Args:
        train_file: Passed unchanged to ``sentiment_buildwd.buildWD``
            (presumably the path of the training data file -- confirm there).

    Returns:
        A ``(logreg, rownames)`` tuple: the fitted
        ``linear_model.LogisticRegression`` instance and element 2 of the
        ``buildWD`` result.
    """
    wd = sentiment_buildwd.buildWD(train_file)
    rownames = wd[2]
    subjects = wd[3]

    trainMat = np.transpose(wd[0])
    row_sums = trainMat.sum(axis=1)
    # A row that sums to zero would be divided 0/0 and turn into NaN features;
    # dividing by 1 instead leaves such a row all-zero.
    safe_sums = np.where(row_sums == 0, 1, row_sums)
    trainMat = trainMat / safe_sums[:, np.newaxis]

    trainVals = sentiment_buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE
    # Fixed seed keeps the split reproducible between runs.
    random.seed(17)
    # random.shuffle needs a mutable sequence; a bare range() is immutable on
    # Python 3, so materialise it as a list first.
    order = list(range(len(subjects)))
    random.shuffle(order)

    # Only the first 70% of the shuffled rows are used for training.
    cutoff = int(len(order) * 0.7)
    picked = order[:cutoff]
    train = [trainMat[i] for i in picked]
    labels = [trainVals[i] for i in picked]

    logreg = linear_model.LogisticRegression()
    logreg.fit(train, labels)
    return logreg, rownames
def tfidf_logreg(train_file):
    """Train and score a logistic-regression model on averaged idf vectors.

    For every line of ``train_file`` whose second comma-separated field is
    listed in ``colnames`` (element 1 of the ``buildWD`` result), the tweet
    text is rebuilt from fields 5 onward, tokenised, and each processed token
    that appears in ``rownames`` contributes its row of ``idf[0]``. The sum of
    those rows divided by the number of contributing tokens becomes that
    line's feature vector. Rows are then shuffled with a fixed seed, the model
    is fitted on the first 70% and scored on the remaining 30%.

    Args:
        train_file: Path of the training file; it is passed to
            ``sentiment_buildwd.buildWD`` and also read line by line here.

    Returns:
        The value of ``LogisticRegression.score`` on the held-out 30%.
    """
    wd = sentiment_buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    idf_rows = idf[0]

    width = wd[0].shape[1]
    trainMat = np.zeros((len(colnames), width))

    # Hoisted lookups: a set for membership and a word -> first position map
    # (same position list.index would return) replace per-token list scans.
    wanted = set(colnames)
    row_of = {}
    for pos, name in enumerate(rownames):
        row_of.setdefault(name, pos)

    matCol = 0
    # The context manager closes the file even if processing raises.
    with open(train_file) as f:
        for line in f:
            words = line.strip('\"').split(',')
            # Lines without a second field (e.g. blank lines) cannot match.
            if len(words) < 2 or words[1] not in wanted:
                continue

            trainRow = np.zeros(width)
            numWords = 0
            tweet = sentiment_buildwd.buildTweet(words[5:])
            for word in tweetprocess.tokenize(tweet):
                pword = sentiment_buildwd.processWord(word)
                pos = row_of.get(pword)
                if pos is not None:
                    numWords += 1
                    trainRow = trainRow + idf_rows[pos]

            # Without any known word the division would be 0/0 and fill the
            # row with NaN; keep the all-zero vector in that case.
            if numWords:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = sentiment_buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE
    # Fixed seed keeps the train/test split reproducible between runs.
    random.seed(17)
    # random.shuffle needs a mutable sequence; a bare range() is immutable on
    # Python 3, so materialise it as a list first.
    order = list(range(len(subjects)))
    random.shuffle(order)

    train = [trainMat[i] for i in order]
    labels = [trainVals[i] for i in order]
    cutoff = int(len(order) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[:cutoff], labels[:cutoff])
    return logreg.score(train[cutoff:], labels[cutoff:])
def tfidf_logreg(train_file):
    """Train and score a logistic-regression model on averaged idf vectors.

    For every line of ``train_file`` whose second comma-separated field is
    listed in ``colnames`` (element 1 of the ``buildWD`` result), the tweet
    text is rebuilt from fields 5 onward, tokenised, and each processed token
    that appears in ``rownames`` contributes its row of ``idf[0]``. The sum of
    those rows divided by the number of contributing tokens becomes that
    line's feature vector. Rows are then shuffled with a fixed seed, the model
    is fitted on the first 70% and scored on the remaining 30%.

    Args:
        train_file: Path of the training file; it is passed to
            ``sentiment_buildwd.buildWD`` and also read line by line here.

    Returns:
        The value of ``LogisticRegression.score`` on the held-out 30%.
    """
    wd = sentiment_buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    idf_rows = idf[0]

    width = wd[0].shape[1]
    trainMat = np.zeros((len(colnames), width))

    # Hoisted lookups: a set for membership and a word -> first position map
    # (same position list.index would return) replace per-token list scans.
    wanted = set(colnames)
    row_of = {}
    for pos, name in enumerate(rownames):
        row_of.setdefault(name, pos)

    matCol = 0
    # The context manager closes the file even if processing raises.
    with open(train_file) as f:
        for line in f:
            words = line.strip('\"').split(',')
            # Lines without a second field (e.g. blank lines) cannot match.
            if len(words) < 2 or words[1] not in wanted:
                continue

            trainRow = np.zeros(width)
            numWords = 0
            tweet = sentiment_buildwd.buildTweet(words[5:])
            for word in tweetprocess.tokenize(tweet):
                pword = sentiment_buildwd.processWord(word)
                pos = row_of.get(pword)
                if pos is not None:
                    numWords += 1
                    trainRow = trainRow + idf_rows[pos]

            # Without any known word the division would be 0/0 and fill the
            # row with NaN; keep the all-zero vector in that case.
            if numWords:
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1

    trainVals = sentiment_buildwd.trainValsFromSubjects(subjects)

    # RANDOMIZE
    # Fixed seed keeps the train/test split reproducible between runs.
    random.seed(17)
    # random.shuffle needs a mutable sequence; a bare range() is immutable on
    # Python 3, so materialise it as a list first.
    order = list(range(len(subjects)))
    random.shuffle(order)

    train = [trainMat[i] for i in order]
    labels = [trainVals[i] for i in order]
    cutoff = int(len(order) * 0.7)

    logreg = linear_model.LogisticRegression()
    logreg.fit(train[:cutoff], labels[:cutoff])
    return logreg.score(train[cutoff:], labels[cutoff:])