# Python 2 code. The project-local modules tweetprocess, sentiment_buildwd,
# and sent, plus the helpers buildTweet, writeToCSV, processWord,
# trainValsFromSubjects, and the constant SENTIMENT_FILENAME, are defined
# elsewhere in the repo.
from collections import defaultdict
import random

import numpy as np
from sklearn import linear_model

import tweetprocess


def buildWD(file_name, writeCSV=False):
    """Build a word-document count matrix from comma-separated tweet records.

    Each record holds the sentiment label in field 0, the tweet id in
    field 1, and the tweet text from field 5 onward.
    """
    print "Building word dictionary"
    wordRowDict, numTweets, rownames = buildWords(file_name)
    print "Word dictionary finished"
    mat = np.zeros((len(wordRowDict), numTweets))
    colnames = []
    subjects = []
    print "Building word document matrix"
    f = open(file_name)
    matCol = 0
    for line in f:
        words = line.strip('\"').split(',')
        tweet = buildTweet(words[5:])
        tweetColumn = np.zeros(len(wordRowDict))
        for word in tweetprocess.tokenize(tweet):
            if word in wordRowDict:
                tweetColumn[wordRowDict[word]] += 1
        # Keep the tweet only when in-vocabulary tokens cover more than half
        # of the record's fields (a rough proxy for token count); skipped
        # tweets leave trailing zero columns that are trimmed below.
        if np.sum(tweetColumn) > 0.5 * (len(words) - 2):
            colnames.append(words[1])   # tweet id
            subjects.append(words[0])   # sentiment label
            mat[:, matCol] = tweetColumn
            matCol += 1
    f.close()
    mat = mat[:, 0:matCol]  # drop the unused columns
    print "Word document matrix finished"
    if writeCSV:
        print "Writing to CSV"
        writeToCSV(mat, colnames, wordRowDict, "trainWords.csv")
        print "Finished writing to CSV"
    return (mat, colnames, rownames, subjects)
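# --- Hedged sketch of the helpers assumed above ---
# buildTweet() and tweetprocess.tokenize() are not shown in this section;
# the stand-ins below are assumptions about their interfaces, not the
# project's real code.
def buildTweet(words):
    # Rejoin the split tail of a record into the raw tweet text. The
    # comma-separated pipeline would rejoin on ","; the whitespace-separated
    # pipeline further down would use " ".join(words) instead.
    return ",".join(words)


def tokenize(tweet):
    # Stand-in for tweetprocess.tokenize: lowercase and split on whitespace.
    # The real tokenizer presumably also normalizes punctuation, URLs, and
    # @-mentions.
    return tweet.lower().split()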
def buildWords(file_name):
    """Whitespace-separated variant: tweet text starts at field 2."""
    wordCountDict = defaultdict(int)
    wordRowDict = {}
    rownames = []
    numTweets = 0
    f = open(file_name)
    # Use a lower frequency threshold for the small sample files.
    if "_tiny" in file_name or "_micro" in file_name:
        thresh = 10
    else:
        thresh = 50
    row = 0
    for line in f:
        numTweets += 1
        words = line.split()
        tweet = buildTweet(words[2:])
        for word in tweetprocess.tokenize(tweet):
            wordCountDict[word] += 1
            # Admit a word to the vocabulary once it has appeared more than
            # thresh times.
            if wordCountDict[word] > thresh and word not in wordRowDict:
                rownames.append(word)
                wordRowDict[word] = row
                row += 1
    f.close()
    print len(wordRowDict)
    return wordRowDict, numTweets, rownames
def buildWords(file_name):
    """Comma-separated variant: tweet text starts at field 5."""
    wordCountDict = defaultdict(int)
    wordRowDict = {}
    rownames = []
    numTweets = 0
    f = open(file_name)
    thresh = 5
    row = 0
    for line in f:
        numTweets += 1
        words = line.strip('\"').split(',')
        tweet = buildTweet(words[5:])
        for word in tweetprocess.tokenize(tweet):
            wordCountDict[word] += 1
            if wordCountDict[word] > thresh and word not in wordRowDict:
                rownames.append(word)
                wordRowDict[word] = row
                row += 1
    f.close()
    print "Word dict length: ", len(wordRowDict)
    return wordRowDict, numTweets, rownames
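# Hypothetical usage of the comma-separated pipeline; the file name is a
# placeholder. The call returns the count matrix plus tweet ids, vocabulary,
# and sentiment labels, optionally dumping trainWords.csv along the way.
# mat, colnames, rownames, subjects = buildWD("sentiment_train.csv",
#                                             writeCSV=True)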
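# --- Hedged sketch of the tfidf() helper called by tfidf_logreg() below ---
# The real implementation is not part of this section. From its use below,
# tfidf() takes the word-document count matrix and the vocabulary and
# returns a tuple whose first element is indexable by word row; a minimal
# guess at that contract:
def tfidf(mat, rownames):
    num_docs = mat.shape[1]
    df = (mat > 0).sum(axis=1)                  # document frequency per word
    idf_vec = np.log(float(num_docs) / np.maximum(df, 1))
    tfidf_mat = mat * idf_vec[:, np.newaxis]    # scale each word row by its idf
    return tfidf_mat, idf_vec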
def tfidf_logreg(train_file):
    wd = sentiment_buildwd.buildWD(train_file)
    colnames = wd[1]
    rownames = wd[2]
    subjects = wd[3]
    idf = tfidf(wd[0], rownames)
    trainMat = np.zeros((len(colnames), wd[0].shape[1]))
    f = open(train_file)
    matCol = 0
    for line in f:
        words = line.strip('\"').split(',')
        if words[1] in colnames:  # only tweets that buildWD kept
            trainRow = np.zeros(wd[0].shape[1])
            numWords = 0
            tweet = sentiment_buildwd.buildTweet(words[5:])
            for word in tweetprocess.tokenize(tweet):
                pword = sentiment_buildwd.processWord(word)
                if pword in rownames:
                    numWords += 1
                    trainRow = trainRow + idf[0][rownames.index(pword)]
            if numWords > 0:  # guard against tweets with no in-vocabulary words
                trainRow = (trainRow * 1.0) / numWords
            trainMat[matCol, :] = trainRow
            matCol += 1
    f.close()
    trainVals = sentiment_buildwd.trainValsFromSubjects(subjects)
    # RANDOMIZE: shuffle with a fixed seed, then hold out the last 30%.
    random.seed(17)
    shuffle = range(len(subjects))
    random.shuffle(shuffle)
    train = []
    labels = []
    index = 0
    for i in shuffle:
        train.append(trainMat[i])
        labels.append(trainVals[i])
        index += 1
    cutoff = int(index * 0.7)
    logreg = linear_model.LogisticRegression()
    logreg.fit(train[0:cutoff], labels[0:cutoff])
    print "Held-out accuracy: ", logreg.score(train[cutoff:], labels[cutoff:])
    # Return the fitted model and its vocabulary; the sentiment path of
    # buildWD below unpacks exactly this pair.
    return logreg, rownames
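# Hypothetical usage (the file name is a placeholder): train the tf-idf
# logistic-regression sentiment model once, then feed it to the sentiment
# path of buildWD() below.
# sentiment_model, sentiment_words = tfidf_logreg("sentiment_train.csv")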
def buildWD(file_name, writeCSV=False, randomize=False, sentiment=False):
    """Whitespace-separated variant; optionally appends a sentiment feature
    (one extra row) predicted by the tf-idf logistic-regression model."""
    print "Building word dictionary"
    wordRowDict, numTweets, rownames = buildWords(file_name)
    print "Word dictionary finished"
    extra_feats = 0
    if sentiment:
        extra_feats = 1
    mat = np.zeros((len(wordRowDict) + extra_feats, numTweets))
    colnames = []
    subjects = []
    print "Building word document matrix"
    f = open(file_name)
    matCol = 0
    if sentiment:
        sentiment_model, sentiment_words = sent.tfidf_logreg(SENTIMENT_FILENAME)
    for line in f:
        words = line.split()
        tweet = buildTweet(words[2:])
        tweetColumn = np.zeros(len(wordRowDict) + extra_feats)
        if sentiment:
            # Reset per tweet; reusing one vector across tweets would
            # accumulate counts from earlier tweets.
            sentimentTweetColumn = np.zeros(len(sentiment_words))
        num_words = 0
        for word in tweetprocess.tokenize(tweet):
            if word in wordRowDict:
                tweetColumn[wordRowDict[word]] += 1
            if sentiment and word in sentiment_words:
                sentimentTweetColumn[sentiment_words.index(word)] += 1
                num_words += 1
        if sentiment and num_words > 0:
            sentimentTweetColumn *= 1.0 / num_words
            # predict() expects a 2-D array of samples.
            tweetColumn[-1] = sentiment_model.predict(
                sentimentTweetColumn.reshape(1, -1))[0]
        elif sentiment:
            tweetColumn[-1] = 2  # default label when no sentiment words match
        # Test vocabulary coverage on the word rows only, so the sentiment
        # feature cannot inflate the sum.
        if np.sum(tweetColumn[:len(wordRowDict)]) > 0.5 * (len(words) - 2):
            colnames.append(words[0])
            subjects.append(words[1])
            mat[:, matCol] = tweetColumn
            matCol += 1
    f.close()
    mat = mat[:, 0:matCol]
    print "Word document matrix finished"
    if writeCSV:
        print "Writing to CSV"
        writeToCSV(mat, colnames, wordRowDict, "trainWords.csv")
        print "Finished writing to CSV"
    if randomize:
        # RANDOMIZE: permute the kept columns with a fixed seed.
        random.seed(17)
        shuffle = range(len(subjects))
        random.shuffle(shuffle)
        m = np.zeros(mat.shape)
        c = []
        s = []
        index = 0
        for i in shuffle:
            m[:, index] = mat[:, i]
            c.append(colnames[i])
            s.append(subjects[i])
            index += 1
        return (m, c, rownames, s)
    return (mat, colnames, rownames, subjects)
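# Hypothetical driver; the file name is a placeholder and SENTIMENT_FILENAME
# must be set at module level. Builds the shuffled word-document matrix with
# the sentiment feature appended as the last row of each kept column.
# mat, colnames, rownames, subjects = buildWD("tweets_train.txt",
#                                             randomize=True, sentiment=True)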