# imports required by the code below; project helpers (simpleTokenize,
# mapMention, hourMapper, dayMapper, genPOSFeatures) and the tweetData,
# LDAfile, sentiFeatures/readabilityFeatures/lengthFeatures objects are
# defined earlier in the original file
import numpy
from scipy.sparse import csr_matrix, hstack
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sknn.mlp import Classifier, Layer
from textstat.textstat import textstat
from readcalc import readcalc
import utilities

favoriteLabels = []
retweetLabels = []
for tweet in tweetData:
    content = tweet['content']
    words = simpleTokenize(content)
    # binary popularity labels: 1 if the tweet was retweeted/favorited at all
    if tweet['retweetCount'] < 1:
        retweetLabels.append(0)
    else:
        retweetLabels.append(1)
    if tweet['favoriteCount'] < 1:
        favoriteLabels.append(0)
    else:
        favoriteLabels.append(1)
    # sentiment feature: probability the tweet is "happy"
    tweet_happy_prob, tweet_sad_prob = utilities.classifySentiment(words, happy_log_probs, sad_log_probs)
    sentiFeatures.append(tweet_happy_prob)
    # readability feature
    readabilityFeatures.append(readcalc.ReadCalc(content).get_smog_index())
    # length feature
    lengthFeatures.append(len(words))
    # LDA input file: one double-quoted document per line
    LDAfile.write('"' + content.encode('utf-8').replace('"', "'") + '"' + '\n')
LDAfile.close()

# POS features
posFeatures = genPOSFeatures()
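# genPOSFeatures() is called above but not defined in this fragment. A minimal
# sketch of one possible implementation is left commented out below; it assumes
# the NLTK tagger and a simple count-of-adjectives feature, which is an
# assumption about the original code, not a reconstruction of it.
# def genPOSFeatures():
#     import nltk
#     features = []
#     for tweet in tweetData:
#         tags = [tag for _, tag in nltk.pos_tag(simpleTokenize(tweet['content']))]
#         features.append(tags.count('JJ'))  # number of adjectives
#     return features
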
def run(groupSize, groupTitle, vectorMode, featureMode, outputFile='result.output'):
    resultFile = open(outputFile, 'a')
    mentionMapper = mapMention('adData/analysis/ranked/mention.json')
    print groupTitle
    resultFile.write(groupTitle + '\n')
    for group in range(groupSize):
        print 'group: ' + str(group)
        resultFile.write('group: ' + str(group) + '\n')
        happy_log_probs, sad_log_probs = utilities.readSentimentList('twitter_sentiment_list.csv')
        posFile = open('adData/analysis/groups/' + groupTitle + '/group' + str(group) + '.pos', 'r')
        negFile = open('adData/analysis/groups/' + groupTitle + '/group' + str(group) + '.neg', 'r')
        posParseLengthFile = open('adData/analysis/groups/' + groupTitle + '/parserLength' + str(group) + '.pos', 'r')
        negParseLengthFile = open('adData/analysis/groups/' + groupTitle + '/parserLength' + str(group) + '.neg', 'r')
        posHeadCountFile = open('adData/analysis/groups/' + groupTitle + '/parserHeadCount' + str(group) + '.pos', 'r')
        negHeadCountFile = open('adData/analysis/groups/' + groupTitle + '/parserHeadCount' + str(group) + '.neg', 'r')

        contents = []
        scores = []
        days = []
        time = []
        labels = []
        parseLength = []
        headCount = []
        usernames = []
        semanticFeatures = []
        classes = []

        print 'loading...'
        # each input line: score :: day :: hour :: text :: ... :: mentioned usernames
        for line in posFile:
            seg = line.strip().split(' :: ')
            text = seg[3]
            username = seg[7].split(';')
            time.append(hourMapper(seg[2]))
            day = seg[1]
            score = float(seg[0])
            usernames.append(username)
            days.append(dayMapper[day])
            contents.append(text)
            scores.append(score)
            labels.append(1)
        for line in negFile:
            seg = line.strip().split(' :: ')
            text = seg[3]
            username = seg[7].split(';')
            time.append(hourMapper(seg[2]))
            day = seg[1]
            score = float(seg[0])
            usernames.append(username)
            days.append(dayMapper[day])
            contents.append(text)
            scores.append(score)
            labels.append(0)

        if vectorMode == 1:
            resultFile.write('tfidf \n')
            vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=2, stop_words='english')
            vectorMatrix = vectorizer.fit_transform(contents)
        elif vectorMode == 2:
            resultFile.write('binary count \n')
            vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=2, stop_words='english', binary=True)
            vectorMatrix = vectorizer.fit_transform(contents)
            print vectorMatrix.shape
        else:
            resultFile.write('no vector features \n')

        # the parser files carry one 'value :: ...' record per tweet
        for line in posParseLengthFile:
            parseLength.append(int(line.strip().split(' :: ')[0]))
        for line in negParseLengthFile:
            parseLength.append(int(line.strip().split(' :: ')[0]))
        for line in posHeadCountFile:
            headCount.append(int(line.strip().split(' :: ')[0]))
        for line in negHeadCountFile:
            headCount.append(int(line.strip().split(' :: ')[0]))

        posHeadCountFile.close()
        negHeadCountFile.close()
        posParseLengthFile.close()
        negParseLengthFile.close()
        posFile.close()
        negFile.close()

        for index, content in enumerate(contents):
            temp = []
            twLen = len(simpleTokenize(content))
            posProb, negProb = utilities.classifySentiment(simpleTokenize(content), happy_log_probs, sad_log_probs)
            readScore = textstat.coleman_liau_index(content)
            temp.append(content.count('urrl'))     # URL placeholder count
            temp.append(content.count('hhttg'))    # hashtag placeholder count
            # numpy.append(temp, content.count('emmoj'))
            temp.append(content.count('ussernm'))  # mention placeholder count
            temp.append(twLen)
            temp.append(posProb)
            temp.append(readScore)
            temp.append(parseLength[index])
            temp.append(headCount[index])
            temp.append(days[index])
            temp.append(time[index])
            mentionFlag = 0
            mentionFollowers = 0
            for user in usernames[index]:
                if user in mentionMapper:
                    if mentionMapper[user][0] == 1:
                        mentionFlag = 1
                    mentionFollowers += mentionMapper[user][1]
            temp.append(mentionFlag)
            temp.append(mentionFollowers)
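            # semantic feature vector layout (one row per tweet):
            # [urrl count, hhttg count, ussernm count, token count,
            #  happy probability, Coleman-Liau index, parse length,
            #  head count, day, hour, mention flag, mention followers]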
            semanticFeatures.append(numpy.array(temp))
            classes.append(labels[index])

        if featureMode == 0:
            resultFile.write('semantic features only \n')
            features = csr_matrix(numpy.array(semanticFeatures))
        elif featureMode == 1:
            resultFile.write('vector features only \n')
            features = vectorMatrix
        else:
            resultFile.write('both features \n')
            features = hstack((vectorMatrix, csr_matrix(numpy.array(semanticFeatures))), format='csr')

        # initialize the MLP
        model = Classifier(layers=[Layer("Sigmoid", units=100), Layer("Softmax")], learning_rate=0.02, n_iter=25)

        precisionSum = 0.0
        recallSum = 0.0
        accuracySum = 0.0
        resultFile.flush()

        # five rounds of random 80/20 holdout; the seed must vary per round,
        # otherwise every round trains and tests on the identical split
        print 'running 5-fold CV...'
        for i in range(5):
            print 'case ' + str(i)
            feature_train, feature_test, label_train, label_test = cross_validation.train_test_split(features.todense(), classes, test_size=0.2, random_state=i)
            X_train = numpy.array(feature_train)
            Y_train = numpy.array(label_train)
            X_test = numpy.array(feature_test)
            Y_test = numpy.array(label_test)
            model.fit(X_train, Y_train)
            predictions = model.predict(X_test)

            correctCount = 0.0
            totalCount = 0.0
            if len(predictions) != len(label_test):
                print 'inference error!'
                resultFile.write('inference error!\n')
            # precision: true positives over predicted positives
            for index, label in enumerate(predictions):
                if label == 1:
                    totalCount += 1
                    if label_test[index] == 1:
                        correctCount += 1
            if totalCount == 0:
                precision = 0
            else:
                precision = correctCount / totalCount
            recall = correctCount / list(label_test).count(1)
            accuracy = model.score(X_test, Y_test)
            precisionSum += precision
            recallSum += recall
            accuracySum += accuracy
            resultFile.flush()

        outputPrecision = precisionSum / 5
        outputRecall = recallSum / 5
        outputAccuracy = accuracySum / 5
        if (outputRecall + outputPrecision) == 0:
            outputF1 = 0.0
        else:
            outputF1 = 2 * outputRecall * outputPrecision / (outputRecall + outputPrecision)
        print outputPrecision
        print outputRecall
        print outputAccuracy
        print outputF1
        print ''
        resultFile.write(str(outputPrecision) + '\n')
        resultFile.write(str(outputRecall) + '\n')
        resultFile.write(str(outputAccuracy) + '\n')
        resultFile.write(str(outputF1) + '\n')
        resultFile.write('\n')
        resultFile.flush()
    resultFile.close()
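
# Example invocation (a hypothetical sketch, not part of the original file):
# the paths above imply groupTitle names a directory under
# adData/analysis/groups/ and groupSize counts its group*.pos/.neg pairs;
# 'simGroup' and the other values here are assumptions for illustration.
if __name__ == '__main__':
    # vectorMode: 1 = tfidf, 2 = binary counts, otherwise no vector features
    # featureMode: 0 = semantic only, 1 = vector only, otherwise both
    run(5, 'simGroup', 1, 2, outputFile='result.output')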