Esempio n. 1
0
# Word endings handed to the stemming / negation feature extractors below
# (presumably suffixes to strip -- confirm against the features module).
leafWords = [
    's',
    'es',
    'ed',
    'er',
    'ly',
    'ing',
]
# Common-word list; it is passed to features.wordFeaturesWithNegation below.
commonWordsPath = 'common_words.txt'
cw = features.readCommonWords(commonWordsPath)

# Import Data
# NOTE(review): the only visible assignment of inputFile (from sys.argv[1])
# appears later in this file -- confirm it is defined before this line runs.
numLinesToLoad = 50000
numTestLines = 5000
reviews = Data(inputFile, numLines=numLinesToLoad, testLines=numTestLines)
# Report on the loaded data first, then shuffle it (order kept as written).
reviews.getInfo()
reviews.shuffle()


# Different feature extractors
# Resource paths are named once so every extractor points at the same files.
lexiconPath = 'NRC-Emotion-Lexicon-v0.92/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt'
embeddingsPath = "embeddings.p"
dictionaryPath = "dictionary.p"

# Lexicon + embedding-cluster based extractor.
f1 = features.posNegClusterFeatures(embeddingsPath, dictionaryPath, lexiconPath, 200)
# Note: f2 is the function object itself, not the result of a call.
f2 = features.wordFeatures
# Lexicon-only extractors.
f3 = features.positiveNegativeCountsWithClause(lexiconPath)
f4 = features.emotionCounts(lexiconPath)
# Word-level extractors parameterised by the leafWords / cw lists.
f5 = features.stemmedWordFeatures(leafWords)
f6 = features.clusterFeatures(embeddingsPath, dictionaryPath, 200)
f7 = features.wordFeaturesWithNegation(cw, leafWords)

# Each model is built from the same `reviews` data with its own list of
# feature extractors, and reports via getInfo() right after construction.

# SVM
svmExtractors = [f1, f5]
SVMModel = SVM(reviews, svmExtractors)
SVMModel.getInfo()

# Naive Bayes
naiveBayesExtractors = [f3, f4, f5]
naiveBayesModel = NaiveBayes(reviews, naiveBayesExtractors)
naiveBayesModel.getInfo()

# Linear Regression
linearExtractors = [f7, f3, f1]
linearModel = LinearRegression(reviews, linearExtractors)
linearModel.getInfo()

# Analyzing command line arguments
# At least one argument is required; only the first one is used, as the
# input path passed to Data(...) below.
if len(sys.argv) < 2:
  # Report a missing argument as a failure: sys.exit() with a string writes
  # that string to stderr and terminates with exit status 1.
  sys.exit('Usage:\n  python %s <JSON file>' % sys.argv[0])

inputFile = sys.argv[1]

# Import Data
# Load the reviews from the path given on the command line.
# (numLines / testLines are forwarded to Data unchanged; presumably the
# number of lines to read and the number held out for testing -- confirm.)
reviews = Data(
    inputFile,
    numLines=10000,
    testLines=1000,
)
# Report on the loaded data first, then shuffle it (order kept as written).
reviews.getInfo()
reviews.shuffle()

# Disabled lexicon load, kept for reference:
#lexicon = features.readFullLexicon('NRC-Emotion-Lexicon-v0.92/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt')

# Single extractor used for this run, built from the pickled embedding and
# dictionary files.
embeddingsPath = "embeddings.p"
dictionaryPath = "dictionary.p"
featureExtractor = features.clusterFeatures(embeddingsPath, dictionaryPath, 200)

# Create sparse numpy arrays
reviews.convertDataToArray(featureExtractor)
# One-hot labels are needed for tensorflow.
reviews.convertLabelsToOneHot()

#print reviews.trainArray.todense()
#print reviews.trainLabelOneHot.transpose().todense()

# Number of output classes: one per star rating (5 stars).
NUM_CLASSES = 5

# Input placeholder: first dimension left as None, second is the feature count.
x = tf.placeholder("float", [None, reviews.numFeatures])

# Weights have one row per feature and one column per star rating;
# the bias has one entry per star rating. Both start at zero.
initialWeights = tf.zeros([reviews.numFeatures, NUM_CLASSES])
W = tf.Variable(initialWeights)
initialBias = tf.zeros([NUM_CLASSES])
b = tf.Variable(initialBias)

# Our prediction