def getSarcasmScore(sentence):
    """Return the sarcasm confidence of *sentence* as a signed percentage.

    The classifier output is squashed through a sigmoid recentred on zero,
    so the result lies in (-100, 100): 0 means neutral, large positive
    values mean strongly sarcastic.
    """
    # Drop non-ASCII characters; the feature extractor works on plain ASCII.
    sentence = sentence.encode('ascii', 'ignore')
    features = feature_extraction.getallfeatureset(sentence)
    # Classifiers need numeric input: map the feature dict to a sparse vector.
    features_vec = vec.transform(features)
    score = classifier.predict(features_vec)[0]
    # Rescaled sigmoid: 2*(sigmoid(score) - 0.5) lies in (-1, 1); scale to percent.
    percentage = int(round(2.0 * (1.0 / (1.0 + np.exp(-score)) - 0.5) * 100.0))
    # BUG FIX: the original returned the raw `score` and silently discarded
    # the computed `percentage`; the sibling scoring functions return the
    # percentage, so this one now does too.
    return percentage
def getSarcasmScore(sentence):
    """Map *sentence* to a signed sarcasm percentage in (-100, 100).

    The model's decision-function distance is passed through a sigmoid
    centred on zero and rescaled so 0 means neutral and the extremes
    approach +/-100.
    """
    # The feature extractor expects ASCII-only text.
    ascii_text = sentence.encode('ascii', 'ignore')
    feats = feature_extraction.getallfeatureset(ascii_text)
    # Feature dict -> numeric vector the model can consume.
    feats_vec = feat_dict.transform(feats)
    # Signed distance of the sample from the separating hyperplane.
    distance = model.decision_function(feats_vec)[0]
    sigmoid = 1.0 / (1.0 + np.exp(-distance))
    # 2*sigmoid - 1 recentres (0, 1) onto (-1, 1); express as a percentage.
    return int(round((2.0 * sigmoid - 1.0) * 100.0))
def getIronicScore(tweet):
    """Return an irony percentage in (-100, 100) for *tweet*."""
    feats = feature_extraction.getallfeatureset(tweet)
    # The classifier only accepts numerical data, so vectorize the feature dict.
    vectorized = vector.transform(feats)
    # Signed distance of the sample from the separating hyperplane.
    margin = classifier.decision_function(vectorized)[0]
    # Sigmoid squash, recentred to (-1, 1), then expressed as a percent.
    squashed = 1.0 / (1.0 + np.exp(-margin))
    return int(round((squashed - 0.5) * 2.0 * 100.0))
# NOTE(review): this chunk starts mid-conditional — the opening
# `if (b["label"] == 1):` branch and the loop over records `b` are cut off
# above this view, so the fragment is not valid Python on its own.
# Formatting below is a best-effort reconstruction — TODO confirm against
# the full file.
ironicData.append(b['text'])
elif (b["label"] == -1):
    nonIronicData.append(b['text'])

# Report corpus sizes before feature extraction.
print('Number of ironic tweets :', len(ironicData))
print('Number of non-ironic tweets :', len(nonIronicData))
print('Feature engineering')

classificationSet = ['Ironic', 'Non-Ironic']  #label set
featureSets = []  # list of (feature-dict, label) pairs

# Extract a feature dict per ironic tweet; progress is printed every
# 10000 tweets (presumably the append belongs outside the modulo guard —
# every tweet is featurized — TODO confirm original indentation).
index = 0
for tweet in ironicData:
    if (np.mod(index, 10000) == 0):
        print("Processed Ironic Tweets: ", index)
    featureSets.append(
        (feature_extraction.getallfeatureset(tweet), classificationSet[0]))
    index += 1

# Same pass for the non-ironic corpus.
index = 0
for tweet in nonIronicData:
    if (np.mod(index, 10000) == 0):
        print("Processed Non-Ironic Tweets: ", index)
    featureSets.append(
        (feature_extraction.getallfeatureset(tweet), classificationSet[1]))
    index += 1

featureSets = np.array(featureSets)
# Binary targets: 1 where the label column equals 'Ironic', else 0.
targets = (featureSets[0::, 1] == 'Ironic').astype(int)

#Transforms lists of feature-value mappings to vectors
vector = DictVectorizer()
featureVector = vector.fit_transform(featureSets[0::, 0])
from sklearn.neural_network import MLPClassifier
# FIX: DictVectorizer is used below but was never imported in this script.
from sklearn.feature_extraction import DictVectorizer
import pickle

import feature_extraction

# Preprocessed tweet corpora saved as numpy arrays of raw tweet text.
pos_data = np.load('sarcpreproc.npy')
neg_data = np.load('nonsarcpreproc.npy')
# FIX: Python 2 `print` statements converted to print() calls — the rest
# of the codebase uses Python 3 syntax.
print('Number of sarcastic tweets :', len(pos_data))
print('Number of non-sarcastic tweets :', len(neg_data))

cls_set = ['Non-Sarcastic', 'Sarcastic']  # label names; index 1 == sarcastic
featuresets = []  # (feature-dict, label) pairs
for tweet in pos_data:
    featuresets.append((feature_extraction.getallfeatureset(tweet), cls_set[1]))
for tweet in neg_data:
    featuresets.append((feature_extraction.getallfeatureset(tweet), cls_set[0]))
featuresets = np.array(featuresets)
# Binary targets: 1 for sarcastic rows, 0 otherwise.
targets = (featuresets[0::, 1] == 'Sarcastic').astype(int)

# Transform the list of feature dicts into a sparse numeric matrix.
vec = DictVectorizer()
featurevec = vec.fit_transform(featuresets[0::, 0])

#Saving the dictionary vectorizer
# FIX: context manager guarantees the pickle file is flushed and closed
# even if pickle.dump raises.
file_Name = "vecdict_all.p"
with open(file_Name, 'wb') as fileObject:
    pickle.dump(vec, fileObject)
# NOTE(review): `b`, `ironicData` and `nonIronicData` are defined before
# this chunk (the if/elif presumably sits inside a loop over parsed
# records `b` whose header is outside this view — the collapsed source
# loses the enclosing indentation; TODO confirm against the full file).
if(b["label"]==1):
    ironicData.append(b['text'])
elif(b["label"]==-1):
    nonIronicData.append(b['text'])

# Report corpus sizes before feature extraction.
print('Number of ironic tweets :', len(ironicData))
print('Number of non-ironic tweets :', len(nonIronicData))
print('Feature engineering')

classificationSet = ['Ironic', 'Non-Ironic']  #label set
featureSets = []  # list of (feature-dict, label) pairs

# Featurize every ironic tweet; print progress every 10000 tweets
# (presumably the append is outside the modulo guard — TODO confirm
# original indentation).
index=0
for tweet in ironicData:
    if (np.mod(index, 10000) == 0):
        print("Processed Ironic Tweets: ", index)
    featureSets.append((feature_extraction.getallfeatureset(tweet), classificationSet[0]))
    index+=1

# Same pass for the non-ironic corpus.
index = 0
for tweet in nonIronicData:
    if (np.mod(index, 10000) == 0):
        print("Processed Non-Ironic Tweets: ", index)
    featureSets.append((feature_extraction.getallfeatureset(tweet), classificationSet[1]))
    index+=1

featureSets=np.array(featureSets)
# Binary targets: 1 where the label column equals 'Ironic', else 0.
targets=(featureSets[0::,1]=='Ironic').astype(int)

#Transforms lists of feature-value mappings to vectors
vector = DictVectorizer()
featureVector = vector.fit_transform(featureSets[0::,0])

# Classifier identifiers used later (presumably to select/iterate models).
a=['svm','DT']
from sklearn.feature_extraction import DictVectorizer
import pickle

import feature_extraction

# Preprocessed tweet corpora saved as numpy arrays of raw tweet text.
pos_data = np.load('sarcpreproc.npy')
neg_data = np.load('nonsarcpreproc.npy')
# FIX: Python 2 `print` statements converted to print() calls — the rest
# of the codebase uses Python 3 syntax.
print('Number of sarcastic tweets :', len(pos_data))
print('Number of non-sarcastic tweets :', len(neg_data))

cls_set = ['Non-Sarcastic', 'Sarcastic']  # label names; index 1 == sarcastic
featuresets = []  # (feature-dict, label) pairs
for tweet in pos_data:
    featuresets.append(
        (feature_extraction.getallfeatureset(tweet), cls_set[1]))
for tweet in neg_data:
    featuresets.append(
        (feature_extraction.getallfeatureset(tweet), cls_set[0]))
featuresets = np.array(featuresets)
# Binary targets: 1 for sarcastic rows, 0 otherwise.
targets = (featuresets[0::, 1] == 'Sarcastic').astype(int)

# Transform the list of feature dicts into a sparse numeric matrix.
vec = DictVectorizer()
featurevec = vec.fit_transform(featuresets[0::, 0])

#Saving the dictionary vectorizer
# FIX: the file handle was opened but never closed in this chunk; a
# context manager guarantees it is flushed and closed even on error.
file_Name = "vecdict_all.p"
with open(file_Name, 'wb') as fileObject:
    pickle.dump(vec, fileObject)
# Fit the classifier on the TF-IDF training matrix; fit() echoes the
# fitted estimator, which is printed for a quick parameter check.
print(model.fit(x_traincv, df_y))

# Vectorize the held-out test data with the same fitted transform.
x_testcv = cv.transform(df_x1.values.astype(str))

# Predictions on the held-out set, used for the accuracy count below.
predictions = model.predict(x_testcv)

# A handful of hand-written sentences for a quick sanity check of the
# feature-based classifier.
basic_test = ["This is just a long sentence, to make sure that it's not how long the sentence is that matters the most",
              'I just love when you make me feel like shit',
              'Life is odd',
              'Just got back to the US !',
              "Isn'it great when your girlfriend dumps you ?",
              "I love my job !",
              'I love my son !']
feature_basictest = np.array([feature_extraction.getallfeatureset(tweet)
                              for tweet in basic_test])
feature_basictestvec = vector.transform(feature_basictest)
print(basic_test)
print(classifier.predict(feature_basictestvec))

# Count how many held-out predictions matched the ground truth.
actual_results = np.array(df_y1)
print(actual_results)
count = 0
for i, predicted in enumerate(predictions):
    if predicted == actual_results[i]:
        count = count + 1