Example #1
def getSarcasmScore(sentence):
    # vec (a fitted DictVectorizer) and classifier are assumed to be
    # module-level objects; see the loading sketch after Example #2
    sentence = sentence.encode('ascii', 'ignore')  # drop non-ASCII characters
    features = feature_extraction.getallfeatureset(sentence)

    features_vec = vec.transform(features)
    # predict() returns the class label directly; the variant below derives
    # a percentage from decision_function() instead
    score = classifier.predict(features_vec)[0]

    return score
Example #2
def getSarcasmScore(sentence):
    # feat_dict (a fitted DictVectorizer) and model are assumed to be
    # module-level objects
    sentence = sentence.encode('ascii', 'ignore')  # drop non-ASCII characters
    features = feature_extraction.getallfeatureset(sentence)

    features_vec = feat_dict.transform(features)
    # signed distance of the sample to the separating hyperplane
    score = model.decision_function(features_vec)[0]
    # rescaled sigmoid squashes the score into a -100..100 percentage
    percentage = int(round(2.0 * (1.0 / (1.0 + np.exp(-score)) - 0.5) * 100.0))

    return percentage
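Both variants assume module-level objects created elsewhere. A minimal loading sketch for the first variant, assuming the DictVectorizer pickled as vecdict_all.p in Example #5 and a hypothetical classifier_all.p holding the trained model:

import pickle
import numpy as np
import feature_extraction

# vecdict_all.p is written in Example #5; classifier_all.p is an assumed name
with open('vecdict_all.p', 'rb') as f:
    vec = pickle.load(f)
with open('classifier_all.p', 'rb') as f:
    classifier = pickle.load(f)

print(getSarcasmScore('I just love being ignored !'))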
Example #3
def getIronicScore(tweet):
    features = feature_extraction.getallfeatureset(tweet)
    # the classifier only accepts numerical input, so convert the feature
    # dict to a vector with the fitted DictVectorizer
    featuresVector = vector.transform(features)
    # signed distance of the sample to the separating hyperplane
    score = classifier.decision_function(featuresVector)[0]
    # rescaled sigmoid squashes the score into a -100..100 percentage
    percentage = int(round(2.0 * (1.0 / (1.0 + np.exp(-score)) - 0.5) * 100.0))

    return percentage
Example #4
# the start of this snippet is truncated in the listing; assuming labeled
# records were loaded earlier, the split likely looked like this (cf. Example #6):
ironicData, nonIronicData = [], []
for b in labeledRecords:  # `labeledRecords` is a hypothetical name
    if b["label"] == 1:
        ironicData.append(b['text'])
    elif b["label"] == -1:
        nonIronicData.append(b['text'])
print('Number of ironic tweets :', len(ironicData))
print('Number of non-ironic tweets :', len(nonIronicData))

print('Feature engineering')
classificationSet = ['Ironic', 'Non-Ironic']  #label set
featureSets = []

index = 0
for tweet in ironicData:
    if index % 10000 == 0:
        print("Processed Ironic Tweets: ", index)
    featureSets.append(
        (feature_extraction.getallfeatureset(tweet), classificationSet[0]))
    index += 1

index = 0
for tweet in nonIronicData:
    if index % 10000 == 0:
        print("Processed Non-Ironic Tweets: ", index)
    featureSets.append(
        (feature_extraction.getallfeatureset(tweet), classificationSet[1]))
    index += 1
featureSets = np.array(featureSets)
# binary targets: 1 for 'Ironic', 0 for 'Non-Ironic'
targets = (featureSets[:, 1] == 'Ironic').astype(int)

# transform the list of feature-value mappings into a sparse matrix
vector = DictVectorizer()
featureVector = vector.fit_transform(featureSets[:, 0])
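The feature matrix and targets above are ready for any scikit-learn estimator. A minimal training sketch, assuming a LinearSVC (the decision_function calls in Examples #2 and #3 suggest a linear SVM, but the estimator is not shown in this listing):

from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

# hold out 20% of the data to sanity-check the fit
X_train, X_test, y_train, y_test = train_test_split(
    featureVector, targets, test_size=0.2, random_state=42)
classifier = LinearSVC()
classifier.fit(X_train, y_train)
print('Held-out accuracy:', classifier.score(X_test, y_test))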
Example #5
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction import DictVectorizer
import numpy as np
import pickle
import feature_extraction


pos_data = np.load('sarcpreproc.npy')
neg_data = np.load('nonsarcpreproc.npy')

print('Number of sarcastic tweets :', len(pos_data))
print('Number of non-sarcastic tweets :', len(neg_data))

cls_set = ['Non-Sarcastic', 'Sarcastic']
featuresets = []

for tweet in pos_data:
    featuresets.append((feature_extraction.getallfeatureset(tweet), cls_set[1]))

for tweet in neg_data:
    featuresets.append((feature_extraction.getallfeatureset(tweet), cls_set[0]))

featuresets = np.array(featuresets)
targets = (featuresets[:, 1] == 'Sarcastic').astype(int)

vec = DictVectorizer()
featurevec = vec.fit_transform(featuresets[:, 0])

# saving the dictionary vectorizer so scoring code can reload it later
file_Name = "vecdict_all.p"
with open(file_Name, 'wb') as fileObject:
    pickle.dump(vec, fileObject)
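MLPClassifier is imported above but never fitted in this snippet. A sketch of how it might be trained on the vectorized features, with assumed hyperparameters and a hypothetical output file name:

# assumption: near-default MLP settings; the original training call is not shown
clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200)
clf.fit(featurevec, targets)

with open('mlp_all.p', 'wb') as f:  # hypothetical file name
    pickle.dump(clf, f)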
Example #6
# truncated start, as in Example #4: `b` iterates over labeled records
ironicData, nonIronicData = [], []
for b in labeledRecords:  # `labeledRecords` is a hypothetical name
    if b["label"] == 1:
        ironicData.append(b['text'])
    elif b["label"] == -1:
        nonIronicData.append(b['text'])
print('Number of ironic tweets :', len(ironicData))
print('Number of non-ironic tweets :', len(nonIronicData))

print('Feature engineering')
classificationSet = ['Ironic', 'Non-Ironic'] #label set
featureSets = []

index = 0
for tweet in ironicData:
    if index % 10000 == 0:
        print("Processed Ironic Tweets: ", index)
    featureSets.append((feature_extraction.getallfeatureset(tweet), classificationSet[0]))
    index += 1

index = 0
for tweet in nonIronicData:
    if index % 10000 == 0:
        print("Processed Non-Ironic Tweets: ", index)
    featureSets.append((feature_extraction.getallfeatureset(tweet), classificationSet[1]))
    index += 1
featureSets = np.array(featureSets)
targets = (featureSets[:, 1] == 'Ironic').astype(int)

# transform the list of feature-value mappings into a sparse matrix
vector = DictVectorizer()
featureVector = vector.fit_transform(featureSets[:, 0])
a = ['svm', 'DT']  # classifier names to compare; see the sketch below
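The list a suggests comparing an SVM against a decision tree. One way the names might map to estimators, assuming LinearSVC and DecisionTreeClassifier (neither appears in this listing):

from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# assumption: 'svm' -> LinearSVC, 'DT' -> DecisionTreeClassifier
estimators = {'svm': LinearSVC(), 'DT': DecisionTreeClassifier()}
for name in a:
    clf = estimators[name]
    clf.fit(featureVector, targets)
    print(name, 'training accuracy:', clf.score(featureVector, targets))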
from sklearn.feature_extraction import DictVectorizer
import numpy as np
import pickle
import feature_extraction

pos_data = np.load('sarcpreproc.npy')
neg_data = np.load('nonsarcpreproc.npy')

print('Number of sarcastic tweets :', len(pos_data))
print('Number of non-sarcastic tweets :', len(neg_data))

cls_set = ['Non-Sarcastic', 'Sarcastic']
featuresets = []

for tweet in pos_data:
    featuresets.append(
        (feature_extraction.getallfeatureset(tweet), cls_set[1]))

for tweet in neg_data:
    featuresets.append(
        (feature_extraction.getallfeatureset(tweet), cls_set[0]))

featuresets = np.array(featuresets)
targets = (featuresets[:, 1] == 'Sarcastic').astype(int)

vec = DictVectorizer()
featurevec = vec.fit_transform(featuresets[:, 0])

# saving the dictionary vectorizer
file_Name = "vecdict_all.p"
with open(file_Name, 'wb') as fileObject:
    pickle.dump(vec, fileObject)
# creating the classifier: the instantiation is not shown in this listing;
# `model`, `cv`, `df_x1`, `df_y` and `x_traincv` are assumed to be defined earlier
print(model.fit(x_traincv, df_y))

# extract tf-idf features for the test data too
x_testcv = cv.transform(df_x1.values.astype(str))
# compute predictions on the test set
predictions = model.predict(x_testcv)

# single-output check on a few hand-written sentences
basic_test = ["This is just a long sentence, to make sure that it's not how long the sentence is that matters the most",
              'I just love when you make me feel like shit', 'Life is odd', 'Just got back to the US !',
              "Isn't it great when your girlfriend dumps you ?", "I love my job !", 'I love my son !']
feature_basictest = []
for tweet in basic_test:
    feature_basictest.append(feature_extraction.getallfeatureset(tweet))
feature_basictest = np.array(feature_basictest)
feature_basictestvec = vector.transform(feature_basictest)

print(basic_test)
print(classifier.predict(feature_basictestvec))

# check how many predictions match the ground-truth labels
actual_results = np.array(df_y1)
print(actual_results)

count = 0
for i in range(len(predictions)):
    if predictions[i] == actual_results[i]:
        count += 1
print('Correct predictions:', count, 'out of', len(predictions))