def main():
    """Fit a classifier on SemEval tweets and print its score on the test set.

    The training and test TSV files are looked up in the ``dataset`` folder
    located next to this script.  Only the first 1000 training rows are used
    for fitting; the complete test file is used for scoring.
    """
    # Columns of both files: ID, SENTIMENT, TEXT
    script_dir = os.path.dirname(os.path.realpath(__file__))
    train_path = os.path.join(script_dir,
                              "dataset" + os.sep + "SemevalTrainB.tsv")
    test_path = os.path.join(script_dir,
                             "dataset" + os.sep + "SemevalTestB2013.tsv")

    csv_options = dict(header=0, delimiter="\t", index_col=False)
    all_train = pandas.read_csv(train_path, **csv_options)
    all_test = pandas.read_csv(test_path, **csv_options)

    # Fit on the first 1000 training rows only.  The author annotated a
    # 500-row variant with "0403" and this one with "0459" -- presumably the
    # scores that were obtained; confirm before relying on them.
    train_rows = all_train[0:1000]
    test_rows = all_test

    # Bag-of-words setup: the vectorizer builds the feature vectors.
    vectorizer = createVectorizer()

    train_texts = train_rows['TEXT'].tolist()
    train_features = vectorizer.fit_transform(train_texts)
    train_labels = train_rows['SENTIMENT'].tolist()

    test_texts = test_rows['TEXT'].tolist()
    test_features = vectorizer.transform(test_texts)
    test_labels = test_rows['SENTIMENT'].tolist()

    model = trainClassifier(train_features, train_labels)
    score = calculateFinalScore(model, test_features, test_labels)
    print("BOW:Final score on test set: " + str(score))
    """ 
Exemple #2
0
def main():
    """Fit a classifier on 500 SemEval tweets and print its test-set score.

    Both TSV files are read from the ``dataset`` directory relative to the
    current working directory.  The first 500 training rows are used for
    fitting and the complete test file for scoring.
    """
    # Columns of both files: ID, SENTIMENT, TEXT
    csv_options = dict(header=0, delimiter="\t", index_col=False)
    all_train = pandas.read_csv('dataset/SemevalTrainB.tsv', **csv_options)
    all_test = pandas.read_csv('dataset/SemevalTestB2013.tsv', **csv_options)

    # Restrict fitting to the first 500 training entries.
    train_rows = all_train[0:500]
    test_rows = all_test

    # The vectorizer turns the tweet texts into feature vectors.
    vectorizer = createVectorizer()

    train_texts = train_rows['TEXT'].tolist()
    train_features = vectorizer.fit_transform(train_texts)
    train_labels = train_rows['SENTIMENT'].tolist()

    test_texts = test_rows['TEXT'].tolist()
    test_features = vectorizer.transform(test_texts)
    test_labels = test_rows['SENTIMENT'].tolist()

    model = trainClassifier(train_features, train_labels)
    score = calculateFinalScore(model, test_features, test_labels)
    print("Final score on test set: " + str(score))
Exemple #3
0
def evaluate_base(train_file, test_file, desc):
    """Evaluate the default vectorizer on a train/test pair of TSV files.

    Both files are read as tab-separated tables with a header row.  The TEXT
    column supplies the documents and the SENTIMENT column the labels.

    Args:
        train_file: path of the TSV file used for fitting.
        test_file: path of the TSV file used for scoring.
        desc: label placed in front of the printed result line.

    Returns:
        A two-item list ``[final_score, cross_validation]``; both items are
        strings formatted with five decimals.
    """
    csv_options = dict(header=0, delimiter="\t", index_col=False)
    train_rows = pd.read_csv(train_file, **csv_options)
    test_rows = pd.read_csv(test_file, **csv_options)

    vectorizer = createVectorizer("")

    # NOTE(review): "bow" is handed over as the second positional argument of
    # fit_transform -- confirm that the vectorizer really expects it there.
    train_texts = train_rows['TEXT'].tolist()
    train_features = vectorizer.fit_transform(train_texts, "bow")
    train_labels = train_rows['SENTIMENT'].tolist()

    test_texts = test_rows['TEXT'].tolist()
    test_features = vectorizer.transform(test_texts)
    test_labels = test_rows['SENTIMENT'].tolist()

    model = trainClassifier(train_features, train_labels)
    cv_text = '%.5f' % getCrossValidationScore(model)
    score_text = '%.5f' % calculateFinalScore(model, test_features,
                                              test_labels)

    # Joined with single spaces to reproduce the comma-separated print.
    pieces = (desc + ":Final score: " + score_text,
              " CrossValidationScore:",
              cv_text)
    print(" ".join(pieces))
    return [score_text, cv_text]
def evaluate(vectorizer, tweetsTrain, tweetsTest, typ):
    """Fit ``vectorizer`` and a classifier on the training tweets and score them.

    Args:
        vectorizer: object providing ``fit_transform`` and ``transform``.
        tweetsTrain: table with TEXT and SENTIMENT columns used for fitting.
        tweetsTest: table with TEXT and SENTIMENT columns used for scoring.
        typ: label placed in front of the printed result line.

    Returns:
        A two-item list ``[final_score, cross_validation]``; both items are
        strings formatted with five decimals.
    """
    # NOTE(review): "bow" is handed over as the second positional argument of
    # fit_transform -- confirm that the vectorizer really expects it there.
    train_texts = tweetsTrain['TEXT'].tolist()
    train_features = vectorizer.fit_transform(train_texts, "bow")
    train_labels = tweetsTrain['SENTIMENT'].tolist()

    test_texts = tweetsTest['TEXT'].tolist()
    test_features = vectorizer.transform(test_texts)
    test_labels = tweetsTest['SENTIMENT'].tolist()

    model = trainClassifier(train_features, train_labels)
    cv_text = '%.5f' % getCrossValidationScore(model)
    score_text = '%.5f' % calculateFinalScore(model, test_features,
                                              test_labels)

    # Joined with single spaces to reproduce the comma-separated print.
    pieces = (typ + ":Final score: " + score_text,
              " CrossValidationScore:",
              cv_text)
    print(" ".join(pieces))
    return [score_text, cv_text]
Exemple #5
0
def evaluate(vectorizer, tweetsTrain, tweetsTest, typ):
    """Train on ``tweetsTrain``, score on ``tweetsTest`` and print the result.

    Args:
        vectorizer: object providing ``fit_transform`` and ``transform``.
        tweetsTrain: table with TEXT and SENTIMENT columns used for fitting.
        tweetsTest: table with TEXT and SENTIMENT columns used for scoring.
        typ: label placed in front of the printed result line.

    Returns:
        ``[final_score, cross_validation]`` -- two strings, each formatted
        with five decimals.
    """
    fit_texts = tweetsTrain['TEXT'].tolist()
    # NOTE(review): the extra "bow" argument occupies fit_transform's second
    # positional slot; verify against the vectorizer implementation.
    fit_features = vectorizer.fit_transform(fit_texts, "bow")
    fit_labels = tweetsTrain['SENTIMENT'].tolist()

    eval_texts = tweetsTest['TEXT'].tolist()
    eval_features = vectorizer.transform(eval_texts)
    eval_labels = tweetsTest['SENTIMENT'].tolist()

    classifier = trainClassifier(fit_features, fit_labels)
    # The cross-validation score is computed first, as in the original order.
    cv_score = getCrossValidationScore(classifier)
    cv_text = '%.5f' % cv_score
    test_score = calculateFinalScore(classifier, eval_features, eval_labels)
    score_text = '%.5f' % test_score

    # One space between the parts mirrors a comma-separated print statement.
    line = " ".join([typ + ":Final score: " + score_text,
                     " CrossValidationScore:",
                     cv_text])
    print(line)
    return [score_text, cv_text]
def main():
    """Train on the first 500 SemEval training tweets and print the test score.

    The TSV files are read from ``dataset/`` relative to the current working
    directory; the complete test file is used for scoring.
    """
    # Columns of both files: ID, SENTIMENT, TEXT
    def load(path):
        # Tab-separated file with a header row and no index column.
        return pandas.read_csv(path, header=0, delimiter="\t",
                               index_col=False)

    all_train = load('dataset/SemevalTrainB.tsv')
    all_test = load('dataset/SemevalTestB2013.tsv')

    # Restrict fitting to the first 500 training entries.
    train_rows = all_train[0:500]
    test_rows = all_test

    # The vectorizer turns the tweet texts into feature vectors.
    vectorizer = createVectorizer()

    train_features = vectorizer.fit_transform(train_rows['TEXT'].tolist())
    train_labels = train_rows['SENTIMENT'].tolist()

    test_features = vectorizer.transform(test_rows['TEXT'].tolist())
    test_labels = test_rows['SENTIMENT'].tolist()

    model = trainClassifier(train_features, train_labels)
    score = calculateFinalScore(model, test_features, test_labels)
    print("Final score on test set: " + str(score))
def evaluate_base(train_file, test_file, desc):
    """Score the default vectorizer on the given train/test TSV files.

    Args:
        train_file: path of the tab-separated file used for fitting.
        test_file: path of the tab-separated file used for scoring.
        desc: label placed in front of the printed result line.

    Returns:
        ``[final_score, cross_validation]`` -- two strings, each formatted
        with five decimals.
    """
    def load(path):
        # Tab-separated file with a header row and no index column.
        return pd.read_csv(path, header=0, delimiter="\t", index_col=False)

    fit_rows = load(train_file)
    eval_rows = load(test_file)

    vectorizer = createVectorizer("")

    # NOTE(review): the extra "bow" argument occupies fit_transform's second
    # positional slot; verify against the vectorizer implementation.
    fit_features = vectorizer.fit_transform(fit_rows['TEXT'].tolist(), "bow")
    fit_labels = fit_rows['SENTIMENT'].tolist()

    eval_features = vectorizer.transform(eval_rows['TEXT'].tolist())
    eval_labels = eval_rows['SENTIMENT'].tolist()

    classifier = trainClassifier(fit_features, fit_labels)
    cv_text = '%.5f' % getCrossValidationScore(classifier)
    score_text = '%.5f' % calculateFinalScore(classifier, eval_features,
                                              eval_labels)

    # One space between the parts mirrors a comma-separated print statement.
    line = " ".join([desc + ":Final score: " + score_text,
                     " CrossValidationScore:",
                     cv_text])
    print(line)
    return [score_text, cv_text]
Exemple #8
0
def main():
    """Train on the first 1000 SemEval training tweets and print the test score.

    The TSV files live in the ``dataset`` folder beside this script, so the
    function does not depend on the current working directory.
    """
    # Columns of both files: ID, SENTIMENT, TEXT
    here = os.path.dirname(os.path.realpath(__file__))

    def load(file_name):
        # Tab-separated file with a header row and no index column.
        path = os.path.join(here, "dataset" + os.sep + file_name)
        return pandas.read_csv(path, header=0, delimiter="\t",
                               index_col=False)

    all_train = load("SemevalTrainB.tsv")
    all_test = load("SemevalTestB2013.tsv")

    # Fit on the first 1000 rows.  The author's annotations "0403" (500 rows)
    # and "0459" (1000 rows) are presumably the scores obtained -- confirm.
    train_rows = all_train[0:1000]
    test_rows = all_test

    # Bag-of-words setup: the vectorizer builds the feature vectors.
    vectorizer = createVectorizer()

    train_features = vectorizer.fit_transform(train_rows['TEXT'].tolist())
    train_labels = train_rows['SENTIMENT'].tolist()

    test_features = vectorizer.transform(test_rows['TEXT'].tolist())
    test_labels = test_rows['SENTIMENT'].tolist()

    model = trainClassifier(train_features, train_labels)
    score = calculateFinalScore(model, test_features, test_labels)
    print("BOW:Final score on test set: " + str(score))
    """ 
def main():
    """Run the complete feature-evaluation experiment and plot a learning curve.

    Steps, in order:

    1. Load the SemEval training and test TSV files from the ``dataset``
       folder next to this script.
    2. Evaluate every combination of three or more feature names and keep
       track of the best final score and the best cross-validation score.
    3. Evaluate each feature on its own, then the two grouped vectorizers.
    4. Train on training + test data and score on the test data (a
       deliberate overfitting run).
    5. Train on growing prefixes of the training data and plot the final
       score against the training-set size.
    """
    # Author's feature notes (translated): length, final character,
    # punctuation mark, caps chars / length, feat1 count of positive words.
    # Columns of both files: ID, SENTIMENT, TEXT
    base_dir = os.path.dirname(os.path.realpath(__file__))
    train_file = os.path.join(base_dir, "dataset", "SemevalTrainB.tsv")
    test_file = os.path.join(base_dir, "dataset", "SemevalTestB2013.tsv")

    tweetsTrain_ALL = pandas.read_csv(train_file, header=0, delimiter="\t",
                                      index_col=False)
    tweetsTest_ALL = pandas.read_csv(test_file, header=0, delimiter="\t",
                                     index_col=False)

    # The whole training set is used here.  Earlier experiments used only the
    # first 500 rows ("0403"/"0459" in the original notes are presumably the
    # scores obtained with smaller subsets -- confirm).
    tweetsTrain = tweetsTrain_ALL
    tweetsTest = tweetsTest_ALL

    # --- Exhaustive search over all feature combinations of size >= 3 ---
    stuff = ["bow", "hashtag", "smile", "feat1", "feat2", "feat3",
             "capslock", "Punctuationmark"]
    max_fs = 0.0
    max_cv = 0.0
    max_r = []  # most recent result that set a new record
    for size in range(3, len(stuff) + 1):
        for subset in itertools.combinations(stuff, size):
            vectorizer = createVectorizer_dynamic(subset)
            label = "".join("-" + name for name in subset)
            r = evaluate(vectorizer, tweetsTrain, tweetsTest, label)

            # evaluate() returns formatted strings; compare numerically.
            # Comparing the strings against the initial int 0 only worked
            # through Python 2's arbitrary cross-type ordering.
            fs = float(r[0])
            cv = float(r[1])
            if fs > max_fs or cv > max_cv:
                print("*" * 33)
                print(subset)
                print(type(subset))
                print(len(subset))
                print(r)
                # Each maximum is updated independently so that a record in
                # one score can never lower the stored best of the other.
                max_fs = max(max_fs, fs)
                max_cv = max(max_cv, cv)
                max_r = r
    print("CHAMPS " + str(max_r) + " " + ('%.5f' % max_fs) + " " +
          ('%.5f' % max_cv))

    # --- Each feature on its own ---
    print("Single features evaluation.......")
    single_features = [
        ("bow", "BAGWOR"),
        ("hashtag", "HASHTA"),
        ("smile", "SMILE "),
        ("feat1", "SCORE "),
        ("feat2", "COUNT+"),
        ("feat3", "LENGTH"),
        ("capslock", "CAPSLK"),
        ("Punctuationmark", "PUMARK"),
    ]
    for feature, label in single_features:
        # The vectorizer creates the feature vectors for this one feature.
        vectorizer = createVectorizer(feature)
        evaluate(vectorizer, tweetsTrain, tweetsTest, label)

    # --- Grouped vectorizers ---
    print("Grouped features evaluation.......")
    evaluate(createVectorizer(""), tweetsTrain, tweetsTest, "BASE_3")
    # "all" additionally includes feat1, feat2 and feat3 per the original note.
    evaluate(createVectorizer("all"), tweetsTrain, tweetsTest, "ALL   ")

    # Results recorded by the author with all training data:
    #
    # Single features evaluation.......
    # BAGWOR:Final score: 0.57997  CrossValidationScore: 0.56994
    # HASHTA:Final score: 0.04242  CrossValidationScore: 0.03448
    # SMILE :Final score: 0.08459  CrossValidationScore: 0.10531
    # SCORE :Final score: 0.44349  CrossValidationScore: 0.42490
    # COUNT+:Final score: 0.42304  CrossValidationScore: 0.41968
    # LENGTH:Final score: 0.23442  CrossValidationScore: 0.23512
    # CAPSLK:Final score: 0.27601  CrossValidationScore: 0.27802
    # PUMARK:Final score: 0.11608  CrossValidationScore: 0.13662
    #
    # Grouped features evaluation.......
    # BASE_3:Final score: 0.58561  CrossValidationScore: 0.58662
    # ALL   :Final score: 0.60424  CrossValidationScore: 0.60120

    # --- "Smart aleck" run: train on everything, including the test set ---
    # The test tweets are part of the training data here, so the score is
    # not a real generalisation estimate (the original note "0.934980197195"
    # is presumably the score obtained -- confirm).
    all_tweets = pandas.concat([tweetsTrain_ALL, tweetsTest])
    tweetsTrain = all_tweets
    tweetsTest = tweetsTest_ALL

    evaluate(createVectorizer(""), tweetsTrain, tweetsTest,
             "OVERFITTING:schlaumeier train all")

    # --- Learning curve: final score versus training-set size ---
    x = [100, 200, 500, 1000, 2000, 4000, 8000]
    y = []
    # The test texts and labels do not change between iterations.
    textsOfTestData = tweetsTest_ALL['TEXT'].tolist()
    labelsOfTestData = tweetsTest_ALL['SENTIMENT'].tolist()
    for dim_train in x:
        # NOTE(review): a slice longer than the training set silently yields
        # the whole set, so the larger sizes may repeat the same data.
        tweetsTrain = tweetsTrain_ALL[0:dim_train]

        vectorizer = createVectorizer("")

        featuresOfTrainData = vectorizer.fit_transform(
            tweetsTrain['TEXT'].tolist())
        labelsOfTrainData = tweetsTrain['SENTIMENT'].tolist()

        featuresOfTestData = vectorizer.transform(textsOfTestData)

        classifier = trainClassifier(featuresOfTrainData, labelsOfTrainData)
        fs = calculateFinalScore(classifier, featuresOfTestData,
                                 labelsOfTestData)
        print("Final score on test set (dim_train=" + str(dim_train) + "):" +
              str(fs))
        y.append(fs)

    # Figure setup
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('training data size  VS  final score')
    ax.set_ylabel('final score')
    ax.set_xlabel('training data size')

    # Points plus a connecting line
    plt.scatter(x, y)
    plt.plot(x, y)

    plt.show()
def main():
    """Run the complete feature-evaluation experiment and plot a learning curve.

    Steps, in order:

    1. Load the SemEval training and test TSV files from the ``dataset``
       folder next to this script.
    2. Evaluate every combination of three or more feature names and keep
       track of the best final score and the best cross-validation score.
    3. Evaluate each feature on its own, then the two grouped vectorizers.
    4. Train on training + test data and score on the test data (a
       deliberate overfitting run).
    5. Train on growing prefixes of the training data and plot the final
       score against the training-set size.
    """
    # Author's feature notes (translated): length, final character,
    # punctuation mark, caps chars / length, feat1 count of positive words.
    # Columns of both files: ID, SENTIMENT, TEXT
    base_dir = os.path.dirname(os.path.realpath(__file__))
    train_file = os.path.join(base_dir, "dataset", "SemevalTrainB.tsv")
    test_file = os.path.join(base_dir, "dataset", "SemevalTestB2013.tsv")

    tweetsTrain_ALL = pandas.read_csv(train_file,
                                      header=0,
                                      delimiter="\t",
                                      index_col=False)
    tweetsTest_ALL = pandas.read_csv(test_file,
                                     header=0,
                                     delimiter="\t",
                                     index_col=False)

    # The whole training set is used here.  Earlier experiments used only the
    # first 500 rows ("0403"/"0459" in the original notes are presumably the
    # scores obtained with smaller subsets -- confirm).
    tweetsTrain = tweetsTrain_ALL
    tweetsTest = tweetsTest_ALL

    # --- Exhaustive search over all feature combinations of size >= 3 ---
    stuff = [
        "bow", "hashtag", "smile", "feat1", "feat2", "feat3", "capslock",
        "Punctuationmark"
    ]
    max_fs = 0.0
    max_cv = 0.0
    max_r = []  # most recent result that set a new record
    for size in range(3, len(stuff) + 1):
        for subset in itertools.combinations(stuff, size):
            vectorizer = createVectorizer_dynamic(subset)
            label = "".join("-" + name for name in subset)
            r = evaluate(vectorizer, tweetsTrain, tweetsTest, label)

            # evaluate() returns formatted strings; compare numerically.
            # Comparing the strings against the initial int 0 only worked
            # through Python 2's arbitrary cross-type ordering.
            fs = float(r[0])
            cv = float(r[1])
            if fs > max_fs or cv > max_cv:
                print("*" * 33)
                print(subset)
                print(type(subset))
                print(len(subset))
                print(r)
                # Each maximum is updated independently so that a record in
                # one score can never lower the stored best of the other.
                max_fs = max(max_fs, fs)
                max_cv = max(max_cv, cv)
                max_r = r
    print("CHAMPS " + str(max_r) + " " + ('%.5f' % max_fs) + " " +
          ('%.5f' % max_cv))

    # --- Each feature on its own ---
    print("Single features evaluation.......")
    single_features = [
        ("bow", "BAGWOR"),
        ("hashtag", "HASHTA"),
        ("smile", "SMILE "),
        ("feat1", "SCORE "),
        ("feat2", "COUNT+"),
        ("feat3", "LENGTH"),
        ("capslock", "CAPSLK"),
        ("Punctuationmark", "PUMARK"),
    ]
    for feature, label in single_features:
        # The vectorizer creates the feature vectors for this one feature.
        vectorizer = createVectorizer(feature)
        evaluate(vectorizer, tweetsTrain, tweetsTest, label)

    # --- Grouped vectorizers ---
    print("Grouped features evaluation.......")
    evaluate(createVectorizer(""), tweetsTrain, tweetsTest, "BASE_3")
    # "all" additionally includes feat1, feat2 and feat3 per the original note.
    evaluate(createVectorizer("all"), tweetsTrain, tweetsTest, "ALL   ")

    # Results recorded by the author with all training data:
    #
    # Single features evaluation.......
    # BAGWOR:Final score: 0.57997  CrossValidationScore: 0.56994
    # HASHTA:Final score: 0.04242  CrossValidationScore: 0.03448
    # SMILE :Final score: 0.08459  CrossValidationScore: 0.10531
    # SCORE :Final score: 0.44349  CrossValidationScore: 0.42490
    # COUNT+:Final score: 0.42304  CrossValidationScore: 0.41968
    # LENGTH:Final score: 0.23442  CrossValidationScore: 0.23512
    # CAPSLK:Final score: 0.27601  CrossValidationScore: 0.27802
    # PUMARK:Final score: 0.11608  CrossValidationScore: 0.13662
    #
    # Grouped features evaluation.......
    # BASE_3:Final score: 0.58561  CrossValidationScore: 0.58662
    # ALL   :Final score: 0.60424  CrossValidationScore: 0.60120

    # --- "Smart aleck" run: train on everything, including the test set ---
    # The test tweets are part of the training data here, so the score is
    # not a real generalisation estimate (the original note "0.934980197195"
    # is presumably the score obtained -- confirm).
    all_tweets = pandas.concat([tweetsTrain_ALL, tweetsTest])
    tweetsTrain = all_tweets
    tweetsTest = tweetsTest_ALL

    evaluate(createVectorizer(""), tweetsTrain, tweetsTest,
             "OVERFITTING:schlaumeier train all")

    # --- Learning curve: final score versus training-set size ---
    x = [100, 200, 500, 1000, 2000, 4000, 8000]
    y = []
    # The test texts and labels do not change between iterations.
    textsOfTestData = tweetsTest_ALL['TEXT'].tolist()
    labelsOfTestData = tweetsTest_ALL['SENTIMENT'].tolist()
    for dim_train in x:
        # NOTE(review): a slice longer than the training set silently yields
        # the whole set, so the larger sizes may repeat the same data.
        tweetsTrain = tweetsTrain_ALL[0:dim_train]

        vectorizer = createVectorizer("")

        featuresOfTrainData = vectorizer.fit_transform(
            tweetsTrain['TEXT'].tolist())
        labelsOfTrainData = tweetsTrain['SENTIMENT'].tolist()

        featuresOfTestData = vectorizer.transform(textsOfTestData)

        classifier = trainClassifier(featuresOfTrainData, labelsOfTrainData)
        fs = calculateFinalScore(classifier, featuresOfTestData,
                                 labelsOfTestData)
        print("Final score on test set (dim_train=" + str(dim_train) + "):" +
              str(fs))
        y.append(fs)

    # Figure setup
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('training data size  VS  final score')
    ax.set_ylabel('final score')
    ax.set_xlabel('training data size')

    # Points plus a connecting line
    plt.scatter(x, y)
    plt.plot(x, y)

    plt.show()