Esempio n. 1
0
import pandas as pd
import cPickle as pickle

# ***** SETTINGS   *****
# 0/1 switches; they are only ever tested for truthiness below.
use_upsample = 0
use_downsample = 0

# Rates passed to ptd.getDownsample2_0 for the FAVOR and NONE rows.
downsample_rate_favor = 0.3
downsample_rate_none = 0.3

# Passed through unchanged to ptd.getDownsample2_0.
strength = 'soft'

# The upsampling step at the bottom duplicates rows of `data`, and `data` is
# only bound on the downsampling path.  Reject the unsupported combination
# up front with a clear message instead of hitting a NameError after all
# three splits have already been loaded.
if use_upsample and not use_downsample:
    raise ValueError(
        "use_upsample requires use_downsample: 'data' is only built on the "
        "downsampling path")

# ***** LOAD DATA   *****
if use_downsample:
    data = ptd.getTrainingData()
    sub_none = ptd.getDownsample2_0(data, "NONE", strength, downsample_rate_none)
    sub_favor = ptd.getDownsample2_0(data, "FAVOR", strength, downsample_rate_favor)
    # AGAINST rows are kept as-is; only FAVOR and NONE go through
    # getDownsample2_0.
    against = data[data.Stance == "AGAINST"]

    data = pd.concat([sub_favor, sub_none, against])

else:
    # No resampling: load the three splits through the *WithMeta loaders.
    train_data = ptd.getTrainingDataWithMeta()
    validate_data = ptd.getValidationDataWithMeta()
    test_data = ptd.getTestDataWithMeta()


if use_upsample:
    # Append a second copy of every AGAINST row to `data`.
    data = pd.concat([data, data[data.Stance == "AGAINST"]])
Esempio n. 2
0
import pandas as pd
import cPickle as pickle

# ***** SETTINGS   *****
# 0/1 switches; they are only ever tested for truthiness below.
use_upsample = 0
use_downsample = 0

# Rates passed to ptd.getDownsample2_0 for the FAVOR and NONE rows.
downsample_rate_favor = 0.3
downsample_rate_none = 0.3

# Passed through unchanged to ptd.getDownsample2_0.
strength = 'soft'

# `data` is only bound when downsampling is enabled, yet the upsampling step
# at the end reads it.  Fail fast on that combination with an explicit
# message rather than a NameError once the splits have been loaded.
if use_upsample and not use_downsample:
    raise ValueError(
        "use_upsample requires use_downsample: 'data' is only built on the "
        "downsampling path")

# ***** LOAD DATA   *****
if use_downsample:
    data = ptd.getTrainingData()
    sub_none = ptd.getDownsample2_0(data, "NONE", strength,
                                    downsample_rate_none)
    sub_favor = ptd.getDownsample2_0(data, "FAVOR", strength,
                                     downsample_rate_favor)
    # AGAINST rows are kept as-is; only FAVOR and NONE go through
    # getDownsample2_0.
    against = data[data.Stance == "AGAINST"]

    data = pd.concat([sub_favor, sub_none, against])

else:
    # No resampling: load the three splits through the *WithMeta loaders.
    train_data = ptd.getTrainingDataWithMeta()
    validate_data = ptd.getValidationDataWithMeta()
    test_data = ptd.getTestDataWithMeta()

if use_upsample:
    # Append a second copy of every AGAINST row to `data`.
    data = pd.concat([data, data[data.Stance == "AGAINST"]])

#print "None: ", len(data[data.Stance == "NONE"])
Esempio n. 3
0
import pandas as pd

# ***** SETTINGS   *****
# 0/1 switches; they are only ever tested for truthiness below.
use_upsample = 1
use_downsample = 1

# Disabled setting, kept for reference:
#perform_test_on_unused_data = 1

# Rate passed to ptd.getDownsample2_0 for the FAVOR rows.
downsample_rate_favor = 0.3
# Disabled setting, kept for reference:
#downsample_rate_none  = 10

# Passed through unchanged to ptd.getDownsample2_0.
strength = 'soft'

# Both configurations start from the same tab-separated training file.
data = pd.read_csv('../TextFiles/data/tcp_train.csv', sep='\t')

if use_downsample:
    # FAVOR goes through getDownsample2_0; AGAINST rows are taken unchanged.
    sub_favor = ptd.getDownsample2_0(
        data, "FAVOR", strength, downsample_rate_favor)
    against = data[data["Stance"] == "AGAINST"]

    data = pd.concat([sub_favor, against])

else:
    # Keep only the FAVOR and AGAINST rows, FAVOR first.
    data = pd.concat([
        data[data["Stance"] == "FAVOR"],
        data[data["Stance"] == "AGAINST"],
    ])

if use_upsample:
    # Append a second copy of every AGAINST row.
    data = pd.concat([data, data[data["Stance"] == "AGAINST"]])

# 10 shuffled folds built from the Stance column, with a fixed seed.
cv = StratifiedKFold(
    data["Stance"],
    n_folds=10,
    shuffle=True,
    random_state=1,
)

# Select classifiers to use
Esempio n. 4
0
from sklearn.cross_validation import cross_val_predict, StratifiedKFold
from sklearn.metrics import fbeta_score
from sklearn.ensemble import VotingClassifier

validate = 1
testing = 0


def _read_split(path):
    """Load one tab-separated data split, using its first column as the index.

    The file is opened in a ``with`` block so the handle is closed as soon
    as pandas has finished parsing it.
    """
    with open(path) as handle:
        return pd.read_csv(handle, sep='\t', index_col=0)


data = _read_split('../TextFiles/data/tcp_train.csv')
val = _read_split('../TextFiles/data/tcp_validate.csv')
test = _read_split('../TextFiles/data/tcp_test.csv')

# Named once so the logged rates cannot drift from the rates actually used.
downsample_rate_favor = 0.2
downsample_rate_none = 0.4

# Single-argument parenthesised prints produce the same output whether
# `print` is a statement or a function.
print("using down sampling")
print('Downsample favor: ' + str(downsample_rate_favor))
print('Downsample none: ' + str(downsample_rate_none))
#test_data = ptd.getTestData()
sub_favor = ptd.getDownsample2_0(data, "FAVOR", "soft", downsample_rate_favor)
sub_none = ptd.getDownsample2_0(data, "NONE", "soft", downsample_rate_none)
# AGAINST rows are kept as-is; only FAVOR and NONE go through
# getDownsample2_0.
against = data[data.Stance == "AGAINST"]

data = pd.concat([sub_favor, sub_none, against])

#glove_fnames1 = glob('../DataProcessing/GloveVectorizer/vectors/glove.6B.300d_tcp_abstracts.pkl')
glove_fnames = glob('../DataProcessing/GloveVectorizer/vectors/glove.840B.300d_tcp_abstracts.pkl')
#glove_fnames = glove_fnames1 + glove_fnames2
print(glove_fnames)

# Identifier = file name up to its first underscore,
# e.g. 'glove.840B.300d' for 'glove.840B.300d_tcp_abstracts.pkl'.
glove_ids = [fname.split('/')[-1].split('_')[0] for fname in glove_fnames]

# *****     FINDING BEST VECTOR SPACE     *****
for fname, glove_id in zip(glove_fnames, glove_ids):
    print 80 * '='
Esempio n. 5
0
# ***** SETTINGS   *****
# 0/1 switches; they are only ever tested for truthiness below.
use_upsample = 1
use_downsample = 1

# Disabled setting, kept for reference:
# perform_test_on_unused_data = 1

# Rate passed to ptd.getDownsample2_0 for the FAVOR rows.
downsample_rate_favor = 0.3
# Disabled setting, kept for reference:
# downsample_rate_none  = 10

# Passed through unchanged to ptd.getDownsample2_0.
strength = "soft"


# Both configurations start from the same tab-separated training file.
data = pd.read_csv("../TextFiles/data/tcp_train.csv", sep="\t")

if use_downsample:
    # FAVOR goes through getDownsample2_0; AGAINST rows are taken unchanged.
    sub_favor = ptd.getDownsample2_0(
        data, "FAVOR", strength, downsample_rate_favor
    )
    against = data[data["Stance"] == "AGAINST"]

    data = pd.concat([sub_favor, against])

else:
    # Keep only the FAVOR and AGAINST rows, FAVOR first.
    data = pd.concat(
        [
            data[data["Stance"] == "FAVOR"],
            data[data["Stance"] == "AGAINST"],
        ]
    )

if use_upsample:
    # Append a second copy of every AGAINST row.
    data = pd.concat([data, data[data["Stance"] == "AGAINST"]])

# 10 shuffled folds built from the Stance column, with a fixed seed.
cv = StratifiedKFold(
    data["Stance"],
    n_folds=10,
    shuffle=True,
    random_state=1,
)

# Select classifiers to use
classifiers = [
Esempio n. 6
0
def _read_split(path):
    """Load one tab-separated data split, using its first column as the index.

    The file is opened in a ``with`` block so the handle is closed as soon
    as pandas has finished parsing it.
    """
    with open(path) as handle:
        return pd.read_csv(handle, sep='\t', index_col=0)


data = _read_split('../TextFiles/data/tcp_train.csv')
val = _read_split('../TextFiles/data/tcp_validate.csv')
test = _read_split('../TextFiles/data/tcp_test.csv')

# Named once so the logged rates cannot drift from the rates actually used.
downsample_rate_favor = 0.2
downsample_rate_none = 0.4

# Single-argument parenthesised prints produce the same output whether
# `print` is a statement or a function.
print("using down sampling")
print('Downsample favor: ' + str(downsample_rate_favor))
print('Downsample none: ' + str(downsample_rate_none))
#test_data = ptd.getTestData()
sub_favor = ptd.getDownsample2_0(data, "FAVOR", "soft", downsample_rate_favor)
sub_none = ptd.getDownsample2_0(data, "NONE", "soft", downsample_rate_none)
# AGAINST rows are kept as-is; only FAVOR and NONE go through
# getDownsample2_0.
against = data[data.Stance == "AGAINST"]

data = pd.concat([sub_favor, sub_none, against])

#glove_fnames1 = glob('../DataProcessing/GloveVectorizer/vectors/glove.6B.300d_tcp_abstracts.pkl')
glove_fnames = glob(
    '../DataProcessing/GloveVectorizer/vectors/glove.840B.300d_tcp_abstracts.pkl'
)
#glove_fnames = glove_fnames1 + glove_fnames2
print(glove_fnames)

# Identifier = file name up to its first underscore,
# e.g. 'glove.840B.300d' for 'glove.840B.300d_tcp_abstracts.pkl'.
glove_ids = [fname.split('/')[-1].split('_')[0] for fname in glove_fnames]

# *****     FINDING BEST VECTOR SPACE     *****