Example no. 1
def loaddata(data_dir, mode='train', max_len=None):
    """The function for loading data.

    This function will load the data, and then turns it into
    Lang Object.

    Args:
        data_dir: A string indicates the location of data set
        mode: A string indicates to load train, valid, or test.

    Returns:
        A list of reading dataset and a dictionary of Langs
    """
    data_set = readfile(data_dir + mode + '.json')
    if max_len is not None:
        data_set = data_set[:max_len]
    rt, re, rm, summary = readLang(data_set)

    print("Read %s data" % mode)
    print("Read %s box score summary" % len(data_set))
    print("Embedding size of (r.t, r.e, r.m) and summary:")
    print("({}, {}, {}), {}".format(rt.n_words, re.n_words, rm.n_words,
                                    summary.n_words))

    langs = {'rt': rt, 're': re, 'rm': rm, 'summary': summary}
    return data_set, langs
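# Usage sketch (illustrative only): the directory string below is an
# assumption; loaddata only requires that '<data_dir><mode>.json' exists and
# that readfile/readLang are available in this module.
# train_set, langs = loaddata('data/', mode='train', max_len=500)
# print(langs['summary'].n_words)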
def runTest(file1, version, model, mode='words'):
    """Gets training or test file for stance detection SemiVal 2016 competition and prints prediction results.

        Parameters
        ----------
        file1 : list
            a list with text tokens on index (0)  and hashtags list on index (1)

        istest : Boolean
            specifies if the dataset is for test or training

        version : int
            0: Training dataset, 1: Test dataset, 2:Other domain dataset
        mode : str
            choose either (words) or (hashtags)

        """
    indata = readfile(file1, version)
    data = preprocesstweets(indata,
                            ignoreNONE=False,
                            version=version,
                            lowerCase=True)
    tfidfAdded = getTfidfRepresentation(data, version, mode)
    labels = [d[7] for d in data]
    encoder = LabelEncoder()
    y = encoder.fit_transform(labels)
    print(encoder.classes_)
    if version == 0:
        x_train, x_test, y_train, y_test = train_test_split(tfidfAdded,
                                                            y,
                                                            test_size=0.2)
        y_test = np_utils.to_categorical(y_test, num_classes=3)
        y_train = np_utils.to_categorical(y_train, num_classes=3)
        print(x_train.shape[1])
        print(model.summary())
        model.fit(x_train,
                  y_train,
                  epochs=10,
                  verbose=2,
                  validation_data=(x_test, y_test))
        loss, acc = model.evaluate(x_test, y_test, verbose=0)
        ypred = model.predict(x_test)
        print('Training Accuracy: %f' % (acc * 100))
        print('Training F-Score: ', f1(y_test, ypred) * 100)
    if version == 1 or version == 2:
        y = np_utils.to_categorical(y, num_classes=3)
        loss, acc = model.evaluate(tfidfAdded, y)
        ypred = model.predict(tfidfAdded)
        otherdomain = ''
        if version == 2:
            otherdomain = '(other domain)'
        print('TEST Accuracy ' + otherdomain + ': %f' % ((acc * 100)))
        print('TEST F-Score ' + otherdomain + ': ', (f1(y, ypred) * 100))
def loaddata(data_dir, mode='train', max_len=None, copy_player=COPY_PLAYER):
    """Variant of loaddata above that forwards copy_player to readfile."""
    data_set = readfile(data_dir + mode + '.json', copy_player=copy_player)
    if max_len is not None:
        data_set = data_set[:max_len]
    rt, re, rm, summary = readLang(data_set)

    print("Read %s data" % mode)
    print("Read %s box score summary" % len(data_set))
    print("Embedding size of (r.t, r.e, r.m) and summary:")
    print("({}, {}, {}), {}".format(rt.n_words, re.n_words, rm.n_words,
                                    summary.n_words))

    langs = {'rt': rt, 're': re, 'rm': rm, 'summary': summary}
    return data_set, langs
#External libraries
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
#files
from preprocessing import readfile, calc_length, flatten
from preprocessing import padding_data, one_hot_encoding
from preprocessing import CHANNELS

folder = "data"
train_data = readfile(f"{folder}/ae.train", 0)
test_data = readfile(f"{folder}/ae.test", 1)
maxlength = calc_length(test_data, train_data)

train_input, train_output = padding_data(train_data, maxlength, 0, True)
test_input, test_output = padding_data(test_data, maxlength, 1, True)

pca = PCA(n_components=CHANNELS)
train_transformed = pca.fit_transform(train_input)
# Reuse the PCA fitted on the training data; refitting on the test set would
# give it different components.
test_transformed = pca.transform(test_input)
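# Optional check (not in the original code): how much variance the CHANNELS
# components retain.
# print("explained variance ratio:", pca.explained_variance_ratio_.sum())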
"""
Logistic Regression with PCA
folds: 5, 10, 15
accuracy: 0.3162162162162162
"""
# cv = KFold(n_splits=10, random_state=42, shuffle=True)
# clf = LogisticRegression()
# scores = []
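# A sketch of how the loop implied above might continue; an illustration only,
# assuming train_output from padding_data is a 1-D label vector compatible
# with sklearn (if it is one-hot encoded, apply np.argmax(train_output, axis=1)
# first).
# for train_idx, val_idx in cv.split(train_transformed):
#     clf.fit(train_transformed[train_idx], train_output[train_idx])
#     scores.append(clf.score(train_transformed[val_idx], train_output[val_idx]))
# print("mean CV accuracy:", np.mean(scores))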


indata = readfile('SemEval2016-Task6-subtaskA-traindata-gold.csv', 0)
data = preprocesstweets(indata, ignoreNONE=False, version=0, lowerCase=True)
tfidfAdded = getTfidfRepresentation(data, 0, 'words')
model = createModel(tfidfAdded.shape[1])

runTest('SemEval2016-Task6-subtaskA-traindata-gold.csv', 0, model, 'words')
runTest('SemEval2016-Task6-subtaskA-testdata-gold.txt', 1, model, 'words')
runTest('stance.csv', 2, model, 'words')
                        validation_data=(Xtest, y_test), verbose=2)
    print('History', history.history)

    # evaluate
    print('Predicting (training)..')
    ypred = model.predict(Xtest)
    # model.evaluate returns [loss, accuracy]; index 1 is the accuracy
    print('Accuracy (TRAIN): %f' % (model.evaluate(Xtest, y_test)[1]*100))
    print('FScore (TRAIN): %f' % (f1(y_test, ypred)*100))

    print('Predicting (testing)..')
    #ypred = model.predict(XtestGroup)
    #print('Accuracy (TEST): %f' % (model.evaluate(XtestGroup,y_testGroup)[0]*100))
    #print('FScore (TEST): %f' % (f1(y_testGroup,ypred)*100))


indata = readfile('SemEval2016-Task6-subtaskA-traindata-gold.csv', False)
data = preprocesstweets(indata, ignoreNONE=False, version=0)
tweets = [' '.join(d[0]) for d in data]
stances = [d[7] for d in data]
encoder = LabelEncoder()
stances = encoder.fit_transform(stances)

indata = readfile('SemEval2016-Task6-subtaskA-testdata-gold.txt', True)
data = preprocesstweets(indata, ignoreNONE=False, version=1)
tweets2 = [' '.join(d[0]) for d in data]
stances2 = [d[7] for d in data]
# Reuse the encoder fitted on the training stances so the label mapping matches.
stances2 = encoder.transform(stances2)


convModel(tweets, stances, tweets2, stances2)
# The model is not currently working; the last edits introduced a problem. The reported results are from the previous stage.