Example #1
def pipeline(x_train, y_train, x_val, y_val, x_test, y_test, num_classes,
             img_rows, img_cols, batch_size, epochs, augment):
    x_train, y_train, x_val, y_val, x_test, y_test, input_shape = pre_process(
        x_train, y_train, x_val, y_val, x_test, y_test, num_classes, img_rows,
        img_cols)
    model, history = apply_training(x_train, y_train, x_val, y_val, batch_size,
                                    epochs, input_shape, num_classes, augment)
    return model, history, x_train, y_train, x_val, y_val, x_test, y_test, input_shape
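A minimal usage sketch, assuming an MNIST-style split; pre_process and apply_training are this project's own helpers and are not shown here, and all shapes and values below are illustrative:

from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
# carve an illustrative 5k-sample validation split off the training data
x_val, y_val = x_train[-5000:], y_train[-5000:]
x_train, y_train = x_train[:-5000], y_train[:-5000]

model, history, *rest = pipeline(x_train, y_train, x_val, y_val,
                                 x_test, y_test, num_classes=10,
                                 img_rows=28, img_cols=28,
                                 batch_size=128, epochs=5, augment=False)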
Example #2
def data2image(data_vect, labels, num_classes, one_hot):
    # input image dimensions
    fv = StructDataTransformer()
    fv.fit(data_vect.to_numpy(), labels.to_numpy())
    img_rows, img_cols = fv.image_dim, fv.image_dim

    # pre-processing
    fv.imgs, labels, input_shape, data_mn, data_std = pre_process(
        fv.imgs, img_rows, img_cols, labels, num_classes, one_hot)

    return fv, labels, input_shape, data_mn, data_std
Example #3
def cross_validation(hyper_parameters):
    # unpack the hyperparameters of the neural net
    features, num_neurons, p_value, lr_1, lr_2, lr_3 = hyper_parameters

    # start training and testing
    # pre-process the data, using the function defined in preprocessing.py
    data = pre_process()

    # keep only chosen features
    columns = np.append(features, [1] * output_size)
    features_idx = [i for i, x in enumerate(columns) if x == 1]
    data = data.iloc[:, features_idx]

    # split data for later use (k cross validation)
    splitted_data = np.split(data, k_cross_validation)

    # train using cross validation
    all_train_losses = []
    all_test_losses = []
    all_train_correctness = []
    all_test_correctness = []
    for i in range(k_cross_validation):
        # extract train and test data, split input and target
        X_train, Y_train = train_data(splitted_data, i)
        X_test, Y_test = test_data(splitted_data, i)

        # train the model and print loss, confusion matrix and correctness
        reg_model, loss, correctness = train(X_train, Y_train, num_neurons,
                                             p_value, lr_1, lr_2, lr_3)

        # test the model on test data
        test_loss, test_correctness = test(X_test, Y_test, reg_model)

        # append losses and correctness
        all_train_losses.append(loss)
        all_test_losses.append(test_loss)
        all_train_correctness.append(correctness)
        all_test_correctness.append(test_correctness)

    # print average loss and correctness on training and testing data
    train_loss_avg = (sum(all_train_losses) / len(all_train_losses)).item()
    test_loss_avg = (sum(all_test_losses) / len(all_test_losses)).item()
    print('average loss on training data', train_loss_avg)
    print('average loss on testing data', test_loss_avg)
    train_correctness_avg = sum(all_train_correctness) / len(
        all_train_correctness)
    test_correctness_avg = sum(all_test_correctness) / len(
        all_test_correctness)
    print('average correctness on training data', train_correctness_avg)
    print('average correctness on testing data', test_correctness_avg)
    print('')

    # display performance of each model
    if plot_each_run:
        # losses
        plt.figure()
        plt.plot(all_train_losses, label='training data', color='blue')
        plt.plot(all_test_losses, label='testing data', color='red')
        plt.axhline(y=train_loss_avg,
                    linestyle=':',
                    label='training data average loss',
                    color='blue')
        plt.axhline(y=test_loss_avg,
                    linestyle=':',
                    label='testing data average loss',
                    color='red')
        plt.legend()
        plt.title('losses of model on training and testing data')
        plt.show()
        # correctness
        plt.figure()
        plt.plot(all_train_correctness, label='training data', color='blue')
        plt.plot(all_test_correctness, label='testing data', color='red')
        plt.axhline(y=train_correctness_avg,
                    linestyle=':',
                    label='training data average correctness',
                    color='blue')
        plt.axhline(y=test_correctness_avg,
                    linestyle=':',
                    label='testing data average correctness',
                    color='red')
        plt.legend()
        plt.title('correctness of model on training and testing data')
        plt.show()

    print("settings: ", features_idx, num_neurons, p_value, lr_1, lr_2, lr_3)
    print("---------------------------------------\n")

    return test_correctness_avg, train_correctness_avg, test_loss_avg, train_loss_avg
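A hedged driver sketch for this function, assuming features is a 0/1 mask over the input columns and that k_cross_validation, output_size and plot_each_run are module-level settings defined elsewhere; every value below is illustrative:

from itertools import product

feature_mask = [1, 1, 0, 1]  # keep/drop flags for the input columns (illustrative)
for num_neurons, p_value in product([16, 32], [0.1, 0.5]):
    hyper_parameters = [feature_mask, num_neurons, p_value, 1e-2, 1e-3, 1e-4]
    (test_acc, train_acc,
     test_loss, train_loss) = cross_validation(hyper_parameters)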
Example #4
def run():

    train_x, train_y = get_data('Dataset/haspeede2_dev_taskAB.tsv')
    lang = ['italian']
    models = ['log_reg']
    embeddings = ['tfidf']
    # build the cleaned corpus and a keep-list of indices: calling
    # np.delete(train_y, i, 0) inside the loop shifts every later index,
    # so labels would drift out of alignment with the sentences
    corpus = []
    keep = []
    for i in range(len(train_x)):
        sentence = preprocessing.pre_process(train_x[i].lower(), False)

        if sentence != ' ':
            # sentence = get_stem(lang[1], sentence)
            corpus.append(sentence)
            keep.append(i)
    train_y = train_y[keep]
    train_x = corpus

    best_score = 0.0
    best_model = None
    best_model_name = None
    best_embedding = None

    for e in embeddings:
        embedded_train_x = get_embeddings(lang[0], train_x, "train_", e, False)
        for m in models:

            print("Lang: " + lang[0] + "\tEmbeddings: " + e + "\tModel: " + m)
            score, model = classifier(embedded_train_x, train_y, m)
            print("SCORE: " + str(score))
            if score > best_score:
                best_score = score
                best_model = model
                best_model_name = m
                best_embedding = e

    print("BEST MODEL: ")
    print(best_model)
    print(best_model_name + " " + best_embedding + ": " + str(best_score))

    embedded_train_x = get_embeddings(lang[0], train_x, "train_",
                                      best_embedding, False)
    best_model.fit(embedded_train_x, train_y)

    with open("taskA/model_" + m + "_" + e + ".pk", "wb") as fout:
        pickle.dump(best_model, fout)
    print("Model saved.")

    ##### TESTING on TWEETS #####

    test = []
    test_x = get_test_data('Dataset/haspeede2_test_taskAB-tweets.tsv')

    for i in range(0, len(test_x)):
        sentence = preprocessing.pre_process(test_x[i].lower(), False)
        test.append(sentence)
    test_x = test

    test_x = get_embeddings(lang[0], test_x, "test_tweets_", best_embedding,
                            True)

    test_y = best_model.predict(test_x)

    with open("taskA/test_y_tweets_" + m + e + ".pk", "wb") as testyout:
        pickle.dump(test_y, testyout)

    ##### TESTING on NEWS #####

    test = []
    test_x = get_test_data('Dataset/haspeede2-test_taskAB-news.tsv')

    for i in range(0, len(test_x)):
        sentence = preprocessing.pre_process(test_x[i].lower(), False)
        test.append(sentence)
    test_x = test

    test_x = get_embeddings(lang[0], test_x, "test_news_", best_embedding,
                            True)

    test_y = best_model.predict(test_x)

    with open("taskA/test_y_news_" + m + "_" + e + ".pk", "wb") as testyout:
        pickle.dump(test_y, testyout)

    return
Example #5
                imp_numerical = Imputer(missing_values='NaN',
                                        strategy='mean',
                                        axis=0,
                                        copy=False)
                val[:, j] = imp_numerical.fit_transform(
                    val[:, j].reshape(-1, 1)).T

        # label encoding
        for j in range(val.shape[1]):
            if j == val.shape[1] - 1 or categorical[j]:
                val[:, j] = le.fit_transform(val[:, j])

        # one-hot encoding and standardization
        data_numeric = val[:, :-1]
        data_labels = val[:, -1]
        data_numeric, categorical = preprocessing.pre_process(
            raw_data=data_numeric,
            categorical=categorical,
            impute=False,
            standardize=True,
            one_hot_encode=True)
        val = np.append(data_numeric, np.array(data_labels, ndmin=2).T, axis=1)
        pd.DataFrame(val, index=None, columns=None).to_csv(
            '{}/{}.csv'.format(dirname, dataset_name), index=None, header=None)
        print("dataset {} finished \n \n".format(dataset_name))
    except Exception:
        print("!!! error encountered on {} \n \n".format(dataset_name))
Example #6
import preprocessing
from preprocessing import pre_process

import math

# imports this snippet uses but did not show
import boto3
import numpy as np
from boto3.dynamodb.conditions import Attr
from sklearn.base import TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
features = ['Budget', 'Runtime', 'vote_average', 'Popularity', 'vote_count']
complex_features = ['Genres']


class DenseTransformer(TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self


dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('movies')
rows = table.scan(FilterExpression=Attr('Budget').gt(1000))['Items']
data, target = pre_process(rows, features, complex_features)

reg = make_pipeline(DictVectorizer(), DenseTransformer(), LinearRegression())
scores = cross_val_score(reg, data, target, cv=10)

print(scores)
print(np.mean(np.array(scores)))
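DictVectorizer emits a scipy sparse matrix, and the custom DenseTransformer exists only to densify it for the downstream estimator. A roughly equivalent alternative (a sketch, not the author's code) uses scikit-learn's FunctionTransformer; .toarray() returns a plain ndarray, which is generally safer than the np.matrix that .todense() produces:

from sklearn.preprocessing import FunctionTransformer

# densify the sparse output of DictVectorizer before the regression step
densify = FunctionTransformer(lambda X: X.toarray())
reg = make_pipeline(DictVectorizer(), densify, LinearRegression())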
Example #7
def confusion_matrix(Y, Y_predicted):
    confusion = torch.zeros(5, 5)
    correct_num = 0
    for i in range(Y.shape[0]):
        actual_class = interpret_output(Y[i])
        predicted_class = interpret_output(Y_predicted[i])
        confusion[actual_class[1]][predicted_class[1]] += 1
        if actual_class == predicted_class:
            correct_num += 1
    return confusion, correct_num
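interpret_output is defined elsewhere in this project; judging from how it is used above (indexed with [1] for the class id and compared for equality), a plausible stand-in looks like the sketch below. This is an assumption, not the original helper:

import torch

LABELS = ['c0', 'c1', 'c2', 'c3', 'c4']  # hypothetical class names

def interpret_output(y):
    # return a (label, index) pair: the index addresses the 5x5 confusion
    # matrix, and equal pairs mean actual and predicted classes agree
    idx = int(torch.argmax(y))
    return LABELS[idx], idx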


################################ main ###################################
if __name__ == "__main__":
    # pre-process the data, using the function defined in preprocessing.py
    data = pre_process()

    # split data for later use (k cross validation)
    splitted_data = np.split(data, k_cross_validation)

    # train using cross validation
    all_train_losses = []
    all_test_losses = []
    all_train_correctness = []
    all_test_correctness = []
    for i in range(k_cross_validation):
        # extract train and test data, split input and target
        X_train, Y_train = train_data(splitted_data, i)
        X_test, Y_test = test_data(splitted_data, i)

        # train the model and print loss, confusion matrix and correctness
Example #8
### WAND Run ###

import wandb
import logConfig, load_data, accuracy_loss, train, preprocessing, plot

train_X, train_Y, test_X, test_Y, labels = load_data.load_data()

(N, w, h), n_labels = train_X.shape, len(labels)

# Dimension of datapoints
d = w * h

# Data Preprocessing
(train_x, train_y), (val_x, val_y), (test_x, test_y) = preprocessing.pre_process(d, n_labels, train_X, train_Y, test_X, test_Y)


def main(config=None):
  run = wandb.init(config=config, resume=True)
  config = wandb.config

  hl = [config.hidden_layer_size] * config.hidden_layers               # Hidden layers
  ol = [len(train_y[0])]                                               # Output layers
  n_hl = len(hl)

  name = "hl_" + str(config.hidden_layers) + "_bs_" + str(config.batch_size) + "_ac_" + config.ac
  run.name = name

  logConfig.logConfig(config)

  # Set Loss function here
  loss_functions = [ "cross_entropy", "sq_loss" ]
Example #9
def sentiment(movie_name):
    #print(datetime.now())
    model = joblib.load('./Sentimov/sentiment/model.sav')

    feature_words = fwords.create_feature_words('./Sentimov/sentiment/feature_words.txt')
    emo_list = fwords.create_feature_words('./Sentimov/sentiment/emo_list.txt')
    word_features = feature_words + emo_list

    name = movie_name.replace(" ", "").replace(":", "").replace("'","").replace("-","")
    hashtag = str("#" + name)

    inputf = open("./Sentimov/sentiment/" + name + ".txt")
    lines = inputf.readlines()
    inputf.close()
    output = open("./Sentimov/sentiment/" + name + '.tsv', 'w')
    tweetlist = []
    pos_list = []
    neg_list = []
    neu_list = []
    #count = 0
    for line in lines:
        #count = count + 1
        if line != "\n":
            try:
                d = json.loads(line)
                text = d[u'text']
                tweet = preprocessing.pre_process(text, hashtag)
                text = text.replace("\n", "")
                text = html_parser.unescape(text).encode('utf-8')
                tweetlist.append(tweet)
                # remove stopwords and special characters
                stop_words = set(stopwords.words('english'))
                #word_tokens = TweetTokenizer.tokenize(tweet)
                word_tokens = word_tokenize(tweet)
                #word_tokens = cleanText.remove_repeated_characters(word_tokens)
                label = model.classify(fwords.find_features(word_tokens, word_features))
                arr = []

                if ('retweeted_status' in d):
                    arr.append(d[u'retweeted_status'][u'text'])
                    arr.append(d[u'created_at'])
                    arr.append(d[u'retweeted_status'][u'user'][u'screen_name'])
                    arr.append(d[u'retweeted_status'][u'id_str'])
                    arr.append(d[u'retweeted_status'][u'user'][u'profile_image_url_https'])
                    arr.append("retweeted")
                    arr.append(d[u'user'][u'screen_name'])
                else:
                    arr.append(text)
                    arr.append(d[u'created_at'])
                    arr.append(d[u'user'][u'screen_name'])
                    arr.append(d[u'id_str'])
                    arr.append(d[u'user'][u'profile_image_url_https'])
                    arr.append("not_retweeted")

                if (label == "positive"):
                    pos_list.append(arr)
                    output.write(str(text)+"\tpositive\n")
                elif (label == "negative"):
                    neg_list.append(arr)
                    output.write(str(text)+"\tnegative\n")
                else:
                    neu_list.append(arr)
                    output.write(str(text)+"\tneutral\n")
            except Exception:
                # skip malformed or non-tweet lines
                pass

    output.close()
    result = {'pos':pos_list, 'neg':neg_list, 'neu':neu_list}
    #print(result)

    return result
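A hypothetical call, assuming a crawler has already written ./Sentimov/sentiment/<Name>.txt with one tweet JSON object per line; the movie name is illustrative:

result = sentiment("Inception")
print("pos:", len(result['pos']),
      "neg:", len(result['neg']),
      "neu:", len(result['neu']))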
Example #10
        [tokenized_sentences[i] for i in [pair[0] for pair in top_sentences]])
    return summary


if __name__ == '__main__':

    article_no = int(parse_args().article_no)
    print('==== Summarising article No: {} ===='.format(article_no))

    news_data = preprocessing.get_news_data_from_csv()

    print(news_data[article_no]['title'])
    print('Article')
    print(news_data[article_no]['article'])

    clean_data = preprocessing.pre_process(news_data)
    print('==== Data Pre Processing Complete ====')

    lemmatiser = WordNetLemmatizer()
    stopwords_list = get_stopwords()

    corpus_data = map(lambda record: record['article'], clean_data)
    corpus_data = set(corpus_data)

    # Uncomment the following lines, if retraining the Count Vectorizer
    # count_vect = CountVectorizer()
    # count_vect = count_vect.fit(corpus_data)
    # util.save_to_disk(count_vect, current_directory + '/pickle_objects/count_vect')
    count_vect = util.load_from_disk(current_directory +
                                     '/pickle_objects/count_vect')
    freq_term_matrix = count_vect.transform(corpus_data)
Example #11
    test_path = os.path.join(output_path, "test")

    for i, author in enumerate(authors):
        # bag = Counter()
        author_path = os.path.join(test_path, author)
        files_of_author = list_files(author_path)

        for filename in files_of_author:
            file_path = os.path.join(author_path, filename)
            tokens = tokenize_file(file_path)
            author_candidates = calculate_probability_of_author(
                tokens=tokens,
                training_bags=training_bags,
                doc_counts=doc_counts)
            candidate_index = authors.index(author_candidates[0][0])
            confusion_matrix[i, candidate_index] += 1
    # print(confusion)
    return confusion_matrix


if __name__ == "__main__":
    print("main func")
    num_documents_by_author = pre_process()
    training_bag_of_author, doc_count_of_author = create_BOW()
    confusion = calculate_confusion_matrix(
        training_bags=training_bag_of_author, doc_counts=doc_count_of_author)
    # print(confusion)
    np.savetxt('confusion.txt', confusion, fmt='%d')
    print('tp : {:.2f}%'.
          format(100 * sum(np.diag(confusion)) / sum(sum(confusion))))