Code Example #1
File: run_monolingual.py  Project: tomdakin/cwi
def run_model(language, dataset_name, evaluation_split, detailed_report):
    """Trains and tests the CWI model for a particular dataset of a particular language. Reports results.

    Args:
        language: The language of the dataset.
        dataset_name: The name of the dataset (all files should have it).
        evaluation_split: The split of the data to use for evaluating the performance of the model (dev, test or both).
        detailed_report: Whether to display a detailed report or just the overall score.

    """
    print("\nModel for {} - {}.".format(language, dataset_name))

    data = Dataset(language, dataset_name)

    # The code below is used for creating unigram probability CSV files.

    # if (language == 'spanish'):
    #     corpus_words = nltk.corpus.cess_esp.words()
    #     unigram_counts = Counter(corpus_words)
    #     total_words = len(corpus_words)

    # def calc_unigram_prob(unigram_counts, total_words):
    #     u_prob = {} #defaultdict
    #     for word in unigram_counts:
    #         u_prob[word] = unigram_counts[word]/total_words
    #     return u_prob

    # def save_to_file(u_prob,file_name):
    #     w = csv.writer(open(file_name, "w"))
    #     for word, prob in u_prob.items():
    #         w.writerow([word, prob])
    # print('calc unigram prob: ')

    # u_prob = calc_unigram_prob(unigram_counts, total_words)
    # print('saving file')
    # save_to_file(u_prob, 'data/external/spanish_u_prob.csv')

    baseline = MonolingualCWI(language)

    baseline.train(data.train_set())

    if evaluation_split in ["dev", "both"]:
        print("\nResults on Development Data")
        predictions_dev = baseline.predict(data.dev_set())
        gold_labels_dev = data.dev_set()['gold_label']
        print(
            report_binary_score(gold_labels_dev, predictions_dev,
                                detailed_report))

    if evaluation_split in ["test", "both"]:
        print("\nResults on Test Data")
        predictions_test = baseline.predict(data.test_set())
        gold_labels_test = data.test_set()['gold_label']
        print(
            report_binary_score(gold_labels_test, predictions_test,
                                detailed_report))

    print()
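For context, a minimal invocation sketch of this entry point. The 'english'/'News' identifiers match the dataset names used in Code Example #9 below, but treat the exact values as an assumption:

# Hypothetical call: train the monolingual baseline on the English "News"
# dataset and report results on both the dev and test splits.
run_model(language='english', dataset_name='News',
          evaluation_split='both', detailed_report=True)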
Code Example #2
def run_model(test_language, evaluation_split, detailed_report):
    """ Trains the CWI model in all languages but one. Tests on all datasets of
        a particular language. Reports results.

    Args:
        test_language:      The language of the dataset to use for testing.
        evaluation_split:   The split of the data to use for evaluating the
                            performance of the model (dev, test or both).
        detailed_report:    Whether to display a detailed report or just the
                            overall score.

    """

    # collect the training data for all the languages but one
    train_data = []
    for language, datasets_names in datasets_per_language.items():
        if language != test_language:
            for dataset_name in datasets_names:
                data = Dataset(language, dataset_name)
                lang_train_set = data.train_set()
                if lang_train_set is None:
                    print("No training data found for language {}.".format(
                        language))
                else:
                    train_data.append(lang_train_set)

    train_data = pd.concat(train_data)

    # train the CWI model
    cwi_model = CrosslingualCWI(list(datasets_per_language.keys()))
    cwi_model.train(train_data)

    # test the model
    test_datasets = datasets_per_language[test_language]

    for dataset_name in test_datasets:
        data = Dataset(test_language, dataset_name)

        print("\nTesting on  {} - {}.".format(test_language, dataset_name))

        if evaluation_split in ["dev", "both"]:
            print("\nResults on Development Data")
            predictions_dev = cwi_model.predict(data.dev_set())
            gold_labels_dev = data.dev_set()['gold_label']
            print(
                report_binary_score(gold_labels_dev, predictions_dev,
                                    detailed_report))

        if evaluation_split in ["test", "both"]:
            print("\nResults on Test Data")
            predictions_test = cwi_model.predict(data.test_set())
            gold_labels_test = data.test_set()['gold_label']
            print(
                report_binary_score(gold_labels_test, predictions_test,
                                    detailed_report))

    print()
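A sketch of how this cross-lingual variant might be called; 'german' is one of the language keys used elsewhere in these listings:

# Hypothetical call: train on every language except German, then report
# results on the German test split only.
run_model(test_language='german', evaluation_split='test',
          detailed_report=False)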
Code Example #3
def run_model(model_type,
              in_notebook=False,
              is_verbose=True,
              return_model=False):

    if model_type == 'LR':
        print(
            "\nPredicting Speaker Stance - Multi Label Logistic Regression Baseline Model "
        )
        model = baseline.Model()

    elif model_type == 'MultiClassLR':
        print(
            "\nPredicting Speaker Stance - Multi Class Logistic Regression Baseline Model "
        )
        model = multiclassbaseline.Model()

    elif model_type == 'MTNN':
        print("\nPredicting Speaker Stance - Multi Task Model ")
        model = MultiTaskNN.My_Model(is_verbose=is_verbose)

    elif model_type == 'MLP':
        print("\nPredicting Speaker Stance - Multi Layer Perceptron Model ")
        model = MultiLayerPercep.Model(is_verbose=is_verbose)

    elif model_type == 'FastText':
        print("\nPredicting Speaker Stance - FastText Model ")
        model = FastText.My_Model(is_verbose=is_verbose)

    elif model_type == 'MultiClassFastText':
        print("\nPredicting Speaker Stance - FastText Model ")
        model = FastTextMultiClass.My_Model(is_verbose=is_verbose)

    elif model_type == 'FastTextMT':
        print("\nPredicting Speaker Stance - FastText Multi Task Model ")
        model = FastTextMultiTask.My_Model(is_verbose=is_verbose)

    elif model_type == 'FastTextAux':
        print("\nPredicting Speaker Stance - FastText Aux Task Model ")
        model = FastTextAuxTask.My_Model(is_verbose=is_verbose)

    else:
        raise ValueError(
            "Unknown model type: '{}' is not a supported option".format(model_type))

    print("Loading Data")

    if in_notebook:
        data = Dataset(in_notebook=True)
    else:
        data = Dataset()

    print("Training Model")
    model.train(data.train_set())

    y, y_pred = model.test(data.test_set())
    if return_model:
        return model, y, y_pred
    return y, y_pred
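A usage sketch for the dispatcher above; the 'MTNN' key is taken from the branch list, and keeping the fitted model via return_model is optional:

# Hypothetical call: train the multi-task neural model quietly and keep the
# fitted model object alongside the gold labels and test predictions.
model, y, y_pred = run_model('MTNN', is_verbose=False, return_model=True)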
Code Example #4
def run_model(model_type,
              domain,
              is_verbose=True,
              probs=False,
              get_history=False):
    if model_type == 'logistic_regression':
        model = LR(domain, probs=probs)
    elif model_type == 'fast_text':
        model = FastText(domain, probs=probs, is_verbose=is_verbose)
    elif model_type == 'mlp':
        model = MLP(domain, is_verbose=is_verbose)
    else:
        raise ValueError("Unknown model type: '{}'".format(model_type))

    print("Loading Data")
    data = Dataset()

    print("Training Model")
    hist = model.train(data.train_set())

    if get_history:
        return hist

    y, y_pred = model.test(data.test_set())
    return y, y_pred
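A sketch of the two return modes; the listing does not show what model.train returns as a history object, so treat hist as opaque:

# Hypothetical calls: either collect the training history, or the usual
# (gold, predicted) pair for scoring.
hist = run_model('mlp', 'multi-label', get_history=True)
y, y_pred = run_model('fast_text', 'multi-class', probs=True)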
Code Example #5
def run_model():
    print("\nPredicting Speaker Stance - Baseline Model ")
    print("Loading Data")
    data = Dataset()
    model = Model()

    print("Training Model")
    model.train(data.train_set())

    print("\nResults on Test Data")
    y, y_pred = model.test(data.test_set())
    print(report_scores(y, y_pred))
Code Example #6
def run_model(in_notebook=False):
    print("\nPredicting Speaker Stance - Baseline Model ")
    print("Loading Data")
    if in_notebook:
        data = Dataset(in_notebook=True)
    else:
        data = Dataset()

    model = Model()
    print("Training Model")
    model.train(data.train_set())

    print("\nResults on Test Data")
    y, y_pred = model.test(data.test_set())
    print(report_scores(y, y_pred))
Code Example #7
def run_model(language, dataset_name, evaluation_split, detailed_report, ablate):
    """Trains and tests the CWI model for a particular dataset of a particular language. Reports results.

    Args:
        language: The language of the dataset.
        dataset_name: The name of the dataset (all files should have it).
        evaluation_split: The split of the data to use for evaluating the performance of the model (dev, test or both).
        detailed_report: Whether to display a detailed report or just the overall score.
        ablate: Passed to MonolingualCWI for feature ablation; when truthy, only the overall score is printed.

    """
    score_only = bool(ablate)

    data = Dataset(language, dataset_name)
    # The code below is used for creating unigram probability CSV files.

    # corp = nltk.corpus.ConllCorpusReader('.', 'tiger_release_aug07.corrected.16012013.conll09',
    #                                      ['ignore', 'words', 'ignore', 'ignore', 'ignore'],
    #                                      encoding='utf-8')
    # filename = 'europarl-v7.fr-en.fr'
    # file = open(filename, mode='rt', encoding='utf-8')
    # corpus_words = []
    # for line in file:
    #     #print(line)
    #     corpus_words += line.strip(',').strip('.').split()
    #     #print(corpus_words)

    # #corpus_words = corp.words()
    # unigram_counts = Counter(corpus_words)
    # total_words = len(corpus_words)

    # def calc_unigram_prob(unigram_counts, total_words):
    #     u_prob = {} #defaultdict
    #     for word in unigram_counts:
    #         u_prob[word] = unigram_counts[word]/total_words
    #     return u_prob

    # def save_to_file(u_prob,file_name):
    #     w = csv.writer(open(file_name, "w"))
    #     for word, prob in u_prob.items():
    #         w.writerow([word, prob])
    # print('calc unigram prob: ')

    # u_prob = calc_unigram_prob(unigram_counts, total_words)
    # print('saving file')
    # save_to_file(u_prob, 'data/external/french_u_prob.csv')

    baseline = MonolingualCWI(language, ablate)

    baseline.train(data.train_set())


    if evaluation_split in ["dev", "both"]:
        if not score_only:
            print("\nResults on Development Data")
        predictions_dev = baseline.predict(data.dev_set())
        gold_labels_dev = data.dev_set()['gold_label']
        print(report_binary_score(gold_labels_dev, predictions_dev, detailed_report, score_only))


    if evaluation_split in ["test", "both"]:
        if not score_only:
            print("\nResults on Test Data")
        predictions_test = baseline.predict(data.test_set())
        gold_labels_test = data.test_set()['gold_label']
        print(report_binary_score(gold_labels_test, predictions_test, detailed_report, score_only))
    if not score_only:
        print()
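A sketch of how the ablate parameter might be used; the listing only shows that a truthy value switches on score-only output and is forwarded to MonolingualCWI, so the feature names below are purely hypothetical:

# Hypothetical ablation loop: rerun the model with one feature group removed
# at a time, printing only the overall score per run.
for feature in ['unigram_prob', 'word_length']:   # assumed feature names
    run_model('english', 'News', 'test', detailed_report=False, ablate=feature)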
Code Example #8
    def train(self, trainset):
        # the opening of this listing was cut off; this signature and the
        # feature-pipeline call are assumed, mirroring test() below
        X = self.feature_pipe.fit_transform(trainset)
        y = self.target_pipe.fit_transform(trainset)
        num_features = X.shape[1]
        self.model = self.build_net(num_features)
        self.model.fit(X, y, epochs=500, batch_size=32, verbose=self.is_verbose)

    def test(self, testset):
        X = self.feature_pipe.transform(testset)
        # binarize the gold labels for sklearn-style scoring
        y = self.mlb.transform(testset)
        y_pred_raw = self.model.predict(X)
        # join the per-task raw outputs into an (n_samples, n_labels) matrix
        # in the format expected by sklearn scoring
        y_pred = np.column_stack(y_pred_raw)
        # binarize the raw scores at the 0.5 threshold
        threshold = 0.5
        y_pred = np.where(y_pred > threshold, 1, 0)
        return y, y_pred


if __name__ == '__main__':

    from src.data.dataset import Dataset
    data = Dataset()
    model = Model()
    model.train(data.train_set())
    model.test(data.test_set())
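The np.column_stack call in test() is what turns a multi-output model's list of per-task predictions into the (n_samples, n_labels) matrix that sklearn scorers expect; a small self-contained illustration:

import numpy as np

# Two output heads, three samples each, as a multi-output Keras model
# would return them from predict().
task_a = np.array([[0.9], [0.2], [0.7]])
task_b = np.array([[0.1], [0.8], [0.6]])
stacked = np.column_stack([task_a, task_b])   # shape (3, 2)
binary = np.where(stacked > 0.5, 1, 0)        # same binarization as test()
print(binary)                                 # [[1 0] [0 1] [1 1]]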
Code Example #9
File: run_crosslingual.py  Project: AlisonMS/cwi-1
def run_model(selective_testing, translate, test_language, evaluation_split,
              detailed_report):
    """ Trains the CWI model in all languages but one. Tests on all datasets of
        a particular language. Reports results.

    Args:
        selective_testing:  Which languages to train on: 'ESG' for all, or any
                            combination of 'E' (English), 'S' (Spanish) and
                            'G' (German).
        translate:          Whether to translate the test data before
                            prediction (set on the Dataset).
        test_language:      The language of the dataset to use for testing.
        evaluation_split:   The split of the data to use for evaluating the
                            performance of the model (dev, test or both).
        detailed_report:    Whether to display a detailed report or just the
                            overall score.

    """

    # collect the training data for all the languages but one
    train_data = []

    if selective_testing == 'ESG':
        for language, datasets_names in datasets_per_language.items():
            if language != test_language:
                for dataset_name in datasets_names:
                    data = Dataset(language, dataset_name)
                    lang_train_set = data.train_set()
                    if lang_train_set is None:
                        print("No training data found for language {}.".format(
                            language))
                    else:
                        train_data.append(lang_train_set)
        train_data = pd.concat(train_data)
    else:
        train_data = pd.DataFrame()
        if 'E' in selective_testing:
            train_data = pd.concat([
                train_data,
                Dataset('english', 'News').train_set(),
                Dataset('english', 'WikiNews').train_set(),
                Dataset('english', 'Wikipedia').train_set()
            ])

        if 'S' in selective_testing:
            train_data = pd.concat(
                [train_data,
                 Dataset('spanish', 'Spanish').train_set()])

        if 'G' in selective_testing:
            train_data = pd.concat(
                [train_data,
                 Dataset('german', 'German').train_set()])


    # train the CWI model
    cwi_model = CrosslingualCWI(list(datasets_per_language.keys()))
    cwi_model.train(train_data)

    # test the model
    test_datasets = datasets_per_language[test_language]

    for dataset_name in test_datasets:
        data = Dataset(test_language, dataset_name)

        print("\nTesting on  {} - {}.".format(test_language, dataset_name))

        if evaluation_split in ["dev", "both"]:
            print("\nResults on Development Data")

            if test_language == 'french':
                print("\nNo Dev Data for French, skipping...")
            else:
                predictions_dev = cwi_model.predict(data.dev_set())
                gold_labels_dev = data.dev_set()['gold_label']
                print(
                    report_binary_score(gold_labels_dev, predictions_dev,
                                        detailed_report))

        if evaluation_split in ["test", "both"]:
            print("\nResults on Test Data")

            data.translate = translate
            predictions_test = cwi_model.predict(data.test_set())
            gold_labels_test = data.test_set()['gold_label']

            print(
                report_binary_score(gold_labels_test, predictions_test,
                                    detailed_report))

    print()
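A sketch of the selective-training flags, following the branches above where 'E', 'S' and 'G' select the English, Spanish and German training sets:

# Hypothetical call: train on English and German data only, then evaluate on
# the Spanish test split without translating the test data.
run_model(selective_testing='EG', translate=False, test_language='spanish',
          evaluation_split='test', detailed_report=True)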
Code Example #10
        if self.domain == 'multi-class':
            y_pred = [np.argmax(yi) for yi in y_pred]

        elif self.domain == 'multi-label' or self.domain == 'multi-task':
            if self.domain == 'multi-task':
                print('stacking')
                y = np.column_stack(y)
                y_pred = np.column_stack(y_pred)

            y_pred = np.where(y_pred > 0.5, 1, 0)  #  binarize result
        return y, y_pred


if __name__ == '__main__':
    data = Dataset()

    print("Training Model")
    for domain in ['multi-task', 'multi-label', 'multi-class']:
        model = MLP(domain)
        model.train(data.train_set())

        y, y_pred = model.test(data.test_set())

        print("\nResults on Test Data")

        if domain == 'multi-class':
            print(report_multiclass_scores(y, y_pred))

        else:
            print(report_scores(y, y_pred))
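The two decoding paths above differ in a way worth spelling out: multi-class picks a single argmax label per row, while multi-label thresholds each output independently. A minimal illustration:

import numpy as np

probs = np.array([[0.1, 0.7, 0.2],
                  [0.5, 0.2, 0.3]])
multi_class = [np.argmax(row) for row in probs]   # one label per row: [1, 0]
multi_label = np.where(probs > 0.5, 1, 0)         # independent 0/1 per label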