# Example #1
def run_model(language, dataset_name, evaluation_split, detailed_report):
    """Train and test the CWI model for a particular dataset of a particular language, then report results.

    Args:
        language: The language of the dataset.
        dataset_name: The name of the dataset (all files should have it).
        evaluation_split: Which split(s) to evaluate on: "dev", "test", or "both".
        detailed_report: Whether to display a detailed report or just the overall score.
    """
    print("\nModel for {} - {}.".format(language, dataset_name))

    data = Dataset(language, dataset_name)

    baseline = MonolingualCWI(language)
    baseline.train(data.train_set())

    if evaluation_split in ["dev", "both"]:
        print("\nResults on Development Data")
        # Fetch the split once and reuse it for both predictions and gold labels.
        dev_set = data.dev_set()
        predictions_dev = baseline.predict(dev_set)
        gold_labels_dev = dev_set['gold_label']
        print(report_binary_score(gold_labels_dev, predictions_dev, detailed_report))

    if evaluation_split in ["test", "both"]:
        print("\nResults on Test Data")
        test_set = data.test_set()
        predictions_test = baseline.predict(test_set)
        gold_labels_test = test_set['gold_label']
        print(report_binary_score(gold_labels_test, predictions_test, detailed_report))

    print()
def run_model(test_language, evaluation_split, detailed_report):
    """Train the CWI model on all languages but one, test on all datasets of
    the held-out language, and report results.

    Args:
        test_language: The language of the dataset to use for testing.
        evaluation_split: Which split(s) to evaluate on: "dev", "test", or "both".
        detailed_report: Whether to display a detailed report or just the overall score.
    """
    # Collect the training data for every language except the test language.
    train_data = []
    for language, datasets_names in datasets_per_language.items():
        if language == test_language:
            continue
        for dataset_name in datasets_names:
            lang_train_set = Dataset(language, dataset_name).train_set()
            if lang_train_set is None:
                print("No training data found for language {}.".format(language))
            else:
                train_data.append(lang_train_set)

    train_data = pd.concat(train_data)

    # Train the cross-lingual CWI model.
    cwi_model = CrosslingualCWI(list(datasets_per_language.keys()))
    cwi_model.train(train_data)

    # Test on every dataset of the held-out language.
    for dataset_name in datasets_per_language[test_language]:
        data = Dataset(test_language, dataset_name)

        print("\nTesting on  {} - {}.".format(test_language, dataset_name))

        if evaluation_split in ["dev", "both"]:
            print("\nResults on Development Data")
            # Fetch the split once and reuse it for both predictions and gold labels.
            dev_set = data.dev_set()
            predictions_dev = cwi_model.predict(dev_set)
            gold_labels_dev = dev_set['gold_label']
            print(report_binary_score(gold_labels_dev, predictions_dev, detailed_report))

        if evaluation_split in ["test", "both"]:
            print("\nResults on Test Data")
            test_set = data.test_set()
            predictions_test = cwi_model.predict(test_set)
            gold_labels_test = test_set['gold_label']
            print(report_binary_score(gold_labels_test, predictions_test, detailed_report))

    print()
def run_model(language, dataset_name, evaluation_split, detailed_report, ablate):
    """Train and test the CWI model for a particular dataset of a particular language, then report results.

    Args:
        language: The language of the dataset.
        dataset_name: The name of the dataset (all files should have it).
        evaluation_split: Which split(s) to evaluate on: "dev", "test", or "both".
        detailed_report: Whether to display a detailed report or just the overall score.
        ablate: Ablation flag passed through to the model; when truthy, section
            headers and the trailing blank line are suppressed so only scores print.
    """
    # In ablation runs only the raw score is printed, making runs easy to compare.
    score_only = bool(ablate)

    data = Dataset(language, dataset_name)

    baseline = MonolingualCWI(language, ablate)
    baseline.train(data.train_set())

    if evaluation_split in ["dev", "both"]:
        if not score_only:
            print("\nResults on Development Data")
        # Fetch the split once and reuse it for both predictions and gold labels.
        dev_set = data.dev_set()
        predictions_dev = baseline.predict(dev_set)
        gold_labels_dev = dev_set['gold_label']
        print(report_binary_score(gold_labels_dev, predictions_dev, detailed_report, score_only))

    if evaluation_split in ["test", "both"]:
        if not score_only:
            print("\nResults on Test Data")
        test_set = data.test_set()
        predictions_test = baseline.predict(test_set)
        gold_labels_test = test_set['gold_label']
        print(report_binary_score(gold_labels_test, predictions_test, detailed_report, score_only))

    if not score_only:
        print()
# Example #4
def run_model(selective_testing, translate, test_language, evaluation_split,
              detailed_report):
    """Train the CWI model on the selected languages, test on all datasets of
    a particular language, and report results.

    Args:
        selective_testing: Which training languages to use: 'ESG' trains on
            every language except the test language; otherwise any combination
            of the letters 'E' (English), 'S' (Spanish), 'G' (German).
        translate: Assigned to the test Dataset's ``translate`` attribute
            before the test split is read.
        test_language: The language of the dataset to use for testing.
        evaluation_split: Which split(s) to evaluate on: "dev", "test", or "both".
        detailed_report: Whether to display a detailed report or just the overall score.
    """
    if selective_testing == 'ESG':
        # Collect the training data for every language except the test language.
        train_data = []
        for language, datasets_names in datasets_per_language.items():
            if language == test_language:
                continue
            for dataset_name in datasets_names:
                lang_train_set = Dataset(language, dataset_name).train_set()
                if lang_train_set is None:
                    print("No training data found for language {}.".format(language))
                else:
                    train_data.append(lang_train_set)
        train_data = pd.concat(train_data)
    else:
        # Build the training set from the explicitly selected languages only.
        train_data = pd.DataFrame()
        if 'E' in selective_testing:
            train_data = pd.concat([
                train_data,
                Dataset('english', 'News').train_set(),
                Dataset('english', 'WikiNews').train_set(),
                Dataset('english', 'Wikipedia').train_set()
            ])
        if 'S' in selective_testing:
            train_data = pd.concat([train_data, Dataset('spanish', 'Spanish').train_set()])
        if 'G' in selective_testing:
            train_data = pd.concat([train_data, Dataset('german', 'German').train_set()])

    # Train the cross-lingual CWI model.
    cwi_model = CrosslingualCWI(list(datasets_per_language.keys()))
    cwi_model.train(train_data)

    # Test on every dataset of the test language.
    for dataset_name in datasets_per_language[test_language]:
        data = Dataset(test_language, dataset_name)

        print("\nTesting on  {} - {}.".format(test_language, dataset_name))

        if evaluation_split in ["dev", "both"]:
            print("\nResults on Development Data")

            if test_language == 'french':
                print("\nNo Dev Data for French, skipping...")
            else:
                # Fetch the split once and reuse it for predictions and gold labels.
                dev_set = data.dev_set()
                predictions_dev = cwi_model.predict(dev_set)
                gold_labels_dev = dev_set['gold_label']
                print(report_binary_score(gold_labels_dev, predictions_dev, detailed_report))

        if evaluation_split in ["test", "both"]:
            print("\nResults on Test Data")

            # Set translation mode before the test split is materialized.
            data.translate = translate
            test_set = data.test_set()
            predictions_test = cwi_model.predict(test_set)
            gold_labels_test = test_set['gold_label']
            print(report_binary_score(gold_labels_test, predictions_test, detailed_report))

    print()