# Example no. 1
def save_comparison_table(table, extractors, classifiers, filepath):
    '''
    Export a comparison table to CSV.

    Layout: each row corresponds to one feature-extractor set, each
    column to one classifier. The first column holds the extractor name,
    the header row holds the classifier names.

    :param table:       2D iterable of scores; table[i][j] is the score of
                        classifier j on extractor i (must be float-convertible)
    :param extractors:  feature extractors, one per row of `table`
    :param classifiers: classifiers, one per column of `table`
    :param filepath:    destination path for the CSV file
    '''
    # `with` guarantees the file is closed even if a write raises
    # (the original left the handle open on error).
    with open(filepath, 'w') as out_file:
        header = ','.join(['Features\\Classifier'] +
                          [get_classifier_name(clf) for clf in classifiers]) + '\n'
        out_file.write(header)

        for idx, row in enumerate(table):
            str_row = get_features_extr_name(extractors[idx]) + ","
            # Scores are formatted with 3 decimal places.
            str_row += ','.join(['{0:.3f}'.format(float(x)) for x in row]) + '\n'
            out_file.write(str_row)
# Example no. 2
def train(inputPath, splitsPath, outputPath, verbosity_level=1):
    '''
    Train one gender classifier per language ('ar', 'en', 'es').

    For each language, proceeds as follows:
        - loads the corresponding .pkl split file
        - trains a tfidf+svm text-based classifier with cross validation
        - retrains on the best train split and saves the model in outputPath

    :param inputPath:  Path to PAN18 dataset
    :param splitsPath: Path to dir containing the .pkl files produced by 'splitting.py'
    :param outputPath: Path to dir in which the output models will be saved
        NB. Create outputPath directory before using this function
    :param verbosity_level: 0 = silent; higher values print progress
    '''

    for lang in ['ar', 'en', 'es']:

        input_dir = join(inputPath, lang)
        output_dir = join(outputPath, lang)

        # Start each run from a clean per-language output directory.
        if exists(output_dir):
            rmtree(output_dir)
        makedirs(output_dir)

        # --------------------------------------------------------------------------
        # Load the .pkl split file
        with open(join(splitsPath, lang + ".pkl"), 'rb') as f:
            dic = load(f)

        # Load the tweets for this language (100 tweets aggregated per doc)
        Authors = parse_tweets_from_dir(input_dir=input_dir + "/",
                                        label=True,
                                        aggregation=100,
                                        splitDic=dic,
                                        verbosity_level=verbosity_level)

        if not Authors:
            abort_clean("Tweets loading failed")

        # --------------------------------------------------------------------------
        # Load the classifier

        t0 = time()
        classifier = get_classifier(classifier_str="svm",
                                    config=None,
                                    verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # Load the features extractor

        features_extr = get_features_extr(features_str_list="tfidf",
                                          language=lang,
                                          verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # Build the execution pipeline

        pipeline = get_pipeline(features_extr=features_extr,
                                classifier=classifier,
                                verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # Train the execution pipeline

        # train and cross validate results
        if verbosity_level:
            print("Model Training with cross validation\n")
        pipeline, scores, best_train_indices, best_test_indices = train_model_cross_validation(
            authors=Authors,
            label_type="gender",
            pipeline=pipeline,
            verbose=verbosity_level)

        if verbosity_level:
            print_scores(scores)

        save_scores(scores=scores,
                    output_dir=output_dir + "/",
                    filename=lang,
                    verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # Save the resulting model

        # Model file is named after the extractor/classifier pair, e.g. "tfidf+svm"
        filename = str(
            get_features_extr_name(features_extr) + "+" +
            get_classifier_name(classifier))

        # build the train corpus from the best cross-validation split
        # (best_test_indices is also returned above, should a held-out
        # evaluation be added later)
        authors = array(Authors)
        train_authors = authors[best_train_indices]
        train_corpus = build_corpus(authors=train_authors,
                                    label_type='gender',
                                    verbosity=verbosity_level)

        # retrain the model on the best split and persist it
        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=0)

        save_model(pipeline=pipeline,
                   output_dir=output_dir + "/",
                   filename=filename,
                   verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # End Execution
        if verbosity_level:
            print("Training task complete in " + str(round(time() - t0)) +
                  " s")