Example 1
# NOTE: load_features_extr and get_pipeline are project-local helpers
# defined elsewhere in the same codebase.
from sklearn.pipeline import FeatureUnion

def get_features_extr(features_str_list, language=None, verbose=1):
    '''
    Returns a feature union object containing all the feature extractors
    referenced in features_str_list (a "+"-separated string of names).
    '''
    features_str_list = features_str_list.split("+")
    feat_extr_list = []
    # name of the combined feature extractor
    feat_extr_union_name = ""

    if verbose:
        print("Loading feature extractors ... ")

    # load each feature vectorizer and build the union;
    # each sub-extractor is named after its final estimator
    for feat_extr_str in features_str_list:
        feat_extr = load_features_extr(feat_extr_str, language, verbose)
        feat_extr_pipe_name = feat_extr[-1][0]
        feat_extr_pipe = get_pipeline(features_extr=feat_extr,
                                      classifier=None,
                                      verbose=verbose > 2)
        feat_extr_list.append((feat_extr_pipe_name, feat_extr_pipe))
        feat_extr_union_name += "+" + feat_extr_pipe_name

    # strip the leading "+"
    feat_extr_union_name = feat_extr_union_name[1:]
    feat_extr_union = FeatureUnion(feat_extr_list)
    res = (feat_extr_union_name, feat_extr_union)

    if verbose:
        print("Feature extractors loaded: " + feat_extr_union_name + "\n")
    return res
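
A minimal usage sketch (an assumption, not part of the source: "tfidf" is taken as a valid extractor name because Example 2 passes it, and the surrounding codebase is importable). The returned union behaves like any sklearn transformer:

# Hypothetical usage; the accepted extractor names depend on the
# project-local load_features_extr.
extr_name, extr_union = get_features_extr("tfidf", language="en", verbose=0)
extr_union.fit(["first document", "second document"])
X = extr_union.transform(["unseen document"])  # sparse feature matrix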
Example 2
# Assumed imports for the standard-library and numpy calls below.
# parse_tweets_from_dir, abort_clean, get_classifier, get_pipeline,
# train_model_cross_validation, train_model, build_corpus, print_scores,
# save_scores, save_model, get_features_extr_name and get_classifier_name
# are project-local helpers; get_features_extr is defined in Example 1.
from os import makedirs
from os.path import exists, join
from pickle import load
from shutil import rmtree
from time import time

from numpy import array

def train(inputPath, splitsPath, outputPath, verbosity_level=1):
    '''
    For each language, proceeds as follows:
        - loads the corresponding .pkl file
        - trains a text-based classifier on the 80% split
        - saves the resulting model in outputPath

    :param inputPath:  Path to the PAN18 dataset
    :param splitsPath: Path to the dir containing the .pkl files produced by 'splitting.py'
    :param outputPath: Path to the dir in which the output models will be saved
        NB: create the outputPath directory before using this function
    '''

    for lang in ['ar', 'en', 'es']:

        input_dir = join(inputPath, lang)
        output_dir = join(outputPath, lang)

        # reset the per-language output directory
        if exists(output_dir):
            rmtree(output_dir)
        makedirs(output_dir)

        # --------------------------------------------------------------------------
        # Load the .pkl split file
        with open(join(splitsPath, lang + ".pkl"), 'rb') as f:
            dic = load(f)
        # Load the tweets for this language
        Authors = parse_tweets_from_dir(input_dir=input_dir + "/",
                                        label=True,
                                        aggregation=100,
                                        splitDic=dic,
                                        verbosity_level=verbosity_level)

        if not Authors:
            abort_clean("Tweets loading failed")

        # --------------------------------------------------------------------------
        # Load the classifier

        t0 = time()
        classifier = get_classifier(classifier_str="svm",
                                    config=None,
                                    verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # Load the feature extractors

        features_extr = get_features_extr(features_str_list="tfidf",
                                          language=lang,
                                          verbose=verbosity_level)
        # --------------------------------------------------------------------------
        # Build the execution pipeline

        pipeline = get_pipeline(features_extr=features_extr,
                                classifier=classifier,
                                verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # Train the execution pipeline

        # train and cross-validate
        if verbosity_level:
            print("Model training with cross-validation\n")
        pipeline, scores, best_train_indices, best_test_indices = train_model_cross_validation(
            authors=Authors,
            label_type="gender",
            pipeline=pipeline,
            verbose=verbosity_level)

        if verbosity_level:
            print_scores(scores)

        save_scores(scores=scores,
                    output_dir=output_dir + "/",
                    filename=lang,
                    verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # Save the resulting model
        filename = str(
            get_features_extr_name(features_extr) + "+" +
            get_classifier_name(classifier))

        # build train corpus
        authors = array(Authors)
        train_authors = authors[best_train_indices]
        train_corpus = build_corpus(authors=train_authors,
                                    label_type='gender',
                                    verbosity=verbosity_level)
        # select the held-out authors of the best fold
        # (note: test_authors is currently unused after this point)
        test_authors = authors[best_test_indices]

        # refit the model on the best training split
        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=0)

        save_model(pipeline=pipeline,
                   output_dir=output_dir + "/",
                   filename=filename,
                   verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # End Execution
        if verbosity_level:
            print("Training task complete in " + str(round(time() - t0)) +
                  " s")