def get_lsa(config=None):
    '''
    Returns a latent semantic analysis vectorizer.
    If specified, follows the config to set up the vectorizer,
    else follows the default LSA setup.
    '''
    extractr_name = ""
    extractr = None

    if not config:
        extractr_name = "lsa-default"
        extractr = TruncatedSVD(        #---------- Default Values
            n_components=1000,          #---------- 2
            algorithm="randomized",
            n_iter=10,
            random_state=42,
            tol=0.)
    else:
        extractr_name = config["extractr_name"]
        try:
            extractr = TruncatedSVD(**(config["configuration"]))
        except Exception:
            abort_clean("Features Extractor configuration failed",
                        "Configuring " + config["extractr_type"] +
                        " with : " + str(config["configuration"]))

    res = (extractr_name, extractr)
    return res
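# Usage sketch (not part of the original project): illustrates the config
# dict shape the extractor factories above expect when `config` is given.
# The name "lsa-300" and the parameter values are illustrative assumptions.
def _demo_get_lsa_config():
    lsa_config = {
        "extractr_name": "lsa-300",
        "extractr_type": "lsa",
        "configuration": {"n_components": 300, "n_iter": 7,
                          "random_state": 42}
    }
    name, svd = get_lsa(lsa_config)   # -> ("lsa-300", TruncatedSVD(...))
    return name, svd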
def get_nbb(config=None):
    '''
    Returns a Naive Bayes classifier (Bernoulli implementation).
    If specified, follows the config to set up the NB classifier,
    else follows the default NB classifier setup.
    '''
    clf_name = ""
    clf = None

    if not config:
        clf_name = "nbb-default"
        clf = BernoulliNB(
            alpha=1.0,
            binarize=0.0,
            fit_prior=True,
            class_prior=None)
    else:
        clf_name = config["classifier_name"]
        try:
            clf = BernoulliNB(**(config["configuration"]))
        except Exception:
            abort_clean("Classifier configuration failed",
                        "Configuring " + config["classifier_type"] +
                        " with : " + str(config["configuration"]))

    return clf_name, clf
def get_tfidf(config=None):
    '''
    Returns a tfidf vectorizer.
    If specified, follows the config to set up the vectorizer,
    else follows the default tfidf setup.
    '''
    extractr_name = ""
    extractr = None

    if not config:
        extractr_name = "tfidf-default"
        extractr = TfidfTransformer(
            norm='l2',
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=False)
    else:
        extractr_name = config["extractr_name"]
        try:
            extractr = TfidfTransformer(**(config["configuration"]))
        except Exception:
            abort_clean("Features Extractor configuration failed",
                        "Configuring " + config["extractr_type"] +
                        " with : " + str(config["configuration"]))

    res = (extractr_name, extractr)
    return res
def get_svm(config=None):
    '''
    Returns an SVM classifier.
    If specified, follows the config to set up the SVM,
    else follows the default SVM setup.
    '''
    clf_name = ""
    clf = None

    if not config:
        clf_name = "svm-default"
        clf = LinearSVC(                    #---------- Default Values
            C=1.0,
            loss='squared_hinge',
            penalty='l1',                   #---------- l2
            dual=False,                     #---------- True
            tol=1e-4,
            multi_class='crammer_singer',   #---------- ovr
            fit_intercept=True,
            intercept_scaling=1,
            class_weight=None,
            verbose=0,
            random_state=None,
            max_iter=500)                   #---------- 1000
    else:
        clf_name = config["classifier_name"]
        try:
            clf = LinearSVC(**(config["configuration"]))
        except Exception:
            abort_clean("Classifier configuration failed",
                        "Configuring " + config["classifier_type"] +
                        " with : " + str(config["configuration"]))

    return clf_name, clf
def get_classifier(classifier_str, config=None, verbose=1):
    '''
    Returns the classifier specified in parameter.
    Available classifiers are :
        - nbb : Naive Bayes (Bernoulli)
        - mlp : Multi-Layered Perceptron
        - rfo : Random Forest
        - svm : Support Vector Machine

    A classifier can be specified : (TODO)
        - by its name --> a default classifier will be instantiated
        - by a path to a config file --> a custom classifier will be
          instantiated
    '''
    if verbose and not config:
        print("Starting loading classifier ... ")

    if config:
        classifier_str = config["classifier_type"]

    #--------------------------------------------------------------------------
    # Get required classifier
    clf_name = ""
    clf = None

    if classifier_str == "svm":
        clf_name, clf = get_svm(config)
    elif classifier_str == "mlp":
        clf_name, clf = get_mlp(config)
    elif classifier_str == "nbb":
        clf_name, clf = get_nbb(config)
    elif classifier_str == "rfo":
        clf_name, clf = get_rfo(config)
    else:
        try:
            config = load_config(classifier_str)
        except Exception:
            abort_clean("Cannot load the classifier configuration",
                        "Either the clf name is incorrect or the path is "
                        "invalid : " + classifier_str)
        if verbose:
            print("Loading classifier config from file")
        # recursive call with the loaded config
        return get_classifier("", config, verbose=verbose)

    #--------------------------------------------------------------------------
    # Return classifier
    if verbose:
        print("classifier loaded: '" + clf_name + "'\n")

    res = (clf_name, clf)
    return res
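# Usage sketch (the helper and the JSON path are illustrative, not part of
# the project): get_classifier accepts either a short name or a path to a
# config file, and always returns a (name, estimator) tuple.
def _demo_get_classifier():
    name, clf = get_classifier("svm", config=None, verbose=0)
    # or, from a (hypothetical) config file:
    # name, clf = get_classifier("config/svm_custom.json", verbose=0)
    return name, clf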
def load_model(filename):
    '''
    Loads a classifier (pipeline) from a file.
    '''
    # Load model
    try:
        pipe = joblib.load(filename)
    except Exception:
        abort_clean("failed to load the classifier")

    return pipe
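# Usage sketch (path and tweet text are placeholders): the persisted object
# is a full scikit-learn pipeline, so it can predict on raw text directly.
def _demo_load_model():
    pipe = load_model("models/en/tfidf+svm.pkl")   # hypothetical path
    return pipe.predict(["some tweet text"])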
def get_wc2(config=None):
    '''
    Returns a word count (bigram) vectorizer.
    If specified, follows the config to set up the vectorizer,
    else follows the default wc2 setup.
    '''
    extractr_name = ""
    extractr = None

    tokenizr = TweetTokenizer(
        preserve_case=True,
        strip_handles=True,
        reduce_len=False)

    if not config:
        extractr_name = "wc2-default"
        extractr = CountVectorizer(         #---------- Default Values
            input='content',
            encoding='utf-8',
            decode_error='ignore',
            strip_accents=None,
            analyzer='word',
            preprocessor=None,
            tokenizer=tokenizr.tokenize,    #---------- None
            ngram_range=(1, 2),             #---------- (1, 1)
            stop_words=None,
            lowercase=True,
            token_pattern=r"(?u)\b\w\w+\b",
            max_df=1.0,
            min_df=2,                       #---------- 1
            max_features=None,
            vocabulary=None,
            binary=False,
            dtype=np.int64)
    else:
        extractr_name = config["extractr_name"]
        try:
            # Adjustments due to JSON incompatibility
            config["configuration"]["ngram_range"] = tuple(
                config["configuration"]["ngram_range"])
            config["configuration"]["dtype"] = np.int64
            config["configuration"]["tokenizer"] = tokenizr.tokenize
            extractr = CountVectorizer(**(config["configuration"]))
        except Exception:
            abort_clean("Features Extractor configuration failed",
                        "Configuring " + config["extractr_type"] +
                        " with : " + str(config["configuration"]))

    res = (extractr_name, extractr)
    return res
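# Sketch of why the "JSON incompatibility" adjustments above are needed:
# JSON has no tuple type (ngram_range round-trips as a list) and cannot
# encode np.int64 or a bound method, so those keys are patched in after
# loading. The literals below are illustrative.
def _demo_wc2_config_roundtrip():
    import json
    raw = json.loads('{"ngram_range": [1, 2], "min_df": 2}')
    raw["ngram_range"] = tuple(raw["ngram_range"])   # [1, 2] -> (1, 2)
    return raw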
def get_mlp(config=None):
    '''
    Returns a Multi-Layered Perceptron classifier.
    If specified, follows the config to set up the MLP classifier,
    else follows the default MLP classifier setup.
    '''
    clf_name = ""
    clf = None

    if not config:
        clf_name = "mlp-default"
        clf = MLPClassifier(
            hidden_layer_sizes=(100,),
            activation="relu",
            solver='adam',
            alpha=0.0001,
            batch_size='auto',
            learning_rate="constant",
            learning_rate_init=0.001,
            power_t=0.5,
            max_iter=200,
            shuffle=True,
            random_state=None,
            tol=1e-4,
            verbose=False,
            warm_start=False,
            momentum=0.9,
            nesterovs_momentum=True,
            early_stopping=False,
            validation_fraction=0.1,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-8)
    else:
        clf_name = config["classifier_name"]
        try:
            # Adjustment due to JSON incompatibility (list vs tuple)
            config["configuration"]["hidden_layer_sizes"] = tuple(
                config["configuration"]["hidden_layer_sizes"])
            clf = MLPClassifier(**(config["configuration"]))
        except Exception:
            abort_clean("Classifier configuration failed",
                        "Configuring " + config["classifier_type"] +
                        " with : " + str(config["configuration"]))

    return clf_name, clf
def load_features_extr(features_str, verbose=1):
    '''
    Returns a list of vectorizers matching the specified features_str.
    Available features extractors are :
        - wc2   : Word count - bigram
        - tfidf : TF-IDF
        - lsa   : Latent Semantic Analysis

    A features extractor can be specified :
        - by its name --> a default extractor will be instantiated
        - by a path to a config file --> a custom extractor will be
          instantiated
    '''
    feat_extractors = []

    #--------------------------------------------------------------------------
    # Get required features extractor
    if features_str == "wc2":
        feat_extractors.append(get_wc2(None))

    elif features_str == "tfidf":
        feat_extractors.append(get_wc2(None))
        feat_extractors.append(get_tfidf(None))

    elif features_str == "lsa":
        feat_extractors.append(get_wc2(None))
        feat_extractors.append(get_tfidf(None))
        feat_extractors.append(get_lsa(None))

    else:
        try:
            config = load_config(features_str)
        except Exception:
            abort_clean("Cannot load the extractors configuration",
                        "Either the extr name is incorrect or the path is "
                        "invalid : " + features_str)
        # Load the config from a file
        if verbose:
            print("Loading features extractor config from file ")
        feat_extractors = load_features_extr_from_file(config,
                                                       verbose=verbose)

    #--------------------------------------------------------------------------
    # Return features extractors
    return feat_extractors
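# Sketch of how the cumulative extractor lists above are chained; the
# project's get_pipeline helper does this assembly, the sklearn Pipeline
# below only illustrates the resulting ordering:
#   "lsa" -> CountVectorizer -> TfidfTransformer -> TruncatedSVD -> clf
def _demo_extractor_chain():
    from sklearn.pipeline import Pipeline
    steps = load_features_extr("lsa", verbose=0) + [get_svm(None)]
    return Pipeline(steps)   # each step is already a (name, object) tuple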
def get_rfo(config=None):
    '''
    Returns a Random Forest classifier.
    If specified, follows the config to set up the classifier,
    else follows the default Random Forest setup.
    '''
    clf_name = ""
    clf = None

    if not config:
        clf_name = "rfo-default"
        clf = RandomForestClassifier(
            n_estimators=10,
            criterion="gini",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.,
            max_features="auto",
            max_leaf_nodes=None,
            min_impurity_split=1e-7,
            bootstrap=True,
            oob_score=False,
            n_jobs=-1,                      #---------- 1
            random_state=None,
            verbose=0,
            warm_start=False,
            class_weight=None)
    else:
        clf_name = config["classifier_name"]
        try:
            clf = RandomForestClassifier(**(config["configuration"]))
        except Exception:
            abort_clean("Classifier configuration failed",
                        "Configuring " + config["classifier_type"] +
                        " with : " + str(config["configuration"]))

    return clf_name, clf
def optimize(options):
    '''
    Optimizes the given classifier and/or features extractor on a specified
    list of parameters.
    Will proceed as follows :
        - loads the dataset
        - builds the corpus
        - loads the parameters for tuning
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains and compares the different classifiers on the corpus
        - outputs the best set of parameters found
    '''
    #--------------------------------------------------------------------------
    # Check basic requirements
    if not options["label-type"]:
        abort_clean("Label type not specified", "expected 'v' or 'g'")

    if not options["hyper-parameters"]:
        abort_clean("hyper parameters not specified")

    if not options["aggregation"]:
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(
        input_dir=options["input-dir"],
        output_dir=options["processed-tweets-dir"],
        label=True,
        aggregation=options["aggregation"],
        verbosity_level=options["verbosity"])

    if not Authors:
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the optimization parameters
    try:
        params = load_config(options["hyper-parameters"])
    except Exception:
        abort_clean("Configuration couldn't be loaded",
                    "given path: " + options["hyper-parameters"])

    #--------------------------------------------------------------------------
    # Load the classifier
    t0 = time()
    classifier = get_classifier(classifier_str=params["classifier-call"],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors
    features_extr = get_features_extr(
        features_str_list=params["features-extractr-call"],
        verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline
    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    # Set the classifier and the parameters to be tuned
    tuning_parameters = get_opt_parameters(params)
    scores = params["scores"]

    if options["verbosity"]:
        print("Starting the optimization process ...")

    # Launch the tuning of hyper parameters
    for score in scores:
        print("Tuning hyper-parameters for %s" % score)

        optimize_corpus = build_corpus(authors=Authors,
                                       label_type=options["label-type"],
                                       verbosity=options["verbosity"])

        clf_optimizer = GridSearchCV(estimator=pipeline,
                                     param_grid=tuning_parameters,
                                     scoring='%s_macro' % score,
                                     fit_params=None,
                                     n_jobs=-1,
                                     pre_dispatch='2*n_jobs',
                                     iid=True,
                                     cv=None,
                                     refit=True,
                                     verbose=options["verbosity"],
                                     error_score='raise',
                                     return_train_score=True)

        # Start optimisation
        clf_optimizer.fit(optimize_corpus["tweets"],
                          optimize_corpus["labels"])

        if options["verbosity"]:
            print("Best parameters set found on development set:")
            best_parameters = clf_optimizer.best_params_
            for param_name in sorted(best_parameters.keys()):
                print("\t%s: %r" % (param_name, best_parameters[param_name]))
            print()

        if options["verbosity"] > 1:
            print("Grid scores on development set:")
            means = clf_optimizer.cv_results_['mean_test_score']
            stds = clf_optimizer.cv_results_['std_test_score']
            # note: loop variable renamed so it does not shadow `params`
            for mean, std, cv_params in zip(
                    means, stds, clf_optimizer.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r" %
                      (mean, std * 2, cv_params))

        # saving results
        save_optimisation_results(grid=clf_optimizer,
                                  output_dir=options["output-dir"],
                                  score=score,
                                  verbose=options["verbosity"])
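# Sketch of the hyper-parameters config optimize() consumes. The top-level
# keys are taken from the code above; the "parameters" entry and its
# sklearn-style "<step-name>__<param>" keys are assumptions about what
# get_opt_parameters returns.
_DEMO_OPT_PARAMS = {
    "classifier-call": "svm",
    "features-extractr-call": "tfidf",
    "scores": ["f1"],
    "parameters": {"svm-default__C": [0.1, 1.0, 10.0]}   # hypothetical key
}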
    # - input-dir            : input directory for tweet loading
    # - label-type           : which labels to train on
    # - no-cross-validation  : assess if the classifier should be cross-valid
    # - output-dir           : output directory for resulting files
    # - processed-tweets-dir : (legacy) directory for the parsed tweets
    # - verbosity            : noise level on the terminal
    trainer_opt = {
        "aggregation": args.aggregation,
        "classifier": args.classifier,
        "cross-validation": args.cross_validation,
        "features": args.features,
        "gensim": args.gensim,
        "hyper-parameters": args.hyper_parameters,
        "input-dir": args.input_dir,
        "label-type": args.label_type,
        "output-dir": args.output_dir,
        "processed-tweets-dir": args.processed_tweets_dir,
        "token-level": args.token_level,
        "verbosity": args.verbosity
    }

    from act_trainer import train
    train(trainer_opt)

#------------------------------------------------------------------------------
# [Contextual] Unknown Request
else:
    abort_clean("ERROR : Unknown user request.",
                "Request found : " + usr_request)
def evaluate(options):
    '''
    Evaluates the results of a classification in the context of PAN17.
    The input directory must be structured according to PAN17 specifications.
    Will proceed as follows :
        - loads the author files
        - loads the truth files (one per language)
        - compares the predicted labels with the truth
        - outputs the results
    '''
    # PAN 17 specifics
    language_dirs = get_language_dir_names()

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not options["truth-dir"]:
        abort_clean("truth directory not specified")

    #--------------------------------------------------------------------------
    # Load the author files
    if options["verbosity"]:
        print("Loading authors files ...")
        t0 = time()

    Authors = []
    for l_dir in language_dirs:
        l_path = format_dir_name(options["input-dir"] + l_dir)
        file_name_list = [f for f in listdir(l_path)
                          if isfile(join(l_path, f))]
        for file_name in file_name_list:
            auth = load_author_file(file_path=l_path + file_name,
                                    verbose=options["verbosity"] > 1)
            Authors.append(auth)

    if options["verbosity"]:
        print("Files loaded : " + str(len(Authors)))
        print("Loading author files --- success in %.3f seconds\n" %
              (time() - t0))

    #--------------------------------------------------------------------------
    # Load the truth files
    if options["verbosity"]:
        print("Loading truth files ...")
        t0 = time()

    truth = dict()
    for lang in language_dirs:
        lang_dir = format_dir_name(options["truth-dir"] + lang)
        try:
            truth_file = open(lang_dir + "truth.txt")
        except Exception:
            abort_clean("Can't open truth file",
                        "Couldn't open " + lang_dir + "truth.txt")

        truth_lines = [x.strip().split(':::')
                       for x in truth_file.readlines()]
        attrs = dict()
        for l in truth_lines:
            attrs[l[0]] = {"gender": l[1], "variety": l[2]}
        truth[lang] = attrs

    if options["verbosity"]:
        print("Files loaded : " + str(len(truth)))
        print("Loading truth files --- success in %.3f seconds\n" %
              (time() - t0))

    #--------------------------------------------------------------------------
    # Compute results
    if options["verbosity"]:
        print("Computing results ...")
        t0 = time()

    # preparing result data-structure
    results = dict()
    for lang in language_dirs:
        var_labels = get_variety_labels(lang)
        var_confusion_matrix = [[0 for x in var_labels] for y in var_labels]
        gdr_labels = get_gender_labels()
        gdr_confusion_matrix = [[0 for x in gdr_labels] for y in gdr_labels]
        results[lang] = {
            "n_files": 0,
            "gdr-labels": gdr_labels,
            "gdr-confusion-matrix": gdr_confusion_matrix,
            "gdr-positive-eval": 0,
            "var-labels": var_labels,
            "var-confusion-matrix": var_confusion_matrix,
            "var-positive-eval": 0
        }

    # Starting computation
    for auth in Authors:
        lang_res = results[auth["lang"]]
        auth_truth = truth[auth["lang"]][auth["id"]]

        results[auth["lang"]]["n_files"] += 1
        auth_gdr_eval = auth_truth["gender"] == auth["gender"]
        auth_var_eval = auth_truth["variety"] == auth["variety"]

        var_labels = lang_res["var-labels"]
        lang_res["var-confusion-matrix"][var_labels.index(
            auth_truth["variety"])][var_labels.index(auth["variety"])] += 1
        gdr_labels = lang_res["gdr-labels"]
        lang_res["gdr-confusion-matrix"][gdr_labels.index(
            auth_truth["gender"])][gdr_labels.index(auth["gender"])] += 1

        results[auth["lang"]]["gdr-positive-eval"] += 1 if auth_gdr_eval else 0
        results[auth["lang"]]["var-positive-eval"] += 1 if auth_var_eval else 0

    if options["verbosity"]:
        print("Computing results --- success in %.3f seconds\n" %
              (time() - t0))

    #--------------------------------------------------------------------------
    # Save results
    save_evaluation_results(results=results,
                            input_dir=options["input-dir"],
                            output_dir=options["output-dir"],
                            verbose=options["verbosity"])
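# Sketch of the truth.txt line format consumed by evaluate() (fields joined
# by ':::'); the id and label values below are invented for illustration.
def _demo_parse_truth_line():
    line = "author123:::female:::canada"
    author_id, gender, variety = line.strip().split(':::')
    return {author_id: {"gender": gender, "variety": variety}}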
def train_model_cross_validation(authors, label_type, pipeline, verbose=1):
    '''
    Takes a pipeline and trains it on the specified corpus.
    Runs a K-fold cross-validation in order to evaluate the quality of the
    model.
    Returns the best trained pipeline (in terms of macro f-score).
    '''
    labels = get_labels(lang=authors[0]["lang"], label_type=label_type)
    if not labels:
        abort_clean("Could not extract labels")

    if verbose:
        print("Labels extraction succeeded.")
        print("Available labels : " + " / ".join(labels) + "\n")

    if verbose:
        t0 = time()
        print("Starting model Cross Validation ... (this may take some time)")

    confusion = array([[0 for x in range(len(labels))]
                       for y in range(len(labels))])
    scores = []
    best_f_score = 0
    best_pipeline = None
    scores_micro = []
    scores_macro = []

    # start K-fold cross-validation
    n_run = 1
    k_fold = KFold(n_splits=10, shuffle=True)
    authors = array(authors)
    for train_indices, test_indices in k_fold.split(authors):

        # build train corpus
        train_authors = authors[train_indices]
        train_corpus = build_corpus(authors=train_authors,
                                    label_type=label_type,
                                    verbosity=verbose)

        # build test corpus
        test_authors = authors[test_indices]

        # train model
        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=0)

        # test model
        truths = []
        predictions = []
        for author in test_authors:
            var_classes, var_predictions = predict_author_proba(
                author=author, model=pipeline)
            var_max_idx = var_predictions.index(max(var_predictions))
            label_predicted = var_classes[var_max_idx]
            predictions.append(label_predicted)
            truths.append(author[label_type])

        # compute metrics
        confusion += confusion_matrix(truths, predictions, labels=labels)
        score_micro = f1_score(truths, predictions,
                               labels=labels, average="micro")
        score_macro = f1_score(truths, predictions,
                               labels=labels, average="macro")

        if verbose:
            print("Fold " + str(n_run) + " : micro_f1=" + str(score_micro) +
                  " macro_f1=" + str(score_macro))

        # store for averaging
        scores_micro.append(score_micro)
        scores_macro.append(score_macro)
        n_run += 1

        # save the pipeline if better than the current best
        if score_macro > best_f_score:
            best_pipeline = clone(pipeline, True)
            best_f_score = score_macro

    if verbose:
        print("Model Cross Validation complete in %.3f seconds.\n" %
              (time() - t0))

    scores = {
        "mean_score_micro": sum(scores_micro) / len(scores_micro),
        "mean_score_macro": sum(scores_macro) / len(scores_macro),
        "confusion_matrix": confusion,
        "best_macro_score": best_f_score,
        "labels": labels
    }

    return best_pipeline, scores
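# Usage sketch (illustrative, assuming authors were loaded with
# parse_tweets_from_dir): cross-validates a tfidf+svm pipeline and keeps the
# best fold's model.
def _demo_cross_validation(authors):
    extr = load_features_extr("tfidf", verbose=0)
    clf = get_classifier("svm", config=None, verbose=0)
    pipe = get_pipeline(features_extr=extr, classifier=clf, verbose=0)
    best_pipe, scores = train_model_cross_validation(
        authors=authors, label_type="gender", pipeline=pipe, verbose=0)
    return best_pipe, scores["mean_score_macro"]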
def train(inputPath, splitsPath, outputPath, verbosity_level=1):
    '''
    For each language, proceeds as follows:
        - takes as input the corresponding .pkl file
        - trains a text-based classifier on the 80% split
        - saves the resulting model in outputPath

    :param inputPath:  Path to the PAN18 dataset
    :param splitsPath: Path to the dir containing the .pkl files produced
                       by 'splitting.py'
    :param outputPath: Path to the dir in which the output models will be
                       saved

    NB. Create the outputPath directory before using this function.
    '''
    for lang in ['ar', 'en', 'es']:
        input_dir = join(inputPath, lang)
        output_dir = join(outputPath, lang)
        # print("input_dir ", input_dir)
        # print("output_dir ", output_dir)

        if exists(output_dir):
            rmtree(output_dir)
        makedirs(output_dir)

        #----------------------------------------------------------------------
        # Load the .pkl file
        with open(splitsPath + "/" + lang + ".pkl", 'rb') as f:
            dic = load(f)

        # Load the tweets in one language
        Authors = parse_tweets_from_dir(
            input_dir=inputPath + "/" + lang + "/",
            label=True,
            aggregation=100,
            splitDic=dic,
            verbosity_level=verbosity_level)

        if not Authors:
            abort_clean("Tweets loading failed")

        #----------------------------------------------------------------------
        # Load the classifier
        t0 = time()
        classifier = get_classifier(classifier_str="svm",
                                    config=None,
                                    verbose=verbosity_level)

        #----------------------------------------------------------------------
        # Load the features extractors
        features_extr = get_features_extr(features_str_list="tfidf",
                                          language=lang,
                                          verbose=verbosity_level)

        #----------------------------------------------------------------------
        # Build the execution pipeline
        pipeline = get_pipeline(features_extr=features_extr,
                                classifier=classifier,
                                verbose=verbosity_level)

        #----------------------------------------------------------------------
        # Train the execution pipeline

        # train and cross-validate results
        if verbosity_level:
            print("Model Training with cross validation\n")

        pipeline, scores, best_train_indices, best_test_indices = \
            train_model_cross_validation(authors=Authors,
                                         label_type="gender",
                                         pipeline=pipeline,
                                         verbose=verbosity_level)

        if verbosity_level:
            print_scores(scores)

        save_scores(scores=scores,
                    output_dir=output_dir + "/",
                    filename=lang,
                    verbose=verbosity_level)

        #----------------------------------------------------------------------
        # Save the resulting model
        filename = str(get_features_extr_name(features_extr) +
                       "+" + get_classifier_name(classifier))

        # build train corpus
        authors = array(Authors)
        train_authors = authors[best_train_indices]
        train_corpus = build_corpus(authors=train_authors,
                                    label_type='gender',
                                    verbosity=verbosity_level)

        # build test corpus
        test_authors = authors[best_test_indices]

        # train model
        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=0)

        save_model(pipeline=pipeline,
                   output_dir=output_dir + "/",
                   filename=filename,
                   verbose=verbosity_level)

    #--------------------------------------------------------------------------
    # End Execution
    if verbosity_level:
        print("Training task complete in " + str(round(time() - t0)) + " s")
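# Usage sketch (paths are placeholders): splitting.py is expected to have
# produced ar.pkl / en.pkl / es.pkl under splitsPath beforehand, and the
# output directory must already exist.
#
#   train("data/pan18/", "splits/", "models/", verbosity_level=1)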
def train(options):
    '''
    Trains a specified classifier on a specified dataset using specified
    feature extractors.
    Will proceed as follows :
        - loads the dataset
        - builds the corpus
        - loads the classifier
        - loads the features extractor
        - builds the execution pipeline
        - trains the classifier on the corpus
        - cross-validates the resulting model [optional]
        - saves the resulting model [optional]
    '''
    #--------------------------------------------------------------------------
    # Check basic requirements
    if not options["label-type"]:
        abort_clean("Labels not specified", "expected 'l', 'g' or 'v'")

    if not options["features"] and not options["gensim"]:
        abort_clean("Features not specified")

    if not options["classifier"]:
        abort_clean("Classifier not specified")

    if not options["aggregation"]:
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(
        input_dir=options["input-dir"],
        output_dir=options["processed-tweets-dir"],
        label=True,
        aggregation=options["aggregation"],
        verbosity_level=options["verbosity"])

    if not Authors:
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifier
    t0 = time()
    classifier = get_classifier(classifier_str=options["classifier"][0],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors
    features_extr = None
    if not options["gensim"]:
        features_extr = get_features_extr(
            features_str_list=options["features"][0],
            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline
    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Train the execution pipeline

    # train and cross-validate results
    if options["cross-validation"]:
        if options["verbosity"]:
            print("Model Training with cross validation\n")

        if options["gensim"]:
            model, pipeline, scores = train_model_gensim_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                config=options["hyper-parameters"],
                token_level=options["token-level"],
                verbose=options["verbosity"])
        else:
            pipeline, scores = train_model_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                verbose=options["verbosity"])

        if options["verbosity"]:
            print_scores(scores)

        if options["output-dir"]:
            if options["gensim"]:
                filename = str("doc2vec" +
                               "-siz_" + str(model[0].vector_size) +
                               "-win_" + str(model[0].window) +
                               "-cnt_" + str(model[0].min_count) +
                               get_classifier_name(classifier))
            else:
                filename = str(get_features_extr_name(features_extr) +
                               "+" + get_classifier_name(classifier))
            save_scores(scores=scores,
                        output_dir=options["output-dir"],
                        filename=filename,
                        verbose=options["verbosity"])

    # train without validation --> output-dir required
    else:
        if options["verbosity"]:
            print("Model Training without cross validation\n")

        if not options["output-dir"]:
            abort_clean("No output directory specified.",
                        "Training without persisting is not allowed")

        train_corpus = build_corpus(authors=Authors,
                                    label_type=options["label-type"],
                                    verbosity=options["verbosity"])

        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=options["verbosity"])

        #----------------------------------------------------------------------
        # Save the resulting model
        if options["gensim"]:
            filename = "doc2vec+" + get_classifier_name(classifier)
        else:
            filename = str(get_features_extr_name(features_extr) +
                           "+" + get_classifier_name(classifier))

        save_model(pipeline=pipeline,
                   output_dir=options["output-dir"],
                   filename=filename,
                   verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # End Execution
    if options["verbosity"]:
        print("Training task complete in " + str(round(time() - t0)) + " s")
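# Sketch of the options dict train() expects; the keys are collected from
# the checks above and the CLI mapping, the values are illustrative.
_DEMO_TRAIN_OPTIONS = {
    "input-dir": "data/pan17/en/",    # hypothetical path
    "processed-tweets-dir": None,
    "label-type": "v",                # 'v' variety / 'g' gender
    "features": ["tfidf"],
    "classifier": ["svm"],
    "gensim": False,
    "hyper-parameters": None,
    "token-level": "word",
    "aggregation": 100,
    "cross-validation": True,
    "output-dir": "output/",
    "verbosity": 1
}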
def classify(options):
    '''
    Classifies a dataset respecting the PAN'17 specification.
    Predicts both language variety and gender.
    Will proceed as follows :
        - loads the dataset
        - [contextual] for each subdirectory of the dataset, loads the
          related classifiers
        - predicts the different labels for each author within the loaded
          corpus
        - outputs the result files
        - [contextual] checks its results
    '''
    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["classification-type"] and
            options["classification-type"] in ["loose", "successive"]):
        abort_clean("Classification type incorrectly specified")

    if options["verbosity"]:
        print('Classification type is ' + options["classification-type"])

    # PAN 17 specifics
    languages = get_language_dir_names()
    for lang in languages:

        if options["verbosity"]:
            print('---------------------------------------')
            print("Language up for classification: '" + lang + "'\n")

        processed_tweets_dir = (
            "" if not options["processed-tweets-dir"]
            else format_dir_name(options["processed-tweets-dir"] + lang))
        classifier_dir_path = format_dir_name(options["classifiers-dir"] +
                                              lang)
        output_subdir_path = format_dir_name(options["output-dir"] + lang)

        #----------------------------------------------------------------------
        # Load the tweets
        Authors = parse_tweets_from_dir(
            input_dir=format_dir_name(options["input-dir"] + lang),
            output_dir=processed_tweets_dir,
            label=False,
            aggregation=options["aggregation"],
            verbosity_level=options["verbosity"])

        if not Authors:
            abort_clean("Tweets loading failed")

        #----------------------------------------------------------------------
        # Load the classifiers
        classifiers = load_classifiers(
            classifier_dir_path=classifier_dir_path,
            classification_type=options["classification-type"],
            verbose=options["verbosity"])

        #----------------------------------------------------------------------
        # Start classification
        if options["verbosity"]:
            print("Starting authors classification ...")
            t0 = time()

        classify_authors(Authors, classifiers,
                         options["classification-type"],
                         options["verbosity"])

        if options["verbosity"] > 1:
            for auth in Authors:
                print(auth["id"] + ":::" + auth["gender"] + ":::" +
                      auth["variety"])

        if options["verbosity"]:
            print("Classification of '" + lang +
                  "' complete in %.3f seconds" % (time() - t0))
            print('---------------------------------------\n')

        create_dir(output_subdir_path)
        for auth in Authors:
            save_author_file(author=auth,
                             output_dir=output_subdir_path,
                             verbose=options["verbosity"] > 1)

        # for memory issues, free the classifier objects
        gc.collect()
def train_model_gensim_cross_validation(authors, label_type, pipeline,
                                        config="", token_level="word",
                                        verbose=1):
    '''
    Takes a doc2vec model and trains it on the specified corpus.
    Takes a classifier and trains it on the doc2vec model vectors.
    Runs a K-fold cross-validation in order to evaluate the quality of the
    overall model.
    Returns the best trained pipeline (in terms of macro f-score).
    '''
    labels = get_labels(lang=authors[0]["lang"], label_type=label_type)
    if not labels:
        abort_clean("Could not extract labels")

    if verbose:
        print("Labels extraction succeeded.")
        print("Available labels : " + " / ".join(labels) + "\n")

    if verbose:
        t0 = time()
        print("Starting model Cross Validation ... (this may take some time)")

    # load doc2vec conf
    conf = []
    if config:
        conf = load_config(config)["extractors"][0]  # legacy conf files
        if verbose:
            print("loading doc2vec config file from disk :")
            print(" - vector_size = " +
                  str(conf["configuration"]["vector_size"]))
            print(" - window = " + str(conf["configuration"]["window"]))
            print(" - min_count = " + str(conf["configuration"]["min_count"]))

    # load the tokenizer
    tknzr = Tokenizer(token_level)
    if verbose:
        print("Selected token level : " + token_level + "\n")

    # K-fold parameters
    confusion = array([[0 for x in range(len(labels))]
                       for y in range(len(labels))])
    scores = []
    best_f_score = 0
    best_pipeline = None
    best_model = None
    scores_micro = []
    scores_macro = []
    n_run = 1
    k_fold = KFold(n_splits=10, shuffle=True)
    authors = array(authors)

    # start K-fold cross-validation
    for train_indices, test_indices in k_fold.split(authors):

        # import gensim lib (heavy load)
        from gensim import models as gensim_models

        # get doc2vec models
        model_dm = get_doc2vec(conf, 1, verbose)
        model_pv = get_doc2vec(conf, 0, verbose)

        # build train corpus
        train_authors = authors[train_indices]
        train_corpus = build_corpus(authors=train_authors,
                                    label_type=label_type,
                                    verbosity=verbose)

        # build test corpus
        test_authors = authors[test_indices]

        # learn the vocabulary (tokenisation of each tweet)
        tweets = list(zip(train_corpus["labels"], train_corpus["tweets"]))
        processed_tweets = []
        idxs = [0 for l in labels]
        for t in tweets:
            prefix = t[0] + "_" + str(idxs[labels.index(t[0])])
            idxs[labels.index(t[0])] += 1
            processed_tweets.append(
                gensim_models.doc2vec.LabeledSentence(
                    words=tknzr.tokenize(t[1]), tags=[prefix]))
        tweets = processed_tweets

        model_dm.build_vocab(tweets)
        model_pv.build_vocab(tweets)

        # train the doc2vec models
        shuffle(tweets)
        model_dm.train(sentences=tweets,
                       total_examples=model_dm.corpus_count,
                       epochs=100,
                       start_alpha=0.025,
                       end_alpha=0.0025)
        model_dm.delete_temporary_training_data()
        model_pv.train(sentences=tweets,
                       total_examples=model_pv.corpus_count,
                       epochs=100,
                       start_alpha=0.025,
                       end_alpha=0.0025)
        model_pv.delete_temporary_training_data()

        # train dataset conversion (doc -> vectors)
        train_vectors = zeros((sum(idxs), model_dm.vector_size * 2))
        train_labels = []
        for i, tag in enumerate(model_dm.docvecs.doctags):
            train_vectors[i] = concatenate(
                (model_dm.docvecs[tag], model_pv.docvecs[tag]), axis=0)
            train_labels.append(tag.split('_')[0])
        train_labels = array(train_labels)

        # train classifier
        pipeline.fit(train_vectors, train_labels)

        # test models
        truths = []
        predictions = []
        for author in test_authors:
            # test dataset conversion (doc -> vectors)
            tweet_vectors = [
                concatenate((model_dm.infer_vector(tknzr.tokenize(tweet)),
                             model_pv.infer_vector(tknzr.tokenize(tweet))),
                            axis=0)
                for tweet in author["tweets"]]
            author_tmp = {"tweets": tweet_vectors}
            var_classes, var_predictions = predict_author_proba(
                author=author_tmp, model=pipeline)
            var_max_idx = var_predictions.index(max(var_predictions))
            label_predicted = var_classes[var_max_idx]
            predictions.append(label_predicted)
            truths.append(author[label_type])

        # compute metrics
        confusion += confusion_matrix(truths, predictions, labels=labels)
        score_micro = f1_score(truths, predictions,
                               labels=labels, average="micro")
        score_macro = f1_score(truths, predictions,
                               labels=labels, average="macro")

        if verbose:
            print("Fold " + str(n_run) + " : micro_f1=" + str(score_micro) +
                  " macro_f1=" + str(score_macro))

        # store for averaging
        scores_micro.append(score_micro)
        scores_macro.append(score_macro)
        n_run += 1

        # save the pipeline if better than the current best
        if score_macro > best_f_score:
            best_model = [model_dm, model_pv]
            best_pipeline = clone(pipeline, True)
            best_f_score = score_macro

    if verbose:
        print("Model Cross Validation complete in %.3f seconds.\n" %
              (time() - t0))

    scores = {
        "mean_score_micro": sum(scores_micro) / len(scores_micro),
        "mean_score_macro": sum(scores_macro) / len(scores_macro),
        "confusion_matrix": confusion,
        "best_macro_score": best_f_score,
        "labels": labels
    }

    return best_model, best_pipeline, scores
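# Sketch of the document-vector layout used above: each document is encoded
# as the concatenation of its DM and PV-DBOW doc2vec vectors, so the
# classifier sees vectors of length 2 * vector_size. Random stand-ins
# replace the two model outputs for illustration.
def _demo_concat_doc_vectors(vector_size=100):
    from numpy import concatenate
    from numpy.random import rand
    dm_vec, pv_vec = rand(vector_size), rand(vector_size)
    return concatenate((dm_vec, pv_vec), axis=0)   # shape: (2*vector_size,)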
def parse_tweets_from_dir_2(input_dir, list_authors, output_dir=None,
                            label=True, aggregation=100, verbosity_level=1):
    '''
    Parses all the xml files directly in the input_dir (no recursion).
    Retrieves the attributes of the author stored in the truth file.
    If specified, the parsed files will be written into the output_dir.
    The verbosity level specifies the amount of content displayed:
        0 - nothing
        1 - main steps
        2 - files parsed and stats about filtering / tweets available per
            class
        3 - all the parsed content
    Returns a list containing all the author objects contained within the
    input_dir.
    '''
    # vars
    Authors = []
    t0 = time()
    n_files = 0
    n_files_parsed = 0
    n_files_filtered = 0
    n_files_infos_retrieved = 0

    # ---------------------------- FILES LISTING
    if verbosity_level:
        t0 = time()
        print("Starting files Listing ...")

    try:
        # f[:-4] drops the ".xml" suffix (str.strip would remove any of
        # the characters '.', 'x', 'm', 'l' from both ends instead)
        xml_files = [f for f in listdir(input_dir + "/text")
                     if (isfile(join(input_dir + "/text", f)) and
                         f[-4:] == ".xml") and
                     f[:-4] in list_authors]
    except Exception:
        abort_clean("Files listing --- failure",
                    "Maybe the directory specified is incorrect ?")

    if verbosity_level:
        print("Files found : " + str(len(xml_files)))
        print("Files listing --- success in %.3f seconds\n" % (time() - t0))

    # ---------------------------- FILES PROCESSING
    if verbosity_level:
        t0 = time()
        print("Starting files processing ...")

    n_files = len(xml_files)
    if output_dir:
        create_dir(output_dir)

    for f in xml_files:
        author = None
        save_file = output_dir + f if output_dir else None
        try:
            author = parse_file(file_to_parse=input_dir + "/text/" + f,
                                aggregation=aggregation,
                                file_to_save=save_file,
                                verbose=verbosity_level > 2)
        except Exception:
            if verbosity_level > 1:
                print("  Parsing file : " + f + " --- failure")
            continue
        if verbosity_level > 1:
            print("  Parsing file : " + f + " --- success")
        n_files_parsed += 1
        author["id"] = f[:-4]
        Authors.append(author)

    if verbosity_level:
        print("Parsed files : " + str(n_files_parsed) + " out of " +
              str(n_files))
        print("Files Parsing --- success in %.3f seconds\n" % (time() - t0))

    # ---------------------------- AUTHOR ATTRIBUTES RETRIEVING
    if label:
        if verbosity_level:
            t0 = time()
            print("Starting Author Attributes Retrieval ...")

        try:
            truth_file = open(input_dir + "/" + author['lang'] + ".txt")
        except Exception:
            abort_clean("Author Attributes Retrieval --- failure",
                        "Couldn't open truth file")

        truth_lines = [x.strip().split(':::')
                       for x in truth_file.readlines()]
        attrs = dict()
        for l in truth_lines:
            attrs[l[0]] = l[1:]

        for idx, author in enumerate(Authors):
            author["gender"] = attrs[author["id"]][0]
            # author["variety"] = attrs[author["id"]][1]
            if author["gender"]:
                n_files_infos_retrieved += 1
                if verbosity_level > 1:
                    print("  author " + Authors[idx]["id"] +
                          " information retrieved : Gender=" +
                          Authors[idx]["gender"] +
                          " Language=" + Authors[idx]["lang"])

        if verbosity_level > 1:
            print("Retrieved Information : " +
                  str(n_files_infos_retrieved) + " out of " + str(n_files))
            print("Author Attributes Retrieval --- success in %.3f seconds\n"
                  % (time() - t0))

    # ---------------------------- TWEET FILTERING
    if verbosity_level:
        t0 = time()
        print("Starting Tweets Filtering ...")

    for author in Authors:
        if verbosity_level > 1:
            print("  author " + author["id"] + " filtering")
        try:
            author["tweets"] = filter_tweets(author, verbosity_level > 1)
        except Exception:
            continue
        n_files_filtered += 1

    if verbosity_level:
        print("Filtered files : " + str(n_files_filtered) + " out of " +
              str(n_files))
        print("Tweets Filtering --- success in %.3f seconds\n" %
              (time() - t0))

    # ---------------------------- RETURNING PROCESSED DATA
    if verbosity_level:
        print("Tweets available : " +
              str(sum([len(a["tweets"]) for a in Authors])) + "\n")

    return Authors
def compare(options):
    '''
    Compares a set of specified classifiers on a specified dataset using
    specified features.
    Will proceed as follows :
        - loads the dataset
        - builds the corpus
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains the different classifiers on the corpus
        - saves the scores obtained by each classifier on each set of
          features
    '''
    #--------------------------------------------------------------------------
    # Check basic requirements
    if not options["label-type"]:
        abort_clean("label type not specified", "expected 'l', 'g' or 'v'")

    if not options["features"]:
        abort_clean("Features not specified")

    if not options["classifier"]:
        abort_clean("Classifier not specified")

    #--------------------------------------------------------------------------
    # Load the tweets
    Authors = parse_tweets_from_dir(
        input_dir=options["input-dir"],
        output_dir=options["processed-tweets-dir"],
        label=True,
        aggregation=options["aggregation"],
        verbosity_level=options["verbosity"])

    if not Authors:
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifiers
    classifier_str_list = []
    if isinstance(options["classifier"], list):
        classifier_str_list = options["classifier"]
    else:
        classifier_str_list = [options["classifier"]]

    classifiers = [get_classifier(classifier_str=clf,
                                  config=None,
                                  verbose=False)
                   for clf in classifier_str_list]

    if options["verbosity"]:
        print("Classifiers Loaded: ")
        for clf in classifiers:
            print("  - '" + clf[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Load the features extractors
    extractors_str_list = options["features"]
    extractors = [get_features_extr(features_str_list=extr,
                                    verbose=False)
                  for extr in extractors_str_list]

    if options["verbosity"]:
        print("Features extractors Loaded: ")
        for extrs in extractors:
            print("  - '" + extrs[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Prepare the result containers
    F1_micro = [[0 for x in classifiers] for y in extractors]
    F1_macro = [[0 for x in classifiers] for y in extractors]
    Time_train = [[0 for x in classifiers] for y in extractors]

    output_dir = options["output-dir"]
    individual_scores_dir = output_dir + "indiv_scores/"
    create_dir(individual_scores_dir)

    #--------------------------------------------------------------------------
    # Start the model comparison
    t0 = time()
    total_iteration = len(classifiers) * len(extractors)
    if options["verbosity"]:
        print("Starting model comparisons")

    # Loop over each features-extractor/classifier pair
    for idx_extr, extr in enumerate(extractors):
        extr_name = get_features_extr_name(extr)

        for idx_clf, clf in enumerate(classifiers):
            clf_name = get_classifier_name(clf)

            if options["verbosity"]:
                iteration_number = idx_extr * len(classifiers) + idx_clf + 1
                print("Iteration : " + str(iteration_number) + "/" +
                      str(total_iteration))
                print("Testing : Features: " + extr_name +
                      " | Classifier: " + clf_name)

            t0_step = time()

            # Build pipeline
            pipeline = get_pipeline(features_extr=extr,
                                    classifier=clf,
                                    verbose=False)

            # Start training + cross-validation
            try:
                model, step_scores = train_model_cross_validation(
                    authors=Authors,
                    label_type=options["label-type"],
                    pipeline=pipeline,
                    verbose=False)
            except Exception:
                print("some error occurred - the features extracted and the "
                      "classifier are probably incompatible\n")
                continue

            if options["verbosity"]:
                print("Training complete in " +
                      str(round(time() - t0_step)) + " seconds")
                print_scores(step_scores)
                print()

            # Save scores
            save_scores(scores=step_scores,
                        output_dir=individual_scores_dir,
                        filename=extr_name + "+" + clf_name,
                        verbose=False)

            F1_micro[idx_extr][idx_clf] = step_scores["mean_score_micro"]
            F1_macro[idx_extr][idx_clf] = step_scores["mean_score_macro"]
            Time_train[idx_extr][idx_clf] = round(time() - t0_step)

    # Save the final micro and macro measures and the execution times
    save_comparison_table(F1_micro, extractors, classifiers,
                          output_dir + "micro.csv")
    save_comparison_table(F1_macro, extractors, classifiers,
                          output_dir + "macro.csv")
    save_comparison_table(Time_train, extractors, classifiers,
                          output_dir + "time.csv")

    if options["verbosity"]:
        print("Comparison task complete in " + str(round(time() - t0)) + " s")