def load_classification_data(path, delimiter_text_labels='\t', delimiter_labels='\t', line_start_skip=None):
    # read the file and split every line into its text and label fields
    with codecs.open(path, 'r', encoding='utf8') as f:
        lines = [[t.strip() for t in l.split(delimiter_text_labels)] for l in f.readlines()]

    instances = []
    for line in lines:
        # optionally skip lines that start with a given prefix (e.g., headers or comments)
        if line_start_skip is not None and line[0].startswith(line_start_skip):
            continue
        text = data_helper.clean_str(line[0].strip()).split()
        if delimiter_text_labels == delimiter_labels:
            # text and labels share one delimiter: everything after the text is a label
            labels = line[1:]
        else:
            # labels sit in a single field, separated by their own delimiter
            labels = line[1].strip().split(delimiter_labels)
        instances.append((text, labels))
    return instances
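# Example usage (a minimal sketch; the file path and the space-separated label
# delimiter below are illustrative assumptions, not part of the original code):
#   instances = load_classification_data('data/train.tsv', delimiter_labels=' ')
#   texts = [t for t, _ in instances]
#   labels = [ls[0] for _, ls in instances]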
def train_cnn(texts, languages, labels, embeddings, parameters, model_serialization_path, emb_lang='default'):
    # preparing texts
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Preparing texts...', flush=True)
    texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts]

    # encoding languages (full name to abbreviation)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Encoding languages (full name to abbreviation)...', flush=True)
    langs = [map_lang(x) for x in languages]

    # preparing training examples
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Preparing training examples...', flush=True)
    x_train, y_train, dist_labels = data_shaper.prep_classification(
        texts_clean, labels, embeddings, embeddings_language=emb_lang,
        multilingual_langs=langs, numbers_token='<NUM/>', punct_token='<PUNC/>',
        add_out_of_vocabulary_terms=False)

    # defining the CNN model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Defining the CNN model...', flush=True)
    cnn_classifier = cnn.CNN(
        embeddings=(embeddings.emb_sizes[emb_lang], embeddings.lang_embeddings[emb_lang]),
        num_conv_layers=parameters["num_convolutions"],
        filters=parameters["filters"],
        k_max_pools=parameters["k_max_pools"],
        manual_features_size=0)
    cnn_classifier.define_model(
        len(x_train[0]), len(dist_labels), loss_functions.softmax_cross_entropy,
        len(embeddings.lang_vocabularies[emb_lang]),
        l2_reg_factor=parameters["reg_factor"],
        update_embeddings=parameters["update_embeddings"])
    cnn_classifier.define_optimization(learning_rate=parameters["learning_rate"])
    cnn_classifier.set_distinct_labels(dist_labels)

    # initializing a TensorFlow session
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Initializing a TensorFlow session...', flush=True)
    session = tf.InteractiveSession()
    session.run(tf.global_variables_initializer())

    # training the model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Training the model...', flush=True)
    simp_trainer = trainer.SimpleTrainer(cnn_classifier, session, build_feed_dict_func, eval_func, configuration_func=None)
    simp_trainer.train(
        list(zip(x_train, y_train)), parameters["batch_size"], parameters["num_epochs"],
        num_epochs_not_better_end=5,
        epoch_diff_smaller_end=parameters["epoch_diff_smaller_end"],
        print_batch_losses=True,
        eval_params={"dist_labels": dist_labels})

    # storing the model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Storing the model...', flush=True)
    cnn_classifier.serialize(session, model_serialization_path)
    session.close()
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Model training is done!', flush=True)
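# Example usage (a minimal sketch; the parameter values and paths below are
# illustrative assumptions, not defaults of the original code):
#   params = {"num_convolutions": 1, "filters": 100, "k_max_pools": 1,
#             "reg_factor": 0.01, "update_embeddings": False,
#             "learning_rate": 1e-3, "batch_size": 32, "num_epochs": 10,
#             "epoch_diff_smaller_end": 1e-4}
#   train_cnn(train_texts, train_languages, train_labels, embeddings,
#             params, 'models/topics.cnn')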
def topically_scale(filenames, texts, languages, embeddings, model_serialization_path, predictions_file_path, parameters, emb_lang='default', stopwords=[]):
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Loading classifier...", flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path, embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy, just_predict=True)
    simp_trainer = trainer.SimpleTrainer(cnn_classifier, session, build_feed_dict_func, None, configuration_func=None)

    classified_texts = {}
    items = list(zip(filenames, texts, [map_lang(x) for x in languages]))

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Topically classifying texts...", flush=True)
    for fn, text, lang in items:
        print(fn, flush=True)

        # split the text into sentences and clean them
        sentences = nltk.sent_tokenize(text)
        sents_clean = [data_helper.clean_str(s.strip()).split() for s in sentences]
        langs = [lang] * len(sentences)

        # preparing prediction examples (no labels at this point)
        x_test = data_shaper.prep_classification(
            sents_clean, None, embeddings, embeddings_language=emb_lang,
            multilingual_langs=langs, numbers_token='<NUM/>', punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)

        results = simp_trainer.test(list(zip(x_test, [None] * len(x_test))),
                                    parameters["batch_size"],
                                    batch_size_irrelevant=True,
                                    print_batches=True)
        pred_labs = get_prediction_labels(results, cnn_classifier.dist_labels)
        print("Predictions: ", flush=True)
        print(pred_labs, flush=True)

        classified_texts[fn] = list(zip(sentences, pred_labs, langs))
        print("Languages: " + str(langs), flush=True)
        print("Done with classifying: " + fn, flush=True)

    lines_to_write = []
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Topical scaling...", flush=True)
    for l in cnn_classifier.dist_labels:
        # for each file, concatenate all of its sentences predicted with the current topic label
        label_filtered = [(fn, classified_texts[fn][0][2],
                           ' '.join([sent_label[0] for sent_label in classified_texts[fn] if sent_label[1] == l]))
                          for fn in classified_texts]
        # keep only files with a non-trivial amount of text for this topic
        label_filtered = [x for x in label_filtered if len(x[2].strip()) > 50]

        # scale the topic only if more than three files contain enough text for it
        if len(label_filtered) > 3:
            print("Topic: " + l, flush=True)
            fns = [x[0] for x in label_filtered]
            langs = [x[1] for x in label_filtered]
            filt_texts = [x[2] for x in label_filtered]
            for i in range(len(fns)):
                io_helper.write_list(
                    os.path.join(os.path.dirname(predictions_file_path),
                                 fns[i].split(".")[0] + "_" + l.replace(" ", "-") + ".txt"),
                    [filt_texts[i]])
            label_scale = scale_efficient(fns, filt_texts,
                                          [inverse_map_lang(x) for x in langs],
                                          embeddings, None, parameters,
                                          emb_lang=emb_lang, stopwords=stopwords)
            lines_to_write.append("Scaling for class: " + l)
            lines_to_write.extend([k + " " + str(label_scale[k]) for k in label_scale])
            lines_to_write.append("\n")
        else:
            message = ("Topic: " + l + ": too few files contain text of this topic (i.e., class) to allow scaling for it.")
            lines_to_write.append(message)
            print(message, flush=True)

    io_helper.write_list(predictions_file_path, lines_to_write)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Topical scaling is done!', flush=True)
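# Example usage (a minimal sketch; file names, language names, parameter values
# and paths are illustrative assumptions, not part of the original code):
#   topically_scale(['doc_a.txt', 'doc_b.txt'], raw_texts, ['german', 'english'],
#                   embeddings, 'models/topics.cnn', 'output/topical_scale.txt',
#                   {"batch_size": 32}, stopwords=stopwords_de + stopwords_en)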
def test_cnn(texts, languages, labels, embeddings, model_serialization_path, predictions_file_path, parameters, emb_lang='default'):
    # loading the serialized model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Loading the serialized model...', flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path, embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy, just_predict=(labels is None))

    # preparing/cleaning the texts
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Preparing/cleaning the texts...', flush=True)
    texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts]

    # encoding languages (full name to abbreviation)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Encoding languages (full name to abbreviation)...', flush=True)
    langs = [map_lang(x) for x in languages]

    # preparing test examples
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Preparing test examples...', flush=True)
    if labels:
        x_test, y_test, dist_labels = data_shaper.prep_classification(
            texts_clean, labels, embeddings, embeddings_language=emb_lang,
            multilingual_langs=langs, numbers_token='<NUM/>', punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)
    else:
        x_test = data_shaper.prep_classification(
            texts_clean, labels, embeddings, embeddings_language=emb_lang,
            multilingual_langs=langs, numbers_token='<NUM/>', punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)

    simp_trainer = trainer.SimpleTrainer(cnn_classifier, session, build_feed_dict_func,
                                         None if not labels else eval_func,
                                         configuration_func=None)

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Starting test...', flush=True)
    results = simp_trainer.test(
        list(zip(x_test, y_test if labels else [None] * len(x_test))),
        parameters["batch_size"],
        eval_params={"dist_labels": cnn_classifier.dist_labels},
        batch_size_irrelevant=True)

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Getting prediction labels...', flush=True)
    # with gold labels, results is a (predictions, performance) pair; without labels it is just the predictions
    pred_labs = get_prediction_labels(results[0] if labels else results, cnn_classifier.dist_labels)

    if labels is None:
        io_helper.write_list(predictions_file_path, pred_labs)
    else:
        list_pairs = list(zip(pred_labs, labels))
        list_pairs.insert(0, ("Prediction", "Real label"))
        list_pairs.append(("Performance: ", str(results[1])))
        io_helper.write_list_tuples_separated(predictions_file_path, list_pairs)

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' Prediction is done!', flush=True)
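# Example usage (a minimal sketch; the paths and batch size are illustrative
# assumptions, not part of the original code):
#   test_cnn(test_texts, test_languages, test_labels, embeddings,
#            'models/topics.cnn', 'output/predictions.txt', {"batch_size": 32})
#   # pass labels=None to obtain predictions only, without evaluation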
# index the German and English test sentences by their line number
sent_index_de_test = {i: sent_lines_de_test[i] for i in range(len(sent_lines_de_test))}
sent_index_en_test = {i: sent_lines_en_test[i] for i in range(len(sent_lines_en_test))}

print("Length sent_index_de_test: " + str(len(sent_index_de_test)))
print("Length sent_index_en_test: " + str(len(sent_index_en_test)))

# clean and tokenize every indexed sentence
print("Cleaning dictionaries DE test...")
for k in sent_index_de_test:
    sent_index_de_test[k] = data_helper.clean_str(sent_index_de_test[k]).split()

print("Cleaning dictionaries EN test...")
for k in sent_index_en_test:
    sent_index_en_test[k] = data_helper.clean_str(sent_index_en_test[k]).split()

# map the German test sentences to embedding indices
print("Embedding indices lookup DE test...")
sent_index_embedded_de_test = data_shaper.prep_embeddings_lookup(
    sent_index_de_test, t_embeddings, stopwords=stopwords_de,
    punctuation=punctuation, lang='default', text_lang_prefix='de__',
    min_tokens=2, num_token="<NUM/>")