Example #1
def load_classification_data(path,
                             delimiter_text_labels='\t',
                             delimiter_labels='\t',
                             line_start_skip=None):
    # read the file and split each line into [text, label(s)] fields
    with codecs.open(path, 'r', encoding='utf8') as f:
        lines = [[t.strip() for t in l.split(delimiter_text_labels)]
                 for l in f.readlines()]
    instances = []
    for line in lines:
        # optionally skip header/comment lines
        if line_start_skip is not None and line[0].startswith(line_start_skip):
            continue
        text = data_helper.clean_str(line[0].strip()).split()
        if delimiter_text_labels == delimiter_labels:
            # same delimiter throughout: all remaining fields are labels
            labels = line[1:]
        else:
            # labels sit in the second field, split by their own delimiter
            labels = line[1].strip().split(delimiter_labels)
        instances.append((text, labels))
    return instances
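
For reference, a minimal usage sketch (the file name, its contents, and the skip prefix are hypothetical): a tab-separated file with one text and one label per line loads like this:

# Hypothetical usage: a TSV file where each line is "<text>\t<label>";
# lines starting with "#" are skipped as comments.
instances = load_classification_data('train.tsv', line_start_skip='#')
texts = [text for text, _ in instances]
labels = [labs[0] for _, labs in instances]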
Example #2
def train_cnn(texts,
              languages,
              labels,
              embeddings,
              parameters,
              model_serialization_path,
              emb_lang='default'):
    # preparing texts
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Preparing texts...',
          flush=True)
    texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts]
    # encoding languages (full name to abbreviation)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Encoding languages (full name to abbreviation)...',
          flush=True)
    langs = [map_lang(x) for x in languages]
    # preparing training examples
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Preparing training examples...',
          flush=True)
    x_train, y_train, dist_labels = data_shaper.prep_classification(
        texts_clean,
        labels,
        embeddings,
        embeddings_language=emb_lang,
        multilingual_langs=langs,
        numbers_token='<NUM/>',
        punct_token='<PUNC/>',
        add_out_of_vocabulary_terms=False)

    # defining the CNN model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Defining the CNN model...',
          flush=True)
    cnn_classifier = cnn.CNN(embeddings=(embeddings.emb_sizes[emb_lang],
                                         embeddings.lang_embeddings[emb_lang]),
                             num_conv_layers=parameters["num_convolutions"],
                             filters=parameters["filters"],
                             k_max_pools=parameters["k_max_pools"],
                             manual_features_size=0)
    cnn_classifier.define_model(
        len(x_train[0]),
        len(dist_labels),
        loss_functions.softmax_cross_entropy,
        len(embeddings.lang_vocabularies[emb_lang]),
        l2_reg_factor=parameters["reg_factor"],
        update_embeddings=parameters["update_embeddings"])
    cnn_classifier.define_optimization(
        learning_rate=parameters["learning_rate"])
    cnn_classifier.set_distinct_labels(dist_labels)

    # initializing a TensorFlow session
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Initializing a TensorFlow session...',
          flush=True)
    session = tf.InteractiveSession()  # TensorFlow 1.x API
    session.run(tf.global_variables_initializer())

    # training the model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Training the model...',
          flush=True)
    simp_trainer = trainer.SimpleTrainer(cnn_classifier,
                                         session,
                                         build_feed_dict_func,
                                         eval_func,
                                         configuration_func=None)
    simp_trainer.train(
        list(zip(x_train, y_train)),
        parameters["batch_size"],
        parameters["num_epochs"],
        num_epochs_not_better_end=5,
        epoch_diff_smaller_end=parameters["epoch_diff_smaller_end"],
        print_batch_losses=True,
        eval_params={"dist_labels": dist_labels})

    # storing the model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Storing the model...',
          flush=True)
    cnn_classifier.serialize(session, model_serialization_path)
    session.close()
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Model training is done!',
          flush=True)
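
The parameters dict that train_cnn reads implies the following keys; here is a hypothetical configuration (values are illustrative, and the exact format of "filters" and "k_max_pools" is an assumption):

# Illustrative values only; the keys are exactly those train_cnn reads above.
parameters = {
    "num_convolutions": 1,           # number of convolutional layers
    "filters": [(3, 100)],           # assumed format: (filter width, count)
    "k_max_pools": [1],              # assumed: k for k-max pooling per layer
    "reg_factor": 0.001,             # L2 regularization factor
    "update_embeddings": False,      # whether to fine-tune the embeddings
    "learning_rate": 1e-3,
    "batch_size": 50,
    "num_epochs": 10,
    "epoch_diff_smaller_end": 1e-4,  # assumed: early-stopping loss threshold
}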
Example #3
def topically_scale(filenames,
                    texts,
                    languages,
                    embeddings,
                    model_serialization_path,
                    predictions_file_path,
                    parameters,
                    emb_lang='default',
                    stopwords=[]):
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "  Loading classifier...",
          flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path,
        embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy,
        just_predict=True)
    simp_trainer = trainer.SimpleTrainer(cnn_classifier,
                                         session,
                                         build_feed_dict_func,
                                         None,
                                         configuration_func=None)

    classified_texts = {}
    items = list(zip(filenames, texts, [map_lang(x) for x in languages]))
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "  Topically classifying texts...",
          flush=True)
    for item in items:
        fn, text, lang = item
        print(fn, flush=True)
        # split the text into sentences
        sentences = nltk.sent_tokenize(text)
        sents_clean = [
            data_helper.clean_str(s.strip()).split() for s in sentences
        ]
        langs = [lang] * len(sentences)

        # preparing training examples
        x_test = data_shaper.prep_classification(
            sents_clean,
            None,
            embeddings,
            embeddings_language=emb_lang,
            multilingual_langs=langs,
            numbers_token='<NUM/>',
            punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)

        results = simp_trainer.test(list(zip(x_test, [None] * len(x_test))),
                                    parameters["batch_size"],
                                    batch_size_irrelevant=True,
                                    print_batches=True)

        pred_labs = get_prediction_labels(results, cnn_classifier.dist_labels)
        print("Predictions: ", flush=True)
        print(pred_labs, flush=True)

        classified_texts[fn] = list(zip(sentences, pred_labs, langs))

        print("Languages: " + str(langs), flush=True)
        print("Done with classifying: " + fn, flush=True)

    lines_to_write = []
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          "  Topical scaling...",
          flush=True)
    for l in cnn_classifier.dist_labels:
        # for each file, join the sentences predicted to belong to label l
        label_filtered = [(fn, classified_texts[fn][0][2], ' '.join([
            sent_label[0] for sent_label in classified_texts[fn]
            if sent_label[1] == l
        ])) for fn in classified_texts]
        # keep only files with a non-trivial amount of text on this topic
        label_filtered = [x for x in label_filtered if len(x[2].strip()) > 50]
        if len(label_filtered) > 3:
            print("Topic: " + l, flush=True)
            fns = [x[0] for x in label_filtered]
            langs = [x[1] for x in label_filtered]
            filt_texts = [x[2] for x in label_filtered]

            for i in range(len(fns)):
                # write each file's topic-filtered text next to the predictions
                out_path = os.path.join(
                    os.path.dirname(predictions_file_path),
                    fns[i].split(".")[0] + "_" + l.replace(" ", "-") + ".txt")
                io_helper.write_list(out_path, [filt_texts[i]])

            label_scale = scale_efficient(fns,
                                          filt_texts,
                                          [inverse_map_lang(x) for x in langs],
                                          embeddings,
                                          None,
                                          parameters,
                                          emb_lang=emb_lang,
                                          stopwords=stopwords)
            lines_to_write.append("Scaling for class: " + l)
            lines_to_write.extend(
                [k + " " + str(label_scale[k]) for k in label_scale])
            lines_to_write.append("\n")
        else:
            msg = ("Topic: " + l + ": too few files contain text of this "
                   "topic (i.e., class) to allow scaling for the topic.")
            lines_to_write.append(msg)
            print(msg, flush=True)

    io_helper.write_list(predictions_file_path, lines_to_write)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Topical scaling is done!',
          flush=True)
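
get_prediction_labels is used in Examples #3 and #4 but not shown in this listing; a minimal sketch of what it presumably does (map each score vector's argmax back to a label string), assuming results is an iterable of per-class score vectors:

import numpy as np

def get_prediction_labels(results, dist_labels):
    # Hypothetical sketch: for each prediction, take the index of the
    # highest score and map it to the corresponding distinct label.
    return [dist_labels[int(np.argmax(scores))] for scores in results]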
Example #4
def test_cnn(texts,
             languages,
             labels,
             embeddings,
             model_serialization_path,
             predictions_file_path,
             parameters,
             emb_lang='default'):
    # loading the serialized model
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Loading the serialized model...',
          flush=True)
    cnn_classifier, session = cnn.load_model(
        model_serialization_path,
        embeddings.lang_embeddings[emb_lang],
        loss_functions.softmax_cross_entropy,
        just_predict=(labels is None))

    # preparing/cleaning the texts
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Preparing/cleaning the texts...',
          flush=True)
    texts_clean = [data_helper.clean_str(t.strip()).split() for t in texts]
    # encoding languages (full name to abbreviation)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Encoding languages (full name to abbreviation)...',
          flush=True)
    langs = [map_lang(x) for x in languages]
    # preparing testing examples
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Preparing testing examples...',
          flush=True)
    if labels:
        # gold labels available: prep also returns encoded labels and label set
        x_test, y_test, dist_labels = data_shaper.prep_classification(
            texts_clean,
            labels,
            embeddings,
            embeddings_language=emb_lang,
            multilingual_langs=langs,
            numbers_token='<NUM/>',
            punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)
    else:
        # prediction only: no labels to encode
        x_test = data_shaper.prep_classification(
            texts_clean,
            labels,
            embeddings,
            embeddings_language=emb_lang,
            multilingual_langs=langs,
            numbers_token='<NUM/>',
            punct_token='<PUNC/>',
            add_out_of_vocabulary_terms=False,
            dist_labels=cnn_classifier.dist_labels,
            max_seq_len=cnn_classifier.max_text_length)

    simp_trainer = trainer.SimpleTrainer(cnn_classifier,
                                         session,
                                         build_feed_dict_func,
                                         None if not labels else eval_func,
                                         configuration_func=None)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '  Starting test...',
          flush=True)
    results = simp_trainer.test(
        list(zip(x_test, y_test if labels else [None] * len(x_test))),
        parameters["batch_size"],
        eval_params={"dist_labels": cnn_classifier.dist_labels},
        batch_size_irrelevant=True)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Getting prediction labels...',
          flush=True)
    pred_labs = get_prediction_labels(results[0] if labels else results,
                                      cnn_classifier.dist_labels)

    if labels is None:
        io_helper.write_list(predictions_file_path, pred_labs)
    else:
        list_pairs = list(zip(pred_labs, labels))
        list_pairs.insert(0, ("Prediction", "Real label"))
        list_pairs.append(("Performance: ", str(results[1])))
        io_helper.write_list_tuples_separated(predictions_file_path,
                                              list_pairs)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
          '  Prediction is done!',
          flush=True)
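
A hypothetical prediction-only call (paths are placeholders): passing labels=None loads the model with just_predict=True and writes only the predicted labels to predictions_file_path:

# Placeholder paths; only "batch_size" is read from parameters in this mode.
test_cnn(texts, languages, None, embeddings,
         'models/cnn_topics.model', 'out/predictions.txt',
         {"batch_size": 50})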
Example #5
# index the test sentences by their position
sent_index_de_test = dict(enumerate(sent_lines_de_test))
sent_index_en_test = dict(enumerate(sent_lines_en_test))

print("Length sent_index_de_test: " + str(len(sent_index_de_test)))
print("Length sent_index_en_test: " + str(len(sent_index_en_test)))

print("Cleaning dictionaries DE test...")
for k in sent_index_de_test:
    sent_index_de_test[k] = data_helper.clean_str(
        sent_index_de_test[k]).split()
print("Cleaning dictionaries EN test...")
for k in sent_index_en_test:
    sent_index_en_test[k] = data_helper.clean_str(
        sent_index_en_test[k]).split()

print("Embedding indices lookup DE test...")
sent_index_embedded_de_test = data_shaper.prep_embeddings_lookup(
    sent_index_de_test,
    t_embeddings,
    stopwords=stopwords_de,
    punctuation=punctuation,
    lang='default',
    text_lang_prefix='de__',
    min_tokens=2,
    num_token="<NUM/>")