Example #1
def cosine_similarity(books_data, DF, tf_idf, total_vocab, total_vocab_size, k,
                      query):
    final_dict = dict()

    D = zero_vector(tf_idf, total_vocab, books_data.shape[0], total_vocab_size)

    # print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    # print("\nQuery:", query)
    print("")
    # print(tokens)

    d_cosines = []

    query_vector = gen_vector(DF, tokens, books_data, total_vocab)

    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))

    out = np.array(d_cosines).argsort()[-k:][::-1]

    # print("")

    # print(out)
    for each in out:
        case = {
            each: {
                'bookname': books_data['bookname'][each],
                'author': books_data['author'][each],
                'chapter': books_data['chapter'][each]
            }
        }
        final_dict.update(case)

    return final_dict
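The cosine_sim helper called above is defined elsewhere in the project. A minimal sketch of what it is assumed to compute (plain cosine similarity between two NumPy vectors) would be:

import numpy as np

def cosine_sim(a, b):
    # Dot product divided by the product of the vector norms.
    # Yields NaN when either vector is all zeros, which Example #4 below guards against.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))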
Example #2
def load():
    print("Reading training data...")
    df = pd.read_csv(DATASET_PATH + TRAIN_FILE_NAME)

    print("Preprocessing training data...")
    question1, question2, labels, tokenizer = pp.preprocess(df, mode='train')

    return question1, question2, labels, tokenizer
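pp.preprocess is project-specific and not shown here; a rough sketch of what a 'train' mode for a question-pair dataset might do (the column names question1, question2, is_duplicate and the sequence length are assumptions) is:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess(df, mode='train', max_len=30):
    # Assumed columns: 'question1', 'question2', 'is_duplicate' (Quora-style data).
    # Fit one tokenizer on both question columns, then pad to a fixed length.
    q1_text = df['question1'].astype(str).tolist()
    q2_text = df['question2'].astype(str).tolist()
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(q1_text + q2_text)
    q1 = pad_sequences(tokenizer.texts_to_sequences(q1_text), maxlen=max_len)
    q2 = pad_sequences(tokenizer.texts_to_sequences(q2_text), maxlen=max_len)
    if mode == 'train':
        return q1, q2, df['is_duplicate'].values, tokenizer
    return q1, q2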
Example #3
def main():
    if len(sys.argv) == 3:
        start = time.time()
        print('Preprocessing the training data..')
        train = load_data(sys.argv[1])
        print(train.head(3))
        train = preprocess_data.preprocess(train, train=True)
        train = balance_training_data.balance_data(train)

        print('Preprocessing the test data..')
        test = load_data(sys.argv[2])
        print(test.head(3))
        test = preprocess_data.preprocess(test, train=False)

        print('Building the model..')
        classification.two_step_classification(train, test)
        end = time.time()

        print("The whole thing took {0:.2f} minutes".format(
            (end - start) / 60))
    else:
        print(
            "You need to provide two files: one set of training data and one set of test data."
        )
Example #4
def cosine_similarity(books_data, DF, tf_idf, total_vocab, total_vocab_size, k,
                      query):
    final_dict = dict()

    D = zero_vector(tf_idf, total_vocab, books_data.shape[0], total_vocab_size)

    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("")

    d_cosines = []

    query_vector = gen_vector(DF, tokens, books_data, total_vocab)

    for d in D:
        cosine_rate = cosine_sim(query_vector, d)

        if math.isnan(cosine_rate):
            d_cosines.append(0.0)
        else:
            d_cosines.append(cosine_rate)

    out = np.array(d_cosines).argsort()[-k:][::-1]

    cosine_val = sorted(d_cosines, reverse=True)[:k]

    for each in out:
        result = np.where(out == each)
        if cosine_val[result[0][0]] != 0.0:
            case = {
                each: {
                    'bookname': books_data['bookname'][each],
                    'author': books_data['author'][each],
                    'chapter': books_data['chapter'][each],
                    'similarity': cosine_val[result[0][0]]
                }
            }
            final_dict.update(case)

    return final_dict
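gen_vector is the piece that turns the tokenized query into a vector over the same vocabulary as the document matrix D. A simplified sketch of a typical TF-IDF query vector (the project's exact weighting may differ; DF is assumed to map each term to its document frequency, total_vocab to be a list of terms) is:

import numpy as np
from collections import Counter

def gen_vector(DF, tokens, books_data, total_vocab):
    # One TF-IDF weight per vocabulary term; terms missing from the query stay zero.
    Q = np.zeros(len(total_vocab))
    counter = Counter(tokens)
    N = books_data.shape[0]
    for token in np.unique(tokens):
        tf = counter[token] / len(tokens)
        df = DF.get(token, 0)
        idf = np.log((N + 1) / (df + 1))
        if token in total_vocab:
            Q[total_vocab.index(token)] = tf * idf
    return Q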
Example #5
parser = argparse.ArgumentParser()
parser.add_argument('-embedding', action='store', dest='embedding')
fasttext_name = parser.parse_args().embedding

embedding_dim = 100
learning_rate = 0.001145
bs = 256
drop = 0.2584
max_length = 1431
max_num_words = 23140
filters = [6]
num_filters = 2426
nclasses = 451

x_train, y_train, x_val, y_val, embedding_matrix = preprocess(
    fasttext_name, embedding_dim, max_length, max_num_words)

print("Starting Training ...")

filter_sizes = list(filters)

embedding_layer = Embedding(max_num_words,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=True)

sequence_input = Input(shape=(max_length, ), dtype='uint16')
embedded_sequences = embedding_layer(sequence_input)
Example #6
    batchSize = 64

    nEpochs = 20

    print('Training!')

    roc_auc = train_neural_network(model,
                                   batchSize,
                                   nEpochs,
                                   x_train,
                                   y_train,
                                   x_test=x_test,
                                   y_test=y_test)
    plot_gini_results(roc_auc.gini, roc_auc.gini_val)
    nn_output = model.predict(x_validation).flatten()
    export_csv_file("output.csv", validation_ids, nn_output)


x_train_file_name, x_validate_file_name = 'Data/train.csv', 'Data/test.csv'

x_train, y_train, x_validation, validation_ids = generate_features(
    x_train_file_name, x_validate_file_name)

x_train = np.array(x_train).astype(np.float32)
y_train = np.array(y_train).astype(np.int32)
x_validation = np.array(x_validation).astype(np.float32)
validation_ids = np.array(validation_ids).astype(np.int32)

x_train, y_train, x_validation = preprocess(x_train, y_train, x_validation)

puertoPredictions(x_train, y_train, x_validation, validation_ids)
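The preprocess call here is again project-specific; given that it receives the raw feature matrices and returns them for training and prediction, a plausible (hypothetical) sketch is simple standardisation fitted on the training split only:

from sklearn.preprocessing import StandardScaler

def preprocess(x_train, y_train, x_validation):
    # Hypothetical: scale features using statistics from the training data only.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_validation = scaler.transform(x_validation)
    return x_train, y_train, x_validation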
Example #7
threshold = 0.1  # Minimum Manhattan LSTM distance between two outputs
# for them to be classified as semantically similar

# Load trained model
print("Loading model...")
model = tf.keras.models.load_model(
    CHECKPOINT_PATH + MODEL_FILE_NAME,
    custom_objects={"manh_lstm_distance": manh_lstm_distance})

# Read test file
print("Reading test data...")
df = pd.read_csv(DATASET_PATH + TEST_FILE_NAME, skiprows=skiprows, nrows=nrows)

# Preprocess test data
print("Preprocessing test data...")
question1, question2 = pp.preprocess(df, mode='predict')

# Predict Manhattan LSTM distances
print("Predicting Manhattan LSTM distances...")
manh_lstm_distance = model.predict([question1, question2], verbose=1)

# Make binary predictions
print("Making binary predictions...")
prediction = manh_lstm_distance > threshold
prediction = prediction.astype(int)

# Print predictions
data = {
    'Manhattan LSTM distances': list(manh_lstm_distance),
    'Prediction': list(prediction)
}
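manh_lstm_distance has to be passed as a custom object because the saved model references it; in Manhattan-LSTM (MaLSTM) style models this is usually the exponential negative Manhattan distance between the two sentence encodings, roughly:

import tensorflow.keras.backend as K

def manh_lstm_distance(vectors):
    # exp(-||h1 - h2||_1): values in (0, 1], higher means the questions are more similar.
    h1, h2 = vectors
    return K.exp(-K.sum(K.abs(h1 - h2), axis=1, keepdims=True))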
Example #8
                        type=bool,
                        default=False,
                        help='Set to True if trying to reproduce results')
    parser.add_argument('--plots',
                        type=bool,
                        default=False,
                        help='Set to True if wanting to generate the plots')
    parser.add_argument('--manual_eval',
                        type=bool,
                        default=True,
                        help='Set to True to perform manual evaluation')
    args = parser.parse_args()

    df = pd.read_csv(args.input)
    data = df[(df.week == args.week)]
    data = preprocess(data)
    data['stems'] = [' '.join(text) for text in data.stems]

    print('This week has {0} articles.'.format(len(data)))

    vec_matrix_pca = tfidf_creation(data)

    k = elbow_plot(vec_matrix_pca)

    centroids, labels = clustering_kmeans(k, vec_matrix_pca, args.week,
                                          args.reproduce)
    data['labels'] = labels

    if args.plots:
        clusters_plot(centroids, labels, vec_matrix_pca, args.week)
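tfidf_creation produces the matrix that the elbow plot and k-means steps work on; judging by the _pca suffix it presumably combines TF-IDF vectorisation with PCA, something along these lines (vectoriser settings and component count are assumptions):

from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_creation(data, n_components=50):
    # Vectorise the joined stems, then project to a small dense space for clustering.
    vec_matrix = TfidfVectorizer().fit_transform(data['stems'])
    return PCA(n_components=n_components).fit_transform(vec_matrix.toarray())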
Example #9
    args = parser.parse_args()
    config = config_file.config_preprocess[args.configurationID]

    # 1. Make new directory for the mel spectrograms
    config["melspectrogram_path"] = config['identifier'] + \
        "/%s_mels/" % (config['identifier'])
    # set audio representations folder
    if not os.path.exists(config_file.DATA_PATH +
                          config['melspectrogram_path']):
        os.makedirs(config_file.DATA_PATH + config['melspectrogram_path'])

    # 2. Find audio files to preprocess
    files_to_preprocess = []
    with open(config_file.DATA_PATH + config["index_file"]) as f:
        for line in f:
            file_id, audio = line.strip().split("\t")
            melspectrogram = audio[:audio.rfind(".")] + ".pk"  # .npy or .pk
            # (id, path to audio file, path to mel spectrogram)
            files_to_preprocess.append(
                (file_id, config["audio_path"] + audio, config_file.DATA_PATH +
                 config["melspectrogram_path"] + melspectrogram))

    # 3. Compute mel spectrograms
    preprocess(files_to_preprocess, config)
    # 4. Save the parameters in a json
    with open(
            config_file.DATA_PATH + config['melspectrogram_path'] +
            "config.json", "w") as fp:
        json.dump(config, fp)