Example #1
    def compute_vectors(self):
        """
        Collect the sentence vector representations into a single matrix.

        :return: numpy array of shape (1, 200, 134), one row per sentence.
        """
        if not self.quiet:
            print("Computing vectorial representation...")
        vectors = np.zeros((1, 200, 134))

        embeddings = sentence_embeddings(self.sentences)
        centr_scores = centrality_scores(embeddings)
        tf_idfs = tf_idf(self.sentences, os.getcwd() + "/data/idfs.dat")
        # Features per sentence: position, relative length, tf-idf, numerical data, centrality, title similarity.
        for j in range(min(len(self.sentences), 200)):
            sent = self.sentences[j]

            position_score = (len(self.sentences) - j) / len(self.sentences)
            length_score = len(sent) / max(len(snt) for snt in self.sentences)
            tf_idf_score = 0
            numerical_score = 0
            centrality_score = centr_scores[j]
            title_sim_score = np.inner(np.array(embeddings[j]),
                                       np.array(embeddings[-1]))

            # Computing centrality and tf_idf score.
            terms = list(set(stem_and_stopword(sent)))
            for term in terms:
                # Due to preprocessing errors a term may be missing from the tf_idf dictionary.
                if term in tf_idfs:
                    tf_idf_score += tf_idfs[term]

                if term.isdigit():
                    numerical_score += 1

            # Some errors in the preprocessing may lead to zero terms, so it is necessary to avoid division by zero.
            if len(terms):
                tf_idf_score /= len(terms)
            else:
                tf_idf_score = 0

            vectors[0, j, :] = np.append(
                np.array([
                    position_score, length_score, tf_idf_score,
                    numerical_score, centrality_score, title_sim_score
                ]), np.array(embeddings[j]))

        return vectors
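Each row built above packs six hand-crafted scores followed by the sentence embedding; given the fixed width of 134, the embedding appears to be 128-dimensional. A minimal sketch (a hypothetical helper, not part of the original code) that splits a stored row back into those parts under that assumption:

import numpy as np

def split_sentence_vector(row):
    # Hypothetical helper: the 6 + 128 split is inferred from the append() call above.
    scores = dict(zip(
        ["position", "length", "tf_idf", "numerical", "centrality", "title_sim"],
        row[:6]))
    embedding = row[6:]  # remaining 128 values: the sentence embedding
    return scores, embedding

vectors = np.zeros((1, 200, 134))  # same shape as in compute_vectors
scores, embedding = split_sentence_vector(vectors[0, 0, :])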
Example #2
def store_full_sentence_matrices(index, ref):
    """
    Store the document or reference sentence matrices for the extractive summarization task.
    """
    if index < 0:
        docs, references, _ = get_duc()
        doc_path = "/dataset/duc/duc_doc_sent_matrix.dat"
        ref_path = "/dataset/duc/duc_ref_sent_matrix.dat"
    else:
        docs_pas_lists, refs_pas_lists = get_pas_lists(index)
        docs = get_sources_from_pas_lists(docs_pas_lists)
        references = get_sources_from_pas_lists(refs_pas_lists)
        dataset_path = "/dataset/nyt/" + str(index) + "/nyt" + str(index)
        doc_path = dataset_path + "_doc_sent_matrix.dat"
        ref_path = dataset_path + "_ref_sent_matrix.dat"

    docs_no = len(docs)  # First dimension, documents number.
    # Second dimension, max document length (sparse), fixed in case of nyt.
    max_sent_no = 200
    # Third dimension, vector representation dimension.
    sent_vec_len = 134

    # The matrices are initialized with zeros, then filled in with a vector for each document sentence.
    refs_3d_matrix = np.zeros((docs_no, max_sent_no, sent_vec_len))
    docs_3d_matrix = np.zeros((docs_no, max_sent_no, sent_vec_len))

    # Select which set of texts to process: references or source documents.
    if ref:
        doc_list = references
    else:
        doc_list = docs

    for i, doc in enumerate(doc_list):
        print("Processing doc " + str(i) + "/" + str(len(doc_list)))
        doc = text_cleanup(doc)
        # Splitting sentences (by dot).
        sentences = tokens(doc)
        embeddings = sentence_embeddings(sentences)
        centr_scores = centrality_scores(embeddings)
        tf_idfs = tf_idf(sentences, os.getcwd() + "/dataset/duc/duc_idfs.dat")
        # Features per sentence: position, relative length, tf-idf, numerical data, centrality, title similarity.
        # Cap at max_sent_no so indexing stays within the matrix bounds.
        for j in range(min(len(sentences), max_sent_no)):
            sent = sentences[j]

            position_score = (len(sentences) - j) / len(sentences)
            length_score = len(sent) / max(len(snt) for snt in sentences)
            tf_idf_score = 0
            numerical_score = 0
            centrality_score = centr_scores[j]
            title_sim_score = np.inner(np.array(embeddings[j]),
                                       np.array(embeddings[-1]))

            # Computing centrality and tf_idf score.
            terms = list(set(stem_and_stopword(sent)))
            for term in terms:
                # Due to preprocessing errors a term may be missing from the tf_idf dictionary.
                if term in tf_idfs:
                    tf_idf_score += tf_idfs[term]

                if term.isdigit():
                    numerical_score += 1

            # Some errors in the preprocessing may lead to zero terms, so it is necessary to avoid division by zero.
            if len(terms):
                tf_idf_score /= len(terms)
            else:
                tf_idf_score = 0

            if ref:
                refs_3d_matrix[i, j, :] = np.append([
                    position_score, length_score, tf_idf_score,
                    numerical_score, centrality_score, title_sim_score
                ], embeddings[j])
            else:
                docs_3d_matrix[i, j, :] = np.append([
                    position_score, length_score, tf_idf_score,
                    numerical_score, centrality_score, title_sim_score
                ], embeddings[j])

    # Storing the matrix in the appropriate file, depending on whether references or source documents were processed.
    if ref:
        with open(os.getcwd() + ref_path, "wb") as dest_f:
            pickle.dump(refs_3d_matrix, dest_f)
    else:
        with open(os.getcwd() + doc_path, "wb") as dest_f:
            pickle.dump(docs_3d_matrix, dest_f)
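Judging from the index check at the top of the function, a negative index selects the DUC set while a non-negative one selects an NYT batch. A minimal usage sketch, assuming the dataset files referenced above are in place:

# Store the DUC document-sentence matrix, then the reference matrix for NYT batch 0.
store_full_sentence_matrices(-1, ref=False)
store_full_sentence_matrices(0, ref=True)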
Example #3
def get_important_words(corpus, text):
    d = tf_idf(corpus, text)[0]['stats']
    return sorted(d, key=d.get, reverse=True)[:max_words]
def main():
    fileTotalManager = base.FileTotalManager('./file_lengths.json')
    # load data into memory
    print('loading csv data into memory...')

    genome_tags = base.load_data(
        '../data/genome-tags.csv', base.tags_adapter,
        fileTotalManager.getFileTotal('genome-tags.csv'))

    movies_info = base.load_data('../data/mlmovies.csv',
                                 base.movie_info_adapter,
                                 fileTotalManager.getFileTotal('mlmovies.csv'))

    ratings_info = base.load_data(
        '../data/mlratings.csv', base.RatingInfo,
        fileTotalManager.getFileTotal('mlratings.csv'))

    tags_info = base.load_data('../data/mltags.csv', base.TagInfo,
                               fileTotalManager.getFileTotal('mltags.csv'))

    print('loading completed!')

    # print(movie_actor[0].keys(), mltags[0].keys(), tags[0].keys(), mlmovies[0].keys(), mlusers[0].keys())
    print('preprocessing data...')

    # conversion
    min_ts, max_ts = base.convert_timestamp(tags_info, 'timestamp')
    # base.convert_timestamp(ratings_info, 'timestamp')
    genome_tags = {k['tagId']: k['tag'] for k in genome_tags}
    # movie_actor_list = base.get_moive_actor_list(movie_actor)
    # genres_movie_list, min_yr, max_yr = base.get_genre_movies_list(movies_info)
    movie_names = {k['movieid']: k['moviename'] for k in movies_info}

    # actor_names = {k['id']: k['name'] for k in actor_info}

    def tfidf_tag_weight(mr, ts):
        return (1.0 / mr) * (ts - min_ts + 1) / (max_ts - min_ts + 1)

    def no_weight(mr, ts):
        return 1

    print('building vectors')
    # actor_tags_vector
    # actors_tags_vector = base.actor_tag_vector(movie_actor, tags_info, no_weight)[1]
    # actors_idf, actors_tfidf_tags_vector = base.actor_tag_vector(movie_actor, tags_info, tfidf_tag_weight)
    # actors_idf = base.idf(actors_tfidf_tags_vector, actors_idf)
    # for actor in actors_tfidf_tags_vector.keys():
    #     actors_tfidf_tags_vector[actor] = base.tf_idf(actors_tfidf_tags_vector[actor], actors_idf, 'tf-idf')

    # movie_tags_vector
    print('Building standard movie-tag vector')
    movies_tags_vector = base.movie_tag_vector(movies_info, tags_info,
                                               no_weight)[1]

    print('\nBuilding tf-idf movie-tag vector')
    movies_idf, movies_tfidf_tags_vector = base.movie_tag_vector(
        movies_info, tags_info, tfidf_tag_weight)
    movies_idf = base.idf(movies_tfidf_tags_vector, movies_idf)
    for i, movie in enumerate(movies_tfidf_tags_vector.keys()):
        movies_tfidf_tags_vector[movie] = base.tf_idf(
            movies_tfidf_tags_vector[movie], movies_idf, 'tf-idf')

    # movie_actors_vector
    # movies_actors_vector = base.movie_actor_vector(movies_info, movie_actor, no_weight)[1]
    # movies_actor_idf, movies_tfidf_actors_vector = base.movie_actor_vector(movies_info, movie_actor, tfidf_actor_weight)
    # movies_actor_idf = base.idf(movies_tfidf_actors_vector, movies_actor_idf)
    # for movie in movies_tfidf_actors_vector.keys():
    #     movies_tfidf_actors_vector[movie] = base.tf_idf(movies_tfidf_actors_vector[movie], movies_actor_idf, 'tf-idf')

    # create actor-actor matrix
    # actor_actor_similarity, actors_list, actors_index = build_actor_actor_matrix(actors_tfidf_tags_vector)

    # create coactor-coactor matrix
    # coactor_coactor_matrix, coactors_list, coactors_index = build_coactor_coactor_matrix(movie_actor)

    # print('building AMY tensor')
    # create Actor-Movie-Year tensor (AMY tensor)
    # actor_movie_year_tensor, amy_tensor_info = build_actor_movie_year_tensor(movie_actor, movies_info)

    print('\nbuilding TMR tensor')
    # create Tag-Movie-Rating tensor (TMR tensor)
    tag_movie_rating, tmr_tensor_info = build_tag_movie_rating_tensor(
        genome_tags.keys(), ratings_info)

    print('creating list')
    # create watched list
    users_watched_movies = base.get_users_watched_movies(
        tags_info, ratings_info)

    # create watched movies info
    # watched_movies_info = base.get_moives_related_info(movies_info, ratings_info, movie_actor)

    print('preprocessing completed!')

    while True:
        command_line = input('query>')
        commands = command_line.split(' ')
        relevance_feedback = None

        if len(commands) > 0 and 'p3_task1' in commands[0]:
            if len(commands) == 3:
                if commands[2] == 'pf':
                    relevance_feedback = gen_prob_feedback_function(
                        movies_tags_vector)
                else:
                    if commands[2] not in ('PCA', 'SVD'):
                        help()
                        continue
            elif len(commands) == 4:
                if commands[3] == 'pf':
                    relevance_feedback = gen_prob_feedback_function(
                        movies_tags_vector)
                else:
                    help()
                    continue

            WeightConstants.initialize(movie_names, tags_info, ratings_info)

        if commands[0] == 'p3_task1a' and len(commands) > 2:
            user_id = int(commands[1])

            similarities = recommender_system_using_svd_pca(
                user_id, users_watched_movies, movies_tfidf_tags_vector,
                genome_tags, commands[2])

            print_output_using(user_id, similarities, relevance_feedback)

        elif commands[0] == 'p3_task1b' and len(commands) > 1:
            user_id = int(commands[1])

            similarities = recommender_system_using_lda(
                user_id, users_watched_movies, movies_tags_vector, genome_tags)

            print_output_using(user_id, similarities, relevance_feedback)

        elif commands[0] == 'p3_task1c' and len(commands) > 1:
            user_id = int(commands[1])

            similarities = recommender_system_using_cp(
                user_id, users_watched_movies, movies_tags_vector,
                tag_movie_rating, tmr_tensor_info, genome_tags)

            print_output_using(user_id, similarities, relevance_feedback)

        elif commands[0] == 'p3_task1d' and len(commands) > 1:
            user_id = int(commands[1])

            similarities = recommender_system_using_ppr(
                user_id, users_watched_movies, movies_tfidf_tags_vector)

            print_output_using(user_id, similarities, relevance_feedback)

        elif commands[0] == 'p3_task1e' and len(commands) > 1:
            user_id = int(commands[1])

            similarities = recommender_system_combining_all(
                user_id, users_watched_movies, movies_tfidf_tags_vector,
                movies_tags_vector, tag_movie_rating, tmr_tensor_info,
                genome_tags)

            print_output_using(user_id, similarities, relevance_feedback)

        elif commands[0] == 'p3_task3' and len(commands) == 3:
            lsh_indexing(genome_tags, movie_names, movies_tags_vector,
                         int(commands[1]), int(commands[2]))
        elif commands[0] == 'p3_task5' and len(commands) > 1:
            labelled_movies = {}
            n = int(input("Enter number of labels: "))
            while n > 0:
                label = input("Enter label: ")
                movie_data = input("Enter space separated movies for label " +
                                   label + ": ")
                labelled_movies[label] = [int(m) for m in movie_data.split()]
                n -= 1

            if commands[1] == 'NN' and len(commands) > 2:
                recommender_system_for_labeling_movies(
                    movies_info, labelled_movies, genome_tags,
                    movies_tfidf_tags_vector, commands[1], int(commands[2]))
            elif commands[1] == 'SVM' or commands[1] == 'DT':
                recommender_system_for_labeling_movies(
                    movies_info, labelled_movies, genome_tags,
                    movies_tfidf_tags_vector, commands[1], 0)
        elif len(commands) > 1 and (commands[0] == 'reset'
                                    and commands[1] == 'wc'):
            WeightConstants.reset()
            print("WeightConstants data has been purged")
        else:
            help()
Example #5
def extract_pas(sentences):
    """
    Extracts the PASs from a list of sentences.

    :param sentences: sentences from which to extract PAS.
    """
    # Compute the TFIDF vector of all terms in the document.
    tf_idfs = tf_idf(sentences, os.getcwd() + "/data/idfs.dat")

    # Longest sentence length needed afterwards for the length score.
    longest_sent_len = max(len(sent) for sent in sentences)

    pas_list = []
    for sent in sentences:
        # Ignoring short sentences (errors).
        if 3 < len(remove_punct(sent)) and len(sent) < 1000:
            sent_index = sentences.index(sent)

            # Replacing single quotes around a phrase with double quotes to avoid errors with SRL.
            sent = re.sub(r"'([a-zA-Z0-9])([a-zA-Z0-9 ]+)([a-zA-Z0-9])'",
                          r'" \1\2\3 "', sent)

            annotations = _annotator.get_annoations(remove_punct(sent).split())
            # Getting SRL annotations from SENNA.
            sent_srl = annotations['srl']
            # Getting POS tags from SENNA.
            parts_of_speech = annotations['pos']

            for raw_pas in sent_srl:
                accept_pas = True
                out_of_order = 0
                chk_sent = remove_punct(sent)
                # Rejecting PASs whose arguments change order w.r.t. the original sentence;
                # these represent about 10% of all PASs, and about 80% of them are incorrect.
                for arg in raw_pas.values():
                    # Replacing double spaces with a single space so that no argument is ignored.
                    arg = remove_punct(arg.replace("  ", " "))

                    if chk_sent.find(arg) < 0:
                        accept_pas = False
                        out_of_order = 1
                        break

                if accept_pas:
                    pas = Pas(sent, parts_of_speech, sent_index,
                              sent_srl.index(raw_pas), raw_pas, out_of_order)
                    pas_list.append(pas)

    # Completing each PAS with its realization embeddings and vector representation.
    # This step is done after initialization because all the other PASs are needed.
    realized_pass = [realize_pas(pas) for pas in pas_list]

    # The title (the first sentence) is appended to the realized PASs to avoid running a second embedding pass.
    realized_pass.append(sentences[0])
    pas_embeddings = sentence_embeddings(realized_pass)

    # Get the centrality scores for the pas embeddings
    pas_centralities = centrality_scores(pas_embeddings)

    for pas_index, pas in enumerate(pas_list):
        pas.complete_pas(
            realized_pass[pas_index], pas_embeddings[pas_index],
            len(sentences), longest_sent_len, tf_idfs,
            pas_centralities[pas_index],
            np.inner(np.array(pas_embeddings[pas_index]),
                     np.array(pas_embeddings[-1])))

    return pas_list
Example #6
#!python3

import text as text_parser
import utils
import xlsxwriter
import re

workbook = xlsxwriter.Workbook('tf-idf.xlsx')
print('Reading texts...')
all_texts = text_parser.get_text_corpus(9999, 'texts/news')
print('Done! Computing TF-IDF ranks...')
all_ranks = utils.tf_idf(all_texts)
print('\nDone! Writing results...')
text_no = 0
for dictionary in all_ranks:
    text_no += 1
    print('Writing worksheet ' + str(text_no) + '/' + str(len(all_ranks)) +
          ' (' + dictionary['title'][:10] + '...)',
          end='\r')
    sheet_name = str(text_no) + ' ' + re.sub(r'[\[\]:*?/\\]', '',
                                             dictionary['title'][:-1])
    worksheet = workbook.add_worksheet(sheet_name[:28] +
                                       ('...' if len(sheet_name) > 28 else ''))
    worksheet.write(0, 0, dictionary['title'])
    worksheet.write(1, 0, '#')
    worksheet.write(1, 1, 'Rank')
    worksheet.write(1, 2, 'Word')
    row = 2
    sorted_dict = sorted(dictionary['stats'].items(),
                         key=lambda x: (x[1], x[0]),
                         reverse=True)
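The snippet stops right after sorting; a hypothetical continuation (an assumption, not the original script) would write one row per (word, rank) pair under the header set up above and then close the workbook:

    # Hypothetical continuation: one row per word, highest rank first.
    for word, rank in sorted_dict:
        worksheet.write(row, 0, row - 1)  # running index
        worksheet.write(row, 1, rank)
        worksheet.write(row, 2, word)
        row += 1
workbook.close()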
Example #7

if __name__ == "__main__":
    path = 'data/Data.csv'
    tokenized_docs, ids = read_corpus(file_path=path,
                                      has_tag=False,
                                      has_id=True)
    vocab_docs = list(Counter(chain(*tokenized_docs)).keys())

    if not os.path.exists('word_vectors.npy'):
        word2vec_model = KeyedVectors.load_word2vec_format(
            './GoogleNews-vectors-negative300.bin', binary=True)
        save_word2vec_vocab(vocab_docs=vocab_docs,
                            word2vec_model=word2vec_model)
    # Loading in both cases so that vocab, word2id and vectors are never left as None.
    vocab, word2id, vectors = load_word2vec_vocab()

    tf_idf_matrix = tf_idf(tokenized_docs, vocab, word2id)
    doc_vecs = get_document_vectors_word2vec(tokenized_docs, tf_idf_matrix,
                                             word2id, vectors)
    #model = kmeans(X_train=doc_vecs[:4000], n_clusters=4)
    #doc_vecs = get_document_vectors_tfidf()
    #model = kmeans(X_train=doc_vecs[:4000], n_clusters=4)
    #predictions = model.predict(doc_vecs[4000:])

    #model = gaussian_mixture_model(doc_vecs[:4000], n_components=4)
    #print(model.predict(doc_vecs[4000:]))

    print(hierarchical_clustering(X=doc_vecs[:4000], n_clusters=10))
Example #8
import utils  # project-local helpers: get_normalized_words, tf_idf

rank_threshold = 0.09  # filter less relevant words, ranked in relative range 0..1
commons = set()
line = []
text = []
ranks = {}  # word -> tf-idf rank
max_rank = 0

with open('tf_df_output.txt') as f:
    content = f.readlines()
with open('files_count.txt') as f:
    documents = int(f.read())
with open('file_name.txt') as f:
    with open(f.read().strip()) as f2:
        text = utils.get_normalized_words(f2.read())
tf_df_word = [x.strip().split() for x in content]

for [tf, df, word] in tf_df_word:
    ranks[word] = utils.tf_idf(int(tf), int(df), documents)
    if ranks[word] > max_rank:
        max_rank = ranks[word]

for word in text:
    rank = ranks[word] / max_rank
    line.append(rank)
    if rank < rank_threshold:
        commons.add(word)

all_words = set(text) - commons


def add_to_hvg(w1, w2):
    if w1 > w2:
        temp = w1
Example #9
    def vectorize(self):
        docs = [self.tokenizer(t.encode('utf-8')) for t in self.texts]
        return utils.tf_idf(docs, normalize=True)
Example #10
    exit(1)

print('Reading all texts from /texts/news...')
corpus = get_text_corpus(root_dir=os.path.normpath('texts/news'))

main_text = None

for text in corpus:
    if text['filename'] == file_path:
        main_text = text

if not main_text:
    print('A mystery duck found')
    exit(1)

word_ranking = tf_idf(corpus, main_text)[0]
workbook = xlsxwriter.Workbook('article-rank.xlsx')
worksheet = workbook.add_worksheet(
    re.sub(r'[\[\]:*?/\\]', '', main_text['title'][0:28]))
worksheet.write(0, 0, main_text['title'])
worksheet.write(1, 0, '#')
worksheet.write(1, 2, 'Word')
worksheet.write(1, 1, 'Rank')
row = 2
for word in main_text['text']:
    worksheet.write(row, 0, row - 1)
    worksheet.write(row, 1, word)
    worksheet.write(row, 2,
                    word_ranking['stats'][word] / word_ranking['max_rank'])
    row += 1
workbook.close()
Example #11
textds = TextDataSet(data_path='data/')
X_train, X_valid, X_test, y_train, y_valid, y_test = textds.generate_text_dataset(
    train_size=500, valid_size=200, test_size=500)

print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(X_test.shape, y_test.shape)

# print(textds.embedding_from_text(['I love coffee']).shape)

# %%
knn_ds = {
    'hamming': ((X_train > 0).astype('float'), (X_valid > 0).astype('float')),
    'euclidean': (X_train, X_valid),
    'cosine': (tf_idf(X_train, alpha=1e-6,
                      beta=1e-9), tf_idf(X_valid, alpha=1e-6, beta=1e-9)),
}

best_acc = 0
best_metric = None
best_k = 0
for metric, (X_train_, X_valid_) in knn_ds.items():
    for k in [1, 3, 5]:
        clf = TextClassifier(Knn(n_neighbors=k, metric=metric))
        clf.fit(X_train_, y_train)
        acc = clf.score(X_valid_, y_valid)
        print(metric, k, round(acc * 100, 2), sep=', ')

        if acc > best_acc:
            best_acc = acc
            best_metric = metric
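            # Hypothetical continuation (the snippet is cut off here): also track
            # the winning k so the selected configuration can be reported.
            best_k = k

print('best validation accuracy:', round(best_acc * 100, 2),
      'with', best_metric, 'metric and k =', best_k)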