Example #1
def create_full_corpus(n_topics=num_topics):

    lda_bow = LdaModel.load(os.path.join(models_path, 'lda_bow_multi'))
    print('Loaded model')
    docs = read_ap.get_processed_docs()
    docs = [d for i, d in docs.items()]

    with open('./objects/dictionary_lda', 'rb') as f:
        dictionary = pkl.load(f)

    # creating bow
    print('creating bow corpus')
    corpus_bow = [dictionary.doc2bow(d) for d in docs]
    # creating binary bow
    print('creating binary bow')
    corpus_binary = [[(i, 1) for i, _ in d] for d in corpus_bow]

    corpus_full = [
        sparse2full(t_doc, n_topics) for t_doc in lda_bow[corpus_binary]
    ]

    with open('./objects/lda_bow_full', 'wb') as f:
        pkl.dump(corpus_full, f)

    return corpus_full
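As a toy illustration of the sparse2full call above (assuming only that gensim is installed): the sparse (topic_id, weight) pairs produced by lda_bow[...] become fixed-length dense vectors, which is what the pickled corpus_full contains.

from gensim.matutils import sparse2full

topic_vec = [(0, 0.7), (3, 0.2)]          # sparse LDA output for one document
dense = sparse2full(topic_vec, length=5)  # -> array([0.7, 0., 0., 0.2, 0.])
print(dense)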
Example #2
def train(n_topics=num_topics):

    docs = read_ap.get_processed_docs()
    docs = [d for i, d in docs.items()]

    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)
    # save the dictionary
    with open(os.path.join(folder_path_objects,
                           'dictionary_lsi_bow'), 'wb') as f:
        pickle.dump(dictionary, f)

    # create binary and regular bow corpus
    corpus_bow = [dictionary.doc2bow(d) for d in docs]
    corpus_binary = [[(i, 1) for i, _ in d] for d in corpus_bow]

    # save corpuses
    with open(os.path.join(folder_path_objects,
                           'corpus_binary'), 'wb') as f:
        pickle.dump(corpus_binary, f)

    # create models
    print(f'{time.ctime()} Start training LSA (binary bow)')
    lsi_bin = LsiModel(
        corpus=corpus_binary,
        id2word=dictionary,
        chunksize=1000,
        num_topics=n_topics
    )

    # save models to disk
    os.makedirs(folder_path_models, exist_ok=True)

    lsi_bin.save('./models/lsi_bin_filtered')
Example #3
def data_loader():
    '''
    Loads the documents (by id in a dict and concatenated in a list) and the
    word2id/id2word dicts.
    '''
    # Load documents
    if not os.path.exists("./pickles/processed_docs.pkl"):
        docs_by_id = ra.get_processed_docs()
    else:
        with open("./pickles/processed_docs.pkl", "rb") as reader:
            docs_by_id = pkl.load(reader)

    # Load word2id and id2word documents
    if not os.path.exists("./pickles/word2id.pkl"):
        print("constructing word2id and id2word dicts")
        word2id, id2word = data_utils.counter_to_dicts(docs_by_id)
    else:
        with open("./pickles/word2id.pkl", "rb") as reader:
            word2id = pkl.load(reader)
        with open("./pickles/id2word.pkl", "rb") as reader:
            id2word = pkl.load(reader)

    # Load word2vec corpus
    if not os.path.exists("./pickles/word2vec_corpus.pkl"):
        print("creating train_corpus")
        word2vec_corpus = data_utils.create_word2vec_corpus(
            docs_by_id, word2id)
    else:
        with open("./pickles/word2vec_corpus.pkl", "rb") as reader:
            word2vec_corpus = pkl.load(reader)

    return (word2id, id2word, word2vec_corpus, docs_by_id)
Example #4
def get_ranking(n_topics=num_topics):
    ''' get ranking for all queries '''

    # load queries
    qrels, queries = read_ap.read_qrels()

    # load model
    lda_bow = LdaModel.load(os.path.join(models_path, 'lda_bow_multi'))

    # load corpus of full vectors
    with open('./objects/lda_bow_full', 'rb') as f:
        corpus_full = pkl.load(f)

    # load dictionary
    with open('./objects/dictionary_lda', 'rb') as f:
        dictionary = pkl.load(f)

    # process docs
    processed_docs = read_ap.get_processed_docs()
    doc_keys = processed_docs.keys()
    idx2key = {i: key for i, key in enumerate(doc_keys)}

    overall_ser = {}

    # loop over queries
    for qid in tqdm(qrels):
        query_text = queries[qid]
        sims = get_sims(lda_bow, query_text, corpus_full, dictionary, n_topics)

        overall_ser[qid] = dict([(idx2key[idx], np.float64(score))
                                 for idx, score in sims])

    with open('./objects/overal_ser_lda', 'wb') as f:
        pkl.dump(overall_ser, f)
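The get_sims helper is not part of this snippet; a hypothetical sketch consistent with how it is called above (query tokenisation via read_ap.process_text and cosine scoring against the dense topic vectors are assumptions) might look like this:

import numpy as np
import read_ap
from gensim.matutils import sparse2full

def get_sims(model, query_text, corpus_full, dictionary, n_topics):
    # infer the topic distribution of the query and densify it
    query_bow = dictionary.doc2bow(read_ap.process_text(query_text))
    query_vec = sparse2full(model[query_bow], n_topics)

    # cosine similarity of the query against every dense document vector
    doc_matrix = np.asarray(corpus_full)                   # (n_docs, n_topics)
    norms = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
    scores = doc_matrix @ query_vec / np.where(norms == 0, 1.0, norms)

    # (doc_index, score) pairs, best matches first
    return sorted(enumerate(scores), key=lambda pair: -pair[1])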
Example #5
def get_model(idx):
    # TF-IDF MODEL
    if idx == 1:
        docs_by_id = read_ap.get_processed_docs()
        model = TfIdfRetrieval(docs_by_id)
        return model

    # LSI BINARY MODEL
    elif idx == 2:
        return lsi_lda.LSIRetrieval('binary')

    # LSI TF-IDF MODEL
    elif idx == 3:
        return lsi_lda.LSIRetrieval('tfidf')

    # LDA MODEL
    elif idx == 4:
        return lsi_lda.LDARetrieval()

    # WORD2VEC MODEL
    elif idx == 5:
        return analysis.Word2Vec()

    # LSI BINARY 5 TOPICS
    elif idx == 12:
        return lsi_lda.LSIRetrieval('binary', path="lsi/5topics", num_topics=5)
Example #6
    def get_corpus(self):
        docs_by_id = read_ap.get_processed_docs()
        docs = [doc for doc_id, doc in docs_by_id.items()]
        doc_bows = [self.dictionary.doc2bow(doc) for doc in docs]
        corpus = [[(idx, 1) for idx, _ in bow] for bow in doc_bows]

        return corpus
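A toy illustration (assuming gensim) of what the binary conversion above does: doc2bow yields (token_id, count) pairs, and the comprehension clamps every count to 1.

from gensim.corpora import Dictionary

toy_docs = [["topic", "model", "topic"], ["model", "query"]]
toy_dict = Dictionary(toy_docs)
bow = [toy_dict.doc2bow(d) for d in toy_docs]    # e.g. [[(0, 1), (1, 2)], [(0, 1), (2, 1)]]
binary = [[(i, 1) for i, _ in d] for d in bow]   # every count replaced by 1
print(bow, binary)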
Example #7
def get_data():
    print("Loading data ...")

    # load preprocessed data 
    download_ap.download_dataset()
    docs_by_id = read_ap.get_processed_docs()
    
    return docs_by_id
Example #8
    def __init__(self, path='lda/', num_topics=500):
        self.path = path
        self.dictionary = self.get_dictionary()
        self.model = self.get_model(num_topics)
        self.index = self.get_index()
        self.doc_index_map = {
            i: doc_id
            for i, (doc_id, _) in enumerate(read_ap.get_processed_docs().items())
        }
Example #9
    def __init__(self, model_type, path='lsi/', num_topics=500):
        assert model_type in ("binary", "tfidf"), \
            "accepted model_type: 'binary' or 'tfidf'"
        self.path = path
        self.model_type = model_type
        self.dictionary = self.get_dictionary()
        self.model = self.get_model(num_topics)
        self.index = self.get_index()
        self.doc_index_map = {
            i: doc_id
            for i, (doc_id, _) in enumerate(read_ap.get_processed_docs().items())
        }
Example #10
    def get_dictionary(self):
        tmp_fname = self.path + "lda.dictionary"

        if os.path.exists(tmp_fname):
            return Dictionary.load_from_text(tmp_fname)

        else:
            print("Creating dictionary.")
            docs_by_id = read_ap.get_processed_docs()
            docs = [doc for doc_id, doc in docs_by_id.items()]
            dictionary = Dictionary(docs)
            dictionary.save_as_text(tmp_fname)
            return dictionary
Example #11
    def get_corpus(self):
        docs_by_id = read_ap.get_processed_docs()
        docs = [doc for doc_id, doc in docs_by_id.items()]
        doc_bows = [self.dictionary.doc2bow(doc) for doc in docs]

        if self.model_type == "binary":
            corpus = [[(idx, 1) for idx, _ in bow] for bow in doc_bows]

        elif self.model_type == "tfidf":
            df = self.dictionary.dfs
            corpus = [[(idx, (np.log(1 + tf) / df[idx])) for idx, tf in bow]
                      for bow in doc_bows]

        return corpus
Example #12
    def get_dictionary(self):
        tmp_fname = self.path + self.model_type + "_dictionary"

        if os.path.exists(tmp_fname):
            return Dictionary.load_from_text(tmp_fname)

        else:
            print("Creating dictionary.")
            docs_by_id = read_ap.get_processed_docs()
            docs = [doc for doc_id, doc in docs_by_id.items()]
            dictionary = Dictionary(docs)
            dictionary.filter_extremes(no_below=20, no_above=0.5)
            dictionary.save_as_text(tmp_fname)
            return dictionary
Example #13
    def __init__(self, window_size, vocab_size):

        # ensure dataset is downloaded
        download_ap.download_dataset()
        # pre-process the text
        docs_by_id = read_ap.get_processed_docs()

        self.word2id = dict()
        self.id2word = dict()

        self.window_size = window_size
        self.vocab_size = vocab_size

        self.docs_by_id = docs_by_id
        self.read_words(vocab_size)
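The read_words step is not shown in this snippet; a hypothetical sketch of what it presumably does (building word2id/id2word from the vocab_size most frequent tokens) could be:

from collections import Counter

def read_words(self, vocab_size):
    # count token frequencies over all processed documents
    counts = Counter(tok for doc in self.docs_by_id.values() for tok in doc)
    # keep the vocab_size most frequent tokens and assign them integer ids
    for idx, (tok, _) in enumerate(counts.most_common(vocab_size)):
        self.word2id[tok] = idx
        self.id2word[idx] = tok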
Example #14
def main():
    docs_by_id = ra.get_processed_docs()
    path = "./doc2vec_models/{}".format(config.model_name)
    # print(path)
    if not os.path.exists(path):
        print("Model not yet trained, starting training now.")
        train_corpus = create_corpus(docs_by_id)
        model = train_doc2vec(train_corpus)
    else:
        print("Model already trained, loading the file.")
        model = gensim.models.doc2vec.Doc2Vec.load(path)

    qrels, queries = ra.read_qrels()
    print(queries)

    overall_ser = {}
    trec_path = "./results/trec_doc2vec.csv"

    # Write TREC results column headers to file
    with open(trec_path, "w") as f:
        f.write("query-id, Q0, document-id, rank, score, STANDARD\n")

    print("Evaluating doc2vec model:", config.model_name)

    # Loop over all queries and predict most relevant docs
    for qid in tqdm(qrels):
        query_text = queries[qid]
        results, trec_results = rank_docs(model, query_text, qid, config.model_name)
        results = dict(results)
        overall_ser[qid] = results
        # Write all test queries to TREC format file
        if int(qid) not in range(76, 100):
            with open(trec_path, "a+") as f:
                f.write("\n".join(",".join(str(v) for v in x)
                                  for x in trec_results))
                f.write("\n")

    # run evaluation with `qrels` as the ground truth relevance judgements
    # here, we are measuring MAP and NDCG
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)

    # dump this to JSON
    # *Not* Optional - This is submitted in the assignment!
    json_path = "./results/{}.json".format(config.model_name)
    with open(json_path, "w") as writer:
        json.dump(metrics, writer, indent=1)
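rank_docs is defined elsewhere; a hypothetical sketch consistent with its use above (the gensim 3.x docvecs API and the top_n cutoff are assumptions) is:

import read_ap

def rank_docs(model, query_text, qid, run_name, top_n=1000):
    # infer an embedding for the query and rank documents by Doc2Vec similarity
    query_vec = model.infer_vector(read_ap.process_text(query_text))
    ranked = model.docvecs.most_similar([query_vec], topn=top_n)
    results = [(doc_id, float(score)) for doc_id, score in ranked]
    # rows in TREC run format: query-id, Q0, document-id, rank, score, run tag
    trec_results = [(qid, "Q0", doc_id, rank, score, run_name)
                    for rank, (doc_id, score) in enumerate(results, start=1)]
    return results, trec_results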
Example #15
def individual_query(query_text):

    docs = read_ap.get_processed_docs()
    doc_keys = docs.keys()
    idx2key = {i: key for i, key in enumerate(doc_keys)}

    # load model
    lda_bow = LdaModel.load(os.path.join(models_path, 'lda_bow_multi'))

    # load corpus of full vectors
    with open('./objects/lda_bow_full', 'rb') as f:
        corpus_full = pkl.load(f)

    # load dictionary
    with open('./objects/dictionary_lda', 'rb') as f:
        dictionary = pkl.load(f)

    sims = get_sims(lda_bow, query_text, corpus_full, dictionary, num_topics)
    ranking = dict([(idx2key[idx], np.float64(score)) for idx, score in sims])
    return ranking
Example #16
def preprocess(path=PROCESSED_DOCS_PATH):
    # Load the preprocessed docs_by_id file if it exists.
    if os.path.exists(path):
        print("Loading the preprocessed files...")
        with open(path, "rb") as reader:
            return pickle.load(reader)

    # (Down)load the dataset from the ap files and get it in the right form.
    download_ap.download_dataset()
    docs_by_id = read_ap.get_processed_docs()
    print("Filtering infrequent words...")
    docs_by_id = filter_infrequent(docs_by_id)
    print("Converting words to indices...")
    tok2idx, id2corpus = all_words_to_indices(docs_by_id)

    # Store the preprocessing results for faster future retrieval.
    print("Storing the preprocessed files...")
    with open(path, "wb") as writer:
        pickle.dump((tok2idx, id2corpus), writer)
    return tok2idx, id2corpus
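filter_infrequent and all_words_to_indices are helpers from the same module and are not shown here; hypothetical sketches (the min_count threshold is an assumption) might look like:

from collections import Counter

def filter_infrequent(docs_by_id, min_count=5):
    # drop tokens that occur fewer than min_count times in the whole collection
    counts = Counter(tok for doc in docs_by_id.values() for tok in doc)
    return {doc_id: [tok for tok in doc if counts[tok] >= min_count]
            for doc_id, doc in docs_by_id.items()}

def all_words_to_indices(docs_by_id):
    # map every remaining token to an integer id and re-encode each document
    tok2idx = {}
    id2corpus = {doc_id: [tok2idx.setdefault(tok, len(tok2idx)) for tok in doc]
                 for doc_id, doc in docs_by_id.items()}
    return tok2idx, id2corpus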
Example #17
def train(config):
    print(f"Training vec dim: {config.vector_dim},  window size: {config.window_size}, Vocab size: {config.vocab_size}")
    if not os.path.exists(config.model_file) or config.t:
        print("\n###    Reading in the documents    ###\n")
        docs_by_id = read_ap.get_processed_docs()

        print("\n### Converting to gensim standards ###\n")
        train_docs = list(AP2Gensim(docs_by_id))

        model = gensim.models.doc2vec.Doc2Vec(
            vector_size=config.vector_dim, window=config.window_size,
            min_count=config.min_count, dm=0,
            max_vocab_size=config.vocab_size, epochs=config.epochs)

        print("\n###         Building vocab         ###\n")
        model.build_vocab(train_docs)

        print("\n###         Training model         ###\n")
        model.train(train_docs, total_examples=model.corpus_count, epochs=model.epochs)
        
        print("\n###          Saving model          ###\n")
        model.save(config.model_file)
    else:
        print("A model already exists so skipping training")
Example #18
def search_doc2vec(model, query, docs_by_id=None,
                   result_len=MAX_NUMBER_OF_RESULTS):
    if docs_by_id is None:
        docs_by_id = read_ap.get_processed_docs()

    # Deleting temporary training data is advised by the official gensim docs.
    model.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)

    print("Comparing the query embedding with all document embeddings...")
    # Get cosine similarity for the query compared to the documents.
    q_vec = model.infer_vector([q_tok for q_tok in read_ap.process_text(query)])
    q_vec = torch.FloatTensor(q_vec).unsqueeze(dim=0)
    cos = torch.nn.CosineSimilarity()
    results = {}
    for doc_id, doc in docs_by_id.items():
        vec = torch.FloatTensor(model.infer_vector(doc)).unsqueeze(dim=0)
        results[doc_id] = float(cos(vec, q_vec))

    # Rank the results in a list, highest similarity first.
    results = list(results.items())
    results.sort(key=lambda pair: pair[1], reverse=True)
    return results[:result_len]
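A minimal usage sketch for the function above; the model path is purely illustrative and assumes a Doc2Vec model was saved earlier (for example by the training code in Example #17):

import gensim

model = gensim.models.doc2vec.Doc2Vec.load("./models/doc2vec.model")  # hypothetical path
top_docs = search_doc2vec(model, "oil price increase")
for doc_id, score in top_docs[:10]:
    print(doc_id, round(score, 3))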
Example #19
def main():
    docs_by_id = get_processed_docs()
    doc_ids = list(docs_by_id)
    documents = []
    for key in docs_by_id.keys():
        doc = docs_by_id[key]
        documents.append(doc)

    # construct dictionary and corpus
    dictionary, bow_corpus, tfidf_corpus = create_corpus_and_dict(documents)

    model_path = "./LDA_MODELS/BOW_LDA_{}_TOPICS.model".format(config.topics)
    if not os.path.exists(model_path):
        print("Starting LDA with topics = {} with BOW training now.".format(
            config.topics))
        BOW_LDA = train_LDA(bow_corpus, dictionary, config.topics)
        BOW_LDA.save(model_path)

    else:
        print("LDA with BOW already trained, loading the file.")
        BOW_LDA = LdaModel.load(model_path)
Example #20
def train(n_topics=num_topics):

    docs = read_ap.get_processed_docs()
    docs = [d for i, d in docs.items()]

    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)

    # save the dictionary
    with open(os.path.join(folder_path_objects, 'dictionary_lsi_bow'),
              'wb') as f:
        pickle.dump(dictionary, f)

    # create binary and regular bow corpus
    corpus_bow = [dictionary.doc2bow(d) for d in docs]

    # create tf-idf corpus
    tfidf = TfidfModel(corpus_bow)
    corpus_tfidf = tfidf[corpus_bow]

    with open(os.path.join(folder_path_objects, 'corpus_lsi_tfidf'),
              'wb') as f:
        pickle.dump(corpus_tfidf, f)

    # create models
    print(f'{time.ctime()} Start training LSI (tf-idf)')
    lsi_tfidf = LsiModel(corpus=corpus_tfidf,
                         id2word=dictionary,
                         num_topics=n_topics)

    # save models to disk
    os.makedirs(folder_path_models, exist_ok=True)

    def filepath_out(model):
        return os.path.join('models', f'{model}_{n_topics}')

    lsi_tfidf.save(filepath_out('lsi_tfidf'))
Example #21
def train(n_topics=num_topics):
    '''Train LDA model'''

    docs = read_ap.get_processed_docs()

    docs = [d for i, d in docs.items()]

    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)

    # save the dictionary
    with open('./objects/dictionary_lda', 'wb') as f:
        pkl.dump(dictionary, f)

    # creating bow
    print('creating bow corpus')
    corpus_bow = [dictionary.doc2bow(d) for d in docs]
    # creating binary bow
    print('creating binary bow')
    corpus_binary = [[(i, 1) for i, _ in d] for d in corpus_bow]

    # with open(os.path.join(objects_path, 'corpus'), 'wb') as f:
    #     pickle.dump(corpus_tfidf, f)

    print(f'{time.ctime()} Start training LDA (BOW)')
    lda_bow = LdaMulticore(workers=5,
                           corpus=corpus_binary,
                           id2word=dictionary,
                           chunksize=1000,
                           num_topics=n_topics,
                           dtype=np.float64)

    # save models to disk
    os.makedirs(models_path, exist_ok=True)

    lda_bow.save(os.path.join(models_path, 'lda_bow_multi'))
Example #22
            if query_term not in self.ii:
                continue
            for (doc_id, tf) in self.ii[query_term]:
                results[doc_id] += np.log(1 + tf) / self.df[query_term]

        results = list(results.items())
        results.sort(key=lambda _: -_[1])
        return results


if __name__ == "__main__":

    # ensure dataset is downloaded
    download_ap.download_dataset()
    # pre-process the text
    docs_by_id = read_ap.get_processed_docs()

    # Create instance for retrieval
    tfidf_search = TfIdfRetrieval(docs_by_id)
    # read in the qrels
    qrels, queries = read_ap.read_qrels()

    overall_ser = {}

    print("Running TFIDF Benchmark")
    # collect results
    for qid in tqdm(qrels): 
        query_text = queries[qid]

        results = tfidf_search.search(query_text)
        overall_ser[qid] = dict(results)
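The snippet ends before the evaluation step; a sketch of what typically follows, mirroring the pytrec_eval usage in Example #14 (the output path is illustrative):

import json
import pytrec_eval

evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
metrics = evaluator.evaluate(overall_ser)

with open("./results/tf_idf.json", "w") as writer:  # illustrative path
    json.dump(metrics, writer, indent=1)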
Example #23
def main():
    # id2word, word2id, = load_data()
    docs_by_id = get_processed_docs()
    doc_ids = list(docs_by_id)
    documents = []
    for key in docs_by_id.keys():
        doc = docs_by_id[key]
        documents.append(doc)

    #construct dictionary and corpus
    dictionary, bow_corpus, tfidf_corpus = create_corpus_and_dict(documents)

    # train lsi, generate index
    #TRAINING LOOP
    topic_args = [10, 50, 100, 500, 1000, 2000, 5000, 10000]
    for topic_num in topic_args:
        print(('starting training tfidf lsi and index with topic_num {}'
               ).format(topic_num))
        lsi, index = train_lsi(tfidf_corpus,
                               dictionary,
                               num_topics=topic_num,
                               corpus_type='tfidf')
        print(('finished training tfidf lsi and index with topic_num {}'
               ).format(topic_num))
        print(('starting training bow lsi and index with topic_num {}'
               ).format(topic_num))
        lsi, index = train_lsi(bow_corpus,
                               dictionary,
                               num_topics=topic_num,
                               corpus_type='bow')
        print(('finished training bow lsi and index with topic_num {}'
               ).format(topic_num))

    #METRICS LOOP
    # COMPUTE METRICS
    topic_args = [10, 50, 100, 500, 1000, 2000]
    for topic_num in topic_args:
        # retrieve model and index for TFIDF, compute and store metrics
        lsi, index = train_lsi(tfidf_corpus,
                               dictionary,
                               num_topics=topic_num,
                               corpus_type='tfidf')
        compute_metrics(dictionary=dictionary,
                        model=lsi,
                        index=index,
                        corpus_type='tfidf',
                        num_topics=topic_num,
                        doc_ids=doc_ids)

        # retrieve model and index for BOW, compute and store metrics
        lsi, index = train_lsi(bow_corpus,
                               dictionary,
                               num_topics=topic_num,
                               corpus_type='bow')
        compute_metrics(dictionary=dictionary,
                        model=lsi,
                        index=index,
                        corpus_type='bow',
                        num_topics=topic_num,
                        doc_ids=doc_ids)

    #TOPICS
    tfidf_lsi, index = train_lsi(tfidf_corpus,
                                 dictionary,
                                 num_topics=500,
                                 corpus_type='tfidf')
    print("top 5 TFIDF topics")
    pprint(tfidf_lsi.print_topics(num_topics=50))
    bow_lsi, index = train_lsi(bow_corpus,
                               dictionary,
                               num_topics=500,
                               corpus_type='bow')
    print("top 5 BOW topics")
    pprint(bow_lsi.print_topics(num_topics=50))
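train_lsi is defined elsewhere; a hypothetical sketch consistent with the calls above (the on-disk caching the original presumably performs is omitted here) is:

from gensim import similarities
from gensim.models import LsiModel

def train_lsi(corpus, dictionary, num_topics, corpus_type):
    # train an LSI model and build a dense similarity index over the corpus
    lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    index = similarities.MatrixSimilarity(lsi[corpus], num_features=num_topics)
    return lsi, index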
Example #24
import read_ap
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel

model = LsiModel(common_corpus, id2word=common_dictionary)
vectorized_corpus = model[common_corpus]

docs = read_ap.get_processed_docs()

print(len(docs))

"""
tf-idf > w2v > lsa  >> d2v
"""
Example #25
def get_docs_by_id():
    return read_ap.get_processed_docs()
Example #26
    metrics = evaluator.evaluate(overall_ser)

    json_filename = f"./json_files/benchmark_{model_name}.json"

    # dump to JSON
    with open(json_filename, "w") as writer:
        json.dump(metrics, writer, indent=1)

    return json_filename


if __name__ == "__main__":
    np.random.seed(42)

    # retrieve docs as a list
    processed_docs = get_processed_docs()
    docs = processed_docs.values()
    doc_keys = processed_docs.keys()
    idx2key = {i: key for i, key in enumerate(doc_keys)}

    # convert to TaggedDocuments so that gensim can work with them
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
    print(f"Docs are loaded. {len(docs)} in total\n")

    # train the model
    model, model_name = training(documents,
                                 max_vocab_size=3000000,
                                 vector_dim=300,
                                 window_size=2,
                                 verbose=True)
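The training helper is not part of this snippet; a hypothetical sketch matching its signature (the min_count and epochs values are assumptions) is:

import gensim

def training(documents, max_vocab_size, vector_dim, window_size, verbose=False):
    model_name = "doc2vec_{}d_window{}".format(vector_dim, window_size)
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_dim,
                                          window=window_size,
                                          max_vocab_size=max_vocab_size,
                                          min_count=2,   # assumption
                                          epochs=10)     # assumption
    model.build_vocab(documents)
    if verbose:
        print("Training {} on {} documents".format(model_name, len(documents)))
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model, model_name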
Example #27
def get_doc_keys():
    docs = read_ap.get_processed_docs()
    return list(docs.keys())
Example #28
def get_doc_list():
    """
    Process documents and convert doc Dictionary to a list of lists of tokens
    """
    docs = read_ap.get_processed_docs()
    return list(map(list, docs.values()))
Example #29
    if docs_by_id is None:
        docs_by_id = read_ap.get_processed_docs()

    # Deleting temporary training data is advised by the official gensim docs.
    model.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)

    print("Comparing the query embedding with all document embeddings...")
    # Get cosine similarity for the query compared to the documents.
    q_vec = model.infer_vector([q_tok for q_tok in read_ap.process_text(query)])
    q_vec = torch.FloatTensor(q_vec).unsqueeze(dim=0)
    cos = torch.nn.CosineSimilarity()
    results = {}
    for doc_id, doc in docs_by_id.items():
        vec = torch.FloatTensor(model.infer_vector(doc)).unsqueeze(dim=0)
        results[doc_id] = float(cos(vec, q_vec))

    # Rank the results in a list, highest similarity first.
    results = list(results.items())
    results.sort(key=lambda pair: pair[1], reverse=True)
    return results[:result_len]



if __name__ == "__main__":
    skipgram = SkipGram()
    skipgram._train(list(ID2CORPUS.values()))
    print(search_SkipGram(skipgram, "How are you"))

    train_doc2vec(read_ap.get_processed_docs(), batched=True)