Example #1
import os

from pke import compute_lda_model


def CreateLatentDirichletAllocationModel(pathDataset, dataset_name, lang,
                                         normalization, pathToLDAFolder):
    print("I am going to check if the LDA model exists. "
          "If it doesn't, I will create it.")

    # path to the collection of documents
    pathToCollectionOfDocs = pathDataset + '/docsutf8'
    print(f"\nPath to the collection of docs = {pathToCollectionOfDocs}")

    # path to the serialized LDA model
    pathToLDAFile = pathToLDAFolder + dataset_name + '_lda.gz'
    print(f"Path to LDA file = {pathToLDAFile}")

    if os.path.exists(pathToLDAFile):
        print(f"Model = {pathToLDAFile} already exists")
    else:
        print("Model doesn't exist. Creating a new model from the collection "
              "of documents. This may take a while.")
        # Fall back to English if lang is not among the supported spaCy models.
        if lang not in ['en', 'es', 'pt', 'fr', 'it', 'nl', 'de']:
            lang = 'en'
        compute_lda_model(pathToCollectionOfDocs,
                          pathToLDAFile,
                          n_topics=500,
                          extension='txt',
                          language=lang,
                          normalization=normalization)
        print("Model just created")
Example #2
# -*- coding: utf-8 -*-

import logging
import sys

from pke import compute_lda_model

# log INFO messages to the terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = sys.argv[1]

# path to the output LDA model, saved as a gzipped file
output_file = sys.argv[2]

# number of topics for the LDA model
n_topics = int(sys.argv[3])

# compute the LDA model
compute_lda_model(input_dir=input_dir,
                  output_file=output_file,
                  n_topics=n_topics,
                  extension="xml",
                  language="en",
                  normalization="stemming")
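The LDA model written by this script is typically consumed by pke's TopicalPageRank. Below is a hedged usage sketch, not part of the original example: the script name and file paths are hypothetical, and the keyword arguments should be checked against the installed pke version.

import pke

# hypothetical invocation of the script above:
#   python compute_lda.py path/to/corpus/ lda-model.pickle.gz 500

# use the resulting LDA model for keyphrase extraction
extractor = pke.unsupervised.TopicalPageRank()
extractor.load_document(input='path/to/document.txt', language='en')
extractor.candidate_selection()
extractor.candidate_weighting(window=10, lda_model='lda-model.pickle.gz')
keyphrases = extractor.get_n_best(n=10)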
Example #3
# -*- coding: utf-8 -*-

import os
import logging
import sys

from pke import compute_lda_model

# log INFO messages to the terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = sys.argv[1]

# path to the output LDA model, saved as a gzipped file
output_file = sys.argv[2]

# number of topics for the LDA model
n_topics = int(sys.argv[3])

# compute the LDA model
compute_lda_model(
    input_dir=input_dir,
    output_file=output_file,
    n_topics=n_topics,
    format="corenlp",  # input files format
    extension='xml',  # input files extension
    use_lemmas=False,  # do not use Stanford lemmas
    stemmer="porter",  # use porter stemmer
    language="english")  # language for the stop_words
Example #4
# -*- coding: utf-8 -*-

import sys
from glob import glob
from xml.etree import ElementTree

from pke import compute_lda_model

# NOTE: this snippet originally started mid-function; the imports, command-line
# arguments and function header below are reconstructed context, modelled on
# the other examples.
input_dir = sys.argv[1]      # path to the collection of CoreNLP XML files
output_file = sys.argv[2]    # path to the output LDA model
n_topics = int(sys.argv[3])  # number of topics for the LDA model


def read_corenlp_xml(path):
    """Read a Stanford CoreNLP XML file into a list of sentences, each given
    as a list of (word, POS) tuples."""
    sentences = []
    tree = ElementTree.parse(path)
    for sentence in tree.iterfind('./document/sentences/sentence'):
        starts = [
            int(u.text)
            for u in sentence.iterfind('tokens/token/CharacterOffsetBegin')
        ]
        ends = [
            int(u.text)
            for u in sentence.iterfind('tokens/token/CharacterOffsetEnd')
        ]
        doc = {
            'words': [u.text for u in sentence.iterfind('tokens/token/word')],
            'lemmas':
            [u.text for u in sentence.iterfind('tokens/token/lemma')],
            'POS': [u.text for u in sentence.iterfind('tokens/token/POS')],
            'char_offsets': [(starts[k], ends[k]) for k in range(len(starts))]
        }
        sentences.append([(doc['words'][i], doc['POS'][i])
                          for i in range(len(doc['words']))])
    return sentences


documents = []
for fn in glob(input_dir + '*.xml'):
    doc = read_corenlp_xml(fn)
    documents.append(doc)

compute_lda_model(documents,
                  output_file=output_file,
                  n_topics=n_topics,
                  language='en',
                  normalization='stemming')
Example #5
    pke.compute_document_frequency(input_dir=path_to_train,
                                   output_file=path_to_df_file,
                                   extension=params["extension"],
                                   language=params["language"],
                                   normalization=params["normalization"],
                                   stoplist=punctuations,
                                   delimiter='\t',
                                   n=5)

# pre-compute LDA distributions if needed
need_lda = any(model in ['TopicalPageRank'] for model in params['models'])
if need_lda and not os.path.isfile(path_to_lda_file):
    logging.info("computing LDA distributions from {}".format(params["path"]))
    pke.compute_lda_model(input_dir=path_to_train,
                          output_file=path_to_lda_file,
                          n_topics=params["n_topics"],
                          extension=params["extension"],
                          language=params["language"],
                          normalization=params["normalization"])


# pre-compute pairwise similarities if needed
need_pairwise = any(model in ['ExpandRank'] for model in params['models'])
if need_pairwise and not os.path.isfile(path_to_pairwise_file):
    logging.info("computing pairwise similarities in {}".format(
        params["path"]))

    logging.info("loading DF counts from {}".format(path_to_df_file))
    df_counts = pke.load_document_frequency_file(input_file=path_to_df_file)

    pke.compute_pairwise_similarity_matrix(
        input_dir=path_to_test,