import os

from pke import compute_lda_model


def CreateLatentDirichletAllocationModel(pathDataset, dataset_name, lang,
                                         normalization, pathToLDAFolder):
    print("I am going to check if the LDA model exists. If it doesn't, I will create it.")

    # path to the collection of documents
    pathToCollectionOfDocs = pathDataset + '/docsutf8'
    print(f"\nPath to the collection of docs = {pathToCollectionOfDocs}")

    # path to the (possibly pre-existing) LDA model file
    pathToLDAFile = pathToLDAFolder + dataset_name + '_lda.gz'
    print(f"Path to LDA file = {pathToLDAFile}")

    if os.path.exists(pathToLDAFile):
        print(f"Model = {pathToLDAFile} already exists")
    else:
        print("Model doesn't exist. Creating a new model from the collection of documents; this may take a while.")
        # fall back to English if lang is not covered by the spaCy models
        if lang not in ['en', 'es', 'pt', 'fr', 'it', 'nl', 'de']:
            lang = 'en'
        compute_lda_model(pathToCollectionOfDocs,
                          pathToLDAFile,
                          n_topics=500,
                          extension='txt',
                          language=lang,
                          normalization=normalization)
        print("Model just created")
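# A minimal call sketch for the helper above. The dataset layout (a
# 'docsutf8/' subfolder of .txt files) follows the function itself; the
# paths and dataset name here are hypothetical, and pathToLDAFolder must
# end with a separator because the model path is built by concatenation.
CreateLatentDirichletAllocationModel(pathDataset='datasets/Inspec',
                                     dataset_name='Inspec',
                                     lang='en',
                                     normalization='stemming',
                                     pathToLDAFolder='models/lda/')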
# -*- coding: utf-8 -*-

import logging
import sys

from pke import compute_lda_model

# display INFO-level messages in the terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = sys.argv[1]

# path to the output LDA model, saved as a gzipped file
output_file = sys.argv[2]

# number of topics for the LDA model
n_topics = int(sys.argv[3])

# compute the LDA model
compute_lda_model(input_dir=input_dir,
                  output_file=output_file,
                  n_topics=n_topics,
                  extension="xml",
                  language="en",
                  normalization="stemming")
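# Once saved, the gzipped model is consumed by TopicalPageRank; a minimal
# sketch following pke's documented example, with hypothetical input and
# model paths. The normalization should match the one used to build the
# LDA model above ('stemming').
import pke

extractor = pke.unsupervised.TopicalPageRank()

# load and preprocess the document
extractor.load_document(input='path/to/input.xml',
                        language='en',
                        normalization='stemming')

# select the noun phrases as keyphrase candidates (default grammar)
extractor.candidate_selection()

# weight the candidates using Single Topical PageRank and the LDA model
extractor.candidate_weighting(window=10,
                              pos={'NOUN', 'PROPN', 'ADJ'},
                              lda_model='path/to/lda_model.gz')

# get the 10 highest-scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=10)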
# -*- coding: utf-8 -*-

import logging
import sys

from pke import compute_lda_model

# display INFO-level messages in the terminal
logging.basicConfig(level=logging.INFO)

# path to the collection of documents
input_dir = sys.argv[1]

# path to the output LDA model, saved as a gzipped file
output_file = sys.argv[2]

# number of topics for the LDA model
n_topics = int(sys.argv[3])

# compute the LDA model (older pke API)
compute_lda_model(input_dir=input_dir,
                  output_file=output_file,
                  n_topics=n_topics,
                  format="corenlp",    # input files format
                  extension='xml',     # input files extension
                  use_lemmas=False,    # do not use Stanford lemmas
                  stemmer="porter",    # use the Porter stemmer
                  language="english")  # language for the stop words
from glob import glob
from xml.etree import ElementTree

from pke import compute_lda_model


def read_corenlp_xml(path):
    """Read a CoreNLP XML file as a list of (word, POS) sentences."""
    sentences = []
    tree = ElementTree.parse(path)
    for sentence in tree.iterfind('./document/sentences/sentence'):
        starts = [int(u.text) for u in
                  sentence.iterfind('tokens/token/CharacterOffsetBegin')]
        ends = [int(u.text) for u in
                sentence.iterfind('tokens/token/CharacterOffsetEnd')]
        doc = {
            'words': [u.text for u in sentence.iterfind('tokens/token/word')],
            'lemmas': [u.text for u in sentence.iterfind('tokens/token/lemma')],
            'POS': [u.text for u in sentence.iterfind('tokens/token/POS')],
            'char_offsets': [(starts[k], ends[k]) for k in range(len(starts))]
        }
        sentences.append([(doc['words'][i], doc['POS'][i])
                          for i in range(len(doc['words']))])
    return sentences


# input_dir, output_file and n_topics are assumed to be defined as in the
# scripts above (e.g. read from sys.argv); input_dir must end with a separator
documents = []
for fn in glob(input_dir + '*.xml'):
    doc = read_corenlp_xml(fn)
    documents.append(doc)

compute_lda_model(documents,
                  output_file=output_file,
                  n_topics=n_topics,
                  language='en',
                  normalization='stemming')
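# If the corpus is plain text rather than CoreNLP XML, the same newer-style
# call can be fed raw document strings instead of (word, POS) sentences;
# a sketch assuming pke's readers accept raw text, with hypothetical paths.
from glob import glob

from pke import compute_lda_model

texts = []
for fn in glob('corpus/*.txt'):
    with open(fn, encoding='utf-8') as f:
        texts.append(f.read())

compute_lda_model(texts,
                  output_file='lda-model.gz',
                  n_topics=500,
                  language='en',
                  normalization='stemming')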
pke.compute_document_frequency(input_dir=path_to_train,
                               output_file=path_to_df_file,
                               extension=params["extension"],
                               language=params["language"],
                               normalization=params["normalization"],
                               stoplist=punctuations,
                               delimiter='\t',
                               n=5)

# pre-compute LDA distributions if needed
need_lda = any(model in ['TopicalPageRank'] for model in params['models'])
if need_lda and not os.path.isfile(path_to_lda_file):
    logging.info("computing LDA distributions from {}".format(params["path"]))
    pke.compute_lda_model(input_dir=path_to_train,
                          output_file=path_to_lda_file,
                          n_topics=params["n_topics"],
                          extension=params["extension"],
                          language=params["language"],
                          normalization=params["normalization"])

# pre-compute pairwise similarities if needed
need_pairwise = any(model in ['ExpandRank'] for model in params['models'])
if need_pairwise and not os.path.isfile(path_to_pairwise_file):
    logging.info("computing pairwise similarities in {}".format(
        params["path"]))
    logging.info("loading DF counts from {}".format(path_to_df_file))
    df_counts = pke.load_document_frequency_file(input_file=path_to_df_file)
    pke.compute_pairwise_similarity_matrix(
        input_dir=path_to_test,