def UpdateModel(self):
    app_path = self.request.folder
    app_model_path = '{}/data/treetm'.format(app_path)
    iterCount = self.GetIterCount(app_model_path)
    iters = self.GetIters(iterCount)
    mustLinks, cannotLinks, keepTerms, removeTerms = self.GetConstraints()
    action = self.GetAction()
    # Retrain only when explicitly requested and an iteration count is available
    if action == 'train' and iters is not None:
        RefineLDA(app_model_path, numIters=iters,
                  mustLinks=mustLinks, cannotLinks=cannotLinks,
                  keepTerms=keepTerms, removeTerms=removeTerms)
        # Rebuild the LDA database from the refined tree-TM output
        with LDA_DB(isReset=True) as lda_db:
            reader = TreeTMReader(lda_db, app_model_path)
            reader.Execute()
            computer = LDA_ComputeStats(lda_db)
            computer.Execute()
    # In either case, report the current iteration count and constraints
    self.content.update({
        'IterCount': iterCount,
        'MustLinks': mustLinks,
        'CannotLinks': cannotLinks,
        'KeepTerms': keepTerms,
        'RemoveTerms': removeTerms
    })

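# A minimal sketch of driving the same retraining pass directly, outside the
# UpdateModel handler above. The constraint shapes (lists of term groups for
# must-links and cannot-links, flat term lists for keep/remove terms) and the
# model path are illustrative assumptions, not confirmed against the treetm
# sources; consult RefineLDA itself for the authoritative format.
def example_refine(app_model_path='apps/demo/data/treetm'):  # hypothetical path
    RefineLDA(app_model_path, numIters=50,
              mustLinks=[['stock', 'market'], ['bank', 'finance']],  # assumed: term groups to pull together
              cannotLinks=[['bank', 'river']],                       # assumed: term groups to push apart
              keepTerms=['economy'],                                 # assumed: terms to protect
              removeTerms=['http'])                                  # assumed: terms to drop
    # Re-read the refined model and recompute statistics, as UpdateModel does
    with LDA_DB(isReset=True) as lda_db:
        TreeTMReader(lda_db, app_model_path).Execute()
        LDA_ComputeStats(lda_db).Execute()
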
def index():
    with Corpus_DB() as corpus_db:
        with BOW_DB() as bow_db:
            with LDA_DB() as lda_db:
                handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
                response.delimiters = ('[[', ']]')
                return handler.GenerateResponse()

def TermFrequencyModel():
    with BOW_DB() as bow_db:
        with LDA_DB() as lda_db:
            handler = TermTopicMatrix1(request, response, bow_db, lda_db)
            data = handler.GetTermFrequencyModel()
            dataStr = json.dumps(data, encoding='utf-8', indent=2, sort_keys=True)
            response.headers['Content-Type'] = 'application/json'
            return dataStr

def GroupInBox():
    with Corpus_DB() as corpus_db:
        with BOW_DB() as bow_db:
            with LDA_DB() as lda_db:
                handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
                handler.LoadGIB()
                return handler.GenerateResponse()

def Inspect():
    with Corpus_DB() as corpus_db:
        with BOW_DB() as bow_db:
            with LDA_DB() as lda_db:
                handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
                handler.InspectModel()
                return handler.GenerateResponse()

def ImportMalletLDA( app_name, model_path, corpus_path, database_path, is_quiet, force_overwrite ):
    logger = logging.getLogger( 'termite' )
    logger.addHandler( logging.StreamHandler() )
    logger.setLevel( logging.INFO if is_quiet else logging.DEBUG )

    app_path = 'apps/{}'.format( app_name )
    corpus_filename = '{}/corpus.txt'.format( corpus_path )
    database_filename = '{}/corpus.db'.format( database_path )
    logger.info( '--------------------------------------------------------------------------------' )
    logger.info( 'Import an ITM topic model as a web2py application...' )
    logger.info( '          app_name = %s', app_name )
    logger.info( '          app_path = %s', app_path )
    logger.info( '        model_path = %s', model_path )
    logger.info( '   corpus_filename = %s', corpus_filename )
    logger.info( ' database_filename = %s', database_filename )
    logger.info( '--------------------------------------------------------------------------------' )

    if force_overwrite or not os.path.exists( app_path ):
        with CreateApp(app_name) as app:
            # Create a copy of the original corpus
            app_database_filename = '{}/corpus.db'.format( app.GetDataPath() )
            logger.info( 'Copying [%s] --> [%s]', database_filename, app_database_filename )
            shutil.copy( database_filename, app_database_filename )
            # Import corpus (models/corpus.db, data/corpus.txt, data/sentences.txt)
            app_corpus_filename = '{}/corpus.txt'.format( app.GetDataPath() )
            logger.info( 'Copying [%s] --> [%s]', corpus_filename, app_corpus_filename )
            shutil.copy( corpus_filename, app_corpus_filename )
            app_sentences_filename = '{}/sentences.txt'.format( app.GetDataPath() )
            logger.info( 'Extracting [%s] --> [%s]', corpus_filename, app_sentences_filename )
            SplitSentences( corpus_filename, app_sentences_filename )
            app_db_filename = '{}/corpus.db'.format( app.GetDatabasePath() )
            logger.info( 'Copying [%s] --> [%s]', database_filename, app_db_filename )
            shutil.copy( database_filename, app_db_filename )
            # Compute derived statistics about the corpus
            db_path = app.GetDatabasePath()
            with Corpus_DB(db_path, isInit=True) as corpus_db:
                computer = Corpus_ComputeStats( corpus_db, app_corpus_filename, app_sentences_filename )
                computer.Execute()
                # Import model
                app_model_path = '{}/treetm'.format( app.GetDataPath() )
                logger.info( 'Copying [%s] --> [%s]', model_path, app_model_path )
                shutil.copytree( model_path, app_model_path )
                # Compute derived statistics about the model
                with LDA_DB(db_path, isInit=True) as lda_db:
                    reader = TreeTMReader( lda_db, app_model_path )
                    reader.Execute()
                    computer = LDA_ComputeStats( lda_db, corpus_db )
                    computer.Execute()
                with ITM_DB(db_path, isInit=True) as itm_db:
                    computer = ITM_ComputeStats( itm_db, corpus_db )
                    computer.Execute()
    else:
        logger.info( '    Already available: %s', app_path )

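# A hedged command-line wrapper for ImportMalletLDA, mirroring the argparse
# mains used elsewhere in this codebase; the positional/flag names here are
# illustrative choices, not taken from the original tool.
import argparse

def main():
    parser = argparse.ArgumentParser(description='Import an ITM topic model as a web2py application.')
    parser.add_argument('app_name', type=str, help='Name of the web2py application to create')
    parser.add_argument('model_path', type=str, help='Folder containing the trained tree-TM model')
    parser.add_argument('corpus_path', type=str, help='Folder containing corpus.txt')
    parser.add_argument('database_path', type=str, help='Folder containing corpus.db')
    parser.add_argument('--quiet', action='store_true', help='Log at INFO instead of DEBUG')
    parser.add_argument('--overwrite', action='store_true', help='Overwrite an existing app folder')
    args = parser.parse_args()
    ImportMalletLDA(args.app_name, args.model_path, args.corpus_path,
                    args.database_path, args.quiet, args.overwrite)
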
def ImportSTM( app_name, model_path, corpus_path, database_path, is_quiet, force_overwrite ):
    logger = logging.getLogger( 'termite' )
    logger.addHandler( logging.StreamHandler() )
    logger.setLevel( logging.INFO if is_quiet else logging.DEBUG )

    app_path = 'apps/{}'.format( app_name )
    corpus_filename = '{}/corpus.txt'.format( corpus_path )
    database_filename = '{}/corpus.db'.format( database_path )
    logger.info( '--------------------------------------------------------------------------------' )
    logger.info( 'Import an STM topic model as a web2py application...' )
    logger.info( '          app_name = %s', app_name )
    logger.info( '          app_path = %s', app_path )
    logger.info( '        model_path = %s', model_path )
    logger.info( '   corpus_filename = %s', corpus_filename )
    logger.info( ' database_filename = %s', database_filename )
    logger.info( '--------------------------------------------------------------------------------' )

    if force_overwrite or not os.path.exists( app_path ):
        with CreateApp(app_name) as app:
            # Import corpus (models/corpus.db, data/corpus.txt, data/sentences.txt)
            app_database_filename = '{}/corpus.db'.format( app.GetDatabasePath() )
            app_corpus_filename = '{}/corpus.txt'.format( app.GetDataPath() )
            app_sentences_filename = '{}/sentences.txt'.format( app.GetDataPath() )
            logger.info( 'Copying [%s] --> [%s]', database_filename, app_database_filename )
            shutil.copy( database_filename, app_database_filename )
            logger.info( 'Copying [%s] --> [%s]', corpus_filename, app_corpus_filename )
            shutil.copy( corpus_filename, app_corpus_filename )
            logger.info( 'Extracting [%s] --> [%s]', corpus_filename, app_sentences_filename )
            SplitSentences( corpus_filename, app_sentences_filename )
            # Import model (data/*)
            app_model_path = '{}/stm'.format( app.GetDataPath() )
            logger.info( 'Copying [%s] --> [%s]', model_path, app_model_path )
            shutil.copytree( model_path, app_model_path )
            for stm_filename in [ 'doc-index.json', 'term-index.json', 'topic-index.json', 'doc-topic-matrix.txt', 'term-topic-matrix.txt' ]:
                source_filename = '{}/{}'.format( corpus_path, stm_filename )
                target_filename = '{}/{}'.format( app_model_path, stm_filename )
                logger.info( 'Copying [%s] --> [%s]', source_filename, target_filename )
                shutil.copy( source_filename, target_filename )
            db_path = app.GetDatabasePath()
            with Corpus_DB(db_path) as corpus_db:
                # Create a bag-of-words language model
                with BOW_DB(db_path, isInit=True) as bow_db:
                    bow_computer = BOW_ComputeStats( bow_db, corpus_db, app_corpus_filename, app_sentences_filename )
                    bow_computer.Execute()
                # Compute derived statistics about an LDA-like topic model
                with LDA_DB(db_path, isInit=True) as lda_db:
                    stm_reader = STMReader( lda_db, app_model_path, corpus_db )
                    stm_reader.Execute()
                    lda_computer = LDA_ComputeStats( lda_db, corpus_db )
                    lda_computer.Execute()
    else:
        logger.info( '    Already available: %s', app_path )

def gib():
    with Corpus_DB() as corpus_db:
        with BOW_DB() as bow_db:
            with LDA_DB() as lda_db:
                handler = GroupInBoxHandler(request, response, corpus_db, bow_db, lda_db)
                handler.UpdateModel()
                handler.InspectModel()
                handler.LoadGIB()
                dataStr = json.dumps(handler.content, encoding='utf-8', indent=2, sort_keys=True)
                response.headers['Content-Type'] = 'application/json'
                return dataStr

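# A sketch of driving the gib() endpoint over HTTP. The host, app name, and
# query-parameter names ('action', 'iters') are assumptions inferred from the
# handler's GetAction()/GetIters() accessors, not confirmed names. Written in
# Python 2 style (urllib/urllib2) to match the rest of this codebase.
import json
import urllib
import urllib2

def fetch_gib(base_url='http://127.0.0.1:8000/demo_app'):  # hypothetical URL
    params = urllib.urlencode({'action': 'train', 'iters': 50})  # assumed parameter names
    response = urllib2.urlopen('{}/gib?{}'.format(base_url, params))
    return json.loads(response.read())
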
def main():
    parser = argparse.ArgumentParser(description='Import an STM topic model as a folder of files.')
    parser.add_argument('path', type=str, default='poliblog_1', help='A folder containing file "stm.RData"')
    args = parser.parse_args()
    path = args.path
    logger = logging.getLogger('termite')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)

    # Read the R-based STM model into an LDA-style database
    with Corpus_DB('.') as corpus_db:
        with LDA_DB(path, isInit=True) as lda_db:
            reader = STMReader(lda_db, path, corpus_db, r_variable="mod.out.replicate")
            reader.Execute()

    # Export per-topic term weights via the sqlite3 command-line client
    command = 'sqlite3 -separator "\t" {PATH}/lda.db "SELECT topic_index, term_text, value FROM term_topic_matrix INNER JOIN terms ON term_topic_matrix.term_index = terms.term_index ORDER BY topic_index ASC, value DESC" > {PATH}/topic-word-weights.txt'.format(PATH=path)
    logger.info(command)
    os.system(command)

    # Export overall topic weights
    command = 'sqlite3 -separator "\t" {PATH}/lda.db "SELECT topic_index, SUM(value) FROM doc_topic_matrix GROUP BY topic_index ORDER BY topic_index" > {PATH}/topic-weights.txt'.format(PATH=path)
    logger.info(command)
    os.system(command)

    # Normalize topic weights by the maximum and write meta.json
    data = []
    max_value = 0
    filename = '{}/topic-weights.txt'.format(path)
    with open(filename, 'r') as f:
        for line in f.read().splitlines():
            topic_index, topic_weight = line.split('\t')
            topic_index = int(topic_index)
            topic_weight = float(topic_weight)
            max_value = max(topic_weight, max_value)
            data.append({
                "topic_index": topic_index,
                "topic_weight": topic_weight,
                "value": topic_weight
            })
    for elem in data:
        elem['value'] = elem['value'] / max_value
    filename = '{}/meta.json'.format(path)
    with open(filename, 'w') as f:
        json.dump(data, f, encoding='utf-8', indent=2, sort_keys=True)

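# The two exports above shell out to the sqlite3 command-line client; here is
# an equivalent sketch of the first export using the sqlite3 stdlib module
# instead, assuming the same lda.db schema the queries above rely on
# (term_topic_matrix joined against terms).
import sqlite3

def export_topic_word_weights(path):
    conn = sqlite3.connect('{}/lda.db'.format(path))
    rows = conn.execute(
        'SELECT topic_index, term_text, value '
        'FROM term_topic_matrix INNER JOIN terms '
        'ON term_topic_matrix.term_index = terms.term_index '
        'ORDER BY topic_index ASC, value DESC')
    with open('{}/topic-word-weights.txt'.format(path), 'w') as f:
        for topic_index, term_text, value in rows:
            f.write('{}\t{}\t{}\n'.format(topic_index, term_text, value))
    conn.close()
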
def main():
    parser = argparse.ArgumentParser(description='Import a gensim topic model as a folder of files.')
    parser.add_argument('path', type=str, default='model_001', help='A folder containing files "gensim.dict" and "gensim.model"')
    args = parser.parse_args()
    path = args.path
    logger = logging.getLogger('termite')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)

    # Read the gensim dictionary/model pair into an LDA-style database
    with LDA_DB(path, isInit=True) as lda_db:
        reader = GensimReader(lda_db, path, None, extraStateFile=True)
        reader.Execute()

    # Export per-topic term weights via the sqlite3 command-line client
    command = 'sqlite3 -separator "\t" {PATH}/lda.db "SELECT topic_index, term_text, value FROM term_topic_matrix INNER JOIN terms ON term_topic_matrix.term_index = terms.term_index ORDER BY topic_index ASC, value DESC" > {PATH}/topic-word-weights.txt'.format(PATH=path)
    logger.info(command)
    os.system(command)

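# A sketch of producing the "gensim.dict" and "gensim.model" files this
# importer expects, using the public gensim API. The filenames come from the
# help string above; the toy corpus and parameter values are placeholders.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

def build_gensim_model(docs, path='model_001', num_topics=10):
    dictionary = Dictionary(docs)                        # term <-> id mapping
    corpus = [dictionary.doc2bow(doc) for doc in docs]   # bag-of-words vectors
    model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    dictionary.save('{}/gensim.dict'.format(path))
    model.save('{}/gensim.model'.format(path))
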
def index():
    with BOW_DB() as bow_db:
        with LDA_DB() as lda_db:
            handler = TermTopicMatrix1(request, response, bow_db, lda_db)
            return handler.GenerateResponse()

def index():
    with LDA_DB() as lda_db:
        handler = ITM_Core(request, response, lda_db)
        return handler.GenerateResponse()

def gib():
    with Corpus_DB() as corpus_db:
        with LDA_DB() as lda_db:
            handler = ITM_GroupInBox(request, response, corpus_db, lda_db)
            handler.Load()
            return handler.GenerateResponse()

def Update():
    with LDA_DB() as lda_db:
        handler = ITM_Core(request, response, lda_db)
        handler.UpdateModel()
        return handler.GenerateResponse()

def TopDocs():
    with LDA_DB() as lda_db:
        handler = LDA_Core(request, response, lda_db)
        handler.LoadTopDocs()
        return handler.GenerateResponse()

def DocTopicMatrix():
    with LDA_DB() as lda_db:
        handler = LDA_Core(request, response, lda_db)
        handler.LoadDocTopicMatrix()
        return handler.GenerateResponse()