def get_minibatch_iterator(nBatch=10, nLap=1, dataorderseed=0, **kwargs):
    '''Load the full dataset from matfilepath and wrap it for minibatch passes.

    Parameters
    ----------
    nBatch : number of minibatches per lap
    nLap : number of full passes (laps) through the data
    dataorderseed : seed controlling the order batches are visited

    Returns
    -------
    AdmixMinibatchIterator with a human-readable ``summary`` attached.
    '''
    fullData = WordsData.read_from_mat(matfilepath)
    batchIterator = AdmixMinibatchIterator(
        fullData, nBatch=nBatch, nLap=nLap, dataorderseed=dataorderseed)
    batchIterator.summary = get_data_info(
        fullData.nDocTotal, fullData.vocab_size)
    return batchIterator
def get_data(seed=8675309, nObsTotal=25000, **kwargs):
    '''Grab data from database to initialize (used only once really).

    Selects the first 500 documents (rowids 1..500) from the ``data`` table
    and wraps them in a WordsData object with a human-readable summary.

    Returns
    -------
    Data : WordsData
    '''
    # Bug fix: range(1, 500) yielded only 499 rowids (1..499), contradicting
    # the stated intent of grabbing the first 500 documents. Use 501 so the
    # half-open range covers rowids 1..500 inclusive.
    doc_id_select = range(1, 501)
    nDoc = len(doc_id_select)
    query = 'select * from data where rowid in (' + \
        ','.join(map(str, doc_id_select)) + ')'
    Data = WordsData.read_from_db(
        dbpath, query, nDoc=nDoc, nDocTotal=nDoc, vocab_size=V)
    Data.summary = get_data_info(Data.nDocTotal, Data.vocab_size)
    return Data
def get_test_data(seed=6789, nDocTotal=100, **kwargs):
    '''Create a collection of "heldout" documents for testing purposes.

    Uses the same generative process as get_data, but a different random
    seed, so the resulting documents are distinct from the training set.
    '''
    updateKwArgsWithDefaults(kwargs)
    kwargs.update(seed=seed, nDocTotal=nDocTotal)
    heldoutData = WordsData.CreateToyDataFromMixModel(**kwargs)
    heldoutData.name = get_short_name()
    heldoutData.summary = get_data_info()
    return heldoutData
def get_data(**kwargs):
    '''Create and return the toy dataset.

    Keyword Args
    -------
    nDocTotal
    nWordsPerDoc
    '''
    updateKwArgsWithDefaults(kwargs)
    toyData = WordsData.CreateToyDataFromMixModel(**kwargs)
    toyData.name = get_short_name()
    toyData.summary = get_data_info()
    return toyData
def get_data(seed=8675309, nObsTotal=25000, **kwargs):
    '''Grab data from database to initialize (used only once really).

    Pulls the first 500 documents (rowids 1..500) out of the ``data`` table
    at ``dbpath`` and returns them as a WordsData object.

    Returns
    -------
    Data : WordsData
    '''
    # Bug fix: the half-open range(1, 500) covered rowids 1..499, i.e. only
    # 499 documents despite the "first 500" intent. End at 501 instead.
    doc_id_select = range(1, 501)
    nDoc = len(doc_id_select)
    query = 'select * from data where rowid in (' + ','.join(
        map(str, doc_id_select)) + ')'
    Data = WordsData.read_from_db(dbpath, query,
                                  nDoc=nDoc, nDocTotal=nDoc, vocab_size=V)
    Data.summary = get_data_info(Data.nDocTotal, Data.vocab_size)
    return Data
def get_minibatch_iterator(seed=8675309, nBatch=10, nObsBatch=None, nObsTotal=25000, nLap=1, allocModelName=None, dataorderseed=0, **kwargs):
    '''Read the dataset from matfilepath and wrap it in a minibatch iterator.

    Parameters
    ----------
    nBatch : number of minibatches per lap
    nObsBatch : observations per batch (None lets the iterator decide)
    nLap : number of full passes through the data
    dataorderseed : seed controlling batch visit order

    Returns
    -------
    AdmixMinibatchIterator with a human-readable ``summary`` attached.
    '''
    wholeData = WordsData.read_from_mat(matfilepath)
    iterator = AdmixMinibatchIterator(
        wholeData,
        nBatch=nBatch,
        nObsBatch=nObsBatch,
        nLap=nLap,
        dataorderseed=dataorderseed)
    iterator.summary = get_data_info(
        wholeData.nDocTotal, wholeData.vocab_size)
    return iterator
def get_minibatch_iterator(seed=8675309, nBatch=10, nObsBatch=None, nObsTotal=25000, nLap=1, allocModelName=None, dataorderseed=0, **kwargs):
    '''Build the bag-of-words dataset for ``seed`` and wrap it for minibatch passes.

    Parameters
    ----------
    seed : random seed forwarded to get_BoW
    nBatch : number of minibatches per lap
    nObsBatch : observations per batch (None lets the iterator decide)
    nLap : number of full passes through the data
    dataorderseed : seed controlling batch visit order

    Returns
    -------
    AdmixMinibatchIterator with a human-readable ``summary`` attached.
    '''
    bowDict = get_BoW(seed)
    bowData = WordsData(**bowDict)
    iterator = AdmixMinibatchIterator(
        bowData,
        nBatch=nBatch,
        nObsBatch=nObsBatch,
        nLap=nLap,
        dataorderseed=dataorderseed)
    iterator.summary = get_data_info()
    return iterator
def get_data(**kwargs):
    '''Grab data from matfile specified by matfilepath.

    Returns
    -------
    WordsData with a human-readable ``summary`` attached.
    '''
    matData = WordsData.read_from_mat(matfilepath)
    matData.summary = get_data_info(matData.nDocTotal, matData.vocab_size)
    return matData
def CreateToyDataFromLDAModel(**kwargs):
    '''Generate toy data from an LDA model, filling unset options from Defaults.

    Any keyword not supplied by the caller falls back to the module-level
    ``Defaults`` dict; caller-provided values always win.
    '''
    for optName in Defaults:
        kwargs.setdefault(optName, Defaults[optName])
    return WordsData.CreateToyDataFromLDAModel(**kwargs)
def run_topic_modeling(username, folder_id, ex_id):
    '''Run HDP topic modeling on a folder's word-count data and store the result.

    Loads the folder's vocabulary and per-document word counts from disk,
    builds a WordsData object, fits an HDP/Mult model with bnpy variational
    Bayes, saves a visualization payload onto the experiment info, and
    updates the experiment status in the database before and after the run.

    Parameters
    ----------
    username : used for status updates and passed through to bnpy custom funcs
    folder_id : primary key of the Folder holding the data files
    ex_id : primary key of the Experiment being run
    '''
    d = Folder.query.get(folder_id)
    ex = Experiment.query.get(ex_id)
    # Mark the experiment as running before the (long) model fit starts.
    set_tm_status(username, folder_id, ex, 'inprogress')
    db.session.commit()
    exinfo = ex.getExInfo()
    # CREATE WORD DATA
    datafile = d.wordcount_path()
    vocabfile = d.vocab_path()
    # vocab maps integer word index -> word string, in file order.
    vocab = {}
    idx = 0
    vv = [x.strip() for x in open(vocabfile, 'r')]
    for v in vv:
        vocab[idx] = v
        idx += 1
    # Each line of datafile is "docID,wordID,count" (CSV, one token per row).
    lines = [x.strip().split(",") for x in open(datafile, 'r')]
    docrange = []
    word_id = []
    word_count = []
    start = 0
    cur = -1  # index of the current row; incremented at the top of the loop
    curD = -1  # document id of the segment currently being accumulated
    for l in lines:
        cur += 1
        word_id.append(int(l[1]))
        word_count.append(int(l[2]))
        dID = int(l[0])
        if(curD == -1):
            curD = dID
        if(curD != dID):
            # Close out the previous document's [start, end] row segment.
            # NOTE(review): intermediate segments end at cur-1 and the next
            # segment starts at cur-1, while the final segment below ends at
            # cur+1 — the inclusive/exclusive convention looks inconsistent
            # (adjacent segments overlap by one row). Whether this is correct
            # depends on the docrange convention WordsData expects — confirm
            # against the WordsData constructor before changing.
            docrange.append([start, cur-1])
            start = cur-1
            curD = dID
    docrange.append([start, cur+1])
    # presumably WordsData(word_id, word_count, docrange, vocab_size, vocab,
    # nDoc) — verify the positional argument order against its definition.
    data = WordsData(word_id, word_count, docrange, len(vocab), vocab, len(docrange))
    # RUN Topic Modeling in BNPY
    a = {"tm_id": str(d.id), "username": username}
    hmodel = bnpy.Run.run(data, 'HDPModel', 'Mult', 'VB', doSaveToDisk=False, K=exinfo.nTopics, nLap=100, initname="randomfromprior", customFuncPath="refinery/webapp/", customFuncArgs=json.dumps(a))
    '''
    moves='birth,merge', birthPerLap=10, \
    mergePerLap=10, nFreshLap=25)
    '''
    # bnpy.Run.run returns a tuple; element 0 is the fitted model and
    # element 1 its info — getModelState builds the visualization payload
    # (100 is presumably a top-words/terms limit — TODO confirm).
    exinfo.viz_data = getModelState(hmodel[0], hmodel[1], 100)
    set_tm_status(username, folder_id, ex, 'finish')
    db.session.commit()
def genWordsData(**kwargs):
    '''Generate a toy WordsData object, filling unset options from Defaults.

    Caller-supplied keywords take precedence; anything missing is copied
    from the module-level ``Defaults`` dict.
    '''
    for optName, optValue in Defaults.items():
        kwargs.setdefault(optName, optValue)
    return WordsData.genToyData(**kwargs)
def get_data_info():
    '''Return a one-line human-readable description of this dataset.'''
    return 'Toy Bars Data with %d true topics. Each doc uses ONE topic.' % (K)


def get_data(**kwargs):
    '''Create and return the toy dataset.

    Keyword Args
    -------
    nDocTotal
    nWordsPerDoc
    '''
    updateKwArgsWithDefaults(kwargs)
    toyData = WordsData.CreateToyDataFromMixModel(**kwargs)
    toyData.name = get_short_name()
    toyData.summary = get_data_info()
    return toyData


def updateKwArgsWithDefaults(kwargs):
    '''Fill any options missing from ``kwargs`` with module Defaults, in place.'''
    for optName in Defaults:
        kwargs.setdefault(optName, Defaults[optName])


if __name__ == '__main__':
    import bnpy.viz.BarsViz
    demoData = WordsData.CreateToyDataFromMixModel(**Defaults)
    bnpy.viz.BarsViz.plotExampleBarsDocs(demoData)
def get_data(seed=8675309, nObsTotal=25000, **kwargs):
    '''Build and return the bag-of-words dataset for the given random seed.

    Returns
    -------
    WordsData with a human-readable ``summary`` attached.
    '''
    bowDict = get_BoW(seed)
    bowData = WordsData(**bowDict)
    bowData.summary = get_data_info()
    return bowData