def get_minibatch_iterator(seed=8675309, nBatch=10, nObsBatch=None,
                           nObsTotal=25000, nLap=1, allocModelName=None,
                           dataorderseed=0, **kwargs):
    """Build a minibatch iterator over the toy bag-of-words dataset.

    The bag-of-words dictionary produced by ``get_BoW(seed)`` is wrapped in a
    ``WordsData`` object, which is then sliced into minibatches by
    ``AdmixMinibatchIterator``.  ``nObsTotal``, ``allocModelName`` and extra
    ``**kwargs`` are accepted for interface compatibility but not used here.
    """
    bow_dict = get_BoW(seed)
    dataset = WordsData(**bow_dict)
    iterator = AdmixMinibatchIterator(
        dataset,
        nBatch=nBatch,
        nObsBatch=nObsBatch,
        nLap=nLap,
        dataorderseed=dataorderseed,
    )
    iterator.summary = get_data_info()
    return iterator
def run_topic_modeling(username, folder_id, ex_id):
    """Run HDP topic modeling (via bnpy) over a folder's word-count file.

    Loads the folder's vocabulary and per-token word counts, builds a
    ``WordsData`` object, fits an HDP/Mult model with variational Bayes, and
    stores the resulting visualization state on the experiment.  Status is
    written to the DB at start ('inprogress') and end ('finish').

    Parameters:
        username:  owner of the experiment (used for status + progress keys).
        folder_id: primary key of the Folder holding the data files.
        ex_id:     primary key of the Experiment to update.
    """
    d = Folder.query.get(folder_id)
    ex = Experiment.query.get(ex_id)
    set_tm_status(username, folder_id, ex, 'inprogress')
    db.session.commit()
    exinfo = ex.getExInfo()

    # Vocabulary: one term per line; map line index -> term.
    # (Original leaked the file handle and hand-rolled the index counter.)
    with open(d.vocab_path(), 'r') as vocab_file:
        vocab = {i: line.strip() for i, line in enumerate(vocab_file)}

    # Word counts: each CSV row is (docID, wordID, count), grouped by docID.
    with open(d.wordcount_path(), 'r') as count_file:
        lines = [row.strip().split(",") for row in count_file]

    word_id = []
    word_count = []
    docrange = []   # half-open [start, stop) token spans, one per document
    start = 0
    cur_doc = None
    for cur, l in enumerate(lines):
        word_id.append(int(l[1]))
        word_count.append(int(l[2]))
        dID = int(l[0])
        if cur_doc is None:
            cur_doc = dID
        if cur_doc != dID:
            # Document boundary: the finished doc spans tokens [start, cur).
            # BUGFIX: the original appended [start, cur-1] and restarted at
            # cur-1, which dropped each document's last token and overlapped
            # the next document by one.
            docrange.append([start, cur])
            start = cur
            cur_doc = dID
    if lines:
        # Close out the final document (len(lines) == last index + 1,
        # matching the original's [start, cur+1]); skip entirely for an
        # empty file instead of emitting a bogus [0, 0] span.
        docrange.append([start, len(lines)])

    data = WordsData(word_id, word_count, docrange, len(vocab), vocab,
                     len(docrange))

    # Fit the model; customFuncArgs lets the progress callback report
    # under this (tm_id, username) pair.
    a = {"tm_id": str(d.id), "username": username}
    hmodel = bnpy.Run.run(data, 'HDPModel', 'Mult', 'VB',
                          doSaveToDisk=False,
                          K=exinfo.nTopics,
                          nLap=100,
                          initname="randomfromprior",
                          customFuncPath="refinery/webapp/",
                          customFuncArgs=json.dumps(a))

    exinfo.viz_data = getModelState(hmodel[0], hmodel[1], 100)
    set_tm_status(username, folder_id, ex, 'finish')
    db.session.commit()
def get_data(seed=8675309, nObsTotal=25000, **kwargs):
    """Return the toy bag-of-words corpus as a ``WordsData`` object.

    The dictionary from ``get_BoW(seed)`` is expanded into ``WordsData`` and
    annotated with the dataset summary.  ``nObsTotal`` and extra ``**kwargs``
    are accepted for interface compatibility but not used here.
    """
    dataset = WordsData(**get_BoW(seed))
    dataset.summary = get_data_info()
    return dataset