def fetch_sotu():
    baseurl = 'http://stateoftheunion.onetwothree.net/texts/'
    path = 'data/sotu/sotus.pkl'

    def download_text(datestr):
        # Grab one address and keep only the concatenated <p> paragraphs.
        pagetext = urlopen(baseurl + datestr + '.html').read().replace('\n', ' ')
        paragraphs = re.findall(r'<p>(.*?)</p>', pagetext, re.DOTALL)
        return ' '.join(paragraph.strip() for paragraph in paragraphs)

    if not os.path.isfile(path):
        # Scrape the index page for the available dates, then download each address.
        response = urlopen(baseurl + 'index.html')
        dates = re.findall(r'<li><a href="([0-9]+)\.html">', response.read())

        print('Downloading SOTU data...')
        sotus = {date: download_text(date) for date in progprint(dates)}
        print('...done!')

        mkdir(os.path.dirname(path))
        with open(path, 'w') as outfile:
            pickle.dump(sotus, outfile, protocol=-1)
    else:
        with open(path, 'r') as infile:
            sotus = pickle.load(infile)

    return sotus
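# Usage sketch. `_example_sotu_counts` is a hypothetical helper, and the
# CountVectorizer settings are illustrative assumptions rather than the
# settings used in the experiments; it assumes scikit-learn is installed.
def _example_sotu_counts():
    from sklearn.feature_extraction.text import CountVectorizer

    sotus = fetch_sotu()  # downloads on the first call, reads the pickle cache afterwards
    dates, texts = zip(*sorted(sotus.items()))

    vectorizer = CountVectorizer(stop_words='english', max_features=5000)
    counts = vectorizer.fit_transform(texts)  # sparse document-by-word count matrix
    return dates, counts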
def fit_ldac_ctm(num_topics, datadir, resultsdir):
    # Clear out any .dat files left over from a previous run, then invoke the
    # ctm-c 'est' command, logging its output to `logfile`.
    if isdir(resultsdir):
        for f in glob(join(resultsdir, '*.dat')):
            os.remove(f)
    mkdir(resultsdir)

    with open(logfile, 'w') as log:
        subprocess.check_call(
            [ctm_binary_path, 'est', join(datadir, documentfile),
             str(num_topics), 'rand', resultsdir, settingsfile],
            stdout=log, stderr=subprocess.STDOUT)
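# The ctm-c binary reads documents in LDA-C format, one document per line:
#     <num_unique_terms> <word_id>:<count> <word_id>:<count> ...
# For example, a document containing word 0 twice, word 5 once, and word 17
# four times would be written as the line "3 0:2 5:1 17:4".
# dump_ldac_dataset below serializes a sparse count matrix into this format.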
def dump_ldac_dataset(train_data, datadir):
    mkdir(datadir)

    def pairs(row):
        return ['{}:{}'.format(wordid, int(row[0, wordid]))
                for wordid in row.nonzero()[1]]

    def line(row):
        return '{} {}\n'.format(row.nnz, ' '.join(pairs(row)))

    with open(join(datadir, documentfile), 'w') as outfile:
        outfile.writelines(line(row) for row in train_data if row.nnz > 0)

    # Empty documents are skipped; return a mask marking which rows were written.
    return np.array([row.nnz > 0 for row in train_data])
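# End-to-end sketch. `_example_fit_ctm` is a hypothetical wrapper: `train_data`
# is assumed to be a scipy.sparse CSR document-by-word count matrix, and the
# directory names are illustrative.
def _example_fit_ctm(train_data, num_topics=10,
                     datadir='data/ctm', resultsdir='results/ctm'):
    kept = dump_ldac_dataset(train_data, datadir)  # writes the LDA-C documentfile
    fit_ldac_ctm(num_topics, datadir, resultsdir)  # runs the ctm-c 'est' command
    return kept  # boolean mask of the documents that were actually written and fit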
def cached(func):
    mkdir(cachedir)
    cachebase = os.path.join(cachedir, func.__module__ + func.__name__)

    def replace_arrays(v):
        # Hash array-like arguments so they can participate in the cache key.
        if isinstance(v, np.ndarray):
            return hashlib.sha1(v).hexdigest()
        if isinstance(v, scipy.sparse.csr.csr_matrix):
            out = hashlib.sha1(v.data)
            out.update(v.indices)
            out.update(v.indptr)
            return out.hexdigest()
        return v

    @wraps(func)
    def wrapped(*args, **kwargs):
        argdict = {k: replace_arrays(v)
                   for k, v in inspect.getcallargs(func, *args, **kwargs).iteritems()}
        closurevals = [replace_arrays(cell.cell_contents)
                       for cell in func.__closure__ or []]

        key = str(hash(frozenset(argdict.items() + closurevals)))
        cachefile = cachebase + '.' + key

        if os.path.isfile(cachefile):
            with gzip.open(cachefile, 'r') as infile:
                value = pickle.load(infile)
            return value
        else:
            value = func(*args, **kwargs)
            with gzip.open(cachefile, 'w') as outfile:
                pickle.dump(value, outfile, protocol=-1)
            return value

    return wrapped
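# Usage sketch for the decorator above. `_example_expensive_stat` is a
# hypothetical function; the point is that ndarray (and CSR) arguments are
# hashed into the cache key, so repeated calls with the same data are served
# from the gzipped pickle cache under `cachedir`.
@cached
def _example_expensive_stat(X):
    # Only the first call with a given X pays the cost of the SVD.
    return np.linalg.svd(X, compute_uv=False)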
"var max iter 20", "cg max iter -1", "em convergence 1e-3", "var convergence 1e-6", "cg convergence 1e-6", "lag 1", "covariance estimate mle", ] if not has_ctm_c: msg = 'Please download ctm-c from {url} to {ctmdir} and build it. ' \ '(i.e. the ctm binary should be at {ctm_binary_path})'.format( url=ctm_url, ctmdir=ctmdir, ctm_binary_path=ctm_binary_path) raise Exception('\n' + '\n'.join(wrap(msg, 82))) mkdir(os.path.dirname(settingsfile)) with open(settingsfile, 'w') as outfile: outfile.writelines('\n'.join(settings)) def dump_ldac_dataset(train_data, datadir): mkdir(datadir) def pairs(row): return ['{}:{}'.format( wordid,int(row[0,wordid])) for wordid in row.nonzero()[1]] def line(row): return '{} {}\n'.format(row.nnz, ' '.join(pairs(row))) with open(join(datadir, documentfile), 'w') as outfile:
"var max iter 20", "cg max iter -1", "em convergence 1e-3", "var convergence 1e-6", "cg convergence 1e-6", "lag 1", "covariance estimate mle", ] if not has_ctm_c: msg = 'Please download ctm-c from {url} to {ctmdir} and build it. ' \ '(i.e. the ctm binary should be at {ctm_binary_path})'.format( url=ctm_url, ctmdir=ctmdir, ctm_binary_path=ctm_binary_path) raise Exception('\n' + '\n'.join(wrap(msg, 82))) mkdir(os.path.dirname(settingsfile)) with open(settingsfile, 'w') as outfile: outfile.writelines('\n'.join(settings)) def dump_ldac_dataset(train_data, datadir): mkdir(datadir) def pairs(row): return [ '{}:{}'.format(wordid, int(row[0, wordid])) for wordid in row.nonzero()[1] ] def line(row): return '{} {}\n'.format(row.nnz, ' '.join(pairs(row)))
print("Singular vector ", d, " Singular value, ", S[d]) print("Right: ") print(top_k(5, pi_vd)) print("Left: ") print(top_k(5, pi_ud)) if __name__ == "__main__": run = 1 results_dir = os.path.join("results", "alice", "run%03d" % run) # Make sure the results directory exists from pgmult.utils import mkdir if not os.path.exists(results_dir): print("Making results directory: ", results_dir) mkdir(results_dir) # Load the data Xs, words = load() # N_docs = 1 docs = slice(0,1) T_end = 4000 T_split = 100 # Filter out documents shorter than 2 * T_split Xfilt = [X for X in Xs if X.shape[0] > T_end] Xtrain = [X[:T_end-T_split] for X in Xfilt[docs]] Xtest = [X[T_end-T_split:T_end] for X in Xfilt[docs]] # Perform inference for a range of latent state dimensions and models
                legendargs={"columnspacing": 0.75,
                            "handletextpad": 0.1})
    fig.savefig(os.path.join(results_dir, "legend.pdf"))


if __name__ == "__main__":
    run = 5
    results_dir = os.path.join("results", "dna", "run%03d" % run)

    # Make sure the results directory exists
    from pgmult.utils import mkdir
    if not os.path.exists(results_dir):
        print("Making results directory: ", results_dir)
        mkdir(results_dir)

    # Load data
    Xs, key = load_data()

    # Split data into two
    T_end = Xs[0].shape[0]
    T_split = 10
    Xtrain = [Xs[0][:T_end - T_split, :]]
    Xtest = Xs[0][T_end - T_split:T_end, :]
    K = len(key)

    # Perform inference for a range of latent state dimensions and models
    N_samples = 1000
    all_results = []
    # Ds = np.array([2, 3, 4, 5, 6])