Example #1
File: dtm.py Project: HIPS/pgmult
def fetch_sotu():
    baseurl = 'http://stateoftheunion.onetwothree.net/texts/'
    path = 'data/sotu/sotus.pkl'

    def download_text(datestr):
        pagetext = urlopen(baseurl + datestr + '.html').read().replace('\n', ' ')
        paragraphs = re.findall(r'<p>(.*?)</p>', pagetext, re.DOTALL)
        return ' '.join(paragraph.strip() for paragraph in paragraphs)

    if not os.path.isfile(path):
        response = urlopen(baseurl + 'index.html')
        dates = re.findall(r'<li><a href="([0-9]+)\.html">', response.read())

        print('Downloading SOTU data...')
        sotus = {date:download_text(date) for date in progprint(dates)}
        print('...done!')

        mkdir(os.path.dirname(path))
        with open(path, 'wb') as outfile:
            pickle.dump(sotus, outfile, protocol=-1)
    else:
        with open(path, 'rb') as infile:
            sotus = pickle.load(infile)

    return sotus
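For reference, the fragment above does not show its imports. A plausible header is sketched below; mkdir is imported from pgmult.utils elsewhere in these examples, while the origin of progprint is an assumption (it is treated here as a progress-printing wrapper around an iterable):

import os
import re
import pickle
from urllib2 import urlopen   # Python 2; on Python 3 use urllib.request.urlopen

from pgmult.utils import mkdir   # creates the directory if it does not already exist
# progprint: assumed helper that iterates over `dates` while printing progress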
Example #2
def fetch_sotu():
    baseurl = 'http://stateoftheunion.onetwothree.net/texts/'
    path = 'data/sotu/sotus.pkl'

    def download_text(datestr):
        pagetext = urlopen(baseurl + datestr + '.html').read().replace(
            '\n', ' ')
        paragraphs = re.findall(r'<p>(.*?)</p>', pagetext, re.DOTALL)
        return ' '.join(paragraph.strip() for paragraph in paragraphs)

    if not os.path.isfile(path):
        response = urlopen(baseurl + 'index.html')
        dates = re.findall(r'<li><a href="([0-9]+)\.html">', response.read())

        print('Downloading SOTU data...')
        sotus = {date: download_text(date) for date in progprint(dates)}
        print('...done!')

        mkdir(os.path.dirname(path))
        with open(path, 'wb') as outfile:
            pickle.dump(sotus, outfile, protocol=-1)
    else:
        with open(path, 'rb') as infile:
            sotus = pickle.load(infile)

    return sotus
Example #3
def fetch_sotu():
    baseurl = "http://stateoftheunion.onetwothree.net/texts/"
    path = "data/sotu/sotus.pkl"

    def download_text(datestr):
        pagetext = urlopen(baseurl + datestr + ".html").read().replace("\n", " ")
        paragraphs = re.findall(r"<p>(.*?)</p>", pagetext, re.DOTALL)
        return " ".join(paragraph.strip() for paragraph in paragraphs)

    if not os.path.isfile(path):
        response = urlopen(baseurl + "index.html")
        dates = re.findall(r'<li><a href="([0-9]+)\.html">', response.read())

        print "Downloading SOTU data..."
        sotus = {date: download_text(date) for date in progprint(dates)}
        print "...done!"

        mkdir(os.path.dirname(path))
        with open(path, "w") as outfile:
            pickle.dump(sotus, outfile, protocol=-1)
    else:
        with open(path, "r") as infile:
            sotus = pickle.load(infile)

    return sotus
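A minimal usage sketch, assuming fetch_sotu and its helpers are importable:

sotus = fetch_sotu()              # dict mapping numeric date strings to address text
print(len(sotus))                 # number of State of the Union addresses
first = sorted(sotus)[0]          # earliest date string
print(first, sotus[first][:200])  # preview of that address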
Example #4
def fit_ldac_ctm(num_topics, datadir, resultsdir):
    if isdir(resultsdir):
        for f in glob(join(resultsdir, '*.dat')):
            os.remove(f)
    mkdir(resultsdir)
    with open(logfile, 'w') as log:
        subprocess.check_call(
            [ctm_binary_path, 'est', join(datadir, documentfile),
                str(num_topics), 'rand', resultsdir, settingsfile],
            stdout=log, stderr=subprocess.STDOUT)
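fit_ldac_ctm relies on several module-level names that the fragment does not define. The sketch below makes those dependencies explicit; the paths and file names are placeholders, not the project's actual configuration:

import os
import subprocess
from glob import glob
from os.path import isdir, join

from pgmult.utils import mkdir

ctm_binary_path = 'deps/ctm-c/ctm'       # placeholder: the built ctm-c binary
documentfile = 'documents.dat'           # placeholder: LDA-C formatted corpus file
settingsfile = 'data/ctm/settings.txt'   # placeholder: ctm-c settings file
logfile = 'results/ctm/ctm.log'          # placeholder: captures ctm-c's stdout/stderr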
Example #5
def fit_ldac_ctm(num_topics, datadir, resultsdir):
    if isdir(resultsdir):
        for f in glob(join(resultsdir, '*.dat')):
            os.remove(f)
    mkdir(resultsdir)
    with open(logfile, 'w') as log:
        subprocess.check_call([
            ctm_binary_path, 'est',
            join(datadir, documentfile),
            str(num_topics), 'rand', resultsdir, settingsfile
        ],
                              stdout=log,
                              stderr=subprocess.STDOUT)
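A usage sketch for the function above; the topic count and directories are illustrative:

fit_ldac_ctm(20, 'data/ctm', 'results/ctm/k20')   # fit a 20-topic CTM, logging to logfile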
Example #6
def dump_ldac_dataset(train_data, datadir):
    mkdir(datadir)

    def pairs(row):
        return ['{}:{}'.format(
                wordid,int(row[0,wordid])) for wordid in row.nonzero()[1]]

    def line(row):
        return '{} {}\n'.format(row.nnz, ' '.join(pairs(row)))

    with open(join(datadir, documentfile), 'w') as outfile:
        outfile.writelines(line(row) for row in train_data if row.nnz > 0)

    return np.array([row.nnz > 0 for row in train_data])
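To make the LDA-C line format concrete, here is a small self-contained sketch of what dump_ldac_dataset writes for one sparse count row (the data are illustrative):

import numpy as np
import scipy.sparse as sp

row = sp.csr_matrix(np.array([[0, 3, 0, 1, 2]]))   # word counts for one document
pairs = ['{}:{}'.format(wordid, int(row[0, wordid])) for wordid in row.nonzero()[1]]
print('{} {}'.format(row.nnz, ' '.join(pairs)))    # prints: 3 1:3 3:1 4:2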
Example #7
def dump_ldac_dataset(train_data, datadir):
    mkdir(datadir)

    def pairs(row):
        return [
            '{}:{}'.format(wordid, int(row[0, wordid]))
            for wordid in row.nonzero()[1]
        ]

    def line(row):
        return '{} {}\n'.format(row.nnz, ' '.join(pairs(row)))

    with open(join(datadir, documentfile), 'w') as outfile:
        outfile.writelines(line(row) for row in train_data if row.nnz > 0)

    return np.array([row.nnz > 0 for row in train_data])
Example #8
def cached(func):
    mkdir(cachedir)
    cachebase = os.path.join(cachedir, func.__module__ + func.__name__)

    def replace_arrays(v):
        if isinstance(v, np.ndarray):
            return hashlib.sha1(v).hexdigest()
        if isinstance(v, scipy.sparse.csr.csr_matrix):
            out = hashlib.sha1(v.data)
            out.update(v.indices)
            out.update(v.indptr)
            return out.hexdigest()
        return v

    @wraps(func)
    def wrapped(*args, **kwargs):
        argdict = \
            {k:replace_arrays(v) for k,v in
                inspect.getcallargs(func,*args,**kwargs).iteritems()}
        closurevals = \
            [replace_arrays(cell.cell_contents) for cell in func.__closure__ or []]

        key = str(hash(frozenset(argdict.items() + closurevals)))
        cachefile = cachebase + '.' + key

        if os.path.isfile(cachefile):
            with gzip.open(cachefile, 'r') as infile:
                value = pickle.load(infile)
            return value
        else:
            value = func(*args,**kwargs)
            with gzip.open(cachefile, 'w') as outfile:
                pickle.dump(value, outfile, protocol=-1)
            return value

    return wrapped
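A minimal usage sketch for the decorator. It assumes cachedir plus the imports the fragment relies on (os, gzip, pickle, hashlib, inspect, numpy, scipy, functools.wraps) are defined at module level; note the body as written is Python 2 code (iteritems, concatenating dict.items() with a list):

import numpy as np

@cached
def expensive_fit(data, num_iters=100):
    # stand-in for a slow computation on `data`
    return data.sum() * num_iters

X = np.random.randn(1000, 10)
r1 = expensive_fit(X)   # computed, then written to a gzip pickle under cachedir
r2 = expensive_fit(X)   # same arguments, so the cached value is loaded instead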
Example #9
def cached(func):
    mkdir(cachedir)
    cachebase = os.path.join(cachedir, func.__module__ + func.__name__)

    def replace_arrays(v):
        if isinstance(v, np.ndarray):
            return hashlib.sha1(v).hexdigest()
        if isinstance(v, scipy.sparse.csr.csr_matrix):
            out = hashlib.sha1(v.data)
            out.update(v.indices)
            out.update(v.indptr)
            return out.hexdigest()
        return v

    @wraps(func)
    def wrapped(*args, **kwargs):
        argdict = \
            {k:replace_arrays(v) for k,v in
                inspect.getcallargs(func,*args,**kwargs).iteritems()}
        closurevals = \
            [replace_arrays(cell.cell_contents) for cell in func.__closure__ or []]

        key = str(hash(frozenset(argdict.items() + closurevals)))
        cachefile = cachebase + '.' + key

        if os.path.isfile(cachefile):
            with gzip.open(cachefile, 'r') as infile:
                value = pickle.load(infile)
            return value
        else:
            value = func(*args,**kwargs)
            with gzip.open(cachefile, 'w') as outfile:
                pickle.dump(value, outfile, protocol=-1)
            return value

    return wrapped
Example #10
    "var max iter 20",
    "cg max iter -1",
    "em convergence 1e-3",
    "var convergence 1e-6",
    "cg convergence 1e-6",
    "lag 1",
    "covariance estimate mle",
]

if not has_ctm_c:
    msg = 'Please download ctm-c from {url} to {ctmdir} and build it. ' \
          '(i.e. the ctm binary should be at {ctm_binary_path})'.format(
        url=ctm_url, ctmdir=ctmdir, ctm_binary_path=ctm_binary_path)
    raise Exception('\n' + '\n'.join(wrap(msg, 82)))

mkdir(os.path.dirname(settingsfile))
with open(settingsfile, 'w') as outfile:
    outfile.write('\n'.join(settings))


def dump_ldac_dataset(train_data, datadir):
    mkdir(datadir)

    def pairs(row):
        return ['{}:{}'.format(
                wordid,int(row[0,wordid])) for wordid in row.nonzero()[1]]

    def line(row):
        return '{} {}\n'.format(row.nnz, ' '.join(pairs(row)))

    with open(join(datadir, documentfile), 'w') as outfile:
        outfile.writelines(line(row) for row in train_data if row.nnz > 0)

    return np.array([row.nnz > 0 for row in train_data])
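The fragment above (and its near-duplicate below) opens part-way through a module-level settings list and references names it does not define. The stand-ins below are assumptions made only to show what those names are for; documentfile and settingsfile are the same placeholders sketched after Example #4:

import os
from textwrap import wrap

ctm_url = 'https://example.org/ctm-c'           # placeholder download URL for ctm-c
ctmdir = 'deps/ctm-c'                           # placeholder: where ctm-c is checked out and built
ctm_binary_path = os.path.join(ctmdir, 'ctm')   # placeholder: the built binary
has_ctm_c = os.path.isfile(ctm_binary_path)     # True once the binary exists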
Example #11
    "var max iter 20",
    "cg max iter -1",
    "em convergence 1e-3",
    "var convergence 1e-6",
    "cg convergence 1e-6",
    "lag 1",
    "covariance estimate mle",
]

if not has_ctm_c:
    msg = 'Please download ctm-c from {url} to {ctmdir} and build it. ' \
          '(i.e. the ctm binary should be at {ctm_binary_path})'.format(
        url=ctm_url, ctmdir=ctmdir, ctm_binary_path=ctm_binary_path)
    raise Exception('\n' + '\n'.join(wrap(msg, 82)))

mkdir(os.path.dirname(settingsfile))
with open(settingsfile, 'w') as outfile:
    outfile.write('\n'.join(settings))


def dump_ldac_dataset(train_data, datadir):
    mkdir(datadir)

    def pairs(row):
        return [
            '{}:{}'.format(wordid, int(row[0, wordid]))
            for wordid in row.nonzero()[1]
        ]

    def line(row):
        return '{} {}\n'.format(row.nnz, ' '.join(pairs(row)))
Example #12
        print("Singular vector ", d, " Singular value, ", S[d])
        print("Right: ")
        print(top_k(5, pi_vd))
        print("Left: ")
        print(top_k(5, pi_ud))


if __name__ == "__main__":
    run = 1
    results_dir = os.path.join("results", "alice", "run%03d" % run)

    # Make sure the results directory exists
    from pgmult.utils import mkdir
    if not os.path.exists(results_dir):
        print("Making results directory: ", results_dir)
        mkdir(results_dir)

    # Load the data
    Xs, words = load()

    # N_docs = 1
    docs = slice(0,1)
    T_end = 4000
    T_split = 100

    # Keep only documents with at least T_end time steps
    Xfilt = [X for X in Xs if X.shape[0] > T_end]
    Xtrain = [X[:T_end-T_split] for X in Xfilt[docs]]
    Xtest = [X[T_end-T_split:T_end] for X in Xfilt[docs]]

    # Perform inference for a range of latent state dimensions and models
Example #13
                               legendargs={
                                   "columnspacing": 0.75,
                                   "handletextpad": 0.1
                               })
    fig.savefig(os.path.join(results_dir, "legend.pdf"))


if __name__ == "__main__":
    run = 5
    results_dir = os.path.join("results", "dna", "run%03d" % run)

    # Make sure the results directory exists
    from pgmult.utils import mkdir
    if not os.path.exists(results_dir):
        print("Making results directory: ", results_dir)
        mkdir(results_dir)

    # Load data
    Xs, key = load_data()

    # Split data into two
    T_end = Xs[0].shape[0]
    T_split = 10
    Xtrain = [Xs[0][:T_end - T_split, :]]
    Xtest = Xs[0][T_end - T_split:T_end, :]
    K = len(key)

    # Perform inference for a range of latent state dimensions and models
    N_samples = 1000
    all_results = []
    # Ds = np.array([2, 3, 4, 5, 6])