Example 1
#!/usr/bin/env python
import json

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc

import jsondata

if __name__=='__main__':
    rc('text', usetex=True)

    data = jsondata.read('table.json')

    pps = [0.1, 0.5, 0.9]

    dd = [('generate_well_separable', 'Well Separated'),
          ('generate_mostly_separable', 'Mostly Separated'), 
          ('generate_some_overlap', 'Some Overlap'), 
          ('generate_complete_overlap', 'Complete Overlap'), 
    ]

    for distributions,name in dd:
        fig, axs = plt.subplots(3, sharex=True, sharey=True)

        # Three subplots sharing both x/y axes
        for i,pp in enumerate(pps):
            rows = [r[2:] for r in data if r[0] == pp and r[1] == distributions]
            d = np.array(rows).T
            d.sort(axis=1)
            assert d.shape == (6, 4) 
Example 2
#!/usr/bin/env python
import numpy as np

import jsondata
import tlc

if __name__=='__main__':
    num_labeled = 9994

    cut_short = 100000000000  # effectively no limit; lower it to subsample for quick runs

    # use my tlc synthetically generated dataset
    documents = jsondata.read('data/documents.dc.nyt.json')[:cut_short]
    comments = jsondata.read('data/comments.dc.nyt.json')[:cut_short]
    labeled_documents = jsondata.read('data/yelp.nyt_med.json')[:num_labeled][:cut_short]
    background = jsondata.read('data/background.nyt_med.json')[:cut_short]

    y = jsondata.read('data/yelp.labels.json')[:num_labeled][:cut_short]
    y = [(i - 3.0) for i in y]  # center the 1-5 star labels around 0 (range -2..2)

    real_data = (documents, comments, labeled_documents, background, y)

    var = tlc.TLCVars(real_data, Ku=25, Ks=5, Kb=25)
    # fix the regression weights for the Ks=5 sentiment topics, spanning
    # the centered label range from +3 down to -3
    var.eta = np.array([3.0, 1.5, 0.5, -1.5, -3.0])

    try:
        output = tlc.run_tlc(var)
    except Exception, e:
        # report the failure without crashing; note the traceback is lost here
        print e
Example 3
#!/usr/bin/env python
import jsondata

num_docs = 10000
labeled_documents = jsondata.read('data/yelp.nyt_med.json')[:num_docs]
# print the index of every labeled document that contains no words
for i, l in enumerate(labeled_documents):
    if len(l) == 0:
        print i
Example 4
                second row matches by column and contains the count of that term in the doc.
    """

    def matrix(d):
        """Accepts dictionary as above. Returns 2-row matrix."""
        # todo: i think this uses way too much memory
        elements = list(itertools.chain(*d.iteritems()))
        return r.matrix(ro.IntVector(elements), nrow=2)
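
    # A sketch of what matrix() returns, for a hypothetical input:
    #   matrix({0: 2, 5: 1}) chains the dict items into [0, 2, 5, 1]; R
    #   fills matrices column-major, so row 1 holds the word ids and row 2
    #   their counts, one column per term (column order follows dict order).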

    matrices = [matrix(d) for d in features]
    return matrices
    # return r.list(matrices)


if __name__ == "__main__":
    lexicon = dict((a, i) for i, a in enumerate(jsondata.read("data/nytimes_med_common_vocab.json")))

    """
    db = None
    try:
        import pymongo
        db = pymongo.Connection('localhost', 27017).nytimes
    except:
        print 'did not connect to mongo; not running'

    docs_with_comments = list(db.article.find({'num_comments':{'$gt': 0}}).sort([('pubdate', -1)]))

    dwc = docs_with_comments

    titles = []
    docs = []
Example 5
def read_yelp_reviews():
    """Returns generator of dicts of reviews from yelp datset."""
    for d in jsondata.read("data/yelp_academic_dataset.json"):
        if d["type"] == "review":
            yield d
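
# A minimal usage sketch; 'stars' and 'text' are assumed review fields,
# not shown in this file:
#   for review in read_yelp_reviews():
#       print review['stars'], len(review.get('text', ''))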
Example 6
    Copyright (C) 2011 Joseph Perla

    GNU Affero General Public License. See <http://www.gnu.org/licenses/>.
"""
import glob

import numpy as np

import jsondata
import ppc


if __name__=='__main__':
    s = 'midterm/mytlc-output-15-%s'

    eta = jsondata.read(glob.glob(s % 'eta*')[0])
    sigma_squared = jsondata.read(glob.glob(s % 'sigma_squared*')[0])[0]
    beta = jsondata.read(glob.glob(s % 'beta*')[0])
    phi = jsondata.read(glob.glob(s % 'phiC*')[0])
    # cut beta down to what we need: only the first Kc topic rows that phi uses
    Nd, Kc = phi[0].shape
    beta = beta[:Kc, :]

    print 'finished reading in params...'
    global_params = {'eta': eta, 'beta': beta, 'sigma_squared': sigma_squared}
    local_params = [{'phi': p} for p in phi]

    comments = jsondata.read('data/comments.dc.nyt.json')
    print 'finished reading in docs...'
    p = ppc.YelpSentimentTLCPPC()
Example 7
    Looks inside JSON data and prints the first few words of each document.
    Useful for making sure I have the data I want.

    Copyright (C) 2011 Joseph Perla

    GNU Affero General Public License. See <http://www.gnu.org/licenses/>.
"""

import sys
import jsondata

if __name__=='__main__':
    data_filename = sys.argv[1]
    vocab_filename = sys.argv[2]
    num_docs = int(sys.argv[3])
    words_per_doc = int(sys.argv[4])
    associated_filename = sys.argv[5] if len(sys.argv) > 5 else None

    data = jsondata.read(data_filename)[:num_docs]
    lexicon = jsondata.read(vocab_filename)
    # for each doc, map the words_per_doc most frequent term ids to words,
    # sorted by descending count
    words = [[lexicon[w] for (w, c) in sorted(doc, key=lambda t: -t[1])][:words_per_doc]
             for doc in data]

    if associated_filename is not None:
        associated = jsondata.read(associated_filename)[:num_docs]

    for i in xrange(num_docs):
        if associated_filename is not None:
            print associated[i], words[i]
        else:
            print words[i]
Example 8
#!/usr/bin/env python

import jsondata
from inspect_slda_model import predict


phi_filename = '../balancedtlc/mytlc-output-20-phiC.dat.npy.list.npz'
vocab_filename = ''
eta_filename = '../balancedtlc/mytlc-output-20-eta.dat.npy.gz'

titles_filename = '../data/titles.dc.nyt.json'

phi = jsondata.read(phi_filename)
eta = jsondata.read(eta_filename)
titles = jsondata.read(titles_filename)

print 'read in data...'
# pair each document index with its prediction; tuples sort on the index,
# so document order is preserved for the gold.csv join below
predicted_ratings = list(sorted((i, predict(eta, p)) for i, p in enumerate(phi)))
print 'predicted ratings...'

import csv
# gold.csv rows carry the document index in column 1 and a mean rating in column 2
reader = csv.reader(open('gold.csv', 'r'))

#v = [i[0] for i in predicted_ratings]
#import pdb; pdb.set_trace()

rall = []

for line in reader:
    index = int(line[1])
    mean = float(line[2])
Example 9
    Code I used to generate histograms of some data and visualize them.
    Copyright (C) 2011 Joseph Perla

    GNU Affero General Public License. See <http://www.gnu.org/licenses/>.
"""
import pylab

import vlex
import jsondata

# pick a dataset; only the last assignment takes effect
f = 'data/yelp_4cat_naive_full_mytoken_74.json'
f = 'data/yelp_4cat_naive_full_standardtoken.json'
f = 'data/yelp_2cat_naive_full_mytoken_783.json'

data = list(jsondata.read(f))
words = vlex.parse_bayes_into_scores(data)

values = [w[1] for w in words]

#remove the modes, +/-.75
#values = [v for v in values if abs(v) != .75]
values = [v for v in values if abs(v) < 30 and abs(v) != 3]  # drop large outliers and values at exactly +/-3


pylab.hist(values, bins=50)

pylab.show()
Example 10
    GNU Affero General Public License. See <http://www.gnu.org/licenses/>.
"""
import glob

import numpy as np

import jsondata
import ppc


if __name__=='__main__':
    s = 'midterm/mytlc-output-20-%s'

    name = 'tlc-pslda'

    eta = jsondata.read(glob.glob(s % 'eta*')[0])
    sigma_squared = jsondata.read(glob.glob(s % 'sigma_squared*')[0])[0]
    beta = jsondata.read(glob.glob(s % 'beta*')[0])
    phi = jsondata.read(glob.glob(s % 'phiC*')[0])
    # cut beta down to what we need: only the first Kc topic rows that phi uses
    Nd, Kc = phi[0].shape
    beta = beta[:Kc, :]

    # in comment documents, the last Ks columns of phi are the sentiment topics
    Ks = len(eta)
    phi = [p[:, -Ks:] for p in phi]

    print 'finished reading in params...'
    global_params = {'eta': eta, 'beta': beta, 'sigma_squared': sigma_squared}
    local_params = [{'phi': p} for p in phi]
Example 11
def predict(eta, phi):
    """Returns the predicted response for a document: dot(eta, E[Z])."""
    Ks = len(eta)
    N, K = phi.shape
    # keep the last Ks (sentiment) topics and average the per-word topic
    # posteriors over the N words to get the expected proportions E[Z]
    phi = phi[:, -Ks:]
    EZ = np.sum(phi, axis=0) / N
    return np.dot(eta, EZ)
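
# Quick sanity check of the formula with a hypothetical input: if every
# row of the sliced phi is uniform over the Ks topics, EZ = [1/Ks] * Ks
# and the prediction reduces to mean(eta).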


if __name__=='__main__':
    phi_filename = sys.argv[1]
    vocab_filename = sys.argv[2]
    num_docs = int(sys.argv[3])
    eta_filename = sys.argv[4]

    associated_filename = sys.argv[5] if len(sys.argv) > 5 else None

    phi = jsondata.read(phi_filename)
    lexicon = jsondata.read(vocab_filename)

    eta = None
    if eta_filename is not None:
        eta = jsondata.read(eta_filename)

    print 'eta: %s' % eta

    if associated_filename is not None:
        associated = jsondata.read(associated_filename)

    print 'read in data...'
    predicted_ratings = list(sorted((predict(eta, p),i) for i,p in enumerate(phi)))
    print 'predicted ratings...'
Example 12
        raise NotImplementedError

    def discrepancy(self, posterior, observed):
        """Accepts posterior, a dictionary of phi, beta, eta, and sigma squared.
            Observed is a sparse word vector: a list of (word id, count) 2-tuples.

            Returns a real number: the absolute difference of the posterior
            norm and the observed norm, divided by sigma.
        """
        #TODO: jperla: maybe can generalize, sigma is a def standardizer() ?
        s = np.sqrt(posterior['sigma_squared'])
        return abs(self.posterior_norm(posterior) - self.observed_norm(observed)) / s
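
    # Worked example with made-up numbers: posterior_norm = 1.2,
    # observed_norm = 0.4, sigma_squared = 0.16 gives s = 0.4 and a
    # discrepancy of |1.2 - 0.4| / 0.4 = 2.0.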


vocab = dict((w,i) for i,w in enumerate(jsondata.read('../data/nytimes_med_common_vocab.json')))
pos = jsondata.read('../data/liu_pos_words.json')
neg = jsondata.read('../data/liu_neg_words.json')

posi = set([vocab[w] for w in pos if w in vocab])
negi = set([vocab[w] for w in neg if w in vocab])

class YelpSentimentPartialSLDAPPC(TLCPPC):
    def simulate(self, posterior, observed):
        """Accepts posterior vars, which include phi and eta,
            and an observed value, which is just a real number.
            Returns a new observation.

            The observation is drawn from a normal at the expected mean,
            as in regression.
        """
        s = np.sqrt(posterior['sigma_squared'])
Example 13
    
    Copyright (C) 2011 Joseph Perla

    GNU Affero General Public License. See <http://www.gnu.org/licenses/>.
"""
import ppc

import numpy as np

import jsondata


if __name__=='__main__':
    s = 'medsldamodel/med-slda.final-%s.dat'

    eta = np.array(jsondata.read(s % 'eta'))
    beta = np.array(jsondata.read(s % 'beta'))
    phi = [np.array(p) for p in jsondata.read(s % 'phi')]
    sigma_squared = jsondata.read(s % 'sigma_squared')[0]

    print 'finished reading in params...'
    global_params = {'eta': eta, 'beta': beta, 'sigma_squared': sigma_squared}
    local_params = [{'phi': p} for p in phi]

    # get the data
    num_docs = 1000
    #labeled_documents = jsondata.read('data/yelp.nyt_med.json')[:num_docs]
    y = jsondata.read('data/yelp.labels.json')[:num_docs]

    # filter out documents with no words
    #all_data = [(l,y) for l,y in izip(labeled_documents,y) if len(l) > 0]
Example 14
def describe_doc(data_filename, vocab_filename, docid):
    # data line: first token is a count, remaining tokens are word_id:count pairs
    counts = open(data_filename).readlines()[docid].split(' ')[1:]
    wordids = [int(c.split(':')[0]) for c in counts]
    vocab = list(jsondata.read(vocab_filename))
    words = [vocab[i] for i in wordids]
    return sorted(words)
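
# Usage sketch (file names are hypothetical): the words of document 7,
# alphabetically sorted:
#   print describe_doc('data/docs.dat', 'data/vocab.json', 7)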
Example 15
def grab_topic(beta_filename, vocab_filename, topicid):
    # row topicid of the beta file holds one weight per vocabulary term
    counts = [float(f) for f in open(beta_filename).readlines()[topicid].split(' ')]
    minimum = min(counts)  # threshold to ignore the very least relevant words
    vocab = list(jsondata.read(vocab_filename))
    words = [(vocab[i], p) for i, p in enumerate(counts) if p > minimum]
    return sorted(words, key=lambda v: -v[1])
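
# Usage sketch (file names are hypothetical): the ten most heavily
# weighted words in topic 3, best first:
#   for word, weight in grab_topic('model/final.beta', 'data/vocab.json', 3)[:10]:
#       print word, weight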