def main():

    parser = argparse.ArgumentParser()

    parser.add_argument('--total-jobs', metavar='<total-jobs>', help='total number of jobs downloading documents', type=int)
    parser.add_argument('--job', metavar='<job>', help='job number between 1 and <total-jobs>', type=int)

    args = parser.parse_args()
    check_args(parser, args)

    br = Browser()
    br.set_handle_robots(False)
#    br.set_debug_responses(True)

    data = urlencode({'user': USERNAME, 'pass': getpass()})

    document_urls = [LOGIN_PREFIX + url.strip() + '&view=etext' for url in file(DOCUMENT_URLS_FILE)]

    start = args.job - 1
    step = args.total_jobs

    for url in iterview(document_urls[start::step]):
        try:
            get_document_pages(br, url, data)
        except Exception as e:
            print >> sys.stderr, '\n', (url, e)
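A minimal sketch of how the document_urls[start::step] slice shards the work across jobs: with --total-jobs 3, the three 1-based job numbers together cover every URL exactly once.

urls = ['u0', 'u1', 'u2', 'u3', 'u4', 'u5', 'u6']

for job in (1, 2, 3):
    print job, urls[job - 1::3]

# 1 ['u0', 'u3', 'u6']
# 2 ['u1', 'u4']
# 3 ['u2', 'u5']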
Example 2
def check(func, dist):
    """
    Arguments:

    func -- function to check
    dist -- (unnormalized) distribution to pass to func
    """

    num_samples = 100000

    empirical = zeros(len(dist))

    for n in iterview(xrange(num_samples)):
        empirical[func(dist)] += 1

    empirical /= num_samples
    normalized_dist = dist / float(dist.sum())

    # could look at max relative error
    # could also look at JS or KL divergence
    # could do absolute difference
    # ...

    error = (abs(empirical - normalized_dist) / normalized_dist).mean()

    assert error < 0.01, 'Mean relative error >= 1%'
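A minimal usage sketch for check. The inverse-CDF sampler below is only a stand-in for whichever sampler is being tested; numpy supplies array and uniform.

from numpy import array
from numpy.random import uniform

def sample(dist):
    # inverse-CDF draw from an unnormalized distribution
    cdf = dist.cumsum()
    return cdf.searchsorted(uniform(0, cdf[-1]))

check(sample, array([1.0, 2.0, 3.0, 4.0]))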
Example 3
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument('--total-jobs',
                        metavar='<total-jobs>',
                        help='total number of jobs downloading documents',
                        type=int)
    parser.add_argument('--job',
                        metavar='<job>',
                        help='job number between 1 and <total-jobs>',
                        type=int)

    args = parser.parse_args()
    check_args(parser, args)

    br = Browser()
    br.set_handle_robots(False)
    #    br.set_debug_responses(True)

    data = urlencode({'user': USERNAME, 'pass': getpass()})

    document_urls = [
        LOGIN_PREFIX + url.strip() + '&view=etext'
        for url in file(DOCUMENT_URLS_FILE)
    ]

    start = args.job - 1
    step = args.total_jobs

    for url in iterview(document_urls[start::step]):
        try:
            get_document_pages(br, url, data)
        except Exception as e:
            print >> sys.stderr, '\n', (url, e)
Example 4
    def log_predictive_prob(self, new_corpus, num_samples):

        D, V, T = self.D, self.V, self.T

        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta

        Dt_plus_alpha_m = self.Dt_plus_alpha_m
        D_plus_alpha = self.D_plus_alpha

        Nvt_new, Nt_new, Dt_new, z_new = [], [], [], []

        for r in xrange(num_samples):

            Nvt_new.append(zeros((T, V), dtype=int))
            Nt_new.append(zeros(T, dtype=int))

            Dt_new.append(zeros(T, dtype=int))

            z_new.append(zeros(len(new_corpus), dtype=int))

        log_p = 0

        for d, doc in enumerate(iterview(new_corpus)):

            tmp = zeros(num_samples, dtype=float)

            for r in xrange(num_samples):
                for prev_d in xrange(0, d):

                    prev_doc = new_corpus.documents[prev_d]
                    t = z_new[r][prev_d]

                    Nvt_new[r][t, :] -= prev_doc.Nv
                    Nt_new[r][t] -= len(prev_doc)
                    Dt_new[r][t] -= 1

                    t = log_sample(
                        gammaln(Nt_new[r] + Nt_plus_beta)
                        - gammaln(Nvt_new[r] + Nvt_plus_beta_n).sum(axis=1)
                        + gammaln(tile(prev_doc.Nv, (T, 1)) + Nvt_new[r] +
                                  Nvt_plus_beta_n).sum(axis=1)
                        - gammaln(len(prev_doc) * ones(T) + Nt_new[r] +
                                  Nt_plus_beta)
                        + log(Dt_new[r] + Dt_plus_alpha_m))

                    Nvt_new[r][t, :] += prev_doc.Nv
                    Nt_new[r][t] += len(prev_doc)
                    Dt_new[r][t] += 1

                    z_new[r][prev_d] = t

                log_dist = (
                    gammaln(Nt_new[r] + Nt_plus_beta)
                    - gammaln(Nvt_new[r] + Nvt_plus_beta_n).sum(axis=1)
                    + gammaln(tile(doc.Nv, (T, 1)) + Nvt_new[r] +
                              Nvt_plus_beta_n).sum(axis=1)
                    - gammaln(len(doc) * ones(T) + Nt_new[r] + Nt_plus_beta)
                    + log(Dt_new[r] + Dt_plus_alpha_m)
                    - log(d + D_plus_alpha))

                tmp[r] = log_sum_exp(log_dist)

                t = log_sample(log_dist)

                Nvt_new[r][t, :] += doc.Nv
                Nt_new[r][t] += len(doc)
                Dt_new[r][t] += 1

                z_new[r][d] = t

            log_p += log_sum_exp(tmp) - log(num_samples)

        return log_p
Example 5
def time_taken(func, corpus, alpha, m, beta, n, num_reps):

    avg = 0

    for rep in iterview(xrange(num_reps), inc=1):

        start = time.time()
        func(corpus, alpha, m, beta, n)
        avg += (time.time() - start)

    avg /= float(num_reps)

    return avg
Example 6
def time_taken(func, corpus, alpha, m, beta, n, num_reps):

    avg = 0

    for rep in iterview(xrange(num_reps), inc=1):

        start = time.time()
        func(corpus, alpha, m, beta, n)
        avg += (time.time() - start)

    avg /= float(num_reps)

    return avg
Example 7
    def gibbs_iteration(self, init=False):
        """
        Uses Gibbs sampling to draw a single sample from the posterior
        distribution over document--component assignments (i.e.,
        document groups) given this instance's corpus (i.e., document
        tokens). By default (i.e., if keyword argument 'init' is set
        to the value 'False') all document--component assignments (and
        corresponding counts) are assumed to have been initialized
        previously; otherwise, they are initialized.

        Keyword arguments:

        init -- whether to initialize document--component assignments
        """

        corpus = self.corpus

        T = self.T

        alpha_m = self.alpha_m

        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta

        Dt = self.Dt

        z = self.z

        for d, (doc, t) in enumerate(iterview(zip(corpus, z))):

            if not init:
                Nvt_plus_beta_n[t, :] -= doc.Nv
                Nt_plus_beta[t] -= len(doc)
                Dt[t] -= 1

            t = log_sample(
                gammaln(Nt_plus_beta)
                - gammaln(Nvt_plus_beta_n).sum(axis=1)
                + gammaln(tile(doc.Nv, (T, 1)) + Nvt_plus_beta_n).sum(axis=1)
                - gammaln(len(doc) * ones(T) + Nt_plus_beta)
                + log(Dt + alpha_m)
            )

            Nvt_plus_beta_n[t, :] += doc.Nv
            Nt_plus_beta[t] += len(doc)
            Dt[t] += 1

            z[d] = t
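A hypothetical driver loop for the sampler above; model and num_iterations are placeholders for however the surrounding code constructs and runs the mixture model.

model.gibbs_iteration(init=True)

for itn in xrange(1, num_iterations):
    model.gibbs_iteration()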
Example 8
def main():

    br = Browser()
    br.set_handle_robots(False)
#    br.set_debug_responses(True)

    data = urlencode({'user': USERNAME, 'pass': getpass()})

    classifications = get_metadata_options(br, 'ca', data)
    sources = get_metadata_options(br, 'is', data)

    makedir(METADATA_DIR)
    makedir(TEXT_DIR)

    for filename in iterview(glob(DOCUMENT_PAGES_DIR + '/*'), inc=1000):
        extract_data(filename, classifications, sources)
Example 9
def main():

    br = Browser()
    br.set_handle_robots(False)
    #    br.set_debug_responses(True)

    data = urlencode({'user': USERNAME, 'pass': getpass()})

    classifications = get_metadata_options(br, 'ca', data)
    sources = get_metadata_options(br, 'is', data)

    makedir(METADATA_DIR)
    makedir(TEXT_DIR)

    for filename in iterview(glob(DOCUMENT_PAGES_DIR + '/*'), inc=1000):
        extract_data(filename, classifications, sources)
Example 10
    def gibbs_iteration(self, init=False):
        """
        Uses Gibbs sampling to draw a single sample from the posterior
        distribution over document--component assignments (i.e.,
        document groups) given this instance's corpus (i.e., document
        tokens). By default (i.e., if keyword argument 'init' is set
        to the value 'False') all document--component assignments (and
        corresponding counts) are assumed to have been initialized
        previously; otherwise, they are initialized.

        Keyword arguments:

        init -- whether to initialize document--component assignments
        """

        corpus = self.corpus

        T = self.T

        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta

        Dt_plus_alpha_m = self.Dt_plus_alpha_m

        z = self.z

        for d, (doc, t) in enumerate(iterview(zip(corpus, z))):

            if not init:
                Nvt_plus_beta_n[t, :] -= doc.Nv
                Nt_plus_beta[t] -= len(doc)
                Dt_plus_alpha_m[t] -= 1

            t = log_sample(
                gammaln(Nt_plus_beta) - gammaln(Nvt_plus_beta_n).sum(axis=1) +
                gammaln(tile(doc.Nv, (T, 1)) + Nvt_plus_beta_n).sum(axis=1) -
                gammaln(len(doc) * ones(T) + Nt_plus_beta) +
                log(Dt_plus_alpha_m))

            Nvt_plus_beta_n[t, :] += doc.Nv
            Nt_plus_beta[t] += len(doc)
            Dt_plus_alpha_m[t] += 1

            z[d] = t
Example 11
def list_all_duplicates(directory):
    target = Path(directory)

    print('[*] finding all files...')
    filenames = target.glob('**/*')

    files = [f for f in filenames]
    files = sorted(files, key=lambda x: x.stat().st_size)
    files = list(filter(lambda x: x.stat().st_size > MIN_SIZE, files))

    hashes = {}
    first = None
    cur_size = -1
    for file in iterview(files):

        if file.is_dir():
            continue

        size = file.stat().st_size
        if cur_size == size:

            # when a second file of the same size appears, hash and record the first one
            if first is not None:
                with first.open('rb') as f:
                    hashval = md5(f.read()).hexdigest()
                    hashes[hashval] = [first]
                    first = None

            with file.open('rb') as f:
                hashval = md5(f.read()).hexdigest()
                if hashes.get(hashval):
                    hashes[hashval].append(file)
                else:
                    hashes[hashval] = [file]
        else:
            first = file
            cur_size = size

    dup_files_list = []
    for paths in hashes.values():
        if len(paths) > 1:
            dup_files_list.append(paths)

    return dup_files_list
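A minimal driver sketch for list_all_duplicates; the directory path is a placeholder.

for paths in list_all_duplicates('/path/to/photos'):
    print '%d identical files:' % len(paths)
    for path in paths:
        print '   ', path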
Example 12
    def gibbs_iteration(self, init=False):
        """
        Uses Gibbs sampling to draw a single sample from the posterior
        distribution over token--component (i.e., token--topic)
        assignments given this instance's corpus (i.e., document
        tokens). By default (i.e., if keyword argument 'init' is set
        to the value 'False') all token--component assignments (and
        corresponding counts) are assumed to have been initialized
        previously; otherwise, they are initialized.

        Keyword arguments:

        init -- whether to initialize token--component assignments
        """

        corpus = self.corpus

        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta
        Ntd_plus_alpha_m = self.Ntd_plus_alpha_m
        Nd_plus_alpha = self.Nd_plus_alpha

        z = self.z

        for d, (doc, zd) in enumerate(iterview(zip(corpus, z), inc=200)):
            for n, (v, t) in enumerate(zip(doc.w, zd)):

                if not init:
                    Nvt_plus_beta_n[v, t] -= 1
                    Nt_plus_beta[t] -= 1
                    Ntd_plus_alpha_m[d, t] -= 1

                t = sample((Nvt_plus_beta_n[v, :] / Nt_plus_beta) *
                           Ntd_plus_alpha_m[d, :])

                Nvt_plus_beta_n[v, t] += 1
                Nt_plus_beta[t] += 1
                Ntd_plus_alpha_m[d, t] += 1

                if init:
                    Nd_plus_alpha[d] += 1

                zd[n] = t
Example 13
def get_listing_pages(br):
    """
    Caches the contents of each URL in the file whose name is stored
    in the variable LISTING_URLS_FILE to the directory whose name is
    stored in the variable LISTING_PAGES_DIR. The contents of each URL
    will be stored in a file whose name is that URL's md5 hash.

    Arguments:

    br -- Browser object
    """

    listing_urls = [url.strip() for url in file(LISTING_URLS_FILE)]

    for url in iterview(listing_urls):
        try:
            download_url(br, url, LISTING_PAGES_DIR)
        except Exception as e:
            print >> sys.stderr, '\n', (url, e)
Example 14
def group_hashes(hashes):
    debug('Start grouping hashes...')

    g = groups.Groups(hashes.keys())
    for k1, v1 in iterview(hashes.items()):
        img_id = g.find(str(k1))
        for k2, v2 in hashes.items():
            if k1 == k2:
                continue
            if v1 - v2 <= HASH_THRESHOLD:
                g.unite(img_id, str(k2))

    # groups with only one element are filtered
    group_result = g.get()
    cnt = Counter(group_result.values())
    filtered = [item for item in group_result.items() if cnt[item[1]] > 1]

    return filtered
Example 15
def get_listing_pages(br):
    """
    Caches the contents of each URL in the file whose name is stored
    in the variable LISTING_URLS_FILE to the directory whose name is
    stored in the variable LISTING_PAGES_DIR. The contents of each URL
    will be stored in a file whose name is that URL's md5 hash.

    Arguments:

    br -- Browser object
    """

    listing_urls = [url.strip() for url in file(LISTING_URLS_FILE)]

    for url in iterview(listing_urls):
        try:
            download_url(br, url, LISTING_PAGES_DIR)
        except Exception as e:
            print >> sys.stderr, '\n', (url, e)
Example 16
    def gibbs_iteration(self, init=False):
        """
        Uses Gibbs sampling to draw a single sample from the posterior
        distribution over token--component (i.e., token--topic)
        assignments given this instance's corpus (i.e., document
        tokens). By default (i.e., if keyword argument 'init' is set
        to the value 'False') all token--component assignments (and
        corresponding counts) are assumed to have been initialized
        previously; otherwise, they are initialized.

        Keyword arguments:

        init -- whether to initialize token--component assignments
        """

        corpus = self.corpus

        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta
        Ntd_plus_alpha_m = self.Ntd_plus_alpha_m
        Nd_plus_alpha = self.Nd_plus_alpha

        z = self.z

        for d, (doc, zd) in enumerate(iterview(zip(corpus, z), inc=200)):
            for n, (v, t) in enumerate(zip(doc.w, zd)):

                if not init:
                    Nvt_plus_beta_n[v, t] -= 1
                    Nt_plus_beta[t] -= 1
                    Ntd_plus_alpha_m[d, t] -= 1

                pass # YOUR CODE GOES HERE

                Nvt_plus_beta_n[v, t] += 1
                Nt_plus_beta[t] += 1
                Ntd_plus_alpha_m[d, t] += 1

                if init:
                    Nd_plus_alpha[d] += 1

                zd[n] = t
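The pass line above is an exercise stub; Example 12 shows the completed update, which draws the new topic with

                t = sample((Nvt_plus_beta_n[v, :] / Nt_plus_beta) *
                           Ntd_plus_alpha_m[d, :])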
Example 17
    def gibbs_iteration(self, init=False):
        """
        Uses Gibbs sampling to draw a single sample from the posterior
        distribution over document--component assignments (i.e.,
        document groups) given this instance's corpus (i.e., document
        tokens). By default (i.e., if keyword argument 'init' is set
        to the value 'False') all document--component assignments (and
        corresponding counts) are assumed to have been initialized
        previously; otherwise, they are initialized.

        Keyword arguments:

        init -- whether to initialize document--component assignments
        """

        corpus = self.corpus

        T = self.T

        alpha_m = self.alpha_m

        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta

        Dt = self.Dt

        z = self.z

        for d, (doc, t) in enumerate(iterview(zip(corpus, z))):

            if not init:
                Nvt_plus_beta_n[t, :] -= doc.Nv
                Nt_plus_beta[t] -= len(doc)
                Dt[t] -= 1

            pass  # YOUR CODE GOES HERE

            Nvt_plus_beta_n[t, :] += doc.Nv
            Nt_plus_beta[t] += len(doc)
            Dt[t] += 1

            z[d] = t
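The pass line above is an exercise stub; Example 7 shows the completed update, which draws the new component with

            t = log_sample(
                gammaln(Nt_plus_beta)
                - gammaln(Nvt_plus_beta_n).sum(axis=1)
                + gammaln(tile(doc.Nv, (T, 1)) + Nvt_plus_beta_n).sum(axis=1)
                - gammaln(len(doc) * ones(T) + Nt_plus_beta)
                + log(Dt + alpha_m)
            )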
Example 18
def get_listing_data():

    with safe_write(CSV_FILE) as f:
        for filename in iterview(glob(LISTING_PAGES_DIR + '/*')):

            contents = file(filename).read()
#            print contents

            try:
                [obj] = re.findall(r'dataLayer\s*=\s*\[(.*)\];', contents)
                obj = loads(obj)
            except ValueError:
                return

            if 'listPrice' in obj and 'listBed' in obj:
                text = '\t'.join((os.path.basename(filename),
                                  str(obj['listPrice']), str(obj['listBed'])))
                f.write(text)
                f.write('\n')
                f.flush()
Example 19
def get_listing_data():

    with safe_write(CSV_FILE) as f:
        for filename in iterview(glob(LISTING_PAGES_DIR + '/*')):

            contents = file(filename).read()
            #            print contents

            try:
                [obj] = re.findall(r'dataLayer\s*=\s*\[(.*)\];', contents)
                obj = loads(obj)
            except ValueError:
                return

            if 'listPrice' in obj and 'listBed' in obj:
                text = '\t'.join((os.path.basename(filename),
                                  str(obj['listPrice']), str(obj['listBed'])))
                f.write(text)
                f.write('\n')
                f.flush()
Example 20
def create_csv(numbers, max_document_length, min_type_count, stopwords):

    vocab = defaultdict(int)

    data = {}

    for filename in iterview(glob(METADATA_DIR + '/*')):

        number = os.path.basename(filename)

        if numbers is not None and number not in numbers:
            continue

        with file(filename) as f:
            metadata = f.read().strip()

        fields = metadata.split('\t')

        assert len(fields) == 8
        assert fields[0] == number

        text = ''

        for page in xrange(1, int(fields[-1]) + 1):
            with file(os.path.join(TEXT_DIR, number + '_' + str(page))) as f:
                text += f.read().strip()

        text = re.findall('[a-z]+', text)
        text = [x for x in text if x not in stopwords]

        for x in text:
            vocab[x] += 1

        data[number] = (metadata, ' '.join(text))

    for number, (metadata, text) in data.items():

        text = [x for x in text.split(' ') if vocab[x] >= min_type_count]
        text = text[:min(len(text), max_document_length)]

        print '\t'.join([metadata, ' '.join(text)])
Example 21
def create_csv(numbers, max_document_length, min_type_count, stopwords):

    vocab = defaultdict(int)

    data = {}

    for filename in iterview(glob(METADATA_DIR + "/*")):

        number = os.path.basename(filename)

        if numbers is not None and number not in numbers:
            continue

        with file(filename) as f:
            metadata = f.read().strip()

        fields = metadata.split("\t")

        assert len(fields) == 8
        assert fields[0] == number

        text = ""

        for page in xrange(1, int(fields[-1]) + 1):
            with file(os.path.join(TEXT_DIR, number + "_" + str(page))) as f:
                text += f.read().strip()

        text = re.findall("[a-z]+", text)
        text = [x for x in text if x not in stopwords]

        for x in text:
            vocab[x] += 1

        data[number] = (metadata, " ".join(text))

    for number, (metadata, text) in data.items():

        text = [x for x in text.split(" ") if vocab[x] >= min_type_count]
        text = text[: min(len(text), max_document_length)]

        print "\t".join([metadata, " ".join(text)])
Example 22
def time_taken(func, dists, num_reps, num_samples=1):

    seed(1000)

    mean = 0

    for rep in iterview(xrange(num_reps)):

        start = time.time()

        for dist in dists:
            if num_samples == 1:
                func(dist)
            else:
                func(dist, num_samples)

        mean += (time.time() - start) / float(len(dists))

    mean /= float(num_reps)

    return mean
Example 23
    def gibbs_iteration(self, init=False):
        """
        Uses Gibbs sampling to draw a single sample from the posterior
        distribution over token--topic assignments.

        Keyword arguments:

        init -- whether to initialize token--topic assignments
        """

        corpus = self.corpus

        Ntd_plus_alpha_m = self.Ntd_plus_alpha_m
        Nd_plus_alpha = self.Nd_plus_alpha
        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta

        z = self.z

        for d, (doc, zd) in enumerate(iterview(zip(corpus, z), inc=200)):
            for n, (v, t) in enumerate(zip(doc.tokens, zd)):

                if not init:
                    Ntd_plus_alpha_m[d, t] -= 1
                    Nvt_plus_beta_n[v, t] -= 1
                    Nt_plus_beta[t] -= 1
                else:
                    Nd_plus_alpha[d] += 1

                t = sample((Nvt_plus_beta_n[v, :] / Nt_plus_beta)
                           * Ntd_plus_alpha_m[d, :])

                Ntd_plus_alpha_m[d, t] += 1
                Nvt_plus_beta_n[v, t] += 1
                Nt_plus_beta[t] += 1

                zd[n] = t
Example 24
    def gibbs_iteration(self, init=False):
        """
        Uses Gibbs sampling to draw a single sample from the posterior
        distribution over token--topic assignments.

        Keyword arguments:

        init -- whether to initialize token--topic assignments
        """

        corpus = self.corpus

        Ntd_plus_alpha_m = self.Ntd_plus_alpha_m
        Nd_plus_alpha = self.Nd_plus_alpha
        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta

        z = self.z

        for d, (doc, zd) in enumerate(iterview(zip(corpus, z), inc=200)):
            for n, (v, t) in enumerate(zip(doc.tokens, zd)):

                if not init:
                    Ntd_plus_alpha_m[d, t] -= 1
                    Nvt_plus_beta_n[v, t] -= 1
                    Nt_plus_beta[t] -= 1
                else:
                    Nd_plus_alpha[d] += 1

                t = sample((Nvt_plus_beta_n[v, :] / Nt_plus_beta) *
                           Ntd_plus_alpha_m[d, :])

                Ntd_plus_alpha_m[d, t] += 1
                Nvt_plus_beta_n[v, t] += 1
                Nt_plus_beta[t] += 1

                zd[n] = t
Example 25
    def log_predictive_prob(self, new_corpus, num_samples):

        D, V, T = self.D, self.V, self.T

        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta

        Dt_plus_alpha_m = self.Dt_plus_alpha_m
        D_plus_alpha = self.D_plus_alpha

        Nvt_new, Nt_new, Dt_new, z_new = [], [], [], []

        for r in xrange(num_samples):

            Nvt_new.append(zeros((T, V), dtype=int))
            Nt_new.append(zeros(T, dtype=int))

            Dt_new.append(zeros(T, dtype=int))

            z_new.append(zeros(len(new_corpus), dtype=int))

        log_p = 0

        for d, doc in enumerate(iterview(new_corpus)):

            tmp = zeros(num_samples, dtype=float)

            for r in xrange(num_samples):
                for prev_d in xrange(0, d):

                    prev_doc = new_corpus.documents[prev_d]
                    t = z_new[r][prev_d]

                    Nvt_new[r][t, :] -= prev_doc.Nv
                    Nt_new[r][t] -= len(prev_doc)
                    Dt_new[r][t] -= 1

                    t = log_sample(
                        gammaln(Nt_new[r] + Nt_plus_beta) -
                        gammaln(Nvt_new[r] + Nvt_plus_beta_n).sum(axis=1) +
                        gammaln(
                            tile(prev_doc.Nv, (T, 1)) + Nvt_new[r] +
                            Nvt_plus_beta_n).sum(axis=1) - gammaln(
                                len(prev_doc) * ones(T) + Nt_new[r] +
                                Nt_plus_beta) +
                        log(Dt_new[r] + Dt_plus_alpha_m))

                    Nvt_new[r][t, :] += prev_doc.Nv
                    Nt_new[r][t] += len(prev_doc)
                    Dt_new[r][t] += 1

                    z_new[r][prev_d] = t

                pass  # YOUR CODE GOES HERE

                Nvt_new[r][t, :] += doc.Nv
                Nt_new[r][t] += len(doc)
                Dt_new[r][t] += 1

                z_new[r][d] = t

            log_p += log_sum_exp(tmp) - log(num_samples)

        return log_p
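The pass line above is an exercise stub; Example 4 fills it in by scoring, weighting, and sampling the held-out document's component:

                log_dist = (
                    gammaln(Nt_new[r] + Nt_plus_beta)
                    - gammaln(Nvt_new[r] + Nvt_plus_beta_n).sum(axis=1)
                    + gammaln(tile(doc.Nv, (T, 1)) + Nvt_new[r] +
                              Nvt_plus_beta_n).sum(axis=1)
                    - gammaln(len(doc) * ones(T) + Nt_new[r] + Nt_plus_beta)
                    + log(Dt_new[r] + Dt_plus_alpha_m)
                    - log(d + D_plus_alpha))

                tmp[r] = log_sum_exp(log_dist)

                t = log_sample(log_dist)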
Example 26
from glob import glob
import os

from iterview import iterview

for filename in iterview(glob('data/cache/html/*/*'), inc=1000):
    try:
        with file(filename) as f:
            contents = f.read()
            assert contents
            assert '<title>Off-Campus' not in contents
    except AssertionError:
        print 'Removing ', filename
        os.remove(filename)
Example 27
    def log_predictive_prob(self, new_corpus, num_samples):
        """
        Returns an approximation of the log probability of the
        specified new corpus given this instance's corpus (i.e.,
        document tokens) AND current set of token--component (i.e.,
        token--topic) assignments according to LDA.

        Arguments:

        new_corpus -- new corpus of documents
        num_samples -- ...
        """

        V, T = self.V, self.T

        D_new = len(new_corpus)

        alpha, alpha_m = self.alpha, self.alpha_m

        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta

        Nvt_new, Nt_new, Ntd_new, z_new = [], [], [], []

        for r in xrange(num_samples):

            Nvt_new.append(zeros((V, T), dtype=int))
            Nt_new.append(zeros(T, dtype=int))
            Ntd_new.append(zeros((D_new, T), dtype=int))

            z_r = []

            for doc in new_corpus:
                z_r.append(zeros(len(doc), dtype=int))

            z_new.append(z_r)

        log_p = 0

        for d, doc in enumerate(iterview(new_corpus)):
            for n, v in enumerate(doc.w):

                tmp = zeros(num_samples, dtype=float)

                for r in xrange(num_samples):

                    # for efficiency, resample only those
                    # token--component assignments belonging to
                    # previous tokens in the current document

                    for prev_n in xrange(0, n):

                        prev_v = doc.w[prev_n]
                        t = z_new[r][d][prev_n]

                        Nvt_new[r][prev_v, t] -= 1
                        Nt_new[r][t] -= 1
                        Ntd_new[r][d, t] -= 1

                        t = sample((Nvt_new[r][prev_v, :] +
                                    Nvt_plus_beta_n[prev_v, :]) /
                                   (Nt_new[r] + Nt_plus_beta) *
                                   (Ntd_new[r][d, :] + alpha_m))

                        Nvt_new[r][prev_v, t] += 1
                        Nt_new[r][t] += 1
                        Ntd_new[r][d, t] += 1

                        z_new[r][d][prev_n] = t

                    dist = ((Nvt_new[r][v, :] + Nvt_plus_beta_n[v, :]) /
                            (Nt_new[r] + Nt_plus_beta)) * (
                                (Ntd_new[r][d, :] + alpha_m) / (n + alpha))

                    tmp[r] = log(dist.sum())

                    t = sample(dist)

                    Nvt_new[r][v, t] += 1
                    Nt_new[r][t] += 1
                    Ntd_new[r][d, t] += 1

                    z_new[r][d][n] = t

                log_p += log_sum_exp(tmp) - log(num_samples)

        return log_p
Example 28
from glob import glob
import os

from iterview import iterview


for filename in iterview(glob('data/cache/html/*/*'), inc=1000):
    try:
        with file(filename) as f:
            contents = f.read()
            assert contents
            assert '<title>Off-Campus' not in contents
    except AssertionError:
        print 'Removing ', filename
        os.remove(filename)
Example 29
    def log_predictive_prob(self, new_corpus, num_samples):
        """
        Returns an approximation of the log probability of the
        specified new corpus given this instance's corpus (i.e.,
        document tokens) AND current set of token--component (i.e.,
        token--topic) assignments according to LDA.

        Arguments:

        new_corpus -- new corpus of documents
        num_samples -- ...
        """

        V, T = self.V, self.T

        D_new = len(new_corpus)

        alpha, alpha_m = self.alpha, self.alpha_m

        Nvt_plus_beta_n = self.Nvt_plus_beta_n
        Nt_plus_beta = self.Nt_plus_beta

        Nvt_new, Nt_new, Ntd_new, z_new = [], [], [], []

        for r in xrange(num_samples):

            Nvt_new.append(zeros((V, T), dtype=int))
            Nt_new.append(zeros(T, dtype=int))
            Ntd_new.append(zeros((D_new, T), dtype=int))

            z_r = []

            for doc in new_corpus:
                z_r.append(zeros(len(doc), dtype=int))

            z_new.append(z_r)

        log_p = 0

        for d, doc in enumerate(iterview(new_corpus)):
            for n, v in enumerate(doc.w):

                tmp = zeros(num_samples, dtype=float)

                for r in xrange(num_samples):

                    # for efficiency, resample only those
                    # token--component assignments belonging to
                    # previous tokens in the current document

                    for prev_n in xrange(0, n):

                        prev_v = doc.w[prev_n]
                        t = z_new[r][d][prev_n]

                        Nvt_new[r][prev_v, t] -= 1
                        Nt_new[r][t] -= 1
                        Ntd_new[r][d, t] -= 1

                        t = sample((Nvt_new[r][prev_v, :] + Nvt_plus_beta_n[prev_v, :]) / (Nt_new[r] + Nt_plus_beta) * (Ntd_new[r][d, :] + alpha_m))

                        Nvt_new[r][prev_v, t] += 1
                        Nt_new[r][t] += 1
                        Ntd_new[r][d, t] += 1

                        z_new[r][d][prev_n] = t

                    pass # YOUR CODE GOES HERE

                    Nvt_new[r][v, t] += 1
                    Nt_new[r][t] += 1
                    Ntd_new[r][d, t] += 1

                    z_new[r][d][n] = t

                log_p += log_sum_exp(tmp) - log(num_samples)

        return log_p
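The pass line above is an exercise stub; Example 27 fills it in by scoring and sampling the held-out token's topic:

                    dist = ((Nvt_new[r][v, :] + Nvt_plus_beta_n[v, :]) /
                            (Nt_new[r] + Nt_plus_beta)) * (
                                (Ntd_new[r][d, :] + alpha_m) / (n + alpha))

                    tmp[r] = log(dist.sum())

                    t = sample(dist)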