Example #1
	def RLDeconvolution(self, RLiterations, PSF, threads=multiprocessing.cpu_count(), PSF_pad=0):
		'''Input: RLiterations=number of iterations to perform
			PSF=point spread function (an EELS spectrum object)
		Optional arguments:
			threads=number of CPU threads to use while deconvolving (default: all of them)
			PSF_pad=value to pad the PSF with (or None to skip padding)'''
		PSF_sym = PSF.SymmetrizeAroundZLP()
		if PSF_pad is not None:
			data_length = np.size(self.SpectrumRange)
			PSF_length = np.size(PSF_sym.intensity)
			pad_length = data_length // 2 - (1 + data_length) % 2 - (PSF_length - (PSF_length % 2)) // 2
			if PSF_sym.ZLP < data_length/2:
				PSF_sym = PSF.PadSpectrum(pad_length, pad_value=PSF_pad, pad_side='left').SymmetrizeAroundZLP()
			elif PSF_sym.ZLP > data_length/2:
				PSF_sym = PSF_sym.PadSpectrum(pad_length, pad_value=PSF_pad, pad_side='right')
		print('Beginning deconvolution...')
		loopyP = partial(loopy, iterations=RLiterations, PSF=PSF_sym.Normalize().intensity)
		x_deconv = np.array(handythread.parallel_map(loopyP, abs(self.Normalize()),
			threads=threads))
		# Alternative without taking the absolute value of the normalized data:
		# x_deconv = np.array(handythread.parallel_map(loopyP, self.Normalize(),
		#     threads=threads))
		x_deconv = np.ma.array(x_deconv, mask=self.data.mask)
		print('Done %s iterations!' % RLiterations)

		return EELSSpectrumImage(x_deconv, self.dispersion)
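
The per-spectrum worker loopy is defined elsewhere in the package and is not shown in this example. A minimal sketch of a Richardson-Lucy update matching the partial(loopy, iterations=..., PSF=...) call above could look like the following; the signature, the mirrored-PSF convolution, and the small epsilon guard are assumptions, not the original implementation:

import numpy as np

def loopy(signal, iterations, PSF):
    # Hypothetical Richardson-Lucy worker (assumed, not the original):
    # `signal` is one 1-D spectrum, `PSF` the normalized point spread
    # function sampled on the same energy axis.
    estimate = np.array(signal, dtype=float, copy=True)
    PSF_mirror = PSF[::-1]
    for _ in range(iterations):
        blurred = np.convolve(estimate, PSF, mode='same')
        ratio = signal / (blurred + 1e-12)          # guard against division by zero
        estimate *= np.convolve(ratio, PSF_mirror, mode='same')
    return estimate
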
    def RLDeconvolution_Adaptive(self,
                                 RLiterations,
                                 PSF,
                                 threads=multiprocessing.cpu_count(),
                                 PSF_pad=0):
        '''Adaptive variant of RLDeconvolution: each spectrum is deconvolved
        with the corresponding spectrum of the PSF spectrum image.  Arguments
        are the same as for RLDeconvolution.'''
        PSF_sym = PSF.SymmetrizeAroundZLP()
        data_sym = self.SymmetrizeAroundZLP()
        if PSF_pad is not None:
            data_length = np.size(
                self.SpectrumRange)  ##replace w/ self.size[2]
            PSF_length = np.shape(PSF_sym.data)[2]
            pad_length = int(data_length / 2 - (1 + data_length) % 2 -
                             PSF_length // 2)
            if PSF_sym.ZLP < data_length / 2:
                PSF_sym = PSF.PadSpectrum(
                    pad_length, pad_value=PSF_pad,
                    pad_side='left').SymmetrizeAroundZLP()
            elif PSF_sym.ZLP > data_length / 2:
                PSF_sym = PSF_sym.PadSpectrum(pad_length,
                                              pad_value=PSF_pad,
                                              pad_side='right')

            if data_sym.ZLP < data_length / 2:
                data_sym = self.PadSpectrum(
                    pad_length, pad_value=PSF_pad + 1,
                    pad_side='left').SymmetrizeAroundZLP()
            elif data_sym.ZLP > data_length / 2:  # note: compares data_sym.ZLP (the original checked PSF_sym.ZLP here)
                data_sym = self.PadSpectrum(pad_length,
                                            pad_value=PSF_pad + 1,
                                            pad_side='right')
        print('Beginning deconvolution...')
        loopyP_adapt = partial(loopy_adapt, iterations=RLiterations)
        deconvolution_arrays = np.append(np.expand_dims(data_sym.Normalize(),
                                                        axis=-1),
                                         np.expand_dims(PSF_sym.Normalize(),
                                                        axis=-1),
                                         axis=-1)
        x_deconv = np.array(
            handythread.parallel_map(loopyP_adapt,
                                     deconvolution_arrays,
                                     threads=threads))
        x_deconv = np.ma.array(x_deconv, mask=self.data.mask)
        print('Done %s iterations!' % RLiterations)
        return EELSSpectrumImage(x_deconv, dispersion=self.dispersion)
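
Likewise, loopy_adapt is not part of this example. Given how deconvolution_arrays is built above (the spectrum and its PSF stacked along a new trailing axis), a compatible worker might be sketched as below; the name, signature, and unpacking are assumptions, and it reuses the hypothetical loopy sketched earlier:

import numpy as np

def loopy_adapt(stacked, iterations):
    # Hypothetical adaptive worker (assumed): the trailing axis packs each
    # spectrum together with its own PSF, as assembled in the method above.
    signal = stacked[..., 0]
    PSF = stacked[..., 1]
    # Deconvolve spectrum by spectrum with the 1-D Richardson-Lucy worker.
    return np.array([loopy(s, iterations, p)
                     for s, p in zip(np.atleast_2d(signal), np.atleast_2d(PSF))])
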
def litekmeans(X, k, max_iter=50):
    '''k-means clustering of the rows of X (n_samples x n_features), with the
    distance computations done in batches to limit memory use.  Returns
    (label, center, obj): per-sample cluster labels, a (k, n_features) array
    of centroids, and the final clustering objective.'''
    X = X.T
    n = X.shape[1]
    ndim = X.shape[0]
    last = 0
    label = np.random.randint(k, size=(n, ))
    iteration = 0
    batchsize = 100000
    nbatches = int(np.ceil(n / batchsize))
    center = np.zeros((ndim, k), dtype=np.float32)
    while np.any(label != last):
        start = time.time()
        iteration += 1
        print('iteration: {0}'.format(iteration))

        E = scipy.sparse.coo_matrix(
            (np.ones((n, ), dtype=np.int64), (np.arange(n), label)),
            shape=(n, k),
            dtype=np.float64).tocsr()

        # E = one hot assignments
        # spdiags... = counts
        print('max of E.sum(0): %s' % (E.sum(0).max(), ))
        print('max of (1.0/E.sum(0)): %s' % ((1.0 / E.sum(0)).max(), ))
        print('min of E.sum(0): %s' % (E.sum(0).min(), ))
        print('min of (1.0/E.sum(0)): %s' % ((1.0 / E.sum(0)).min(), ))
        print('np.all(1.0/E.sum(0) == np.inf): %r' % np.all(1.0 / E.sum(0) == np.inf))
        center = X * E * scipy.sparse.spdiags(1.0 / (E.sum(0) + 0.0000000001),
                                              0, k, k)
        c2 = 0.5 * np.sum(center**2, 0).T[:, None]
        last = label
        label = np.zeros((n, ), dtype=np.int64)

        def get_labels(batchidx):
            return np.argmax(np.dot(
                center.T, X[:, j * batchsize + batchidx *
                            1000:min(n, j * batchsize +
                                     (batchidx + 1) * 1000)]) - c2,
                             axis=0)

        for j in range(nbatches):
            print('processing batch {0:d} / {1:d}'.format(j + 1, nbatches))

            tmp = handythread.parallel_map(get_labels,
                                           range(int(np.ceil(batchsize /
                                                             1000))),
                                           threads=8)
            label[j * batchsize:min(n, int((j + 1) *
                                           batchsize))] = np.concatenate(tmp)
        if iteration >= max_iter:
            break
        print('iteration took {0:d} seconds'.format(int(time.time() - start)))
    obj = 0
    Xsq = 0.5 * np.sum(X**2, 0)
    batchsize = 10000
    nbatches = int(np.ceil(n / batchsize))
    csq = 0.5 * np.sum(center**2, 0)

    # TODO: do this stuff in parallel as well (takes longer than expected)
    def compute_sqd(batchidx):
        tempX = X[:, j * batchsize +
                  batchidx * 100:min(n, j * batchsize + (batchidx + 1) * 100)]
        temp = np.dot(-center.T, tempX) + csq[:, None]
        tmp = Xsq[j * batchsize +
                  batchidx * 100:min(n, j * batchsize +
                                     (batchidx + 1) * 100)] + temp
        temp_mindist = np.min(
            Xsq[j * batchsize +
                batchidx * 100:min(n, j * batchsize +
                                   (batchidx + 1) * 100)] + temp,
            axis=0)
        return np.sum(temp_mindist)

    for j in range(nbatches):
        tmp = handythread.parallel_map(compute_sqd,
                                       range(int(np.ceil(batchsize / 100))),
                                       threads=8)
        obj += np.sum(tmp)
        print('obj: %r' % (obj, ))
    #print obj
    #for j in range(nbatches):
    #    tempX = X[:, j * batchsize:min(n, (j + 1) * batchsize)]
    #    temp = np.dot(-center.T, tempX) + csq[:, None]
    #    print 'Xsq[j * batchsize:min(n, (j + 1) * batchsize)].mean(): %r' % (
    #        Xsq[j * batchsize:min(n, (j + 1) * batchsize)].mean(), )
    #    print 'mean of temp: %s' % (temp.mean(), )
    #    tmp = Xsq[j * batchsize:min(n, (j + 1) * batchsize)] + temp
    #
    #    temp_mindist = np.min(
    #        Xsq[j * batchsize:min(n, (j + 1) * batchsize)] + temp,
    #        axis=0
    #    )
    #    obj = obj + np.sum(temp_mindist)
    print('obj: %r' % (obj, ))
    center = center.T
    return (label, center, obj)
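
A quick illustrative call of litekmeans (not part of the original example; it assumes handythread, numpy, and scipy are importable, since the function uses handythread.parallel_map internally):

import numpy as np

rng = np.random.default_rng(0)
X = rng.random((10000, 16)).astype(np.float32)        # 10,000 samples, 16 features
label, center, obj = litekmeans(X, k=8, max_iter=20)
print(label.shape, center.shape, obj)                 # (10000,), (8, 16), final objective
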
    def test_parallel_map(self):
        l = range(100)
        r = handythread.parallel_map(lambda x: x**2, l)
        for i in range(len(l)):
            self.assertEqual(l[i]**2, r[i])
Example #5
def run(doc_topics_filename, topic_keys_filename, state_filename, max_dict, meta_filename,
        solr_url, output_dir, date_format):

    if not os.path.exists(doc_topics_filename):
        raise FileNotFoundError(doc_topics_filename)

    if not os.path.exists(topic_keys_filename):
        raise FileNotFoundError(topic_keys_filename)

    if not os.path.exists(state_filename):
        raise FileNotFoundError(state_filename)

    if meta_filename is not None and not os.path.exists(meta_filename):
        raise FileNotFoundError(meta_filename)

    start = time()

    # read the mallet output
    print("Reading {}...".format(doc_topics_filename), end='', flush=True)
    doc_topics, num_topics = read_doc_topics(doc_topics_filename, return_num_topics=True)
    print("done, {:,} topics, {:,} documents".format(num_topics, len(doc_topics)))

    print("Reading {}...".format(topic_keys_filename), end='', flush=True)
    topic_keys, num_topwords = read_topic_keys(topic_keys_filename, return_num_topwords=True)
    print("done, {:,} top words per topic".format(num_topwords))

    print("Reading {}...".format(state_filename), end='', flush=True)
    state = read_state(state_filename)
    print("done")

    if meta_filename is not None:
        print("Reading metadata from {}...".format(meta_filename), end='', flush=True)
        doc_meta = read_meta(meta_filename, date_format)
        print("done, {:,} entries found".format(len(doc_meta)))
    else:
        print("Retrieving metadata from {}...".format(solr_url), end='', flush=True)
        doc_meta = retrieve_meta(doc_topics, solr_url, date_format)
        print("done, {:,} entries retrieved".format(len(doc_meta)))

    topicids = range(num_topics)

    # check if pruning is desired
    if max_dict > 0:
        print("Pruning tokens, keeping only the top {:,} tokens by frequency...".format(max_dict), end='', flush=True)
        state = prune_state(state, max_dict)
        print("done")

        print("Re-indexing tokens...", end='', flush=True)
        state['typeindex'] = pd.factorize(state['typeindex'])[0]
        print("done")

    print("Processing state data...", end='', flush=True)
    # extract mapping of token id to token
    tokenid_map = state[['typeindex', 'type']].drop_duplicates()
    tokenid_map.columns = ['tokenid', 'token']

    # extract mapping of token -> document -> topic
    state = state[['doc', 'typeindex', 'topic']]
    state.columns = ['docid', 'tokenid', 'topic']

    # compute token, topic -> count mapping
    token_topic_count = state[['tokenid', 'topic']].groupby(['tokenid', 'topic'], sort=False).size().reset_index()
    token_topic_count.columns = ['tokenid', 'topic', 'count']

    num_tokens = len(tokenid_map)

    doc_topics_complete = pd.merge(doc_meta, doc_topics, on='source')
    doc_topics_complete = doc_topics_complete.rename(columns={'id': 'docid', 'id_x': 'volid', 'id_y': 'docid'},
                                                     copy=False)

    docid_publishdate = doc_topics_complete[['docid', 'year']]
    print("done, {:,} tokens".format(num_tokens))

    # Create aggregate state object
    print("Calculating aggregate stats...", end='', flush=True)
    agg = state.groupby(['docid', 'tokenid', 'topic']).size().reset_index()
    agg.columns = ['docid', 'tokenid', 'topic', 'count']
    agg_token = pd.merge(agg, tokenid_map, on='tokenid')
    full_state = pd.merge(agg_token, docid_publishdate, on='docid')
    full_state.drop('docid', axis=1, inplace=True)
    corpus_token_counts_by_year = full_state[['year', 'count']].groupby('year').sum().reset_index()
    topic_keywords = pd.melt(topic_keys,
                             id_vars='id',
                             value_vars=['key.' + str(i) for i in range(num_topwords)],
                             value_name='token')
    topic_keywords = topic_keywords.rename(columns={'id': 'topic'}, copy=False)
    topic_keywords.drop('variable', axis=1, inplace=True)
    topic_token_counts_by_year = \
        pd.merge(full_state[['topic', 'year', 'token', 'count']],
                 topic_keywords,
                 on=['topic', 'token']).groupby(['topic', 'year', 'token']).sum().reset_index()
    print("done")

    print("Creating topics...", end='', flush=True)

    def create_topic(topicid):
        token_count = token_topic_count[token_topic_count['topic'] == topicid]
        vector = np.repeat(np.float64(0), num_tokens)
        for tid, _, cnt in token_count.itertuples(index=False):
            vector[tid] = np.float64(cnt)

        return Topic(vector)

    topics = list(parallel_map(create_topic, topicids))
    print("done")

    # calculate trend
    print("Calculating topic trend...", end='', flush=True)
    state_trend = pd.merge(state[['docid', 'topic']],
                           docid_publishdate,
                           on='docid')[['topic', 'year']].groupby(['topic', 'year']).size().reset_index()

    state_trend.columns = ['topic', 'year', 'count']

    def slope(topic):
        state_small = state_trend[(state_trend['topic'] == topic) & (state_trend['year'] != -1)]

        dates = state_small['year'].values
        values = state_small['count'].values

        lm = stats.linregress(dates, values)
        return lm[0]

    topic_keys['trend'] = list(parallel_map(slope, topicids))
    print("done")

    # Insert dists
    print("Calculating topic distance from center...", end='', flush=True)
    topic_keys['dist'] = list(parallel_map(lambda x: x.length, topics))
    print("done")

    print("Calculating topic means...", end='', flush=True)
    topic_keys['mean'] = doc_topics.loc[:, 'topic.0':].mean().values  # .loc replaces the removed DataFrame.ix
    print("done")

    # Create Distance Matrix
    dist = pd.DataFrame(data=0, dtype='float64', index=topicids, columns=topicids)
    print("Calculating inter-topic distances...", end='', flush=True)

    def calc_dist(pair):
        x, y = pair
        d = topics[x].distance(topics[y])
        dist.iloc[x, y] = d
        dist.iloc[y, x] = d

    parallel_for(calc_dist, itertools.combinations(topicids, 2))
    print("done")

    # write results
    print("Writing out results...", end='', flush=True)
    doc_topics.to_csv(os.path.join(output_dir, 'documents.csv'), index=False, encoding='utf-8')
    topic_keys.to_csv(os.path.join(output_dir, 'topics.csv'), index=False, encoding='utf-8')
    tokenid_map.to_csv(os.path.join(output_dir, 'tokens.csv'), index=False, encoding='utf-8')
    agg.to_csv(os.path.join(output_dir, 'state.csv'), index=False, encoding='utf-8')
    dist.to_csv(os.path.join(output_dir, 'distance.csv'), encoding='utf-8')
    corpus_token_counts_by_year.to_csv(os.path.join(output_dir, 'counts_by_year.csv'), index=False, encoding='utf-8')
    topic_token_counts_by_year.to_csv(os.path.join(output_dir, 'counts_by_topic_year.csv'),
                                      index=False, encoding='utf-8')

    if solr_url is not None:
        doc_meta.drop('year', axis=1, inplace=True)
        doc_meta.to_csv(os.path.join(output_dir, 'docmeta.csv'), index=False, encoding='utf-8')
    print("done")

    elapsed = int(time() - start)

    print("All done. Time elapsed: {}".format(timedelta(seconds=elapsed)))