Code example #1
    def decompose(self, corpus):

        skip_counts = bounter(size_mb=1024)
        word_counts = bounter(size_mb=1024)
        for l in corpus:
            wds = l.split()
            skips = list(skipgrams(wds, 2, 5))
            skips = ["#".join(t) for t in skips]
            if len(wds) > 0 and len(skips) > 0:
                skip_counts.update(skips)
                word_counts.update(wds)

        vocabulary = list(word_counts)
        shift = 1  # shift 1 does nothing since log(1) == 0.0
        M = count_skipgrams(skip_counts, word_counts, vocabulary, shift)
        # TODO: eigen something trick
        # singular value decomposition
        # U, _, V = svds(M, k=256)  # U, S, V
        U, _, V = sparsesvd(M, 300)
        # add context to U
        word_vecs = U.T + V.T
        del U
        del V
        # normalize rows
        word_vecs_norm = word_vecs / np.sqrt(
            np.sum(word_vecs * word_vecs, axis=1, keepdims=True))
        del word_vecs
        return vocabulary, word_vecs_norm
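
The counting pattern above can be reproduced in isolation. Below is a minimal sketch, assuming skipgrams comes from nltk.util (the import is not shown in the original); the toy corpus and the small size_mb values are made up for illustration.

# Minimal sketch of the word/skip-gram counting step (toy data, assumed nltk import).
from bounter import bounter
from nltk.util import skipgrams

corpus = ["the quick brown fox jumps over the lazy dog",
          "the quick red fox"]

word_counts = bounter(size_mb=64)
skip_counts = bounter(size_mb=64)
for line in corpus:
    words = line.split()
    # 2-token skip-grams with up to 5 intervening tokens, joined with '#'
    skips = ["#".join(pair) for pair in skipgrams(words, 2, 5)]
    if words and skips:
        word_counts.update(words)
        skip_counts.update(skips)

print(word_counts["the"])        # approximate frequency of 'the'
print(skip_counts["quick#fox"])  # approximate frequency of the skip-gram
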
Code example #2
File: segments.py  Project: mmh-max/py_autophrase
    def init_parameters(self):
        """参数初始化"""

        # 1. Initialize theta parameters

        # Iterate over all candidate phrases and count their frequencies
        counter = bounter.bounter(size_mb=self.memory_size_mb)
        for i in range(1, len(self.C) + 1):
            for j in range(i + 1,
                           min(i + self.length_threshold + 1,
                               len(self.C) - 1)):
                if not self.Q.startswith(self.C.word(i, j)):
                    break
                counter.update((self.C.word(i, j), ))

        # Frequency distribution of the candidate phrases
        distr = np.array([i for i in counter.values()])
        _mean, _std = distr.mean(), distr.std()
        norm = truncnorm((0. - _mean) / _std, (1. - _mean) / _std,
                         loc=_mean,
                         scale=_std)

        # θ <- {phrase -> phrase index}
        theta = bounter.bounter(size_mb=self.memory_size_mb)

        idx_theta = 0  # count from zero
        for candidate in counter.keys():
            theta[candidate] = idx_theta
            idx_theta += 1
        del counter

        # Randomly initialize θ from the truncated normal distribution
        param_theta = norm.rvs(size=idx_theta)

        # 2. Initialize delta parameters

        # Collect all possible POS-tag bigrams
        delta = self.Tx_Ty()
        # Randomly initialize δ from a uniform distribution
        param_delta = np.random.uniform(size=len(delta))

        self._delta = delta
        self._theta = theta
        self._param_delta = param_delta
        self._param_theta = param_theta

        # Track parameter convergence
        self._trace_delta.append(
            compare_arrays(param_delta, np.ones(param_delta.size)))
        self._trace_theta.append(
            compare_arrays(param_theta, np.ones(param_theta.size)))
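
The theta draw above uses scipy.stats.truncnorm, whose first two arguments are the truncation bounds expressed in standard deviations from loc. A small stand-alone sketch of that parameterization (the mean and std values are invented for illustration):

# Sketch of the truncated-normal draw used for theta (values are illustrative).
import numpy as np
from scipy.stats import truncnorm

_mean, _std = 3.2, 1.7           # e.g. mean/std of the raw phrase frequencies
lower, upper = 0.0, 1.0          # theta values are kept inside [0, 1]

norm = truncnorm((lower - _mean) / _std, (upper - _mean) / _std,
                 loc=_mean, scale=_std)
param_theta = norm.rvs(size=10)  # one draw per candidate phrase
assert np.all((param_theta >= lower) & (param_theta <= upper))
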
Code example #3
    def ngram_counter(self, remove_stopwords, memory_size_mb):
        # Bounter is a probabilistic frequency counter and cannot be
        # relied on for exact counts; see
        # https://github.com/RaRe-Technologies/bounter
        count = bounter.bounter(size_mb=memory_size_mb)

        if remove_stopwords:
            stopwords = one_token_a_line(fname='stopwords.txt')
        else:
            stopwords = None

        process_bar = tqdm.tqdm(self.corpus, ascii=True)
        for doc in process_bar:
            process_bar.set_description('Counting n-grams')
            if not isinstance(doc, Sentences):
                err = 'doc must be a Sentences object, but got: {}'.format(
                    type(doc))
                raise TypeError(err)

            for sent in doc:
                # Filter stopwords after the n-gram tokens are formed,
                # rather than removing them from the raw corpus
                if remove_stopwords and stopwords is not None:
                    ngram_tokens = [
                        t for t in ngram(sent, self.ngram_size)
                        if t not in stopwords
                    ]
                else:
                    ngram_tokens = ngram(sent, self.ngram_size)
                count.update(ngram_tokens)
        return count
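
To make the caveat in the comment concrete, here is a small sketch contrasting bounter with collections.Counter on the same token stream (toy data; with generous memory the two usually agree, but only Counter is guaranteed to be exact):

# Sketch: approximate bounter counts vs. exact Counter counts (toy data).
from collections import Counter
from bounter import bounter

tokens = ["a", "b", "a", "c", "a", "b"]

exact = Counter(tokens)
approx = bounter(size_mb=16)
approx.update(tokens)

for t in sorted(set(tokens)):
    # bounter may deviate under memory pressure; Counter never does
    print(t, exact[t], approx[t])
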
Code example #4
    def __init__(self, dumpfile=None,
                 lowfreq_threshold=0, highfreq_threshold=65535, hash_size=7 + 10**8,
                 hash_dtype=np.dtype([('counter', 'u2'), ('n-gram', bytes, 5)])):
        self.hash_dtype = hash_dtype
        self.hash_size = hash_size
        self.hash_dumpfile = dumpfile
        self.lowfreq_threshold = lowfreq_threshold
        self.highfreq_threshold = highfreq_threshold
        self.hash_add_tries = 0 # total number of hash-add calls
        self.hash_added_keys = 0 # number of buckets in use
        self.hash_relookups = 0 # total re-lookups across all hash-add calls
        self.hash_collisions = 0 # number of hash-add calls that fail on all re-lookups
        self.hash_ceilings = 0 # number of hash-add calls whose counter would exceed highfreq_threshold
        self.hash_overwrites = 0 # number of hash-add calls where the key overwrites another and the counter resets to 1
        self.hash_counter_lost = 0 # sum of the counters lost when keys are overwritten
        self.bnt = bounter(need_counts=False) # use HLL algorithm only
        self.bnt_count = 0
        self.ht = None
        self.hash_seeds = [2819948058, 5686873063, 1769651746, 8745608315, 2414950003, 
        3583714723, 1224464945, 2514535028] #np.random.randint(10**9,10**10,8)
        self.hash_funcs_num = len(self.hash_seeds)

        if dumpfile:
            self.ht = self.load(dumpfile)
        if self.ht is None:
            self.ht = self.init()
        if self.ht is not None:
            print("hash_table dtype(%s)" % self.ht.dtype)
            print("hash_size(%d): %s" % (len(self.ht), self.ht))
        else:
            print("hash_table load() or init() failed.")
Code example #5
    def num_documents_with_ngram(self, memory_size_mb):
        if self._candidates is None:
            raise ValueError(
                'n-gram candidates are None; cannot count the number of '
                'documents containing an n-gram without them.')

        # Fewer candidates mean less memory usage.
        count = bounter.bounter(size_mb=memory_size_mb // 2)

        process_bar = tqdm.tqdm(self.corpus, ascii=True)
        for doc in process_bar:
            process_bar.set_description('Counting doc with ngram')

            tokens_doc_level = set()
            if not isinstance(doc, Sentences):
                err = 'doc must be a Sentences object, but got: {}'.format(
                    type(doc))
                raise TypeError(err)

            for sent in doc:
                tokens_sent_level = set(t
                                        for t in ngram(sent, self.ngram_size)
                                        if t in self._candidates)
                tokens_doc_level.update(tokens_sent_level)

            if tokens_doc_level:
                count.update(tokens_doc_level)
        return count
Code example #6
File: test_bounter.py  Project: tjbookreader/bounter
    def test_sanity_nocount(self):
        counter = bounter(need_counts=False)
        counter.update([u'foo', u'bar', u'foo'])
        self.assertEqual(counter.total(), 3)
        self.assertEqual(counter.cardinality(), 2)

        with self.assertRaises(NotImplementedError):
            print(counter[u'foo'])
Code example #7
File: segments.py  Project: mmh-max/py_autophrase
    def update_delta(self, B):
        assert isinstance(B, list)
        assert len(B) > 1

        pre_param = self._param_delta

        m = len(B)
        n = len(self.C)

        TxTy_numerator = bounter.bounter(size_mb=self.memory_size_mb // 2)
        TxTy_denominator = bounter.bounter(size_mb=self.memory_size_mb // 2)

        bar = tqdm.tqdm(range(1, m - 1), ascii=True)

        for i in bar:
            bar.set_description('delta 1/3')

            for j in range(B[i], B[i + 1] - 1):
                TxTy_numerator.increment(self.C.tag(j, j + 2))

        bar = tqdm.tqdm(range(1, n), ascii=True)

        for i in bar:
            bar.set_description('delta 2/3')
            TxTy_denominator.increment(self.C.tag(i, i + 2))

        bar = tqdm.tqdm(self._delta.items(),
                        total=len(self._delta),
                        ascii=True)

        for txty, idx in bar:
            bar.set_description('delta 3/3')

            if TxTy_numerator[txty] != 0:
                print(txty)
                new_delta_value = TxTy_numerator[txty] / TxTy_denominator[txty]
                self._param_delta[idx] = new_delta_value
            else:
                self._param_delta[idx] = 0.

        self._trace_delta.append(compare_arrays(self._param_delta, pre_param))

        del pre_param
Code example #8
def count_number_worker(fname, size_mb=200):
    """
    Worker for multithreading the counting operation for speed.

    :param fname: the bz2 file to open.
    :param size_mb: the maximum memory footprint of the bounter.
    :return: bounter object on the subreddit keys
    """
    b = bounter.bounter(size_mb=size_mb)
    with bz2.open(fname, "rt", encoding="utf8") as F:
        b.update(json.loads(i)["subreddit"] for i in F)
    return b
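
A hedged usage sketch for the worker above; the file name is hypothetical, and each line of the bz2 file is assumed to be a JSON object with a 'subreddit' field, as in the Reddit comment dumps:

# Hypothetical call: count subreddit occurrences in one dump file.
counts = count_number_worker("RC_2015-01.bz2", size_mb=200)
print(counts.cardinality())   # approximate number of distinct subreddits
print(counts["AskReddit"])    # approximate number of posts in one subreddit
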
Code example #9
File: preprocessor.py  Project: zzozzolev/kor2vec
    def get_word_cnts_gt_min_cnt(self, sentences, min_cnt):
        flattened = [word for sentence in sentences for word in sentence]
        counter = bounter(size_mb=4096)
        counter.update(flattened)

        array_cnt = np.array(list(counter.iteritems()))
        array = array_cnt[:, 0]
        cnts = array_cnt[:, 1]
        cnts = cnts.astype(int)

        valid = np.where(cnts > min_cnt)
        valid_array = array[valid]
        valid_cnts = cnts[valid]
        word_cnts = list(zip(valid_array, valid_cnts))

        return word_cnts, valid_array, len(flattened)
Code example #10
    def count_ngrams(self,
                     sentences,
                     n,
                     use_bounter=True,
                     sep='\t',
                     **bounterargs):
        """Counts n-gram occurrences in a corpus.

        Counts n-gram occurrences in a corpus and inserts the output in an
        SQLite database.

        Parameters
        ----------
        sentences: Iterable
            Iterable of sentences. Each sentence must be a list of strings
            representing word features separated with the character that
            is passed to the 'sep' argument of this function.
        n: int or list of int
            length of the n-grams
        use_bounter: bool, default=True
            If True, counting is done with bounter, a probabilistic and
            memory-efficient counter. If False, a regular collections.Counter
            is used instead. Bounter is strongly recommended when working
            with a large corpus.
        sep: str, default '\t'
            The character that separates the features of each word in the
            input.
        **bounterargs
            keyword arguments passed to the bounter constructor if used.

        """
        messages.msg("Counting ngrams of length {}...".format(n))
        if use_bounter:
            bounterargs.setdefault('size_mb', 1024)
            counter = bounter(**bounterargs)
        else:
            counter = Counter()
        for sent in sentences:
            if isinstance(n, list):
                ngrams = list()
                for i in n:
                    ngrams += NgramCounter._gen_ngrams(sent, i)
            else:
                ngrams = NgramCounter._gen_ngrams(sent, n)
            counter.update(ngrams)
        messages.done()
        self._counts_to_db(counter, sep)
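
The same dispatch logic can be sketched outside the class. This is a simplified, self-contained version assuming a basic sliding-window stand-in for NgramCounter._gen_ngrams and no database step:

# Self-contained sketch of the count_ngrams dispatch (simplified n-gram helper).
from collections import Counter
from bounter import bounter

def gen_ngrams(sent, n):
    # stand-in for NgramCounter._gen_ngrams: n consecutive words joined by spaces
    return [' '.join(sent[i:i + n]) for i in range(len(sent) - n + 1)]

def count_ngrams_sketch(sentences, n, use_bounter=True, **bounterargs):
    if use_bounter:
        bounterargs.setdefault('size_mb', 1024)
        counter = bounter(**bounterargs)
    else:
        counter = Counter()
    for sent in sentences:
        if isinstance(n, list):
            ngrams = [g for i in n for g in gen_ngrams(sent, i)]
        else:
            ngrams = gen_ngrams(sent, n)
        counter.update(ngrams)
    return counter

counts = count_ngrams_sketch([["the", "cat", "sat"], ["the", "dog", "sat"]],
                             n=[1, 2], use_bounter=False)
print(counts["the"], counts["the cat"])  # 2 1
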
Code example #11
        def _index_words(df):
            all_words = np.concatenate(df['mecab'].values)

            if self.use_min_cnt:
                cnt = bounter(size_mb=4096)
                cnt.update(all_words)
                words_cnt = np.array(list(cnt.iteritems()))
                words = words_cnt[:, 0]
                cnts = words_cnt[:, 1]
                cnts = cnts.astype(int)
                unique_words = words[np.where(cnts >= self.word_min_cnt)]
            else:
                unique_words = pd.unique(all_words)
            print('number of unique_words:{}'.format(unique_words.shape[0]))

            w2i, i2w = {}, {}
            for i, w in enumerate(unique_words):
                w2i[w] = i + 1
                i2w[i + 1] = w

            return unique_words, w2i, i2w
Code example #12
File: counters.py  Project: simomarsili/ndd
def _frequencies_from_records(records, ids=None, size_mb=None):
    """Frequencies from records generator."""

    if ids is not None:
        x = set()
        if isinstance(ids, numbers.Integral):
            x.add(ids)
        else:
            x.update(ids)
        ids = x

    is_1d = None

    def is_sequence(obj):
        # returns True when the record should be treated as a single (1-d)
        # feature: a plain string or a non-sequence object
        if isinstance(obj, str):
            return True
        return not isinstance(obj, collections.abc.Sequence)

    def stringify(features):
        nonlocal is_1d
        if is_1d is None:
            is_1d = is_sequence(features)

        if is_1d:
            return str(features)

        if ids:  # select set of indices
            return ' '.join(str(x) for j, x in enumerate(features) if j in ids)
        return ' '.join(str(x) for x in features)

    if callable(records):
        records = records()

    if size_mb:
        # approximate counting using bounter
        counts = bounter(size_mb=size_mb)
    else:
        counts = collections.Counter()
    counts.update(stringify(row) for row in records)
    return counts
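
A hedged usage sketch for _frequencies_from_records on toy records; ids selects which columns of each record form the key, and leaving size_mb unset keeps the exact collections.Counter path:

# Hypothetical call on toy records (exact counting, since size_mb is None).
records = [(0, 'a', 'x'), (1, 'a', 'x'), (0, 'b', 'y')]
freqs = _frequencies_from_records(records, ids={1, 2})
print(dict(freqs))   # {'a x': 2, 'b y': 1}
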
Code example #13
File: utils.py  Project: quadrismegistus/lltk
def do_gen_mfw(paths_freqs,
               estimate=True,
               n=None,
               by_ntext=False,
               by_fpm=False,
               progress=False,
               desc='',
               num_proc=1,
               floatpad=100000):
    from bounter import bounter
    from collections import Counter
    from tqdm import tqdm
    countd = bounter(1024) if estimate else Counter()
    for freqs in pmap_iter(
            getfreqs,
            paths_freqs,
            kwargs=dict(by_ntext=by_ntext, by_fpm=by_fpm),
            progress=progress,
            num_proc=num_proc,
            desc='Computing most frequent words across all texts'):
        freqs = dict((w, c) for w, c in freqs.items() if is_valid_mfw_word(w))
        # if these aren't integers...
        typs = {type(c) for w, c in freqs.items()}
        # print(typs)
        if typs != {int}:
            # a plain Counter can accumulate float counts, but bounter cannot,
            # so when estimating convert the floats to integer counts
            if estimate:
                # scale the float count up and round it to an integer
                freqs_int = dict((w, int(math.ceil(c * floatpad)))
                                 for w, c in freqs.items())
                freqs = [w for w, c in freqs_int.items() for _ in range(c)]
                # print(f'freqs is now a list of {len(freqs)} items long')
        # print(f'freqs has {len(freqs)} keys now')
        countd.update(freqs)
        # print(f'countd now has {len(countd)} keys')
    # print(f'returning countd of {len(countd)} keys')
    return countd
Code example #14
def create_counts(input):
    qout = Queue(cpus * 2)
    workers = []
    logging.info("Spawning {} count processes on {}".format(cpus, input))
    for i in range(cpus):
        p = Process(target=counter, args=(qout, i, input, "count"))
        p.start()
        workers.append(p)

    wordcounter = bounter.bounter(memory)

    while True:

        try:
            input_dict = qout.get_nowait()
            logging.debug("inputting queue of length {} from worker".format(
                len(input_dict)))
            wordcounter.update(input_dict)

        except queue.Empty:
            if running_processes(workers):
                time.sleep(1 / 100)
            else:
                break
        except (ValueError, TypeError):
            for k, v in input_dict.items():
                print("'{}'\t'{}'".format(k, v))
                wordcounter.update({k: v})
            raise

    return wordcounter
Code example #15
File: segments.py  Project: mmh-max/py_autophrase
    def update_theta(self, B):
        assert isinstance(B, list)
        assert len(B) > 1

        pre_param = self._param_theta

        m = len(B)

        numerator = bounter.bounter(size_mb=self.memory_size_mb // 2)
        denominator = collections.defaultdict(int)

        bar = tqdm.tqdm(range(m - 1), ascii=True)

        for i in bar:
            bar.set_description('theta 1/2')
            numerator.increment(self.C.word(B[i], B[i + 1]))
            denominator[B[i + 1] - B[i]] += 1

        bar = tqdm.tqdm(self._theta.items(),
                        total=len(self._theta),
                        ascii=True)

        for candidate, idx in bar:
            bar.set_description('theta 2/2')

            if numerator[candidate] != 0:
                u_len = candidate.count(' ') + 1

                new_theta_value = numerator[candidate] / denominator[u_len]
                self._param_theta[idx] = new_theta_value
            else:
                self._param_theta[idx] = 0.

        self._trace_theta.append(compare_arrays(self._param_theta, pre_param))

        del pre_param
Code example #16
def count_posts(num_threads):
    """
    Main function to multithread the subreddit counting.  Saves the counts
    out to a new csv file.

    :param num_threads: int
        How many threads to use when processing the data.  More --> faster,
        but also uses more memory.
    :param reddit_dir: str
        The folder containing the Reddit Public Comment Corpus files in bz2
        format.  The files can be contained in subfolders, but they should
        be the ONLY files in this directory.
    :param outfile: str, ends with .csv
        The path to save the output file to.  Must be a .csv file.
    :return: bounter object with subreddit counts
    """
    # files = []
    # for i in os.walk("reddit"):
    #     if i[0] == "by subreddit": continue
    #     for j in i[2]:
    #         files.append(f"{i[0]}\\{j}")
    files = list(recursive_scan("reddit"))

    # multiprocess the counting of posts per subreddit.
    with multiprocessing.Pool(processes=num_threads) as pool:
        bounters = list(
            tqdm(pool.imap(count_number_worker, files),
                 desc="Counting Posts",
                 total=len(files)))
    big_bounter = bounter.bounter(size_mb=200)
    for i in bounters:
        big_bounter.update(i)
    df = pd.Series(dict(big_bounter.items()))
    df.to_csv("Subreddit Post Counts.csv")

    return big_bounter
Code example #17
    for e in v:
        e = e.split()
        for wd in e:
            nodes.append(wd)
months = sorted(months)
nodes = Counter(nodes)
nodes = {
    k: v
    for k, v in sorted(nodes.items(), key=lambda item: item[1])
    if v > 500 and (len(k) > 1 or k == "a") and k not in stoplist
}
vocabulary = set(nodes.keys())
print(len(vocabulary))

wd_times = {}
edges = bounter(size_mb=1024)
i = len(vocabulary)
for wd in vocabulary:
    print(f"remains {i} words")
    times = []
    for k, v in month_utterances.items():
        for e in v:
            e = e.split()
            e = [w for w in e if w in vocabulary]
            if len(e) > 1:
                if wd in set(e):
                    times.append(int(k))
                skips = list(skipgrams(e, 2, 3))
                skips = [skip[0] + "_" + skip[1] for skip in skips]
                edges.update(skips)
    if len(times) > 0:
Code example #18
File: text.py  Project: alvations/komorebi
    def __init__(self,
                 filename=None,
                 vocab_size=None,
                 max_len=None,
                 chunk_size=10**5,
                 delimiter=None,
                 size_mb=4024,
                 pad_symbol='<pad>',
                 start_symbol='<s>',
                 end_symbol='</s>',
                 unknown_symbol='<unk>',
                 default_pad_start=False,
                 default_pad_end=True,
                 filter_on=None,
                 prune_at=10**10,
                 encoding='utf8',
                 **kwargs):
        """
        This object stores text and reads it into vocabulary indices. It is
        an iterable that yields the vocabulary indices of the tokens in each
        sentence.
        :param filename: Textfile that contains source sentences.
        :type filename: str
        :param vocab_size: Max no. of words to keep in the source vocab.
        :type vocab_size: int
        :param chunk_size: Used to limit the no. of sentences loaded at a time when populating the vocabulary.
        :type chunk_size: int
        :param delimiter: Delimiter to split on when "tokenizing"
        :type delimiter: str
        :param size_mb: Memory footprint of the bounter object used to count the vocab.
        :type size_mb: int
        :param start_symbol: Start symbol used for padding.
        :type start_symbol: str
        :param end_symbol: End symbol used for padding.
        :type end_symbol: str
        :param unknown_symbol: Unknown symbol for OOV words.
        :type unknown_symbol: str
        :param default_pad_start: By default, pad the <s> to sentence when vectorizing.
        :type default_pad_start: bool
        :param default_pad_end: By default, pad the </s> to sentence when vectorizing.
        :type default_pad_end: bool
        :param filter_on: Option to filter on term-freq ('tf') or doc-freq ('df')
        :type filter_on: str
        :param prune_at: *prune_at* parameter used by gensim.Dictionary
        :type prune_at: int
        """
        if 'loadfrom' not in kwargs:  # Creating.

            self.filename = absolute_path(filename)

            # Check that inputs are not None.
            assert Path(self.filename).exists(
            ), "File {filename} does not exist".format(filename=filename)

            # Initialize encoding.
            self.encoding = encoding

            # Initialize the pad, start, end and unknown symbols.
            self.PAD, self.PAD_IDX = pad_symbol, 0
            self.START, self.START_IDX = start_symbol, 1
            self.END, self.END_IDX = end_symbol, 2
            self.UNK, self.UNK_IDX = unknown_symbol, 3
            self.default_pad_start = default_pad_start
            self.default_pad_end = default_pad_end

            # Save the user-specific delimiter
            self.delimiter = delimiter

            # Gensim related attribute to keep the pruning cap.
            self.prune_at = prune_at

            # Populate the source vocabulary.
            print('Creating Vocabulary...', end='\n', file=sys.stderr)
            self.vocab = Dictionary(
                [[pad_symbol], [start_symbol], [end_symbol], [unknown_symbol]],
                prune_at=self.prune_at)
            self.counter = bounter(size_mb=size_mb)

            print('Building source vocab and counter...',
                  end=' ',
                  file=sys.stderr)
            self.populate_dictionary(self.filename, self.vocab, self.counter,
                                     chunk_size)
            # Use the user-specified source/target vocab size if set,
            # else use the full vocab_size.
            self.vocab_size = (min(len(self.vocab), vocab_size)
                               if vocab_size else len(self.vocab))

            # Keep the vocabulary to a max set by user.
            if filter_on and self.vocab_size < len(self.vocab):
                print('Filtering least frequent words in vocab.',
                      end='\n',
                      file=sys.stderr)
                if filter_on == 'tf':
                    self.filter_n_least_frequent(
                        self.vocab,
                        self.counter,
                        self.vocab_size,
                        keep_tokens=['<pad>', '<s>', '</s>', '<unk>'])
                elif filter_on == 'df':
                    self.vocab.filter_extremes(
                        no_below=1,
                        no_above=self.prune_at,
                        keep_n=self.vocab_size,
                        keep_tokens=['<pad>', '<s>', '</s>', '<unk>'])

            self.iterable = self._iterate()

        else:  # Loading.
            self.load(kwargs['loadfrom'], filename,
                      kwargs.get('load_counter', False))
            self.iterable = self._iterate()
Code example #19
File: test_bounter.py  Project: tjbookreader/bounter
    def test_sanity_default(self):
        counter = bounter(size_mb=16)
        counter.update([u'foo', u'bar', u'foo'])
        self.assertEqual(counter[u'foo'], 2)
        self.assertEqual(counter[u'bar'], 1)
        self.assertEqual(counter.cardinality(), 2)
Code example #20
File: test_bounter.py  Project: tjbookreader/bounter
    def test_nocounts_init(self):
        counter = bounter(need_counts=False)
        self.assertTrue(issubclass(type(counter), CountMinSketch))
        self.assertEqual(counter.size(), 4)
Code example #21
File: test_bounter.py  Project: tjbookreader/bounter
    def test_ht_log_init(self):
        with self.assertRaises(ValueError):
            bounter(size_mb=4, log_counting=8)
Code example #22
File: test_bounter.py  Project: tjbookreader/bounter
    def test_cms_init_log8(self):
        counter = bounter(size_mb=1, need_iteration=False, log_counting=8)

        self.assertEqual(type(counter), CountMinSketch)
        self.assertEqual(type(counter.cms), cmsc.CMS_Log8)
        self.assertEqual(counter.size(), 2**20)
Code example #23
File: test_bounter.py  Project: tjbookreader/bounter
    def test_cms_init_default(self):
        counter = bounter(size_mb=64, need_iteration=False)

        self.assertEqual(type(counter), CountMinSketch)
        self.assertEqual(type(counter.cms), cmsc.CMS_Conservative)
        self.assertEqual(counter.size(), 2**26)
Code example #24
File: test_bounter.py  Project: tjbookreader/bounter
    def test_explicit_init(self):
        counter = bounter(size_mb=2, need_iteration=True)

        self.assertEqual(type(counter), HashTable)
        self.assertEqual(counter.buckets(), 2**16)
Code example #25
File: test_bounter.py  Project: tjbookreader/bounter
    def test_no_size_init(self):
        with self.assertRaises(ValueError):
            counter = bounter()
Code example #26
File: test_bounter.py  Project: tjbookreader/bounter
    def test_default_init(self):
        counter = bounter(7)
        self.assertEqual(type(counter), HashTable)
Code example #27
    def test_contains(self):
        counter = bounter(size_mb=16)
        counter.update([u'foo', u'bar', u'foo'])
        self.assertTrue('foo' in counter)
        self.assertFalse('foobar' in counter)
Code example #28
def process_pubtator(sample_ratio=0.03):
    """Sample Pubtator documents and batchify for multiprocessing"""
    logger.info('Processing PubTator documents...')

    global docs
    docs = []
    total_meshes = 0
    total_docs_read = 0
    bsz = 10000  # batch size
    word_freq = bounter(size_mb=1024 * 4)

    def cb_proc_pubtator(res):
        nonlocal total_meshes
        docs_, words_, num_meshes = res
        docs.extend(docs_)
        word_freq.update(words_)
        total_meshes += num_meshes
        print('total_meshes {}, num_meshes {}\r'
              ''.format(total_meshes, num_meshes),
              end='')

    p = mp.Pool()
    # Read the PubTator datafile
    with pubtator_file.open('rt') as f:
        batch = []
        aDoc = []
        flgSample = rnd.random() < sample_ratio
        while True:
            line = f.readline()
            if not line:  # End of file
                break
            if not flgSample:  # Do nothing and just read on
                if line == '\n':  # End of document
                    flgSample = rnd.random() < sample_ratio
                    total_docs_read += 1
                else:
                    continue
            else:
                if line == '\n':  # End of document
                    total_docs_read += 1
                    batch.append(aDoc)  # Add current document
                    flgSample = rnd.random() < sample_ratio
                    aDoc = []
                    if len(batch) == bsz:  # If batch is full, assign a job
                        p.apply_async(mp_proc_pubtator, (batch, ),
                                      callback=cb_proc_pubtator)
                        batch = []
                else:
                    aDoc.append(line.rstrip())
        if len(batch) > 0:
            p.apply_async(mp_proc_pubtator, (batch, ),
                          callback=cb_proc_pubtator)
    p.close()
    p.join()

    # move from bounter to Counter (bounter doesn't have most_common())
    global words
    words = Counter({k: v for k, v in word_freq.items()})
    logger.info('{}/{} documents processed, {} mesh terms found ({} meshes per'
                ' doc)'.format(len(docs), total_docs_read, total_meshes,
                               total_meshes / len(docs)))
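
A minimal sketch of the bounter-to-Counter handoff done above (toy counts), since bounter itself has no most_common():

# Sketch: copy approximate counts into a Counter to get most_common (toy data).
from collections import Counter
from bounter import bounter

word_freq = bounter(size_mb=16)
word_freq.update(["gene", "protein", "gene", "cell"])

words = Counter({k: v for k, v in word_freq.items()})
print(words.most_common(2))   # e.g. [('gene', 2), ('protein', 1)]
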
Code example #29
import pickle

import networkx as nx
import numpy as np
from bounter import bounter

with open("data/langmods/seedonly.p", "rb") as f:
    vocab = pickle.load(f)

m = np.load("data/langmods/seedonly.npy")

G = nx.Graph()
counts = bounter(size_mb=1024)

for i in range(len(vocab)):
    source = vocab[i]
    dists = np.dot(m, m[i])
    closests = sorted(list(dists), reverse=True)[:6]
    edges = []
    for c in closests:
        wordid = list(dists).index(c)
        target = vocab[wordid]
        if source != target:
            t = "@".join(sorted((source, target)))
            edges.append(t)
    counts.update(edges)

G = nx.Graph()
i = 0
for skip, freq in counts.iteritems():
    if freq > 0:
Code example #30
    def __init__(self, bounter_size_mb=1024, *args, **kwargs):
        # Global counter for term frequency
        self.TF = bounter(size_mb=bounter_size_mb)