Example #1
def merge(dicts):
    """
    Merges a list of dicts, summing their values.
    (Parallelized wrapper around `_count`)
    """
    chunks = np.array_split(dicts, 20)  # split the work into 20 roughly equal chunks
    results = parallel(_count, chunks, n_jobs=-1)
    return _count(results)
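All of the examples on this page call a project-internal `parallel(fn, args, n_jobs, ...)` helper that is never shown. A minimal sketch of what such a helper might look like, assuming joblib; the signature and the `expand_args` flag are inferred from the calls in these examples, not taken from the project:

from joblib import Parallel, delayed

def parallel(fn, args, n_jobs, expand_args=False):
    """Hypothetical stand-in: apply `fn` to every item of `args` across `n_jobs` workers."""
    if expand_args:
        # Each item is a tuple of positional arguments (see Example #11)
        return Parallel(n_jobs=n_jobs)(delayed(fn)(*a) for a in args)
    return Parallel(n_jobs=n_jobs)(delayed(fn)(a) for a in args)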
Example #2
def merge(dicts):
    """
    Merges a list of dicts, summing their values.
    (Parallelized wrapper around `_count`)
    """
    chunks = np.array_split(dicts, 20)  # split the work into 20 roughly equal chunks
    results = parallel(_count, chunks, n_jobs=-1)
    return _count(results)
Example #3
    def tokenize(self, docs):
        stops = stopwords.words('english')
        r = Rake(stops_path)

        if self.n_jobs == 1:
            # r.run(doc) yields (phrase, score) pairs; keep phrases that aren't stopwords
            keywords = [[kw[0] for kw in r.run(doc) if kw[0] not in stops] for doc in docs]
        else:
            keywords = [[kw[0] for kw in kwd if kw[0] not in stops] for kwd in parallel(r.run, docs, self.n_jobs)]
        return keywords
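RAKE's `run` method returns scored candidate phrases, which is why the comprehension keeps `kw[0]`. A self-contained illustration of the filtering step, using a made-up result in place of `r.run(doc)`:

stops = {'the', 'of', 'and'}

# Shape of a typical RAKE result: (candidate phrase, relevance score) pairs.
fake_rake_output = [('machine learning', 8.5), ('the', 1.0), ('neural networks', 6.0)]

keywords = [kw[0] for kw in fake_rake_output if kw[0] not in stops]
print(keywords)  # ['machine learning', 'neural networks']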
Example #4
    def tokenize(self, docs):
        stops = stopwords.words('english')
        r = Rake(stops_path)

        if self.n_jobs == 1:
            # r.run(doc) yields (phrase, score) pairs; keep phrases that aren't stopwords
            keywords = [[kw[0] for kw in r.run(doc) if kw[0] not in stops]
                        for doc in docs]
        else:
            keywords = [[kw[0] for kw in kwd if kw[0] not in stops]
                        for kwd in parallel(r.run, docs, self.n_jobs)]
        return keywords
Example #5
    def preprocess(self, docs):
        #print('Cleaning...')
        clean_func = partial(clean,
                             remove_urls=self.remove_urls,
                             lowercase=self.lowercase,
                             remove_possessors=self.remove_possessors,
                             remove_punctuation=self.remove_punctuation)
        if self.n_jobs == 1:
            return [clean_func(d) for d in docs]
        else:
            return parallel(clean_func, docs, self.n_jobs)
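The `partial` call pre-binds the cleaning flags so that the result is a one-argument callable that can be mapped over `docs`, serially or via `parallel`. A small illustration with a hypothetical `clean` (the real one is not shown on this page):

from functools import partial

def clean(doc, lowercase=True, remove_punctuation=True):
    # Hypothetical stand-in for the project's clean(): apply only the requested steps.
    if lowercase:
        doc = doc.lower()
    if remove_punctuation:
        doc = ''.join(ch for ch in doc if ch.isalnum() or ch.isspace())
    return doc

clean_func = partial(clean, lowercase=True, remove_punctuation=False)
print([clean_func(d) for d in ['Hello, World!', 'Parallel TEXT cleaning.']])
# ['hello, world!', 'parallel text cleaning.']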
Example #6
    def preprocess(self, docs):
        #print('Cleaning...')
        clean_func = partial(
            clean,
            remove_urls=self.remove_urls,
            lowercase=self.lowercase,
            remove_possessors=self.remove_possessors,
            remove_punctuation=self.remove_punctuation
        )
        if self.n_jobs == 1:
            return [clean_func(d) for d in docs]
        else:
            return parallel(clean_func, docs, self.n_jobs)
Example #7
    def tokenize(self, docs):
        """ Tokenizes a document, using a lemmatizer.

        Args:
            | doc (str)                 -- the text document to process.

        Returns:
            | list                      -- the list of tokens.
        """
        if self.n_jobs == 1:
            return [self._tokenize(doc) for doc in docs]
        else:
            return parallel(self._tokenize, docs, self.n_jobs)
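`self._tokenize` itself is not shown. A plausible minimal version, assuming NLTK's `word_tokenize` and `WordNetLemmatizer` (an assumption suggested by the docstring, not confirmed by the snippet):

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()

def _tokenize(doc):
    # Hypothetical: lowercase, split into word tokens, lemmatize each token.
    return [lem.lemmatize(tok) for tok in word_tokenize(doc.lower())]

print(_tokenize('The cats are running'))  # e.g. ['the', 'cat', 'are', 'running']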
Example #8
def train_tf(tokens_stream, out=None, **kwargs):
    """
    Train a map of term frequencies on a list of files (parallelized).
    """
    print('Counting terms...')
    results = parallel(count_tf, tokens_stream, n_jobs=-1)

    print('Merging...')
    tf = merge(results)

    if out is not None:
        with open(out, 'w') as f:
            json.dump(tf, f)

    return tf
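`count_tf` and `merge` are project helpers; the overall shape is "count each token list, then sum the partial counts". A minimal sketch with `collections.Counter` standing in for both (hypothetical, not the project's implementation):

from collections import Counter

def count_tf(tokens):
    # Hypothetical: term frequencies for a single token list.
    return Counter(tokens)

def merge(dicts):
    # Hypothetical: sum the per-document counts into one map.
    total = Counter()
    for d in dicts:
        total.update(d)
    return dict(total)

results = [count_tf(doc) for doc in [['a', 'b', 'a'], ['b', 'c']]]
print(merge(results))  # {'a': 2, 'b': 2, 'c': 1}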
Example #9
def train_idf(tokens_stream, out=None, **kwargs):
    """
    Train an IDF model on a list of files (parallelized).
    """
    idfs = parallel(count_idf, tokens_stream, n_jobs=-1)
    N = len(idfs) # n docs
    idf = merge(idfs)

    for k, v in idf.items():
        idf[k] = math.log(N/v)
        # v ~= N/(math.e ** idf[k])

    # Keep track of N to update IDFs
    idf['_n_docs'] = N

    if out is not None:
        with open(out, 'w') as f:
            json.dump(idf, f)

    return idf
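The loop turns each term's document frequency into idf = ln(N / df), which is what the inline comment inverts. A tiny worked example with toy numbers:

import math

N = 4                        # documents counted
df = {'data': 4, 'rare': 1}  # how many documents each term appears in

idf = {term: math.log(N / count) for term, count in df.items()}
print(idf)  # {'data': 0.0, 'rare': 1.386...}  since ln(4/4) = 0 and ln(4/1) ≈ 1.386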
Example #10
def train_idf(tokens_stream, out=None, **kwargs):
    """
    Train an IDF model on a list of files (parallelized).
    """
    idfs = parallel(count_idf, tokens_stream, n_jobs=-1)
    N = len(idfs)  # n docs
    idf = merge(idfs)

    for k, v in idf.items():
        idf[k] = math.log(N / v)
        # v ~= N/(math.e ** idf[k])

    # Keep track of N to update IDFs
    idf['_n_docs'] = N

    if out is not None:
        with open(out, 'w') as f:
            json.dump(idf, f)

    return idf
Example #11
    def tokenize(self, docs):
        # Only build a lemmatizer when requested; pre_tokenize receives it below.
        lem = WordNetLemmatizer() if self.lemmatize else None

        #print('RAKE tokenizing...')
        pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

        # Debug check: surface any pre-tokens that begin with 'one'
        for i, tdoc in enumerate(pre_tdocs):
            for t in tdoc:
                if t.startswith('one'):
                    print(t)
                    print(i)

        #print('Additional Tokenizing docs...')
        if self.n_jobs == 1:
            tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)]
        else:
            tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

        #print('Training bigram...')
        if self.bigram is None:
            self.bigram = Phrases(tdocs,
                                  min_count=self.min_count,
                                  threshold=self.threshold,
                                  delimiter=b' ')
        else:
            self.bigram.add_vocab(tdocs)

        #print('Training trigram...')
        if self.trigram is None:
            self.trigram = Phrases(self.bigram[tdocs],
                                   min_count=self.min_count,
                                   threshold=self.threshold,
                                   delimiter=b' ')
        else:
            self.trigram.add_vocab(self.bigram[tdocs])

        return list(self.trigram[self.bigram[tdocs]])
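The bigram and trigram models are stacked: `Phrases` learns frequent word pairs, and feeding `bigram[tdocs]` into a second `Phrases` lets it learn pairs of already-merged bigrams, i.e. trigrams. A small sketch, assuming gensim 3.x (where `delimiter` is bytes, as in the snippet above); the thresholds and toy corpus are chosen only so the merge is visible:

from gensim.models.phrases import Phrases

tdocs = [['machine', 'learning', 'is', 'fun'],
         ['machine', 'learning', 'with', 'python'],
         ['deep', 'machine', 'learning']]

bigram = Phrases(tdocs, min_count=1, threshold=1, delimiter=b' ')
trigram = Phrases(bigram[tdocs], min_count=1, threshold=1, delimiter=b' ')

print(list(trigram[bigram[tdocs]]))
# e.g. [['machine learning', 'is', 'fun'], ['machine learning', 'with', 'python'],
#       ['deep', 'machine learning']]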
Example #12
    def preprocess(self, docs):
        #print('HTML cleaning...')
        if self.n_jobs == 1:
            return [strip_html(d) for d in docs]
        else:
            return parallel(strip_html, docs, self.n_jobs)
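`strip_html` is another project helper that is not shown. A minimal stand-in using only the standard library's `html.parser` (hypothetical; the project may use a different HTML cleaner):

from html.parser import HTMLParser

class _TagStripper(HTMLParser):
    """Collects only the text nodes of an HTML fragment."""
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.parts = []

    def handle_data(self, data):
        self.parts.append(data)

def strip_html(doc):
    # Hypothetical stand-in: drop the tags, keep the text.
    stripper = _TagStripper()
    stripper.feed(doc)
    return ''.join(stripper.parts)

print(strip_html('<p>Hello <b>world</b></p>'))  # 'Hello world'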