def merge(dicts):
    """
    Merges a list of dicts, summing their values.
    (Parallelized wrapper around `_count`)
    """
    # Split the input into chunks, sum each chunk in parallel,
    # then sum the per-chunk results.
    chunks = list(np.array_split(dicts, 20))
    results = parallel(_count, chunks, n_jobs=-1)
    return _count(results)
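# `_count` and `parallel` are defined elsewhere in the module. A minimal
# sketch of the dict-summing behavior `merge` relies on might look like the
# following; the Counter-based implementation is an assumption, not the
# project's actual code.
from collections import Counter

def _count_sketch(dicts):
    # Sum the values of every dict in `dicts` into a single dict, key by key.
    total = Counter()
    for d in dicts:
        total.update(d)
    return dict(total)

# With that shape, `merge` sums each of the 20 chunks in parallel and then
# sums the per-chunk results into the final map.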
def tokenize(self, docs):
    stops = stopwords.words('english')

    # `stops_path` (the stopwords file RAKE expects) is defined elsewhere.
    r = Rake(stops_path)

    if self.n_jobs == 1:
        # Keep only the keyword text (kw[0]), dropping RAKE's scores,
        # and filter out bare stopwords.
        keywords = [[kw[0] for kw in r.run(doc) if kw[0] not in stops]
                    for doc in docs]
    else:
        keywords = [[kw[0] for kw in kwd if kw[0] not in stops]
                    for kwd in parallel(r.run, docs, self.n_jobs)]
    return keywords
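# Assuming the `Rake` implementation returns (phrase, score) pairs -- which
# the kw[0] indexing above suggests -- a call looks roughly like this
# (the scores and phrases are illustrative, not real output):
#
#   r = Rake(stops_path)  # stops_path: path to a stopwords file
#   pairs = r.run('Compatibility of systems of linear constraints')
#   # e.g. [('linear constraints', 4.0), ('compatibility', 1.0), ...]
#   keywords = [phrase for phrase, score in pairs]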
def preprocess(self, docs):
    #print('Cleaning...')
    # Bind the cleaning options once, then apply to every document.
    clean_func = partial(
        clean,
        remove_urls=self.remove_urls,
        lowercase=self.lowercase,
        remove_possessors=self.remove_possessors,
        remove_punctuation=self.remove_punctuation
    )
    if self.n_jobs == 1:
        return [clean_func(d) for d in docs]
    else:
        return parallel(clean_func, docs, self.n_jobs)
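# `clean` is imported from elsewhere; judging by the keyword arguments bound
# above, its interface is roughly the following. This is a sketch of the
# assumed behavior, not the project's implementation.
import re

def clean_sketch(doc, remove_urls=True, lowercase=True,
                 remove_possessors=True, remove_punctuation=True):
    if remove_urls:
        doc = re.sub(r'https?://\S+', '', doc)
    if lowercase:
        doc = doc.lower()
    if remove_possessors:
        doc = re.sub(r"'s\b", '', doc)       # drop possessive 's
    if remove_punctuation:
        doc = re.sub(r'[^\w\s]', '', doc)
    return doc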
def tokenize(self, docs):
    """
    Tokenizes documents, using a lemmatizer.

    Args:
        | docs (list) -- the text documents to process.

    Returns:
        | list -- a list of token lists, one per document.
    """
    if self.n_jobs == 1:
        return [self._tokenize(doc) for doc in docs]
    else:
        return parallel(self._tokenize, docs, self.n_jobs)
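# `self._tokenize` is not shown here. A minimal lemmatizing tokenizer
# consistent with the docstring might look like this (an assumption, not the
# class's actual method):
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def _tokenize_sketch(doc, lem=WordNetLemmatizer()):
    # Split the document into word tokens and lemmatize each one.
    return [lem.lemmatize(tok) for tok in word_tokenize(doc.lower())]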
def train_tf(tokens_stream, out=None, **kwargs):
    """
    Train a map of term frequencies on a list of files (parallelized).
    """
    print('Counting terms...')
    results = parallel(count_tf, tokens_stream, n_jobs=-1)

    print('Merging...')
    tf = merge(results)

    if out is not None:
        with open(out, 'w') as f:
            json.dump(tf, f)

    return tf
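# `count_tf` is assumed to map one token stream to raw term counts, so that
# `merge` can sum the per-document maps into a corpus-wide one. A sketch of
# that assumed shape:
from collections import Counter

def count_tf_sketch(tokens):
    return dict(Counter(tokens))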
def train_idf(tokens_stream, out=None, **kwargs):
    """
    Train an IDF model on a list of files (parallelized).
    """
    idfs = parallel(count_idf, tokens_stream, n_jobs=-1)
    N = len(idfs)  # n docs

    idf = merge(idfs)
    for k, v in idf.items():
        idf[k] = math.log(N / v)
        # v ~= N/(math.e ** idf[k])

    # Keep track of N to update IDFs
    idf['_n_docs'] = N

    if out is not None:
        with open(out, 'w') as f:
            json.dump(idf, f)

    return idf
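# The per-term update is the standard inverse document frequency,
# idf(t) = log(N / df(t)): a term appearing in 10 of 1,000 documents scores
# log(100) ~= 4.6. Storing N under '_n_docs' lets the document frequency be
# recovered later (v ~= N / e**idf[k], as the comment above notes) when the
# model is updated. A hypothetical lookup helper, defaulting unseen terms to
# the maximum score log(N / 1):
import math

def idf_lookup(idf, term):
    N = idf['_n_docs']
    return idf.get(term, math.log(N))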
def tokenize(self, docs):
    # Only build a lemmatizer if requested; pre_tokenize is assumed to skip
    # lemmatization when lem is None (otherwise `lem` would be undefined below).
    lem = WordNetLemmatizer() if self.lemmatize else None

    #print('RAKE tokenizing...')
    pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

    # Debugging leftover: print any pre-tokens starting with 'one'.
    for i, tdoc in enumerate(pre_tdocs):
        for t in tdoc:
            if t.startswith('one'):
                print(t)
                print(i)

    #print('Additional Tokenizing docs...')
    if self.n_jobs == 1:
        tdocs = [pre_tokenize(doc, tdoc, lem=lem)
                 for doc, tdoc in zip(docs, pre_tdocs)]
    else:
        tdocs = parallel(partial(pre_tokenize, lem=lem),
                         zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

    #print('Training bigram...')
    if self.bigram is None:
        self.bigram = Phrases(tdocs,
                              min_count=self.min_count,
                              threshold=self.threshold,
                              delimiter=b' ')
    else:
        self.bigram.add_vocab(tdocs)

    #print('Training trigram...')
    if self.trigram is None:
        self.trigram = Phrases(self.bigram[tdocs],
                               min_count=self.min_count,
                               threshold=self.threshold,
                               delimiter=b' ')
    else:
        self.trigram.add_vocab(self.bigram[tdocs])

    return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]]
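# The two Phrases models compose: the bigram model joins frequent token
# pairs, and the trigram model, trained on bigrammed output, extends those
# pairs by one more token. A toy illustration using the gensim 3.x API
# (matching the bytes delimiter above); the corpus and thresholds are made up:
from gensim.models.phrases import Phrases

toy = [['new', 'york', 'city'], ['new', 'york', 'times']] * 20

bigram_sketch = Phrases(toy, min_count=1, threshold=0.01, delimiter=b' ')
trigram_sketch = Phrases(bigram_sketch[toy], min_count=1, threshold=0.01,
                         delimiter=b' ')

print(trigram_sketch[bigram_sketch[['new', 'york', 'city', 'hall']]])
# e.g. ['new york city', 'hall'] -- collocations scoring above the threshold
# come back joined on the space delimiter.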
def preprocess(self, docs):
    #print('HTML cleaning...')
    if self.n_jobs == 1:
        return [strip_html(d) for d in docs]
    else:
        return parallel(strip_html, docs, self.n_jobs)
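# `strip_html` is assumed to be a small helper that drops markup and keeps
# text. One possible implementation (an assumption; the project may use its
# own parser):
from bs4 import BeautifulSoup

def strip_html_sketch(doc):
    # Parse the document and return only its text content.
    return BeautifulSoup(doc, 'html.parser').get_text(separator=' ')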