import concurrent.futures
import logging
import os
import time
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

# Project-local helpers referenced below (preprocess_tokenize, th,
# shrink_vocabulary, Loader, Corpus) are assumed importable from elsewhere
# in the repo.


def preprocess_text(docs):
    # Split the documents into one slice per CPU core.
    num_task = os.cpu_count()
    len_slices = len(docs) // num_task
    remainder_slices = len(docs) % num_task
    texts = []
    stoplist = set(stopwords.words('english'))
    wn.ensure_loaded()  # force WordNet to load before the workers fork
    t_start = time.perf_counter()
    with ProcessPoolExecutor(max_workers=num_task) as executor:
        futures_tokenize = []
        for n in range(num_task):
            upper_bound = (n + 1) * len_slices
            if n == num_task - 1:
                # The last slice absorbs the remainder.
                upper_bound += remainder_slices
            futures_tokenize.append(executor.submit(
                preprocess_tokenize,
                docs[n * len_slices:upper_bound], stoplist))
        for future in concurrent.futures.as_completed(futures_tokenize):
            texts += future.result()
    t_stop = time.perf_counter()
    print("removed stopwords and lemmatized in {} s".format(t_stop - t_start))

    # Add bigrams to the documents (only those that appear 20 times or more).
    bigram = Phraser(Phrases(texts, min_count=20))
    for idx in range(len(texts)):
        for token in bigram[texts[idx]]:
            if '_' in token:
                # Token is a bigram: append it to the document.
                texts[idx].append(token)

    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=30, no_above=0.5)
    # Drop the uninformative token "like" if it survived the filtering;
    # indexing token2id unconditionally would raise a KeyError otherwise.
    if "like" in dictionary.token2id:
        dictionary.filter_tokens(bad_ids=[dictionary.token2id["like"]])
    # Reserve id 0 for a padding token.
    dictionary.patch_with_special_tokens({'_pad_': 0})
    return texts, dictionary
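# --- Hedged sketch: `preprocess_tokenize` is submitted to the pool above but
# defined elsewhere in the repo. Assuming its contract is (slice of raw
# strings, stopword set) -> list of token lists, a minimal worker could look
# like this. The name `preprocess_tokenize_sketch` and the tokenizer choice
# are illustrative assumptions, not the original implementation.
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def preprocess_tokenize_sketch(doc_slice, stoplist):
    lemmatizer = WordNetLemmatizer()
    result = []
    for doc in doc_slice:
        # Lowercase, keep alphabetic tokens, drop stopwords, lemmatize.
        result.append([lemmatizer.lemmatize(tok)
                       for tok in word_tokenize(doc.lower())
                       if tok.isalpha() and tok not in stoplist])
    return result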
def further_preprocessing_phase(temp_data_frame):
    # `th.tokenize_text` is a project-local helper that turns a raw string
    # into a token list; missing texts become empty strings.
    temp_data_frame['text'] = temp_data_frame['text'].apply(
        lambda text: th.tokenize_text(text) if text is not None else '')
    # If this raises an exception, the likely cause is empty texts.
    textlist = temp_data_frame['text'].tolist()
    patent_dictionary = Dictionary(textlist)
    corpus = [patent_dictionary.doc2bow(text) for text in textlist]
    print('original dictionary size: ', len(patent_dictionary))

    # Accumulate collection-wide term frequencies per token id.
    vocab_tf = {}
    for bow in corpus:
        for token_id, count in bow:
            vocab_tf[token_id] = vocab_tf.get(token_id, 0) + int(count)

    # Remove rare tokens (collection frequency <= 5), then the 30 most
    # frequent ones. Note that filter_extremes(no_below=0) still applies its
    # default no_above=0.5 document-frequency cap.
    remove_ids = [token_id for token_id, count in vocab_tf.items()
                  if count <= 5]
    patent_dictionary.filter_tokens(bad_ids=remove_ids)
    patent_dictionary.filter_extremes(no_below=0)
    patent_dictionary.filter_n_most_frequent(30)
    print('parsed dictionary size: ', len(patent_dictionary))

    vocabulary = list(patent_dictionary.token2id.keys())
    ids_list = []
    data_frame = pd.DataFrame(columns=['patent_id', 'text', 'classification'])
    # `shrink_vocabulary` (project-local) rewrites each row against the pruned
    # vocabulary, filling `data_frame` and recording dropped ids in `ids_list`.
    temp_data_frame.apply(
        lambda row: shrink_vocabulary(row, vocabulary, data_frame, ids_list),
        axis=1)
    print(len(ids_list))
    data_frame.set_index(data_frame['patent_id'], inplace=True)
    data_frame.drop(ids_list, axis=0, inplace=True)
    return data_frame
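# --- Hedged usage sketch for further_preprocessing_phase: the frame layout
# ('patent_id', 'text', 'classification') is taken from the function body;
# the rows below are made up. The call stays commented out because it needs
# the project-local helpers `th.tokenize_text` and `shrink_vocabulary`.
demo_frame = pd.DataFrame({
    'patent_id': ['US0001', 'US0002'],
    'text': ['A method for coating a metallic substrate.',
             'An apparatus comprising a coated substrate.'],
    'classification': ['C23C', 'C23C'],
})
# cleaned_frame = further_preprocessing_phase(demo_frame)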
class Corpora(Loader):
    """Collection of Corpus objects sharing one gensim Dictionary.

    Iteration yields token lists, bag-of-words vectors, or sentences
    depending on the `iterator` argument.
    """
    is_built = False

    def __init__(self, data_path: str, prefix: str = None,
                 iterator: str = 'token', parsing: str = 'simple',
                 word_up_limit: float = 0.75, word_low_limit: int = 20,
                 dictionary: str = None, shuffle: bool = False,
                 seed: int = 42, document_minimum_length: int = 5,
                 stopwords: str = None):
        iter_map = dict(token=self.tokenize, bow=self.bowize,
                        sentences=self.sentences)
        self.iterator = iter_map[iterator]
        self.word_low_limit = word_low_limit
        self.word_up_limit = word_up_limit
        if stopwords:
            with open(stopwords) as f:
                self.stopwords = [w.strip() for w in f.readlines()]
        else:
            self.stopwords = []
        if not dictionary:
            self.dictionary = Dictionary()
        else:
            # A pre-built dictionary file skips the build step entirely.
            self.dictionary = Dictionary.load_from_text(dictionary)
            if self.stopwords:
                self.dictionary.filter_tokens(
                    bad_ids=self.dictionary.doc2idx(self.stopwords))
            self.is_built = True
        self.shuffle = shuffle
        if self.shuffle:
            np.random.seed(seed)
        self.document_minimum_length = document_minimum_length
        corpus = self.init_corpus(data_path, prefix, parsing)
        super(Corpora, self).__init__(corpus=corpus)

    def __enter__(self):
        if not self.is_built:
            self.build()
        return super(Corpora, self).__enter__()

    def __exit__(self, *args):
        self.clear()
        return super(Corpora, self).__exit__(*args)

    def __iter__(self):
        for v in self.iterator():
            yield v

    def __getitem__(self, key):
        return self.iterator(index=key)

    def init_corpus(self, path: str, prefix: str, parsing: str):
        """Load every corpus folder under `path`, optionally filtered by `prefix`."""
        directory = [os.path.join(path, f) for f in os.listdir(path)]
        folders = list(filter(lambda p: os.path.isdir(p), directory))
        if prefix:
            folders = list(filter(lambda p: prefix in p, folders))
        corpus = [Corpus(path=p, parsing=parsing).load() for p in folders]
        self.__paths = {c.path: c for c in corpus}
        return corpus

    def load_vectors(self, path: str):
        """Load a word-vector map from a csv file."""
        if not path.endswith('.csv'):
            raise AssertionError(
                'Asserted the vectors to be provided with csv.')
        #TODO Use dask in case of too large word vector maps.
        return pd.read_csv(path)

    def build(self):
        """Populate the shared dictionary from every corpus, then prune it."""
        if self.is_built:
            logging.warning('Attempted to build already built Corpora.')
            return self
        for c in self.corpus:
            self.dictionary.add_documents(c.tokens)
            c.clear()
        self.dictionary.filter_extremes(no_below=self.word_low_limit,
                                        no_above=self.word_up_limit)
        return self

    def clear(self):
        """Drop the current dictionary."""
        self.dictionary = Dictionary()

    def bowize(self, index=None):
        """Yield (bag-of-words, corpora length) pairs for each document."""
        N = len(self)
        iterable = self._iterator(index)
        for idx in self._indices(iterable):
            corpus = iterable[idx]
            tokens = corpus.tokens
            for ind in self._indices(tokens):
                doc_tokens = tokens[ind]
                bow = self.dictionary.doc2bow(doc_tokens)
                if len(bow) > self.document_minimum_length:
                    yield bow, N
                else:
                    logging.warning(
                        f'Received empty file at {corpus.documents[ind]}, skipping.')
                    corpus.mark_empty(ind)
            corpus.clear()

    def tokenize(self, index=None):
        """Yield (token list, corpora length) pairs for each document."""
        N = len(self)
        iterable = self._iterator(index)
        for idx in self._indices(iterable):
            corpus = iterable[idx]
            tokens = corpus.tokens
            self._move()  # progress hook, assumed to come from the Loader base class
            for ind in self._indices(tokens):
                doc_tokens = tokens[ind]
                if len(doc_tokens) > self.document_minimum_length:
                    yield doc_tokens, N
                else:
                    logging.warning(
                        f'Received empty file at {corpus.documents[ind]}, skipping.')
                    corpus.mark_empty(ind)
            corpus.clear()

    def sentences(self, index=None):
        """Yield sentences longer than the document minimum length."""
        iterable = self._iterator(index=index)
        for ind in self._indices(iterable=iterable):
            corpus = iterable[ind]
            for sentence in corpus.sentences:
                if len(sentence) > self.document_minimum_length:
                    yield sentence
                else:
                    logging.warning(
                        f'Received empty file at {corpus.documents[ind]}, skipping.')

    def documents(self, index=None):
        """Yield documents, flattening single-document corpora."""
        for c in self.corpus:
            if len(c) > 1:
                yield c.documents
            else:
                for doc in c.documents:
                    yield doc

    @property
    def years(self):
        """Sorted list of the years covered by the corpora."""
        return sorted([int(c.year) for c in self.corpus])

    def _iterator(self, index=None):
        """Resolve `index` (int position or path string) to a corpus list."""
        iterator = self.corpus
        # Compare against None so position 0 is still addressable.
        if index is not None:
            if isinstance(index, int):
                iterator = [self.corpus[index]]
            #TODO: Handle indices as slice
            elif isinstance(index, str):
                iterator = [self.__paths[index]]
        return iterator

    def _indices(self, iterable):
        """Document order: shuffled if requested, sequential otherwise."""
        if self.shuffle:
            indices = np.random.permutation(len(iterable))
        else:
            indices = range(len(iterable))
        return indices
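# --- Hedged usage sketch for Corpora: `Loader` and `Corpus` are
# project-local, so the directory layout ('data/corpora/<folder>', one corpus
# per folder) is an assumption based on init_corpus and the `years` property.
# The call stays commented out because those classes are not available here.
# with Corpora(data_path='data/corpora', prefix='patents',
#              iterator='bow', word_low_limit=20, word_up_limit=0.75) as corpora:
#     # __enter__ builds the shared dictionary; iteration yields (bow, N).
#     for bow, total_corpora in corpora:
#         pass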