def check_data(self):
    self.Error.clear()
    if isinstance(self.data, Table) and \
            isinstance(self.selected_data, Table):
        if len(self.selected_data) == 0:
            self.Error.empty_selection()
            self.clear()
            return

        # keep only BoW features
        bow_domain = self.get_bow_domain()
        if len(bow_domain.attributes) == 0:
            self.Error.no_bow_features()
            self.clear()
            return

        self.data = Corpus.from_table(bow_domain, self.data)
        self.selected_data_transformed = Corpus.from_table(
            bow_domain, self.selected_data)

        if np_sp_sum(self.selected_data_transformed.X) == 0:
            self.Error.no_words_overlap()
            self.clear()
        elif len(self.data) == len(self.selected_data):
            self.Error.all_selected()
            self.clear()
        else:
            self.apply()
    else:
        self.clear()
def test_compute_values(self):
    """Test compute values on new data"""
    data = self._compute_features("Word count")

    computed = Corpus.from_table(data.domain, self.book_data)
    self.assertEqual(data.domain, computed.domain)
    self.assertTupleEqual((len(self.book_data), 1), computed.X.shape)
def load(self):
    self.progressBarInit()
    if not os.path.isdir(self.directory):
        print(f"error: {self.directory} is not a valid directory!",
              file=sys.stderr)
        self.progressBarFinished()
        return  # abort instead of continuing with no data
    files = list(Path(self.directory).glob(self._glob))
    if len(files) == 0:
        print("error: no files found!", file=sys.stderr)
        self.progressBarFinished()
        return  # abort instead of continuing with no data
    mails = []
    seen = {}
    for i, filename in enumerate(files):
        try:
            mails.append(list(parse_enron_mail(filename)))
            key = "#".join([mails[-1][0], mails[-1][7]])
            if key in seen:
                mails[-1][3] = self.YESSTRING
            seen[key] = True
        except Exception as e:
            print(filename)
            print(e)
        self.progressBarSet(100 * (i + 1) / len(files))
    domain = self.corpusDomain(mails)
    table = Table.from_list(domain, mails)
    self.Outputs.data.send(Corpus.from_table(table.domain, table))
    self.progressBarFinished()
def load(self):
    files = list(Path(self.directory).glob(self._glob))
    self.progress.advance(0)
    mails = []
    self.progress.iter = len(files)
    for i, filename in enumerate(files):
        try:
            mails.append(parse_enron_mail(filename))
        except Exception as e:
            print(filename)
            print(e)
        self.progress.advance()
    table = table_from_frame(pd.DataFrame(mails))
    self.Outputs.data.send(Corpus.from_table(table.domain, table))
def load(self):
    self.label.setText("")
    self.progressBarInit()
    if re.search(r"^\d+$", str(self.filePattern)):
        patientFileName = tactusloaderLIB.makeFileName(self.filePattern)
        table, mails = tactusloaderLIB.processFile(self.directory,
                                                   patientFileName)
        self.progressBarSet(100)
        self.label.setText("read 1 file")
    else:
        # keep a file name around for the warning message below
        patientFileName = str(self.filePattern)
        table, fileCounter = self.readFiles(self.directory, self.filePattern)
        self.label.setText("read " + str(fileCounter) + " files")
    self.progressBarFinished()
    if len(table) > 0:
        self.Outputs.data.send(Corpus.from_table(table.domain, table))
    else:
        self.label.setText("Warning: non-existent data file\n" +
                           self.directory + "/" + patientFileName +
                           "\nor empty corpus")
def tfidf_keywords(
        corpus: Corpus, progress_callback: Callable = None
) -> List[List[Tuple[str, float]]]:
    """
    Extract keywords using TF-IDF.

    Parameters
    ----------
    corpus : Corpus
        Corpus of documents to extract keywords from.
    progress_callback : callable
        Function for reporting progress.

    Returns
    -------
    keywords : list
        Per-document lists of (word, score) tuples.
    """
    if progress_callback is None:
        progress_callback = dummy_callback

    # empty X part - to know that every feature of X is bag of words
    domain = Domain([], class_vars=corpus.domain.class_vars,
                    metas=corpus.domain.metas)
    corpus = corpus.from_table(domain, corpus)

    vectorizer = BowVectorizer(
        wlocal=BowVectorizer.COUNT,
        wglobal=BowVectorizer.IDF if len(corpus) > 1 else BowVectorizer.NONE,
        norm=BowVectorizer.L2,
    )
    res = vectorizer.transform(corpus)
    X, words = res.X, [a.name for a in res.domain.attributes]

    keywords = []
    n_docs = X.shape[0]
    for i, row in enumerate(X):
        progress_callback(i / n_docs)
        nonzero = row.nonzero()
        if len(nonzero) > 1:
            keywords.append([(words[i], row[0, i]) for i in nonzero[1]])
        else:
            keywords.append([])
    return keywords
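# --- Usage sketch for tfidf_keywords (not part of the function above).
# Assumes orange3-text is installed and that its bundled "book-excerpts"
# sample corpus is available; the preprocessing choices are illustrative only.
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import LowercaseTransformer, RegexpTokenizer

corpus = Corpus.from_file("book-excerpts")
for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+")):
    corpus = pp(corpus)  # each preprocessor returns a new, tokenized corpus

keywords = tfidf_keywords(corpus)
print(keywords[0][:5])  # (word, tf-idf weight) pairs for the first document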
def _embedding_similarity(
        corpus: Corpus,
        words: List[str],
        callback: Callable,
        embedding_language: str,
) -> np.ndarray:
    language = LANGS_TO_ISO[embedding_language]
    # make sure there will be only embeddings in X after calling the embedder
    corpus = Corpus.from_table(Domain([], metas=corpus.domain.metas), corpus)
    emb = DocumentEmbedder(language)

    cb_part = len(corpus) / (len(corpus) + len(words))
    document_embeddings, skipped = emb.transform(
        corpus, wrap_callback(callback, 0, cb_part))
    assert skipped is None

    words = [[w] for w in words]
    word_embeddings = np.array(
        emb.transform(words, wrap_callback(callback, cb_part, 1 - cb_part)))
    return cosine_similarity(document_embeddings.X, word_embeddings)
def _embedding_similarity(
        corpus: Corpus,
        words: List[str],
        callback: Callable,
        embedding_language: str,
) -> np.ndarray:
    ticks = iter(np.linspace(0, 0.8, len(corpus) + len(words)))

    # TODO: embedders currently report only success; unify them to report
    # progress as a float
    def emb_cb(success: bool):
        if success:
            callback(next(ticks))

    language = LANGS_TO_ISO[embedding_language]
    # make sure there will be only embeddings in X after calling the embedder
    corpus = Corpus.from_table(Domain([], metas=corpus.domain.metas), corpus)
    emb = DocumentEmbedder(language)
    document_embeddings, skipped = emb(corpus, emb_cb)
    assert skipped is None
    word_embeddings = np.array(emb([[w] for w in words], emb_cb))
    return cosine_similarity(document_embeddings.X, word_embeddings)
def convert(self, table):
    if table:
        self.Outputs.corpus.send(Corpus.from_table(table.domain, table))
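# --- Usage sketch of the Table -> Corpus conversion used throughout this
# section (assumes Orange3 and orange3-text are installed; "zoo" is just a
# stock example dataset with a string meta column that Corpus can treat as text).
from Orange.data import Table
from orangecontrib.text import Corpus

table = Table("zoo")
corpus = Corpus.from_table(table.domain, table)
print(type(corpus).__name__, len(corpus))  # the Table rows are now a Corpus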