Example #1
0
    def check_data(self):
        """Validate the two inputs and either run the analysis or reset.

        Both ``self.data`` and ``self.selected_data`` must be Table
        instances; every validation failure reports an error and clears
        the widget state.
        """
        self.Error.clear()

        # No usable inputs: just reset, no error message.
        if not (isinstance(self.data, Table)
                and isinstance(self.selected_data, Table)):
            self.clear()
            return

        if len(self.selected_data) == 0:
            self.Error.empty_selection()
            self.clear()
            return

        # Restrict both tables to bag-of-words features only.
        bow_domain = self.get_bow_domain()
        if len(bow_domain.attributes) == 0:
            self.Error.no_bow_features()
            self.clear()
            return

        self.data = Corpus.from_table(bow_domain, self.data)
        self.selected_data_transformed = Corpus.from_table(
            bow_domain, self.selected_data)

        if np_sp_sum(self.selected_data_transformed.X) == 0:
            self.Error.no_words_overlap()
            self.clear()
        elif len(self.data) == len(self.selected_data):
            self.Error.all_selected()
            self.clear()
        else:
            self.apply()
    def check_data(self):
        """Check inputs; run the analysis when valid, otherwise clear.

        Requires both inputs to be Table instances, a non-empty
        selection, at least one bag-of-words feature, some word overlap,
        and a selection that is a proper subset of the data.
        """
        self.Error.clear()
        inputs_valid = (isinstance(self.data, Table)
                        and isinstance(self.selected_data, Table))
        if not inputs_valid:
            self.clear()
            return

        if len(self.selected_data) == 0:
            self.Error.empty_selection()
            self.clear()
            return

        # Keep only bag-of-words features.
        bow_domain = self.get_bow_domain()
        if len(bow_domain.attributes) == 0:
            self.Error.no_bow_features()
            self.clear()
            return

        self.data = Corpus.from_table(bow_domain, self.data)
        self.selected_data_transformed = Corpus.from_table(
            bow_domain, self.selected_data)

        if np_sp_sum(self.selected_data_transformed.X) == 0:
            self.Error.no_words_overlap()
            self.clear()
            return
        if len(self.data) == len(self.selected_data):
            self.Error.all_selected()
            self.clear()
            return

        self.apply()
    def test_compute_values(self):
        """Compute values must transfer the features to new data."""
        data = self._compute_features("Word count")

        transformed = Corpus.from_table(data.domain, self.book_data)
        self.assertEqual(data.domain, transformed.domain)
        self.assertTupleEqual((len(self.book_data), 1), transformed.X.shape)
Example #4
0
    def load(self):
        """Parse every mail file matching ``self._glob`` in ``self.directory``,
        mark duplicate mails, and send the resulting Corpus to the output.

        On an invalid directory or when no files match, the error is
        reported on stderr and the load is aborted (the original fell
        through and tried to build a corpus from no data).
        """
        self.progressBarInit()
        if not os.path.isdir(self.directory):
            print(f"error: {self.directory} is not a valid directory!",
                  file=sys.stderr)
            # Abort: globbing a non-existent directory cannot succeed.
            self.progressBarFinished()
            return
        files = list(Path(self.directory).glob(self._glob))
        if len(files) == 0:
            print("error: no files found!", file=sys.stderr)
            # Abort instead of building an empty corpus below.
            self.progressBarFinished()
            return
        mails = []
        seen = {}
        for i, filename in enumerate(files):
            try:
                mails.append(list(parse_enron_mail(filename)))
                # Duplicate detection keyed on fields 0 and 7 (presumably
                # sender and subject — TODO confirm against parse_enron_mail);
                # repeated keys get field 3 flagged.
                key = "#".join([mails[-1][0], mails[-1][7]])
                if key in seen:
                    mails[-1][3] = self.YESSTRING
                seen[key] = True
            except Exception as e:
                # Best-effort: report the bad file and keep going.
                print(filename)
                print(e)
            self.progressBarSet(100 * (i + 1) / len(files))

        domain = self.corpusDomain(mails)
        table = Table.from_list(domain, mails)
        self.Outputs.data.send(Corpus.from_table(table.domain, table))
        self.progressBarFinished()
Example #5
0
    def load(self):
        """Parse all mail files matching the glob pattern and emit a Corpus."""
        files = list(Path(self.directory).glob(self._glob))
        self.progress.advance(0)
        parsed = []
        self.progress.iter = len(files)
        for path in files:
            try:
                parsed.append(parse_enron_mail(path))
            except Exception as err:
                # Best-effort: report the offending file and continue.
                print(path)
                print(err)
            self.progress.advance()

        table = table_from_frame(pd.DataFrame(parsed))
        self.Outputs.data.send(Corpus.from_table(table.domain, table))
 def load(self):
     """Load patient file(s) from ``self.directory`` and emit a Corpus.

     A purely numeric ``self.filePattern`` names a single patient file;
     anything else is treated as a pattern matching several files. An
     empty result shows a warning label instead of sending output.

     Fixes vs. original: raw string for the regex (invalid escape
     sequence), and ``patientFileName`` no longer raises NameError when
     the multi-file branch produces an empty table.
     """
     self.label.setText("")
     self.progressBarInit()
     patientFileName = None
     if re.search(r"^\d+$", str(self.filePattern)):
         patientFileName = tactusloaderLIB.makeFileName(self.filePattern)
         table, mails = tactusloaderLIB.processFile(self.directory, patientFileName)
         self.progressBarSet(100)
         self.label.setText("read 1 file")
     else:
         table, fileCounter = self.readFiles(self.directory, self.filePattern)
         self.label.setText("read " + str(fileCounter) + " files")
     self.progressBarFinished()
     if len(table) > 0:
         self.Outputs.data.send(Corpus.from_table(table.domain, table))
     else:
         # On the multi-file path patientFileName was never bound in the
         # original, crashing here; fall back to the pattern instead.
         shown = patientFileName if patientFileName is not None else str(self.filePattern)
         self.label.setText("Warning: non-existent data file\n"
                            + self.directory + "/" + shown
                            + "\nor empty corpus")
Example #7
0
def tfidf_keywords(
        corpus: Corpus,
        progress_callback: Callable = None) -> List[List[Tuple[str, float]]]:
    """
    Extract keywords for each document using TF-IDF.

    Parameters
    ----------
    corpus : Corpus
        Documents to extract keywords from.
    progress_callback : callable
        Function for reporting progress.

    Returns
    -------
    keywords : list
        One list of (word, score) pairs per document.
    """
    if progress_callback is None:
        progress_callback = dummy_callback

    # Empty X part - so that every feature of X is bag-of-words.
    domain = Domain([],
                    class_vars=corpus.domain.class_vars,
                    metas=corpus.domain.metas)
    corpus = corpus.from_table(domain, corpus)

    vectorizer = BowVectorizer(
        wlocal=BowVectorizer.COUNT,
        # IDF is meaningless for a single document.
        wglobal=BowVectorizer.IDF if len(corpus) > 1 else BowVectorizer.NONE,
        norm=BowVectorizer.L2,
    )
    res = vectorizer.transform(corpus)
    X, words = res.X, [a.name for a in res.domain.attributes]

    keywords = []
    n_docs = X.shape[0]
    for i, row in enumerate(X):
        progress_callback(i / n_docs)
        # For a sparse row, nonzero() is a (rows, cols) 2-tuple and cols
        # holds the indices of words present in this document; a dense
        # 1-D row yields a 1-tuple and falls into the empty branch.
        nonzero = row.nonzero()
        if len(nonzero) > 1:
            # Note: loop variable renamed (original shadowed outer `i`).
            keywords.append([(words[j], row[0, j]) for j in nonzero[1]])
        else:
            keywords.append([])
    return keywords
Example #8
0
def _embedding_similarity(
    corpus: Corpus,
    words: List[str],
    callback: Callable,
    embedding_language: str,
) -> np.ndarray:
    """Cosine similarities between document embeddings and word embeddings."""
    language = LANGS_TO_ISO[embedding_language]
    # Drop all attributes so only the embeddings end up in X.
    corpus = Corpus.from_table(Domain([], metas=corpus.domain.metas), corpus)
    embedder = DocumentEmbedder(language)

    # Split the progress range proportionally between documents and words.
    doc_share = len(corpus) / (len(corpus) + len(words))
    document_embeddings, skipped = embedder.transform(
        corpus, wrap_callback(callback, 0, doc_share))
    assert skipped is None

    word_lists = [[w] for w in words]
    word_embeddings = np.array(
        embedder.transform(
            word_lists, wrap_callback(callback, doc_share, 1 - doc_share)))
    return cosine_similarity(document_embeddings.X, word_embeddings)
Example #9
0
def _embedding_similarity(
    corpus: Corpus,
    words: List[str],
    callback: Callable,
    embedding_language: str,
) -> np.ndarray:
    """Cosine similarities between document embeddings and word embeddings."""
    # One tick per embedded item; progress is capped at 0.8 here.
    ticks = iter(np.linspace(0, 0.8, len(corpus) + len(words)))

    # TODO: the embedder currently reports success flags; unify it to
    # report a progress float instead.
    def report_progress(success: bool):
        if success:
            callback(next(ticks))

    language = LANGS_TO_ISO[embedding_language]
    # Drop all attributes so only the embeddings end up in X.
    corpus = Corpus.from_table(Domain([], metas=corpus.domain.metas), corpus)
    embedder = DocumentEmbedder(language)
    document_embeddings, skipped = embedder(corpus, report_progress)
    assert skipped is None
    word_embeddings = np.array(
        embedder([[w] for w in words], report_progress))
    return cosine_similarity(document_embeddings.X, word_embeddings)
Example #10
0
 def convert(self, table):
     """Send *table* to the corpus output as a Corpus; do nothing if empty."""
     if not table:
         return
     self.Outputs.corpus.send(Corpus.from_table(table.domain, table))