def preclean_xlsxParagraph(text_list):
    """Clean and tokenize a list of text entries.

    Parameters
    ----------
    text_list : [str]
        Raw text entries (e.g. paragraphs read from an xlsx file).

    Returns
    -------
    [str]
        One space-joined string of cleaned tokens per input entry.
    """
    preprocessor = TextPreprocessor()
    return [
        ' '.join(preprocessor.tokenize_text(preprocessor.clean_sentence(entry)))
        for entry in text_list
    ]
def extract_dictionary(document_paths, max_words):
    """
    Extracts a gensim Dictionary object from a set of documents.

    Parameters
    ----------
    document_paths : [str]
        List of document paths that make up the corpus.
    max_words : int
        Maximum vocabulary size; whenever the dictionary grows past this
        budget it is filtered down to roughly 90% of it.

    Returns
    -------
    dictionary : gensim.corpora.Dictionary
        Extracted dictionary (or tokenizer).
    """
    print("Extracting dictionary from corpus")
    # prune_at=None disables gensim's automatic pruning; we prune manually
    # below so the keep_n budget is under our control.
    dictionary = Dictionary(prune_at=None)
    preprocessor = TextPreprocessor()
    for document_path in tqdm(document_paths):
        with open(document_path, "r") as f:
            document = f.read()
        document = preprocessor.clean_sentence(document, alphabetic_only=True)
        words = preprocessor.tokenize_text(document)
        dictionary.add_documents([words])
        # Periodically prune to keep memory bounded while streaming the corpus.
        if len(dictionary) > max_words:
            start = time()
            dictionary.filter_extremes(no_below=10,
                                       no_above=0.5,
                                       keep_n=int(max_words * 0.9))
            print("Dictionary filtered in {} seconds".format(time() - start))
    return dictionary
class BM25Engine(SearchEngine):
    """BM25-ranked search engine over a bag-of-words corpus."""

    # Path templates for the persisted artifacts of a processed dataset.
    tokenizer_file = "data/processed/{id}/tokenizer"
    corpus_file = "data/processed/{id}/corpus"
    doc_idxs_file = "data/processed/{id}/doc_idxs.json"

    def __init__(self, tokenizer, corpus, idxs2id):
        """
        Parameters
        ----------
        tokenizer : gensim.corpora.Dictionary
            Word tokenizer.
        corpus : gensim.corpora.mmcorpus.MmCorpus
            Bag-of-words formatted corpus of documents.
        idxs2id : dict
            Maps a corpus row index (as a str key) to an external
            document id.
        """
        self.preprocessor = TextPreprocessor()
        self.tokenizer = tokenizer
        self.corpus = corpus
        self.internal_engine = BM25(self.corpus)
        self.idxs2id = idxs2id
        print("BM25 engine loaded")

    def top_k_matches(self, query, k):
        """Return the ids of the k documents best matching `query`,
        ranked by descending BM25 score."""
        clean = self.preprocessor.clean_sentence(query, alphabetic_only=True)
        word_list = self.preprocessor.tokenize_text(clean)
        bow_representation = self.tokenizer.doc2bow(word_list)
        scores = self.internal_engine.get_scores(bow_representation)
        # argsort is ascending; reverse for highest-score-first.
        top_k_idxs = np.argsort(scores)[::-1][:k]
        return [self.idxs2id[str(idx)] for idx in top_k_idxs]

    # BUG FIX: `load` takes no `self`, so without @staticmethod it breaks
    # when called on an instance. The parameter name `id` (shadowing the
    # builtin) is kept for backward compatibility with keyword callers.
    @staticmethod
    def load(id):
        """Load a persisted BM25Engine for dataset `id` from disk."""
        tokenizer = Dictionary.load(BM25Engine.tokenizer_file.format(id=id))
        corpus = MmCorpus(BM25Engine.corpus_file.format(id=id))
        with open(BM25Engine.doc_idxs_file.format(id=id), "r") as f:
            idxs2id = json.load(f)
        return BM25Engine(tokenizer, corpus, idxs2id)
class BoWHighlighter(SegmentHighlighter):
    """Highlights the paragraphs of a document whose bag-of-words cosine
    similarity with the query exceeds a precision threshold."""

    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.counter = CountVectorizer()

    def highlight(self, document, query, precision):
        """Return (highlights, scores): the paragraphs of `document`
        whose cosine similarity with `query` is above `precision`,
        together with their similarity scores.

        Parameters
        ----------
        document : str
            Raw document text; paragraphs are assumed to be separated
            by blank-line gaps.
        query : str
            Free-text query.
        precision : float
            Similarity threshold in [0, 1]; only paragraphs scoring
            strictly above it are returned.
        """
        clean_query = self.preprocessor.clean_sentence(query)
        word_list = self.preprocessor.tokenize_text(clean_query)
        # Re-join tokens so query and paragraphs share one text format.
        query_mod = ' '.join(word_list)

        # Normalize whitespace; blank-line gaps become the " \n \n "
        # paragraph separator split on below. Raw strings fix the invalid
        # escape sequences of the original patterns.
        document = re.sub(r" +", " ", document)
        document = re.sub(r"\n\s+", " \n \n ", document)
        document = re.sub(r"\n+", "\n", document)
        document = document.strip()

        highlights = []
        scores = []
        for paragraph in document.split(" \n \n "):
            if len(paragraph) == 0:
                continue
            clean_paragraph = self.preprocessor.clean_sentence(paragraph)
            corpus = self.preprocessor.tokenize_text(clean_paragraph)
            paragraph_mod = ' '.join(corpus)
            vectorizer = self.counter.fit([query_mod, paragraph_mod])
            vectors = [
                vec for vec in vectorizer.transform(
                    [query_mod, paragraph_mod]).toarray()
            ]
            norm_vec_query = np.linalg.norm(vectors[0])
            norm_vec_paragraph = np.linalg.norm(vectors[1])
            # Skip zero vectors: avoids a division-by-zero / nan score
            # (the original only guarded the paragraph norm; a nan never
            # exceeded the threshold anyway, so the result is unchanged).
            if norm_vec_query == 0 or norm_vec_paragraph == 0:
                continue
            cosine_similarity = np.dot(vectors[0], vectors[1]) / (
                norm_vec_query * norm_vec_paragraph)
            if cosine_similarity > precision:
                print("The cosine similary is: ", cosine_similarity,
                      paragraph, "\n \n")
                highlights.append(paragraph)
                scores.append(cosine_similarity)
        return highlights, scores

    # BUG FIX: decorated as @staticmethod so it also works when invoked
    # on an instance; matches the `load(id)` factory interface of the
    # other engines.
    @staticmethod
    def load(id):
        """Factory: `id` is accepted for interface parity but unused."""
        return BoWHighlighter()
def preclean_entireDoc(text):
    """Clean an entire document and return its tokens as one
    space-separated string.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Cleaned, tokenized text re-joined with single spaces.
    """
    preprocessor = TextPreprocessor()
    cleaned = preprocessor.clean_sentence(text)
    tokens = preprocessor.tokenize_text(cleaned)
    return ' '.join(tokens)
class IterativeCorpusBuilder():
    """Iterator that streams bag-of-words vectors, one per document,
    building its own dictionary (tokenizer) from the corpus first."""

    def __init__(self, document_paths, max_words):
        """
        Parameters
        ----------
        document_paths : [str]
            Paths of the documents forming the corpus.
        max_words : int
            Maximum vocabulary size for the extracted dictionary.
        """
        self.tokenizer = IterativeCorpusBuilder.extract_dictionary(
            document_paths, max_words=max_words)
        self.document_paths = iter(document_paths)
        self.preprocessor = TextPreprocessor()
        self.clock = time()
        self.iterations = 0
        # Progress is printed every `inform_frequency` documents.
        self.inform_frequency = 1000
        # BUG FIX: was hard-coded to 2 * 10**6, silently ignoring the
        # `max_words` constructor argument.
        self.max_words = max_words

    def __next__(self):
        """Read the next document and return its bag-of-words vector.

        Raises StopIteration (propagated from the exhausted path
        iterator) when all documents have been consumed.
        """
        with open(next(self.document_paths), "r") as f:
            document = f.read()
        document = self.preprocessor.clean_sentence(document,
                                                    alphabetic_only=True)
        words = self.preprocessor.tokenize_text(document)
        bow_representation = self.tokenizer.doc2bow(words)
        # Inform progress as specified
        self.iterations += 1
        if self.iterations % self.inform_frequency == 0:
            print("{} iterations took {} seconds. {} done.".format(
                self.inform_frequency, time() - self.clock, self.iterations))
            self.clock = time()
        return bow_representation

    def __iter__(self):
        print("Building corpus term-document matrices")
        return self

    # @staticmethod added: the function takes no `self`/`cls` and is
    # called via the class in __init__; the decorator also makes it safe
    # to call on an instance.
    @staticmethod
    def extract_dictionary(document_paths, max_words):
        """
        Extracts a gensim Dictionary object from a set of documents.

        Parameters
        ----------
        document_paths : [str]
            List of document paths that make up the corpus.
        max_words : int
            Maximum vocabulary size; whenever the dictionary grows past
            this budget it is filtered down to roughly 90% of it.

        Returns
        -------
        dictionary : gensim.corpora.Dictionary
            Extracted dictionary (or tokenizer).
        """
        print("Extracting dictionary from corpus")
        # prune_at=None disables gensim's automatic pruning; pruning is
        # done manually below so the keep_n budget stays under our control.
        dictionary = Dictionary(prune_at=None)
        preprocessor = TextPreprocessor()
        for document_path in tqdm(document_paths):
            with open(document_path, "r") as f:
                document = f.read()
            document = preprocessor.clean_sentence(document,
                                                   alphabetic_only=True)
            words = preprocessor.tokenize_text(document)
            dictionary.add_documents([words])
            # Periodically prune to keep memory bounded while streaming.
            if len(dictionary) > max_words:
                start = time()
                dictionary.filter_extremes(no_below=10,
                                           no_above=0.5,
                                           keep_n=int(max_words * 0.9))
                print("Dictionary filtered in {} seconds".format(
                    time() - start))
        return dictionary