from authordetect import Author, Tokenizer


def get_documents(corpus_and_labels, part_size: int = None):
    """Build a list of labeled documents from one or more corpora."""
    if isinstance(corpus_and_labels, str):
        corpus_and_labels = [(corpus_and_labels, None)]
    docs = []
    for corpus, label in corpus_and_labels:
        author = Author(corpus, label)
        author.preprocess(Tokenizer(lemmatizer='wordnet'))
        author.partition_into_documents(part_size)
        for doc in author.parsed_documents:
            words = doc.get_tokens()
            docs.append({
                'label': author.label,
                'text': words.substitute(author.text),
            })
    return docs
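# Usage sketch (not part of the original example): build labeled documents
# from two of the corpora used elsewhere on this page. The part_size value
# below is an arbitrary illustration, not taken from the original code.
labeled_docs = get_documents(
    [('../data/Doyle_90.txt', 1), ('../data/Rinehart_90.txt', 0)],
    part_size=350,
)
print('Total labeled documents:', len(labeled_docs))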
doyle = Author(doyle_infile)
rinehart = Author(rinehart_infile)
christie = Author(christie_infile)
t.toc()
print('Doyle corpus characters:', len(doyle.corpus))
print('Rinehart corpus characters:', len(rinehart.corpus))
print('Christie corpus characters:', len(christie.corpus))

# Names and object handles so the same operations can be applied to each author in a loop
names = ['Doyle', 'Rinehart', 'Christie']
authors = [doyle, rinehart, christie]

for name, author in zip(names, authors):
    t.tic(f'{name}: writer2vec')
    author.writer2vec(
        tokenizer=Tokenizer(),
        stopwords=Tokenizer.STOPWORDS,
        part_size=part_size,
        workers=workers,
        seed=seed,
        use_norm=True,
    )
    t.toc()
for name, author in zip(names, authors):
    print(f'{name} corpus sentences:', len(author.sentences))
    print(f'{name} corpus tokens:', len(author.words))
    print(f'{name} corpus vocabulary:', len(author.parsed.vocabulary))
    print(f'{name} documents:', len(author.docs))
    print(f'{name} document tokens:', author.docs[0].size)
    print(f'{name} embedding vocabulary:', len(author.model.vocabulary))
    print(f'{name} embedding matrix:', author.model.vectors.shape)
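# Sketch (not part of the original loop): compare embedding vocabularies
# across the three authors, assuming author.model.vocabulary is an iterable
# of token strings.
vocabs = [set(author.model.vocabulary) for author in authors]
shared_tokens = set.intersection(*vocabs)
print('Tokens shared by all three embedding vocabularies:', len(shared_tokens))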
test_size = 0.1
train_outfile = 'Doyle_90.txt'
test_outfile = 'Doyle_10.txt'

##############
# Processing #
##############
t = SmartTimer('10/90 Split')

t.tic('Load corpus')
a = Author(infile)
t.toc()
print('Corpus characters:', len(a.corpus))

t.tic('Preprocessing: Tokenizer')
a.preprocess(Tokenizer(lemmatizer=None))
t.toc()
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))

t.tic('Document partitioning')
a.partition_into_docs(part_size, remain_factor)
t.toc()
print('Documents:', len(a.docs))
print('Document tokens:', a.docs[0].size)

t.tic('Train/test splits')
train_docs, test_docs = trainutils.split_data_into_train_test(
    a.docs,
    test_size=test_size,
    random_state=random_state,
)
t.toc()
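# Equivalent split as a sketch using scikit-learn directly (not part of the
# original example); assumes a.docs is a plain sequence of documents.
from sklearn.model_selection import train_test_split
sk_train_docs, sk_test_docs = train_test_split(
    a.docs, test_size=test_size, random_state=random_state)
print('Train documents:', len(sk_train_docs))
print('Test documents:', len(sk_test_docs))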
Example #4
    if len(sys.argv) != 4:
        print(f'Usage: {sys.argv[0]} lang infile outfile')
        print('lang (str): uk, us')
        print('infile (str): JSON file')
        print('outfile (str): JSON file')
        sys.exit()

    lang, infile, outfile = sys.argv[1:]
    print('Input file:', infile)
    print('Output file:', outfile)

    # Generate list of documents
    docs = load_json(infile)
    print('Total documents:', len(docs))

    total_word_count = 0
    total_repl_count = 0
    perturb_freq_map = {}
    for i, doc in enumerate(docs):
        perturbed_text, repl_count = translate(doc['text'], lang)
        author = Author(perturbed_text)
        author.preprocess(Tokenizer(lemmatizer='wordnet'))
        perturb_freq_map[i] = repl_count / len(author.words)

        total_repl_count += repl_count
        total_word_count += len(author.words)

    print('Perturbation ratio:', total_repl_count / total_word_count)
    print('Total replacement count:', total_repl_count)
    print('Total word count:', total_word_count)
    save_json(perturb_freq_map, outfile)
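    # Follow-up sketch (not part of the original script): re-read the saved
    # frequencies with the standard json module and report the document with
    # the highest perturbation ratio. Assumes save_json wrote plain JSON.
    import json
    with open(outfile) as fd:
        freq_map = json.load(fd)
    most_perturbed = max(freq_map, key=freq_map.get)
    print('Most perturbed document:', most_perturbed,
          'with ratio:', freq_map[most_perturbed])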
Example #5
######################
# User Configuration #
######################
infile = '../data/Doyle_10.txt'
workers = 1
seed = 0

##############
# Processing #
##############
# Load corpus
a = Author(infile)
print('Corpus characters:', len(a.corpus))

# Sentence segmentation and tokenization
a.preprocess(Tokenizer())
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))
print('Corpus vocabulary:', len(a.parsed.vocabulary))

# Create an author's word2vec embedding model
a.embed(workers=workers, seed=seed)
print('Embedding vocabulary:', len(a.model.vocabulary))
print('Embedding matrix:', a.model.vectors.shape)

# Access the embedding matrix
a.model.vectors
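
# Sketch (not in the original example): basic inspection of the embedding
# matrix with NumPy, assuming a.model.vectors is a 2-D NumPy array as the
# .shape access above suggests.
import numpy as np

vectors = np.asarray(a.model.vectors)
print('Vector dimensionality:', vectors.shape[1])
# Cosine similarity between the first two word vectors
v1, v2 = vectors[0], vectors[1]
cosine = float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
print('Cosine similarity of first two vectors:', cosine)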

####################################
# Accessing Vectors and Vocabulary #
####################################
import itertools
import os

from authordetect import Author, Tokenizer, SmartTimer, save_pickle
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.neural_network import MLPClassifier
from typing import Any, Dict, Tuple, Union, Iterable

# NOTE: Set PYTHONHASHSEED to constant value to have deterministic hashing
# across Python interpreter processes.
os.environ['PYTHONHASHSEED'] = str(0)

######################
# User Configuration #
######################
verbose = True
seed = 0  # int, None
tokenizer = Tokenizer(min_token_length=1, use_stopwords=False)
stopwords = Tokenizer.STOPWORDS
mlp_file = 'mlp.pkl'

train_data = [
    '../data/Doyle_90.txt',
    '../data/Rinehart_90.txt',
    '../data/Christie_90.txt',
]
train_labels = [1, 0, 0]

writer2vec_params = [
    {
        'verbose': verbose,

        # Preprocess