from authordetect import Author, Tokenizer


def get_documents(corpus_and_labels, part_size: int = None):
    """Build a list of labeled documents from one or more corpora."""
    if isinstance(corpus_and_labels, str):
        corpus_and_labels = [(corpus_and_labels, None)]
    docs = []
    for corpus, label in corpus_and_labels:
        author = Author(corpus, label)
        author.preprocess(Tokenizer(lemmatizer='wordnet'))
        author.partition_into_documents(part_size)
        for doc in author.parsed_documents:
            words = doc.get_tokens()
            docs.append({
                'label': author.label,
                'text': words.substitute(author.text),
            })
    return docs
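# Usage sketch (not part of the original example): build labeled documents
# from two of the corpora used elsewhere on this page. The part_size value
# below is an arbitrary illustration, not taken from the original code.
labeled_docs = get_documents(
    [('../data/Doyle_90.txt', 1), ('../data/Rinehart_90.txt', 0)],
    part_size=350,
)
print('Total labeled documents:', len(labeled_docs))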
doyle = Author(doyle_infile)
rinehart = Author(rinehart_infile)
christie = Author(christie_infile)
t.toc()
print('Doyle corpus characters:', len(doyle.corpus))
print('Rinehart corpus characters:', len(rinehart.corpus))
print('Christie corpus characters:', len(christie.corpus))

# Names and object handles so the same operations can be applied to each author in a loop
names = ['Doyle', 'Rinehart', 'Christie']
authors = [doyle, rinehart, christie]

for name, author in zip(names, authors):
    t.tic(f'{name}: writer2vec')
    author.writer2vec(
        tokenizer=Tokenizer(),
        stopwords=Tokenizer.STOPWORDS,
        part_size=part_size,
        workers=workers,
        seed=seed,
        use_norm=True,
    )
    t.toc()
for name, author in zip(names, authors):
    print(f'{name} corpus sentences:', len(author.sentences))
    print(f'{name} corpus tokens:', len(author.words))
    print(f'{name} corpus vocabulary:', len(author.parsed.vocabulary))
    print(f'{name} documents:', len(author.docs))
    print(f'{name} document tokens:', author.docs[0].size)
    print(f'{name} embedding vocabulary:', len(author.model.vocabulary))
    print(f'{name} embedding matrix:', author.model.vectors.shape)
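# Sketch (not part of the original loop): compare embedding vocabularies
# across the three authors, assuming author.model.vocabulary is an iterable
# of token strings.
vocabs = [set(author.model.vocabulary) for author in authors]
shared_tokens = set.intersection(*vocabs)
print('Tokens shared by all three embedding vocabularies:', len(shared_tokens))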
test_size = 0.1
train_outfile = 'Doyle_90.txt'
test_outfile = 'Doyle_10.txt'

##############
# Processing #
##############
t = SmartTimer('10/90 Split')

t.tic('Load corpus')
a = Author(infile)
t.toc()
print('Corpus characters:', len(a.corpus))

t.tic('Preprocessing: Tokenizer')
a.preprocess(Tokenizer(lemmatizer=None))
t.toc()
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))

t.tic('Document partitioning')
a.partition_into_docs(part_size, remain_factor)
t.toc()
print('Documents:', len(a.docs))
print('Document tokens:', a.docs[0].size)

t.tic('Train/test splits')
train_docs, test_docs = trainutils.split_data_into_train_test(
    a.docs,
    test_size=test_size,
    random_state=random_state,
)
t.toc()
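# Equivalent split as a sketch using scikit-learn directly (not part of the
# original example); assumes a.docs is a plain sequence of documents.
from sklearn.model_selection import train_test_split
sk_train_docs, sk_test_docs = train_test_split(
    a.docs, test_size=test_size, random_state=random_state)
print('Train documents:', len(sk_train_docs))
print('Test documents:', len(sk_test_docs))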
Example #4
    if len(sys.argv) != 4:
        print(f'Usage: {sys.argv[0]} lang infile outfile')
        print('lang (str): uk, us')
        print('infile (str): JSON file')
        print('outfile (str): JSON file')
        sys.exit()

    lang, infile, outfile = sys.argv[1:]
    print('Input file:', infile)
    print('Output file:', outfile)

    # Generate list of documents
    docs = load_json(infile)
    print('Total documents:', len(docs))

    total_word_count = 0
    total_repl_count = 0
    perturb_freq_map = {}
    for i, doc in enumerate(docs):
        perturbed_text, repl_count = translate(doc['text'], lang)
        author = Author(perturbed_text)
        author.preprocess(Tokenizer(lemmatizer='wordnet'))
        perturb_freq_map[i] = repl_count / len(author.words)

        total_repl_count += repl_count
        total_word_count += len(author.words)

    print('Perturbation ratio:', total_repl_count / total_word_count)
    print('Total replacement count:', total_repl_count)
    print('Total word count:', total_word_count)
    save_json(perturb_freq_map, outfile)
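    # Follow-up sketch (not part of the original script): re-read the saved
    # frequencies with the standard json module and report the document with
    # the highest perturbation ratio. Assumes save_json wrote plain JSON.
    import json
    with open(outfile) as fd:
        freq_map = json.load(fd)
    most_perturbed = max(freq_map, key=freq_map.get)
    print('Most perturbed document:', most_perturbed,
          'with ratio:', freq_map[most_perturbed])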
Example #5
######################
# User Configuration #
######################
infile = '../data/Doyle_10.txt'
workers = 1
seed = 0

##############
# Processing #
##############
# Load corpus
a = Author(infile)
print('Corpus characters:', len(a.corpus))

# Sentence segmentation and tokenization
a.preprocess(Tokenizer())
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))
print('Corpus vocabulary:', len(a.parsed.vocabulary))

# Create an author's word2vec embedding model
a.embed(workers=workers, seed=seed)
print('Embedding vocabulary:', len(a.model.vocabulary))
print('Embedding matrix:', a.model.vectors.shape)

# Access the embedding matrix
a.model.vectors
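
# Sketch (not in the original example): basic inspection of the embedding
# matrix with NumPy, assuming a.model.vectors is a 2-D NumPy array as the
# .shape access above suggests.
import numpy as np

vectors = np.asarray(a.model.vectors)
print('Vector dimensionality:', vectors.shape[1])
# Cosine similarity between the first two word vectors
v1, v2 = vectors[0], vectors[1]
cosine = float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
print('Cosine similarity of first two vectors:', cosine)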

####################################
# Accessing Vectors and Vocabulary #
####################################
import itertools
import os

from authordetect import Author, Tokenizer, SmartTimer, save_pickle
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.neural_network import MLPClassifier
from typing import Any, Dict, Tuple, Union, Iterable

# NOTE: Set PYTHONHASHSEED to constant value to have deterministic hashing
# across Python interpreter processes.
os.environ['PYTHONHASHSEED'] = str(0)

######################
# User Configuration #
######################
verbose = True
seed = 0  # int, None
tokenizer = Tokenizer(min_token_length=1, use_stopwords=False)
stopwords = Tokenizer.STOPWORDS
mlp_file = 'mlp.pkl'

train_data = [
    '../data/Doyle_90.txt',
    '../data/Rinehart_90.txt',
    '../data/Christie_90.txt',
]
train_labels = [1, 0, 0]

writer2vec_params = [
    {
        'verbose': verbose,

        # Preprocess