from authordetect import Author, Tokenizer


def get_documents(corpus_and_labels, part_size: int = None):
    # Allow a single corpus string as shorthand for one unlabeled corpus
    if isinstance(corpus_and_labels, str):
        corpus_and_labels = [(corpus_and_labels, None)]

    docs = []
    for corpus, label in corpus_and_labels:
        author = Author(corpus, label)
        author.preprocess(Tokenizer(lemmatizer='wordnet'))
        author.partition_into_documents(part_size)
        # Collect each partitioned document as a labeled text record
        for doc in author.parsed_documents:
            words = doc.get_tokens()
            docs.append({
                'label': author.label,
                'text': words.substitute(author.text),
            })
    return docs
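# Hedged usage sketch for get_documents(): the data paths mirror those used in
# the training configuration below, while the labels and part_size value are
# illustrative assumptions, not the project's actual settings.
with open('../data/Doyle_90.txt') as fd:
    doyle_text = fd.read()
with open('../data/Rinehart_90.txt') as fd:
    rinehart_text = fd.read()

labeled_docs = get_documents(
    [(doyle_text, 1), (rinehart_text, 0)],  # (corpus, label) pairs
    part_size=350,                           # assumed partition size
)
print('Total documents:', len(labeled_docs))
print('First document label:', labeled_docs[0]['label'])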
doyle = Author(doyle_infile)
rinehart = Author(rinehart_infile)
christie = Author(christie_infile)
t.toc()
print('Doyle corpus characters:', len(doyle.corpus))
print('Rinehart corpus characters:', len(rinehart.corpus))
print('Christie corpus characters:', len(christie.corpus))

# Names and object handles to enable looping through the same operations
names = ['Doyle', 'Rinehart', 'Christie']
authors = [doyle, rinehart, christie]

for name, author in zip(names, authors):
    t.tic(f'{name}: writer2vec')
    author.writer2vec(
        tokenizer=Tokenizer(),
        stopwords=Tokenizer.STOPWORDS,
        part_size=part_size,
        workers=workers,
        seed=seed,
        use_norm=True,
    )
    t.toc()

for name, author in zip(names, authors):
    print(f'{name} corpus sentences:', len(author.sentences))
    print(f'{name} corpus tokens:', len(author.words))
    print(f'{name} corpus vocabulary:', len(author.parsed.vocabulary))
    print(f'{name} documents:', len(author.docs))
    print(f'{name} document tokens:', author.docs[0].size)
    print(f'{name} embedding vocabulary:', len(author.model.vocabulary))
    print(f'{name} embedding matrix:', author.model.vectors.shape)
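# Hedged follow-up sketch: compare embedding vocabularies across authors.
# This assumes each author.model.vocabulary is an iterable of word strings;
# the statements above only show that it supports len().
import itertools

vocabs = {name: set(author.model.vocabulary)
          for name, author in zip(names, authors)}
for (name_a, vocab_a), (name_b, vocab_b) in itertools.combinations(vocabs.items(), 2):
    print(f'{name_a}/{name_b} shared vocabulary:', len(vocab_a & vocab_b))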
test_size = 0.1
train_outfile = 'Doyle_90.txt'
test_outfile = 'Doyle_10.txt'

##############
# Processing #
##############

t = SmartTimer('10/90 Split')

t.tic('Load corpus')
a = Author(infile)
t.toc()
print('Corpus characters:', len(a.corpus))

t.tic('Preprocessing: Tokenizer')
a.preprocess(Tokenizer(lemmatizer=None))
t.toc()
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))

t.tic('Document partitioning')
a.partition_into_docs(part_size, remain_factor)
t.toc()
print('Documents:', len(a.docs))
print('Document tokens:', a.docs[0].size)

t.tic('Train/test splits')
train_docs, test_docs = trainutils.split_data_into_train_test(
    a.docs,
    test_size=test_size,
    random_state=random_state,
)
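# Hedged alternative sketch: if trainutils is unavailable, a comparable split
# can be made with scikit-learn. This assumes a.docs behaves as a plain
# sequence; it is not the project's split_data_into_train_test implementation.
from sklearn.model_selection import train_test_split

train_docs, test_docs = train_test_split(
    list(a.docs),
    test_size=test_size,
    random_state=random_state,
)
print('Train documents:', len(train_docs))
print('Test documents:', len(test_docs))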
# Validate command-line arguments before unpacking them
if len(sys.argv) != 4:
    print(f'Usage: {sys.argv[0]} lang infile outfile')
    print('lang (str): uk, us')
    print('infile (str): JSON file')
    print('outfile (str): JSON file')
    sys.exit()

lang, infile, outfile = sys.argv[1:]
print('Input file:', infile)
print('Output file:', outfile)

# Generate list of documents
docs = load_json(infile)
print('Total documents:', len(docs))

total_word_count = 0
total_repl_count = 0
perturb_freq_map = {}
for i, doc in enumerate(docs):
    # Perturb the document text, then tokenize it to get its word count
    perturbed_text, repl_count = translate(doc['text'], lang)
    author = Author(perturbed_text)
    author.preprocess(Tokenizer(lemmatizer='wordnet'))
    perturb_freq_map[i] = repl_count / len(author.words)
    total_repl_count += repl_count
    total_word_count += len(author.words)

print('Perturbation ratio:', total_repl_count / total_word_count)
print('Total replacement count:', total_repl_count)
print('Total word count:', total_word_count)

save_json(perturb_freq_map, outfile)
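# Hedged sketch of a translate() helper compatible with the call above: it
# returns the perturbed text and the number of replacements made. The UK/US
# spelling map is a tiny illustrative sample, not the project's actual
# replacement dictionary.
import re

_UK_TO_US = {'colour': 'color', 'honour': 'honor', 'analyse': 'analyze'}


def translate(text: str, lang: str):
    """Replace spellings toward the target dialect; return (text, count)."""
    mapping = _UK_TO_US if lang == 'us' else {v: k for k, v in _UK_TO_US.items()}
    repl_count = 0
    for src, dst in mapping.items():
        text, n = re.subn(rf'\b{src}\b', dst, text)
        repl_count += n
    return text, repl_count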
######################
# User Configuration #
######################

infile = '../data/Doyle_10.txt'
workers = 1
seed = 0

##############
# Processing #
##############

# Load corpus
a = Author(infile)
print('Corpus characters:', len(a.corpus))

# Sentence segmentation and tokenization
a.preprocess(Tokenizer())
print('Corpus sentences:', len(a.sentences))
print('Corpus tokens:', len(a.words))
print('Corpus vocabulary:', len(a.parsed.vocabulary))

# Create an author's word2vec embedding model
a.embed(workers=workers, seed=seed)
print('Embedding vocabulary:', len(a.model.vocabulary))
print('Embedding matrix:', a.model.vectors.shape)

# Access the embedding matrix
a.model.vectors

####################################
# Accessing Vectors and Vocabulary #
####################################
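# Hedged sketch for the section above: listing vocabulary terms and reading a
# single embedding vector. The statements earlier only show len() on the
# vocabulary and the shape of the vectors matrix; iterating the vocabulary and
# assuming row i of a.model.vectors corresponds to vocabulary term i are
# assumptions for illustration.
vocab = list(a.model.vocabulary)
print('First 10 vocabulary terms:', vocab[:10])

first_vector = a.model.vectors[0]  # assumed: row 0 pairs with vocab[0]
print('Vector dimensionality:', first_vector.shape)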
import os
import itertools
from typing import Any, Dict, Tuple, Union, Iterable

from authordetect import Author, Tokenizer, SmartTimer, save_pickle
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.neural_network import MLPClassifier

# NOTE: Set PYTHONHASHSEED to a constant value to have deterministic hashing
# across Python interpreter processes.
os.environ['PYTHONHASHSEED'] = str(0)

######################
# User Configuration #
######################

verbose = True
seed = 0  # int, None
tokenizer = Tokenizer(min_token_length=1, use_stopwords=False)
stopwords = Tokenizer.STOPWORDS
mlp_file = 'mlp.pkl'

train_data = [
    '../data/Doyle_90.txt',
    '../data/Rinehart_90.txt',
    '../data/Christie_90.txt',
]
train_labels = [1, 0, 0]

writer2vec_params = [
    {
        'verbose': verbose,
        # Preprocess
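# Hedged sketch of the downstream training step this configuration feeds into:
# fitting the MLPClassifier and pickling it to mlp_file. The document vectors
# and labels below are placeholders (random data), standing in for whatever
# fixed-length embeddings the writer2vec pipeline produces; this is not the
# project's actual training code.
import pickle
import numpy as np
from sklearn.neural_network import MLPClassifier

rng = np.random.default_rng(seed)
doc_vectors = rng.normal(size=(30, 50))   # placeholder document embeddings
doc_labels = [1] * 10 + [0] * 20          # placeholder labels (Doyle vs. others)

mlp = MLPClassifier(hidden_layer_sizes=(50,), random_state=seed, max_iter=500)
mlp.fit(doc_vectors, doc_labels)

with open(mlp_file, 'wb') as fd:          # mlp_file = 'mlp.pkl' from the config
    pickle.dump(mlp, fd)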