def main():
    # Build a fill-in-the-blank (cloze) exercise from the Gettysburg Address:
    # POS-tag every token, keep only "content" parts of speech (numbers,
    # adjectives, nouns, adverbs), then randomly blank out roughly 1 in
    # `sparsity` of those words.
    # NOTE(review): relies on module-level names defined elsewhere in this
    # file: `tok` / `tag` (presumably nltk word_tokenize / pos_tag -- confirm),
    # `pd` (pandas) and `floor` (presumably math.floor).
    text = 'Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in ' \
        'Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great ' \
        'civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are ' \
        'met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final ' \
        'resting place for those who here gave their lives that that nation might live. It is altogether fitting and ' \
        'proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we ' \
        'can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far ' \
        'above our poor power to add or detract. The world will little note, nor long remember what we say here, but ' \
        'it can never forget what they did here. It is for us the living, rather, to be dedicated here to the ' \
        'unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here ' \
        'dedicated to the great task remaining before us -- that from these honored dead we take increased devotion ' \
        'to that cause for which they gave the last full measure of devotion -- that we here highly resolve that ' \
        'these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- ' \
        'and that government of the people, by the people, for the people, shall not perish from the earth.'
    # One row per token, with its Penn Treebank POS tag and a positional key
    # so the original word order can be recovered after the merge below.
    text_tok = tok(text)
    text_pos = pd.DataFrame(tag(text_tok), columns=['words', 'pos tags'])
    text_pos.insert(0, 'key', list(range(len(text_pos.index))))
    print(text_pos)
    # Lookup table: Penn Treebank tag -> human-readable name. Only these
    # content-word tags are eligible for blanking.
    repl_pos = {
        'pos tags': [
            'CD', 'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB',
            'RBR', 'RBS'
        ],
        'pos names': [
            'number', 'adjective', 'comparative adjective',
            'superlative adjective', 'singular noun', 'plural noun',
            'proper noun', 'plural proper noun', 'adverb',
            'comparative adverb', 'superlative adverb'
        ]
    }
    repl_pos = pd.DataFrame(data=repl_pos)
    print(repl_pos)
    # Inner merge keeps only tokens whose tag is in the table above.
    text_repl = pd.merge(text_pos, repl_pos, on='pos tags', how='inner')
    text_repl['key'] = text_repl['key'].astype(int)
    print(text_repl)
    # Blank out roughly 1 out of every `sparsity` eligible words.
    sparsity = 7
    repl_num = floor(len(text_repl.index) / sparsity)
    replace = text_repl.sample(n=repl_num)
    new_words = ['_____'] * repl_num
    replace.insert(4, 'new words', new_words)
    print(replace)
    # Accumulator for the reassembled text.
    # NOTE(review): the function appears to continue beyond this excerpt.
    output = ''
def returnNames(url): theurl = "https://raw.githubusercontent.com/robincamille/replacethechar/master/texts/biblekjv.txt" #raw_input("URL to .txt file: ") sourcefile = urllib2.urlopen(theurl) source = sourcefile.read() #Tokenize sourcetok = tok(source[:partition]) #Tag POS sourcetag = postag(sourcetok) #Outputs POS-tagged text sourcene = ne(sourcetag, binary=False) charsall = [] for n in sourcene: if type(n) == tree.Tree: if n.label() == 'PERSON': for m in n: charsall.append(m[0]) honorifics = ['Mr.', 'Mrs.', 'Ms.', 'Miss', 'Dr.', 'Prof.', 'Professor', 'Lord', 'Lady', 'Sir', 'Madam', 'Dame', 'Rev.', 'Rabbi'] charsallnames = [] for s in charsall: if s in honorifics: pass else: charsallnames.append(s) counted = (word for word in charsallnames if word[:1].isupper()) c = Counter(counted) charscommon = c.most_common(5) chars = [] for s in charscommon: chars.append(s[0]) print '\nMost common names:' print '\t'.join(chars) return chars,source
def my_form_post():
    """Handle the comparison-form POST.

    Compares the submitted message against the user's own writing sample
    (word/sentence length, vocabulary similarity, unusually frequent words)
    and against a background corpus via a classifier, then renders
    compare-output-simple.html with the collected stats and advice.

    NOTE(review): depends on module-level helpers defined elsewhere in the
    project (tok, changewords, sim, toponly, avgwordlength, avgsentlength,
    classifydocs) and globals from sources.py (bcfreqs, backgroundcorpusWL,
    backgroundcorpusSL, backgroundcorpus, filelist).
    """
    # Nondescript UI input page: left box, writing sample
    # same as output page: invisible
    corpus = request.form['corpus']
    # Nondescript UI input page: right box, message
    # Output page: 3 tabs at the bottom -- use whichever variant was selected
    if request.form['whichmessage'] == 'choosesuggestmessage':
        message = request.form['suggestmessage']
    if request.form['whichmessage'] == 'chooseluckymessage':
        message = request.form['luckymessage']
    if request.form['whichmessage'] == 'chooseorigmessage':
        message = request.form['origmessage']
    # Sample + message together form the full document to analyze.
    docraw = corpus + ' ' + message
    #Analyze writing overall
    #doc = docraw.split()
    doc = tok(docraw)
    printcompare = []  #things to print: style vs. all background documents
    printoverall = []  #things to print: overall style
    printunusualwords = []
    unusualwordsonly = []
    printclassify = []  #things to print: classifier output
    advice = []  #tips such as use shorter sentences
    #Document length
    #s.append('Document length: %d words' % len(doc))
    #Return message forms: synonym-suggestion, -replacement, original
    origmessage = message
    anonmessage = changewords(message)
    suggestmessage = anonmessage[0]  #includes synonym suggestions in parens
    luckymessage = anonmessage[1]  #randomly replaces some words with synonyms
    #Cosine similarity in vocabularies of 100, 1000, 10000 words
    printcompare.append('Similarity between this message and original writing sample: %.3f'
                        % (sim(toponly.top(corpus,10000),toponly.top(message,10000))[0,1]))
    # printcompare.append('Similarity between this message and original writing sample (10k words): %.3f'\
    # % (sim(toponly.top(corpus,10000),toponly.top(message,10000))[0,1]))
    # printcompare.append('Similarity between this message and original writing sample (1k words): %.3f' \
    # % (sim(toponly.top(corpus,1000),toponly.top(message,1000))[0,1]))
    # printcompare.append('Similarity between this message and original writing sample (100 words): %.3f' \
    # % (sim(toponly.top(corpus,100),toponly.top(message,100))[0,1]))
    #Average word lengths
    printcompare.append("Your message's word length is {:.2f}x \
your average".format(avgwordlength(message.split()) / avgwordlength(corpus.split())))
    #totwlavg = mean(totwl)
    # Ratio of this document's average word length to everyone else's.
    word_compare = avgwordlength(doc) / backgroundcorpusWL
    if word_compare > 1.2:
        advice.append("Try using shorter words.")
    elif word_compare < 0.9:
        advice.append("Try using longer words.")
    printoverall.append("Your overall word length is {:.2f}x \
everyone else's average.".format(word_compare))
    #Average sent lengths
    printcompare.append("Your message's sentence length is {:.2f}x \
your average".format(avgsentlength(message) / avgsentlength(corpus)))
    #totslavg = mean(totsl)
    sent_compare = avgsentlength(doc) / backgroundcorpusSL
    if sent_compare > 1.2:
        advice.append("Use shorter sentences.")
    elif sent_compare < 0.9:
        advice.append("Use longer sentences.")
    printoverall.append("Your overall sentence length is {:.2f}x \
everyone else's average.".format(sent_compare))
    advice.append(
        "Focus on changing the highlighted and red-underlined words.")
    #Top unusual words
    #Set up word frequency comparison
    # Each background-frequency line looks quoted; l[1:] / row[0][:-1]
    # presumably strip the surrounding quote characters -- confirm format.
    with open(bcfreqs) as infile:  #from sources.py
        allfreqraw = [l[1:] for l in infile]
    allfreq = {}
    for row in allfreqraw:
        row = row.split(',')
        allfreq[row[0][:-1]] = float(row[1])
    # with open('allfreq.csv','w') as allfreqfile:
    # allfreqfile.write(str(allfreq))
    doccount = defaultdict(int)
    docfreq = defaultdict(int)
    for word in doc:
        doccount[word.lower()] += 1  #term count
    for word in doccount:
        # NOTE(review): normalized by vocabulary size, not token count.
        docfreq[word] = doccount[word] / float(len(doccount))  #term frequency
    # with open('docfreq.csv','w') as docfreqfile:
    # docfreqfile.write(str(docfreq))
    #Compare word frequencies: [document frequency, background frequency]
    compfreq = defaultdict(list)
    for word in docfreq:
        if word in allfreq.keys():
            compfreq[word] = [docfreq[word], allfreq[word]]
        else:
            pass
    # Words used more than once that are over-represented vs. background.
    compwords = []
    for word in compfreq:
        if doccount[word] > 1:
            if compfreq[word][0] > compfreq[word][1]:
                # if compfreq[word][1] == 0:
                # v = compfreq[word][1] / minfreq #min freq from train/
                # else:
                v = compfreq[word][0] / float(compfreq[word][1])
                compwords.append([v, word, doccount[word]
                                  ])  #currently 0 words? fix this
            else:
                pass
        else:
            pass
    # Top 10 most over-represented words, most extreme first.
    compwordssort = sorted(compwords, reverse=True)
    for i in compwordssort[:10]:
        unusualwordsonly.append(i[1])
        printunusualwords.append(
            '{}: {:.1f}x more frequent (used {} times in sample and message)'.
            format(i[1], i[0], i[2]))
    unusualwordsonly = ' '.join(unusualwordsonly)
    # The important bit:
    #Compare to n random authors in background corpus
    #Run through classifier: train & test
    #backgroundcorpus directory & filelist .txt file specified
    #in sources.py
    classifieroutcome = [i for i in classifydocs(backgroundcorpus,
                                                 filelist,
                                                 docraw,
                                                 message,
                                                 1000)]  #vocab of n words
    #Output to send to compare-output-simple.html
    return render_template("compare-output-simple.html",
                           compareoverall = printoverall,
                           unusualwordsonly = unusualwordsonly,
                           unusualwords = printunusualwords,
                           advice = advice,
                           corpus = corpus,
                           repeatdoc = message,
                           suggestdoc = suggestmessage,
                           luckydoc = luckymessage,
                           origdoc = origmessage,
                           comparestats = printcompare,
                           classifieroutcome = classifieroutcome[0],
                           classifierscore = classifieroutcome[1])
##Caveats: first/last names and most honorifics not considered from nltk import tree from nltk import word_tokenize as tok from nltk import pos_tag as postag from nltk import ne_chunk as ne from collections import Counter infile = open('data_columns/nussbaum/nuss01.txt', 'r') # Put your filename here source = infile.read() source = source.decode('utf-8') infile.close() print 'Tokenizing' sourcetok = tok(source) print 'Tagging Part Of Speech (POS)...' sourcetag = postag(sourcetok) print 'Running POS-tagged text through Named Entity chunker...' sourcene = ne(sourcetag, binary=False) # Find just the Named Entities that we want charsall = [] for n in sourcene: if type(n) == tree.Tree: #if n.label() == 'PERSON': if n.node == 'PERSON': #Options: PERSON, ORGANIZATION, LOCATION for m in n:
def readbook(): bk = pglist[pickbook()] #pick #2000 from list number = bk[1] codetitle = bk[2] title = bk[3] intro = [] intro.append(foundit[randint(0, len(foundit) - 1)] % title) intro.append('\n\nYou flip to a random page and begin to read...\n\n') urll = 'https://raw.githubusercontent.com/GITenberg/' + codetitle + \ '/master/' + number + '.txt' guturl = 'http://www.gutenberg.org/ebooks/' + number time.sleep(3) # politeness req = urllib2.Request(urll) response = urllib2.urlopen(req) the_page = response.read() ex = the_page[10000:11000] if "Project Gutenberg" in ex: ex = the_page[30000:31000] elif "PROJECT GUTENBERG" in ex: ex = the_page[30000:31000] else: pass # stitch together line breaks ex = ex + '\n\r' #hacky way to make sure below splits happen ex = ex.split('\n') ex = ' '.join(ex) ex = ex.split('\r') ex = ' '.join(ex) # get rid of double spaces, brackets, and asterisky section breaks ex = ex + ' ' ex = ex.split(' ') exfin = [] for w in ex: if w == '': pass elif w == '*': pass elif w[0] == ' ': w = w[1:] exfin.append(w) elif w[0] == '[': w = w[1:] exfin.append(w) elif w[-1] == ']': w = w[:-1] exfin.append(w) else: exfin.append(w) ex = ' '.join(exfin) # split into sentences exs = tok(ex) # start with second sentence, end with second-to-last if exs[1][:2] == '" ': # skip initial quotation mark if any exs[1] = exs[1][2:] blurb = '> ... ' + (' '.join(exs[1:-1])) + ' ...' outtro = (leave[randint(0, len(leave) - 1)]) usedbooktitles.append(title) titlelink = '[' + title + '](' + guturl + ')' usedbooktitlesandlinks.append(titlelink) return ' '.join(intro)[1:], blurb, outtro
import urllib2 from nltk import tree from nltk import word_tokenize as tok from nltk import pos_tag as postag from nltk import ne_chunk as ne from nltk.corpus import gutenberg as gb from collections import Counter theurl = raw_input("URL to .txt file: ") sourcefile = urllib2.urlopen(theurl) source = sourcefile.read() # Tokenize sourcetok = tok(source) # Tag POS sourcetag = postag(sourcetok) # Outputs POS-tagged text sourcene = ne(sourcetag, binary=False) charsall = [] for n in sourcene: if type(n) == tree.Tree: if n.label() == "PERSON": for m in n: charsall.append(m[0]) honorifics = [
def prepare_for_bert(folder, number, filename, write_to_file=False, wiki_novel=False):
    """Prepare a novel (or its Wikipedia page) for BERT training.

    Produces a cleaned text file ('<filename>_bert'): one sentence per line,
    character aliases collapsed to a single name, punctuation and stray
    Gutenberg backspaces removed (via cleanup_novels).

    Args:
        folder: base directory of the novel's data.
        number: novel identifier used in file names.
        filename: path of the cleaned novel text (ignored and rebuilt when
            wiki_novel is True).
        write_to_file: forwarded to cleanup_novels.
        wiki_novel: process the original Wikipedia page instead of the novel.

    Returns:
        (novel_versions, current_char_list, genders_dict) -- for novels,
        novel_versions maps '<filename>_part_a'/'_part_b' to the two halves
        of the cleaned text; for wiki pages it is cleanup_novels' output
        unchanged. genders_dict is empty for wiki pages.
    """
    ####### STEP 1: setting up the folders' path variables
    book_nlp_output_folder = '{}/book_nlp_output'.format(folder)
    temp_folder = '{}/temp'.format(folder)  # NOTE(review): unused in this excerpt
    processed_novel_folder = '{}/processed_novel'.format(folder)  # NOTE(review): unused in this excerpt
    if not wiki_novel:
        filename = filename.replace('_clean.txt_', '_')
        ######## STEP 2: regenerate the character list from book-nlp output
        transform_characters_list(book_nlp_output_folder, folder, number)
    else:
        filename = '{}/original_wikipedia_page/{}.txt'.format(folder, number)
    ######## STEP 3: creating a list of characters from that file, by using a
    # function in nonce2vec.utils.novels_utilities
    char_list = get_characters_list(folder, number)
    ######## STEP 4: genders are only derivable for novels
    genders_dict = get_characters_gender(folder, number, char_list) if not wiki_novel else {}
    ######## STEP 5: creating the final version of the txt to be used for
    # training. Main features: 1) one sentence per line, 2) different names
    # for the same character substituted with one single name, 3) punctuation
    # and double/triple backspaces (common in Gutenberg files) removed.
    print('Creating the final version of novel {} for version: {}'.format(
        number, filename))
    # `out` is handed to cleanup_novels, which owns it from here on.
    out = open('{}_bert'.format(filename), 'w')
    if not wiki_novel:
        # fix: close the input handle (was a leaked open(...).read())
        with open('{}'.format(filename)) as source:
            lines = tok(source.read())
        current_char_list, full_novel = cleanup_novels(char_list, lines, out,
                                                       write_to_file)
    else:
        with open('{}'.format(filename)) as source:
            lines = source.readlines()
        current_char_list, full_novel = cleanup_novels(char_list, lines, out,
                                                       write_to_file,
                                                       wiki_novel=True)
    if not wiki_novel:
        # Split the novel in two halves so each can be trained on separately.
        novel_versions = {}
        mid_novel = len(full_novel) // 2
        novel_versions['{}_part_a'.format(filename)] = full_novel[:mid_novel]
        novel_versions['{}_part_b'.format(filename)] = full_novel[mid_novel:]
    else:
        # fix: was `elif wiki_novel == True`, which left novel_versions
        # unbound (NameError at return) for truthy non-bool flags.
        novel_versions = full_novel
    return novel_versions, current_char_list, genders_dict
# --- Gutenberg pre-processing: strip the Project Gutenberg header/footer,
# sentence-split the body, and write one sentence per line to a _clean file.
# NOTE(review): relies on names defined earlier in this file: base_folder,
# temp_folder, booknlp_folder, folder, plus os/sys/tok imports.
processed_novel_folder = '{}/processed_novel'.format(base_folder)
os.makedirs('{}'.format(temp_folder), exist_ok=True)
os.makedirs('{}'.format(booknlp_folder), exist_ok=True)
os.makedirs('{}'.format(processed_novel_folder), exist_ok=True)
number = sys.argv[2]  # book id taken from the command line
f = open('{}/{}.txt'.format(folder, number)).readlines()
out_clean = open('{}/{}_clean.txt'.format(temp_folder, number), 'w')
clean_list = []
# Locate the '***START' marker; the book body begins on the next line.
# If the marker is never found, fall back to keeping the whole file.
for v, i in enumerate(f):
    if '***START' not in str(i) and v != len(f) - 1:
        pass
    elif v == len(f) - 1:
        malandrino = f
    else:
        malandrino = f[(v + 1):]
        break
# Keep lines until the '***END' footer marker, normalizing underscores
# (Gutenberg emphasis markup), carriage returns and edge whitespace.
# NOTE(review): .strip(' ') runs before .strip('\n'), so a space that
# precedes the trailing newline survives -- confirm whether intended.
for i in malandrino:
    if '***END' not in str(i):
        c = i.replace('_', ' ').strip(' ').strip('\n').replace('\r', '')
        clean_list.append('{}'.format(c))
    else:
        break
clean_book = ' '.join(clean_list)
# `tok` is presumably a sentence tokenizer here (one sentence per output
# line) -- confirm the alias at the top of this file.
sent_book = tok(clean_book)
for i in sent_book:
    out_clean.write('{}\n'.format(i))
out_clean.close()
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize as tok
from scipy import linalg as SPLA
import utils

# Build a naive pairwise distance matrix over syslog lines: the distance
# between two lines is the fraction of positionally-mismatched tokens over
# the length of the shorter line.
testFile = r"C:\Users\Carl Wilhjelm\PycharmProjects\ICSProject\logs\syslogTest.txt"

# create tokenized list of all logs
syslog = []
with open(testFile, 'r') as f:
    syslogText = f.readlines()
for line in syslogText:
    syslog.append(tok(line))
n = len(syslog)
print(n)
opNumber = int(n * (0.01))

# create naive distance matrix for all logs (symmetric, zero diagonal)
distanceMatrix = [[0 for x in range(n)] for y in range(n)]
for i in range(n):
    for j in range(i, n):
        distance = 0
        k = min(len(syslog[i]), len(syslog[j]))
        for e in range(k):
            if syslog[i][e] != syslog[j][e]:
                distance += 1
        # fix: blank lines tokenize to [], making k == 0 and the original
        # `distance / k` raise ZeroDivisionError; treat that pair as distance 0
        distanceMatrix[i][j] = distance / k if k else 0
        distanceMatrix[j][i] = distanceMatrix[i][j]
def prepare_for_n2v(folder, number, filename, w2v_model):
    # Prepare a novel for nonce2vec training: extract the character list from
    # the book-nlp HTML report, then rewrite the novel (full text plus the
    # two halves) with character aliases collapsed to a single name and
    # punctuation stripped, counting alias occurrences per file part.
    # NOTE(review): `w2v_model` is unused in this excerpt, no value is
    # returned and `out` is never closed here -- the function appears to
    # continue beyond this excerpt.
    ####### STEP 1: from the output of booknlp to a file containing the list
    # of characters and the number of times they occur
    f = open('{}/book_nlp_output/book.id.html'.format(folder)).read().split(
        '<br />')
    out = open('{}/characters_{}.txt'.format(folder, number), 'w')
    for i in f:
        if '<h1>Text' in i:
            break
        else:
            # Strip the section heading, separators, occurrence counts
            # "(123)" (replaced by '_'), and all remaining punctuation.
            i2 = sub('.*Characters</h1>', '', i)
            i3 = i2.replace('-- ', '')
            i4 = sub('\([0-9]*\)', '_', i3)
            i5 = i4.replace(' _ ', '_').strip('_')
            i6 = i5.replace('\t ', '\t')
            i7 = sub(r'[^\w\s]', '', i6)
            if 'Gutenberg' not in i7:
                out.write('{}\n'.format(i7.lower()))
    out.close()
    ######## STEP 2: creating a list of characters from that file, by using a
    # function in nonce2vec.utils.novels_utilities
    char_list = get_characters_list(folder, number)
    gender_list = get_characters_gender(folder, number, char_list)
    print(gender_list)
    ######## STEP 3: creating the final version of the txt to be used for
    # training on N2V. Main features are 1) one sentence per line,
    # 2) different names for the same character are substituted with one
    # single name, 3) punctuation is removed and double/triple backspaces,
    # common within Gutenberg files, are removed
    files = [
        '{}'.format(filename), '{}_part_a'.format(filename),
        '{}_part_b'.format(filename)
    ]
    add_to_char_list = 1
    char_dict = {}
    for i in files:
        # Per-part occurrence counts for each canonical character name.
        # NOTE(review): plain dict + `+= 1` below raises KeyError on the
        # first occurrence of a character -- likely needs defaultdict(int)
        # or .get(character, 0) + 1.
        char_dict_part = {}
        # f=open('../{}'.format(i)).read()
        f = open('{}'.format(i)).read()
        out_filename = i.replace('_clean.txt_', '_').replace('/temp',
                                                             '/processed_novel')
        # out=open('../{}_n2v'.format(out_filename),'w')
        out = open('{}_n2v'.format(out_filename), 'w')
        lines = tok(f)
        for line in lines:
            line = line.strip('\n')
            for alias in char_list:
                if type(alias) == list:
                    # A list entry means: [canonical name, alias, alias, ...]
                    # NOTE(review): this loop rebinds the loop variable only,
                    # so it has no effect on `alias` -- confirm intent.
                    for a in alias:
                        a = sub(r'\W+', ' ', a)
                    # Canonical name: second word of the first entry (the
                    # surname?) when it contains a space -- confirm.
                    first_name = alias[0]
                    if ' ' in first_name:
                        name_parts = first_name.split(' ')
                        character = name_parts[1]
                    else:
                        character = first_name
                    aliases = alias[1:]
                    for name in aliases:
                        if name in line:
                            char_dict_part[character] += 1
                            line = line.replace(str(name), str(character))
                else:
                    if alias in line:
                        alias = sub(r'\W+', ' ', alias)
                        # NOTE(review): aliases without a space are left
                        # untouched here -- confirm intended.
                        if ' ' in alias:
                            name_parts = alias.split(' ')
                            character = name_parts[1]
                            line = line.replace(alias, character)
                            char_dict_part[character] += 1
                    else:
                        pass
            # Final cleanup: collapse non-word characters to single spaces.
            # NOTE(review): uses re.sub here but bare `sub` above -- the
            # file presumably imports both forms.
            line2 = re.sub(r'\W+', r' ', line)
            line3 = line2.strip(' ')
            out.write('{}\n'.format(line3.lower()))
        # NOTE(review): this is a comparison used as a statement (no effect);
        # almost certainly meant `add_to_char_list_final = False`. Also note
        # the name differs from `add_to_char_list` assigned above -- confirm.
        add_to_char_list_final == False
import os
from nltk import word_tokenize as tok

# Count NLTK tokens across the 'winners' and 'losers' corpora and report
# the total token count, the file count, and the mean tokens per file.
pos = os.listdir('winners')
neg = os.listdir('losers')
total = 0
cnt = 0
# fix: the original duplicated an identical loop per directory (DRY).
for dirname, filenames in (('winners', pos), ('losers', neg)):
    for p in filenames:
        cnt += 1
        with open('{}/{}'.format(dirname, p), 'r') as f:
            total += len(tok(f.read()))
# NOTE: still raises ZeroDivisionError when both directories are empty,
# matching the original behavior.
print(total, cnt, total // cnt)
def returnNames(url):
    """Fetch the KJV text, NE-chunk its first `partition` characters, and
    return the five most common PERSON names plus the raw source text.

    Note: `url` is currently unused; the source URL is hard-coded and the
    interactive prompt is left commented out. `partition` is a module global.
    """
    theurl = "http://www.ccel.org/ccel/bible/kjv.txt"  # raw_input("URL to .txt file: ")
    source = urllib2.urlopen(theurl).read()
    # Tokenize -> POS-tag -> named-entity chunk the leading slice.
    chunked = ne(postag(tok(source[:partition])), binary=False)
    # Gather every leaf token sitting inside a PERSON subtree.
    person_tokens = []
    for node in chunked:
        if type(node) == tree.Tree and node.label() == "PERSON":
            for leaf in node:
                person_tokens.append(leaf[0])
    # exclude from names: titles and boilerplate the chunker mislabels
    honorifics = [
        "Mr.",
        "Mrs.",
        "Ms.",
        "Miss",
        "Dr.",
        "Prof.",
        "Professor",
        "Lord",
        "Lady",
        "Sir",
        "Madam",
        "Dame",
        "Rev.",
        "Rabbi",
        "Version",
        "Gutenberg",
    ]
    filtered = [token for token in person_tokens if token not in honorifics]
    # Only capitalized tokens count as names; rank them by frequency.
    tally = Counter(word for word in filtered if word[:1].isupper())
    chars = [name for name, _count in tally.most_common(5)]
    # print '\nMost common names:'
    # print '\t'.join(chars)
    return chars, source
def main():
    """Amplifies the affect of a given text. Adverbs and adjectives are altered.

    Reads a .txt file named on the command line (falling back to
    great_expectations.txt), replaces each adjective with its most
    sentiment-extreme WordNet synonym -- or prefixes a random intensifier
    when no stronger synonym exists -- upper-cases the replacements, amps up
    the punctuation, and writes the result to 'extremely_<filename>'.
    """
    # --- choose the input file ---------------------------------------------
    if len(sys.argv) == 2:
        f = sys.argv[1]
        if f.endswith('.txt'):  # fix idiom: was f[len(f) - 4:] == '.txt'
            usefile = f  #must be a .txt file
        else:
            usefile = 'great_expectations.txt'
            print("This script requires .txt files only. The file you\n\
specified was not a .txt file. Instead, we'll use\n\
Great Expectations as an example...")
    else:
        usefile = 'great_expectations.txt'
        print("You can define which .txt file to use like so: \n\
python textillating.py [filename you want to use.txt]\n\
You didn't specify a .txt file to use, so in the meantime,\n\
we'll use Great Expectations as an example...")
    print('Processing... This may take a minute...')
    # fix: context manager instead of manual open/close
    with open(usefile, 'r') as infile:
        text = infile.readlines()  #readlines in order to preserve line breaks
    raw_text = []
    new_text = []
    # POS-tag each line, e.g. ('excellent', 'JJ')
    for line in text:
        raw_text.append(pos(tok(line)))
    # Intensifiers used to prefix neutral adjectives.
    modifiers = [
        'WAY', 'ABSOLUTELY', 'ACTUALLY', 'ACUTELY', 'ALMIGHTY', 'AMPLY',
        'ASSUREDLY', 'ASTONISHINGLY', 'AWFULLY', 'CATEGORICALLY', 'CERTAINLY',
        'CLEARLY', 'CONSIDERABLY', 'DECIDEDLY', 'DEEPLY', 'DRASTICALLY',
        'EMINENTLY', 'EMPHATICALLY', 'EXAGGERATEDLY', 'EXCEEDINGLY',
        'EXCEPTIONALLY', 'EXCESSIVELY', 'EXORBITANTLY', 'EXPLICITLY',
        'EXTENSIVELY', 'EXTRAORDINARILY', 'EXTREMELY', 'FOR REAL', 'GENUINELY',
        'GREATLY', 'HIGHLY', 'HUGELY', 'IMMENSELY', 'IMMODERATELY',
        'INCREDIBLY', 'INDUBITABLY', 'INORDINATELY', 'INTENSELY', 'LARGELY',
        'LEGITIMATELY', 'LITERALLY', 'MARKEDLY', 'NOTABLY', 'NOTICEABLY',
        'OBVIOUSLY', 'OVERLY', 'PARTICULARLY', 'PLENTY', 'POSITIVELY',
        'POWERFULLY', 'PRODIGIOUSLY', 'PROFOUNDLY', 'PROHIBITIVELY', 'QUITE',
        'RADICALLY', 'REALLY', 'REAL', 'REMARKABLY', 'SEVERELY', 'STRIKINGLY',
        'SUBSTANTIALLY', 'SUPER', 'SUPERLATIVELY', 'SURPASSINGLY',
        'SURPRISINGLY', 'TERRIBLY', 'TERRIFICALLY', 'TOO', 'TOTALLY', 'TRULY',
        'ULTRA', 'UNCOMMONLY', 'UNDENIABLY', 'UNDOUBTEDLY', 'UNEQUIVOCALLY',
        'UNMISTAKABLY', 'UNQUESTIONABLY', 'UTTERLY', 'VASTLY', 'VERILY',
        'VERY', 'VIOLENTLY', 'VITALLY', 'WONDERFULLY'
    ]
    for line in raw_text:  #goes line by line to preserve line breaks
        for word in line:
            word_score = sid.polarity_scores(word[0])['compound']
            use_synonym = word[0]  #updates later
            possible_synonyms = []
            if wf.blacklisted(word[0]):
                pass
            elif word[0].lower() in ignore:
                pass
            elif word[
                    1] == 'JJ':  #adjectives only; adverbs don't quite work well here
                for syn in wn.synsets(word[0]):
                    for lemma in syn.lemmas():
                        syn_meta = str(syn).split('.')
                        # match part of speech: 'a' (adjective) or 's'
                        # (satellite adjective).
                        # FIX: was `syn_meta[1] == 'a' or 's'`, which is
                        # always truthy, so noun/verb/adverb synonyms
                        # leaked in too.
                        if syn_meta[1] in ('a', 's'):
                            possible_synonyms.append(lemma.name())
                all_synonyms = set(possible_synonyms)  #de-dupe
                for synonym in all_synonyms:
                    syn_score = sid.polarity_scores(synonym)['compound']
                    #scores range from -1 to 1, 1 being positive affect
                    if wf.blacklisted(synonym):
                        pass
                    elif word_score == 0:
                        # NOTE(review): compares abs(candidate) against the
                        # non-abs score of the current pick -- kept as-is.
                        if abs(syn_score) > sid.polarity_scores(
                                use_synonym)['compound']:
                            use_synonym = synonym  #choose most xtreme synonym (either pos or neg)
                    elif word_score > 0:
                        if syn_score > sid.polarity_scores(
                                use_synonym)['compound']:
                            use_synonym = synonym  #choose most xtreme synonym (positive)
                    elif word_score < 0:
                        if syn_score < sid.polarity_scores(
                                use_synonym)['compound']:
                            use_synonym = synonym  #choose most xtreme synonym (negative)
                if use_synonym == word[0]:
                    # No stronger synonym found: prefix a random intensifier.
                    use_synonym = modifiers[randint(
                        0, len(modifiers) - 1)] + ' ' + word[0]  #VERY neutral
                use_synonym = use_synonym.replace("_", " ").upper()
            elif word[0] == '.':
                use_synonym = '!'
            elif word[0] == '!':
                use_synonym = '!!!!!!!!!!!!!'
            elif word[0] == '?':
                use_synonym = '??!!'
            else:
                use_synonym = word[0]
            new_text.append(use_synonym)
        new_text.append('\n')  #preserve line breaks
    with open('extremely_' + usefile, 'w') as outfile:
        outfile.write(detok.detokenize(new_text))
        #Does not deal with quotation marks well. Adds a space before/after them
    print('All done! See extremely_' + usefile +
          ' for your newly exciting text.')