def get_file():
    """Flask view: process the most recently uploaded file and render the page.

    The upload's filename is communicated through the module-global ``fname``
    (set by whichever view handled the upload — TODO confirm). Most of the
    analysis pipeline below is currently disabled; the view only tokenizes
    the upload and returns ``hello.html``.
    """
    global fname
    # need to retrieve the uploaded file here for further processing
    print(fname)
    print(request.form)
    form_data = request.form.to_dict()  # renamed: `dict` shadowed the builtin
    #filename = str(request)[ str(request).index('=')+1 : str(request).index('\' [GET]>') ]
    text = simpleTokenize('uploads/' + fname)
    # Disabled analysis pipeline, kept for reference (a bare string literal
    # is a no-op statement, same as in the original):
    """text2 = cleanText( 'uploads/' + fname )
    pos = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    savePOSPiChart(text2, pos)
    top = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    saveTopWords(text2, top)
    bw = ['yellow', 'fish', 'glass', 'foot', 'beach', 'suicide']
    genGraph = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    secgen = similarContext( text, bw)
    saveChronoMap(text, bw, secgen, genGraph)
    genReport = wpReport(text, bw, secgen, 10)
    charReport = sampleCharacter(text, 'Caddy', 3, 100)
    return render_template('results.html', pq = percentQuotes(text), sen = senlenStats(text),
                           pos = pos, top = top,
                           genGraph=genGraph, genReport=genReport, charReport=charReport)
    """
    return render_template('hello.html')
def wpReport(filename, firstgen, secgen, ssName, numTop, writeTo):
    """Write a word-progression report for *filename* to the file *writeTo*.

    For each of the *numTop* highest-scoring 100-word windows, the report
    lists the raw score, its z-score, the running mean of the remaining
    scores, the counts of first/second-generation words in the window, and
    the detokenized window text.

    Parameters:
        filename: path of the text to analyze (passed to simpleTokenize).
        firstgen / secgen: word lists scored by wordProgression.
        ssName: title line printed at the top of the report.
        numTop: number of top-scoring windows to report.
        writeTo: output file path (overwritten).
    """
    text = simpleTokenize(filename)
    arr = wordProgression(text, firstgen, secgen)
    # FIX: write to the report file directly instead of reassigning
    # sys.stdout to an open() that was never closed (and never restored on
    # error). The with-block guarantees the file is closed in all paths;
    # the emitted text is identical.
    with open(writeTo, 'w') as out:
        print("\n\n\nTitle: " + ssName, file=out)
        print("FirstGen Words: " + str(firstgen), file=out)
        print("SecondGen Words: " + str(secgen), file=out)
        for i in range(numTop):
            topIndex = arr.index(max(arr))
            topWords = text[topIndex * 100:topIndex * 100 + 100]
            print("\n\nNumber " + str(i), file=out)
            print("--> Score: " + str(arr[topIndex]) + "\n--> Z-Score: "
                  + str(stats.zscore(arr)[topIndex]), file=out)
            print("--> Average Score " + str(statistics.mean(arr)), file=out)
            # Count first/second-generation words inside this window.
            # (The original reused `i` here, shadowing the outer loop index.)
            fg = 0
            sg = 0
            for word in topWords:
                if word in firstgen:
                    fg += 1
                if word in secgen:
                    sg += 1
            print("Number of FirstGen Words: " + str(fg), file=out)
            print("Number of SecondGen Words: " + str(sg), file=out)
            print(TreebankWordDetokenizer().detokenize(topWords), file=out)
            # Drop the current maximum so the next pass picks the runner-up
            # (this also shifts the z-scores/mean, matching the original).
            del arr[arr.index(max(arr))]
def wpReport(filename, firstgen, secgen, ssName, numTop):
    """Return a word-progression report for *filename* as one string.

    Same report format as the file-writing wpReport variant, but the text is
    returned instead of written to disk. NOTE(review): this definition
    rebinds the module-level name ``wpReport``, replacing the 6-argument
    variant above — confirm that is intended.

    Parameters:
        filename: path of the text to analyze (passed to simpleTokenize).
        firstgen / secgen: word lists scored by wordProgression.
        ssName: title line at the top of the report.
        numTop: number of top-scoring 100-word windows to include.
    """
    # BUG FIX: the original assigned to a local named `str`, shadowing the
    # builtin, then called str(...) — a guaranteed TypeError at runtime
    # ('str' object is not callable). Accumulate parts in a list and join.
    parts = ["\n\n\nTitle: " + ssName + "\n",
             "FirstGen Words: " + str(firstgen) + "\n",
             "SecondGen Words: " + str(secgen) + "\n"]
    text = simpleTokenize(filename)
    arr = wordProgression(text, firstgen, secgen)
    for i in range(numTop):
        topIndex = arr.index(max(arr))
        topWords = text[topIndex * 100:topIndex * 100 + 100]
        parts.append("\n\nNumber " + str(i) + "\n")
        parts.append("--> Score: " + str(arr[topIndex]) + "\n--> Z-Score: "
                     + str(stats.zscore(arr)[topIndex]) + "\n")
        parts.append("--> Average Score " + str(statistics.mean(arr)) + "\n")
        # Count first/second-generation words inside this window.
        fg = 0
        sg = 0
        for word in topWords:
            if word in firstgen:
                fg += 1
            if word in secgen:
                sg += 1
        parts.append("Number of FirstGen Words: " + str(fg) + "\n")
        parts.append("Number of SecondGen Words: " + str(sg) + "\n")
        parts.append(TreebankWordDetokenizer().detokenize(topWords) + "\n")
        # Drop the current maximum so the next pass picks the runner-up.
        del arr[arr.index(max(arr))]
    return "".join(parts)
def vectorize(fileName, firstgen):
    """Return a 3-element feature vector for the text at *fileName*.

    Features: strength, average sentence length, and rebound against the
    *firstgen* word list (scaled by 1000 — presumably to bring it onto a
    comparable magnitude; TODO confirm).
    """
    tokens = simpleTokenize(fileName)
    return [
        strength(tokens),
        avgSentenceLength(tokens),
        rebound(tokens, firstgen) * 1000.0,
    ]
def plotChronoMap(textName, firstgen, secgen, title):
    """Plot the word-progression curve for *textName* and save it as a PNG.

    Draws on matplotlib's current (implicit pyplot) figure.
    NOTE(review): the save path '/templates/static/graphs/' is absolute
    (filesystem root); 'templates/static/graphs/' was likely intended —
    confirm against the deployment layout.
    """
    tokens = nltk.Text(word.lower() for word in simpleTokenize(textName))
    scores = wordProgression(tokens, firstgen, secgen)
    # One x position per 100-token window (plus the trailing partial one).
    segments = list(range(int(len(tokens) / 100) + 1))
    plt.plot(segments, scores)
    plt.savefig('/templates/static/graphs/' + title + '.png')
def generateSimilarWords(rootTextName, newDocName, words):
    """Collect context-similar words for each word in *words*.

    nltk's Text.similar() prints its results rather than returning them, so
    stdout is temporarily redirected into *newDocName*; that file is then
    re-tokenized and filtered (dropping "No matches" markers, the seed words
    themselves, and English stopwords).

    Returns: the filtered token list.
    """
    text = nltk.Text(word.lower() for word in simpleTokenize(rootTextName))
    stop_words = set(stopwords.words("english"))
    # FIX: the original never closed the file and never restored sys.stdout
    # if nltk raised; the try/finally guarantees both.
    out = open(newDocName, 'w')
    sys.stdout = out
    try:
        for word in words:
            print(word)
            text.similar(word, 1)  # prints to the redirected stdout
            print()
    finally:
        sys.stdout = sys.__stdout__
        out.close()
    m = simpleTokenize(newDocName)
    n = m.copy()
    for i in range(len(m)):
        # i + 1 bound check added: the original raised IndexError when the
        # very last token was "No".
        if m[i] == "No" and i + 1 < len(m) and m[i + 1] == "matches":
            n.remove("No")
            n.remove("matches")
        elif m[i] in words or m[i] in stop_words:
            n.remove(m[i])
    return n
def plotChronoMap(textName, firstgen, secgen, title, graphNum):
    """Plot the word-progression curve onto subplot *graphNum*.

    Draws into the module-level ``axs`` array of matplotlib axes (created
    elsewhere — presumably by a plt.subplots call; TODO confirm).
    """
    tokens = nltk.Text(word.lower() for word in simpleTokenize(textName))
    scores = wordProgression(tokens, firstgen, secgen)
    # One x position per 100-token window (plus the trailing partial one).
    segments = list(range(int(len(tokens) / 100) + 1))
    # If you want to make separate plots
    #a, f = plt.subplots(1)
    #a.suptitle(title)
    #f.plot(x,y)
    target = axs[graphNum]
    target.plot(segments, scores)
    target.set_title(title)
def plotChronoMap(textName, firstgen, secgen, title, graphNum, writeTo):
    """Plot the word-progression curve on its own figure and save it.

    FIX: the original created the figure but never used *writeTo*, so the
    plot was silently discarded; the figure is now saved there and closed
    (pyplot keeps figures alive otherwise, leaking memory across calls).

    ``graphNum`` is accepted for signature compatibility with the sibling
    plotChronoMap variants but is not used by this one.
    """
    tokens = nltk.Text(word.lower() for word in simpleTokenize(textName))
    scores = wordProgression(tokens, firstgen, secgen)
    # One x position per 100-token window (plus the trailing partial one).
    segments = list(range(int(len(tokens) / 100) + 1))
    # Was `a, f = plt.subplots(1)` — `a` was actually the Figure and `f`
    # the Axes; renamed to match what they are.
    fig, ax = plt.subplots(1)
    fig.suptitle(title)
    ax.plot(segments, scores)
    fig.savefig(writeTo)
    plt.close(fig)
def nextGeneration(rootTextName, newDocName, words):
    """Build the next "generation" of words from the seed list *words*.

    For each seed word, writes to *newDocName*: the word itself, its
    in-document context word (nltk Text.similar prints rather than returns,
    hence the stdout redirect), and its Wikipedia context words. The file is
    then re-tokenized, "No matches" markers are stripped, and the resulting
    token list is returned.
    """
    text = nltk.Text(word.lower() for word in simpleTokenize(rootTextName))
    # Hoisted out of the loop: wikipediaWords(words, 7) depends only on the
    # full seed list, yet the original recomputed it once per seed word.
    wiki = wikipediaWords(words, 7)
    # FIX: the original never closed the file and never restored sys.stdout
    # if nltk raised; the try/finally guarantees both.
    out = open(newDocName, 'w')
    sys.stdout = out
    try:
        for i, word in enumerate(words):
            # Original word
            print(word)
            # Context words within document
            text.similar(word, 1)
            # Context words from Wikipedia
            for related in wiki[i]:
                print(related)
    finally:
        sys.stdout = sys.__stdout__
        out.close()
    m = simpleTokenize(newDocName)
    n = m.copy()
    # len(m) - 1 guards the m[i + 1] lookahead (IndexError fix when the
    # last token is "No").
    for i in range(len(m) - 1):
        if m[i] == "No" and m[i + 1] == "matches":
            n.remove("No")
            n.remove("matches")
    return n
def firstgen(fileName):
    """Pick up to 30 distinct "long" words at random from the text at *fileName*.

    The minimum word length is itself randomized (5–8) so repeated runs
    sample different vocabulary strata. Words are drawn without replacement.

    Returns: a list of at most 30 words (fewer if the filtered vocabulary
    is smaller — the original crashed in that case).
    """
    words = cleanText(simpleTokenize(fileName))
    lim = 5 + random.randint(0, 3)
    # Deduplicate, then keep only words at least `lim` characters long.
    pool = [w for w in set(words) if len(w) >= lim]
    # FIX: bound the draw count — the original always drew 30 and raised
    # (randint(0, -1) / pop from empty list) when fewer words qualified.
    picks = []
    for _ in range(min(30, len(pool))):
        idx = random.randint(0, len(pool) - 1)
        picks.append(pool.pop(idx))
    return picks
def master(baseText, metricText, firstgen, writeTo):
    """Run the full pipeline on *baseText*: generate the second-generation
    word list (written to *writeTo*), produce the top-5 word-progression
    report, and plot the chrono map titled after *metricText*.
    """
    #simpleTokenize(metricText)
    simpleTokenize(baseText)
    # Second-generation words derived from the first-generation seeds.
    secgen = nextGeneration(baseText, writeTo, firstgen)
    # Report on the 5 highest-scoring windows, then plot the progression.
    wpReport(baseText, firstgen, secgen, metricText, 5)
    plotChronoMap(baseText, firstgen, secgen, metricText)
new.append(text[i + start]) master.append(new) n = n - 1 indices.remove(x) return master # Given an array of character names, does comparative analysis on the surrounding # text of the characters def characterCompare(text, chars): x = [] for i in range(len(chars)): # 5 samples from each character sample = sampleCharacter(text, chars[i], 5, 200) temp = [] # Creating vectors for each sample for j in range(len(sample)): temp.append( vectorize2( sample[j], ['retard', 'female', 'greedy', 'suicidal', 'incestuous'])) # Averaging sampled vectors x.append(np.mean(temp, axis=0).tolist()) #x.append(temp) print(x) characterCompare(simpleTokenize('SoundAndFury.txt'), ['Caddy', 'Benjy', 'Quentin', 'Jason', 'Dilsey']) #print( np.mean([[3,5,7], [1,5,7], [3,6,7]], axis = 0).tolist() )