Example #1
def get_file():
    global fname
    # need to retrieve the uploaded file here for further processing
    print(fname)
    print(request.form)
    form_data = request.form.to_dict()  # form fields as a plain dict (avoids shadowing the built-in dict)
    #filename =  str(request)[ str(request).index('=')+1 : str(request).index('\' [GET]>') ]
    text = simpleTokenize('uploads/' + fname)
    """text2  = cleanText( 'uploads/' + fname )
    pos = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    savePOSPiChart(text2, pos)
    top = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    saveTopWords(text2, top)


    bw = ['yellow', 'fish', 'glass', 'foot', 'beach', 'suicide']
    genGraph = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    secgen = similarContext( text,  bw)
    saveChronoMap(text, bw, secgen, genGraph)
    genReport = wpReport(text, bw, secgen, 10)

    charReport = sampleCharacter(text, 'Caddy', 3, 100)


    return render_template('results.html', pq=percentQuotes(text),
                           sen=senlenStats(text), pos=pos, top=top,
                           genGraph=genGraph, genReport=genReport,
                           charReport=charReport)
    """
    return render_template('hello.html')
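The comment in Example #1 leaves the actual retrieval open. In Flask an uploaded file normally arrives in request.files; a minimal sketch of that pattern (the field name 'file' and the uploads/ directory are assumptions, not taken from the project):

import os
from flask import request
from werkzeug.utils import secure_filename

def save_upload(upload_dir='uploads'):
    # Pull the file out of the multipart form data and save it locally.
    f = request.files['file']  # 'file' is the assumed form field name
    fname = secure_filename(f.filename)
    f.save(os.path.join(upload_dir, fname))
    return fname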
Example #2
def wpReport(filename, firstgen, secgen, ssName, numTop, writeTo):
    # Requires: sys, statistics, scipy.stats (zscore), NLTK's
    # TreebankWordDetokenizer, plus the project-local simpleTokenize
    # and wordProgression helpers.
    out = open(writeTo, 'w')
    sys.stdout = out  # redirect print() into the report file
    print("\n\n\nTitle: " + ssName)
    print("FirstGen Words: " + str(firstgen))
    print("SecondGen Words: " + str(secgen))
    text = simpleTokenize(filename)
    arr = wordProgression(text, firstgen, secgen)
    # Track original chunk positions so deletions below don't shift them.
    indices = list(range(len(arr)))
    for i in range(numTop):
        topPos = arr.index(max(arr))
        topIndex = indices[topPos]  # index of this 100-word chunk in `text`
        topWords = text[topIndex * 100:topIndex * 100 + 100]
        print("\n\nNumber " + str(i))
        print("--> Score: " + str(arr[topPos]) + "\n--> Z-Score: " +
              str(stats.zscore(arr)[topPos]))
        print("--> Average Score: " + str(statistics.mean(arr)))
        fg = 0
        sg = 0
        for word in topWords:
            if word in firstgen:
                fg += 1
            if word in secgen:
                sg += 1
        print("Number of FirstGen Words: " + str(fg))
        print("Number of SecondGen Words: " + str(sg))
        print(TreebankWordDetokenizer().detokenize(topWords))
        del arr[topPos]
        del indices[topPos]
    sys.stdout = sys.__stdout__  # restore stdout
    out.close()
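Swapping sys.stdout by hand, as above, leaves it redirected if anything between the two assignments raises. A sketch of the same capture with contextlib.redirect_stdout from the standard library (the report lines here are placeholders, not the project's output):

import contextlib

def write_report(path, lines):
    # Every print() inside the with-block goes to the file, and stdout is
    # restored automatically even if an exception is raised.
    with open(path, 'w') as out, contextlib.redirect_stdout(out):
        for line in lines:
            print(line)

write_report('report.txt', ['Title: demo', '--> Score: 1.0'])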
Example #3
def wpReport(filename, firstgen, secgen, ssName, numTop):
    # String-building variant of the report; the accumulator is named
    # `report` to avoid shadowing the built-in str.
    report = "\n\n\nTitle: " + ssName + "\n"
    report += "FirstGen Words: " + str(firstgen) + "\n"
    report += "SecondGen Words: " + str(secgen) + "\n"
    text = simpleTokenize(filename)
    arr = wordProgression(text, firstgen, secgen)
    # Track original chunk positions so deletions below don't shift them.
    indices = list(range(len(arr)))
    for i in range(numTop):
        topPos = arr.index(max(arr))
        topIndex = indices[topPos]  # index of this 100-word chunk in `text`
        topWords = text[topIndex * 100:topIndex * 100 + 100]
        report += "\n\nNumber " + str(i) + "\n"
        report += "--> Score: " + str(arr[topPos]) + "\n--> Z-Score: " + str(stats.zscore(arr)[topPos]) + "\n"
        report += "--> Average Score: " + str(statistics.mean(arr)) + "\n"
        fg = 0
        sg = 0
        for word in topWords:
            if word in firstgen:
                fg += 1
            if word in secgen:
                sg += 1
        report += "Number of FirstGen Words: " + str(fg) + "\n"
        report += "Number of SecondGen Words: " + str(sg) + "\n"
        report += TreebankWordDetokenizer().detokenize(topWords) + "\n"
        del arr[topPos]
        del indices[topPos]
    return report
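wordProgression itself is not shown in these snippets, but both wpReport variants index its output by 100-token chunk, so it presumably returns one score per chunk. The sketch below is a guessed reconstruction of that contract, not the project's actual implementation:

def wordProgression(text, firstgen, secgen):
    # Hypothetical: score each 100-token chunk of `text` by how many
    # first- and second-generation words it contains.
    scores = []
    for start in range(0, len(text), 100):
        chunk = text[start:start + 100]
        scores.append(sum(1 for w in chunk if w in firstgen) +
                      sum(1 for w in chunk if w in secgen))
    return scores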
Example #4
def vectorize(fileName, firstgen):
    # Build a three-feature stylometric vector for the document.
    text = simpleTokenize(fileName)
    vector = [
        strength(text),
        avgSentenceLength(text),
        rebound(text, firstgen) * 1000.0
    ]
    return vector
Example #5
def plotChronoMap(textName, firstgen, secgen, title):
    #fig, axs = plt.subplots(1)
    #fig.suptitle('Short Story Readings on Salinger')
    t = nltk.Text(word.lower() for word in simpleTokenize(textName))
    y = wordProgression(t, firstgen, secgen)
    x = list(range(int(len(t) / 100) + 1))
    # If you want to make separate plots
    plt.plot(x, y)
    # Relative path; a leading '/' would resolve to the filesystem root.
    plt.savefig('templates/static/graphs/' + title + '.png')
Example #6
def generateSimilarWords(rootTextName, newDocName, words):
    text = nltk.Text(word.lower() for word in simpleTokenize(rootTextName))
    out = open(newDocName, 'w')
    sys.stdout = out  # capture text.similar() output into a scratch file
    stop_words = set(stopwords.words("english"))
    for i in range(len(words)):
        print(words[i])
        text.similar(words[i], 1)
        print()
    sys.stdout = sys.__stdout__  # restore stdout
    out.close()
    m = simpleTokenize(newDocName)
    n = m.copy()
    for i in range(len(m)):
        # Guard the lookahead so the last token can't raise an IndexError.
        if m[i] == "No" and i + 1 < len(m) and m[i + 1] == "matches":
            n.remove("No")
            n.remove("matches")
        elif m[i] in words or m[i] in stop_words:
            n.remove(m[i])
    return n
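A usage sketch, reusing the seed words from Example #1 and the text file from Example #12; the scratch-file name is made up:

seeds = ['yellow', 'fish', 'glass', 'foot', 'beach', 'suicide']
secgen = generateSimilarWords('SoundAndFury.txt', 'secgen_scratch.txt', seeds)
print(secgen[:10])  # first few context words found near the seeds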
Example #7
def plotChronoMap(textName, firstgen, secgen, title, graphNum):
    t = nltk.Text(word.lower() for word in simpleTokenize(textName))
    y = wordProgression(t, firstgen, secgen)
    x = list(range(int(len(t) / 100) + 1))
    # If you want to make separate plots
    #a, f = plt.subplots(1)
    #a.suptitle(title)
    #f.plot(x,y)
    # `axs` is assumed to be a module-level array of Axes created elsewhere
    # via plt.subplots.
    axs[graphNum].plot(x, y)
    axs[graphNum].set_title(title)
Example #8
def plotChronoMap(textName, firstgen, secgen, title, graphNum, writeTo):
    #fig, axs = plt.subplots(1)
    #fig.suptitle('Short Story Readings on Salinger')
    t = nltk.Text(word.lower() for word in simpleTokenize(textName))
    y = wordProgression(t, firstgen, secgen)
    x = list(range(int(len(t) / 100) + 1))
    # If you want to make separate plots
    # plt.subplots returns (Figure, Axes) in that order.
    fig, ax = plt.subplots(1)
    fig.suptitle(title)
    ax.plot(x, y)
    fig.savefig(writeTo)  # presumably the intended use of writeTo
Example #9
def nextGeneration(rootTextName, newDocName, words):
    text = nltk.Text(word.lower() for word in simpleTokenize(rootTextName))
    out = open(newDocName, 'w')
    sys.stdout = out  # capture the generated words into a scratch file
    # Context words from Wikipedia; this depends only on `words`, so fetch
    # it once up front rather than once per loop iteration.
    w = wikipediaWords(words, 7)
    for i in range(len(words)):
        # Original word
        print(words[i])
        # Context words within the document
        text.similar(words[i], 1)
        for j in range(len(w[i])):
            print(w[i][j])
    sys.stdout = sys.__stdout__  # restore stdout
    out.close()
    m = simpleTokenize(newDocName)
    n = m.copy()
    for i in range(len(m)):
        # Guard the lookahead so the last token can't raise an IndexError.
        if m[i] == "No" and i + 1 < len(m) and m[i + 1] == "matches":
            n.remove("No")
            n.remove("matches")
    return n
Example #10
def firstgen(fileName):
    # Pick 30 distinct random "long" words (length threshold 5-8) to seed
    # the first generation.
    w = cleanText(simpleTokenize(fileName))
    w = set(w)
    lim = 5 + random.randint(0, 3)
    w = [o for o in w if len(o) >= lim]
    return random.sample(w, 30)  # assumes at least 30 candidates survive
Example #11
def master(baseText, metricText, firstgen, writeTo):
    # Full pipeline: grow a second generation of context words, then report
    # and plot how both generations progress through the text.
    secgen = nextGeneration(baseText, writeTo, firstgen)
    wpReport(baseText, firstgen, secgen, metricText, 5)
    plotChronoMap(baseText, firstgen, secgen, metricText)
Example #12
# (snippet starts mid-function: the tail of a sampling helper such as sampleCharacter)
                new.append(text[i + start])
        master.append(new)
        n = n - 1
        indices.remove(x)
    return master


# Given an array of character names, runs a comparative analysis on the
# text surrounding each character.
def characterCompare(text, chars):
    x = []
    for i in range(len(chars)):
        # 5 samples from each character
        sample = sampleCharacter(text, chars[i], 5, 200)
        temp = []
        # Creating vectors for each sample
        for j in range(len(sample)):
            temp.append(
                vectorize2(
                    sample[j],
                    ['retard', 'female', 'greedy', 'suicidal', 'incestuous']))
        # Averaging sampled vectors
        x.append(np.mean(temp, axis=0).tolist())
        #x.append(temp)
    print(x)


characterCompare(simpleTokenize('SoundAndFury.txt'),
                 ['Caddy', 'Benjy', 'Quentin', 'Jason', 'Dilsey'])
#print( np.mean([[3,5,7], [1,5,7], [3,6,7]], axis = 0).tolist() )
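characterCompare prints its averaged vectors rather than returning them; if it returned x, the vectors could be compared directly. A sketch of one such comparison with numpy (cosine_sim is a helper introduced here, not part of the project):

import numpy as np

def cosine_sim(u, v):
    # Cosine similarity between two feature vectors.
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

# e.g. how close Caddy's surrounding text is to Benjy's:
# cosine_sim(x[0], x[1])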