def create(self, text=''):
    # rfi = readabilityFilesInstaller()
    # NewDaleChallWordsFile = open(rfi.getPath("Dale-Chall wordlist")[0]).read()
    # NewDaleChallWordsList = NewDaleChallWordsFile.split(';')

    # build the matrix of per-word values
    ta = textanalyzer("eng")
    raw_sentences = ta.getSentences(text)
    values = []
    sentence_values = []
    max_words = 0
    for sentence in raw_sentences:
        raw_words = ta.getWords(sentence)
        if len(raw_words) > max_words:
            max_words = len(raw_words)
        for word in raw_words:
            value = 0.0
            # if word.lower() in NewDaleChallWordsList:
            #     value = 0.25
            # else:
            #     value = 0.5
            if word.isdigit():
                value = 0.0
            sentence_values.append(value)
        values.append(sentence_values)
        sentence_values = []

    # pad shorter sentences with 1.0 so every row has max_words entries
    for value in values:
        while len(value) < max_words:
            value.append(1.0)
    values.reverse()

    a = array(values)
    fig = dale_plt.figure()

    # build the y-axis labels (sentence numbers, top to bottom)
    i = len(values)
    ylabels = []
    while i > 0:
        ylabels.append(i)
        i = i - 1
    yticks(arange(len(values)) + 0.5, ylabels)

    # draw the pcolor plot
    pcolor(a, cmap=self.my_cmap, norm=normalize(vmin=0.0, vmax=1.0))
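# --- Usage sketch (not part of the original module) ---
# Assumptions: this method lives on a class named DaleChallPlot (a
# hypothetical name) and dale_plt aliases matplotlib.pyplot at module
# level, as the calls above suggest.
if __name__ == "__main__":
    plot = DaleChallPlot()
    plot.create("We collect data. We may share it with partners. "
                "In 2007 we updated this policy.")
    dale_plt.savefig("dale_chall_heatmap.png")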
def create(self, text=""): # rfi = readabilityFilesInstaller() # NewDaleChallWordsFile = open(rfi.getPath("Dale-Chall wordlist")[0]).read() # NewDaleChallWordsList = NewDaleChallWordsFile.split(';') # Array mit Werten erstellen ta = textanalyzer("eng") raw_sentences = ta.getSentences(text) values = [] sentence_values = [] max_words = 0 print "\n\n\nhalllo\n\n\n" for sentence in raw_sentences: raw_words = ta.getWords(sentence) if len(raw_words) > max_words: max_words = len(raw_words) for word in raw_words: value = 0.0 # if word.lower() in NewDaleChallWordsList: # value = 0.25 # else: # value = 0.5 if word.isdigit(): value = 0.0 sentence_values.append(value) values.append(sentence_values) sentence_values = [] # mit Nullen auffuellen for value in values: while len(value) < max_words: value.append(1.0) values.reverse() a = array(values) fig = dale_plt.figure() # Achsenbeschriftungen erstellen i = len(values) ylabels = [] while i > 0: ylabels.append(i) i = i - 1 yticks(arange(len(values)) + 0.5, ylabels) # pcolor-Graph erzeugen pcolor(a, cmap=self.my_cmap, norm=normalize(vmin=0.0, vmax=1.0))
def create_enhanced_dale_chall_list(self):
    # list of sites used to build the list of most frequent words
    alexa_list = ['Google', 'Facebook', 'YouTube', 'Yahoo!', 'Wikipedia',
                  'Microsoft', 'Amazon', 'Twitter', 'LinkedIn', 'Wordpress',
                  'Ebay', 'Apple', 'Paypal', 'Imdb', 'Tumblr', 'Disney',
                  'BBC', 'Livejasmin', 'Craigslist', 'Ask']

    # collect all privacy policy texts in one list
    corpus = []
    data = get_all_policies()
    for site in data:
        if site in alexa_list:
            corpus.append(data[site]["text"])

    # tokenize the combined corpus into a list of words
    t = textanalyzer("eng")
    words = t.getWords("".join(corpus))

    # load the Dale-Chall wordlist of easy words
    dale_chall_list = open('../nltk_contrib/dale_chall_wordlist.txt').read().split(';')

    # keep only the words of the 20 privacy policies that are neither on
    # the Dale-Chall list of easy words nor site names
    new_corpus = []
    for word in words:
        if word.lower() not in dale_chall_list and word not in alexa_list:
            new_corpus.append(word.lower())

    # build a frequency distribution over the remaining words
    fdist = FreqDist(new_corpus)

    # plot the 80 most frequent words (cumulative)
    fdist.plot(80, cumulative=True)

    # collect the words that make up 33% of the words that are not on the
    # Dale-Chall list (cumulative)
    most_frequ = []
    cum_percentage = 0.0
    for sample in fdist:
        cum_percentage += fdist.freq(sample)
        most_frequ.append(sample)
        if cum_percentage > 0.33:
            break

    # write the result to a file
    privacy_file = open("privacy_wordlist.txt", "w")
    privacy_file.write(";".join(most_frequ))
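# Toy trace of the 33% cumulative cut-off used above (made-up tokens, not
# module data). Assumes the NLTK release this module targets (2.x), where
# iterating a FreqDist yields samples in decreasing frequency order and
# freq() returns a sample's relative frequency.
toy = FreqDist(['data', 'data', 'data', 'share', 'share', 'cookie'])
cut = []
cum_percentage = 0.0
for sample in toy:
    cum_percentage += toy.freq(sample)  # 'data': 3/6 = 0.5
    cut.append(sample)
    if cum_percentage > 0.33:
        break
# 'data' alone already exceeds 33% of the tokens, so cut == ['data']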
def create(self, data={}):
    sites = data.keys()

    ######################################
    # To calculate a grade level score:
    # 1. Randomly select three separate 100 word passages.
    # 2. Count the number of sentences in each 100 word sample
    #    (estimate to nearest tenth).
    # 3. Count the number of syllables in each 100 word sample.
    #    (Each numeral is a syllable. For example, 2007 is 5 syllables
    #    -- two-thou-sand-se-ven -- and one word.)
    # 4. Plot the average sentence length and the average number of
    #    syllables on the graph.
    # The area in which it falls is the approximate grade.
    ######################################

    for site in sites:
        site_sentences = []
        site_words = []
        sentence_lengths = []
        sentences_count = []
        syllables_count = []
        ta = textanalyzer("eng")
        site_sentences = ta.getSentences(data[site]['text'])
        words = ta.getWords(data[site]['text'])
        word_count = len(words)
        for sentence in site_sentences:
            site_words.append(ta.getWords(sentence))
            sentence_lengths.append(len(ta.getWords(sentence)))

        if word_count < 100:
            sample_size = word_count
            number_of_iterations = 1
        else:
            sample_size = 100
            if word_count < 200:
                number_of_iterations = 1
            elif word_count < 300:
                number_of_iterations = 2
            else:
                number_of_iterations = 3

        j = 1
        while j <= number_of_iterations:
            count_index = j - 1
            if word_count < 100:
                start = 0
            else:
                start = randint(0, word_count - (sample_size * number_of_iterations))

            # words of this sample, for the syllable count
            sample_words = words[start:start + sample_size]

            # count sentences: find the sentence in which the sample starts
            i = 0
            start_value = start
            while (start_value - sentence_lengths[i]) > 0:
                start_value = start_value - sentence_lengths[i]
                i += 1
            sentences_count_rest = sentence_lengths[i] - start_value
            sentences_count.append(0.0)
            words_to_count_for = sample_size - sentences_count_rest
            rest = float(sentences_count_rest) / sentence_lengths[i]

            # count off 100 words (minus the leftover words from the
            # partial sentence at the start of the sample)
            i += 1
            while (words_to_count_for - sentence_lengths[i]) > 0:
                words_to_count_for = words_to_count_for - sentence_lengths[i]
                sentences_count[count_index] = sentences_count[count_index] + 1
                i += 1

            # add the fractional sentences before and after the whole ones
            rest = rest + (float(words_to_count_for) / sentence_lengths[i])
            sentences_count[count_index] = sentences_count[count_index] + rest
            syllables_count.append(ta.countSyllables(sample_words))

            # if the text is shorter than 100 words, extrapolate to 100
            if word_count < 100:
                sentences_count[count_index] = sentences_count[count_index] * 100 / word_count
                syllables_count[count_index] = syllables_count[count_index] * 100 / word_count

            # next sample
            j += 1

        data[site]['Syllables'] = float(sum(syllables_count)) / len(syllables_count)
        data[site]['Sentences'] = float(sum(sentences_count)) / len(sentences_count)

    fig = fry_plt.figure(figsize=(8.15, 5.03))
    ax = fig.add_subplot(111)

    # hide the axis frame
    Axes.set_frame_on(ax, False)

    for site in sites:
        ax.plot(self.get_x_value(data[site]['Syllables']),
                self.get_y_value(data[site]['Sentences']), 'bo', ms=5)
        ax.annotate(site, (self.get_x_value(data[site]['Syllables']) - 6,
                           self.get_y_value(data[site]['Sentences'])))

    fig.figimage(self.im, 82, 40)
    fry_plt.xlim([108, 174])
    fry_plt.xlabel("Average Number of Syllables per 100 words")
    fry_plt.xticks(arange(108, 174, 4))
    fry_plt.ylim([0, 29])
    fry_plt.ylabel("Average Number of Sentences per 100 words")

    # y tick labels according to the Fry graph
    y_ticks = ['2.0', '2.5', '3.0', '3.3', '3.5', '3.6', '3.7', '3.8',
               '4.0', '4.2', '4.3', '4.5', '4.8', '5.0', '5.2', '5.6',
               '5.9', '6.3', '6.7', '7.1', '7.7', '8.3', '9.1', '10.0',
               '11.1', '12.5', '14.3', '16.7', '20', '25+']
    fry_plt.yticks(arange(30), y_ticks)
    labels = sites
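# Worked trace of the fractional sentence count above (toy numbers, not
# module data): sentence_lengths = [40, 30, 50], start = 20, sample_size = 100.
#   - The sample begins 20 words into sentence 0, so 20 of its 40 words
#     fall inside the sample: rest = 20/40 = 0.5 sentences.
#   - Sentence 1 (30 words) fits completely: +1 whole sentence.
#   - The remaining 50 words cover sentence 2 exactly: rest += 50/50 = 1.0.
#   - Total for this sample: 1 + 0.5 + 1.0 = 2.5 sentences per 100 words.
# The float() casts matter here: with plain Python 2 integer division both
# fractional terms would silently truncate to 0.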