def graphAndEvaluation(tf=''):
    """Tokenize the text file at *tf* and show two Plotly figures:
    a scatter of word counts and a positive/negative word histogram.

    Relies on module-level names: ``obo``, ``posiwordlist``,
    ``negawordlist`` and ``go`` (plotly.graph_objects).

    :param tf: path to the input text file (also used in plot titles)
    """
    # `with` guarantees the handle is closed even if read() raises
    # (the original open/close pair leaked on error).
    with open(tf, encoding="Latin-1") as fh:
        text = fh.read()

    fullwordlist = obo.stripNonAlphaNum(text)
    # Rabin-Karp passes: drop stopwords, then keep only positive /
    # negative sentiment words.  101 is presumably the hash prime used
    # by obo's Rabin-Karp implementation — confirm in obo.
    wordlist = obo.rabinKarp1(fullwordlist, obo.stopwords, 101)
    wordlist1 = obo.rabinKarp2(wordlist, posiwordlist, 101)
    wordlist2 = obo.rabinKarp2(wordlist, negawordlist, 101)

    # Word -> frequency mapping for the scatter plot.  (The original
    # also built sorted copies and a joined string that were never
    # used; they are removed here.)
    dictionary = obo.wordListToFreqDict(wordlist)

    fig1 = go.Figure(data=go.Scatter(x=list(dictionary.keys()),
                                     y=list(dictionary.values()),
                                     mode='markers'))
    fig1.update_layout(
        title={
            'text': tf + " Word Counts",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        })
    fig1.show()

    # Histogram comparing positive vs. negative word occurrences.
    fig = go.Figure()
    fig.update_layout(
        title={
            'text': tf + " Negative and Positive Histogram",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        })
    fig.add_trace(go.Histogram(histfunc="sum", x=wordlist1, name="Positive Word"))
    fig.add_trace(go.Histogram(histfunc="sum", x=wordlist2, name="Negative Word"))
    fig.show()
def getKeywords(pdfFile, Occur):
    """Extract frequent words from a PDF parsed via a Tika server.

    :param pdfFile: path/URL of the PDF handed to the Tika parser
    :param Occur: frequency threshold; a word must occur more than
        this many times to be kept
    :return: if ``Occur > 0``, a list of lowercased words longer than
        six characters (as UTF-8 ``bytes``); otherwise a list of
        ``{word: count_str}`` single-entry dicts
    """
    tikaurl = tika_obo.getTikaAddress()
    parsed = parser.from_file(pdfFile, tikaurl)
    doccontent = parsed["content"]

    fullwordlist = obo.stripNonAlphaNum(doccontent)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)

    keywords = []
    shortkey = []
    # sorteddict entries are (count, word) pairs.
    for s in sorteddict:
        numocc = int(s[0])
        # NOTE(review): encoding makes `word` bytes, so the dict keys
        # and shortkey entries are bytes, not str — looks deliberate
        # for downstream consumers; confirm before changing.
        word = s[1].encode('utf-8')
        if numocc > Occur:
            keywords.append({word: str(numocc)})
            if len(word) > 6:
                shortkey.append(word.lower())
    # The original also kept a `count` tally and a `metadata` copy that
    # were never used; both removed.
    if Occur > 0:
        return shortkey
    return keywords
def count():
    """Word-count view.

    On a valid form submission: fetch the submitted URL, strip markup
    and stopwords, flash the top frequency pairs, and redirect to the
    index.  Otherwise render the count form.
    """
    form = WordForm()
    if not form.validate_on_submit():
        # First visit or invalid submission: just show the form.
        return render_template('count.html',
                               title='Word Count Application',
                               form=form)

    page = requests.get(form.url.data)
    plain = obo.stripTags(page.content.decode("utf-8")).lower()
    tokens = obo.removeStopwords(obo.stripNonAlphaNum(plain), obo.stopwords)
    ranked = obo.sortFreqDict(obo.wordListToFreqDict(tokens))

    # Flash the 21 most frequent (count, word) pairs for display.
    for pair in ranked[:21]:
        flash(str(pair))
    return redirect(url_for('index'))
def hitString(limit, url):
    """Append up to *limit* of the most frequent non-numeric words of
    the page at *url* to the module-level ``retStr`` accumulator.

    NOTE(review): ``retStr`` is global, so repeated calls keep
    appending to the same string — confirm that is intended.

    :param limit: maximum number of words to append
    :param url: page to fetch and analyse
    :return: the (accumulated) ``retStr``
    """
    response = urllib2.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, "lxml")

    # Remove script and style elements so only visible text remains.
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text().lower()

    fullwordlist = obo.stripNonAlphaNum(text)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)

    # Compile once instead of re-matching the pattern text per word.
    integer_re = re.compile("^-?[0-9]+$")

    count = 0
    global retStr
    for s in sorteddict:
        # s is a (frequency, word) pair; take the word directly.  The
        # original recovered it via str(s[1:]) plus strip() calls,
        # which yielded the same text for plain words.
        word = str(s[1])
        if integer_re.match(word):
            continue  # skip purely numeric "words"
        count += 1
        retStr += word + ' '
        if count == limit:
            break
    return retStr
def main():
    """Read whitespace-separated words from ``output.xls`` and write
    ``word|count`` lines, sorted by frequency, to ``output-sort.xls``.

    (Despite the .xls names, both files are treated as plain text.)
    """
    # `with` closes the files even on error (the original relied on
    # explicit close() calls at the end).
    with open("output.xls", "r") as f:
        wordstring = f.read()
    wordlist = wordstring.split()

    # The original also built a per-word frequency list via
    # wordlist.count(w) in a loop (O(n^2)) but never used it; the
    # counting below is done by obo instead.
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)

    with open("output-sort.xls", "w") as fsort:
        # Each entry is a (count, word) pair; emit "word|count".
        for s in sorteddict:
            fsort.write('{}|{}\n'.format(s[1], s[0]))
#html-to-freq.py
# Fetch an Old Bailey Online trial page and print every word with its
# frequency, most frequent first.
import urllib2, obo

url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

# Download the page and reduce it to lower-case plain text.
text = obo.stripTags(urllib2.urlopen(url).read()).lower()

# Tokenize, count, and sort by descending frequency in one pipeline.
sorteddict = obo.sortFreqDict(obo.wordListToFreqDict(obo.stripNonAlphaNum(text)))

for entry in sorteddict:
    print(str(entry))
# Word-frequency analysis of a Frankenstein chapter.  The module-level
# names (text, wordlist, sorteddict, ...) are assignable/importable;
# printing only happens when run as a script.
import requests, obo

url = 'http://literature.org/authors/shelley-mary/frankenstein/chapter-01.html'

# Fetch the page and reduce it to lower-case plain text.
pagetext = requests.get(url)
HTML = pagetext.text
text = obo.stripTags(HTML).lower()  # convert to lower case

fullwordlist = obo.stripNonAlphaNum(text)  # only words, into list
wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)  # remove common useless words
dictionary = obo.wordListToFreqDict(
    wordlist)  # add words and counts to dictionary
sorteddict = obo.sortFreqDict(dictionary)  # sort word list by frequency

if __name__ == '__main__':
    # Print each (frequency, word) pair, most frequent first.
    for s in sorteddict:
        print(str(s))
# NOTE(review): fragment of a larger script.  The first statements
# reference `rec`, and later ones reference `x`, `years[x]` — they sit
# inside loops whose headers are outside this chunk, so the original
# nesting cannot be recovered here; confirm against the full file.

# Parse the record's date and bucket it into a quarter.
date=datetime.strptime(dates[rec],'%Y-%m-%d')
Q=int(ceil(date.month/3.)-1)  # quarter index 0-3 within the year
ind = 4*(date.year-year0)+Q   # flat index: 4 quarters per year since year0
NRecQuarter[ind]+=1
fulltextQuarter[Q]=fulltextQuarter[Q]+text  # accumulate this quarter's text

NRecRunning=NRecRunning+NRecYear[x]

# Per-quarter word-frequency summary: tokenize the accumulated text,
# drop stopwords, and keep the top 5000 (count, word) pairs.
for q in range(4):
    desc_fullwordlist = obo.stripNonAlphaNum(fulltextQuarter[q])
    desc_wordlist = obo.removeStopwords(desc_fullwordlist,obo.stopwords)
    desc_dictionary = obo.wordListToFreqDict(desc_wordlist)
    desc_sorteddict = obo.sortFreqDict(desc_dictionary)
    topWords.append(desc_sorteddict[:5000])
    print ('Year: {}; Quarter: Q{}; Num. entries: {}'.format(years[x],q+1,NRecQuarter[4*(date.year-year0)+q]))
    #for s in desc_sorteddict[:10]: print(str(s))
    #print('\n')
print('\n')

#################################################################################################
#################################################################################################
## Pickle?
## Pickle?
# Persist the per-quarter counts for later runs.
with open('obj/'+ 'NRecQuarter' + '.pkl', 'wb') as f:
    pickle.dump(NRecQuarter, f, pickle.HIGHEST_PROTOCOL)
# NOTE(review): the body of this `with` is cut off in this chunk —
# presumably it pickles topWords; see the full file.
with open('obj/'+ 'topWords5k' + '.pkl', 'wb') as f:
if statusCode == 200: text = obo.stripTags(r.text) #quitamos las etiquetas y pasamos a minuscula fullwordlist = obo.stripNonAlphaNum( text) #quitamos los que no son alfanumericos if args.stopwords: fullwordlist = obo.removeStopwords( fullwordlist, args.stopwords ) #eliminamos las palabras de uso comun segun el idioma if args.long: fullwordlist = obo.excludeTwo( fullwordlist, args.long) #eliminamos las palabras con menos de 2 caracteres dictionary = obo.wordListToFreqDict( fullwordlist) #nos devuelve un diccionario palabra - frequencia sorteddict = obo.sortFreqDict( dictionary ) #ordena las palabras por su frequencia (nos han devuelto una lista de listas) if args.tipo == 'simple': obo.makePassfile(sorteddict, args.file) #crea el primer archivo de pass.txt print('Archivo simple creado correctamente:' + args.file) elif args.tipo == 'numin': obo.makePassfile(sorteddict, args.file) #crea el primer archivo de pass.txt obo.numInside(args.file, args.numint) #crea el archivo passInt.txt print('Archivo con numeros en el interior creado correctamente:', args.file) elif args.tipo == 'numout': obo.makePassfile(sorteddict, args.file) #crea el primer archivo de pass.txt obo.passNumeracion(args.file,
# NOTE(review): fragment of a larger script (apparently the formatted
# twin of an earlier chunk).  `title`, `desc`, `rec`, `x`, `years[x]`
# come from enclosing loops outside this chunk, so the original
# nesting cannot be recovered here; confirm against the full file.

text = title + desc
# Parse the record's date and bucket it into a quarter.
date = datetime.strptime(dates[rec], '%Y-%m-%d')
Q = int(ceil(date.month / 3.) - 1)       # quarter index 0-3 within the year
ind = 4 * (date.year - year0) + Q        # flat index: 4 quarters/year since year0
NRecQuarter[ind] += 1
fulltextQuarter[Q] = fulltextQuarter[Q] + text  # accumulate this quarter's text

NRecRunning = NRecRunning + NRecYear[x]

# Per-quarter word-frequency summary: tokenize, drop stopwords, and
# keep the 5000 most frequent (count, word) pairs.
for q in range(4):
    desc_fullwordlist = obo.stripNonAlphaNum(fulltextQuarter[q])
    desc_wordlist = obo.removeStopwords(desc_fullwordlist, obo.stopwords)
    desc_dictionary = obo.wordListToFreqDict(desc_wordlist)
    desc_sorteddict = obo.sortFreqDict(desc_dictionary)
    topWords.append(desc_sorteddict[:5000])
    print('Year: {}; Quarter: Q{}; Num. entries: {}'.format(
        years[x], q + 1, NRecQuarter[4 * (date.year - year0) + q]))
    #for s in desc_sorteddict[:10]: print(str(s))
    #print('\n')
print('\n')

#################################################################################################
#################################################################################################
## Pickle?
## Pickle?
# Persist the per-quarter record counts for later runs.
with open('obj/' + 'NRecQuarter' + '.pkl', 'wb') as f:
    pickle.dump(NRecQuarter, f, pickle.HIGHEST_PROTOCOL)
def hello_world():
    """Single Flask endpoint: GET redirects to the static front-end;
    POST runs a word-frequency / Zipf analysis on the submitted text
    and returns the results as JSON.

    Relies on module-level names: request, obo, text_processor,
    PorterStemmer, SnowballStemmer, lemmatizer_en/_nl,
    re_vowel_en/_nl, jsonify, requests, BeautifulSoup, UnicodeDammit,
    Decimal, math, numpy.
    """
    if request.method == "GET":
        return redirect("/app/index.html")
    else:
        pprint.pprint(request.form)
        pprint.pprint(request.files)

        #Language check
        if request.form['language'] not in ['english', 'dutch']:
            return jsonify(status='error', message="Invalid language!")

        #Input normalization: the text comes from a form field, a URL,
        #or an uploaded file.
        if request.form['upload_option'] == 'text_field':
            input_text = request.form['upload_textarea']
        elif request.form['upload_option'] == 'url':
            page_text = requests.get(request.form['upload_url']).text
            soup = BeautifulSoup(page_text, "html.parser")
            input_text = soup.text
        elif request.form['upload_option'] == 'file':
            # UnicodeDammit guesses the uploaded file's encoding.
            input_text = UnicodeDammit(
                request.files.get('upload_file').read()).unicode_markup
        # NOTE(review): an unexpected upload_option leaves input_text
        # unbound -> NameError further down; same for stopwords below.

        #Stemmer selection
        if request.form['stemmer'] == 'no_stemmer':
            stemmer = None
        elif request.form['stemmer'] == 'porter':
            # Porter stemming only supports English.
            if request.form['language'] != 'english':
                return jsonify(status='error',
                               message="Invalid language for stemmer porter!")
            stemmer = PorterStemmer()
        elif request.form['stemmer'] == 'snowball':
            stemmer = SnowballStemmer(request.form['language'])
        else:
            return jsonify(status='error', message="Invalid stemmer!")

        #Lemmatizer selection
        if request.form['lemmatizer'] == 'lemmatizer_off':
            lemmatizer = None
        elif request.form['language'] == 'english':
            lemmatizer = lemmatizer_en
        else:
            lemmatizer = lemmatizer_nl

        #Stopwords selection
        if request.form['stopwords'] == 'no_stopwords':
            stopwords = None
        elif request.form['stopwords'] == 'our_stopwords':
            stopwords = obo.stopwords
        elif request.form['stopwords'] == 'custom_stopwords':
            custom_stopword_text = UnicodeDammit(
                request.files.get(
                    'custom_stopword_file').read()).unicode_markup
            stopwords = obo.stripNonAlphaNum(custom_stopword_text)

        #Process the text: non-word chunks pass straight through to the
        #output text; words are stemmed/lemmatized, optionally filtered
        #by stopwords and stripped of vowels.
        input_text_word_count = 0
        resulting_text = ""
        final_wordlist = []
        for word_type, word in text_processor.parse_text(input_text):
            if word_type == "non-word":
                resulting_text += word
            else:
                input_text_word_count += 1
                processed_word = word
                if stemmer:
                    processed_word = stemmer.stem(processed_word)
                if lemmatizer:
                    processed_word = lemmatizer(processed_word)
                if not stopwords or processed_word not in stopwords:
                    if request.form['exclude_vowels'] == 'exclude_vowels_yes':
                        if request.form['language'] == 'english':
                            regex = re_vowel_en
                        else:
                            regex = re_vowel_nl
                        processed_word = regex.sub("", processed_word)
                    resulting_text += processed_word
                    final_wordlist.append(processed_word)

        dictionary = obo.wordListToFreqDict(final_wordlist)
        sorteddict = obo.sortFreqDict(dictionary)

        #Optionally ignore the N most frequent words and rebuild the
        #output text/word list without them.
        ignore_results_amount = int(request.form['ignore_results_amount'])
        if ignore_results_amount > 0:
            initial_index = ignore_results_amount
            ignored_words = [word for rank, word in sorteddict[:initial_index]]
            sorteddict = sorteddict[initial_index:]
            new_text = ""
            new_wordlist = []
            for word_type, word in text_processor.parse_text(resulting_text):
                if word_type == "non-word":
                    new_text += word
                elif word not in ignored_words:
                    new_text += word
                    new_wordlist.append(word)
            resulting_text = new_text
            final_wordlist = new_wordlist
        else:
            initial_index = 0

        #Do the math!
        input_text_char_count = len(input_text)
        word_count = len(final_wordlist)
        distinct_words_count = len(sorteddict)
        words = []
        frequencies = []
        word_cloud = []
        # sorteddict holds (frequency, word) pairs.
        for frequency, word in sorteddict:
            words.append(word)
            frequencies.append(frequency)
            word_cloud.append([word, frequency])

        # Per-rank percentage of total words, plus the running total.
        acum_perc = Decimal(0)
        percentages = []
        acum_perc_list = []
        for freq in frequencies:
            perc = Decimal((freq * 100.0) / word_count)
            percentages.append(round(perc, 2))
            acum_perc += perc
            acum_perc_list.append(round(acum_perc, 2))

        # (log rank, log frequency) points for the Zipf plot.
        logarithms = []
        for i in range(len(sorteddict)):
            logarithms.append((math.log(i + 1), math.log(frequencies[i])))

        #Calculate Linear regression
        #http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html#numpy.linalg.lstsq
        x = numpy.array([math.log(f) for f in frequencies])
        y = numpy.array(
            [math.log(rank) for rank in range(1, distinct_words_count + 1)])
        A = numpy.vstack([x, numpy.ones(len(x))]).T
        m, c = numpy.linalg.lstsq(A, y)[0]

        #Calculate the regression line start and end,
        # and sort making the start be the one with the lower X value
        # (highcharts requires this)
        regline_start = (0, c)
        regline_end = (math.log(distinct_words_count),
                       math.log(distinct_words_count) * m + c)
        regression_line = {'start': regline_start, 'end': regline_end}

        return jsonify(status='success',
                       words=words,
                       frequencies=frequencies,
                       percentages=percentages,
                       acum_perc_list=acum_perc_list,
                       logarithms=logarithms,
                       regression_line=regression_line,
                       resulting_text=resulting_text,
                       input_text_char_count=input_text_char_count,
                       input_text_word_count=input_text_word_count,
                       output_text_word_count=word_count,
                       word_cloud=word_cloud,
                       sorteddict=sorteddict)
def post(self, request):
    """Django POST handler: run the word-frequency / Zipf analysis on
    text from a form field, a URL, an uploaded file, or news/comment
    rows from a selectable database, and return a JsonResponse.

    Relies on module/class-level names: obo, text_processor,
    PorterStemmer, SnowballStemmer, lemmatizer_en/_nl, re_vowel_en/_nl,
    connections, Newsitem, Comment, requests, BeautifulSoup,
    UnicodeDammit, Decimal, math, numpy, datetime, jsonify.

    NOTE(review): the error branches call Flask-style jsonify(...) while
    the success path returns Django's JsonResponse — confirm jsonify is
    defined in this module, otherwise the error paths raise NameError.
    """
    pprint.pprint(request.POST)
    pprint.pprint(request.FILES)

    #Language check
    if request.POST['language'] not in ['english', 'dutch']:
        return jsonify(status='error', message="Invalid language!")
    if request.POST['database'] not in connections:
        return jsonify(status='error', message="Invalid database!")

    #Input normalization: text field, URL, uploaded file, or DB rows.
    if request.POST['upload_option'] == 'text_field':
        input_text = request.POST['upload_textarea']
    elif request.POST['upload_option'] == 'url':
        page_text = requests.get(request.POST['upload_url']).text
        soup = BeautifulSoup(page_text, "html.parser")
        input_text = soup.text
    elif request.POST['upload_option'] == 'file':
        # UnicodeDammit guesses the uploaded file's encoding.
        input_text = UnicodeDammit(request.FILES['upload_file'].read()).unicode_markup
    elif request.POST['upload_option'] == 'news_comments':
        # Concatenate news and/or comment texts within a date range
        # (dates arrive as 'YYYY-MM-DD' strings).
        start_date_text = request.POST['news_comments_start_date']
        end_date_text = request.POST['news_comments_end_date']
        start_date = datetime.date(*[int(i) for i in start_date_text.split('-')])
        end_date = datetime.date(*[int(i) for i in end_date_text.split('-')])
        filters = {
            'date__gte': start_date,
            'date__lte': end_date,
            'text__isnull': False
        }
        input_text = ""
        if 'news' in request.POST['news_comments']:
            queryset = Newsitem.objects\
                .using(request.POST['database'])\
                .filter(**filters)\
                .select_related('text')
            for newsitem in queryset:
                input_text += "\n"+newsitem.text.text
        if 'comments' in request.POST['news_comments']:
            for comment in Comment.objects\
                    .using(request.POST['database'])\
                    .filter(**filters)\
                    .select_related('text'):
                input_text += "\n"+comment.text.text
    # NOTE(review): an unexpected upload_option leaves input_text
    # unbound -> NameError further down; same for stopwords below.

    #Stemmer selection
    if request.POST['stemmer'] == 'no_stemmer':
        stemmer = None
    elif request.POST['stemmer'] == 'porter':
        # Porter stemming only supports English.
        if request.POST['language'] != 'english':
            return jsonify(status='error', message="Invalid language for stemmer porter!")
        stemmer = PorterStemmer()
    elif request.POST['stemmer'] == 'snowball':
        stemmer = SnowballStemmer(request.POST['language'])
    else:
        return jsonify(status='error', message="Invalid stemmer!")

    #Lemmatizer selection
    if request.POST['lemmatizer'] == 'lemmatizer_off':
        lemmatizer = None
    elif request.POST['language'] == 'english':
        lemmatizer = lemmatizer_en
    else:
        lemmatizer = lemmatizer_nl

    #Stopwords selection
    if request.POST['stopwords'] == 'no_stopwords':
        stopwords = None
    elif request.POST['stopwords'] == 'our_stopwords':
        stopwords = obo.stopwords
    elif request.POST['stopwords'] == 'custom_stopwords':
        custom_stopword_text = UnicodeDammit(request.FILES.get('custom_stopword_file').read()).unicode_markup
        stopwords = obo.stripNonAlphaNum(custom_stopword_text)

    #Process the text: non-word chunks pass through unchanged; words
    #are stemmed/lemmatized, optionally stopword-filtered and stripped
    #of vowels.
    input_text_word_count = 0
    resulting_text = ""
    final_wordlist = []
    for word_type, word in text_processor.parse_text(input_text):
        if word_type == "non-word":
            resulting_text += word
        else:
            input_text_word_count += 1
            processed_word = word
            if stemmer:
                processed_word = stemmer.stem(processed_word)
            if lemmatizer:
                processed_word = lemmatizer(processed_word)
            if not stopwords or processed_word not in stopwords:
                if request.POST['exclude_vowels'] == 'exclude_vowels_yes':
                    if request.POST['language'] == 'english':
                        regex = re_vowel_en
                    else:
                        regex = re_vowel_nl
                    processed_word = regex.sub("", processed_word)
                resulting_text += processed_word
                final_wordlist.append(processed_word)

    dictionary = obo.wordListToFreqDict(final_wordlist)
    sorteddict = obo.sortFreqDict(dictionary)

    #Optionally ignore the N most frequent words and rebuild the
    #output text/word list without them.
    ignore_results_amount = int(request.POST['ignore_results_amount'])
    if ignore_results_amount > 0:
        initial_index = ignore_results_amount
        ignored_words = [word for rank, word in sorteddict[:initial_index]]
        sorteddict = sorteddict[initial_index:]
        new_text = ""
        new_wordlist = []
        for word_type, word in text_processor.parse_text(resulting_text):
            if word_type == "non-word":
                new_text += word
            elif word not in ignored_words:
                new_text += word
                new_wordlist.append(word)
        resulting_text = new_text
        final_wordlist = new_wordlist
    else:
        initial_index = 0

    #Do the math!
    input_text_char_count = len(input_text)
    word_count = len(final_wordlist)
    distinct_words_count = len(sorteddict)
    words = []
    frequencies = []
    word_cloud = []
    # sorteddict holds (frequency, word) pairs.
    for frequency, word in sorteddict:
        words.append(word)
        frequencies.append(frequency)
        word_cloud.append([word, frequency])

    # Per-rank percentage of total words, plus the running total.
    acum_perc = Decimal(0)
    percentages = []
    acum_perc_list = []
    for freq in frequencies:
        perc = Decimal((freq*100.0)/word_count)
        percentages.append(round(perc, 2))
        acum_perc += perc
        acum_perc_list.append(round(acum_perc, 2))

    # (log rank, log frequency) points for the Zipf plot.
    logarithms = []
    for i in range(len(sorteddict)):
        logarithms.append((math.log(i+1), math.log(frequencies[i])))

    #Calculate Linear regression
    #http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html#numpy.linalg.lstsq
    x = numpy.array([math.log(f) for f in frequencies])
    y = numpy.array([math.log(rank) for rank in range(1, distinct_words_count + 1)])
    A = numpy.vstack([x, numpy.ones(len(x))]).T
    m, c = numpy.linalg.lstsq(A, y)[0]

    #Calculate the regression line start and end,
    # and sort making the start be the one with the lower X value
    # (highcharts requires this)
    regline_start = (0, c)
    regline_end = (math.log(distinct_words_count), math.log(distinct_words_count) * m + c)
    regression_line = {
        'start': regline_start,
        'end': regline_end
    }

    return JsonResponse({
        'status': 'success',
        'words': words,
        'frequencies': frequencies,
        'percentages': percentages,
        'acum_perc_list': acum_perc_list,
        'logarithms': logarithms,
        'regression_line': regression_line,
        'resulting_text': resulting_text,
        'input_text_char_count': input_text_char_count,
        'input_text_word_count': input_text_word_count,
        'output_text_word_count': word_count,
        'word_cloud': word_cloud,
        'sorteddict': sorteddict
    })