# Tail of the nltk() routine: fit the classifier, evaluate it, and return the
# confusion matrix (assumes X_train, X_test, y_train, y_test are prepared and
# RandomForestClassifier, accuracy_score, confusion_matrix are imported from
# sklearn).

    # Fitting RandomForestClassifier to the Training set
    classifier = RandomForestClassifier(n_estimators=20, criterion='entropy',
                                        random_state=0)  # 0.841
    classifier.fit(X_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(X_test)

    # Accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    print('Model Accuracy = {}'.format(accuracy))

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    return cm


if __name__ == "__main__":
    cm = nltk()
    print('confusion matrix', cm)
    # data_format()
import io
import json

from django.shortcuts import render
from watson_developer_cloud import SpeechToTextV1  # older Watson SDK matching this username/password auth


def sttxt(request, filename, textfilepath, textfilename):
    kl = []
    service = SpeechToTextV1(
        username='******',
        password='******',
        url='https://stream.watsonplatform.net/speech-to-text/api')

    models = service.list_models().get_result()
    # print(json.dumps(models, indent=2))
    model = service.get_model('en-US_NarrowbandModel').get_result()
    # print(json.dumps(model, indent=2))

    # Transcribe the audio with speaker labels and cache the raw JSON result.
    # with open(join(dirname(__file__), filename), 'rb') as audio_file:
    print(filename)
    with open(filename, 'rb') as audio_file:
        with io.open('data.json', 'w', encoding='utf8') as outfile:
            str_ = json.dumps(service.recognize(
                audio=audio_file,
                content_type='audio/mp3',
                speaker_labels=True).get_result(), indent=2)
            outfile.write(to_unicode(str_))  # to_unicode: project helper defined elsewhere

    # Read the cached JSON back in.
    with open('data.json') as data_file:
        data_loaded = json.load(data_file)

    transcript_path = (textfilepath + 'transcripts/' + textfilename + '/'
                       + textfilename + ".txt")

    spea = []
    l = 0
    for i in data_loaded['speaker_labels']:
        temp = ""
        if l == int(i['speaker']):
            # Same speaker: collect every word whose timestamp [word, start,
            # end] falls inside this segment, skipping consecutive repeats.
            for v in data_loaded['results']:
                for m in v['alternatives']:
                    for n in m['timestamps']:
                        if n[1] >= i['from'] and n[2] <= i['to']:
                            if temp != n[0]:
                                spea.append(n[0])
                                temp = n[0]
            # print(spea)
        else:
            # Speaker changed: flush the buffered words to the transcript file.
            str1 = ' '.join(spea)
            print(transcript_path)
            with io.open(transcript_path, 'a', encoding='utf8') as outfile:
                # print("Speaker "+str(l)+": "+str1+"\n")
                outfile.write(" Speaker " + str(l) + ": " + str1 + "\n")
            kl.append("Speaker " + str(l) + ": " + str1 + "\n")
            l = i['speaker']
            del spea[0:len(spea) - 1]  # drop the buffered words except the last

    # Flush the final speaker's words.
    str1 = ' '.join(spea)
    with io.open(transcript_path, 'a', encoding='utf8') as outfile:
        # print("Speaker "+str(l)+": "+str1+"\n")
        outfile.write(" Speaker " + str(l) + ": " + str1 + "\n")
    kl.append("Speaker " + str(l) + ": " + str1 + "\n")

    u = summary_function(transcript_path)
    print('------------------- summarize -------------------')
    print(u)
    print('------------------- decisions -------------------')
    decision = nltk(transcript_path)
    print(decision)

    request.session['summ'] = u
    request.session['trans1'] = kl
    request.session['deci'] = decision
    context = {
        'summarize': u,
        'trans': kl,
    }
    return render(request, 'Analyse/transcript.html', context)
""" Word Scoring Program July 01, 2012 Written by Justin Black ([email protected]) This file scores a list of words, based on the frequency that each word is used in another data source It takes a text file containing a list of words you want to score and outputs a new text file where each word in that list is given a frequency score. The frequency depends on which data source (corpus = body of text) you are using to score it. This program uses 6 such data sources. I assign no additional copyright to the combined database and the software is explicitly being placed in the public domain. I would appreciate credit for my work. --------------------- INTENDED USE --------------------- I have a list of words but I want them sorted by "populaity" Popular words are words we use often in speech or text, unpopular words are typicaly obscure dictionary/scrabble words. I can then filter out words that are too obscure using only the words with a score >= mylimit You pick the mylimit value by opening one of the scored text files and picking the cut-off point you prefer.
def getnltk():
    # JSON endpoint: read the posted text, derive its subject with nltk(),
    # and return news relevant to that subject.
    somejsonfile = request.get_json()
    subject = nltk(somejsonfile['data'])
    relevantNews = getrelevantNews(subject, somejsonfile['data'])
    return jsonify({"data": relevantNews, "subject": subject})
How difficult is an article from The Economist to read? Is the sentiment of a
Trump tweet positive or negative? Which lyrics in a pop song rhyme with each
other? ...

As an online learning designer, I process, count, and analyze large amounts of
unfamiliar English text every day. Ever since I first opened the door to
Python, I have been exploring tools to support this work. In this article I
summarize the tools I have used:

1. nltk (word_tokenize, sent_tokenize, corpus.cmudict, pos_tag)
2. SpaCy
3. textstat
4. textblob

To keep things interesting, each introduction starts from a concrete problem
rather than listing every feature. If you want to dig deeper, follow the
documentation links.

1. nltk (word_tokenize, sent_tokenize)

NLTK, short for Natural Language Toolkit, is a suite of Python libraries and
programs for English natural language processing.
Documentation: https://www.nltk.org/
NLTK Book: https://www.nltk.org/book/

word_tokenize and sent_tokenize split a text into words and sentences,
respectively.
https://www.nltk.org/api/nltk.tokenize.html

Problem: compare the lengths of two articles (the number of sentences in each,
and the length of each sentence).

We often face large amounts of unfamiliar text without knowing how long it is.
We can use nltk to count the number of sentences in each of two texts and the
number of words in each sentence. Thinking steps: split each text into
sentences with sent_tokenize, then count the words in each sentence with
word_tokenize (a sketch follows below).
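A minimal sketch of those steps, assuming the two articles are already loaded
as Python strings (the sample texts and variable names below are
placeholders):

# Compare two texts by sentence count and average sentence length.
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')  # tokenizer models, needed once

def describe(text, label):
    sentences = sent_tokenize(text)
    lengths = [len(word_tokenize(s)) for s in sentences]
    print('{}: {} sentences, average {:.1f} words per sentence'.format(
        label, len(sentences), sum(lengths) / len(lengths)))

text_a = "NLTK is a leading platform. It works with human language data."
text_b = "Short text. Very short. Indeed."
describe(text_a, 'Article A')
describe(text_b, 'Article B')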