Example #1
    # Fit a RandomForestClassifier to the training set
    classifier = RandomForestClassifier(n_estimators=20, criterion='entropy', random_state=0)  # 0.841
    classifier.fit(X_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(X_test)

    # Accuracy score
    accuracy = accuracy_score(y_test, y_pred)

    print('Model Accuracy = {}'.format(accuracy))

    # Making the Confusion Matrix

    cm = confusion_matrix(y_test, y_pred)

    return cm


if __name__ == "__main__":
    cm = nltk()

    print('confusion matrix', cm)

    # data_format()
Example #2
def sttxt(request, filename, textfilepath, textfilename):
    kl = []
    service = SpeechToTextV1(
        username='******',
        password='******',
        url='https://stream.watsonplatform.net/speech-to-text/api')

    models = service.list_models().get_result()
    #print(json.dumps(models, indent=2))

    model = service.get_model('en-US_NarrowbandModel').get_result()
    #print(json.dumps(model, indent=2))
    # with open(join(dirname(__file__), filename),'rb') as audio_file:
    print(filename)
    with open(filename, 'rb') as audio_file:
        with io.open('data.json', 'w', encoding='utf8') as outfile:
            str_ = json.dumps(
                service.recognize(
                    audio=audio_file,
                    content_type='audio/mp3',
                    speaker_labels=True).get_result(),
                indent=2)
            outfile.write(to_unicode(str_))


    # Read JSON file
    with open('data.json') as data_file:
        data_loaded = json.load(data_file)
    spea = []
    l = 0

    for i in data_loaded['speaker_labels']:
        temp = ""
        if l == int(i['speaker']):
            for z in range(math.floor(i['from']), math.ceil(i['to'])):
                for v in data_loaded['results']:
                    for m in v['alternatives']:
                        for n in m['timestamps']:
                            if n[1] >= i['from'] and n[2] <= i['to']:
                                if temp != n[0]:  # equality check, not identity; skip immediate repeats
                                    spea.append(n[0])
                                    temp = n[0]

                                #print(spea)

        else:
            str1 = ' '.join(spea)
            print(textfilepath + 'transcripts/' + textfilename + '/' +
                  textfilename + ".txt")
            with io.open(textfilepath + 'transcripts/' + textfilename + '/' +
                         textfilename + ".txt",
                         'a',
                         encoding='utf8') as outfile:

                # print("Speaker "+str(l)+": "+str1+"\n")
                str_ = outfile.write(" Speaker " + str(l) + ": " + str1 + "\n")

                kl.append("Speaker " + str(l) + ": " + str1 + "\n")
            l = i['speaker']
            del spea[0:len(spea) - 1]  # note: this keeps the last collected word in the buffer

    str1 = ' '.join(spea)
    with io.open(textfilepath + 'transcripts/' + textfilename + '/' +
                 textfilename + ".txt",
                 'a',
                 encoding='utf8') as outfile:
        # print("Speaker "+str(l)+": "+str1+"\n")
        str_ = outfile.write(" Speaker " + str(l) + ": " + str1 + "\n")
        kl.append("Speaker " + str(l) + ": " + str1 + "\n")

    u = summary_function(textfilepath + 'transcripts/' + textfilename + '/' +
                         textfilename + ".txt")

    print('vvvvvvvvvvvvvvvvvvv summarize VVVVVVVVVVVVVVVv')

    print(u)
    print('------------------- decisions ------------------------------------')
    decision = nltk(textfilepath + 'transcripts/' + textfilename + '/' +
                    textfilename + ".txt")

    print(decision)
    request.session['summ'] = u
    request.session['trans1'] = kl
    request.session['deci'] = decision

    context = {
        'summarize': u,
        'trans': kl,
    }

    return render(request, 'Analyse/transcript.html', context)
"""
Word Scoring Program
July 01, 2012

Written by Justin Black ([email protected])

This file scores a list of words based on how frequently each word is used in another data source.

It takes a text file containing a list of words you want to score
and outputs a new text file where each word in that list is given a frequency score.
The frequency depends on which data source (corpus = body of text) you are using to
score it. This program uses 6 such data sources.

I assign no additional copyright to the combined database and the
software is explicitly being placed in the public domain. I
would appreciate credit for my work.


---------------------
INTENDED USE
---------------------
I have a list of words but I want them sorted by "popularity".
Popular words are words we use often in speech or text; unpopular words are typically
obscure dictionary/scrabble words.

I can then filter out words that are too obscure by keeping
only the words with a score >= mylimit.
You pick the mylimit value by opening one of the scored text files
and picking the cut-off point you prefer.
"""


def getnltk():
    somejsonfile = request.get_json()
    subject = nltk(somejsonfile['data'])
    relevantNews = getrelevantNews(subject, somejsonfile['data'])
    return jsonify({"data": relevantNews, "subject": subject})
   How difficult is an article from The Economist to read?

   Is the sentiment of a Trump tweet positive or negative?

   Which lyrics in a pop song rhyme with each other?

   ...

As an online learning designer, I have to process, tally, and analyze large amounts of unfamiliar English text every day.

Ever since I first stepped through the door into Python, I have kept exploring tools to support this work.

In this article I summarize the tools I have used:

1. nltk (word_tokenize, sent_tokenize, corpus.cmudict, pos_tag)
2. SpaCy
3. textstat
4. textblob

To keep things interesting, each tool is introduced through the problems it solves rather than as an exhaustive feature list.

If you want to dig deeper, follow the documentation links below.

1. nltk (word_tokenize, sent_tokenize)

NLTK is short for Natural Language Toolkit, a suite of Python libraries and programs for natural language processing of English text.
Documentation: https://www.nltk.org/
NLTK Book: https://www.nltk.org/book/

word_tokenize and sent_tokenize split a text into words and into sentences, respectively.
https://www.nltk.org/api/nltk.tokenize.html
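
As a quick illustration, a short snippet like the following splits a made-up sample text into sentences and words; it assumes nltk is installed and downloads the 'punkt' tokenizer data on first use.

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')  # tokenizer models; only needed once

text = "NLTK makes tokenizing easy. It splits text into sentences and words."
print(sent_tokenize(text))  # ['NLTK makes tokenizing easy.', 'It splits text into sentences and words.']
print(word_tokenize(text))  # ['NLTK', 'makes', 'tokenizing', 'easy', '.', ...]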

Problem: compare the length of two articles (the number of sentences in each, and the length of each sentence).
We often have to deal with large amounts of unfamiliar text without knowing how long it is. With nltk we can count the number of sentences in each text and the number of words in each sentence. The steps are roughly as follows (see the sketch below):
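
A minimal sketch of that approach, assuming nltk and its punkt data are available; the two sample strings are placeholders, not real articles:

from nltk.tokenize import word_tokenize, sent_tokenize

def text_stats(text):
    # Split into sentences, then count the words in each sentence.
    sentences = sent_tokenize(text)
    return len(sentences), [len(word_tokenize(s)) for s in sentences]

text_a = "Short piece. Only two sentences here."
text_b = "A slightly longer piece. It has three sentences. The last one ends here."

for name, text in (('A', text_a), ('B', text_b)):
    n_sentences, words_per_sentence = text_stats(text)
    print('Text {}: {} sentences, words per sentence = {}'.format(name, n_sentences, words_per_sentence))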