Code example #1
import nltk
from nltk.corpus import sinica_treebank

print(sinica_treebank.sents())
print(sinica_treebank.parsed_sents()[27])
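These sinica_treebank examples assume the corpus data is already installed; a minimal setup sketch (the same download call appears commented out in example #10, and the expected first three sentences are the ones checked in examples #2/#4/#6/#7):

import nltk
nltk.download('sinica_treebank')              # fetch the corpus sample into nltk_data
from nltk.corpus import sinica_treebank
print(list(sinica_treebank.sents()[:3]))      # [['一'], ['友情'], ['嘉珍', '和', '我', '住在', '同一條', '巷子']]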
Code example #2
File: test_corpora.py  Project: Geolem/nltk
def test_sents(self):
    first_3_sents = sinica_treebank.sents()[:3]
    self.assertEqual(
        first_3_sents, [['一'], ['友情'], ['嘉珍', '和', '我', '住在', '同一條', '巷子']]
    )
Code example #3
File: precision_segmenter.py  Project: abr1989/PFC
from __future__ import division
import nltk
from nltk.corpus import sinica_treebank as s
import Segmenter
from Segmenter import sent_segment


frases= s.sents()

def haz_uno(sentence):
    ## 'sentence' is a list of characters that has not been decoded from utf-8
    devolver= ""
    for i in sentence:
        devolver = devolver+ str(i).decode('utf-8')
    return devolver

def compare_both(a,b):
    ## 'a' will always be the corpus sentence
    ## 'b' will be the sentence segmented by my tokenizer
    count= 0
    total= 0
    if len(a) <= len(b):
        for i in range(len(a)):
            if a[i] == b[i]:
                count = count + 1
    else:
        for i in range(len(b)):
            if a[i] == b[i]:
                count = count + 1
    return count/int(len(a))
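A minimal usage sketch for compare_both above, using the corpus sentence from the tests in examples #2 and #4 as the reference and a hypothetical segmenter output (in the real project, the second argument would be the sentence produced by Segmenter.sent_segment):

reference = ['嘉珍', '和', '我', '住在', '同一條', '巷子']    # sinica_treebank.sents()[2]
hypothesis = ['嘉珍', '和', '我', '住在', '同一', '條巷子']   # hypothetical segmentation; 4 of 6 positions match
print(compare_both(reference, hypothesis))                    # 4 / 6 = 0.666...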
Code example #4
def test_sents(self):
    first_3_sents = sinica_treebank.sents()[:3]
    self.assertEqual(
        first_3_sents,
        [['一'], ['友情'], ['嘉珍', '和', '我', '住在', '同一條', '巷子']]
    )
Code example #5
File: ch5_6.py  Project: xenron/sandbox-da-python
import nltk
from nltk.corpus import sinica_treebank
print(sinica_treebank.sents())
print(sinica_treebank.parsed_sents()[27])
Code example #6
def test_sents(self):
    first_3_sents = sinica_treebank.sents()[:3]
    self.assertEqual(first_3_sents,
                     [["一"], ["友情"], ["嘉珍", "和", "我", "住在", "同一條", "巷子"]])
Code example #7
File: test_corpora.py  Project: GloriousFt/TextBlob
def test_sents(self):
    first_3_sents = sinica_treebank.sents()[:3]
    self.assertEqual(first_3_sents, [["一"], ["友情"], ["嘉珍", "和", "我", "住在", "同一條", "巷子"]])
Code example #8
    def __init__(self, min_nchar, fn, lang="ENG"):
        """
        TXT_FN : path to file containing text data.
        """
        self.min_nchar = min_nchar
        self.fdict = {
            'WORD': self.sample_word,
            'LINE': self.sample_line,
            'PARA': self.sample_para
        }
        self.lang = lang
        # parse English text
        if self.lang == "ENG":
            print('Generate English Data with NLTK:PlaintextCorpusReader')
            corpus = PlaintextCorpusReader("./", fn)

            self.words = corpus.words()
            self.sents = corpus.sents()
            self.paras = corpus.paras()

        # parse Japanese text
        elif self.lang == "JPN":
            print('Generate Japanese Data with NLTK:ChasenCorpusReader')
            # convert fn into a ChaSen-format file
            _, ext = os.path.splitext(os.path.basename(fn))
            fn_chasen = fn.replace(ext, ".chasen")
            print("Convert {} into {}".format(fn, fn_chasen))

            cmd = "mecab -Ochasen {} > {}".format(fn, fn_chasen)
            print(
                "The following command was executed to convert the text into ChaSen format (for Japanese)"
            )
            print("\t{}".format(cmd))
            p = subprocess.call(cmd, shell=True)
            data = ChasenCorpusReader('./', fn_chasen, encoding='utf-8')

            self.words = data.words()
            self.sents = data.sents()
            self.paras = data.paras()

            # jp_sent_tokenizer = nltk.RegexpTokenizer(u'[^ 「」!?。]*[!?。]')
            # jp_chartype_tokenizer = nltk.RegexpTokenizer(u'([ぁ-んー]+|[ァ-ンー]+|[\u4e00-\u9FFF]+|[^ぁ-んァ-ンー\u4e00-\u9FFF]+)')
            #
            # corpus = PlaintextCorpusReader("./",
            #                              fn,
            #                              encoding='utf-8',
            #                              para_block_reader=read_line_block,
            #                              sent_tokenizer=jp_sent_tokenizer,
            #                              word_tokenizer=jp_chartype_tokenizer)
        elif self.lang == "ZHTW":
            print(
                'Generate Traditional Chinese Data with NLTK:sinica_treebank')
            self.words = []
            self.sents = []
            self.paras = []
            #data = SinicaTreebankCorpusReader('./', fn, encoding='utf-8')
            #self.words = data.words()
            #self.sents = data.sents()
            #self.paras = data.parsed_sents()
            self.words = sinica_treebank.words()
            self.sents = sinica_treebank.sents()
            self.paras = sinica_treebank.parsed_sents()
        else:
            self.words = []
            self.sents = []
            self.paras = []
            #data = SinicaTreebankCorpusReader('./', fn, encoding='utf-8')
            #self.words = data.words()
            #self.sents = data.sents()
            #self.paras = data.parsed_sents()
            self.words = sinica_treebank.words()
            self.sents = sinica_treebank.sents()
            self.paras = sinica_treebank.parsed_sents()
        # distribution over line/words for LINE/PARA:
        self.p_line_nline = np.array([0.85, 0.10, 0.05])
        self.p_line_nword = [4, 3, 12]  # normal: (mu, std)
        self.p_para_nline = [1.0, 1.0]  #[1.7,3.0] # beta: (a, b), max_nline
        self.p_para_nword = [1.7, 3.0, 10]  # beta: (a,b), max_nword

        # probability to center-align a paragraph:
        self.center_para = 0.5
Code example #9
File: Main.py  Project: abr1989/PFC
import nltk
from nltk.corpus import sinica_treebank as sinica

import Segmenter
from Segmenter import sent_segment
import PCFG
from PCFG import PCFGChino
import time

####################################
## THIS PART IS NOT IMPORTANT FOR TESTING THE PARSER
tagged_sents= sinica.tagged_sents()
sents= sinica.sents()

size= int(len(tagged_sents) * 0.9)
train_set= tagged_sents[:size]
test_set= tagged_sents[size:]
##trigram_tagger= nltk.TrigramTagger(train_set)
##score= trigram_tagger.evaluate(test_set)
print "Entrenando"
ini= time.time()
t0= nltk.DefaultTagger('Nab')
t1= nltk.UnigramTagger(train_set, backoff=t0)
t2= nltk.BigramTagger(train_set, backoff=t1)
t3= nltk.TrigramTagger(train_set, backoff=t2)
fin= time.time()
score= t3.evaluate(test_set)
print("Entrenamiento terminado ", str(fin-ini))
print "Evaluation Tagger= ",score
####################################
## Create the parser
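Once the backoff chain above has been trained, the tagger can be applied to an untagged corpus sentence; a small sketch assuming the variables from the training code above are in scope:

sample = sents[2]        # ['嘉珍', '和', '我', '住在', '同一條', '巷子']
print(t3.tag(sample))    # (word, tag) pairs; unseen n-grams fall back through t2, t1, t0 ('Nab')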
Code example #10
from nltk.corpus import treebank, ptb, sinica_treebank, conll2007, words, stopwords, names

print(treebank.fileids())  # doctest: +ELLIPSIS
print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')[0])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('ptb')
print(ptb.fileids())  # doctest: +SKIP
# download the corpus from here: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip
# then extract and place to the following location: .../nltk_data/corpora/ptb/
print(ptb.words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
print(ptb.tagged_words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
# print(ptb.categories())  # doctest: +SKIP
# print(ptb.fileids('news'))  # doctest: +SKIP
# print(ptb.words(categories=['humor', 'fiction']))  # doctest: +SKIP
# nltk.download('sinica_treebank')
print(sinica_treebank.sents())  # doctest: +SKIP
print(sinica_treebank.parsed_sents()[25])  # doctest: +SKIP
# nltk.download('conll2007')
print(conll2007.sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0].tree())  # doctest: +SKIP
# for tree in ycoe.parsed_sents('cocuraC')[:4]:
#     print(tree)  # doctest: +SKIP
# word lists and lexicons
print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS
print(stopwords.fileids())  # doctest: +ELLIPSIS
print(stopwords.words('portuguese'))  # doctest: +ELLIPSIS
# nltk.download('names')
print(names.fileids())
print(names.words('male.txt'))  # doctest: +ELLIPSIS
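The parsed_sents() calls above return nltk.Tree objects, so the standard Tree API can be used to inspect them; a short sketch assuming the sinica_treebank corpus is installed:

from nltk.corpus import sinica_treebank

tree = sinica_treebank.parsed_sents()[27]
print(tree.leaves())    # the word tokens at the leaves of the parse
print(tree.height())    # depth of the tree
print(tree.pos())       # (word, tag) pairs read off the leaf nodes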
Code example #11
File: precision.py  Project: abr1989/PFC
        s.remove(u'}')
    b= "".join(s)
##    print "B",b
    f.write('Corpus: '+str(b)+'\n')
    r1= tree2set(str(a))
##    print "Tree A: "+str(r1)
    r2= tree2set(str(b))
##    print "Tree B: "+str(r2)
    return lp_lr(r2, r1)  # parseval(r2,r1), labeled_recall(r2,r1), lp_lr(r2,r1)



## TRAIN + TEST 1000
size= 1000
frases= sinica.sents()
arboles= sinica.parsed_sents()
train= pcfg(size)
train.carga_pesos()

with open('gramatica1000total.txt','r') as g:
    gramatica=g.readlines()
    
train.carga_gramatica(gramatica)

F1= 0
f= open('t1000.txt', 'w')
##f= open('Knownwords.txt', 'w')
for i in range(size):
##    print i
    f.write(str(i)+'\n')