Beispiel #1
0
    def pretrain_model(self, space=' '):
        """Build the training corpus file from the raw text file.

        Every line of ``self.textfile`` is passed through ``tran2simple``,
        ``seperate_word`` and ``remove_word``; each non-empty result is
        written as one line of ``self.fname + '.traincorpus'``.

        Args:
            space: Separator forwarded to ``self.getTexts`` when
                ``self.textfile`` has not been set yet.

        Returns:
            The path of the training corpus file that was written.
        """
        # Produce the text file on demand, forwarding the caller's separator
        # (previously the argument was ignored and ' ' was always used).
        if self.textfile is None:
            self.textfile = self.getTexts(self.fname + '.txt', space=space)

        self.traincorpusfname = self.fname + '.traincorpus'
        # Number of input lines ("articles" in the log output) handled so far.
        count = 0
        with open(self.textfile, 'r') as icorpus, \
                open(self.traincorpusfname, 'w') as ocorpus:
            # Stream the file instead of loading every line into memory.
            for line in icorpus:
                # Convert to simplified Chinese, then segment into words.
                text = seperate_word(tran2simple(line))
                # Drop non-Chinese words from the segmented line.
                text = remove_word(line=text, encoding='utf8')
                if text:
                    ocorpus.write(text + '\n')

                count += 1
                # Periodic progress report every tCorpus.SEGSIZE lines.
                if count % tCorpus.SEGSIZE == 0:
                    self.logger.info('PreVecModel: ' + str(count) + ' articles')
        self.logger.info('PreVecModel:' + str(count) + ' articles')
        return self.traincorpusfname
Beispiel #2
0
    def pretrain_model(self, space=' '):
        """Build the training corpus file from the raw text file.

        Every line of ``self.textfile`` is passed through ``tran2simple``,
        ``seperate_word`` and ``remove_word``; each non-empty result is
        written as one line of ``self.fname + '.traincorpus'``.

        Args:
            space: Separator forwarded to ``self.getTexts`` when
                ``self.textfile`` has not been set yet.

        Returns:
            The path of the training corpus file that was written.
        """
        # Produce the text file on demand, forwarding the caller's separator
        # (previously the argument was ignored and ' ' was always used).
        if self.textfile is None:
            self.textfile = self.getTexts(self.fname + '.txt', space=space)

        self.traincorpusfname = self.fname + '.traincorpus'
        # Number of input lines ("articles" in the log output) handled so far.
        count = 0
        with open(self.textfile, 'r') as icorpus, \
                open(self.traincorpusfname, 'w') as ocorpus:
            # Stream the file instead of loading every line into memory.
            for line in icorpus:
                # Convert to simplified Chinese, then segment into words.
                text = seperate_word(tran2simple(line))
                # Drop non-Chinese words from the segmented line.
                text = remove_word(line=text, encoding='utf8')
                if text:
                    ocorpus.write(text + '\n')

                count += 1
                # Periodic progress report every tCorpus.SEGSIZE lines.
                if count % tCorpus.SEGSIZE == 0:
                    self.logger.info('PreVecModel: ' + str(count) + ' articles')
        self.logger.info('PreVecModel:' + str(count) + ' articles')
        return self.traincorpusfname
Beispiel #3
0
	def testtrans2simplefile(self):
		# Runs tran2simple over every line of self.corpus, writes the results
		# to self.corpus + '.simple', and checks that file against the
		# reference file 'transchinese.txt.simpletarget'.
		import filecmp
		simplefile = self.corpus + '.simple'
		with open(self.corpus, 'r') as icorpus, \
			open(simplefile, 'w') as ocorpus:
			# Stream the corpus instead of reading it fully into memory.
			for line in icorpus:
				ocorpus.write(tran2simple(line) + '\n')
		# shallow=False forces a real content comparison; the default shallow
		# mode may report equality from matching os.stat signatures alone.
		assert filecmp.cmp('transchinese.txt.simpletarget', simplefile, shallow=False)
Beispiel #4
0
	def testtrans2simplefile(self):
		# Runs tran2simple over every line of self.corpus, writes the results
		# to self.corpus + '.simple', and checks that file against the
		# reference file 'transchinese.txt.simpletarget'.
		import filecmp
		simplefile = self.corpus + '.simple'
		with open(self.corpus, 'r') as icorpus, \
			open(simplefile, 'w') as ocorpus:
			# Stream the corpus instead of reading it fully into memory.
			for line in icorpus:
				ocorpus.write(tran2simple(line) + '\n')
		# shallow=False forces a real content comparison; the default shallow
		# mode may report equality from matching os.stat signatures alone.
		assert filecmp.cmp('transchinese.txt.simpletarget', simplefile, shallow=False)
Beispiel #5
0
 def testtrans2simple(self):
     # Feeds a list of two strings to tran2simple and checks the returned
     # list against the expected values, which are spelled out as UTF-8
     # byte escapes.
     expected = [
         '\xe5\xbc\x80\xe6\x94\xbe\xe4\xb8\xad\xe6\x96\x87\xe8\xbd\xac\xe6\x8d\xa2\xef\xbc\x8c\xe6\x98\xaf\xe4\xb8\x80\xe4\xb8\xaa\xe8\x87\xb4\xe5\x8a\x9b\xe4\xba\x8e\xe4\xb8\xad\xe6\x96\x87\xe7\xae\x80\xe7\xb9\x81\xe8\xbd\xac\xe6\x8d\xa2\xe7\x9a\x84\xe9\xa1\xb9\xe7\x9b\xae\xef\xbc\x8c\xe6\x8f\x90\xe4\xbe\x9b\xe9\xab\x98\xe8\xb4\xa8\xe9\x87\x8f\xe8\xaf\x8d\xe5\xba\x93\xe5\x92\x8c\xe5\x87\xbd\xe6\x95\xb0\xe5\xba\x93',
         '\xe7\x9a\x84\xe9\xa1\xb9\xe7\x9b\xae\xef\xbc\x8c\xe6\x8f\x90\xe4\xbe\x9b\xe9\xab\x98\xe8\xb4\xa8\xe9\x87\x8f\xe8\xaf\x8d\xe5\xba\x93\xe5\x92\x8c\xe5\x87\xbd\xe6\x95\xb0',
     ]
     # Input sentences to convert; passed as a single list.
     traditional = [
         '開放中文轉換,是一個致力於中文簡繁轉換的項目,提供高質量詞庫和函數庫',
         '的項目,提供高質量詞庫和函數',
     ]
     converted = tran2simple(traditional)
     assert converted == expected
Beispiel #6
0
	def testtrans2simple(self):
		# Feeds a list of two strings to tran2simple and checks the returned
		# list against the expected values, which are spelled out as UTF-8
		# byte escapes.
		expected = [
			'\xe5\xbc\x80\xe6\x94\xbe\xe4\xb8\xad\xe6\x96\x87\xe8\xbd\xac\xe6\x8d\xa2\xef\xbc\x8c\xe6\x98\xaf\xe4\xb8\x80\xe4\xb8\xaa\xe8\x87\xb4\xe5\x8a\x9b\xe4\xba\x8e\xe4\xb8\xad\xe6\x96\x87\xe7\xae\x80\xe7\xb9\x81\xe8\xbd\xac\xe6\x8d\xa2\xe7\x9a\x84\xe9\xa1\xb9\xe7\x9b\xae\xef\xbc\x8c\xe6\x8f\x90\xe4\xbe\x9b\xe9\xab\x98\xe8\xb4\xa8\xe9\x87\x8f\xe8\xaf\x8d\xe5\xba\x93\xe5\x92\x8c\xe5\x87\xbd\xe6\x95\xb0\xe5\xba\x93',
			'\xe7\x9a\x84\xe9\xa1\xb9\xe7\x9b\xae\xef\xbc\x8c\xe6\x8f\x90\xe4\xbe\x9b\xe9\xab\x98\xe8\xb4\xa8\xe9\x87\x8f\xe8\xaf\x8d\xe5\xba\x93\xe5\x92\x8c\xe5\x87\xbd\xe6\x95\xb0',
		]
		# Input sentences to convert; passed as a single list.
		traditional = [
			'開放中文轉換,是一個致力於中文簡繁轉換的項目,提供高質量詞庫和函數庫',
			'的項目,提供高質量詞庫和函數',
		]
		converted = tran2simple(traditional)
		assert converted == expected