Ejemplo n.º 1
0
    def train(self, fpath):
        """Train the pinyin model from the files found under ``fpath``.

        Every file returned by ``self.filelist(fpath)`` is read as UTF-8.
        Each line is passed through ``safe_input`` and split on whitespace;
        ``self.trie.add`` is then called with the first token and the list
        of remaining tokens.  Lines that yield no tokens are skipped.

        :param fpath: location handed to ``self.filelist``.
        """
        for fname in self.filelist(fpath):
            with io.open(fname, 'r', encoding='utf-8') as f:
                for line in f:
                    words = safe_input(line).split()
                    if not words:
                        # Blank line: indexing ``words[0]`` would raise
                        # IndexError and abort the whole training run.
                        continue
                    self.trie.add(words[0], words[1:])
Ejemplo n.º 2
0
    def load_data(self, fpath):
        """Yield the first token of every non-blank line in the data files.

        The files come from ``filelist(fpath)`` and are read as UTF-8.  For
        each line holding at least one whitespace-separated token, the first
        token is passed through ``safe_input`` and yielded; lines without
        any token are skipped.
        """
        for path in filelist(fpath):
            with io.open(path, 'r', encoding='utf-8') as handle:
                for raw in handle:
                    # split() with no argument already ignores leading and
                    # trailing whitespace.
                    tokens = raw.split()
                    if not tokens:
                        continue
                    yield safe_input(tokens[0])
Ejemplo n.º 3
0
 def load_data(self, fpath):
     """Read slash-delimited tokens from the files found under ``fpath``.

     Each line of every file returned by ``self.filelist(fpath)`` is passed
     through ``safe_input``.  Lines that come back empty are skipped; for
     every other line ``self.line_total`` is incremented and the line is
     split on whitespace, with each token further split on ``'/'``.

     Returns a list with one entry per kept line, each entry being a list
     of the ``'/'``-split tokens of that line.
     """
     datas = []
     for path in self.filelist(fpath):
         with io.open(path, 'r', encoding='utf-8') as handle:
             for raw in handle:
                 text = safe_input(raw)
                 if not text:
                     continue
                 self.line_total += 1
                 datas.append([item.split('/') for item in text.split()])
     return datas
Ejemplo n.º 4
0
    def load_dict(self, fpath, authfreq=False, defaultfreq=5):
        """Load a word dictionary from the files found under ``fpath``.

        Every file returned by ``self.filelist(fpath)`` is read as UTF-8.
        Each line is passed through ``safe_input`` and split on whitespace.
        Accepted layouts are:

        * ``word freq tag`` -- any further fields are ignored;
        * ``word freq``     -- when the second field is all digits, the tag
          is ``'un'``;
        * ``word tag``      -- when the second field is not all digits;
        * ``word``          -- the tag is ``'un'``.

        Lines without any field are skipped.  When a line carries no
        frequency, ``self.get_freq(word)`` is used if ``authfreq`` is true,
        otherwise ``defaultfreq``.  Every prefix of a loaded word is also
        registered in the counter with frequency 0 unless already present.

        :param fpath: location handed to ``self.filelist``.
        :param authfreq: look missing frequencies up via ``self.get_freq``.
        :param defaultfreq: frequency used when none is given and
            ``authfreq`` is false.
        :returns: ``(counter, total, word_tag)`` -- word/prefix to frequency
            mapping, the sum of all loaded frequencies, and word to tag
            mapping.
        :raises ValueError: when an entry cannot be parsed (for instance a
            frequency that ``int`` rejects); the message names the file,
            the line number and the offending line.
        """
        counter = {}
        total = 0
        word_tag = {}
        for fname in self.filelist(fpath):
            with io.open(fname, 'r', encoding='utf-8') as f:
                for idx, line in enumerate(f, 1):
                    try:
                        line = safe_input(line)
                        arr = line.split()
                        if not arr:
                            continue

                        word = arr[0]
                        freq = None
                        tag = 'un'
                        if len(arr) >= 3:
                            freq, tag = arr[1:3]
                        elif len(arr) == 2:
                            # A lone second field is a frequency when it is
                            # numeric, otherwise it is the tag.
                            if arr[1].isdigit():
                                freq = arr[1]
                            else:
                                tag = arr[1]

                        if freq is None:
                            freq = self.get_freq(word) if authfreq else defaultfreq

                        freq = int(freq)
                        # NOTE(review): a word listed more than once keeps
                        # its last frequency in ``counter`` while every
                        # occurrence is added to ``total`` -- confirm this
                        # is intended.
                        counter[word] = freq
                        total += freq
                        word_tag[word] = tag

                        # Register the proper prefixes; the word itself was
                        # stored just above.
                        for end in range(1, len(word)):
                            counter.setdefault(word[:end], 0)
                    except ValueError:
                        # Single-line message: the previous triple-quoted
                        # literal leaked a newline and indentation into the
                        # error text.
                        raise ValueError(
                            'invalid dictionary entry in %s at Line %s: %s' %
                            (fname, idx, line))

        return counter, total, word_tag
Ejemplo n.º 5
0
    def load_data(self, posfname, negfname):
        """Load the positive and negative documents from disk.

        ``posfname`` and ``negfname`` may each be a single file or a
        directory.  For a directory, the files of every sub-tree directory
        that has no sub-directories of its own are read.  Each line of each
        file is passed through ``safe_input`` and then ``seg``.

        Returns ``(pos_docs, neg_docs)``, two lists holding one ``seg``
        result per line read.
        """
        def iter_paths(location):
            # A non-directory path is yielded unchanged.
            if not os.path.isdir(location):
                yield location
                return
            for root, dirs, files in os.walk(location):
                if dirs:
                    # Only directories without sub-directories contribute.
                    continue
                for name in files:
                    yield os.sep.join([root, name])

        def read_docs(location):
            docs = []
            for path in iter_paths(location):
                with io.open(path, 'r', encoding='utf-8') as handle:
                    for raw in handle:
                        docs.append(seg(safe_input(raw)))
            return docs

        pos_docs = read_docs(posfname)
        neg_docs = read_docs(negfname)
        return pos_docs, neg_docs