def train(self, fpath):
    """Train the pinyin model from the files enumerated for ``fpath``.

    Each file is read as UTF-8, line by line.  Every line is passed
    through ``safe_input`` and split on whitespace; the first token and
    the list of the remaining tokens are handed to ``self.trie.add``.
    Lines that yield no tokens (blank or whitespace-only) are skipped.

    Args:
        fpath: Location passed to ``self.filelist`` to obtain the
            training file names.
    """
    for fname in self.filelist(fpath):
        with io.open(fname, 'r', encoding='utf-8') as f:
            for line in f:
                words = safe_input(line).split()
                if not words:
                    # A blank line has no key token; indexing words[0]
                    # would raise IndexError and abort the whole training.
                    continue
                self.trie.add(words[0], words[1:])
def load_data(self, fpath):
    """Yield the first token of every non-blank line of the data files.

    The file names come from ``filelist(fpath)``; each file is read as
    UTF-8.  A line is stripped and split on whitespace, and the first
    resulting token is passed through ``safe_input`` before being
    yielded.  Lines without any token are ignored.

    Args:
        fpath: Location passed to ``filelist``.

    Yields:
        The result of ``safe_input`` applied to the first token of a line.
    """
    for path in filelist(fpath):
        with io.open(path, 'r', encoding='utf-8') as handle:
            for raw in handle:
                tokens = raw.strip().split()
                if not tokens:
                    continue
                yield safe_input(tokens[0])
def load_data(self, fpath):
    """Read ``/``-separated token data from the files listed for ``fpath``.

    Each file is read as UTF-8.  Every line is passed through
    ``safe_input``; lines whose result has length zero are skipped.  For
    each remaining line, ``self.line_total`` is incremented and the line
    is split on whitespace, with every token further split on ``'/'``.

    Args:
        fpath: Location passed to ``self.filelist``.

    Returns:
        A list with one entry per kept line; each entry is a list of the
        ``'/'``-split pieces of that line's whitespace-separated tokens.
    """
    datas = []
    for path in self.filelist(fpath):
        with io.open(path, 'r', encoding='utf-8') as handle:
            for raw in handle:
                text = safe_input(raw)
                if len(text) > 0:
                    self.line_total += 1
                    datas.append([token.split('/') for token in text.split()])
    return datas
def load_dict(self, fpath, authfreq=False, defaultfreq=5):
    """Build frequency and tag tables from whitespace-separated dictionary files.

    Each file listed by ``self.filelist(fpath)`` is read as UTF-8.  After
    ``safe_input`` a line is split on whitespace and interpreted as:

    * three or more fields: ``word freq tag`` (extra fields are ignored);
    * two fields: ``word freq`` when the second field is all digits
      (tag becomes ``'un'``), otherwise ``word tag``;
    * one field: ``word`` alone, with tag ``'un'``;
    * no fields: the line is skipped.

    When a line carries no frequency, ``self.get_freq(word)`` is used if
    ``authfreq`` is true, else ``defaultfreq``.  Every prefix of a word is
    also registered in the frequency table with value 0 unless the prefix
    is already present.

    Args:
        fpath: Location passed to ``self.filelist``.
        authfreq: Whether to ask ``self.get_freq`` for missing frequencies.
        defaultfreq: Frequency used for entries without one when
            ``authfreq`` is false.

    Returns:
        A ``(counter, total, word_tag)`` tuple: the word/prefix -> frequency
        dict, the sum of the frequencies of all parsed entries, and the
        word -> tag dict.

    Raises:
        ValueError: If processing a line raises ``ValueError``; the message
            names the file, the 1-based line number and the line.
    """
    counter = {}
    word_tag = {}
    total = 0
    for fname in self.filelist(fpath):
        with io.open(fname, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f, 1):
                try:
                    line = safe_input(line)
                    fields = line.split()
                    if not fields:
                        continue

                    word = fields[0]
                    tag = 'un'
                    freq = None
                    need_freq = True
                    if len(fields) >= 3:
                        freq, tag = fields[1], fields[2]
                        need_freq = False
                    elif len(fields) == 2:
                        second = fields[1]
                        if second.isdigit():
                            freq = second
                            need_freq = False
                        else:
                            tag = second

                    if need_freq:
                        # No usable frequency on the line: fall back.
                        if authfreq:
                            freq = self.get_freq(word)
                        else:
                            freq = defaultfreq

                    freq = int(freq)
                    counter[word] = freq
                    total += freq
                    word_tag[word] = tag

                    # Register each prefix of the word (the word itself
                    # is already present, so it is left untouched).
                    for end in range(1, len(word) + 1):
                        counter.setdefault(word[:end], 0)
                except ValueError:
                    raise ValueError(
                        '''invalid dictionary entry in %s at Line %s: %s'''
                        % (fname, idx, line))
    return counter, total, word_tag
def load_data(self, posfname, negfname):
    """Load the positive and negative documents from files or directories.

    Each argument may be a single file or a directory.  For a directory,
    the tree is walked and only files located in directories that have no
    subdirectories are used.  Files are read as UTF-8; every line is
    passed through ``safe_input`` and then ``seg``, and the result is
    collected as one document.

    Args:
        posfname: File or directory for the first (positive) document set.
        negfname: File or directory for the second (negative) document set.

    Returns:
        A ``(pos_docs, neg_docs)`` tuple of lists, one entry per line read.
    """
    def iter_files(path):
        # A plain path is yielded unchanged; a directory is expanded.
        if not os.path.isdir(path):
            yield path
            return
        for root, dirs, files in os.walk(path):
            if dirs:
                # Only directories without subdirectories contribute files.
                continue
            for name in files:
                yield os.sep.join([root, name])

    def read_docs(path):
        # Collect seg(safe_input(line)) for every line of every file.
        docs = []
        for fname in iter_files(path):
            with io.open(fname, 'r', encoding='utf-8') as handle:
                for line in handle:
                    docs.append(seg(safe_input(line)))
        return docs

    pos_docs = read_docs(posfname)
    neg_docs = read_docs(negfname)
    return pos_docs, neg_docs