def main():
    from utils import get_sents
    from dataset import Dset
    from decoder import ViterbiDecoder
    import featchar, rep

    trn, dev, tst = get_sents('toy')
    dset = Dset('toy')

    # build char-level representations for every training sentence
    r = rep.Repstd()
    for sent in trn:
        sent.update({
            'cseq': r.get_cseq(sent),
            'wiseq': r.get_wiseq(sent),
            'tseq': r.get_tseq(sent)
        })

    r.pprint(trn[0])
    print r.pprint(trn[1])
    print rep.get_ts_bio(trn[0]['wiseq'], trn[0]['tseq'])

    feat = featchar.Feat('basic')
    feat.fit(dset)

    # decode one sentence with random scores to sanity-check the decoder
    vdecoder = ViterbiDecoder(trn, feat)
    vdecoder.pprint()

    sent = trn[0]
    vdecoder.decode(sent, randlogprob(sent, feat.NC), debug=True)
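# randlogprob is called above but not defined in this snippet. A minimal
# sketch, assuming it returns a random (len(cseq) x number-of-classes)
# log-probability matrix used only to exercise ViterbiDecoder.decode:
import numpy as np

def randlogprob(sent, nc):
    # hypothetical helper: uniform random scores, log-normalized per position
    scores = np.random.rand(len(sent['cseq']), nc)
    return np.log(scores / scores.sum(axis=1, keepdims=True))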
def __init__(self, model_file):
    # restore args and trained parameters from an .npz model dump
    dat = np.load(model_file)
    args = dat['argsd'].tolist()
    rnn_param_values = dat['rnn_param_values'].tolist()

    trn, dev, tst = get_sents(args['lang'])

    repclass = getattr(rep, 'Rep' + args['rep'])
    repobj = repclass()
    for d in (trn, dev, tst):
        for sent in d:
            sent.update({
                'cseq': repobj.get_cseq(sent),
                'wiseq': repobj.get_wiseq(sent),
                'tseq': repobj.get_tseq(sent)
            })

    trn = sorted(trn, key=lambda sent: len(sent['cseq']))
    dev = sorted(dev, key=lambda sent: len(sent['cseq']))
    tst = sorted(tst, key=lambda sent: len(sent['cseq']))

    self.feat = featchar.Feat(args['feat'])
    self.feat.fit(trn, dev, tst)
    self.vdecoder = decoder.ViterbiDecoder(trn, self.feat)

    batcher = Batcher(args['n_batch'], self.feat)  # batch size 1
    devdat = batcher.get_batches(dev)
    tstdat = batcher.get_batches(tst)

    rdnn = RNN(self.feat.NC, self.feat.NF, args)
    cost, dev_predictions = rdnn.predict(devdat)
    cost, tst_predictions = rdnn.predict(tstdat)

    self.predictions = {'dev': dev_predictions, 'tst': tst_predictions}
    self.dset = {'dev': dev, 'tst': tst}
    self.repobj = repobj
    self.reporter = exper.Reporter(self.feat, rep.get_ts_bio)

    print rdnn.l_soft_out.get_params()
    print rdnn.blayers[0][0].get_params()

    # load the saved weights into the network
    params = lasagne.layers.get_all_param_values(rdnn.layers[-1])
    print map(np.shape, params)
    lasagne.layers.set_all_param_values(rdnn.layers[-1],
                                        rnn_param_values[:len(params)])

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    shandler = logging.StreamHandler()
    shandler.setLevel(logging.INFO)
    logger.addHandler(shandler)

    validator = Validator(trn, dev, tst, batcher, self.reporter)
    validator.validate(rdnn, args, self.vdecoder)
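# The model file read above is evidently an .npz archive with 'argsd' and
# 'rnn_param_values' entries. A minimal sketch of how such a file could be
# written (save_model itself is hypothetical, not part of this codebase, and
# assumes NumPy pickling defaults that allow object arrays on load):
import numpy as np
import lasagne

def save_model(fname, argsd, rdnn):
    # the args dict and the ragged list of weight arrays are stored as
    # pickled object arrays; .tolist() on load recovers the Python objects
    param_values = lasagne.layers.get_all_param_values(rdnn.layers[-1])
    np.savez(fname, argsd=argsd, rnn_param_values=param_values)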
def tez_datasets_pos():
    langs = ['eng-pos', 'fin-pos', 'deu-pos', 'spa-pos', 'pos', 'chu']
    dsetnames = ['trn', 'dev', 'tst']
    data = dict((lang,
                 dict((dname, dset)
                      for dname, dset in zip(dsetnames, get_sents(lang))))
                for lang in langs)

    # sentence counts per language and split
    table = []
    for l in langs:
        table.append([l] + map(len, [data[l][dname] for dname in dsetnames]))
    print tabulate(np.array(table).T,
                   headers=['#sent'] + dsetnames,
                   tablefmt='latex')
    print

    # token counts per split and language
    table = []
    for dname in dsetnames:
        table.append(
            [dname] +
            [sum(len(sent['ws']) for sent in data[l][dname]) for l in langs])
    print tabulate(table, headers=['#token'] + langs)
    print

    # input/output alphabet sizes: training characters vs IO tags
    table = []
    for l in langs:
        char_set = set(c for sent in data[l]['trn'] for w in sent['ws']
                       for c in w)
        tag_set = set(t for dname in dsetnames for sent in data[l][dname]
                      for t in encoding.any2io(sent['ts']))
        table.append(['%s' % l, len(char_set), len(tag_set)])
    print tabulate(table,
                   headers=['i/o'] + ['input', 'output'],
                   tablefmt='latex')
    print

    # unknown-word statistics on the test split
    table = []
    # for l, dname in product(langs,('dev','tst')):
    for l in langs:
        dname = 'tst'
        vdst = get_vocab(data[l][dname])
        vsrc = get_vocab(data[l]['trn'])
        vdiff = vdst.difference(vsrc)
        # % of unique test words unseen in training
        uperc = len(vdiff) / float(len(vdst)) * 100
        cnt = Counter(w for sent in data[l][dname]
                      for w, t in zip(sent['ws'], sent['ts']) if t != 'O')
        # % of in-phrase (non-O) tokens that are unseen words
        pperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100
        cnt = Counter(w for sent in data[l][dname] for w in sent['ws'])
        # % of all tokens that are unseen words
        cperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100
        table.append([l + '-' + dname] + [uperc, pperc, cperc])
    print tabulate(np.array(table).T,
                   headers=['unk', 'unique', 'phrase', 'corpus'],
                   tablefmt='latex',
                   floatfmt='.2f')
def __init__(self, lang='eng', level='char', tagging='bio', breaktrn=False,
             captrn=500, sample=0, charrep='std', sort=True, **kwargs):
    self.level = level
    self.tagging = tagging

    trn, dev, tst = utils.get_sents(lang)

    # pick the character representation class by name, e.g. Repstd
    repclass = getattr(rep, 'Rep' + charrep)
    repobj = repclass()
    for d in (trn, dev, tst):
        for sent in d:
            sent.update({
                'cseq': repobj.get_cseq(sent),
                'wiseq': repobj.get_wiseq(sent),
                'tseq': repobj.get_tseq(sent)
            })
            # x/y are char sequences at the char level, words/tags otherwise
            sent['x'] = sent['cseq'] if level == 'char' else sent['ws']
            sent['y'] = sent['tseq'] if level == 'char' else sent['ts']

    if captrn:  # drop overlong training sentences
        trn = filter(lambda sent: len(sent['ws']) < captrn, trn)

    if sample > 0:  # subsample the training set (sample is in thousands)
        trn_size = sample * 1000
        trn = utils.sample_sents(trn, trn_size)

    if sort:
        trn = sorted(trn, key=lambda sent: len(sent['x']))
    dev = sorted(dev, key=lambda sent: len(sent['x']))
    tst = sorted(tst, key=lambda sent: len(sent['x']))

    ntrnsent, ndevsent, ntstsent = list(map(len, (trn, dev, tst)))
    logging.info('# of sents trn, dev, tst: {} {} {}'.format(
        ntrnsent, ndevsent, ntstsent))

    for dset, dname in zip((trn, dev, tst), ('trn', 'dev', 'tst')):
        slens = [len(sent['x']) for sent in dset]
        max_len, min_len, avg_len, std_len = (max(slens), min(slens),
                                              np.mean(slens), np.std(slens))
        logging.info(
            'input: {}\tmaxlen: {} minlen: {} avglen: {:.2f} stdlen: {:.2f}'
            .format(dname, max_len, min_len, avg_len, std_len))

    self.trn, self.dev, self.tst = trn, dev, tst
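# Hypothetical usage of the dataset constructor above (the enclosing class
# name is not shown in this excerpt; 'Dset' is assumed from the first snippet):
#   dset = Dset(lang='eng', level='char', sample=10)  # 10k training sents
#   sent = dset.trn[0]
#   # sent['x'] is the char sequence, sent['y'] the char-level tag sequence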
def main():
    langs = ['eng', 'deu', 'spa', 'ned', 'tr', 'cze', 'ger', 'arb0', 'ita']
    # langs = ['eng', 'deu']
    dsetnames = ['trn', 'dev', 'tst']
    data = dict((lang,
                 dict((dname, dset)
                      for dname, dset in zip(dsetnames, get_sents(lang))))
                for lang in langs)

    # tag inventory of each training set
    for l in langs:
        print l, sorted(set(t for sent in data[l]['trn'] for t in sent['ts']))
    print

    # number of training sentences longer than 500 characters
    table = []
    for l in langs:
        table.append([
            l,
            sum(1 for sent in data[l]['trn']
                if len(' '.join(sent['ws'])) > 500)
        ])
    print tabulate(table)

    # sentence counts
    table = []
    for dname in dsetnames:
        table.append([dname] + map(len, [data[l][dname] for l in langs]))
    print tabulate(table, headers=['#sent'] + langs, tablefmt='latex')
    print

    # token counts
    table = []
    for dname in dsetnames:
        table.append(
            [dname] +
            [sum(len(sent['ws']) for sent in data[l][dname]) for l in langs])
    print tabulate(table, headers=['#token'] + langs)
    print

    # character counts
    table = []
    for dname in dsetnames:
        table.append([dname] + [
            float(
                sum(
                    len([c for w in sent['ws'] for c in w])
                    for sent in data[l][dname])) for l in langs
        ])
    print tabulate(table, headers=['#char'] + langs, floatfmt='.1e')
    print

    # words per sentence
    table = []
    for l in langs:
        # nchar_sents = [sum(1 for w in sent['ws']) for sent in chain(*data[l].values())]
        for dname in dsetnames:
            nword_sents = [
                sum(1 for w in sent['ws']) for sent in data[l][dname]
            ]
            table.append(['{}-{}'.format(l, dname)] + [
                int(f(nword_sents)) if len(nword_sents) else 0
                for f in (np.min, np.max, np.mean, np.std)
            ])
        table.append(['...'] * 5)
    print tabulate(table,
                   headers=['#word per sent'] + ['min', 'max', 'mean', 'std'])
    print

    # characters per sentence
    table = []
    for l in langs:
        # nchar_sents = [sum(1 for c in ' '.join(sent['ws'])) for sent in chain(*data[l].values())]
        for dname in dsetnames:
            nchar_sents = [
                sum(1 for c in ' '.join(sent['ws']))
                for sent in data[l][dname]
            ]
            table.append(['{}-{}'.format(l, dname)] + [
                int(f(nchar_sents)) if len(nchar_sents) else 0
                for f in (np.min, np.max, np.mean, np.std)
            ])
        table.append(['...'] * 5)
    print tabulate(table,
                   headers=['#char per sent'] + ['min', 'max', 'mean', 'std'])
    print

    # vocabulary sizes
    table = []
    for dname in dsetnames:
        table.append([dname] +
                     [len(get_vocab(data[l][dname])) for l in langs])
    print tabulate(table, headers=['size(vocab)'] + langs)
    print

    # unknown-word statistics on dev and tst
    table = []
    for l, dname in product(langs, ('dev', 'tst')):
        vdst = get_vocab(data[l][dname])
        vsrc = get_vocab(data[l]['trn'])
        vdiff = vdst.difference(vsrc)
        # % of unique words unseen in training
        uperc = len(vdiff) / float(len(vdst)) * 100
        cnt = Counter(w for sent in data[l][dname]
                      for w, t in zip(sent['ws'], sent['ts']) if t != 'O')
        # % of in-phrase (non-O) tokens that are unseen words
        pperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100
        cnt = Counter(w for sent in data[l][dname] for w in sent['ws'])
        # % of all tokens that are unseen words
        cperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100
        table.append([l + '-' + dname] + [uperc, pperc, cperc])
    print tabulate(table,
                   headers=['unk', 'unique', 'phrase', 'corpus'],
                   floatfmt='.2f')

    # ceiling scores for the IO encoding: evaluate IO-collapsed gold tags
    # against the original gold tags
    table = []
    for l, dname in product(langs, ('dev', 'tst')):
        dset = data[l][dname]
        ts_gold = [sent['ts'] for sent in dset]
        ts_pred = [encoding.any2io(sent['ts']) for sent in dset]
        r1, r2 = conlleval(ts_gold, ts_pred)
        table.append([l + '-' + dname] + map(str, r1))
    print tabulate(table, headers=['io-ideal', 'wacc', 'pre', 'rec', 'f1'])
    print
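# encoding.any2io collapses chunk encodings to plain IO tags; a hypothetical
# illustration of the mapping assumed above:
#   ['B-PER', 'I-PER', 'O', 'B-LOC'] -> ['I-PER', 'I-PER', 'O', 'I-LOC']
# The 'io-ideal' row therefore reports the best score reachable by a tagger
# restricted to IO labels, since adjacent same-type chunks merge under IO.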
def paper():
    # langs = ['eng', 'deu', 'spa', 'ned', 'tr', 'cze', 'ger', 'arb', 'ita']
    # langs = ['arb0', 'cze', 'ned', 'eng', 'deu', 'spa', 'tr']
    langs = ['cze-pos', 'eng-pos', 'deu-pos', 'spa-pos', 'pos', 'chu']
    dsetnames = ['trn', 'dev', 'tst']
    data = dict((lang,
                 dict((dname, dset)
                      for dname, dset in zip(dsetnames, get_sents(lang))))
                for lang in langs)

    # sentence counts
    table = []
    for l in langs:
        table.append([l] + map(len, [data[l][dname] for dname in dsetnames]))
    print tabulate(np.array(table).T,
                   headers=['#sent'] + dsetnames,
                   tablefmt='latex')
    print

    # characters per sentence, pooled over all splits
    table = []
    for l in langs:
        # nchar_sents = [sum(1 for c in ' '.join(sent['ws'])) for sent in chain(*data[l].values())]
        # for dname in dsetnames:
        nchar_sents = [
            sum(1 for c in ' '.join(sent['ws'])) for dname in dsetnames
            for sent in data[l][dname]
        ]
        # table.append(['%s'%l]+[int(f(nchar_sents)) for f in (np.min,np.max,np.mean,np.std)])
        table.append(['%s' % l] +
                     [int(f(nchar_sents)) for f in (np.mean, np.std)])
    print tabulate(table,
                   headers=['#char per sent'] + ['mean', 'std'],
                   tablefmt='latex')
    print

    # alphabet sizes with characters taken from trn+dev (+1 for unknown char)
    table = []
    for l in langs:
        # char_set = set(c for dname in dsetnames for sent in data[l][dname] for c in ''.join(sent['ws']))
        char_set = set(c for dname in ('trn', 'dev')
                       for sent in data[l][dname] for w in sent['ws']
                       for c in w)
        # char_set = set(c for sent in data[l]['trn'] for w in sent['ws'] for c in w)
        tag_set = set(t for dname in dsetnames for sent in data[l][dname]
                      for t in encoding.any2io(sent['ts']))
        table.append(['%s' % l, len(char_set) + 1, len(tag_set)])
    print tabulate(table,
                   headers=['i/o'] + ['input', 'output'],
                   tablefmt='latex')
    print

    # alphabet sizes with characters taken from trn only
    table = []
    for l in langs:
        char_set = set(c for sent in data[l]['trn'] for w in sent['ws']
                       for c in w)
        tag_set = set(t for dname in dsetnames for sent in data[l][dname]
                      for t in encoding.any2io(sent['ts']))
        table.append(['%s' % l, len(char_set), len(tag_set)])
    print tabulate(table,
                   headers=['i/o'] + ['input', 'output'],
                   tablefmt='latex')
    print

    # unknown-word statistics on the test split
    table = []
    # for l, dname in product(langs,('dev','tst')):
    for l in langs:
        dname = 'tst'
        vdst = get_vocab(data[l][dname])
        vsrc = get_vocab(data[l]['trn'])
        vdiff = vdst.difference(vsrc)
        uperc = len(vdiff) / float(len(vdst)) * 100
        cnt = Counter(w for sent in data[l][dname]
                      for w, t in zip(sent['ws'], sent['ts']) if t != 'O')
        pperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100
        cnt = Counter(w for sent in data[l][dname] for w in sent['ws'])
        cperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100
        table.append([l + '-' + dname] + [uperc, pperc, cperc])
    print tabulate(np.array(table).T,
                   headers=['unk', 'unique', 'phrase', 'corpus'],
                   tablefmt='latex',
                   floatfmt='.2f')
#     )
#     parser.add_argument("--name",
#                         default=None,
#                         type=str,
#                         required=True,
#                         )
#     parser.add_argument("--n_samples",
#                         default=None,
#                         type=int)
#     args = parser.parse_args()

for data in datasets:
    myprint(data)
    for key in ["positive", "negative"]:
        myprint(key)
        reviews = utils.read_file(data[key]["data_filepath"])
        parent_dir = os.path.dirname(data[key]["data_filepath"])
        sents = utils.get_sents(reviews)
        train_size = int(0.9 * len(sents))

        split_data_dir = os.path.join(parent_dir, "split_data")
        Path(split_data_dir).mkdir(parents=True, exist_ok=True)
        sents_filename_train = os.path.join(split_data_dir,
                                            key + "_reviews_train_sents")
        sents_filename_dev = os.path.join(split_data_dir,
                                          key + "_reviews_dev_sents")
        # sents_filename = data[key]["data_filepath"]+"_sents"
        utils.write_file(sents[:train_size], sents_filename_train)
        utils.write_file(sents[train_size:], sents_filename_dev)
        for sent in dset_part:
            for w, t in zip(sent['ws'], sent['ts']):
                src.write(('%s\t%s\n' % (w, t)).encode('utf-8'))
            src.write('\n')


def get_sample(l, k):
    # pick k random indices of l, sampling without replacement
    rand_indices = random.sample(xrange(len(l)), k)
    return [l[i] for i in rand_indices]


if __name__ == '__main__':
    random.seed(7)
    args = get_args()
    print args

    trn, dev, tst = get_sents(args['dset'])
    dset_parts = (trn, dev, tst)
    print map(len, (trn, dev, tst))

    # sample sizes are given in thousands
    strn, sdev, stst = map(get_sample, dset_parts,
                           map(lambda x: x * 1000, args['nums']))
    print map(len, (strn, sdev, stst))

    # filter out sents in sdev & stst if they contain a tag that is not in strn
    trn_tags = set(t for sent in strn for t in sent['ts'])
    sdev = filter(lambda sent: all(t in trn_tags for t in sent['ts']), sdev)
    stst = filter(lambda sent: all(t in trn_tags for t in sent['ts']), stst)
    print map(len, (strn, sdev, stst))

    dir_name = 'data/%s-sample' % args['dset']
    os.mkdir(dir_name)
        if j in 'BS':  # 'B'/'S' starts a new word
            result.append(i)
        else:  # otherwise extend the current word
            result[-1] += i
    return result


if __name__ == '__main__':
    method = 'TestOne'

    if method == 'TestOne':
        s = '造成交通事故后逃逸被吊销机动车驾驶证的'
        data_name = './data/train.utf8'
        model_name = 'tmp_crflstm.model.h5'
        modelType = 'lstmcrf'

        sentences, words = get_sents(datasets=data_name)
        vocab_size = len(words)
        max_len = 75
        id2char = {i + 1: j for i, j in enumerate(words)}  # id-to-char mapping
        char2id = {j: i for i, j in id2char.items()}  # char-to-id mapping

        res = cutTest(s, filename=model_name, batch_size=1,
                      modelType=modelType)
        print("------------------ segmentation result: ------------------")
        print(res)

    if method == 'TestBatch':
        # put the sentences to test in data.test.txt, then run the code below
        with open('data.test.txt', 'r') as fr:
            lines = fr.readlines()
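# A tiny worked example for the decode loop above, assuming BMES-style tags
# where 'B' begins a multi-char word, 'S' is a single-char word, and 'M'/'E'
# continue one:
#   chars = ['造', '成', '交', '通'], tags = ['B', 'E', 'B', 'E']
#   -> result == ['造成', '交通']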
from itertools import groupby
from collections import Counter

import utils


def get_ts_bio(wiseq, tseq):
    # index of the first character of each word (wiseq maps each char
    # position to its word index, -1 for inter-word positions)
    windxs = [
        next(g)[1] for wi, g in groupby(
            ((wi, i) for i, wi in enumerate(wiseq) if wi > -1),
            lambda x: x[0])
    ]
    ts = []
    for i in windxs:
        if tseq[i] == 'o':
            ts.append('O')
        else:
            ttype = tseq[i].split('-')[1]
            if i == 0:
                ts.append('B-{}'.format(ttype.upper()))
            elif tseq[i - 1] == tseq[i]:
                ts.append('I-{}'.format(ttype.upper()))
            else:
                ts.append('B-{}'.format(ttype.upper()))
    return ts


def is_consec(sent):
    # True if two same-type chunks are adjacent: an I- tag directly followed
    # by a B- tag of the same type
    return any(
        t1.startswith('I-') and t2.startswith('B-')
        and t1.split('-')[1] == t2.split('-')[1]
        for t1, t2 in zip(sent['ts'], sent['ts'][1:]))


if __name__ == '__main__':
    trn, dev, tst = utils.get_sents('eng')
    rep = Repstd()
    print Counter(c for sent in trn for c in rep.get_cseq(sent))
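# A tiny worked example for get_ts_bio, assuming Repstd-style char-level tags
# of the form 'b-xxx'/'i-xxx'/'o' (the exact tseq format is not shown in this
# excerpt):
#   "EU is" -> wiseq = [0, 0, -1, 1, 1]
#              tseq  = ['b-org', 'i-org', 'o', 'o', 'o']
#   get_ts_bio(wiseq, tseq) -> ['B-ORG', 'O']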