def read_instances_from_file(files, max_len=400, keep_case=False):
    """Collect instances from tab-separated files and construct vocabularies.

    Each input line is expected to be ``label<TAB>sentence``; lines with
    fewer than two tab-separated fields are skipped.

    Args:
        files: iterable of file paths to read.
        max_len: sentences longer than this (in tokens) are truncated.
        keep_case: if False, sentences are lower-cased before tokenizing.

    Returns:
        tuple of (sets, vocab, lb_vocab) where ``sets`` is a list of dicts
        with keys 'sents' (list of token lists) and 'labels' (list of str),
        ``vocab`` is the word Vocabulary and ``lb_vocab`` the label
        Vocabulary (built without default tokens).
    """
    vocab = Vocabulary()
    lb_vocab = Vocabulary(need_default=False)
    sets = []
    for file in files:
        sents, labels = [], []
        trimmed_sent = 0
        # Explicit encoding: the default is platform-dependent, which
        # silently corrupts non-ASCII corpora on some systems.
        with open(file, encoding='utf-8') as f:
            # Iterate the file object directly instead of readlines():
            # avoids materializing the whole file in memory.
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue  # malformed line: no label/sentence pair
                label = fields[0]
                sent = fields[1]
                if not keep_case:
                    sent = sent.lower()
                word_lst = sent.split()
                if len(word_lst) > max_len:
                    word_lst = word_lst[:max_len]
                    trimmed_sent += 1
                if word_lst:
                    sents.append(word_lst)
                    labels.append(label)
                    vocab.add_word_lst(word_lst)
                    lb_vocab.add_word(label)
        assert len(sents) == len(labels)
        sets.append({'sents': sents, 'labels': labels})
        # Lazy %-style args: the message is only formatted if emitted.
        logger.info('Get %s instances from file %s', len(sents), file)
        if trimmed_sent:
            logger.info('%s sentences are trimmed. Max sentence length: %s.',
                        trimmed_sent, max_len)
    logger.info('Building vocabulary...')
    # Add '<cls>' several times so it survives any min-frequency cutoff
    # applied by Vocabulary.build_vocab().
    vocab.add_word_lst(['<cls>'] * 6)
    vocab.build_vocab()
    lb_vocab.build_vocab()
    logger.info('Finished. Size of vocab: %s. # Class: %s.',
                len(vocab), len(lb_vocab))
    logger.info('<pad>: %s', vocab.to_index('<pad>'))
    logger.info('<unk>: %s', vocab.to_index('<unk>'))
    logger.info('<cls>: %s', vocab.to_index('<cls>'))
    return sets, vocab, lb_vocab