import codecs


def read_data(self, fn, verbose=True, column_no=-1):
    word_sequences = list()
    tag_sequences = list()
    with codecs.open(fn, 'r', 'utf-8') as f:
        lines = f.readlines()
    curr_words = list()
    curr_tags = list()
    for k in range(len(lines)):
        line = lines[k].strip()
        if len(line) == 0 or line.startswith('-DOCSTART-'):  # new sentence or new document
            if len(curr_words) > 0:
                word_sequences.append(curr_words)
                tag_sequences.append(curr_tags)
                curr_words = list()
                curr_tags = list()
            continue
        strings = line.split(' ')
        word = strings[0]
        tag = strings[column_no]  # by default, we take the last tag
        curr_words.append(word)
        curr_tags.append(tag)
        if k == len(lines) - 1:  # flush the final sentence
            word_sequences.append(curr_words)
            tag_sequences.append(curr_tags)
    if verbose:
        print('Loading from %s: %d samples, %d words.' %
              (fn, len(word_sequences), get_words_num(word_sequences)))
    return word_sequences, tag_sequences
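# All of the readers in this file report token counts via get_words_num, which
# is not defined in this excerpt. A minimal sketch of what it presumably does
# (count tokens across all sequences); the name matches the calls, but this
# implementation is an assumption, not the original:
def get_words_num(word_sequences):
    # total number of tokens across all sentences
    return sum(len(words) for words in word_sequences)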
def read_data(self, fn, verbose=True, column_no=-1):
    word_sequences = list()
    tag_sequences = list()
    curr_words = list()
    curr_tags = list()
    with codecs.open(fn, 'r', 'utf-8') as f:
        lines = f.readlines()
    for line in lines:
        elements = line.strip().split('\t')
        if len(elements) < 3:  # end of a sentence/document
            if len(curr_words) > 0:  # guard against consecutive separator lines
                word_sequences.append(curr_words)
                tag_sequences.append(curr_tags)
                curr_words = list()
                curr_tags = list()
            continue
        word = elements[1]
        tag = elements[2].split(':')[0]
        curr_words.append(word)
        curr_tags.append(tag)
    if len(curr_words) > 0:  # flush the last sentence if the file lacks a trailing separator
        word_sequences.append(curr_words)
        tag_sequences.append(curr_tags)
    if verbose:
        print('Loading from %s: %d samples, %d words.' %
              (fn, len(word_sequences), get_words_num(word_sequences)))
    return word_sequences, tag_sequences
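# Usage sketch for the tab-separated reader above. The column layout
# (index/word/tag, with tags like 'B-Claim:...' trimmed at ':') is inferred
# from the parsing code; the class name TabSeparatedReader and the file name
# are hypothetical.
#
#   reader = TabSeparatedReader()
#   word_sequences, tag_sequences = reader.read_data('essays.tsv')
#   # Each token line looks like '1\tword\tB-Claim:...'; blank or short
#   # lines mark sentence/document boundaries.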
import os
import pickle

import en_core_web_lg
import sklearn.model_selection


def read_train_dev_test(self, args):
    def tokenize(tokenizer, text):
        return [token.text for token in tokenizer(text.decode('ascii'))]

    path = args.data_dir
    data = []
    labels = []
    f_names = ['rt-polarity.neg', 'rt-polarity.pos']
    train_dir = os.path.join(args.data_dir, 'train')
    dev_dir = os.path.join(args.data_dir, 'dev')
    test_dir = os.path.join(args.data_dir, 'test')
    # create each directory independently so one existing directory
    # does not prevent the others from being created
    for d in (train_dir, dev_dir, test_dir):
        os.makedirs(d, exist_ok=True)
    if args.save_data:
        print("Loading tokenizer...")
        tokenizer = en_core_web_lg.load()
        for l, f in enumerate(f_names):
            for line in open(os.path.join(path, f), 'rb'):
                try:
                    line.decode('utf-8')
                except UnicodeDecodeError:  # skip lines that are not valid UTF-8
                    continue
                data.append(line.strip())
                labels.append('pos' if l else 'neg')
        # seed set in main.py
        train, test, train_labels, test_labels = sklearn.model_selection.train_test_split(
            data, labels, test_size=.2)
        train, val, train_labels, val_labels = sklearn.model_selection.train_test_split(
            train, train_labels, test_size=.1)
        word_sequences_train = [tokenize(tokenizer, text) for text in train]
        word_sequences_dev = [tokenize(tokenizer, text) for text in val]
        word_sequences_test = [tokenize(tokenizer, text) for text in test]
        pickle.dump(word_sequences_train,
                    open(os.path.join(train_dir, 'word_sequences') + '.pkl', 'wb'))
        pickle.dump(train_labels,
                    open(os.path.join(train_dir, 'labels') + '.pkl', 'wb'))
        pickle.dump(word_sequences_dev,
                    open(os.path.join(dev_dir, 'word_sequences') + '.pkl', 'wb'))
        pickle.dump(val_labels,
                    open(os.path.join(dev_dir, 'labels') + '.pkl', 'wb'))
        pickle.dump(word_sequences_test,
                    open(os.path.join(test_dir, 'word_sequences') + '.pkl', 'wb'))
        pickle.dump(test_labels,
                    open(os.path.join(test_dir, 'labels') + '.pkl', 'wb'))
    else:
        print("Loading data from .pkl's in %s" % args.data_dir)
        word_sequences_train = pickle.load(
            open(os.path.join(train_dir, 'word_sequences') + '.pkl', 'rb'))
        train_labels = pickle.load(
            open(os.path.join(train_dir, 'labels') + '.pkl', 'rb'))
        word_sequences_dev = pickle.load(
            open(os.path.join(dev_dir, 'word_sequences') + '.pkl', 'rb'))
        val_labels = pickle.load(
            open(os.path.join(dev_dir, 'labels') + '.pkl', 'rb'))
        word_sequences_test = pickle.load(
            open(os.path.join(test_dir, 'word_sequences') + '.pkl', 'rb'))
        test_labels = pickle.load(
            open(os.path.join(test_dir, 'labels') + '.pkl', 'rb'))
    if args.verbose:
        for name, word_sequences in zip(
                ['train', 'dev', 'test'],
                [word_sequences_train, word_sequences_dev, word_sequences_test]):
            print('Loading from %s: %d samples, %d words.' %
                  (name, len(word_sequences), get_words_num(word_sequences)))
    return word_sequences_train, train_labels, word_sequences_dev, val_labels, \
        word_sequences_test, test_labels
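# Usage sketch for read_train_dev_test above, assuming args comes from
# argparse with the fields the method actually reads (data_dir, save_data,
# verbose). SentimentDataIO is a hypothetical class name; rt-polarity.{neg,pos}
# are the movie-review polarity files expected under data_dir.
#
#   from types import SimpleNamespace
#   args = SimpleNamespace(data_dir='data/rt-polarity', save_data=True, verbose=True)
#   io = SentimentDataIO()
#   (word_sequences_train, train_labels,
#    word_sequences_dev, val_labels,
#    word_sequences_test, test_labels) = io.read_train_dev_test(args)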
from copy import deepcopy


def read_data(self, args, corpus_type, fn, verbose=True, column_no=-1):
    src_data = []
    data = []
    label = []
    src_data_sentence = []
    data_sentence = []
    label_sentence = []
    with codecs.open(fn, 'r', 'utf-8') as f:
        lines = f.readlines()
    for line in lines:
        # columns may be separated by '#' or by a space, so normalize to '#'
        line_t = line.replace('\n', '').replace('\r', '').replace(' ', '#').split('#')
        if len(line_t) < 3:  # sentence boundary
            if len(data_sentence) == 0:
                continue
            src_data.append(src_data_sentence)
            data.append(data_sentence)
            label.append(label_sentence)
            src_data_sentence = []
            data_sentence = []
            label_sentence = []
            continue
        src_word = line_t[0]
        word = line_t[1]
        src_data_sentence.append(src_word)
        data_sentence.append(word)
        label_sentence.append(line_t[2].split('_')[0])
    if len(data_sentence) > 0:  # flush the last sentence if the file lacks a trailing separator
        src_data.append(src_data_sentence)
        data.append(data_sentence)
        label.append(label_sentence)
    if verbose:
        print('Loading from %s: %d samples, %d words.' %
              (fn, len(data), get_words_num(data)))
    datas = deepcopy(data)
    # expand each word into a window of surrounding words,
    # with or without bigram features
    window_datas = []
    for sent in data:
        if not args.if_no_bigram:
            window_datas.append(
                self.convert2window_bigram_feature(sent, win_size=args.window_size))
        else:
            window_datas.append(
                self.convert2window_noBigram_feature(sent, win_size=args.window_size))
    return window_datas, datas, label
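# The window expansion above relies on the class's own convert2window_*_feature
# methods, which are not shown. A standalone sketch of the unigram (no-bigram)
# variant, under the assumption that each position is mapped to the win_size
# words around it, padded at the edges; illustrative, not the original:
def convert2window_noBigram_feature_sketch(sent, win_size=3, pad='<PAD>'):
    half = win_size // 2
    padded = [pad] * half + sent + [pad] * half
    # one window of win_size words per original token
    return [padded[i:i + win_size] for i in range(len(sent))]

# e.g. convert2window_noBigram_feature_sketch(['I', 'like', 'it'], 3)
# -> [['<PAD>', 'I', 'like'], ['I', 'like', 'it'], ['like', 'it', '<PAD>']]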