from nutrition.structure.data_set import DataSet
import numpy as np

# Script: fill the 'newsela' DataSet with its labels and raw texts.
# Labels come from column 1 of average_level.csv; texts come from the
# numbered files 1.txt .. 17027.txt in the text folder.
if __name__ == '__main__':
    corpus = DataSet('newsela')

    # Read the whole CSV as a numeric table and use its second column
    # (index 1) as the label list.
    level_table = np.genfromtxt(
        'D:/master project/data/newsela/average_level.csv', delimiter=',')
    corpus.set_labels(level_table[:, 1].tolist())

    # File names are 1-based while the index passed to import_raw_text is
    # 0-based, so file "<k>.txt" is imported under index k - 1.
    text_template = 'D:/master project/data/newsela/text/{}.txt'
    for text_index, file_number in enumerate(range(1, 17028)):
        corpus.import_raw_text(text_template.format(file_number), text_index)
        # Progress output: index of the text that was just imported.
        print(text_index)
from nutrition.structure.data_set import DataSet

# Script: fill the 'cepp' DataSet with raw texts from five level folders
# and label every text with the 0-based position of its level.
if __name__ == '__main__':
    corpus = DataSet('cepp')

    # Folder name of each level and the number of texts it contains; the
    # two lists are parallel and their order defines the label (0..4).
    level_names = ['KET', 'PET', 'FCE', 'CAE', 'CPE']
    article_counts = [64, 60, 71, 67, 69]

    labels = []
    for label, (level_name, article_count) in enumerate(
            zip(level_names, article_counts)):
        print('working on level', label)
        # Texts inside a level folder are named 1.txt .. <count>.txt.
        for article_number in range(1, article_count + 1):
            print('working on text', article_number)
            source = '{}/_origin/{}/{}.txt'.format(
                corpus.path, level_name, article_number)
            # Exactly one label is appended per imported text, so the
            # number of labels collected so far is the next text id.
            corpus.import_raw_text(source, len(labels))
            labels.append(label)

    corpus.set_labels(labels)
from nutrition.structure.data_set import DataSet
import os

# Script: fill the 'nil' DataSet with the articles of levels 1-3 and
# label every imported article with its level number.
if __name__ == '__main__':
    corpus = DataSet('nil')
    labels = []
    imported = 0  # number of texts imported so far; also the next text id

    for level in (1, 2, 3):
        folder = 'D:/master project/data/news_in_levels/News_in_levels_level{}/articles/'.format(level)
        # Files are visited in whatever order os.listdir returns them.
        for filename in os.listdir(folder):
            # folder already ends with '/', so plain concatenation works.
            article_path = folder + filename
            if os.stat(article_path).st_size >= 10:
                corpus.import_raw_text(article_path, imported)
                labels.append(level)
                imported += 1
                # Progress output: running count of imported texts.
                print(imported)
            else:
                # Files smaller than 10 bytes are skipped and get neither
                # a text id nor a label.
                print('ignored {} because its size is only {} bytes'.format(
                    article_path, os.stat(article_path).st_size))

    corpus.set_labels(labels)