コード例 #1
0
from nutrition.structure.data_set import DataSet
import numpy as np

if __name__ == '__main__':
    # Build the 'newsela' data set: attach the labels first, then import
    # every raw text file one by one.
    corpus = DataSet('newsela')

    # The second CSV column (index 1) is used as the label list.
    # NOTE(review): presumably one row per article, in file order -- confirm.
    level_table = np.genfromtxt(
        'D:/master project/data/newsela/average_level.csv', delimiter=',')
    corpus.set_labels(level_table[:, 1].tolist())

    # Files on disk are numbered from 1, while text ids start at 0.
    text_template = 'D:/master project/data/newsela/text/{}.txt'
    for text_id, file_number in enumerate(range(1, 17028)):
        corpus.import_raw_text(text_template.format(file_number), text_id)

        # Progress output: id of the text that was just imported.
        print(text_id)
コード例 #2
0
from nutrition.structure.data_set import DataSet

if __name__ == '__main__':
    # Import the 'cepp' texts level by level; each text is labelled with the
    # zero-based index of the level folder it was read from.
    corpus = DataSet('cepp')

    # (sub-folder name, number of texts in that folder), in label order.
    level_specs = [
        ('KET', 64),
        ('PET', 60),
        ('FCE', 71),
        ('CAE', 67),
        ('CPE', 69),
    ]

    labels = []
    for level_index, (level_name, article_count) in enumerate(level_specs):
        print('working on level', level_index)
        # Files inside a level folder are numbered 1..article_count.
        for article_number in range(1, article_count + 1):
            print('working on text', article_number)
            source_path = '{}/_origin/{}/{}.txt'.format(
                corpus.path, level_name, article_number)
            # Text ids run consecutively across all levels, so the next id
            # is simply the number of texts imported so far.
            corpus.import_raw_text(source_path, len(labels))
            labels.append(level_index)

    corpus.set_labels(labels)
    
コード例 #3
0
ファイル: nil.py プロジェクト: kevincentius/nutrition-ml
from nutrition.structure.data_set import DataSet
import os

if __name__ == '__main__':
    # Import the articles of levels 1-3 into the 'nil' data set, labelling
    # every imported text with the level of the folder it was read from.

    # Files smaller than this many bytes are skipped.
    min_size_bytes = 10

    data_set = DataSet('nil')

    text_id = 0
    labels = []
    for level in range(1, 4):
        folder = 'D:/master project/data/news_in_levels/News_in_levels_level{}/articles/'.format(
            level)
        # os.listdir() returns entries in arbitrary order; sort them so the
        # text ids (and the matching labels) are assigned reproducibly.
        for filename in sorted(os.listdir(folder)):
            path = os.path.join(folder, filename)

            # Stat the file once and reuse the size for both the check and
            # the message instead of hitting the filesystem twice.
            size = os.stat(path).st_size
            if size < min_size_bytes:
                print('ignored {} because its size is only {} bytes'.format(
                    path, size))
                continue

            data_set.import_raw_text(path, text_id)
            labels.append(level)
            text_id += 1
            # Progress output: number of texts imported so far.
            print(text_id)

    data_set.set_labels(labels)