def samples_generator_sorted(path, max_text_legth: int = 10000):
    """Yield ``Sentence`` objects built from a CSV file, longest text first.

    The whole file at *path* is read into memory and the rows are ordered by
    the length of their fourth column, descending (rows of equal length keep
    their file order). A text longer than *max_text_legth* is handed to
    ``split_long_text`` and one ``Sentence`` is yielded per returned
    fragment; every fragment carries the ``id``, ``text_id`` and
    ``sequence`` of the row it came from.

    The length of the longest text and the id of every processed row are
    printed to stdout.

    Args:
        path: CSV file whose columns 0-3 hold id, text_id, sequence and text.
        max_text_legth: Length threshold (``len(text)``) above which a text
            is split. The misspelled name is kept so keyword callers keep
            working.

    Yields:
        ``Sentence`` objects with ``id``, ``text_id``, ``sequence``, ``ner``
        (a fresh empty list) and ``length`` (``len`` of the text or
        fragment the sentence was built from) attributes set.

    Raises:
        IndexError: If a non-blank row has fewer than four columns.
    """
    with open(path, newline='') as f:
        # csv.reader returns an empty list for a blank line; such lines are
        # not records and would break the length-based sort key below.
        rows = [row for row in csv.reader(f) if row]

    if not rows:
        # Nothing to yield, and rows[0] below would raise on an empty list.
        return

    rows.sort(key=lambda row: len(row[3]), reverse=True)
    print('Longest text', len(rows[0][3]))

    for row in rows:
        sample_id = row[0]
        print(sample_id)
        text_id = row[1]
        sequence = row[2]
        text = row[3]

        # Treat a short text as a single fragment so both cases share the
        # same construction code.
        # NOTE(review): split_long_text is assumed to return an iterable of
        # string fragments -- confirm against its definition.
        if len(text) > max_text_legth:
            fragments = split_long_text(text, max_text_legth)
        else:
            fragments = (text,)

        for fragment in fragments:
            s = Sentence(fragment, use_tokenizer='toki')
            s.id = sample_id
            s.text_id = text_id
            s.sequence = sequence
            s.ner = []
            s.length = len(fragment)
            yield s
def samples_generator(path):
    """Stream ``Sentence`` objects from a CSV file, one per row, in file order.

    Each row is expected to hold id, text_id, sequence and text in columns
    0-3. The row id is printed to stdout before the sentence is built.

    Args:
        path: CSV file to read.

    Yields:
        ``Sentence`` objects built from the text column, with ``id``,
        ``text_id``, ``sequence`` and ``ner`` (a fresh empty list)
        attributes set.

    Raises:
        IndexError: If a row has fewer than four columns.
    """
    with open(path, newline='') as handle:
        for record in csv.reader(handle):
            sample_id = record[0]
            # The id is echoed before the remaining columns are read, so a
            # short row is still identified in the output before it fails.
            print(sample_id)
            text_id, sequence, text = record[1], record[2], record[3]

            sentence = Sentence(text, use_tokenizer='toki')
            sentence.id = sample_id
            sentence.text_id = text_id
            sentence.sequence = sequence
            sentence.ner = []
            yield sentence