def sample_train(input_file):
    """Build a shuffled training sample from ``input_file``.

    Open questions are passed through ``reservoir_sample`` together with the
    closed-question count, then every closed question is appended and the
    combined collection is shuffled in place.

    NOTE(review): the closed count is presumably used as the sample size so
    that both classes end up equally represented -- confirm against
    ``reservoir_sample``.

    Args:
        input_file: Handed unchanged to the ``cu`` helpers.

    Returns:
        A ``(header, sample)`` tuple, where ``header`` comes from
        ``cu.get_header`` and ``sample`` is the shuffled collection.
    """
    n_closed = cu.get_closed_count(input_file)

    open_questions = cu.iter_open_questions(input_file)
    rows = reservoir_sample(open_questions, n_closed)

    closed_questions = cu.iter_closed_questions(input_file)
    rows.extend(closed_questions)

    # Shuffle in place so open and closed questions are interleaved.
    random.shuffle(rows)

    file_header = cu.get_header(input_file)
    return file_header, rows
def sample_train(input_file):
    """Build a shuffled training sample from ``input_file``, printing progress.

    Open questions are passed through ``reservoir_sample`` together with the
    closed-question count, then every closed question is appended and the
    combined collection is shuffled in place. A short progress message is
    printed to stdout before each of these stages.

    NOTE(review): the closed count is presumably used as the sample size so
    that both classes end up equally represented -- confirm against
    ``reservoir_sample``.

    Args:
        input_file: Handed unchanged to the ``cu`` helpers.

    Returns:
        A ``(header, sample)`` tuple, where ``header`` comes from
        ``cu.get_header`` and ``sample`` is the shuffled collection.
    """
    print("get closed question count")
    n_closed = cu.get_closed_count(input_file)

    print("sample open questions")
    open_questions = cu.iter_open_questions(input_file)
    rows = reservoir_sample(open_questions, n_closed)

    print("get all closed questions")
    closed_questions = cu.iter_closed_questions(input_file)
    rows.extend(closed_questions)

    print("shuffle all the data")
    # Shuffle in place so open and closed questions are interleaved.
    random.shuffle(rows)

    file_header = cu.get_header(input_file)
    return file_header, rows