def main():
    """Export Reddit submissions into training/eval text files for model fine-tuning.

    Queries submissions from the configured subreddits (excluding blacklisted
    authors), shuffles them, splits 90/10 into training/evaluation sets, and
    writes each submission's gathered comment text — delimited by the
    ``<|endoftext|>`` token — to ``{bot_name}_{date}_training.txt`` and
    ``{bot_name}_{date}_eval.txt``.

    Relies on module-level names defined elsewhere in this file:
    ``db_Submission``, ``fn``, ``author_blacklist``,
    ``gather_comments_for_submission``.
    """
    random.seed()

    bot_name = "training_output"

    # Insert the names of the subreddits to train on
    training_subreddits = []

    # All matching submissions ordered by the query; subreddit/author matching
    # is case-insensitive via fn.Lower.
    all_submissions = list(
        db_Submission.select().where(
            (fn.Lower(db_Submission.subreddit).in_(
                [s.lower() for s in training_subreddits]))
            & (fn.Lower(db_Submission.author).not_in(
                [a.lower() for a in author_blacklist]))))

    # Shuffle and split into training/evaluation lists in a 90/10 ratio.
    # simpletransformers will use the evaluation set to test the accuracy
    # of the training.
    random.shuffle(all_submissions)
    split_point = int(len(all_submissions) * 0.9)
    training_submissions = all_submissions[:split_point]
    eval_submissions = all_submissions[split_point:]

    print(
        f'{len(training_submissions)} training submissions, {len(eval_submissions)} evaluation submissions'
    )

    # Timestamp used in the output file names so runs don't collide.
    date_string = datetime.today().strftime('%d%m%y_%H%M')
    total = len(all_submissions)  # hoisted out of the per-submission loop
    counter = 0

    # Use a process pool to parallelize gathering comment text per submission.
    # The training and eval passes are identical except for the destination
    # file, so run them as one deduplicated loop.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for suffix, submissions in (('training', training_submissions),
                                    ('eval', eval_submissions)):
            filename = f'{bot_name}_{date_string}_{suffix}.txt'
            # NOTE: 'a' (append) preserves the original behavior of adding to
            # any pre-existing file with the same timestamped name.
            with open(filename, 'a', encoding='utf-8') as fd:
                for output_text_gen_string in executor.map(
                        gather_comments_for_submission, submissions):
                    counter += 1
                    if output_text_gen_string:
                        # <|endoftext|> delimits documents for GPT-style training
                        fd.write(f'{output_text_gen_string}' + '<|endoftext|>\n')
                    print(
                        f'subs counted: {counter}. {round(counter/total, 2)}'
                    )