コード例 #1
0
def _write_dataset(executor, submissions, filename, counter, total):
    """Append the generated text for each submission to ``filename``.

    Each submission is passed to ``gather_comments_for_submission`` through
    ``executor.map``, so results arrive in the same order as ``submissions``.
    Falsy results are skipped; every other result is written followed by the
    ``<|endoftext|>`` marker and a newline. A progress line is printed for
    every submission, whether or not it produced output.

    Args:
        executor: A ``concurrent.futures`` executor used to run the workers.
        submissions: The submission records to process.
        filename: Path of the text file to append to (created if missing).
        counter: Number of submissions already processed before this call.
        total: Total number of submissions, used for the progress ratio.

    Returns:
        The updated count of processed submissions.
    """
    with open(filename, 'a', encoding='utf-8') as fd:
        for output_text_gen_string in executor.map(
                gather_comments_for_submission, submissions):
            counter += 1
            if output_text_gen_string:
                fd.write(f'{output_text_gen_string}<|endoftext|>\n')
            print(f'subs counted: {counter}. {round(counter / total, 2)}')
    return counter


def main():
    """Export stored submissions as training and evaluation text files.

    Selects submissions whose subreddit is listed in ``training_subreddits``
    and whose author is not in ``author_blacklist`` (both compared in lower
    case), shuffles them, splits them 90/10 into training and evaluation
    sets, and appends the generated text for each set to
    ``<bot_name>_<date>_training.txt`` and ``<bot_name>_<date>_eval.txt``.
    """
    # Called without an argument so the shuffle differs from run to run.
    random.seed()

    bot_name = "training_output"

    # Insert the names of the subreddits
    training_subreddits = []

    # Lower-case both sides so the subreddit and author filters are
    # case-insensitive. The query applies no ordering; the records are
    # shuffled below anyway.
    wanted_subreddits = [s.lower() for s in training_subreddits]
    blocked_authors = [a.lower() for a in author_blacklist]
    all_submissions = list(
        db_Submission.select().where(
            (fn.Lower(db_Submission.subreddit).in_(wanted_subreddits))
            & (fn.Lower(db_Submission.author).not_in(blocked_authors))))

    # We'll shuffle all the submission records and split them into training and evaluation
    # lists in a 90/10 ratio. simpletransformers will use the evaluation to test the accuracy
    # of the training
    random.shuffle(all_submissions)

    split_point = int(len(all_submissions) * 0.9)
    training_submissions = all_submissions[:split_point]
    eval_submissions = all_submissions[split_point:]

    print(
        f'{len(training_submissions)} training submissions, {len(eval_submissions)} evaluation submissions'
    )

    # Timestamp (day, month, two-digit year, hour, minute) shared by both
    # output file names.
    date_string = datetime.today().strftime('%d%m%y_%H%M')
    total = len(all_submissions)
    counter = 0

    # use concurrent futures (multiprocessing) to speed up the output
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # The counter carries over from the training pass to the evaluation
        # pass so the printed ratio reflects progress over all submissions.
        counter = _write_dataset(
            executor, training_submissions,
            f'{bot_name}_{date_string}_training.txt', counter, total)
        counter = _write_dataset(
            executor, eval_submissions,
            f'{bot_name}_{date_string}_eval.txt', counter, total)