# identified by not having a certain month in their filenames and depending on
# parameter settings can include: author, sentiments, lda_prep, nn_prep,
# original_comm, original_indices, Random_Count_Dict, Random_Count_List,
# random_indices, RC_Count_Dict, RC_Count_List, total_count and votes

# Instantiate the Reddit parser driving the whole preprocessing pipeline.
# NOTE(review): `Parser` and the config flags used below are assumed to be
# imported earlier in this file (e.g. `from reddit_parser import Parser`,
# `from config import *`) -- not visible in this chunk; confirm.
theparser = Parser()

# create the relevant subfolders for storing dataset attributes
theparser.safe_dir_create()

# Parse the relevant RC (Reddit comment) files into the dataset attributes
# listed in the header comment above.
theparser.Parse_Rel_RC_Comments()

# `Neural_Relevance_Filtering` presumably comes from `config` -- verify.
if Neural_Relevance_Filtering:
    # Use a transformer-based neural network trained on human ratings to prune the
    # dataset from irrelevant posts. Path will default to the Human_Ratings folder
    theparser.Neural_Relevance_Screen()

    # Needs results from Neural_Relevance_Screen
    theparser.Neural_Relevance_Clean()

# Filter the dataset based on whether posts are in English (uses Google's
# language detection)
# NOTE: Requires original text of comments to be available on disk
# NOTE: Should be separately run for LDA and NN, as their preprocessed comments
# are stored in separate files
# NOTE: Performance is significantly worse for shorter posts. By default,
# the filtering is only performed on posts that contain at least 20 words
theparser.lang_filtering()
# TODO: Run the function for alternative sentiment estimates after this

## TextBlob sentiment analysis is integrated into parsing. If requested and
## not already available, it is computed during the parsing step.
## NOTE(review): this comment was truncated in the original source -- the
## completion above is a best guess; restore the full original text.
# ---------------------------------------------------------------------------
# Example #2 (separate driver script; the lines above and below belong to two
# different example files that were concatenated during extraction)
# ---------------------------------------------------------------------------
# To simplify the coding, I should just feed in consecutive IDs of each 24 months
# through the sbatch file. In other words:
# The batch IDs should be determined as follows: 0 for (2008,1), then +1 for
# each month after.

# BUG: Because of a hacky solution within Neural_Relevance_Clean(), the function
# would only work properly for fully consecutive set of months within self.dates
# TODO: make it more general

### import the required modules and functions

import time
import sys
from Utils import Write_Performance
from config import *
#from ModelEstimation import NNModel
from transformers import BertTokenizer
from NN_Utils import *
from reddit_parser import Parser  # Does the parser object need to be adjusted?

# NOTE: Feed machine="local" as an argument if not running through the cluster
# Instantiate the Reddit parser (imported above from reddit_parser).
theparser = Parser()

# Use a transformer-based neural network trained on human ratings to prune the
# dataset from irrelevant posts. Path will default to the Human_Ratings folder
# NOTE(review): batch_size=1200 presumably sets how many posts are scored per
# batch -- confirm against Neural_Relevance_Screen's signature, and tune to
# available memory.
theparser.Neural_Relevance_Screen(batch_size=1200)