### import the required modules and functions
import subprocess
import time
import sys
from Utils import Write_Performance
from config import *
from ModelEstimation import NNModel
from transformers import BertTokenizer
from NN_Utils import *
from reddit_parser import Parser

# Does the parser object need to be adjusted?
# QUESTION: Does the ID need to show up here in the functions too?
theparser = Parser()
# Create relevant folders
theparser.safe_dir_create()
# parse the documents
theparser.Parse_Rel_RC_Comments()

### Define the neural network object
nnmodel = NNModel()

### check key hyperparameters for correct data types
# (see the sketch at the end of this file)
nnmodel.NN_param_typecheck()

### create training, development and test sets
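# Back-reference for NN_param_typecheck() above: a minimal sketch of the kind
# of data-type check it performs, for illustration only. The hyperparameter
# names below (epochs, batchSz, learning_rate) are assumptions, not the repo's
# actual parameters; the real checks live in ModelEstimation.
def typecheck_sketch(params):
    expected = {"epochs": int, "batchSz": int, "learning_rate": float}
    for name, wanted in expected.items():
        if name in params and not isinstance(params[name], wanted):
            raise TypeError(f"{name} should be {wanted.__name__}, "
                            f"got {type(params[name]).__name__}")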
# NOTE: If NN = False, will pre-process data for LDA.
# NOTE: If write_original = True, the original text of a relevant comment -
# without preprocessing - will be saved to a separate file
# NOTE: If clean_raw = True, the compressed data files will be removed from disk
# after processing
# NOTE: Relevance filters can be changed from defaults.py
# NOTE: If there is a partial record on file, e.g. including some months in the
# desired range but not all, manually delete the aggregated text files resulting
# from previous parsing before running the function again. These files can be
# identified by not having a certain month in their filenames and, depending on
# parameter settings, can include: author, sentiments, lda_prep, nn_prep,
# original_comm, original_indices, Random_Count_Dict, Random_Count_List,
# random_indices, RC_Count_Dict, RC_Count_List, total_count and votes
theparser = Parser()
theparser.Parse_Rel_RC_Comments()

# Filter the dataset based on whether posts are in English (uses Google's
# language detection; see the sketch further below)
# NOTE: Requires original text of comments to be available on disk
# NOTE: Should be run separately for LDA and NN, as their preprocessed comments
# are stored in separate files
# NOTE: Performance is significantly worse for shorter posts. By default,
# the filtering is only performed on posts that contain at least 20 words
theparser.lang_filtering()
# TODO: Run the function for alternative sentiment estimates after this

## TextBlob sentiment analysis is integrated into parsing. If requested and not
# available, the following function retrieves alternative sentiment measures
# (from NLTK's Vader and CoreNLP)
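# Back-reference for the lang_filtering() step above: a minimal sketch of the
# per-post English check it describes, assuming the langdetect package (a port
# of Google's language detection). The real filter operates on the files
# written by Parse_Rel_RC_Comments(), not on raw strings.
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def is_english(post, min_words=20):
    # detection accuracy degrades on short posts, so skip them by default
    if len(post.split()) < min_words:
        return True
    try:
        return detect(post) == "en"
    except LangDetectException:
        # detection can fail on e.g. empty or emoji-only text
        return False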
# NOTE: THIS FILE IS NOW OUTDATED

### import the required modules and functions
import time
import sys
from Utils import *
from config import *
from ModelEstimation import NNModel
from reddit_parser import Parser

theparser = Parser()
# Create relevant folders
# theparser.safe_dir_create()
# parse the documents
# theparser.Parse_Rel_RC_Comments()

### check key hyperparameters for correct data types
NN_param_typecheck()

### Define the neural network object
nnmodel = NNModel()

### create training, development and test sets
# NOTE: Always index training set first.
# NOTE: For valid analysis results, maximum vocabulary size and frequency filter
# should not be changed between the creation of sets for LDA and NN analyses

## Determine the comments that will comprise various sets (see the sketch below)
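# A minimal sketch of an index-based train/dev/test split consistent with the
# "index training set first" note above; the ratios and seed are assumptions,
# and the repo's own set creation is implemented in ModelEstimation.
import numpy as np

def split_indices(n_docs, ratios=(0.8, 0.1, 0.1), seed=42):
    rng = np.random.default_rng(seed)
    order = rng.permutation(n_docs)
    n_train = int(ratios[0] * n_docs)
    n_dev = int(ratios[1] * n_docs)
    return (order[:n_train],                 # training set is indexed first
            order[n_train:n_train + n_dev],  # development set
            order[n_train + n_dev:])         # test set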
# the aggregation across all of the months
# BUG: Because of a hacky solution within lang_filtering(), the language filtering
# would only work properly for a fully consecutive set of months within self.dates
# TODO: make it more general

import subprocess
import time
import sys
from pycorenlp import StanfordCoreNLP
from reddit_parser import Parser
import argparse
import numpy
from defaults import *

if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--array', type=int)
    argparser.add_argument('--machine', type=str)
    args = argparser.parse_args()

    ### call the parsing function
    theparser = Parser(array=args.array, machine=args.machine)
    # Create relevant folders
    theparser.safe_dir_create()
    # parse the documents
    theparser.Parse_Rel_RC_Comments()
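# Example cluster invocation for the script above (the sbatch file name is
# hypothetical; --array and --machine are the arguments parsed above). Slurm
# exposes each array task's index as SLURM_ARRAY_TASK_ID:
#   sbatch --array=0-23 parse_months.sbatch
# where the sbatch script would run, for each array task:
#   python this_script.py --array $SLURM_ARRAY_TASK_ID --machine cluster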
import subprocess
import time
import sys
from Utils import Write_Performance
from config import *
from ModelEstimation import NNModel
from transformers import BertTokenizer
from NN_Utils import *
from reddit_parser import Parser

# define the parser object with variables from defaults.py, imported via config.py
# NOTE: Feed machine="local" as an argument if not running through the cluster
theparser = Parser(machine="local")

# Extract a random sample of rel_sample_num documents along with their labels
# according to the pretrained neural relevance classifier, to evaluate classifier
# performance.
#
# NOTE: Feed in [human_ratings_pattern] as an argument if there are previous
# samples that you would like excluded from the next sampling. The function uses
# glob to match the provided list of patterns against files within the offered
# path and includes them in training/testing (see the sketch below)
# NOTE: Make sure the corresponding "info" files containing the previous samples'
# metadata are stored in the same directory
theparser.Rel_sample(
    human_ratings_pattern=["/auto_labels/sample_info-200-False-*"])

# NOTE: Requires Rel_sample results hand-annotated for accuracy
# Reports per-class precision, recall, F1 and accuracy
# If there are different evaluation trials with the same rel_sample_num and
# balanced_rel_sample parameters, feed in trial=[trial number] as an argument.
# By default, the naming convention for the rated files is:
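# A minimal sketch of the pattern matching described above, assuming standard
# glob semantics; the directory layout is illustrative, and the actual matching
# happens inside Parser.Rel_sample().
import glob

def previously_sampled(path, patterns):
    """Collect files from earlier sampling rounds referenced by the patterns."""
    matched = []
    for pattern in patterns:
        matched.extend(glob.glob(path + pattern))
    return set(matched)

# e.g. previously_sampled(".", ["/auto_labels/sample_info-200-False-*"])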
# NOTE: If NN = False, will pre-process data for LDA.
# NOTE: If write_original = True, the original text of a relevant comment -
# without preprocessing - will be saved to a separate file
# NOTE: If clean_raw = True, the compressed data files will be removed from disk
# after processing
# NOTE: Relevance filters can be changed from defaults.py
# NOTE: If there is a partial record on file, e.g. including some months in the
# desired range but not all, manually delete the aggregated text files resulting
# from previous parsing before running the function again. These files can be
# identified by not having a certain month in their filenames and, depending on
# parameter settings, can include: author, sentiments, lda_prep, nn_prep,
# original_comm, original_indices, Random_Count_Dict, Random_Count_List,
# random_indices, RC_Count_Dict, RC_Count_List, total_count and votes
theparser = Parser()
# create the relevant subfolders for storing dataset attributes
theparser.safe_dir_create()
theparser.Parse_Rel_RC_Comments()

if Neural_Relevance_Filtering:
    # Use a transformer-based neural network trained on human ratings to prune
    # irrelevant posts from the dataset. Path will default to the Human_Ratings
    # folder (see the sketch below)
    theparser.Neural_Relevance_Screen()
    # Needs results from Neural_Relevance_Screen
    theparser.Neural_Relevance_Clean()

# Filter the dataset based on whether posts are in English (uses Google's
# language detection)
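# Back-reference for Neural_Relevance_Screen() above: a minimal sketch of
# transformer-based relevance screening, assuming a fine-tuned BERT sequence
# classifier saved to disk. The model directory and the label order (1 ==
# relevant) are assumptions, not the repo's actual configuration.
import torch
from transformers import BertTokenizer, BertForSequenceClassification

def screen_batch(posts, model_dir="relevance_model", max_length=128):
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model = BertForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    enc = tokenizer(posts, padding=True, truncation=True,
                    max_length=max_length, return_tensors="pt")
    with torch.no_grad():
        logits = model(**enc).logits
    # keep the indices of posts predicted relevant
    return (logits.argmax(dim=-1) == 1).nonzero(as_tuple=True)[0].tolist()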
import subprocess
import time
import argparse
import numpy
from reddit_parser import Parser
from defaults import *

if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--array', type=int)
    argparser.add_argument('--machine', type=str)
    args = argparser.parse_args()

    ### call the parsing function

    # CoreNLP
    # create a connection to the CoreNLP server to retrieve sentiment
    # (requires CoreNLP_server.py in the same directory)
    subprocess.Popen([
        'java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -threads '
        + str(num_process) + ' --quiet'
    ], shell=True, cwd="./stanford-corenlp-4.0.0")
    time.sleep(5)  # wait for the connection to the server to be established

    theparser = Parser(machine="local")
    # Create relevant folders
    theparser.safe_dir_create()
    # parse the documents
    theparser.Parse_Rel_RC_Comments()
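# A minimal sketch of querying the server launched above for sentiment via
# pycorenlp, assuming the default port 9000; the annotator settings used by
# the repo's own sentiment code may differ.
from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')
annotated = nlp.annotate(
    "Legalization passed today.",
    properties={'annotators': 'sentiment', 'outputFormat': 'json'})
for sentence in annotated['sentences']:
    # CoreNLP reports a sentiment label and a 0-4 sentimentValue per sentence
    print(sentence['sentiment'], sentence['sentimentValue'])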
# To simplify the coding, I should just feed in consecutive IDs for each of the
# 24 months through the sbatch file. In other words, the batch IDs should be
# determined as follows: 0 for (2008,1), then +1 for each month after (see the
# sketch at the end of this file).
# BUG: Because of a hacky solution within Neural_Relevance_Clean(), the function
# would only work properly for a fully consecutive set of months within self.dates
# TODO: make it more general

### import the required modules and functions
import time
import sys
from Utils import Write_Performance
from config import *
# from ModelEstimation import NNModel
from transformers import BertTokenizer
from NN_Utils import *
from reddit_parser import Parser

# Does the parser object need to be adjusted?
# NOTE: Feed machine="local" as an argument if not running through the cluster
theparser = Parser()

# Use a transformer-based neural network trained on human ratings to prune
# irrelevant posts from the dataset. Path will default to the Human_Ratings folder
theparser.Neural_Relevance_Screen(batch_size=1200)
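# A worked example of the batch-ID scheme described at the top of this file
# (an illustration, not code that exists in the repo): ID 0 maps to (2008, 1)
# and each subsequent month increments the ID by one.
def batch_id_to_month(batch_id, start=(2008, 1)):
    """Translate a consecutive sbatch array ID into a (year, month) pair."""
    year, month = start
    total = year * 12 + (month - 1) + batch_id
    return total // 12, total % 12 + 1

assert batch_id_to_month(0) == (2008, 1)    # first batch
assert batch_id_to_month(12) == (2009, 1)   # one year later
assert batch_id_to_month(23) == (2009, 12)  # last of 24 consecutive months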