# Aggregates parsing across all of the months.
# BUG: lang_filtering() relies on a hacky shortcut, so language filtering is only
# correct for a fully consecutive set of months within self.dates.
# TODO: generalize the month handling so non-consecutive ranges also work.

import subprocess
import time
import sys
from pycorenlp import StanfordCoreNLP
from reddit_parser import Parser
import argparse
import numpy
from defaults import *

if __name__ == "__main__":
    # Command-line interface: cluster array index and machine identifier.
    cli = argparse.ArgumentParser()
    cli.add_argument('--array', type=int)
    cli.add_argument('--machine', type=str)
    cli_args = cli.parse_args()

    ### Build the parser with the CLI-supplied settings and run the pipeline.
    doc_parser = Parser(array=cli_args.array, machine=cli_args.machine)
    # Make sure the output directory layout exists before any files are written.
    doc_parser.safe_dir_create()
    # Parse the relevant Reddit comment documents.
    doc_parser.Parse_Rel_RC_Comments()
### import the required modules and functions import subprocess import time import sys from Utils import Write_Performance from config import * from ModelEstimation import NNModel from transformers import BertTokenizer from NN_Utils import * from reddit_parser import Parser # Does the parser object need to be adjusted? # QUESTION: Does the ID need to show up here in the functions too? theparser = Parser() # Create relevant folders theparser.safe_dir_create() # parse the documents theparser.Parse_Rel_RC_Comments() ### Define the neural network object nnmodel = NNModel() ### check key hyperparameters for correct data types nnmodel.NN_param_typecheck() ### create training, development and test sets
# Pipeline chunk: sample classifier-labeled documents for human evaluation of
# the pretrained neural relevance classifier (chunk is truncated mid-comment).

import subprocess
import time
import sys
from Utils import Write_Performance
from config import *
from ModelEstimation import NNModel
from transformers import BertTokenizer
from NN_Utils import *
from reddit_parser import Parser

# define the parser object with variables from defaults.py, imported via config.py
# NOTE: Feed machine="local" as an argument if not running through the cluster
theparser = Parser(machine="local")

# Extract a random sample of rel_sample_num documents along with their labels
# according to the pretrained neural relevance classifier, to evaluate classifier
# performance.
#
# NOTE: feed in [human_ratings_pattern] as an argument if there are previous
# samples that you would like to be excluded in the next sampling. The fn uses
# glob to match the list of patterns provided to files within the offered path
# and include them in training/testing
# NOTE: Make sure the corresponding "info" files containing the previous samples'
# metadata are stored in the same directory
theparser.Rel_sample(
    human_ratings_pattern=["/auto_labels/sample_info-200-False-*"])

# NOTE: Requires rel_sample results hand-annotated for accuracy
# Reports per-class precision, recall, f1 and accuracy
# If there are different evaluation trials with the same rel_sample_num and
# balanced_rel_sample parameters, feed in trial=[trial number] as an argument.
# By default, the naming convention for the rated files are: