### import the required modules and functions

import subprocess
import time
import sys
from Utils import Write_Performance
from config import *
from ModelEstimation import NNModel
from transformers import BertTokenizer
from NN_Utils import *
from reddit_parser import Parser  # Does the parser object need to be adjusted?

# QUESTION: Does the ID need to show up here in the functions too?
theparser = Parser()

# Create relevant folders
theparser.safe_dir_create()

# parse the documents
theparser.Parse_Rel_RC_Comments()

### Define the neural network object

nnmodel = NNModel()

### check key hyperparameters for correct data types

nnmodel.NN_param_typecheck()
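
# Illustrative sketch only (neither NN_param_typecheck's implementation nor its
# real parameter names): a typecheck of this kind verifies that each key
# hyperparameter carries the expected Python type before training begins.
def _example_param_typecheck(params):
    expected = {"epochs": int, "batch_size": int, "learning_rate": float}
    for name, required_type in expected.items():
        if not isinstance(params.get(name), required_type):
            raise TypeError("{} must be of type {}".format(
                name, required_type.__name__))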

### create training, development and test sets

### Example 2

# NOTE: If NN = False, the data will be pre-processed for LDA instead.
# NOTE: If write_original = True, the original text of a relevant comment
# (without preprocessing) will be saved to a separate file
# NOTE: If clean_raw = True, the compressed data files will be removed from disk
# after processing
# NOTE: Relevance filters can be changed from defaults.py
# NOTE: If there is a partial record on file, e.g. one that includes some but
# not all of the months in the desired range, manually delete the aggregated
# text files produced by previous parsing before running the function again.
# These files can be identified by the absence of a month in their filenames
# and, depending on parameter settings, can include: author, sentiments,
# lda_prep, nn_prep, original_comm, original_indices, Random_Count_Dict,
# Random_Count_List, random_indices, RC_Count_Dict, RC_Count_List, total_count
# and votes
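
# Hedged sketch (hypothetical helper, not part of the pipeline): the aggregate
# files described in the last NOTE can be spotted by listing output files whose
# names mention none of the months in the desired range. The directory layout
# and the "YYYY-M" month tag format below are assumptions for illustration.
import os

def _example_files_without_month_tags(path, months):
    """Return filenames under `path` that contain no (year, month) tag from `months`."""
    tags = {"{}-{}".format(year, month) for year, month in months}
    return sorted(name for name in os.listdir(path)
                  if not any(tag in name for tag in tags))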

theparser = Parser()
theparser.Parse_Rel_RC_Comments()

# Filter the dataset based on whether posts are in English (uses Google's
# language detection)
# NOTE: Requires original text of comments to be available on disk
# NOTE: Should be run separately for LDA and NN, since their preprocessed
# comments are stored in separate files
# NOTE: Performance is significantly worse for shorter posts. By default,
# the filtering is only performed on posts that contain at least 20 words
theparser.lang_filtering()
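
# Minimal sketch of the kind of check lang_filtering() applies (an assumption
# about its logic, not the project's implementation). It illustrates the
# 20-word threshold mentioned above; the third-party `langdetect` package
# stands in for Google's language detection purely for illustration.
def _example_is_english(text, min_words=20):
    words = text.split()
    if len(words) < min_words:
        return True  # too short for reliable detection; left unfiltered by default
    try:
        from langdetect import detect  # illustrative stand-in for Google's detector
    except ImportError:
        return True  # keep the post if no detector is available
    return detect(text) == "en"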
# TODO: Run the function for alternative sentiment estimates after this

## TextBlob sentiment analysis is integrated into parsing. If requested and not
# available, the following function retrieves alternative sentiment measures
# (from NLTK's Vader and CoreNLP)
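
# Hedged sketch of what one alternative sentiment measure looks like (not the
# project's retrieval function, which is not shown in this excerpt): NLTK's
# VADER returns a compound polarity score in [-1, 1] per document. Requires
# `nltk` and its 'vader_lexicon' resource.
def _example_vader_sentiment(text):
    import nltk
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    nltk.download("vader_lexicon", quiet=True)  # no-op if already downloaded
    return SentimentIntensityAnalyzer().polarity_scores(text)["compound"]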

### Example 3

# NOTE: THIS FILE IS NOW OUTDATED
### import the required modules and functions

import time
import sys
from Utils import *
from config import *
from ModelEstimation import NNModel
from reddit_parser import Parser

theparser = Parser()

# Create relevant folders
# theparser.safe_dir_create()

# parse the documents
# theparser.Parse_Rel_RC_Comments()

### check key hyperparameters for correct data types
NN_param_typecheck()

### Define the neural network object
nnmodel = NNModel()

### create training, development and test sets

# NOTE: Always index training set first.
# NOTE: For valid analysis results, maximum vocabulary size and frequency filter
# should not be changed between the creation of sets for LDA and NN analyses

## Determine the comments that will comprise the various sets, based on
# the aggregation across all of the months
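
# Illustrative sketch only (the actual set creation lives in the project's
# model-estimation code and is not shown here): a reproducible split of comment
# indices into training, development and test sets, with the training set
# indexed first as the NOTE above requires. The proportions are placeholder
# assumptions.
import random

def _example_split_indices(num_comments, train_frac=0.8, dev_frac=0.1, seed=0):
    indices = list(range(num_comments))
    random.Random(seed).shuffle(indices)  # fixed seed keeps the split reproducible
    n_train = int(train_frac * num_comments)
    n_dev = int(dev_frac * num_comments)
    return (indices[:n_train],
            indices[n_train:n_train + n_dev],
            indices[n_train + n_dev:])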

# BUG: Because of a hacky solution within lang_filtering(), the language
# filtering only works properly for a fully consecutive set of months within
# self.dates
# TODO: make it more general
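
# Hedged illustration of the constraint in the BUG note (hypothetical helper,
# not in the codebase): verify that a list of (year, month) tuples forms a
# fully consecutive run before relying on the filtering.
def _example_months_are_consecutive(dates):
    linear = [year * 12 + (month - 1) for year, month in dates]
    return all(later - earlier == 1 for earlier, later in zip(linear, linear[1:]))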

import subprocess
import time
import sys
from pycorenlp import StanfordCoreNLP
from reddit_parser import Parser
import argparse
import numpy
from defaults import *

if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--array', type=int)
    argparser.add_argument('--machine', type=str)
    args = argparser.parse_args()

### call the parsing function

theparser = Parser(array=args.array, machine=args.machine)

# Create relevant folders
theparser.safe_dir_create()

# parse the documents
theparser.Parse_Rel_RC_Comments()
import subprocess
import time
import sys
from Utils import Write_Performance
from config import *
from ModelEstimation import NNModel
from transformers import BertTokenizer
from NN_Utils import *
from reddit_parser import Parser

# define the parser object with variables from defaults.py, imported via config.py
# NOTE: Feed machine="local" as an argument if not running through the cluster
theparser = Parser(machine="local")

# Extract a random sample of rel_sample_num documents along with their labels
# according to the pretrained neural relevance classifier, to evaluate classifier
# performance.
# NOTE: Feed in [human_ratings_pattern] as an argument if there are previous
# samples that you would like to exclude from the next sampling. The function
# uses glob to match the provided list of patterns to files within the given
# path and includes them in training/testing
# NOTE: Make sure the corresponding "info" files containing the previous samples'
# metadata are stored in the same directory
theparser.Rel_sample(
    human_ratings_pattern=["/auto_labels/sample_info-200-False-*"])
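
# Hedged sketch of the glob matching described above (an assumption about what
# Rel_sample does internally, not its actual code): each provided pattern is
# joined to a base path and expanded, collecting the metadata files of previous
# samples.
import glob

def _example_previous_sample_files(base_path, patterns):
    matches = []
    for pattern in patterns:
        # e.g. "/auto_labels/sample_info-200-False-*" appended to the base path
        matches.extend(glob.glob(base_path.rstrip("/") + pattern))
    return sorted(matches)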

# NOTE: Requires Rel_sample results hand-annotated for accuracy
# Reports per-class precision, recall, F1 and accuracy
# If there are different evaluation trials with the same rel_sample_num and
# balanced_rel_sample parameters, feed in trial=[trial number] as an argument.
# By default, the rated files follow a fixed naming convention.
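
# Hedged sketch of the reported metrics (not the project's evaluation code):
# given hand-annotated labels and the classifier's labels for the same sample,
# per-class precision, recall and F1 plus overall accuracy can be computed with
# scikit-learn (assumed available here only for illustration).
def _example_classification_report(human_labels, classifier_labels):
    from sklearn.metrics import accuracy_score, classification_report
    print(classification_report(human_labels, classifier_labels, digits=3))
    print("accuracy: {:.3f}".format(accuracy_score(human_labels, classifier_labels)))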
# NOTE: If NN = False, the data will be pre-processed for LDA instead.
# NOTE: If write_original = True, the original text of a relevant comment
# (without preprocessing) will be saved to a separate file
# NOTE: If clean_raw = True, the compressed data files will be removed from disk
# after processing
# NOTE: Relevance filters can be changed from defaults.py
# NOTE: If there is a partial record on file, e.g. one that includes some but
# not all of the months in the desired range, manually delete the aggregated
# text files produced by previous parsing before running the function again.
# These files can be identified by the absence of a month in their filenames
# and, depending on parameter settings, can include: author, sentiments,
# lda_prep, nn_prep, original_comm, original_indices, Random_Count_Dict,
# Random_Count_List, random_indices, RC_Count_Dict, RC_Count_List, total_count
# and votes

theparser = Parser()

# create the relevant subfolders for storing dataset attributes
theparser.safe_dir_create()

theparser.Parse_Rel_RC_Comments()

if Neural_Relevance_Filtering:
    # Use a transformer-based neural network trained on human ratings to prune
    # irrelevant posts from the dataset. Path will default to the Human_Ratings
    # folder
    theparser.Neural_Relevance_Screen()

    # Needs results from Neural_Relevance_Screen
    theparser.Neural_Relevance_Clean()
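
# Hedged sketch of the screening step's core idea (not the project's model,
# weights, or label convention): tokenize a post with the BERT tokenizer and
# score it with a fine-tuned sequence classifier, keeping it only if the class
# assumed to mean "relevant" (index 1 here) wins.
def _example_is_relevant(text, model, tokenizer):
    import torch
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    return int(logits.argmax(dim=-1).item()) == 1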

# Filter the dataset based on whether posts are in English (uses Google's
# language detection)
theparser.lang_filtering()

### Example 7

import subprocess
import time
import argparse
import numpy
from reddit_parser import Parser
from defaults import *

if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--array', type=int)
    argparser.add_argument('--machine', type=str)
    args = argparser.parse_args()

### call the parsing function

# CoreNLP
# create a connection to the CoreNLP server to retrieve sentiment
# (requires CoreNLP_server.py in the same directory)
subprocess.Popen(
    ['java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer '
     '-threads ' + str(num_process) + ' -quiet'],
    shell=True,
    cwd="./stanford-corenlp-4.0.0")
time.sleep(5)  # wait for connection to the server to be established
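
# Hedged sketch of how the running server can be queried for sentiment through
# pycorenlp (the project's retrieval code is not shown in this excerpt; the
# default port 9000 and the annotator settings are assumptions).
def _example_corenlp_sentiment(text):
    from pycorenlp import StanfordCoreNLP
    nlp = StanfordCoreNLP("http://localhost:9000")
    annotated = nlp.annotate(text, properties={
        "annotators": "sentiment",
        "outputFormat": "json",
    })
    # one label per sentence, e.g. "Negative", "Neutral" or "Positive"
    return [sentence["sentiment"] for sentence in annotated["sentences"]]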

theparser = Parser(machine="local")

# Create relevant folders
theparser.safe_dir_create()

# parse the documents
theparser.Parse_Rel_RC_Comments()

### Example 8

# To simplify the coding, I should just feed in consecutive IDs for each of the
# 24 months through the sbatch file. In other words, the batch IDs should be
# determined as follows: 0 for (2008,1), then +1 for each subsequent month.
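
# Worked illustration of the ID scheme described above (derived from the
# comment, not code from the repository): ID 0 maps to (2008, 1) and each
# later month adds one.
def _example_batch_id_to_month(batch_id, start_year=2008):
    return start_year + batch_id // 12, 1 + batch_id % 12

# e.g. _example_batch_id_to_month(0) -> (2008, 1), _example_batch_id_to_month(23) -> (2009, 12)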

# BUG: Because of a hacky solution within Neural_Relevance_Clean(), the function
# only works properly for a fully consecutive set of months within self.dates
# TODO: make it more general

### import the required modules and functions

import time
import sys
from Utils import Write_Performance
from config import *
#from ModelEstimation import NNModel
from transformers import BertTokenizer
from NN_Utils import *
from reddit_parser import Parser  # Does the parser object need to be adjusted?

# NOTE: Feed machine="local" as an argument if not running through the cluster
theparser = Parser()

# Use a transformer-based neural network trained on human ratings to prune
# irrelevant posts from the dataset. Path will default to the Human_Ratings folder
theparser.Neural_Relevance_Screen(batch_size=1200)