def configure_command_line_arguments():
    """Configure and parse the command line arguments for the Naive Bayes classifier.

    Returns:
        dict: The parsed command line arguments, keyed by destination name.
    """
    # Initialize the commandline argument parser.
    parser = argparse.ArgumentParser(description='Naive Bayes Classifier')

    # Configure the log level parser. Verbose shows some logs, veryVerbose
    # shows more.
    logging_group = parser.add_mutually_exclusive_group(required=False)
    logging_group.add_argument("-v", "--verbose", help="Set the log level verbose.",
                               action='store_true', required=False)
    # Fixed copy-pasted help text: this flag selects the *very* verbose level.
    logging_group.add_argument("-vv", "--veryVerbose", help="Set the log level very verbose.",
                               action='store_true', required=False)

    # NLTK supports six built in plaintext corpora. This allows the user
    # to choose between those six corpora or a seventh option - the
    # corpus the user provided.
    # The first is a corpus taken from ABC news.
    parser.add_argument('-abc', '--abc', help="ABC news corpus",
                        required=False, action='store_true')
    # The second corpus is the book of Genesis
    parser.add_argument('-gen', '--genesis', help="The book of Genesis from the Bible.",
                        required=False, action='store_true')
    # Third is a collection of text from project Gutenberg
    parser.add_argument('-gut', '--gutenberg', help="Text from Project Gutenberg.",
                        required=False, action='store_true')
    # Fourth is text from presidential inaugural addresses
    parser.add_argument('-in', '--inaugural', help="Text from inaugural addresses.",
                        required=False, action='store_true')
    # Fifth is text from the State of the Union
    parser.add_argument('-su', '--stateUnion', help="Text from State of the Union Addresses.",
                        required=False, action='store_true')
    # The final NLTK provided corpus is text from the web
    parser.add_argument('-web', '--webtext', help="Text taken from the web.",
                        required=False, action='store_true')

    # Tell the parser that there is an optional corpus that can be pulled in.
    # The directory can contain multiple files and directories (if the user
    # also passes --recursive)
    fs.add_filesystem_path_args(parser, '-c', '--custom',
                                help='Directory of files to include in a custom corpus.',
                                required=False)

    # Train the classifier on the selected corpus.
    parser.add_argument('-t', '--train', help="Train the classifier using the NLTK tokens",
                        required=False, action='store_true')
    # Classify arbitrary text using the trained classifier.
    parser.add_argument('-cl', '--classify', help="Classify the contents of classify.txt",
                        required=False)
    # Fixed copy-pasted comment: enable stemming inside the trainer/classifier.
    parser.add_argument('-s', '--stemming', help="Stem in the classifier or trainer.",
                        required=False, action='store_true')
    # Fixed copy-pasted comment: print the per-word probabilities.
    parser.add_argument('-lp', '--printProbabilities', help="Print each word probability.",
                        required=False, action='store_true')

    # Parse the passed commandline args and turn them into a dictionary.
    args = vars(parser.parse_args())

    # Configure the log level based on passed in args to be one of DEBUG,
    # INFO, WARN, ERROR, CRITICAL
    log.set_log_level_from_args(args)
    return args
def configure_command_line_arguments():
    """Configure and parse the command line arguments for the NLTK word-play app.

    Returns:
        dict: The parsed command line arguments, keyed by destination name.
    """
    # Initialize the commandline argument parser.
    parser = argparse.ArgumentParser(description="Play with words using NLTK.")

    # Configure the log level parser. Verbose shows some logs, veryVerbose
    # shows more.
    logging_group = parser.add_mutually_exclusive_group(required=False)
    logging_group.add_argument(
        "-v", "--verbose", help="Set the log level verbose.", action="store_true", required=False
    )
    # Fixed copy-pasted help text: this flag selects the *very* verbose level.
    logging_group.add_argument(
        "-vv", "--veryVerbose", help="Set the log level very verbose.", action="store_true", required=False
    )

    # In this app we allow the user to choose from a handful of built-in
    # corpora and a user provided one
    corpora_group = parser.add_mutually_exclusive_group(required=True)

    # NLTK supports six built in plaintext corpora. This allows the user
    # to choose between those six corpora or a seventh option - the
    # corpus the user provided.
    # The first option is a corpus taken from ABC news.
    corpora_group.add_argument("-abc", "--abc", help="ABC news corpus", required=False, action="store_true")
    # The second option is the book of Genesis
    corpora_group.add_argument(
        "-gen", "--genesis", help="The book of Genesis from the Bible.", required=False, action="store_true"
    )
    # Third option is a collection of text from project Gutenberg
    corpora_group.add_argument(
        "-gut", "--gutenberg", help="Text from Project Gutenberg.", required=False, action="store_true"
    )
    # Fourth is text from presidential inaugural addresses
    corpora_group.add_argument(
        "-in", "--inaugural", help="Text from inaugural addresses.", required=False, action="store_true"
    )
    # Fifth is text from the State of the Union
    corpora_group.add_argument(
        "-su", "--stateUnion", help="Text from State of the Union Addresses.", required=False, action="store_true"
    )
    # The final NLTK provided corpus is text from the web
    corpora_group.add_argument(
        "-web", "--webtext", help="Text taken from the web.", required=False, action="store_true"
    )
    # Chart mode: compare corpus length before/after stemming and lemmatization.
    corpora_group.add_argument(
        "-svl",
        "--stemVsLemma",
        help="Generate chart of corpus length of original, stemmed and lemmatized word",
        required=False,
        action="store_true",
    )

    # Tell the parser that there is an optional corpus that can be pulled in.
    # The directory can contain multiple files and directories (if the user
    # also passes --recursive)
    fs.add_filesystem_path_args(
        parser,
        "-c",
        "--custom",
        help="Directory of files to include in a custom corpus.",
        required=False,
        group=corpora_group,
    )

    # Optionally, the user is able to stem or lemmatize the input.
    preprocessing_group = parser.add_mutually_exclusive_group(required=False)
    # Select stemming
    preprocessing_group.add_argument("-s", "--stem", help="Stem the input.", required=False, action="store_true")
    # Select lemmatization
    preprocessing_group.add_argument("-l", "--lemma", help="Lemmatize the input.", required=False, action="store_true")

    # What do you want to know? These params allow one or more calculations to be run on
    # the input data. In addition, you can ask the app to stem the data before running any
    # of these calculations
    # Calculate the vocabulary size of the selected corpus
    parser.add_argument(
        "-vs", "--vocabularySize", help="Calculate the vocabulary size.", required=False, action="store_true"
    )
    # List all terms found in the corpus
    parser.add_argument(
        "-tp", "--termPresence", help="List all words that are present.", required=False, action="store_true"
    )
    # List the frequency of terms in the corpus
    parser.add_argument(
        "-tf", "--termFrequency", help="Calculate the frequency of each word.", required=False, action="store_true"
    )
    # Log normalize the term frequencies
    parser.add_argument(
        "-ln", "--logNormalize", help="Calculate the log of the frequency.", required=False, action="store_true"
    )
    # Determine the frequency of each frequency of terms
    parser.add_argument(
        "-ff",
        "--frequencyFrequency",
        help="Calculate the frequency of each frequency. "
        "For example, 7 words appear once, 5 appear twice, etc.",
        required=False,
        action="store_true",
    )

    # Parse the passed commandline args and turn them into a dictionary.
    args = vars(parser.parse_args())

    # Configure the log level based on passed in args to be one of DEBUG,
    # INFO, WARN, ERROR, CRITICAL
    log.set_log_level_from_args(args)
    return args
def configure_command_line_arguments():
    """Build the argument parser for the NLTK word exploration tool.

    Returns:
        dict: The parsed command line arguments, keyed by destination name.
    """
    # Set up the commandline argument parser.
    parser = argparse.ArgumentParser(description='Play with words using NLTK.')

    # Logging verbosity flags: -v shows some logs, -vv shows more. Only one
    # may be given at a time.
    verbosity = parser.add_mutually_exclusive_group(required=False)
    verbosity.add_argument("-v", "--verbose", help="Set the log level verbose.",
                           action='store_true', required=False)
    verbosity.add_argument("-vv", "--veryVerbose", help="Set the log level verbose.",
                           action='store_true', required=False)

    # Exactly one corpus source must be chosen: one of the six NLTK built-in
    # plaintext corpora, the stem-vs-lemma chart mode, or (below) a directory
    # the user provides.
    corpora_group = parser.add_mutually_exclusive_group(required=True)
    builtin_corpora = (
        ('-abc', '--abc', "ABC news corpus"),
        ('-gen', '--genesis', "The book of Genesis from the Bible."),
        ('-gut', '--gutenberg', "Text from Project Gutenberg."),
        ('-in', '--inaugural', "Text from inaugural addresses."),
        ('-su', '--stateUnion', "Text from State of the Union Addresses."),
        ('-web', '--webtext', "Text taken from the web."),
        ('-svl', '--stemVsLemma',
         "Generate chart of corpus length of original, stemmed and lemmatized word"),
    )
    for short_flag, long_flag, description in builtin_corpora:
        corpora_group.add_argument(short_flag, long_flag, help=description,
                                   required=False, action='store_true')

    # An optional user-supplied corpus directory; it may contain nested files
    # and directories (when --recursive is also passed).
    fs.add_filesystem_path_args(parser, '-c', '--custom',
                                help='Directory of files to include in a custom corpus.',
                                required=False, group=corpora_group)

    # Input preprocessing: stemming and lemmatization are mutually exclusive.
    preprocessing_group = parser.add_mutually_exclusive_group(required=False)
    preprocessing_group.add_argument('-s', '--stem', help="Stem the input.",
                                     required=False, action='store_true')
    preprocessing_group.add_argument('-l', '--lemma', help="Lemmatize the input.",
                                     required=False, action='store_true')

    # Calculations to run on the (optionally preprocessed) input; any number
    # of these may be combined on one invocation.
    calculations = (
        ('-vs', '--vocabularySize', "Calculate the vocabulary size."),
        ('-tp', '--termPresence', "List all words that are present."),
        ('-tf', '--termFrequency', "Calculate the frequency of each word."),
        ('-ln', '--logNormalize', "Calculate the log of the frequency."),
        ('-ff', '--frequencyFrequency',
         "Calculate the frequency of each frequency. "
         "For example, 7 words appear once, 5 appear twice, etc."),
    )
    for short_flag, long_flag, description in calculations:
        parser.add_argument(short_flag, long_flag, help=description,
                            required=False, action='store_true')

    # Hand the parsed arguments back as a plain dictionary.
    args = vars(parser.parse_args())

    # Translate the verbosity flags into one of DEBUG, INFO, WARN, ERROR,
    # CRITICAL.
    log.set_log_level_from_args(args)
    return args
def configure_command_line_arguments():
    """Configure and parse the command line arguments for the Naive Bayes classifier.

    Returns:
        dict: The parsed command line arguments, keyed by destination name.
    """
    # Initialize the commandline argument parser.
    parser = argparse.ArgumentParser(description='Naive Bayes Classifier')

    # Configure the log level parser. Verbose shows some logs, veryVerbose
    # shows more.
    logging_group = parser.add_mutually_exclusive_group(required=False)
    logging_group.add_argument("-v", "--verbose", help="Set the log level verbose.",
                               action='store_true', required=False)
    # Fixed copy-pasted help text: this flag selects the *very* verbose level.
    logging_group.add_argument("-vv", "--veryVerbose", help="Set the log level very verbose.",
                               action='store_true', required=False)

    # NLTK supports six built in plaintext corpora. This allows the user
    # to choose between those six corpora or a seventh option - the
    # corpus the user provided.
    # The first is a corpus taken from ABC news.
    parser.add_argument('-abc', '--abc', help="ABC news corpus",
                        required=False, action='store_true')
    # The second corpus is the book of Genesis
    parser.add_argument('-gen', '--genesis', help="The book of Genesis from the Bible.",
                        required=False, action='store_true')
    # Third is a collection of text from project Gutenberg
    parser.add_argument('-gut', '--gutenberg', help="Text from Project Gutenberg.",
                        required=False, action='store_true')
    # Fourth is text from presidential inaugural addresses
    parser.add_argument('-in', '--inaugural', help="Text from inaugural addresses.",
                        required=False, action='store_true')
    # Fifth is text from the State of the Union
    parser.add_argument('-su', '--stateUnion', help="Text from State of the Union Addresses.",
                        required=False, action='store_true')
    # The final NLTK provided corpus is text from the web
    parser.add_argument('-web', '--webtext', help="Text taken from the web.",
                        required=False, action='store_true')

    # Tell the parser that there is an optional corpus that can be pulled in.
    # The directory can contain multiple files and directories (if the user
    # also passes --recursive)
    fs.add_filesystem_path_args(
        parser, '-c', '--custom',
        help='Directory of files to include in a custom corpus.',
        required=False)

    # Train the classifier on the selected corpus.
    parser.add_argument('-t', '--train', help="Train the classifier using the NLTK tokens",
                        required=False, action='store_true')
    # Classify arbitrary text using the trained classifier.
    parser.add_argument('-cl', '--classify', help="Classify the contents of classify.txt",
                        required=False)
    # Fixed copy-pasted comment: enable stemming inside the trainer/classifier.
    parser.add_argument('-s', '--stemming', help="Stem in the classifier or trainer.",
                        required=False, action='store_true')
    # Fixed copy-pasted comment: print the per-word probabilities.
    parser.add_argument('-lp', '--printProbabilities', help="Print each word probability.",
                        required=False, action='store_true')

    # Parse the passed commandline args and turn them into a dictionary.
    args = vars(parser.parse_args())

    # Configure the log level based on passed in args to be one of DEBUG,
    # INFO, WARN, ERROR, CRITICAL
    log.set_log_level_from_args(args)
    return args