Example #1
def configure_command_line_arguments():
    """Build and parse the command-line arguments for the Naive Bayes classifier.

    Defines mutually exclusive logging flags, the six built-in NLTK corpora
    plus an optional custom-corpus path, and the train/classify options.

    Returns:
        dict: Parsed arguments as an argument-name -> value mapping.
    """
    # Initialize the commandline argument parser.
    parser = argparse.ArgumentParser(description='Naive Bayes Classifier')

    # Configure the log level parser.  Verbose shows some logs, veryVerbose
    # shows more.  The two flags are mutually exclusive.
    logging_group = parser.add_mutually_exclusive_group(required=False)
    logging_group.add_argument("-v",
                               "--verbose",
                               help="Set the log level verbose.",
                               action='store_true',
                               required=False)

    logging_group.add_argument("-vv",
                               "--veryVerbose",
                               help="Set the log level very verbose.",
                               action='store_true',
                               required=False)

    # NLTK supports six built in plaintext corpora.  This allows the user
    # to choose between those six corpora or a seventh option - the
    # corpus the user provided.
    # The first is a corpus taken from ABC news.
    parser.add_argument('-abc',
                        '--abc',
                        help="ABC news corpus",
                        required=False,
                        action='store_true')

    # The second corpus is the book of Genesis
    parser.add_argument('-gen',
                        '--genesis', help="The book of Genesis from the Bible.",
                        required=False,
                        action='store_true')

    # Third is a collection of text from project Gutenberg
    parser.add_argument('-gut',
                        '--gutenberg', help="Text from Project Gutenberg.",
                        required=False,
                        action='store_true')

    # Fourth is text from presidential inaugural addresses
    parser.add_argument('-in',
                        '--inaugural', help="Text from inaugural addresses.",
                        required=False,
                        action='store_true')

    # Fifth is text from the State of the Union
    parser.add_argument('-su',
                        '--stateUnion', help="Text from State of the Union Addresses.",
                        required=False,
                        action='store_true')

    # The final NLTK provided corpus is text from the web
    parser.add_argument('-web',
                        '--webtext', help="Text taken from the web.",
                        required=False,
                        action='store_true')

    # Tell the parser that there is an optional corpus that can be pulled in.
    # The directory can contain multiple files and directories (if the user
    # also passes --recursive)
    fs.add_filesystem_path_args(parser,
                                '-c',
                                '--custom',
                                help='Directory of files to include in a custom corpus.',
                                required=False)

    # Run the training phase against the selected corpus.
    parser.add_argument('-t',
                        '--train',
                        help="Train the classifier using the NLTK tokens",
                        required=False,
                        action='store_true')

    # Classify user-supplied text with the trained model.
    parser.add_argument('-cl',
                        '--classify',
                        help="Classify the contents of classify.txt",
                        required=False)

    # Optionally stem tokens during training/classification.
    parser.add_argument('-s',
                        '--stemming', help="Stem in the classifier or trainer.",
                        required=False,
                        action='store_true')

    # Optionally print the per-word probabilities computed by the model.
    parser.add_argument('-lp',
                        '--printProbabilities', help="Print each word probability.",
                        required=False,
                        action='store_true')

    # Parse the passed commandline args and turn them into a dictionary.
    args = vars(parser.parse_args())

    # Configure the log level based on passed in args to be one of DEBUG, INFO, WARN, ERROR, CRITICAL
    log.set_log_level_from_args(args)

    return args
Example #2
def configure_command_line_arguments():
    """Build and parse the command-line arguments for the NLTK word-play app.

    Defines mutually exclusive logging flags, a required mutually exclusive
    corpus selection (six built-in NLTK corpora, a stem-vs-lemma chart mode,
    or a user-supplied directory), optional stem/lemma preprocessing, and a
    set of independent calculation flags.

    Returns:
        dict: Parsed arguments as an argument-name -> value mapping.
    """
    # Initialize the commandline argument parser.
    parser = argparse.ArgumentParser(description="Play with words using NLTK.")

    # Configure the log level parser.  Verbose shows some logs, veryVerbose
    # shows more.  The two flags are mutually exclusive.
    logging_group = parser.add_mutually_exclusive_group(required=False)
    logging_group.add_argument(
        "-v", "--verbose", help="Set the log level verbose.", action="store_true", required=False
    )
    logging_group.add_argument(
        "-vv", "--veryVerbose", help="Set the log level very verbose.", action="store_true", required=False
    )

    # In this app we allow the user to choose from a handful of built-in
    # corpora and a user provided one
    corpora_group = parser.add_mutually_exclusive_group(required=True)

    # NLTK supports six built in plaintext corpora.  This allows the user
    # to choose between those six corpora or a seventh option - the
    # corpus the user provided.
    # The first option is a corpus taken from ABC news.
    corpora_group.add_argument("-abc", "--abc", help="ABC news corpus", required=False, action="store_true")

    # The second option is the book of Genesis
    corpora_group.add_argument(
        "-gen", "--genesis", help="The book of Genesis from the Bible.", required=False, action="store_true"
    )

    # Third option is a collection of text from project Gutenberg
    corpora_group.add_argument(
        "-gut", "--gutenberg", help="Text from Project Gutenberg.", required=False, action="store_true"
    )

    # Fourth is text from presidential inaugural addresses
    corpora_group.add_argument(
        "-in", "--inaugural", help="Text from inaugural addresses.", required=False, action="store_true"
    )

    # Fifth is text from the State of the Union
    corpora_group.add_argument(
        "-su", "--stateUnion", help="Text from State of the Union Addresses.", required=False, action="store_true"
    )

    # The final NLTK provided corpus is text from the web
    corpora_group.add_argument(
        "-web", "--webtext", help="Text taken from the web.", required=False, action="store_true"
    )

    # Alternative mode: chart how stemming/lemmatization change corpus length.
    corpora_group.add_argument(
        "-svl",
        "--stemVsLemma",
        help="Generate chart of corpus length of original, stemmed and lemmatized word",
        required=False,
        action="store_true",
    )

    # Tell the parser that there is an optional corpus that can be pulled in.
    # The directory can contain multiple files and directories (if the user
    # also passes --recursive)
    fs.add_filesystem_path_args(
        parser,
        "-c",
        "--custom",
        help="Directory of files to include in a custom corpus.",
        required=False,
        group=corpora_group,
    )

    # Optionally, the user is able to stem or lemmatize the input.
    preprocessing_group = parser.add_mutually_exclusive_group(required=False)

    # Select stemming
    preprocessing_group.add_argument("-s", "--stem", help="Stem the input.", required=False, action="store_true")

    # Select lemmatization
    preprocessing_group.add_argument("-l", "--lemma", help="Lemmatize the input.", required=False, action="store_true")

    # What do you want to know?  These params allow one or more calculations to be run on
    # the input data.  In addition, you can ask the app to stem the data before running any
    # of these calculations

    # Calculate the vocabulary size of the selected corpus
    parser.add_argument(
        "-vs", "--vocabularySize", help="Calculate the vocabulary size.", required=False, action="store_true"
    )

    # List all terms found in the corpus
    parser.add_argument(
        "-tp", "--termPresence", help="List all words that are present.", required=False, action="store_true"
    )

    # List the frequency of terms in the corpus
    parser.add_argument(
        "-tf", "--termFrequency", help="Calculate the frequency of each word.", required=False, action="store_true"
    )

    # Log normalize the term frequencies
    parser.add_argument(
        "-ln", "--logNormalize", help="Calculate the log of the frequency.", required=False, action="store_true"
    )

    # Determine the frequency of each frequency of terms
    parser.add_argument(
        "-ff",
        "--frequencyFrequency",
        help="Calculate the frequency of each frequency.  For example, 7 words appear once, 5 appear twice, etc.",
        required=False,
        action="store_true",
    )

    # Parse the passed commandline args and turn them into a dictionary.
    args = vars(parser.parse_args())

    # Configure the log level based on passed in args to be one of DEBUG, INFO, WARN, ERROR, CRITICAL
    log.set_log_level_from_args(args)

    return args
Example #3
def configure_command_line_arguments():
    """Build and parse the command-line arguments for the NLTK word-play app.

    Defines mutually exclusive logging flags, a required mutually exclusive
    corpus selection (six built-in NLTK corpora, a stem-vs-lemma chart mode,
    or a user-supplied directory), optional stem/lemma preprocessing, and a
    set of independent calculation flags.

    Returns:
        dict: Parsed arguments as an argument-name -> value mapping.
    """
    # Initialize the commandline argument parser.
    parser = argparse.ArgumentParser(description='Play with words using NLTK.')

    # Configure the log level parser.  Verbose shows some logs, veryVerbose
    # shows more.  The two flags are mutually exclusive.
    logging_group = parser.add_mutually_exclusive_group(required=False)
    logging_group.add_argument("-v",
                               "--verbose",
                               help="Set the log level verbose.",
                               action='store_true',
                               required=False)
    logging_group.add_argument("-vv",
                               "--veryVerbose",
                               help="Set the log level very verbose.",
                               action='store_true',
                               required=False)

    # In this app we allow the user to choose from a handful of built-in
    # corpora and a user provided one
    corpora_group = parser.add_mutually_exclusive_group(required=True)

    # NLTK supports six built in plaintext corpora.  This allows the user
    # to choose between those six corpora or a seventh option - the
    # corpus the user provided.
    # The first option is a corpus taken from ABC news.
    corpora_group.add_argument('-abc',
                               '--abc',
                               help="ABC news corpus",
                               required=False,
                               action='store_true')

    # The second option is the book of Genesis
    corpora_group.add_argument('-gen',
                               '--genesis',
                               help="The book of Genesis from the Bible.",
                               required=False,
                               action='store_true')

    # Third option is a collection of text from project Gutenberg
    corpora_group.add_argument('-gut',
                               '--gutenberg',
                               help="Text from Project Gutenberg.",
                               required=False,
                               action='store_true')

    # Fourth is text from presidential inaugural addresses
    corpora_group.add_argument('-in',
                               '--inaugural',
                               help="Text from inaugural addresses.",
                               required=False,
                               action='store_true')

    # Fifth is text from the State of the Union
    corpora_group.add_argument('-su',
                               '--stateUnion',
                               help="Text from State of the Union Addresses.",
                               required=False,
                               action='store_true')

    # The final NLTK provided corpus is text from the web
    corpora_group.add_argument('-web',
                               '--webtext',
                               help="Text taken from the web.",
                               required=False,
                               action='store_true')

    # Alternative mode: chart how stemming/lemmatization change corpus length.
    corpora_group.add_argument(
        '-svl',
        '--stemVsLemma',
        help="Generate chart of corpus length of original, stemmed and lemmatized word",
        required=False,
        action='store_true')

    # Tell the parser that there is an optional corpus that can be pulled in.
    # The directory can contain multiple files and directories (if the user
    # also passes --recursive)
    fs.add_filesystem_path_args(
        parser,
        '-c',
        '--custom',
        help='Directory of files to include in a custom corpus.',
        required=False,
        group=corpora_group)

    # Optionally, the user is able to stem or lemmatize the input.
    preprocessing_group = parser.add_mutually_exclusive_group(required=False)

    # Select stemming
    preprocessing_group.add_argument('-s',
                                     '--stem',
                                     help="Stem the input.",
                                     required=False,
                                     action='store_true')

    # Select lemmatization
    preprocessing_group.add_argument('-l',
                                     '--lemma',
                                     help="Lemmatize the input.",
                                     required=False,
                                     action='store_true')

    # What do you want to know?  These params allow one or more calculations to be run on
    # the input data.  In addition, you can ask the app to stem the data before running any
    # of these calculations

    # Calculate the vocabulary size of the selected corpus
    parser.add_argument('-vs',
                        '--vocabularySize',
                        help="Calculate the vocabulary size.",
                        required=False,
                        action='store_true')

    # List all terms found in the corpus
    parser.add_argument('-tp',
                        '--termPresence',
                        help="List all words that are present.",
                        required=False,
                        action='store_true')

    # List the frequency of terms in the corpus
    parser.add_argument('-tf',
                        '--termFrequency',
                        help="Calculate the frequency of each word.",
                        required=False,
                        action='store_true')

    # Log normalize the term frequencies
    parser.add_argument('-ln',
                        '--logNormalize',
                        help="Calculate the log of the frequency.",
                        required=False,
                        action='store_true')

    # Determine the frequency of each frequency of terms
    parser.add_argument(
        '-ff',
        '--frequencyFrequency',
        help="Calculate the frequency of each frequency.  For example, 7 words appear once, 5 appear twice, etc.",
        required=False,
        action='store_true')

    # Parse the passed commandline args and turn them into a dictionary.
    args = vars(parser.parse_args())

    # Configure the log level based on passed in args to be one of DEBUG, INFO, WARN, ERROR, CRITICAL
    log.set_log_level_from_args(args)

    return args
Example #4
def configure_command_line_arguments():
    """Build and parse the command-line arguments for the Naive Bayes classifier.

    Defines mutually exclusive logging flags, the six built-in NLTK corpora
    plus an optional custom-corpus path, and the train/classify options.

    Returns:
        dict: Parsed arguments as an argument-name -> value mapping.
    """
    # Initialize the commandline argument parser.
    parser = argparse.ArgumentParser(description='Naive Bayes Classifier')

    # Configure the log level parser.  Verbose shows some logs, veryVerbose
    # shows more.  The two flags are mutually exclusive.
    logging_group = parser.add_mutually_exclusive_group(required=False)
    logging_group.add_argument("-v",
                               "--verbose",
                               help="Set the log level verbose.",
                               action='store_true',
                               required=False)

    logging_group.add_argument("-vv",
                               "--veryVerbose",
                               help="Set the log level very verbose.",
                               action='store_true',
                               required=False)

    # NLTK supports six built in plaintext corpora.  This allows the user
    # to choose between those six corpora or a seventh option - the
    # corpus the user provided.
    # The first is a corpus taken from ABC news.
    parser.add_argument('-abc',
                        '--abc',
                        help="ABC news corpus",
                        required=False,
                        action='store_true')

    # The second corpus is the book of Genesis
    parser.add_argument('-gen',
                        '--genesis',
                        help="The book of Genesis from the Bible.",
                        required=False,
                        action='store_true')

    # Third is a collection of text from project Gutenberg
    parser.add_argument('-gut',
                        '--gutenberg',
                        help="Text from Project Gutenberg.",
                        required=False,
                        action='store_true')

    # Fourth is text from presidential inaugural addresses
    parser.add_argument('-in',
                        '--inaugural',
                        help="Text from inaugural addresses.",
                        required=False,
                        action='store_true')

    # Fifth is text from the State of the Union
    parser.add_argument('-su',
                        '--stateUnion',
                        help="Text from State of the Union Addresses.",
                        required=False,
                        action='store_true')

    # The final NLTK provided corpus is text from the web
    parser.add_argument('-web',
                        '--webtext',
                        help="Text taken from the web.",
                        required=False,
                        action='store_true')

    # Tell the parser that there is an optional corpus that can be pulled in.
    # The directory can contain multiple files and directories (if the user
    # also passes --recursive)
    fs.add_filesystem_path_args(
        parser,
        '-c',
        '--custom',
        help='Directory of files to include in a custom corpus.',
        required=False)

    # Run the training phase against the selected corpus.
    parser.add_argument('-t',
                        '--train',
                        help="Train the classifier using the NLTK tokens",
                        required=False,
                        action='store_true')

    # Classify user-supplied text with the trained model.
    parser.add_argument('-cl',
                        '--classify',
                        help="Classify the contents of classify.txt",
                        required=False)

    # Optionally stem tokens during training/classification.
    parser.add_argument('-s',
                        '--stemming',
                        help="Stem in the classifier or trainer.",
                        required=False,
                        action='store_true')

    # Optionally print the per-word probabilities computed by the model.
    parser.add_argument('-lp',
                        '--printProbabilities',
                        help="Print each word probability.",
                        required=False,
                        action='store_true')

    # Parse the passed commandline args and turn them into a dictionary.
    args = vars(parser.parse_args())

    # Configure the log level based on passed in args to be one of DEBUG, INFO, WARN, ERROR, CRITICAL
    log.set_log_level_from_args(args)

    return args