def main(source, output, n_folds, n_folds_max, word_max_features, word_min_df, pos_max_features, pos_min_df,
         pos_vec, pos_ngram, type_max_features, type_min_df, type_vec, type_ngram, sparse, feature_name_header):
    """
    Generates a good vs bad training dataset from Fuman user posts. (Binary Classification)

    Concatenates simple features from the database, hand crafted features based on various character and word counts,
    and Tf-Idf weighted bag of words based on the text as well as the part-of-speech tags of Fuman user posts.

    :param source: directory or file of the input files. (If dir, file will be all-scored-rants.csv)
    :param output: the output directory
    :param n_folds: the number of splits to generate (using StratifiedKFold)
    :param n_folds_max: max number of folds to output
    :param pos_max_features: parameter for tf-idf vectorizer of POS (default 3000)
    :param pos_min_df: parameter for tf-idf vectorizer of POS (default 25)
    :param word_max_features: parameter for tf-idf vectorizer of words (default 3000)
    :param word_min_df: parameter for tf-idf vectorizer of words (default 25)
    :param pos_vec: [tfidf, count] use corresponding term weighting
    :param pos_ngram: Learn vocabulary with ngrams in range (1,pos_ngram) (default is 3)
    :param sparse: output in svmlight sparse format
    :param feature_name_header: output headers as the feature names
    """
    if not os.path.isdir(output):
        raise ValueError("Output must be a directory")

    if os.path.isfile(source):
        source_dir, source_filename = os.path.split(source)
    else:
        source_dir = source
        source_filename = PRICE_FILENAME
    logging.info("Source dump: {}/{}".format(source_dir, source_filename))

    timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
    output_path = os.path.join(output, "price-{}".format(timestamp))

    logging.info("Timestamp: {}".format(timestamp))
    pos_dict_filename = os.path.join(output_path, "pos-features-" + timestamp + ".json")
    type_dict_filename = os.path.join(output_path, "type-features-" + timestamp + ".json")
    rant_dict_filename = os.path.join(output_path, "rant-features-" + timestamp + ".json")

    rant_stats_vectorizer = DictVectorizer()
    userprofile_vectorizer = DictVectorizer()
    transformer_list = [
        ('rant_stats', Pipeline([
            ('selector', FieldSelector(key='rant')),
            ('stats', RantStats()),  # returns a list of dicts
            ('vect', rant_stats_vectorizer),  # list of dicts -> feature matrix
        ])),
        ('userprofile_stats', Pipeline([
            ('selector', FieldSelector(key='userprofile')),
            ('stats', UserProfileStats()),  # returns a list of dicts
            ('vect', userprofile_vectorizer),  # list of dicts -> feature matrix
        ])),
    ]

    pos_vectorizer_func = VECTORIZERS[pos_vec]
    pos_vectorizer = None
    type_vectorizer_func = VECTORIZERS[type_vec]
    type_vectorizer = None
    word_vectorizer = None
    if pos_max_features:
        pos_vectorizer = pos_vectorizer_func(tokenizer=tokenize_pos, ngram_range=(1, pos_ngram),
                                             strip_accents='unicode', min_df=pos_min_df, max_features=pos_max_features)
        transformer_list.append(('pos_bow', Pipeline([
            ('selector', FieldSelector(key='rant')),
            ('vectorize', pos_vectorizer),
        ])))
    if type_max_features:
        type_vectorizer = type_vectorizer_func(tokenizer=tokenize_token_type, ngram_range=(1, type_ngram),
                                               strip_accents='unicode', min_df=type_min_df,
                                               max_features=type_max_features)
        transformer_list.append(('type_bow', Pipeline([
            ('selector', FieldSelector(key='rant')),
            ('vectorize', type_vectorizer),
        ])))
    if word_max_features:
        word_vectorizer = TfidfVectorizer(tokenizer=tokenize_rant, strip_accents='unicode', min_df=word_min_df,
                                          max_features=word_max_features)
        transformer_list.append(('rant_bow', Pipeline([
            ('selector', FieldSelector(key='rant')),
            ('vectorize', word_vectorizer),
        ])))
    # FeatureUnion horizontally concatenates the output of every transformer above
    # (the stats dicts plus any enabled bag-of-words blocks) into one feature matrix.
    pipeline = Pipeline([
        ('union', FeatureUnion(transformer_list=transformer_list))
    ])

    fuman_data = load_fuman_price(source_dir, filename=source_filename)
    logging.info("Processing pipeline...")
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", module="sklearn")
        instances = pipeline.fit_transform(fuman_data.data)
        n_samples = instances.shape[0]
        y = np.asarray(fuman_data.target, dtype=np.int8).reshape((n_samples,))

        pos_features = list()
        type_features = list()
        rant_features = list()
        if pos_max_features:
            pos_features = pos_vectorizer.get_feature_names()
            save_features_json(pos_dict_filename, pos_features)
        if type_max_features:
            type_features = type_vectorizer.get_feature_names()
            save_features_json(type_dict_filename, type_features)
        if word_max_features:
            rant_features = word_vectorizer.get_feature_names()
            save_features_json(rant_dict_filename, rant_features)
        header = make_header(rant_stats_vectorizer.get_feature_names(),
                             userprofile_vectorizer.get_feature_names(),
                             pos_features, type_features, rant_features, feature_name_header)
        logging.info("Saving {} of {} folds to disk...".format(n_folds_max, n_folds))
        if n_folds == 1:
            dump_csv(output_path, instances, y, "price", 0, header, timestamp, sparse)
        else:
            # KFold's test indices partition the data into n_folds disjoint chunks;
            # each chunk is written to its own file, up to n_folds_max files.
            kf = KFold(n=n_samples, n_folds=n_folds, shuffle=True)
            for i, (_, test_index) in enumerate(kf, 1):
                dump_csv(output_path, instances[test_index], y[test_index], "price", i, header, timestamp, sparse)
                if i == n_folds_max:
                    break
        save_dataset_metadata(sparse, output_path, "price", source_filepath=source, timestamp=timestamp,
                              word_vectorizer=word_vectorizer, tokenize_rant=tokenize_rant,
                              pos_vectorizer=pos_vectorizer, tokenize_pos=tokenize_pos, type_vectorizer=type_vectorizer,
                              tokenize_type=tokenize_token_type)
    logging.info("Work complete!")
Example #3
def main(source, output, n_folds, n_folds_max, type_max_features, type_min_df, type_ngram, pos_max_features, pos_min_df,
         pos_ngram, pos_vec_type, sparse, feature_name_header):
    """
    Generates a good vs bad training dataset from Fuman user posts. (Binary Classification)

    Concatenates simple features from the database, hand crafted features based on various character and word counts,
    and Tf-Idf weighted bag of words based on the text as well as the part-of-speech tags of Fuman user posts.

    :param source: directory or file of the input files. (If dir, file will be all-scored-rants.csv)
    :param output: the output directory
    :param n_folds: the number of splits to generate (using StratifiedKFold)
    :param pos_max_features: parameter for tf-idf vectorizer (default 50000)
    :param pos_min_df: parameter for tf-idf vectorizer (default 100)
    :param pos_vec: [tfidf, count] use corresponding term weighting
    :param pos_ngram: Learn vocabulary with ngrams in range (1,pos_ngram) (default is 3)
    """
    if not os.path.isdir(output):
        raise ValueError("Output must be a directory")

    if os.path.isfile(source):
        raise ValueError("Source must be a directory")
    logging.info("Source dump: {}".format(source))
    timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H_%M_%S')
    logging.info("Timestamp: {}".format(timestamp))
    output_path = os.path.join(output, "gvsb-{}".format(timestamp))

    rant_stats_vectorizer = DictVectorizer()
    stats_pipeline = Pipeline([('stats', RantStats()),  # returns a list of dicts
                               ('vect', rant_stats_vectorizer)])  # list of dicts -> feature matrix
    type_vec = CountVectorizer(tokenizer=tokenize_token_type, ngram_range=(1, type_ngram), strip_accents='unicode',
                               min_df=type_min_df, max_features=type_max_features)
    transformer_list = [
        ('rant_stats', stats_pipeline),
        ("type_vec", type_vec),
    ]
    pos_vec = None
    pos_dict_filename = os.path.join(output_path, "pos-vocabulary-" + timestamp + ".json")
    if pos_max_features:
        logging.info("Adding POS vectorization with max_features: {} ngram: {} max_df: {}".format(pos_max_features,
                                                                                                  pos_ngram,
                                                                                                  pos_min_df))
        pos_vec = VECTORIZERS[pos_vec_type](tokenizer=tokenize_pos, ngram_range=(1, pos_ngram), strip_accents='unicode',
                                            min_df=pos_min_df, max_features=pos_max_features)
        transformer_list.append(("pos_vec", pos_vec))

    pipeline = Pipeline([
        ('union', FeatureUnion(transformer_list))
    ])

    fuman_data = load_fuman_gvb(source, good_filename=GOOD_FILENAME, bad_filename=BAD_FILENAME)
    logging.info("Processing pipeline...")
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="deprecated", module="sklearn")
        instances = pipeline.fit_transform(fuman_data.data)
        n_samples = instances.shape[0]
        y = np.asarray(fuman_data.target, dtype=np.int8).reshape((n_samples,))

        pos_features = list()
        if pos_max_features:
            pos_features = pos_vec.get_feature_names()
            save_features_json(pos_dict_filename, pos_features)

        header = make_header(rant_stats_vectorizer.get_feature_names(),
                             token_type_features=type_vec.get_feature_names(),
                             pos_features=pos_features,
                             feature_name_header=feature_name_header)

        logging.info("Saving {} folds to disk...".format(n_folds))
        splits = StratifiedShuffleSplit(y, test_size=1.0 / n_folds)
        for i, (_, test_index) in enumerate(splits, 1):
            dump_csv(output_path, instances[test_index], y[test_index], "gvsb", i, header, timestamp, sparse)
            if i == n_folds_max:
                break
        save_dataset_metadata(sparse, output_path, "goodvsbad", source_filepath=source, timestamp=timestamp,
                              pos_vectorizer=pos_vec, tokenize_pos=tokenize_pos)
    logging.info("Work complete!")