Esempio n. 1
0
def main():
  """Cross-validate five classifiers and a hard-voting ensemble of them.

  Command-line arguments:
    --id / --input-data: path to a pickled object holding the data.

  The unpickled object is accessed through ``.iloc``: the column at
  position 2 is used as the features ``X`` and the column at position 1 as
  the labels ``y``. Each classifier, and then the ensemble, is scored with
  5-fold cross validation on accuracy; mean and standard deviation are
  printed per classifier.
  """
  INPUT_DATA = os.path.join("..", "..", "data", "features", "bag_of_words_sparse_matrix.p")

  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__,
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
  parser.add_argument('--id',
                      '--input-data',
                      type=str,
                      dest='input_data',
                      default=INPUT_DATA,
                      help="File with input data in matrix format. Defaults to '%(default)s'")

  # TODO: a PCA command line argument should go here (whether to apply PCA
  # and how much of the energy to keep).

  args = parser.parse_args()

  # Fixed seed so that repeated runs are comparable.
  np.random.seed(123)

  clf1 = LogisticRegression()
  clf2 = RandomForestClassifier()
  clf3 = GaussianNB()
  clf4 = DecisionTreeClassifier()
  clf5 = AdaBoostClassifier()

  print('5-fold cross validation:\n')

  # NOTE: pickle executes arbitrary code while loading; only point
  # --input-data at files from a trusted source.
  with open(args.input_data, "rb") as data_file:
    sparse_mat = pickle.load(data_file)

  X = sparse_mat.iloc[:, 2]
  y = sparse_mat.iloc[:, 1]

  # TODO: if the PCA command line argument is set, perform PCA on X here.

  # Ensemble classifier: majority ("hard") vote over the five base models.
  eclf = ensemble_classifier.EnsembleClassifier(clfs=[clf1, clf2, clf3, clf4, clf5], voting='hard')

  classifiers = [clf1, clf2, clf3, clf4, clf5, eclf]
  labels = ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Decision Tree', 'AdaBoost', 'Ensemble']

  for clf, label in zip(classifiers, labels):
    scores = cross_validation.cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.5f (+/- %0.5f) [%s]" % (scores.mean(), scores.std(), label))
def main():
    """Cross-validate a text classifier on item descriptions and save predictions.

    Command-line arguments:
        -i / --input-file: pickled pandas object with the columns
            'type_class', 'source_class', 'frID' and 'description'.
        -g / --gridres-file: optional pickle whose first element is a
            mapping of grid-search parameters ('clf__max_depth', 'clf__C').
        -n / -ngrams: upper n-gram size; the lower bound is always 1. If
            several values are given, the largest one is used.
        --ti: insert a TfidfTransformer between vectorizer and classifier.
        -c / --classifier: 'decision-tree', 'random-forest',
            'logistic-regression'; any other value selects the ensemble.
        --cat: which label column to predict.
        --od / --output-file-desc: CSV file receiving the predictions.

    The data is split with a shuffled, stratified 5-fold scheme. For every
    fold the pipeline is refit and the held-out predictions are stored; the
    mean and standard deviation of the fold accuracies are printed, and a
    CSV with frID, true label, predicted label and description is written.
    """
    INPUT_FILE = os.path.join("..", "..", "data", "features",
                              "bow_string_input_dframe.p")
    OUTPUT_FILE_DESC = os.path.join("..", "..", "data", "output",
                                    'predicted_labels_ensemble.csv')

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument(
        '-i',
        '--input-file',
        type=str,
        dest='input_file',
        default=INPUT_FILE,
        help=
        "File with the list of item classes and features. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '-g',
        '--gridres-file',
        type=str,
        dest='gridres_file',
        default=None,
        help=
        "File with the best parameters of the grid search. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '-n',
        '-ngrams',
        dest='nGrams',
        type=int,
        default=1,
        nargs='+',
        help=
        'Defines how to split words by ngrams. Default is tokenized to one word ngrams'
    )

    parser.add_argument('--ti',
                        dest='tf_idf',
                        action='store_true',
                        default=False,
                        help='Boolean - If set, TfIdf features will be used')

    parser.add_argument(
        '-c',
        '--classifier',
        type=str,
        dest='classifier',
        default="ensemble",
        help="The classifier to be used. Defaults to '%(default)s'")

    parser.add_argument(
        '--cat',
        type=str,
        dest='category',
        default='income-type',
        choices=('income-type', 'income-source', 'expenditure-type'),
        help="The type of categorization. Defaults to '%(default)s'")

    parser.add_argument(
        '--od',
        '--output-file-desc',
        type=str,
        dest='output_file_desc',
        default=OUTPUT_FILE_DESC,
        help=
        "A csv file to output the predicted labels. Defaults to '%(default)s'")

    args = parser.parse_args()

    # Read the input data
    data_in = pandas.read_pickle(args.input_file)
    type_classes = list(data_in['type_class'])
    source_classes = list(data_in['source_class'])
    frID = list(data_in['frID'])
    data_orig = data_in['description']
    # The fold indices below are positions, while indexing a Series with
    # [] is label based. A plain list keeps the descriptions aligned with
    # labels_orig whatever index the input carries.
    descriptions = list(data_orig)

    use_type_class = args.category in ('income-type', 'expenditure-type')
    chosen_classes = type_classes if use_type_class else source_classes
    # converting them to strings if they are not strings already
    labels_orig = [str(i) for i in chosen_classes]

    gridres_params = None
    if args.gridres_file is not None:
        # NOTE: pickle executes arbitrary code while loading; only use
        # grid-search result files from a trusted source.
        with open(args.gridres_file, "rb") as gridres_file:
            gridres_params = pickle.load(gridres_file)[0]

    if args.classifier == 'decision-tree':
        max_depth = gridres_params[
            'clf__max_depth'] if gridres_params is not None else 100
        clf = tree.DecisionTreeClassifier(max_depth=max_depth,
                                          criterion='entropy')
        print("Will run a decision tree with max depth=%d" % max_depth)
    elif args.classifier == 'random-forest':
        max_depth = gridres_params[
            'clf__max_depth'] if gridres_params is not None else 100
        clf = ensemble.RandomForestClassifier(max_depth=max_depth,
                                              criterion='entropy')
        print("Will run a random forest with max depth=%d" % max_depth)
    elif args.classifier == 'logistic-regression':
        C = gridres_params['clf__C'] if gridres_params is not None else 1
        clf = linear_model.LogisticRegression(C=C)
        # %g instead of %d: C may be fractional and %d would print 0.1 as 0.
        print("Will run logistic regressor with C=%g" % C)
    else:  # ensemble
        clf1 = tree.DecisionTreeClassifier(max_depth=100, criterion='entropy')
        clf2 = ensemble.RandomForestClassifier()
        clf3 = linear_model.LogisticRegression()
        clf = ensemble_classifier.EnsembleClassifier(clfs=[clf1, clf2, clf3],
                                                     voting='hard')

    # With nargs='+' argparse yields a list whenever -n is given, but the
    # bare int default otherwise; ngram_range needs a single int upper bound.
    if isinstance(args.nGrams, (list, tuple)):
        max_ngram = max(args.nGrams)
    else:
        max_ngram = args.nGrams

    vectorizer = feature_extraction.text.CountVectorizer(
        analyzer='word',  # n-grams are built from words, not characters
        binary=False,  # keep the real counts instead of 0/1 indicators
        decode_error='strict',  # fail on bytes invalid in the encoding
        encoding="ISO-8859-15",
        input='content',  # can be 'file', 'filename' or 'content'
        lowercase=False,  # do not lowercase before tokenizing
        max_df=1.0,  # no upper document-frequency cut-off
        max_features=None,  # do not cap the vocabulary size
        ngram_range=(1, max_ngram),  # all n with 1 <= n <= max_ngram
        preprocessor=None,
        stop_words=None,
        min_df=1,
        strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary=None)

    steps = [('vectorizer', vectorizer)]
    if args.tf_idf:
        steps.append(('transformer',
                      feature_extraction.text.TfidfTransformer()))
    steps.append(('clf', clf))
    ppl = pipeline.Pipeline(steps)

    k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)

    # -1 marks samples that have not been predicted yet.
    labels_predicted = [-1] * len(labels_orig)

    accuracy = []

    for train_idx, dev_idx in k_fold:
        data_train = [descriptions[i] for i in train_idx]
        data_dev = [descriptions[i] for i in dev_idx]
        labels_train = [labels_orig[i] for i in train_idx]
        labels_dev = [labels_orig[i] for i in dev_idx]

        ppl.fit(data_train, labels_train)
        predicted_dev = ppl.predict(data_dev)
        labels_predicted = set_all_predicted(predicted_dev, labels_predicted,
                                             dev_idx)

        accuracy.append(metrics.accuracy_score(labels_dev, predicted_dev))

    print("Accuracy of the %s classifier: %.4f +- %.4f" %
          (args.classifier, numpy.mean(accuracy), numpy.std(accuracy)))

    # Save the predicted classes
    helpers.ensure_dir(os.path.dirname(args.output_file_desc))

    # create a dataframe to output the true class, the predicted class and
    # the description data
    class_column = 'type_class' if use_type_class else 'source_class'
    dump_op_desc = pandas.DataFrame({
        'frID': frID,
        class_column: labels_orig,
        class_column + '_predicted': labels_predicted,
        'description': data_orig
    })
    dump_op_desc.to_csv(args.output_file_desc)
# Two base learners; each one is trained on its own half of the columns of X.
clf1 = LogisticRegression()
clf2 = DecisionTreeClassifier()

# NOTE(review): X and y are not assigned in this fragment; the loads below
# were already disabled, so both presumably come from earlier code. Confirm.
#X = pickle.load(open('../../data/features/hier_data.pkl', 'rb'))
#X = X.toarray()
#y = pickle.load(open('../../data/features/hier_labels.pkl', 'rb'))

# Split the column positions of X at the midpoint.
Nend = X.shape[1]
N1 = math.floor(Nend / 2)
vec1 = range(N1)
vec2 = range(N1, Nend)

# First half of the columns -> logistic regression.
pipe1 = Pipeline([
    ('sel', ensemble_classifier.ColumnSelector(vec1)),
    ('clf', clf1),
])

# Second half of the columns -> LDA down to one component -> decision tree.
pipe2 = Pipeline([
    ('sel', ensemble_classifier.ColumnSelector(vec2)),
    ('dim', LDA(n_components=1)),
    ('clf', clf2),
])

# Combine both pipelines and score the ensemble with 5-fold cross validation.
eclf = ensemble_classifier.EnsembleClassifier([pipe1, pipe2])
scores = cross_validation.cross_val_score(eclf, X, y, cv=5, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (
    scores.mean(), scores.std(), 'pipeline classifier'))
Esempio n. 4
0
def main():
  """Cross-validate a classifier on dictionary features and pickle the predictions.

  Command-line arguments:
    -i / --input-file: pickle holding the triple
        (type_classes, source_classes, token_container).
    -c / --classifier: 'decision-tree', 'random-forest',
        'logistic-regression'; any other value selects the ensemble.
    --orig-labels: accepted for compatibility; this function never reads it.
    --cat: which set of labels to predict.
    -o / -output-file: pickle file receiving
        [original labels, predicted labels].

  The samples are vectorized with a DictVectorizer and evaluated with a
  shuffled, stratified 5-fold scheme; the mean and standard deviation of
  the fold accuracies are printed.
  """
  INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
  OUTPUT_FILE = os.path.join("..", "..", "data", "output", 'predicted_labels_ensemble.pkl')

  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__,
                                   formatter_class=argparse.RawDescriptionHelpFormatter)

  parser.add_argument('-i',
                      '--input-file',
                      type=str,
                      dest='input_file',
                      default=INPUT_FILE,
                      help="File with the list of item classes and features. Defaults to '%(default)s'")

  parser.add_argument('-c',
                      '--classifier',
                      type=str,
                      dest='classifier',
                      default="ensemble",
                      help="The classifier to be used. Defaults to '%(default)s'")

  parser.add_argument('--orig-labels',
                      dest='orig_labels',
                      action='store_true',
                      default=False,
                      help='Boolean - If set, the original data labels will be stored. Otherwise, they will be coded as integers')

  parser.add_argument('--cat',
                      type=str,
                      dest='category',
                      default='income-type',
                      choices=('income-type','income-source','expenditure-type'),
                      help="The type of categorization. Defaults to '%(default)s'")

  parser.add_argument('-o',
                      '-output-file',
                      type=str,
                      dest='output_file',
                      default=OUTPUT_FILE,
                      help="A file to output the predicted labels. Defaults to '%(default)s'")

  args = parser.parse_args()

  # Read the input data.
  # NOTE: pickle executes arbitrary code while loading; only use input
  # files from a trusted source.
  with open(args.input_file, "rb") as input_file:
    type_classes, source_classes, token_container = pickle.load(input_file)

  # get all the label data
  if args.category in ('income-type', 'expenditure-type'):
    chosen_classes = type_classes
  else:
    chosen_classes = source_classes
  # converting them to strings if they are not strings already
  labels_orig = [str(i) for i in chosen_classes]
  data_orig = token_container

  if args.classifier == 'decision-tree':
    clf = tree.DecisionTreeClassifier(max_depth=100, criterion='entropy')
  elif args.classifier == 'random-forest':
    clf = ensemble.RandomForestClassifier()
  elif args.classifier == 'logistic-regression':
    clf = linear_model.LogisticRegression()
  else: # ensemble
    clf1 = tree.DecisionTreeClassifier(max_depth=100, criterion='entropy')
    clf2 = ensemble.RandomForestClassifier()
    clf3 = linear_model.LogisticRegression()
    clf = ensemble_classifier.EnsembleClassifier(clfs=[clf1, clf2, clf3], voting='hard')

  ppl = pipeline.Pipeline([
    ('vectorizer', feature_extraction.DictVectorizer(sparse=True)),
    ('clf', clf),
  ])

  k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)

  # -1 marks samples that have not been predicted yet.
  labels_predicted = [-1] * len(labels_orig)

  accuracy = []

  for train_idx, dev_idx in k_fold:
    data_train = [data_orig[i] for i in train_idx]
    data_dev = [data_orig[i] for i in dev_idx]
    labels_train = [labels_orig[i] for i in train_idx]
    labels_dev = [labels_orig[i] for i in dev_idx]

    ppl.fit(data_train, labels_train)
    predicted_dev = ppl.predict(data_dev)
    labels_predicted = set_all_predicted(predicted_dev, labels_predicted, dev_idx)

    accuracy.append(metrics.accuracy_score(labels_dev, predicted_dev))

  print("Accuracy of the %s classifier: %.4f +- %.4f" % (args.classifier, numpy.mean(accuracy), numpy.std(accuracy)))

  # Save the predicted classes as [original labels, predicted labels].
  # Storing integer-coded labels together with a decoding dictionary
  # (the --orig-labels switch) is not implemented here.
  to_dump = [labels_orig, labels_predicted]
  helpers.ensure_dir(os.path.dirname(args.output_file))
  with open(args.output_file, "wb") as output_file:
    pickle.dump(to_dump, output_file)
  



  

  
Esempio n. 5
0

label = ['Logistic Regression', 'Random Forest', 'Decision Tree']

print(label)
df = pd.DataFrame(columns=('w1', 'w2', 'w3', 'mean', 'std'))

# Try every combination of soft-voting weights in {0, 1, 2} for the three
# classifiers and record the 5-fold cross-validation accuracy of each.
i = 0
for w1 in range(0, 3):
    for w2 in range(0, 3):
        for w3 in range(0, 3):

            if len(set((w1, w2, w3))) == 1:  # skip if all weights are equal
                continue

            eclf = ensemble_classifier.EnsembleClassifier(
                clfs=[clf1, clf2, clf3],
                voting='soft',
                weights=[w1, w2, w3])
            scores = cross_validation.cross_val_score(estimator=eclf,
                                                      X=X,
                                                      y=y,
                                                      cv=5,
                                                      scoring='accuracy',
                                                      n_jobs=1)

            df.loc[i] = [w1, w2, w3, scores.mean(), scores.std()]
            print("Accuracy: %0.5f (+/- %0.5f) w1=%d w2=%d w3=%d" %
                  (scores.mean(), scores.std(), w1, w2, w3))
            i += 1

# Sorting returns a new frame, so the result has to be kept; best mean
# accuracy comes first.
df = df.sort_values(by=['mean', 'std'], ascending=False)
# printing out the results:
# w1, w2, w3, mean (averaged over k-folds), std
print(df)