Example #1
features.add('avg_position', StandardScaler(), 'avg_position')
features.add('avg_cost', StandardScaler(), 'avg_cost')
features.add('device', CategoricalEncoder(), 'device')
features.add('ad_placement', CategoricalEncoder(), 'ad_placement')
features.add('ad_type', CategoricalEncoder(), 'ad_type')
features.add('match_type', CategoricalEncoder(), 'match_type')
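# A minimal plain-scikit-learn sketch of the feature setup above, assuming the
# rows arrive as a pandas DataFrame: 'avg_position' and 'avg_cost' are scaled
# numeric columns, the rest are categorical. ClassifierFeatures and
# CategoricalEncoder are project-specific helpers, so the ColumnTransformer
# below is only an illustrative stand-in, not the project's implementation.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

sketch_features = ColumnTransformer([
  ('avg_position', StandardScaler(), ['avg_position']),
  ('avg_cost', StandardScaler(), ['avg_cost']),
  ('categorical', OneHotEncoder(handle_unknown='ignore'),
   ['device', 'ad_placement', 'ad_type', 'match_type']),
])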

#Step 10: Specify the classifier you want to use (optional)
new_classifier = LogisticRegression()
#new_classifier = SVR(kernel='linear')
#new_classifier = LinearRegression(fit_intercept=True, normalize=True)
#new_classifier = Ridge(alpha=.5)
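# Hedged note: of the options above, only LogisticRegression provides
# predict_proba (used further below); SVR, LinearRegression and Ridge are
# regressors without predict_proba, so they only make sense here if the
# probability step is skipped.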

if options.args.print_details >= 2:
  printer.labelDistribution(data.Y_train, 'Training Set')

#Step 11: Run our system.
if len(data.labels) > 1: #otherwise, there is nothing to train
  clf = run(options.args.k, options.args.method, data, features._list, printer, options.args.predict_method, new_classifier, options.args.print_details, options.args.show_fitting)

  # Pivot x_tester from a list of row dicts into a dict of column lists,
  # i.e. {feature_name: [value_of_row_1, value_of_row_2, ...]}
  new_x_tester = {}
  for row in x_tester:
    for var in row:
      if var not in new_x_tester:
        new_x_tester[var] = []
      new_x_tester[var].append(row[var])

  #print(new_x_tester)
  t = clf.classifier.predict_proba(new_x_tester)
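  # Hedged note: predict_proba yields one probability column per class, ordered
  # like clf.classifier.classes_ (assuming clf.classifier is a fitted
  # scikit-learn estimator).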
  results_y = []
Example #2
if data.file_test != '':
  data.test = data.load(data.file_test, format='pickle')

#Step 8.2: Specify the text preprocessing steps that have to be applied
textPreprocessing = ['replaceTwitterInstagram', 'replaceTwitterURL', 'replaceSpecialCharacters', 'maxCharacterSequence']
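# A hedged sketch of what preprocessing steps like the ones named above
# typically do; the project's real implementations live elsewhere, so the
# function name and regexes below are illustrative assumptions only.
import re

def sketch_preprocess_tweet(text):
  # replaceTwitterInstagram / replaceTwitterURL: mask handles and links
  text = re.sub(r'@\w+', '<user>', text)
  text = re.sub(r'https?://\S+', '<url>', text)
  # replaceSpecialCharacters: drop everything but word chars and basic punctuation
  text = re.sub(r'[^\w\s.,!?\'"-]', ' ', text)
  # maxCharacterSequence: cap runs of a repeated character ("sooooo" -> "soo")
  text = re.sub(r'(.)\1{2,}', r'\1\1', text)
  return text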

#Step 8.3: Transform the data to our desired format
data.transform(_type='YXrow', preprocessing=textPreprocessing) #> now we have X, Y as well as X_train, Y_train, X_development, Y_development and X_test

#Step 8.4: For training purposes, we can specify what our subset will look like (train_size, development_size, test_size)
#data.subset(500, 50, 50)

#Step 9: Specify the features to use; this part is only needed for sklearn.
features = ClassifierFeatures()
#features.add('wordCount', TextFeatures.wordCount())
features.add('word', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeTweet, lowercase=False, analyzer='word', stop_words=sw.words('english'), ngram_range=(1,20), min_df=1))  # optionally add max_features=100000
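# A self-contained sketch of a comparable TfidfVectorizer configuration, shown
# only to illustrate the setup above. TextTokenizer.tokenizeTweet is a
# project-specific tokenizer, so NLTK's TweetTokenizer stands in for it here
# (an assumption), and the stop-word list is omitted to keep the sketch
# self-contained.
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

sketch_vectorizer = TfidfVectorizer(tokenizer=TweetTokenizer().tokenize, lowercase=False,
                                    analyzer='word', ngram_range=(1,20), min_df=1)
sketch_matrix = sketch_vectorizer.fit_transform(['Just saw the new ad @brand http://t.co/xyz',
                                                 'that ad is sooooo good!!!'])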

#Step 10: Specify the classifier you want to use (optional)
#new_classifier = LinearSVC()
new_classifier = None

if options.args.print_details >= 2:
  printer.labelDistribution(data.Y_train, 'Training Set')

#Step 11: Run our system.
if len(data.labels) > 1: #otherwise, there is nothing to train
  run(options.args.k, options.args.method, data, features._list, printer, new_classifier, options.args.print_details, options.args.show_fitting)

  printer.duration()
else:
  print('The combination of the language <{}> and the variable <{}> only has one label. Thus, there is nothing to train. Try another combination!'.format(predict_languages, args.predict_label))