Code Example #1
File: decisionTree.py Project: RKruizinga/easyML
class DecisionTree:
    X_train = []
    Y_train = []
    X_development = []
    Y_development = []
    X_test = []

    Y_predicted = []

    labels = []

    features = []

    def __init__(self, data, predict_method, show_fitting):

        self.X_train = data.X_train
        self.Y_train = data.Y_train

        self.X_development = data.X_development
        self.Y_development = data.Y_development

        self.X_test = data.X_test

        self.labels = data.labels

        self.predict_method = predict_method
        self.show_fitting = show_fitting

    def classify(self, features, classifier=None):
        feature_union = ('feats', FeatureUnion(features))

        if classifier is None:
            classifier = DecisionTreeClassifier(min_samples_leaf=2,
                                                max_depth=50)

        self.classifier = Pipeline([feature_union, ('classifier', classifier)])

        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()

    def evaluate(self):
        if self.X_development:
            self.Y_development_predicted = self.classifier.predict(
                self.X_development)
        if self.X_test:
            self.Y_test_predicted = self.classifier.predict(self.X_test)

        self.accuracy, self.precision, self.recall, self.f1score = classificationMetrics(
            self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted, self.labels)
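
A minimal, self-contained sketch of the Pipeline/FeatureUnion pattern that classify() builds, using only scikit-learn; the toy data and the two feature extractors below are illustrative assumptions, not part of easyML:

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

# Toy data standing in for data.X_train / data.Y_train.
X_train = ["good movie", "bad movie", "great film", "awful film"]
Y_train = ["pos", "neg", "pos", "neg"]

# Named transformers combined by FeatureUnion, followed by the default classifier.
features = [("counts", CountVectorizer()), ("tfidf", TfidfVectorizer())]
classifier = Pipeline([
    ("feats", FeatureUnion(features)),
    ("classifier", DecisionTreeClassifier(min_samples_leaf=2, max_depth=50)),
])
classifier.fit(X_train, Y_train)
print(classifier.predict(["great movie"]))
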
Code Example #2
class NaiveBayes:
    X_train = []
    Y_train = []
    X_development = []
    Y_development = []
    X_test = []

    Y_predicted = []

    labels = []

    features = []

    def __init__(self, data, show_fitting):

        self.X_train = data.X_train
        self.Y_train = data.Y_train

        self.X_development = data.X_development
        self.Y_development = data.Y_development

        self.X_test = data.X_test

        self.labels = data.labels

        self.show_fitting = show_fitting

    def classify(self, features, classifier=None):
        feature_union = ('feats', FeatureUnion(features))

        if classifier is None:
            classifier = MultinomialNB()

        self.classifier = Pipeline([feature_union, ('classifier', classifier)])

        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()

    def evaluate(self):
        if self.X_development:
            self.Y_development_predicted = self.classifier.predict(
                self.X_development)
        if self.X_test:
            self.Y_test_predicted = self.classifier.predict(self.X_test)

        self.accuracy, self.precision, self.recall, self.f1score = metrics(
            self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted, self.labels)
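
The NaiveBayes wrapper follows the same structure as Example #1; only the default estimator changes. Dropped into the sketch above it would read as follows (an illustrative assumption, not easyML code):

from sklearn.naive_bayes import MultinomialNB

# Replaces the DecisionTreeClassifier default in the Example #1 sketch.
classifier = MultinomialNB()
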
Code Example #3
class Baseline:

  X_train = []
  Y_train = []
  X_development = []
  Y_development = []
  X_test = []

  labels = []

  features = []

  def __init__(self, data, predict_method, show_fitting):
    self.X_train = data.X_train
    self.Y_train = data.Y_train

    self.X_development = data.X_development
    self.Y_development = data.Y_development

    self.X_test = data.X_test

    self.labels = data.labels

    self.predict_method = predict_method
    self.show_fitting = show_fitting

    self.classifier = Classifier()  # Classifier is provided elsewhere in the project; it is not shown in this excerpt

  def classify(self, features, classifier=None):
    self.printer = Printer('Model Fitting', self.show_fitting)
    self.classifier.fit(self.X_train, self.Y_train)  
    self.printer.duration()

  def evaluate(self):
    if self.X_development:
      self.Y_development_predicted = self.classifier.predict(self.X_development)
    if self.X_test:
      self.Y_test_predicted = self.classifier.predict(self.X_test)

    self.accuracy, self.precision, self.recall, self.f1score = classificationMetrics(self.Y_development, self.Y_development_predicted, self.labels)

  def printBasicEvaluation(self):    
    self.printer.evaluation(self.accuracy, self.precision, self.recall, self.f1score, "Basic Evaluation")

  def printClassEvaluation(self):
    self.printer.classEvaluation(self.Y_development, self.Y_development_predicted, self.labels)
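
Because Classifier() is defined outside this excerpt, here is a hedged stand-in using scikit-learn's DummyClassifier; whether it matches the project's baseline behaviour is an assumption, and the toy data is illustrative:

from sklearn.dummy import DummyClassifier

# Illustrative stand-in for the project's Classifier(): always predicts the most frequent label.
X_train = [[0], [1], [2]]
Y_train = ["x", "x", "y"]
baseline = DummyClassifier(strategy="most_frequent")
baseline.fit(X_train, Y_train)
print(baseline.predict([[5]]))  # -> ['x']
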
Code Example #4
class NeuralNetwork:
    word_embeddings_file = 'data/word_embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt'
    word_embeddings_dim = 200
    word_embeddings_layer = None
    word_embeddings_index = {}

    labels = []
    labels_dict = {}
    labels_dict_rev = {}

    Y = []

    def __init__(self, data, show_fitting):
        self.data = data

        self.X = self.data.X
        self.labels = self.data.labels

        for i, label in enumerate(self.labels):
            self.labels_dict[label] = i
            self.labels_dict_rev[i] = label

        self.Y = []
        for label in self.data.Y:
            self.Y.append(self.labels_dict[label])

        self.show_fitting = show_fitting

    def tokenize(self):
        self.X_tokenized = TextTokenizer.tokenizeTweets(self.X)  #all tweets!
        self.tokenizer = Tokenizer(split="|", )
        self.tokenizer.fit_on_texts(self.X_tokenized)
        self.sequences = self.tokenizer.texts_to_sequences(self.X_tokenized)
        self.X = pad_sequences(self.sequences)
        self.Y = to_categorical(self.Y)

    def classify(self, features, classifier=None):
        self.tokenize()

        train_development_split = self.data.amount_train
        development_test_split = self.data.amount_train + self.data.amount_development

        self.X_train = self.X[:train_development_split]
        self.Y_train = self.Y[:train_development_split]

        self.X_development = self.X[
            train_development_split:development_test_split]
        self.Y_development = self.Y[
            train_development_split:development_test_split]

        self.X_test = self.X[development_test_split:]

        if self.data.avoid_skewness:
            Y_train = np.argmax(self.Y_train, axis=1)
            Y_train = [self.labels_dict_rev[int(i)] for i in list(Y_train)]

            self.X_train, self.Y_train = unskewedTrain(self.X_train,
                                                       self.Y_train, Y_train)
            self.X_train = np.array(self.X_train)
            self.Y_train = np.array(self.Y_train)

        self.word_embeddings_layer, self.word_embeddings_index = readWordEmbeddings(
            self.data.languages, self.data.response_variable)
        if self.word_embeddings_layer is None:
            self.createWordEmbeddings()

        self.printDataInformation()

        ##CHANGE OPTIONS HERE
        self.model = Sequential()
        self.model.add(self.word_embeddings_layer)
        self.model.add(Dropout(0.2))
        self.model.add(LSTM(self.word_embeddings_dim))
        self.model.add(Dense(self.Y.shape[1], activation='sigmoid'))

        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])

        # Train the model
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.model.fit(self.X_train,
                       self.Y_train,
                       epochs=5,
                       batch_size=128,
                       validation_split=0.2)
        self.printer.duration()

    def evaluate(self):
        self.Y_development_predicted = self.model.predict(self.X_development)

        self.Y_development_predicted = np.argmax(self.Y_development_predicted,
                                                 axis=1)
        self.Y_development_predicted = [
            self.labels_dict_rev[int(i)]
            for i in list(self.Y_development_predicted)
        ]

        self.Y_development = np.argmax(self.Y_development, axis=1)
        self.Y_development = [
            self.labels_dict_rev[int(i)] for i in list(self.Y_development)
        ]

        self.accuracy, self.precision, self.recall, self.f1score = metrics(
            self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted, self.labels)

    def printDataInformation(self):

        print('\n~~~Neural Network Distribution~~~\n')
        print('Found {} unique tokens.'.format(len(self.tokenizer.word_index)))
        print('Shape of data tensor: {}'.format(self.X.shape))
        print('Shape of label tensor: {}\n'.format(self.Y.shape))

        if len(self.word_embeddings_index) > 0:
            print('Found {} word vectors.'.format(
                len(self.word_embeddings_index)))

    def createWordEmbeddings(self):
        self.word_embeddings_index = {}
        # Parse the GloVe text file: one word followed by its vector on each line.
        with open(self.word_embeddings_file, encoding="utf8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.word_embeddings_index[word] = coefs

        self.word_embeddings_matrix = np.zeros(
            (len(self.tokenizer.word_index) + 1, self.word_embeddings_dim))
        for word, i in self.tokenizer.word_index.items():
            embedding_vector = self.word_embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.word_embeddings_matrix[i] = embedding_vector

        self.word_embeddings_layer = Embedding(
            len(self.tokenizer.word_index) + 1,
            self.word_embeddings_dim,
            mask_zero=True,
            weights=[self.word_embeddings_matrix],
            trainable=True)

        writeWordEmbeddings(self.word_embeddings_layer,
                            self.word_embeddings_index, self.data.languages,
                            self.data.response_variable)
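
A minimal sketch of the tokenize, pad, Embedding, LSTM pipeline this class builds, using tensorflow.keras with randomly initialised embeddings instead of the GloVe file; the toy texts and shapes are illustrative assumptions:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, LSTM, Dense

texts = ["good film", "bad film", "great movie", "awful movie"]
labels = [1, 0, 1, 0]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
X = pad_sequences(tokenizer.texts_to_sequences(texts))
Y = to_categorical(labels)

model = Sequential([
    # Random embeddings stand in for the GloVe weights loaded by readWordEmbeddings/createWordEmbeddings.
    Embedding(len(tokenizer.word_index) + 1, 200, mask_zero=True),
    Dropout(0.2),
    LSTM(200),
    # softmax is the usual pairing with categorical_crossentropy (the original uses sigmoid).
    Dense(Y.shape[1], activation="softmax"),
])
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X, Y, epochs=1, batch_size=2, verbose=0)
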
Code Example #5
class SVM:
  X_train = []
  Y_train = []
  X_development = []
  Y_development = []
  X_test = []

  Y_predicted = []

  labels = []

  features = []

  def __init__(self, data, show_fitting):

    self.X_train = data.X_train
    self.Y_train = data.Y_train

    self.X_development = data.X_development
    self.Y_development = data.Y_development

    self.X_test = data.X_test

    self.labels = data.labels

    self.show_fitting = show_fitting

  def classify(self, features, classifier=None):
  
    feature_union = ('feats', FeatureUnion(
      features
    ))

    if classifier is None:
      classifier = SGDClassifier(loss='hinge', random_state=42, max_iter=50, tol=None)
      
    self.classifier = Pipeline([
      feature_union,
      ('classifier', classifier)
    ])

    self.printer = Printer('Model Fitting', self.show_fitting)

    #self.X_train, X_none, self.Y_train, Y_none = train_test_split(self.X_train, self.Y_train, test_size=0.2, random_state=42)
    #self.printer.labelDistribution(self.Y_train, '80%')

    self.classifier.fit(self.X_train, self.Y_train)  
    self.printer.duration()

  def evaluate(self):
    if self.X_development:
      self.Y_development_predicted = self.classifier.predict(self.X_development)
    if self.X_test:
      self.Y_test_predicted = self.classifier.predict(self.X_test)

    self.accuracy, self.precision, self.recall, self.f1score = metrics(self.Y_development, self.Y_development_predicted, self.labels)

  def printBasicEvaluation(self):    
    self.printer.evaluation(self.accuracy, self.precision, self.recall, self.f1score, "Basic Evaluation")

  def printClassEvaluation(self):
    self.printer.classEvaluation(self.Y_development, self.Y_development_predicted, self.labels)
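
The default estimator here is a linear SVM trained with stochastic gradient descent. Dropped into the Example #1 sketch it would read as follows (an illustrative assumption, not easyML code):

from sklearn.linear_model import SGDClassifier

# Hinge loss makes SGDClassifier a linear SVM; tol=None means it always runs the full max_iter epochs.
classifier = SGDClassifier(loss="hinge", random_state=42, max_iter=50, tol=None)
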
Code Example #6
class SVM:
    X_train = []
    Y_train = []
    X_development = []
    Y_development = []
    X_test = []

    Y_predicted = []

    labels = []

    features = []

    def __init__(self, data, predict_method, show_fitting):

        self.X_train = data.X_train
        self.Y_train = data.Y_train

        self.X_development = data.X_development
        self.Y_development = data.Y_development
        self.X_test = data.X_test

        self.labels = data.labels

        self.predict_method = predict_method
        self.show_fitting = show_fitting

    def classify(self, features, classifier=None):

        feature_union = ('feats', FeatureUnion(features))

        if classifier is None:
            classifier = SGDClassifier(loss='hinge',
                                       random_state=42,
                                       max_iter=50,
                                       tol=None)

        self.classifier = Pipeline([feature_union, ('classifier', classifier)])
        print(self.classifier)

        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()

    def evaluate(self):
        if self.X_development:
            self.Y_development_predicted = self.classifier.predict(
                self.X_development)
            print(self.X_development)
            #print(self.classifier.predict_proba(self.X_development))
            #print(self.Y_development[:20], self.Y_development_predicted[:20])
        if self.X_test:
            self.Y_test_predicted = self.classifier.predict(self.X_test)

        if self.predict_method == 'classification':
            self.accuracy, self.precision, self.recall, self.f1score = classificationMetrics(
                self.Y_development, self.Y_development_predicted, self.labels)

        elif self.predict_method == 'regression':
            # self.Y_development_predicted = self.classifier.score(self.X_development, self.Y_development)
            # print(self.Y_development_predicted)
            self.mean_abs_err, self.mean_squ_err, self.r2score, self.kl_divergence = regressionMetrics(
                self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        if self.predict_method == 'classification':
            self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                    self.f1score, "Classification Evaluation")
        elif self.predict_method == 'regression':
            self.printer.regressionEvaluation(self.mean_abs_err,
                                              self.mean_squ_err, self.r2score,
                                              self.kl_divergence,
                                              "Regression Evaluation")

    def printClassEvaluation(self):
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted, self.labels)
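
regressionMetrics is a project helper that is not shown here; below is a hedged sketch of the four quantities it appears to report, computed with scikit-learn and scipy (the KL-divergence treatment in particular is an assumption, and the numbers are toy data):

import numpy as np
from scipy.stats import entropy
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_true = np.array([3.0, 2.5, 4.0, 5.0])
y_pred = np.array([2.8, 2.7, 3.9, 4.6])

mean_abs_err = mean_absolute_error(y_true, y_pred)
mean_squ_err = mean_squared_error(y_true, y_pred)
r2score = r2_score(y_true, y_pred)
# One plausible reading of kl_divergence: compare the two value distributions
# after normalising each to sum to 1.
kl_divergence = entropy(y_true / y_true.sum(), y_pred / y_pred.sum())
print(mean_abs_err, mean_squ_err, r2score, kl_divergence)
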
Code Example #7
class NeuralNetwork:
    word_embeddings_file = 'data/word_embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt'
    word_embeddings_dim = 200
    word_embeddings_layer = None
    word_embeddings_index = {}

    labels = []
    labels_dict = {}
    labels_dict_rev = {}

    Y = []

    def __init__(self, data, predict_method, show_fitting):
        self.data = data

        self.X = self.data.X
        self.labels = self.data.labels

        for i, label in enumerate(self.labels):
            self.labels_dict[label] = i
            self.labels_dict_rev[i] = label

        self.Y = []
        for label in self.data.Y:
            self.Y.append(self.labels_dict[label])

        self.predict_method = predict_method
        self.show_fitting = show_fitting

    def tokenize(self):
        xy_section = []
        xy_area = []
        xy_element = []
        xy = []

        for x in self.X:
            xy_section.append(x['xy_section'])
            #xy_area.append(text_to_word_sequence('|'.join(x['xy_area']), split='|'))
            xy_area.append(x['xy_area'])
            xy_element.append(x['xy_element'])
            xy.append(x['xy'])

        #self.X_tokenized = TextTokenizer.tokenizeTweets(self.X) #all tweets!
        #print(xy_area)

        # self.X_tokenized = xy_area
        # vectorizer = TfidfVectorizer(tokenizer=TextTokenizer.tokenized, lowercase=False, analyzer='word', ngram_range=(1, 1), min_df=1)
        # self.X = vectorizer.fit_transform(self.X_tokenized)
        # self.input_length = len(vectorizer.get_feature_names())
        #print(self.X)
        self.X = sequence.pad_sequences(xy)
        self.feature_length = len(self.X[0])
        self.feature_dimensions = len(self.X[0][0])
        print(self.feature_dimensions)
        self.input_length = len(self.X)
        self.Y = to_categorical(self.Y)

    def classify(self, features, classifier=None):
        self.tokenize()

        train_development_split = self.data.amount_train
        development_test_split = self.data.amount_train + self.data.amount_development

        self.X_train = self.X[:train_development_split]
        self.Y_train = self.Y[:train_development_split]

        self.X_development = self.X[
            train_development_split:development_test_split]
        self.Y_development = self.Y[
            train_development_split:development_test_split]

        self.X_test = self.X[development_test_split:]

        if self.data.avoid_skewness:
            Y_train = np.argmax(self.Y_train, axis=1)
            Y_train = [self.labels_dict_rev[int(i)] for i in list(Y_train)]

            self.X_train, self.Y_train = unskewedTrain(self.X_train,
                                                       self.Y_train, Y_train)
            self.X_train = np.array(self.X_train)
            self.Y_train = np.array(self.Y_train)

        ##CHANGE OPTIONS HERE
        self.model = Sequential()
        self.model.add(
            Dense(512,
                  input_shape=(self.feature_length, self.feature_dimensions)))
        self.model.add(Flatten())
        self.model.add(Dense(6))
        self.model.add(Activation('relu'))
        # self.model.add(Dropout(0.2))
        # # self.model.add(Dense(128))
        # self.model.add(Activation('relu'))
        # self.model.add(Dropout(0.1))
        # self.model.add(Dense(6, input_dim=self.feature_length,)))
        # self.model.add(Activation('sigmoid'))

        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])

        # Train the model
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.model.fit(self.X_train,
                       self.Y_train,
                       epochs=50,
                       batch_size=128,
                       validation_split=0.2)
        self.printer.duration()

    def evaluate(self):
        self.Y_development_predicted = self.model.predict(self.X_development)

        self.Y_development_predicted = np.argmax(self.Y_development_predicted,
                                                 axis=1)
        self.Y_development_predicted = [
            self.labels_dict_rev[int(i)]
            for i in list(self.Y_development_predicted)
        ]

        self.Y_development = np.argmax(self.Y_development, axis=1)
        self.Y_development = [
            self.labels_dict_rev[int(i)] for i in list(self.Y_development)
        ]

        self.accuracy, self.precision, self.recall, self.f1score = classificationMetrics(
            self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted, self.labels)

    def printDataInformation(self):

        print('\n~~~Neural Network Distribution~~~\n')
        print('Found {} unique tokens.'.format(len(self.tokenizer.word_index)))
        print('Shape of data tensor: {}'.format(self.X.shape))
        print('Shape of label tensor: {}\n'.format(self.Y.shape))

        if len(self.word_embeddings_index) > 0:
            print('Found {} word vectors.'.format(
                len(self.word_embeddings_index)))

    def createWordEmbeddings(self):
        self.word_embeddings_index = {}
        # Parse the GloVe text file: one word followed by its vector on each line.
        with open(self.word_embeddings_file, encoding="utf8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.word_embeddings_index[word] = coefs

        self.word_embeddings_matrix = np.zeros(
            (len(self.tokenizer.word_index) + 1, self.word_embeddings_dim))
        for word, i in self.tokenizer.word_index.items():
            embedding_vector = self.word_embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.word_embeddings_matrix[i] = embedding_vector

        self.word_embeddings_layer = Embedding(
            len(self.tokenizer.word_index) + 1,
            self.word_embeddings_dim,
            mask_zero=True,
            weights=[self.word_embeddings_matrix],
            trainable=True)

        writeWordEmbeddings(self.word_embeddings_layer,
                            self.word_embeddings_index, self.data.languages,
                            self.data.response_variable)
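
A minimal sketch of the dense head built in classify(), fed with random 3-D input of shape (samples, feature_length, feature_dimensions); the shapes and random data are illustrative assumptions:

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Flatten
from tensorflow.keras.utils import to_categorical

# Stand-ins for the padded xy sequences: 8 samples, 10 timesteps, 4 features per step.
X = np.random.rand(8, 10, 4).astype("float32")
Y = to_categorical(np.random.randint(0, 6, size=8), num_classes=6)

model = Sequential([
    # Dense applied to the last axis of the 3-D input, mirroring classify().
    Dense(512, input_shape=(10, 4)),
    Flatten(),
    Dense(6),
    # softmax is the usual output activation with categorical_crossentropy (the original uses relu).
    Activation("softmax"),
])
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X, Y, epochs=1, batch_size=4, verbose=0)
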