Code example #1
    def classify(self):
        # Classifies unknown forum posts
        if not self.fit:
            print("Fitting must be performed before classifying")
            return

        vectorizer = Vectorizer(self.dictionary.dictionary)
        input_file = input(
            "Enter the name of the .txt file containing the unknown posts (including file-ending: "
        )
        try:
            with open(input_file, "r") as file:
                vectors = vectorizer.vectorize(
                    self.preprocessor.preprocess(file))
        except FileNotFoundError:
            if input("File not found. Press enter to try again or type 'm' and press enter to return to menu.").lower()\
                    == "m":
                return
            self.classify()
            return

        with open("result.txt", "w") as result_file:
            for line in self.classifier.classify(vectors):
                result_file.write(label_list[line] + "\n")
        print(
            "Result saved in result.txt. " +
            "The predicted label of each post is printed on the corresponding line of the document."
        )
Code example #2
    def preprocess_and_fit(self):
        # Preprocesses the data, indexes all words, vectorizes the posts, and finally trains and evaluates the classifier
        processed = []
        processed_test = []
        for category in self.categories:
            processed.append(
                self.preprocessor.preprocess('training' + str(category) +
                                             ".txt"))
            processed_test.append(
                self.preprocessor.preprocess('testing' + str(category) +
                                             ".txt"))

        # Word indexing
        for category in processed:  # indexes all words into dictionary
            self.dictionary.index_words(category)
        print("Words indexed. Dictionary size: ",
              len(self.dictionary.dictionary), " words")

        # Vectorization
        vectorizer = Vectorizer(
            self.dictionary.dictionary
        )  # initializes vectorizer-object with dictionary
        vector_start = time.time()
        print("Vectorizing...")
        training_vectors = []
        testing_vectors = []
        for category in processed:
            training_vectors.append(vectorizer.vectorize(category))
        for category in processed_test:
            testing_vectors.append(vectorizer.vectorize(category))
        vector_time = time.time() - vector_start
        print("Vectorization completed in ", ("%.2f" % vector_time), "seconds")

        # Training and evaluation
        self.classifier.train(training_vectors)
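        # Mark the model as fitted so that classify() can be used afterwards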
        self.fit = True
        self.classifier.evaluate(testing_vectors)
Code example #3
	for article, score in relevance_sorted_articles:
		print "Id: ", article.id_num
		print "Link: ", article.link
		print "Description: ", article.description
		print article.title, ":", score
		print '\n\n\n'
		sorted_article_list.append(article)
	return sorted_article_list



#==================FINDING TRENDING ARTICLES=================
trending_articles = findTrending(PICKELED_RECENT_ARTICLES_ALL_TOPICS)

vectorizer = Vectorizer()
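# Build the term-document matrix for the trending articles (one row per article)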
vectorized_trending_articles = vectorizer.vectorize(trending_articles)

setArticleVectors(trending_articles, vectorized_trending_articles)

# for article in trending_articles:
# 	print article.description, article.vector

dimensions = str(len(vectorized_trending_articles)) + " x " + str(len(vectorized_trending_articles[0]))
print "Term document matrix with" +  dimensions + ": \n", vectorized_trending_articles

#==================KMEANS STARTS HERE=======================
# print "Calculating kmeans..."

# kmeans_calculator = KMeansClusterer()

# predicted_clustering = kmeans_calculator.calculateKMeans(vectorized_trending_articles, 30)
Code example #4
        x, y, c = [], [], []

        for token in tokens:
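            # Only tokens found in the valence lexicon are plotted, colored by their mean valence score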
            if token in dictioanry.valences:
                x.append(np.random.rand())
                y.append(np.random.rand())
                c.append(rgb(np.mean(dictioanry.valences[token])))

        plt.scatter(x, y, c=c, alpha=0.8)

        plt.show()


if __name__ == "__main__":

    preprocessor = Preprocessor(['train.tsv', 'test.tsv'], Cruncher())

    dictionary = Dictioanry('..\\..\\lexica')

    vectorizer = Vectorizer()

    labels, vectors = vectorizer.vectorize(preprocessor, dictionary)

    visualizer = Visualizer(preprocessor)

    for method in Visualizer.supported_methods:

        visualizer.visualize(method=method,
                             dictionary=dictionary,
                             model=vectorizer.underlying)
Code example #5
def evaluation(filenames,
               dictionary_root='../../lexica',
               cruncher_type='lemmatizer',
               vectorizer_type='word2vec',
               metrics=['f1-score', 'accuracy-score']):

    if not isinstance(filenames, list):
        raise ValueError("'" + filenames + "' is not an instance of 'list'")

    beg = time.time()

    vectorizer = Vectorizer(vectorizer_type)

    try:

        labels, vectors = vectorizer.vectorize(filenames, dictionary_root)

    except Exception:
        # Fall back to explicit preprocessing when vectorizing straight from the filenames fails

        preprocessor = Preprocessor(filenames, Cruncher(cruncher_type))

        dictionary = Dictioanry(dictionary_root) if dictionary_root else None

        labels, vectors = vectorizer.vectorize(preprocessor, dictionary)

    # Posts labeled 'unknown' form the test set; every labeled post is used for training
    test_ids, test_labels, test_vectors = [], [], []
    train_ids, train_labels, train_vectors = [], [], []

    for id, label in labels.items():

        if label == 'unknown':
            test_ids.append(id)
            test_labels.append(label)
            test_vectors.append(vectors[id])

        else:
            train_ids.append(id)
            train_labels.append(label)
            train_vectors.append(vectors[id])

    evaluator = Evaluator()

    for classifing in ['knn', 'rrb', 'svm']:
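        # 'rrb' (round robin) exposes its own classify() entry point; 'knn' and 'svm' use the generic Classifier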

        if classifing != 'rrb':
            classifier = Classifier(train_vectors, train_labels, classifing)

            predictions = classifier.predict(test_vectors)

        else:
            classifier = RoundRobin(train_labels, train_vectors, test_vectors)

            predictions = classifier.classify()

        for metric in metrics:

            value = evaluator.evaluate(dict(zip(test_ids, predictions)),
                                       metric)

            print('<LOG>: The performance of', "'" + classifing + "'",
                  'according to the',
                  ("'" + metric + "'").ljust(max(map(len, metrics)) + 2),
                  "metric is", '{0:.6f}'.format(value))

    end = time.time()

    print('\n\nElapsed time:',
          '{0:.6f}'.format(end - beg),
          'seconds',
          file=sys.stderr)
Code example #6
class MetaModel:
    """
    We wrap the keras model in our own metaclass that handles text loading,
    and provides convenient train and sample functions.
    """
    def __init__(self):
        self.train_model = None
        self.sample_model = None
        self.seeds = None
        self.vectorizer = None

    # Read in our data and validation texts
    def _load_data(self, data_dir, word_tokens, pristine_input,
                   pristine_output, batch_size, seq_length, seq_step):
        try:
            with open(os.path.join(data_dir, 'input.txt')) as input_file:
                text = input_file.read()
        except FileNotFoundError:
            print_red("No input.txt in data_dir")
            sys.exit(1)

        skip_validate = True
        try:
            with open(os.path.join(data_dir, 'validate.txt')) as validate_file:
                text_val = validate_file.read()
                skip_validate = False
        except FileNotFoundError:
            pass  # Validation text optional

        # Find some good default seed string in our source text.
        self.seeds = find_random_seeds(text)
        # Include our validation texts with our vectorizer
        all_text = text if skip_validate else '\n'.join([text, text_val])
        self.vectorizer = Vectorizer(all_text, word_tokens, pristine_input,
                                     pristine_output)

        data = self.vectorizer.vectorize(text)
        x, y = shape_for_stateful_rnn(data, batch_size, seq_length, seq_step)
        print('x.shape:', x.shape)
        print('y.shape:', y.shape)

        if skip_validate:
            return x, y, None, None

        data_val = self.vectorizer.vectorize(text_val)
        x_val, y_val = shape_for_stateful_rnn(data_val, batch_size, seq_length,
                                              seq_step)
        print('x_val.shape:', x_val.shape)
        print('y_val.shape:', y_val.shape)
        return x, y, x_val, y_val

    # Builds the underlying keras model
    def _build_models(self, batch_size, embedding_size, rnn_size, num_layers):
        model = Sequential()
        model.add(
            Embedding(self.vectorizer.vocab_size,
                      embedding_size,
                      batch_input_shape=(batch_size, None)))
        for layer in range(num_layers):
            model.add(LSTM(rnn_size, stateful=True, return_sequences=True))
            model.add(Dropout(0.2))
        model.add(
            TimeDistributed(
                Dense(self.vectorizer.vocab_size, activation='softmax')))
        # With sparse_categorical_crossentropy we can leave the labels as
        # integers instead of one-hot vectors
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['accuracy'])
        model.summary()

        # Keep a separate model with batch_size 1 for sampling
        self.train_model = model
        config = model.get_config()
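        # Give the input layer a batch size of 1 so the sampling model can be fed one token at a time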
        config['layers'][0]['config']['batch_input_shape'] = (1, None)
        self.sample_model = Sequential.from_config(config)
        self.sample_model.trainable = False

    def update_sample_model_weights(self):
        """Sync training and sampling model weights"""
        self.sample_model.set_weights(self.train_model.get_weights())

    def train(self, data_dir, word_tokens, pristine_input, pristine_output,
              batch_size, seq_length, seq_step, embedding_size, rnn_size,
              num_layers, num_epochs, live_sample):
        """Train the model"""
        print_green('Loading data...')
        load_start = time.time()
        x, y, x_val, y_val = self._load_data(data_dir, word_tokens,
                                             pristine_input, pristine_output,
                                             batch_size, seq_length, seq_step)
        load_end = time.time()
        print_red('Data load time', load_end - load_start)

        print_green('Building model...')
        model_start = time.time()
        self._build_models(batch_size, embedding_size, rnn_size, num_layers)
        model_end = time.time()
        print_red('Model build time', model_end - model_start)

        print_green('Training...')
        train_start = time.time()
        validation_data = (x_val, y_val) if (x_val is not None) else None
        callbacks = [LiveSamplerCallback(self)] if live_sample else None
        self.train_model.fit(x,
                             y,
                             validation_data=validation_data,
                             batch_size=batch_size,
                             shuffle=False,
                             epochs=num_epochs,
                             verbose=1,
                             callbacks=callbacks)
        self.update_sample_model_weights()
        train_end = time.time()
        print_red('Training time', train_end - train_start)

    def sample(self, seed=None, length=None, diversity=1.0):
        """Sample the model"""
        self.sample_model.reset_states()

        if length is None:
            length = 100 if self.vectorizer.word_tokens else 500

        if seed is None:
            seed = random.choice(self.seeds)
            print('Using seed: ', end='')
            print_cyan(seed)
            print('-' * 50)

        preds = None
        seed_vector = self.vectorizer.vectorize(seed)
        # Feed in seed string
        print_cyan(seed, end=' ' if self.vectorizer.word_tokens else '')
        for char_index in np.nditer(seed_vector):
            preds = self.sample_model.predict(np.array([[char_index]]),
                                              verbose=0)

        sampled_indices = np.array([], dtype=np.int32)
        # Sample the model one token at a time
        for i in range(length):
            char_index = 0
            if preds is not None:
                char_index = sample_preds(preds[0][0], diversity)
            sampled_indices = np.append(sampled_indices, char_index)
            preds = self.sample_model.predict(np.array([[char_index]]),
                                              verbose=0)
        sample = self.vectorizer.unvectorize(sampled_indices)
        print(sample)
        return sample

    # Don't pickle the Keras models; they are better saved directly
    def __getstate__(self):
        state = self.__dict__.copy()
        del state['train_model']
        del state['sample_model']
        return state
Code example #7
def main(args):
    print('Settings:')
    print(str(args)[10:-1])

    length = 136
    print('Loading data...')

    if args.no_recalc:
        # Makes use of both the raw and the preprocessed source code.
        train_x, _, dev_x, _, _, test_x, _, _ = load_data('../data_dir')
        train_x2, _, dev_x2, _, _, test_x2, _, _ = load_data('../data_dir',
                                                             preprocess=True)
        print("Extracting stylometric features...")
        # Runs the stylometry_vectorizer from the vectorizer.py file so characters
        # can be grabbed simultaneously.
        vec = Vectorizer('lexical')
        train_x = vec.vectorize(train_x, train_x2)  # Vectorize all 3 subsets
        dev_x = vec.vectorize(dev_x, dev_x2)
        test_x = vec.vectorize(test_x, test_x2)
        del train_x2, dev_x2, test_x2
        scaler = MinMaxScaler()  # Rescale values between 0 and 1.
        print("Rescaling...")
        train_x = scaler.fit_transform(train_x)
        dev_x = scaler.transform(dev_x)
        test_x = scaler.transform(test_x)
        length = len(train_x[0])
        print(length)
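        # Persist each feature matrix as a memory-mapped file so later runs can load it without recomputing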
        trainmm = np.memmap('vectors/train.mm',
                            dtype='float32',
                            mode='w+',
                            shape=(50000, length))
        trainmm[:] = train_x[:]
        devmm = np.memmap('vectors/dev.mm',
                          dtype='float32',
                          mode='w+',
                          shape=(25000, length))
        devmm[:] = dev_x[:]
        testmm = np.memmap('vectors/test.mm',
                           dtype='float32',
                           mode='w+',
                           shape=(25000, length))
        testmm[:] = test_x[:]
        del trainmm, devmm, testmm, train_x, dev_x, test_x  # Save and flush all vectors.
        print("Finished building vectors.")
    # Load data from file.
    train_y, dev_y, _ = load_all_labels('../data_dir')
    dev = np.array(
        np.memmap('vectors/dev.mm',
                  dtype='float32',
                  mode='r',
                  shape=(25000, length)))
    test = np.array(
        np.memmap('vectors/test.mm',
                  dtype='float32',
                  mode='r',
                  shape=(25000, length)))

    train = np.array(
        np.memmap('vectors/train.mm',
                  dtype='float32',
                  mode='r',
                  shape=(50000, length)))
    # Model.
    callback_list = [
        EarlyStopping(monitor='val_acc', patience=10),
        ModelCheckpoint(filepath='style_model.h5',
                        monitor='val_acc',
                        save_best_only=True),
        ReduceLROnPlateau(monitor='val_acc', factor=0.1, patience=5)
    ]
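    # Simple feed-forward classifier over the stylometric feature vectors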
    model = Sequential()
    model.add(Dense(500, activation='relu', input_shape=(length, )))  # Input size matches the feature-vector length
    model.add(Dropout(0.3))
    model.add(Dense(500, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1000, activation='softmax'))
    opt = RMSprop(learning_rate=0.001)
    model.compile(optimizer=opt,
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])
    model.summary()
    model.fit(train,
              train_y,
              epochs=1000,
              batch_size=250,
              validation_data=(dev, dev_y),
              shuffle=True,
              callbacks=callback_list)
    model = load_model('style_model.h5')
    print(model.evaluate(dev, dev_y))
    # Generate predictions.
    predict_vec = np.memmap('vectors/dev_style.mm',
                            dtype='float32',
                            mode='w+',
                            shape=(25000, 1000))
    predict_vec[:] = model.predict(dev)[:]
    del predict_vec

    predict_vec2 = np.memmap('vectors/test_style.mm',
                             dtype='float32',
                             mode='w+',
                             shape=(25000, 1000))
    predict_vec2[:] = model.predict(test)[:]
    del predict_vec2
Code example #8
X_train, X_test, y_train, y_test = train_test_split(df.values[:, 0],
                                                    df.values[:, 1].astype('int'),
                                                    test_size=0.2,
                                                    random_state=0)

# Task 2
v = Vectorizer(docs=X_train)

# Task 3-4
result = Parallel(n_jobs=4)(
    delayed(gs.grid_search)(X=X_train, y=y_train, f_num=f_num, v=v)
    for f_num in range(1, 6))
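# Each grid_search result is assumed to be a (score, C, penalty, f_num) tuple; keep the best-scoring one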
result = sorted(result, key=itemgetter(0), reverse=True)
best_c = result[0][1]
best_penalty = result[0][2]
best_f_num = result[0][3]

# Task 5
X_train_vct = v.vectorize(docs=X_train, f_num=best_f_num)
X_test_vct = v.vectorize(docs=X_test, f_num=best_f_num)

best_lr = LogisticRegression(C=best_c,
                             penalty=best_penalty,
                             solver='liblinear')
best_lr.fit(X_train_vct, y_train)

print('The accuracy is %.3f percent' %
      (accuracy_score(y_test, best_lr.predict(X_test_vct)) * 100))
print(best_lr)