def run(self):
    """Run the whole pipeline once: train, predict on live data, save, report.

    The steps run in this order: start log, load training data, load
    validation data, fit the feature transformation, train the model, load
    live data, compute predictions, save predictions, stop log, model report.
    """
    # Announce start-up.
    console_log(
        Constant.Text.APPLICATION + 'Application.Start.',
        indent=0,
        frequency=Constant.Sound.LOG_FREQUENCY,
    )

    # ---- Training phase ----
    features, targets = self.load_training_data(self.file_name_training)
    # The validation pair is loaded but not used anywhere in this method.
    # NOTE(review): the loader is handed the *training* file name -- confirm
    # that this is intended.
    _validation_features, _validation_targets = self.load_validation_data(
        self.file_name_training
    )
    transform_step, features_transformed = self.select_features(features)
    fitted_model = self.train(features_transformed, targets)

    # ---- Prediction phase ----
    live_ids, live_features = self.load_live_data(self.file_name_live)
    live_predictions = self.compute_model(
        fitted_model, transform_step, live_features
    )
    self.save_live_data(self.file_name_predictions, live_ids, live_predictions)

    # Announce shut-down; the model report is produced after this message.
    console_log(
        Constant.Text.APPLICATION + 'Application.Stop.',
        indent=0,
        frequency=Constant.Sound.LOG_FREQUENCY,
    )
    self.report_model(
        fitted_model, transform_step, features_transformed, targets
    )
def getClusters(self, topic_word_distr, algorithm=None):
    """Group the entries of ``topic_word_distr`` into clusters.

    With ``algorithm=None`` the axis-0 mean of ``topic_word_distr`` is sorted,
    the sorted values are handed to ``self.get_clusters`` and each returned
    group is mapped back through the argsort order. The result is a list of
    lists of the original positions.

    With ``algorithm='kmeans'`` a two-cluster KMeans is fitted on
    ``self.doc_term_freq``, its inertia is printed, the call waits on
    ``input()`` and ``None`` is returned.

    Any other ``algorithm`` value returns ``None``.
    """
    if algorithm is None:
        axis_means = topic_word_distr.mean(0)
        ascending_means = sorted(axis_means)
        # Positions of the original entries, in ascending order of their mean.
        order = np.array(np.argsort(axis_means))
        groups = self.get_clusters(ascending_means)

        console_log('{}{} {} topic(s) found {}{}'.format(
            ' ' * 25, '=' * 25, len(groups), '=' * 25, ' ' * 25))

        # Translate every group of sorted positions back to original positions.
        return [list(order[group]) for group in groups]

    if algorithm == 'kmeans':
        kmeans_settings = {
            'algorithm': 'auto',
            'copy_x': True,
            'init': 'k-means++',
            'max_iter': 600,
            'n_clusters': 2,
            'n_init': 10,
            'n_jobs': 1,
            'precompute_distances': 'auto',
            'random_state': None,
            'tol': 0.0001,
            'verbose': 0,
        }
        estimator = KMeans(**kmeans_settings)
        estimator.fit(self.doc_term_freq)
        print(estimator.inertia_)
        # Pause until the user presses enter.
        input()
        return None

    return None
def train(self, x, y):
    """Fit an ``MLPClassifier`` on ``x`` / ``y`` and return the fitted model.

    A new seed is drawn from the global ``random`` generator on every call
    and passed to the classifier as ``random_state``.
    """
    console_log(
        Constant.Text.MODEL + 'Training model.',
        indent=Constant.Text.INDENT,
        frequency=Constant.Sound.LOG_FREQUENCY,
    )

    # Integer seed strictly below 2**32 - 1.
    seed_ceiling = 2**32 - 1
    seed = math.floor(seed_ceiling * random.random())

    # Hyper-parameters gathered in one place.
    hyper_parameters = {
        'hidden_layer_sizes': (7, 1),
        'activation': 'tanh',
        'solver': 'adam',
        'learning_rate': 'adaptive',
        'momentum': 0.9,
        'alpha': 0.00001,
        'random_state': seed,
    }
    classifier = MLPClassifier(**hyper_parameters)

    classifier.fit(x, y)
    return classifier
def load_validation_data(self, file_name):
    """Log the validation-data load and return a pair of empty lists.

    No file is read: the method only emits the log line and returns
    ``([], [])``.
    """
    message = (
        Constant.Text.MODEL + 'Loading validation data, "' + file_name + '".'
    )
    console_log(
        message,
        indent=Constant.Text.INDENT,
        frequency=Constant.Sound.LOG_FREQUENCY,
    )
    features, targets = [], []
    return features, targets
def compute_model(self, model, transformation, x):
    """Apply ``transformation`` to ``x`` and return ``model.predict_proba`` of it."""
    console_log(
        Constant.Text.MODEL + 'Computing model.',
        indent=Constant.Text.INDENT,
        frequency=Constant.Sound.LOG_FREQUENCY,
    )
    # Transform first, then score the transformed features.
    return model.predict_proba(transformation.transform(x))
def initialize():
    """Log program start, optionally quiet TensorFlow, and seed NumPy's RNG."""
    start_message = Constant.Text.SYSTEM + 'Initialize Program.'
    console_log(
        start_message,
        lines_before=1,
        frequency=Constant.Sound.START_FREQUENCY,
    )

    # Disable warnings.
    # - This is done specifically to silence TensorFlow warnings.
    suppress_tf_warnings = Constant.System.SUPPRESS_TENSOR_FLOW_WARNINGS
    if suppress_tf_warnings:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # Initialize the random number generator with a fixed seed.
    np.random.seed(0)
def main():
    """Log a start banner and a stop banner, then emit a new line.

    Each banner is stamped with ``str(time.now())``.

    NOTE(review): ``time`` must expose ``now()``; the standard-library
    ``time`` module does not -- confirm which object is bound to this name.
    """
    # Both banners share the same logging options.
    banner_options = {'indent': 0, 'lines_before': 1, 'frequency': 11000}

    # Initialize application.
    console_log('PROGRAM.START: ' + str(time.now()), **banner_options)

    # Shut down application.
    console_log('PROGRAM.STOP: ' + str(time.now()), **banner_options)
    console_new_line()
def getTopWords(self, verbose=1, num=10, topic_word_distr=None):
    """Return, for every column, the index labels of its ``num`` largest values.

    Args:
        verbose: when truthy, each column's name and its top entries are
            logged through ``console_log``.
        num: how many leading entries to keep per column.
        topic_word_distr: table to rank; falls back to
            ``self.topic_word_distr`` when ``None``.

    Returns:
        A list with one inner list of index labels per column, ordered from
        the largest value downwards.
    """
    distribution = (
        self.topic_word_distr if topic_word_distr is None else topic_word_distr
    )

    ranked_labels = []
    for column_name in distribution.columns:
        # Sort the column in descending order and keep the leading `num` rows.
        leaders = distribution[column_name].sort_values(ascending=False)[:num]
        if verbose:
            console_log('Topic {}\n{}'.format(column_name, '='*10))
            console_log(leaders, end='\n\n')
        ranked_labels.append(list(leaders.index))
    return ranked_labels
def save_live_data(self, file_name, i, y):
    """Write ``i`` joined with column 1 of ``y`` to ``file_name`` as CSV.

    Args:
        file_name: destination path of the CSV file.
        i: values placed in the leading column(s) via ``pd.DataFrame(i)``.
        y: two-dimensional array; only ``y[:, 1]`` is written, under the
            column name ``Constant.Numerai.CSV.PROBABILITY``.
    """
    console_log(
        Constant.Text.MODEL + 'Saving live predictions, "' + file_name + '".',
        indent=Constant.Text.INDENT,
        frequency=Constant.Sound.LOG_FREQUENCY,
    )

    # Keep only the second column of the prediction array.
    second_column = y[:, 1]
    probability_frame = pd.DataFrame(
        data={Constant.Numerai.CSV.PROBABILITY: second_column}
    )

    # Put the identifiers next to the predictions and write without the index.
    output_frame = pd.DataFrame(i).join(probability_frame)
    output_frame.to_csv(file_name, index=False)
def select_features(self, x):
    """Fit a PCA on ``x`` and return ``(fitted_pca, transformed_x)``.

    The PCA keeps as many components as ``x`` has columns.
    """
    console_log(
        Constant.Text.MODEL + 'Selecting features.',
        indent=Constant.Text.INDENT,
        frequency=Constant.Sound.LOG_FREQUENCY,
    )

    # One component per input column.
    component_count = len(x.columns)
    reducer = PCA(n_components=component_count)

    # Fit on the data, then project the same data.
    reducer.fit(x)
    projected = reducer.transform(x)
    return reducer, projected
def train(self, docs, labels, n=1.0):
    """Build the document tables, run one inference iteration and log results.

    Args:
        docs: documents passed on to ``constructDTF``.
        labels: labels passed on to ``constructDTF``.
        n: batch size used for the final log line only. A float is treated
            as a fraction of ``len(docs)``; any other value is reported
            as-is. All of ``docs`` is handed to ``constructDTF`` regardless
            of ``n``.

    Returns:
        None.
    """
    # `isinstance` (rather than `type(n) == float`) so that float subclasses
    # such as numpy.float64 are also treated as a fraction of the corpus.
    docs_length = int(n * len(docs)) if isinstance(n, float) else n

    # ---- preprocessing ----
    console_log('-'*30, 'Preprocessing!', '-'*30, '\n')
    # Build the document/term/topic tables.
    self.constructDTF(docs, labels)

    # ---- training ----
    console_log('='*30, 'Training!', '='*30, '\n')
    # Iterate and infer.
    self.run_iteration()

    # Show the transposed topic-word distribution.
    topic_word_distr = self.topic_word_distr.T
    console_log('{}{} Topic (word) distribution! {}{}'.format(
        ' '*25, '='*25, '='*25, ' '*25))
    console_log(topic_word_distr, end='\n\n')

    # Log the top words of every topic (the returned lists are not used here).
    self.getTopWords()

    console_log('{} doc(s) read and {} word(s) in the vocabulary'.format(
        docs_length, len(self.word_docs)))
def load_live_data(self, file_name):
    """Read the live CSV and return ``(ids, features)``.

    ``ids`` is the column named ``Constant.Numerai.CSV.ID``; ``features`` is
    the sub-frame of every column whose name contains
    ``Constant.Numerai.CSV.FEATURE``.
    """
    console_log(
        Constant.Text.MODEL + 'Loading live data, "' + file_name + '".',
        indent=Constant.Text.INDENT,
        frequency=Constant.Sound.LOG_FREQUENCY,
    )
    frame = pd.read_csv(file_name, header=0)

    # Pick the feature columns by substring match on the column name.
    feature_marker = Constant.Numerai.CSV.FEATURE
    feature_columns = [name for name in frame.columns if feature_marker in name]

    ids = frame[Constant.Numerai.CSV.ID]
    features = frame[feature_columns]
    return ids, features
def load_training_data(self, file_name):
    """Read the training CSV and return ``(features, targets)``.

    ``features`` is the sub-frame of every column whose name contains
    ``Constant.Numerai.CSV.FEATURE``; ``targets`` is the column named
    ``Constant.Numerai.CSV.TARGET``.
    """
    console_log(
        Constant.Text.MODEL + 'Loading training data, "' + file_name + '".',
        indent=Constant.Text.INDENT,
        frequency=Constant.Sound.LOG_FREQUENCY,
    )
    frame = pd.read_csv(file_name, header=0)

    # Pick the feature columns by substring match on the column name.
    feature_marker = Constant.Numerai.CSV.FEATURE
    feature_columns = [name for name in frame.columns if feature_marker in name]

    features = frame[feature_columns]
    targets = frame[Constant.Numerai.CSV.TARGET]
    return features, targets
def load_test_data(self, file_name):
    """Log that test data is being loaded; nothing is read or returned."""
    message = Constant.Text.MODEL + 'Loading test data, "' + file_name + '".'
    console_log(
        message,
        indent=Constant.Text.INDENT,
        frequency=Constant.Sound.LOG_FREQUENCY,
    )
def run_iteration(self):
    """Derive the topic-word distribution and re-weight ``self.term_term_ratio``.

    Reads ``self.doc_term_freq``, ``self.doc_topic`` and
    ``self.term_term_ratio``; writes ``self.topic_word_distr``,
    ``self.term_term_ratio`` and ``self.best_words_indices``.
    Returns ``None``.

    NOTE(review): SOURCE arrived with its line structure flattened; the
    nesting below is reconstructed from the statements and their comments --
    confirm against the original layout.
    """
    term_doc_freq = self.doc_term_freq.T
    # Zero-filled table: one column per column of term_doc_freq, one row per
    # index label of self.doc_topic.
    term_topic = pd.DataFrame(data=0, columns=term_doc_freq.columns, index=self.doc_topic.index)
    console_log('\n', '-'*25, 'Constructing Topic word distribution!', '-'*25)
    for word in tqdm(term_doc_freq.columns):
        tdf = term_doc_freq[word]
        # index labels where this column of term_doc_freq is positive
        doc_indices = tdf[tdf > 0].index
        # add the axis-1 sum of the matching doc_topic columns
        term_topic[word] += self.doc_topic[doc_indices].sum(1)
    # leave a line
    console_log()
    # the general topic word matrix
    word_topic_matrix = term_topic.copy()
    console_log('\n', '-'*25, 'Checking for informative words!', '-'*25)
    # non informative columns
    columns_to_drop = []
    # check for informative words
    for term in tqdm(term_topic.columns):
        # normalize the column by its own sum
        # NOTE(review): a column whose sum is 0 would divide by zero here.
        word_topic_matrix[term] /= word_topic_matrix[term].sum()
        # select the ones with diluted topics to drop: a column is dropped
        # when none of its entries exceeds (column mean + column min)
        # if not (term_topic[term] == term_topic[term].sum()).any():
        # if not (term_topic[term] > term_topic[term].mean()).any():
        if not (term_topic[term] > term_topic[term].mean() + term_topic[term].min()).any():
            columns_to_drop.append(term)
    # flip the word topic to get topic word; this is stored before any column
    # is dropped, so it still covers every column of term_topic
    self.topic_word_distr = word_topic_matrix.T
    # display the topic-word distr
    console_log(self.topic_word_distr.T)
    # leave a line
    console_log()
    # drop non informative columns
    term_topic = term_topic.drop(columns=columns_to_drop)
    # trim down the informative words
    best_words_indices = []
    topic_term = term_topic.T
    console_log('\n', '-'*25, 'Trimming for informative words!', '-'*25)
    for topic in tqdm(term_topic.index):
        topic_terms = topic_term[topic]
        # threshold is the mean of the *distinct* values, not of all values
        topic_terms_mean = np.unique(topic_terms.values).mean()
        best_words_indices.extend(
            list(
                topic_terms[topic_terms > topic_terms_mean].index
            )
        )
    # leave a line
    console_log()
    # the unique index of the best words (set() does not preserve order)
    best_words_indices = list(set(best_words_indices))
    # display topwords for topic word distr
    # self.getTopWords(topic_word_distr=self.topic_word_distr.T[best_words_indices].T)
    # First pass (always runs): re-weight only the best-word columns.
    if 1:
        # the new term term ratio to be inferred from best of best
        # temp term term ratio matrix (zeroed copy of the best-word columns)
        ttr = self.term_term_ratio[best_words_indices] * 0
        # display the current running process
        console_log('\n', '-'*25, 'Infering best_word-word ratio!', '-'*25)
        # infer word for word
        for w1 in tqdm(best_words_indices):
            for w2 in best_words_indices:
                factor = 1
                # inference from sharing occurrence with informative word
                # co_occurence_inference_factor = self.term_term_ratio[w1][w2]
                # if co_occurence_inference_factor > 0:
                #     factor *= co_occurence_inference_factor
                # inference from sharing topic with best word: mean of the
                # element-wise product of the two words' columns
                co_topic_inference_factor = (
                    self.topic_word_distr.T[w1] * self.topic_word_distr.T[w2]).mean()
                if co_topic_inference_factor > 0:
                    factor *= co_topic_inference_factor
                # infer relation of words
                # NOTE(review): placed at loop-body level (outside the `if`),
                # so a non-positive factor leaves the weight at 1 -- confirm.
                ttr[w2] += factor * self.term_term_ratio[w1]
        # normalize by the number of best words (all zeros when there are none)
        self.term_term_ratio[best_words_indices] = ttr / len(best_words_indices) if len(best_words_indices) > 0 else ttr * 0
    # Second pass (always runs): re-weight the whole matrix. It reads
    # self.term_term_ratio as already modified by the first pass.
    if 1:
        # the new term ratio to be inferred from best
        ttr = self.term_term_ratio * 0  # temp term term ratio matrix
        # display the current running process
        console_log(
            '\n', '-'*25, 'Infering word-word ratio!', '-'*25)
        # infer word for word
        for w1 in tqdm(best_words_indices):
            for w2 in self.term_term_ratio.index:
                factor = 1
                # inference from sharing occurrence with informative word
                # co_occurence_inference_factor = self.term_term_ratio[w1][w2]
                # if co_occurence_inference_factor > 0:
                #     factor *= co_occurence_inference_factor
                # inference from sharing topic with best word
                co_topic_inference_factor = (self.topic_word_distr.T[w1] * self.topic_word_distr.T[w2]).mean()
                if co_topic_inference_factor > 0:
                    factor *= co_topic_inference_factor
                # infer relation of words
                ttr[w2] += factor * self.term_term_ratio[w1]
        # normalize; this replaces the whole matrix
        self.term_term_ratio = ttr / len(best_words_indices) if len(best_words_indices) > 0 else ttr * 0
    # console_log(term_term_ratio, '\n')
    # input('enter to continue!')
    # the most informative words (stored as a copy of the local list)
    self.best_words_indices = best_words_indices.copy()
    return
def constructDTF(self, docs, labels):
    """Build the document/term/topic tables from ``docs`` and ``labels``.

    Sets the following attributes:
        self.doc_topic: DataFrame built from ``{doc_index: {label: 1}}``
            with missing cells filled with 0.
        self.doc_term_freq: DataFrame built from
            ``{doc_index: {token: count}}`` with missing cells filled with 0.
        self.word_docs: ``{token: [doc_index, ...]}`` with one entry per
            token occurrence, so a document appears once for every time the
            token occurs in it.
        self.term_term_ratio: ``self.trustFactor(term_term_freq)`` multiplied
            by the co-occurrence ratio table.

    Args:
        docs: indexable collection of texts, accessed as ``docs[doc_index]``.
        labels: indexable collection of labels, accessed the same way.
    """
    doc_term_freq = {}
    doc_topic = {}
    # docs that each word belongs to
    word_docs = {}

    console_log('-'*25, 'Building Document Term Matrix!', '-'*25)
    # build vocabulary
    for doc_index in tqdm(range(len(docs))):
        text = docs[doc_index]
        doc_topic[doc_index] = {labels[doc_index]: 1}
        token_counts = doc_term_freq[doc_index] = {}
        for token in self.tokenize(text):
            token_counts[token] = token_counts.get(token, 0) + 1
            # record the doc for every occurrence of the token
            word_docs.setdefault(token, []).append(doc_index)

    console_log()
    console_log('-'*25, 'Building Word Occurrence and Co-occurrence!', '-'*25)

    # Build each token's document set once instead of once per token pair;
    # the pair loop below is quadratic in the vocabulary size.
    doc_sets = {token: set(doc_list) for token, doc_list in word_docs.items()}

    term_term_freq = {}
    term_term_ratio = {}
    for token1 in tqdm(word_docs):
        docs1 = doc_sets[token1]
        # Occurrence count, duplicates included.
        # NOTE(review): a token repeated inside one document therefore lowers
        # its ratios -- confirm this weighting is intended.
        occurrences1 = len(word_docs[token1])
        freq_row = term_term_freq[token1] = {}
        ratio_row = term_term_ratio[token1] = {}
        for token2, docs2 in doc_sets.items():
            shared_docs = len(docs1 & docs2)
            freq_row[token2] = shared_docs
            ratio_row[token2] = shared_docs / occurrences1 if occurrences1 > 0 else 0

    # make the dataframes, replacing nan with 0
    self.doc_topic = pd.DataFrame(doc_topic)
    self.doc_term_freq = pd.DataFrame(doc_term_freq)
    term_term_freq = pd.DataFrame(term_term_freq)
    term_term_ratio = pd.DataFrame(term_term_ratio)

    # set word_docs as field
    self.word_docs = word_docs.copy()

    self.doc_topic.fillna(0, inplace=True)
    self.doc_term_freq.fillna(0, inplace=True)
    term_term_freq.fillna(0, inplace=True)
    term_term_ratio.fillna(0, inplace=True)

    # adjust term ratio with trust factor
    self.term_term_ratio = self.trustFactor(term_term_freq) * term_term_ratio

    console_log()
    console_log(self.doc_topic, '\n')