Example 1
    def run(self):

        # Application start.

        console_log(Constant.Text.APPLICATION + 'Application.Start.',
                    indent=0,
                    frequency=Constant.Sound.LOG_FREQUENCY)

        # Train model.

        x_train, y_train = self.load_training_data(self.file_name_training)
        x_validation, y_validation = self.load_validation_data(
            self.file_name_training)
        transformation, x_train_transformed = self.select_features(x_train)
        model = self.train(x_train_transformed, y_train)

        # Apply model.

        id_live, x_live = self.load_live_data(self.file_name_live)
        y_live = self.compute_model(model, transformation, x_live)
        self.save_live_data(self.file_name_predictions, id_live, y_live)

        # Application end.

        console_log(Constant.Text.APPLICATION + 'Application.Stop.',
                    indent=0,
                    frequency=Constant.Sound.LOG_FREQUENCY)

        self.report_model(model, transformation, x_train_transformed, y_train)
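
All of the examples on this page log through a project-specific console_log helper that is not included in the listing. Below is a minimal sketch of what such a helper might look like, inferred purely from the call sites; the indent, lines_before, frequency, and end keyword arguments are assumptions, and the real implementation presumably also plays a tone at the given frequency rather than ignoring it:

def console_log(*messages, indent=0, lines_before=0, frequency=None, end='\n'):
    # Hypothetical stand-in: emit leading blank lines, indent the message,
    # join multiple positional arguments, and ignore the audio frequency.
    print('\n' * lines_before, end='')
    print(' ' * indent + ' '.join(str(m) for m in messages), end=end)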
Example 2
	def getClusters(self, topic_word_distr, algorithm=None):
		if algorithm is None:
			res = topic_word_distr.mean(0)
			sorted_res = sorted(res)

			indices = np.array(np.argsort(res))
			topics = self.get_clusters(sorted_res)

			# console_log(indices, topics)
			console_log('{}{} {} topic(s) found {}{}'.format(
				' '*25, '='*25, len(topics), '='*25, ' '*25))

			tp = []
			for topic in topics:
				indx = indices[topic]

				tp.append([])
				for index in indx:
					tp[-1].append(index)
					# console_log(model.columns[index], round(res[index], 4))
				# console_log()

			return tp

		elif algorithm == 'kmeans':
			X = self.doc_term_freq
			kmeans = KMeans(copy_x=True, init='k-means++', max_iter=600,
							n_clusters=2, n_init=10,
							random_state=None, tol=0.0001, verbose=0)

			kmeans.fit(X)
			print(kmeans.inertia_)
			input()

			return
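
The kmeans branch above hard-codes n_clusters=2 and prints the fitted model's inertia_ (the within-cluster sum of squared distances) before pausing for input. A hedged sketch of how that inertia value is typically used to choose a cluster count via the elbow method; the random matrix below is only a stand-in for self.doc_term_freq:

import numpy as np
from sklearn.cluster import KMeans

X = np.random.rand(100, 20)  # stand-in for the document-term matrix

# Inertia always decreases as n_clusters grows; look for the "elbow"
# where adding clusters stops paying for itself.
for k in range(1, 8):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    print(k, round(kmeans.inertia_, 2))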
Example 3
    def train(self, x, y):

        console_log(Constant.Text.MODEL + 'Training model.',
                    indent=Constant.Text.INDENT,
                    frequency=Constant.Sound.LOG_FREQUENCY)

        # Compute random state.

        random_state_max = 2**32 - 1
        random_state = math.floor(random_state_max * random.random())

        # Configure model parameters.

        model = MLPClassifier(hidden_layer_sizes=(7, 1),
                              activation='tanh',
                              solver='adam',
                              learning_rate='adaptive',
                              momentum=0.9,
                              alpha=0.00001,
                              random_state=random_state)

        # Train the model.

        model.fit(x, y)

        # Return to caller.

        return model
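
A self-contained sketch of how this train method might be exercised, with synthetic data standing in for the Numerai features; the sample count, feature count, and use of random.randrange are illustrative assumptions, not part of the original:

import random
from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier

x, y = make_classification(n_samples=500, n_features=21, random_state=0)

# Same idea as above: draw a fresh random_state for each training run.
random_state = random.randrange(2**32)

model = MLPClassifier(hidden_layer_sizes=(7, 1),
                      activation='tanh',
                      solver='adam',
                      learning_rate='adaptive',
                      momentum=0.9,
                      alpha=0.00001,
                      random_state=random_state)
model.fit(x, y)
print(model.score(x, y))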
Example 4
    def load_validation_data(self, file_name):

        console_log(Constant.Text.MODEL + 'Loading validation data, "' +
                    file_name + '".',
                    indent=Constant.Text.INDENT,
                    frequency=Constant.Sound.LOG_FREQUENCY)

        return [], []
Example 5
    def compute_model(self, model, transformation, x):

        console_log(Constant.Text.MODEL + 'Computing model.',
                    indent=Constant.Text.INDENT,
                    frequency=Constant.Sound.LOG_FREQUENCY)

        # Compute model.

        x_transformed = transformation.transform(x)
        y = model.predict_proba(x_transformed)

        # Return to caller.

        return y
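
Note that predict_proba returns one column per class, so for a binary target y has shape (n_samples, 2) and each row sums to 1; save_live_data in Example 9 keeps only column 1, the probability of the positive class. A quick illustration with made-up values:

import numpy as np

# Shape (n_samples, n_classes); column 1 is P(class == 1).
y = np.array([[0.7, 0.3],
              [0.2, 0.8]])
print(y[:, 1])  # -> [0.3 0.8]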
Example 6
def initialize():

    console_log(Constant.Text.SYSTEM + 'Initialize Program.',
                lines_before=1,
                frequency=Constant.Sound.START_FREQUENCY)

    # Disable warnings.
    # - We do this specifically to disable TensorFlow warnings.

    if Constant.System.SUPPRESS_TENSOR_FLOW_WARNINGS:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # Initialize random number generator.

    np.random.seed(0)
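
TF_CPP_MIN_LOG_LEVEL controls TensorFlow's native logging and must be set before TensorFlow is imported: '0' shows everything, '1' filters INFO, '2' filters INFO and WARNING (as above), and '3' additionally filters ERROR. For example:

import os

# Must run before `import tensorflow` to take effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # hide INFO and WARNING messages

import tensorflow as tf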
Example 7
def main():

    # Initialize application.

    console_log('PROGRAM.START: ' + time.ctime(),
                indent=0,
                lines_before=1,
                frequency=11000)

    # Shut down application.

    console_log('PROGRAM.STOP: ' + time.ctime(),
                indent=0,
                lines_before=1,
                frequency=11000)
    console_new_line()
Example 8
	def getTopWords(self, verbose=1, num=10, topic_word_distr=None):
		if topic_word_distr is None:
			topic_word_distr = self.topic_word_distr

		topwords = []
		# getting the top words influencing each topic
		for t in topic_word_distr.columns:
			topic_topwords = topic_word_distr[t].sort_values(ascending=False)[:num]

			if verbose:
				console_log('Topic {}\n{}'.format(t, '='*10))
				console_log(topic_topwords, end='\n\n')

			topwords.append(list(topic_topwords.index))

		return topwords
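
The method assumes topic_word_distr is a pandas DataFrame with one column per topic and one row per word, so sort_values(ascending=False)[:num] picks each topic's strongest words. A toy illustration with made-up weights:

import pandas as pd

topic_word_distr = pd.DataFrame(
    {'topic_0': [0.5, 0.1, 0.4], 'topic_1': [0.2, 0.7, 0.1]},
    index=['apple', 'ball', 'cat'])

top = topic_word_distr['topic_0'].sort_values(ascending=False)[:2]
print(list(top.index))  # -> ['apple', 'cat']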
Example 9
    def save_live_data(self, file_name, i, y):

        # Save file.

        console_log(Constant.Text.MODEL + 'Saving live predictions, "' +
                    file_name + '".',
                    indent=Constant.Text.INDENT,
                    frequency=Constant.Sound.LOG_FREQUENCY)

        # Format prediction results as a pandas DataFrame.

        results = y[:, 1]
        results_dataframe = pd.DataFrame(
            data={Constant.Numerai.CSV.PROBABILITY: results})
        y_prediction_dataframe = pd.DataFrame(i).join(results_dataframe)

        # Save data.

        y_prediction_dataframe.to_csv(file_name, index=False)
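
A sketch of the id-plus-probability assembly with toy arrays; the 'id' and 'probability' column names are stand-ins for the unshown Constant.Numerai.CSV values:

import numpy as np
import pandas as pd

i = pd.Series(['id_1', 'id_2'], name='id')  # stand-in ID column
y = np.array([[0.7, 0.3], [0.2, 0.8]])      # predict_proba output

results_dataframe = pd.DataFrame(data={'probability': y[:, 1]})
y_prediction_dataframe = pd.DataFrame(i).join(results_dataframe)
print(y_prediction_dataframe)  # columns: id, probability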
Example 10
    def select_features(self, x):

        console_log(Constant.Text.MODEL + 'Selecting features.',
                    indent=Constant.Text.INDENT,
                    frequency=Constant.Sound.LOG_FREQUENCY)

        # Configure algorithm.

        feature_count = len(x.columns)
        transformation = PCA(n_components=feature_count)

        # Fit model.

        transformation.fit(x)

        # Apply model.

        x_transformed = transformation.transform(x)

        # Return to caller.

        return transformation, x_transformed
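
With n_components equal to the full feature count, this PCA only rotates and decorrelates the features; it does not reduce dimensionality. If reduction were wanted, explained_variance_ratio_ is the usual guide. A brief sketch on toy data:

import numpy as np
from sklearn.decomposition import PCA

x = np.random.rand(200, 10)

pca = PCA(n_components=10).fit(x)

# Cumulative variance explained; pick the smallest k covering, say, 95%.
cumulative = np.cumsum(pca.explained_variance_ratio_)
k = int(np.searchsorted(cumulative, 0.95)) + 1
print(k, cumulative)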
Example 11
	def train(self, docs, labels, n=1.0):
		# the total number of documents
		docs_length = len(docs)

		# the batch to learn
		docs_length = int(n * docs_length) if type(n) == float else n

		# ---------------------------------------------preprocessing----------------------------------
		console_log('-'*30, 'Preprocessing!', '-'*30, '\n')

		# constructs a model
		self.constructDTF(docs, labels)

		# ----------------------------------------training----------------------------------------
		console_log('='*30, 'Training!', '='*30, '\n')

		# iterate and infer
		self.run_iteration()

		# set topic word_distr
		topic_word_distr = self.topic_word_distr.T

		# show the topic distribution
		console_log('{}{} Topic (word) distribution! {}{}'.format(
			' '*25, '='*25, '='*25, ' '*25))
		console_log(topic_word_distr, end='\n\n')

		# getting the percentage influence of word to topic
		# self.topic_word_distr /= topic_word_distr.sum(1)

		# console_log('{}{} Topic (word(%)) distribution! {}{}'.format(' '*25, '='*25, '='*25, ' '*25))
		# console_log(self.topic_word_distr.T, end='\n\n')

		# self.topic_word_distr = topic_word_distr
		self.getTopWords()

		console_log('{} doc(s) read and {} word(s) in the vocabulary'.format(
			docs_length, len(self.word_docs)))
		return
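
The n argument doubles as a fraction or an absolute count: a float is read as a share of the corpus, an int as a literal number of documents. A standalone illustration of that branch (the helper name here is hypothetical):

def batch_size(docs, n=1.0):
    # Mirrors the example's logic: float -> fraction, anything else -> count.
    docs_length = len(docs)
    return int(n * docs_length) if type(n) == float else n

docs = ['a'] * 100
print(batch_size(docs, 0.25))  # -> 25 (a quarter of the corpus)
print(batch_size(docs, 10))    # -> 10 (exactly ten documents)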
Example 12
    def load_live_data(self, file_name):

        # Load file.

        console_log(Constant.Text.MODEL + 'Loading live data, "' + file_name +
                    '".',
                    indent=Constant.Text.INDENT,
                    frequency=Constant.Sound.LOG_FREQUENCY)

        live_data = pd.read_csv(file_name, header=0)

        # Format the loaded CSV data into numpy arrays.

        features = [
            f for f in list(live_data) if Constant.Numerai.CSV.FEATURE in f
        ]

        i = live_data[Constant.Numerai.CSV.ID]  # ID vector.
        x = live_data[features]  # Feature tensor.

        # Return to caller.

        return i, x
Example 13
    def load_training_data(self, file_name):

        # Load data file.

        console_log(Constant.Text.MODEL + 'Loading training data, "' +
                    file_name + '".',
                    indent=Constant.Text.INDENT,
                    frequency=Constant.Sound.LOG_FREQUENCY)

        training_data = pd.read_csv(file_name, header=0)

        # Format the loaded CSV data into numpy arrays.

        features = [
            f for f in list(training_data) if Constant.Numerai.CSV.FEATURE in f
        ]

        x = training_data[features]
        y = training_data[Constant.Numerai.CSV.TARGET]

        # Return to caller.

        return x, y
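
Both loaders pick feature columns by substring match on the header names. A toy sketch of that pattern; the 'id', 'feature', and 'target' names stand in for the unshown Constant.Numerai.CSV values:

import pandas as pd

training_data = pd.DataFrame({
    'id': ['a', 'b'],
    'feature1': [0.1, 0.2],
    'feature2': [0.3, 0.4],
    'target': [0, 1],
})

# list(df) yields the column names; keep those containing 'feature'.
features = [f for f in list(training_data) if 'feature' in f]

x = training_data[features]
y = training_data['target']
print(features)  # -> ['feature1', 'feature2']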
Example 14
    def load_test_data(self, file_name):

        console_log(Constant.Text.MODEL + 'Loading test data, "' + file_name +
                    '".',
                    indent=Constant.Text.INDENT,
                    frequency=Constant.Sound.LOG_FREQUENCY)
Example 15
	def run_iteration(self):
		term_doc_freq = self.doc_term_freq.T
		term_topic = pd.DataFrame(data=0, columns=term_doc_freq.columns, index=self.doc_topic.index)

		console_log('\n', '-'*25, 'Constructing Topic word distribution!', '-'*25)

		for word in tqdm(term_doc_freq.columns):
			tdf = term_doc_freq[word]  # word term_doc_freq

			doc_indices = tdf[tdf > 0].index
			term_topic[word] += self.doc_topic[doc_indices].sum(1)

		# leave a line
		console_log()

		# the general topic word matrix
		word_topic_matrix = term_topic.copy()

		console_log('\n', '-'*25, 'Checking for informative words!', '-'*25)

		# non informative columns
		columns_to_drop = []

		# check for informative words
		for term in tqdm(term_topic.columns):
			# normalize the term to topic
			word_topic_matrix[term] /= word_topic_matrix[term].sum()

			# select the ones with diluted topics to drop
			# if not (term_topic[term] == term_topic[term].sum()).any():
			# if not (term_topic[term] > term_topic[term].mean()).any():
			if not (term_topic[term] > term_topic[term].mean() + term_topic[term].min()).any():
				columns_to_drop.append(term)

		# flip the word topic to get topic word
		self.topic_word_distr = word_topic_matrix.T

		# display the topic-word distr
		console_log(self.topic_word_distr.T)

		# leave a line
		console_log()

		# drop non informative columns
		term_topic = term_topic.drop(columns=columns_to_drop)

		# trim down the informative words
		best_words_indices = []
		topic_term = term_topic.T

		console_log('\n', '-'*25, 'Trimming for informative words!', '-'*25)

		for topic in tqdm(term_topic.index):
			topic_terms = topic_term[topic]
			topic_terms_mean = np.unique(topic_terms.values).mean()

			best_words_indices.extend(
				list(
					topic_terms[topic_terms > topic_terms_mean].index
				)
			)

		# leave a line
		console_log()

		# the unique index of the best words
		best_words_indices = list(set(best_words_indices))

		# display top words for the topic-word distr
		# self.getTopWords(topic_word_distr=self.topic_word_distr.T[best_words_indices].T)

		if 1:
			# the new term term ratio to be inferred from best of best
			# temp term term ratio matrix
			ttr = self.term_term_ratio[best_words_indices] * 0

			# display the current running process
			console_log('\n', '-'*25, 'Inferring best_word-word ratio!', '-'*25)

			# infer word for word
			for w1 in tqdm(best_words_indices):
				for w2 in best_words_indices:
					factor = 1

					# inference from sharing occurrence with informative word
					# co_occurence_inference_factor = self.term_term_ratio[w1][w2]
					# if co_occurence_inference_factor > 0:
					# 	factor *= co_occurence_inference_factor

					# inference from sharing topic with best word
					co_topic_inference_factor = (
						self.topic_word_distr.T[w1] * self.topic_word_distr.T[w2]).mean()
					if co_topic_inference_factor > 0:
						factor *= co_topic_inference_factor

					# infer relation of words
					ttr[w2] += factor * self.term_term_ratio[w1]

			# normalize
			self.term_term_ratio[best_words_indices] = ttr / len(best_words_indices) if len(best_words_indices) > 0 else ttr * 0
			
		if 1:
			# the new term ratio to be inferred from best
			ttr = self.term_term_ratio * 0  # temp term term ratio matrix

			# display the current running process
			console_log(
				'\n', '-'*25, 'Inferring word-word ratio!', '-'*25)

			# infer word for word
			for w1 in tqdm(best_words_indices):
				for w2 in self.term_term_ratio.index:
					factor = 1

					# inference from sharing occurrence with informative word
					# co_occurence_inference_factor = self.term_term_ratio[w1][w2]
					# if co_occurence_inference_factor > 0:
					# 	factor *= co_occurence_inference_factor

					# inference from sharing topic with best word
					co_topic_inference_factor = (self.topic_word_distr.T[w1] * self.topic_word_distr.T[w2]).mean()
					if co_topic_inference_factor > 0:
						factor *= co_topic_inference_factor

					# infer relation of words
					ttr[w2] += factor * self.term_term_ratio[w1]

			# normalize
			self.term_term_ratio = ttr / len(best_words_indices) if len(best_words_indices) > 0 else ttr * 0

		# console_log(term_term_ratio, '\n')
		# input('enter to continue!')

		# the most informative words
		self.best_words_indices = best_words_indices.copy()
		return
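
The core signal in both inference loops above is the co-topic factor: the mean element-wise product of two words' topic distributions, which is large when the words load on the same topics and near zero otherwise. A toy illustration with made-up distributions:

import pandas as pd

# Columns = words, rows = topics (i.e. the transposed topic_word_distr).
word_topic = pd.DataFrame(
    {'t0': [0.9, 0.8, 0.1], 't1': [0.1, 0.2, 0.9]},
    index=['w1', 'w2', 'w3']).T

# Words sharing topic mass score higher than unrelated ones.
print((word_topic['w1'] * word_topic['w2']).mean())  # 0.37 -> related
print((word_topic['w1'] * word_topic['w3']).mean())  # 0.09 -> unrelated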
Example 16
	def constructDTF(self, docs, labels):
		# construct the model
		doc_term_freq = {}
		doc_topic = {}
		term_term_freq = {}
		term_term_ratio = {}

		# docs that each word belongs to
		word_docs = {}

		console_log('-'*25, 'Building Document Term Matrix!', '-'*25)

		# build vocabulary
		for doc_index in tqdm(range(len(docs))):
			if doc_index not in doc_term_freq:
				doc_term_freq[doc_index] = {}
				doc_topic[doc_index] = {}

			text = docs[doc_index]
			doc_topic[doc_index][labels[doc_index]] = 1

			# get word tokens
			tokens = self.tokenize(text)

			for token in tokens:
				if token not in doc_term_freq[doc_index]:
					doc_term_freq[doc_index][token] = 0

				doc_term_freq[doc_index][token] += 1

				# check if the token is already initialized in word_docs
				if token not in word_docs:
					word_docs[token] = []

				# add the doc that the word belongs to
				word_docs[token].append(doc_index)
		console_log()

		console_log('-'*25, 'Building Word Occurrence and Co-occurrence!', '-'*25)
		for token1 in tqdm(word_docs):
			wd1 = word_docs[token1]
			if token1 not in term_term_freq:

				term_term_freq[token1] = {}
				term_term_ratio[token1] = {}

			for token2 in word_docs:
				wd2 = word_docs[token2]

				term_term_freq[token1][token2] = len(set(wd1).intersection(set(wd2)))
				term_term_ratio[token1][token2] = term_term_freq[token1][token2] / len(wd1) if len(wd1) > 0 else 0

		# make a dataframe
		self.doc_topic = pd.DataFrame(doc_topic)
		self.doc_term_freq = pd.DataFrame(doc_term_freq)

		# term term matrix
		term_term_freq = pd.DataFrame(term_term_freq)
		term_term_ratio = pd.DataFrame(term_term_ratio)

		# set word_docs as field
		self.word_docs = word_docs.copy()

		# replace nan as 0
		self.doc_topic.fillna(0, inplace=True)
		self.doc_term_freq.fillna(0, inplace=True)
		term_term_freq.fillna(0, inplace=True)
		term_term_ratio.fillna(0, inplace=True)

		# adjust term ratio with trust factor
		self.term_term_ratio = self.trustFactor(term_term_freq) * term_term_ratio

		console_log()
		console_log(self.doc_topic, '\n')
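
A compact sketch of the document-term construction this method performs, using a naive whitespace split in place of the class's unshown tokenize method and skipping the co-occurrence and trust-factor steps:

import pandas as pd

docs = ['the cat sat', 'the dog sat down']

doc_term_freq = {}
for doc_index, text in enumerate(docs):
    counts = {}
    for token in text.split():  # stand-in for self.tokenize(text)
        counts[token] = counts.get(token, 0) + 1
    doc_term_freq[doc_index] = counts

# Columns are documents, rows are terms; NaN -> 0 as in the example.
dtf = pd.DataFrame(doc_term_freq).fillna(0)
print(dtf)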