def trainModelGMM(X, lengths, states, num_gaus):
    """Fit a GMM-HMM to the observation matrix X and print diagnostics.

    X       : concatenated observation sequences (rows are frames).
    lengths : per-sequence lengths describing how X is partitioned.
    states  : number of hidden states (n_components).
    num_gaus: number of Gaussian mixtures per state (n_mix).

    Prints the decoded state sequence, convergence monitor status, and
    the log-likelihood of X under the trained model.
    """
    estimator = GMMHMM(n_components=states,
                       n_mix=num_gaus,
                       n_iter=1000,
                       verbose=True)
    model = estimator.fit(X, lengths)
    print('Mixture Models + HMM')
    print(model.predict(X))
    print(model.monitor_.converged)
    print(model.monitor_)
    print(model.score(X, lengths))
# Python 2 script fragment: train one HMM per activity class, then compare
# log-likelihood scores of two dining sequences under each model (higher
# score = better match).
# NOTE(review): relies on names defined elsewhere in the file (model_dining,
# model_fitness, model_work, model_shop, D, F, W, S, np, dataset_dining,
# seq_d_s, seq_d) — confirm against the full file. Original whitespace was
# collapsed; line structure reconstructed.
model_dining.fit(D)
model_fitness.fit(F)
model_work.fit(W)
model_shop.fit(S)
# Dump the learned initial-state and transition matrices of the dining model.
print model_dining.startprob_.tolist()
print model_dining.transmat_.tolist()
print 'After training'
# Score the short dining sequence ("s") under all four models.
print ' - Classification for seq dining s-'
print 'dining result:'
print model_dining.score(np.array(dataset_dining._convetNumericalSequence(seq_d_s)))
print 'fitness result:'
print model_fitness.score(np.array(dataset_dining._convetNumericalSequence(seq_d_s)))
print 'shop result:'
print model_shop.score(np.array(dataset_dining._convetNumericalSequence(seq_d_s)))
print 'work result:'
print model_work.score(np.array(dataset_dining._convetNumericalSequence(seq_d_s)))
# Score the long dining sequence ("l"); note: no shop score here.
print ' - Classification for seq dining l-'
print 'dining result:'
print model_dining.score(np.array(dataset_dining._convetNumericalSequence(seq_d)))
print 'fitness result:'
print model_fitness.score(np.array(dataset_dining._convetNumericalSequence(seq_d)))
print 'work result:'
print model_work.score(np.array(dataset_dining._convetNumericalSequence(seq_d)))
class StockPredictor(object):
    """Train a GMM-HMM on one ticker's historical quotes (pulled from
    Elasticsearch) and export scored candidate next-day outcomes.

    NOTE(review): depends on module-level names defined elsewhere in this
    file: `es`, `json_normalize`, `GMMHMM`, `ObjDict`, `INDEX_NAME`,
    `TYPE_NAME`, `TRADE_INDEX_NAME`, `TRADE_TYPE_NAME`, `np`, `json`,
    `itertools`, `datetime`, `timedelta`. Original whitespace was collapsed;
    structure reconstructed — verify nesting against the original file.
    """

    def __init__(self,
                 ticker,
                 chunks=9,
                 delta=0,
                 n_hidden_states=5,
                 n_latency_days=10,
                 n_steps_frac_change=10,
                 n_steps_frac_high=30,
                 n_steps_frac_low=10,
                 n_iter=100,
                 verbose=False,
                 prediction_date=None):
        # Running direction-weighted sum of outcome scores; exported as the
        # "trade" document (see predict_outcomes / json_data_for_trade).
        self.total_score = 0
        self.verbose = verbose
        self.ticker = ticker
        self.n_latency_days = n_latency_days
        self.hmm = GMMHMM(n_components=n_hidden_states, n_iter=n_iter)
        self.chunks = chunks
        self.delta = delta
        self.prediction_date = prediction_date
        self.fetch_training_data()
        self._compute_all_possible_outcomes(n_steps_frac_change,
                                            n_steps_frac_high,
                                            n_steps_frac_low)

    def fetch_training_data(self):
        """Fetch up to 10000 quotes for the ticker from ES and, when no
        prediction_date was supplied, derive it as the day after the newest
        record plus `delta` days."""
        print("Fetching training data ...")
        res = es.search(index="market",
                        doc_type="quote",
                        size=10000,
                        body={"query": {
                            "match": {
                                "ticker": self.ticker
                            }
                        }})
        self.training_data = json_normalize(res['hits']['hits'])
        # Windowed "chunking" by self.chunks/self.delta was disabled in the
        # original (large commented-out block removed); the full frame is
        # used as-is.
        self.chunked_training_data = self.training_data
        if self.verbose:
            print("Latest record for training:\n%s" %
                  self.chunked_training_data.tail(1))
        latest_date = self.chunked_training_data.tail(1)['_source.timestamp']
        datetime_object = datetime.datetime.strptime(latest_date.values[0],
                                                     '%Y-%m-%dT%H:%M:%S')
        if self.prediction_date is None:  # fixed: was `== None`
            prediction_date = datetime_object + timedelta(days=self.delta + 1)
            self.prediction_date = datetime.datetime.strftime(
                prediction_date, '%Y-%m-%dT%H:%M:%S')

    @staticmethod
    def _extract_features(data):
        """Stack the three fractional-move columns into an (n, 3) array."""
        frac_change = np.array(
            data['_source.change'])  #(close_price - open_price) / open_price
        frac_high = np.array(data['_source.change_high']
                             )  #(high_price - open_price) / open_price
        frac_low = np.array(
            data['_source.change_low'])  #(open_price - low_price) / open_price
        return np.column_stack((frac_change, frac_high, frac_low))

    def fit(self):
        """Train the GMM-HMM on the extracted feature vectors."""
        print('Extracting Features')
        feature_vector = StockPredictor._extract_features(
            self.chunked_training_data)
        if self.verbose:
            print("feature vector %s" % feature_vector)
        print('Training Model with %s features' % feature_vector.size)
        print(
            "Latest date to be used in training is %s" %
            self.chunked_training_data.tail(1)['_source.timestamp'].values[0])
        self.hmm.fit(feature_vector)
        print('Model trained')

    def _compute_all_possible_outcomes(self, n_steps_frac_change,
                                       n_steps_frac_high, n_steps_frac_low):
        """Build the Cartesian grid of candidate
        (frac_change, frac_high, frac_low) outcome vectors."""
        frac_change_range = np.linspace(-0.1, 0.1, n_steps_frac_change)
        frac_high_range = np.linspace(0, 0.05, n_steps_frac_high)
        frac_low_range = np.linspace(0, 0.05, n_steps_frac_low)
        self.all_possible_outcomes = np.array(
            list(
                itertools.product(frac_change_range, frac_high_range,
                                  frac_low_range)))

    def json_data_for_trade(self):
        """Build the [meta, data] JSON line pair for the ES "trade" doc."""
        rows = list()
        ticker = self.ticker
        date = self.prediction_date
        total_score = self.total_score
        # Renamed from `id` — shadowed the builtin.
        doc_id = "%s-%s-%s" % (ticker, date, total_score)
        meta = {
            "index": {
                "_index": TRADE_INDEX_NAME,
                "_type": TRADE_TYPE_NAME,
                "_id": doc_id
            }
        }
        rows.append(json.dumps(meta))
        row = ObjDict()
        row.total_score = total_score
        row.timestamp = self.prediction_date
        row.ticker = self.ticker
        rows.append(json.dumps(row))
        return rows

    def json_data_for_outcome(self, outcome, score):
        """Build the [meta, data] JSON line pair for one scored outcome."""
        rows = list()
        ticker = self.ticker
        date = self.prediction_date
        vector = outcome
        # Renamed from `id` — shadowed the builtin.
        doc_id = "%s-%s-%s" % (ticker, date, vector)
        meta = {
            "index": {
                "_index": INDEX_NAME,
                "_type": TYPE_NAME,
                "_id": doc_id
            }
        }
        rows.append(json.dumps(meta))
        row = ObjDict()
        row.frac_change = outcome[0]
        row.frac_high_range = outcome[1]
        row.frac_low_range = outcome[2]
        open_price = self.training_data.tail(1)['_source.open'].values[0]
        predicted_close = open_price * (1 + outcome[0])
        expected_value = outcome[0] * score
        row.predicted_close = predicted_close
        row.expected_value = expected_value
        row.timestamp = self.prediction_date
        row.score = score  # fixed: was assigned twice in the original
        row.chunks = self.chunks
        row.delta = self.delta
        row.ticker = self.ticker
        rows.append(json.dumps(row))
        return rows

    def delete_prediction_data(self, ticker):
        """Delete existing prediction docs for this predictor's ticker.

        NOTE(review): the `ticker` parameter is unused — the method acts on
        self.ticker. Kept for interface compatibility.
        """
        print("Deleting prediction data for ... %s" % self.ticker)
        es.delete_by_query(index=INDEX_NAME,
                           doc_type=TYPE_NAME,
                           body={'query': {
                               'match': {
                                   'ticker': self.ticker
                               }
                           }})

    def predict_outcomes(self):
        """Score every candidate outcome appended to the latency window,
        export positive-scoring ones to ES, and accumulate the
        direction-weighted total into a trade doc."""
        print("predicting outcomes for: %s" % self.prediction_date)
        previous_testing_data = self.training_data.tail(
            self.n_latency_days).index
        if self.verbose:
            print("previous_testing_data %s" % previous_testing_data)
        test_data = self.training_data.iloc[previous_testing_data]
        if self.verbose:
            print("Using the following slice of data:")
            print("[%s]" % previous_testing_data)
            print(test_data)
        test_data_features = StockPredictor._extract_features(test_data)
        bulk_data = list()
        trade_data = list()
        for possible_outcome in self.all_possible_outcomes:
            test_feature_vectors = np.row_stack(
                (test_data_features, possible_outcome))
            score = self.hmm.score(test_feature_vectors)
            # ignoring scores <= 0
            if score > 0:
                rows = self.json_data_for_outcome(possible_outcome, score)
                bulk_data.append(rows)
                # NOTE(review): upward moves add to the running total,
                # downward moves subtract — nesting under `score > 0`
                # reconstructed from collapsed whitespace; confirm.
                if possible_outcome[0] > 0:
                    self.total_score = self.total_score + score
                if possible_outcome[0] < 0:
                    self.total_score = self.total_score - score
        trade_rows = self.json_data_for_trade()
        trade_data.append(trade_rows)
        print("Exporting predictions to ES")
        es_array = self.format_data_for_es(bulk_data)
        es.bulk(index=INDEX_NAME, body=es_array, refresh=True)
        es_array = self.format_data_for_es(trade_data)
        es.bulk(index=TRADE_INDEX_NAME, body=es_array, refresh=True)

    def format_data_for_es(self, data):
        """Flatten [meta, data] row pairs into a newline-delimited bulk body."""
        es_array = ""
        for row in data:
            es_array += row[0] + "\n" + row[1] + "\n"
        return es_array
def _read_observation_sequences(path):
    """Read one feature file into a list of 10-frame numpy arrays.

    Each line is a comma-separated float vector (one frame); consecutive
    frames are grouped in tens. A trailing partial group (< 10 lines) is
    dropped, matching the original behaviour.
    """
    obs = []
    frames = []
    count = 0
    # Fixed: the original never closed its file handles.
    with open(path, "r") as fh:
        for line in fh:
            frames.append([float(v) for v in line.strip().split(",")])
            count += 1
            if count == 10:
                obs.append(numpy.array(frames))
                count = 0
                frames = []
    return obs


def _train_model_on_dir(model, directory):
    """Fit `model` on the observation sequences of every file in `directory`.

    NOTE(review): as in the original (reconstructed from collapsed
    whitespace), fit() is re-run per file, so the model keeps the parameters
    learned from the last file — confirm this is intended.
    """
    for path in get_files_list(directory):
        model.fit(_read_observation_sequences(path))


def main():
    """Train per-class GMM-HMMs (multi/single, shouting/not-shouting, and
    three speakers), then classify every test file and write a per-file
    report to testcase_output.txt.

    Deduplicated: the original repeated the identical file-reading/training
    loop seven times. Removed unused locals from the original (`discuss`
    model, `q`, `d`, `fn`, outdir8/outdir9 paths) and dead commented-out
    code.
    """
    outdir = r'./training_files/multi'
    outdir2 = r'./training_files/arnab'
    outdir3 = r'./training_files/kejriwal'
    outdir4 = r'./training_files/ravish'
    outdir5 = r'./training_files/not-shouting'
    outdir6 = r'./training_files/shouting'
    outdir7 = r'./training_files/single'

    # One 5-state, 2-mixture GMM-HMM per class.
    multi = GMMHMM(5, 2)
    arnab = GMMHMM(5, 2)
    kejriwal = GMMHMM(5, 2)
    ravish = GMMHMM(5, 2)
    notshouting = GMMHMM(5, 2)
    shouting = GMMHMM(5, 2)
    single = GMMHMM(5, 2)

    _train_model_on_dir(multi, outdir)
    _train_model_on_dir(arnab, outdir2)
    _train_model_on_dir(kejriwal, outdir3)
    _train_model_on_dir(ravish, outdir4)
    _train_model_on_dir(notshouting, outdir5)
    _train_model_on_dir(shouting, outdir6)
    _train_model_on_dir(single, outdir7)

    # Testing: classify each file in ./testing_files and write a report.
    out = open("testcase_output.txt", "w")
    te = get_files_list(r'./testing_files')
    for ad in te:
        obs = _read_observation_sequences(ad)

        # For each 10-frame observation, pick the best-scoring model in each
        # of the three classifier groups.
        p_choosen = []
        p1_choosen = []
        p2_choosen = []
        for ob in obs:
            group = [(shouting.score(ob), "shouting"),
                     (notshouting.score(ob), "notshouting")]
            p_choosen.append(max(group, key=lambda x: x[0]))
        for ob in obs:
            group = [(arnab.score(ob), "arnab"),
                     (kejriwal.score(ob), "kejriwal"),
                     (ravish.score(ob), "ravish")]
            p1_choosen.append(max(group, key=lambda x: x[0]))
        for ob in obs:
            group = [(multi.score(ob), "multi"), (single.score(ob), "single")]
            p2_choosen.append(max(group, key=lambda x: x[0]))
        # Keep only the winning labels.
        p_choosen = [b for a, b in p_choosen]
        p1_choosen = [b for a, b in p1_choosen]
        p2_choosen = [b for a, b in p2_choosen]

        shouting1 = []
        notshouting1 = []
        # Each observation covers 0.05 seconds of audio — presumably;
        # TODO confirm against the feature-extraction step.
        totaltime = len(p_choosen) * 0.05
        single_count = 0
        for i in range(len(p_choosen)):
            if p2_choosen[i] == "single":
                single_count += 1
            # NOTE(review): speaker tally is independent of the single/multi
            # decision here — nesting reconstructed from collapsed
            # whitespace; confirm.
            if p_choosen[i] == "shouting":
                shouting1.append(p1_choosen[i])
            elif p_choosen[i] == "notshouting":
                notshouting1.append(p1_choosen[i])

        d_shouting = {"arnab": 0, "kejriwal": 0, "ravish": 0}
        d_notshouting = {"arnab": 0, "kejriwal": 0, "ravish": 0}
        for speaker in shouting1:
            d_shouting[speaker] += 1
        for speaker in notshouting1:
            d_notshouting[speaker] += 1

        out.write("\n*******--> " + str(ad) + " <--*******\n")
        out.write("\nChecking single HMM and multi HMM:\n")
        out.write("Number of instance of Single: " + str(single_count) + "\n")
        out.write(
            "\nChecking shouting and non-shouting HMM for all Single instances:\n"
        )
        out.write("Number of instance of Shouting: " + str(len(shouting1)) +
                  "\n")
        out.write("Number of instance of Not-shouting: " +
                  str(len(notshouting1)) + "\n")
        out.write(
            "\nChecking the frequency of each speaker in both both shouting and not shouting instance...\n"
        )
        out.write("Shouting instance: \n" + str(d_shouting) + "\n")
        out.write("Not-Shouting instance: \n" + str(d_notshouting) + "\n")
        out.write("\nResult:\n")
        for speaker, cnt in d_shouting.items():
            out.write(
                str(speaker) + " was shouting for " + str(cnt * 0.05) +
                " sec.\n")
        out.write("\n")
        for speaker, cnt in d_notshouting.items():
            out.write(
                str(speaker) + " was not shouting for " + str(cnt * 0.05) +
                " sec.\n")
        out.write("\n")
        for speaker, cnt in d_shouting.items():
            out.write(
                str(speaker) + " was shouting for " +
                str(((cnt * 0.05) / totaltime) * 100) + " % of time.\n")
        out.write("\n")
        for speaker, cnt in d_notshouting.items():
            # Fixed: the value is a percentage but the original labeled it
            # " sec." (copy-paste error from the seconds loop above).
            out.write(
                str(speaker) + " was not shouting for " +
                str(((cnt * 0.05) / totaltime) * 100) + " % of time.\n")
        out.write("\n")
        print(d_shouting)
        print(d_notshouting)
    # Fixed: output file was never closed in the original.
    out.close()
# Script: fit a single GMM-HMM on MFCC features pooled from every WAV file
# in one speaker's folder, then report the hold-out log-likelihood.
# NOTE(review): original whitespace was collapsed onto one line; the model
# construction/fit is assumed to follow the accumulation loop (not sit
# inside it) — confirm against the original file.
import os
from hmmlearn.hmm import GMMHMM
from python_speech_features import mfcc
from scipy.io import wavfile
from sklearn.model_selection import train_test_split
import numpy as np
import sys

# Hard-coded local corpus path (CMU ARCTIC awb speaker).
input_folder = '/home/sachin/Downloads/cmu_us_awb_arctic-0.95-release/cmu_us_awb_arctic/wav'
hmm_models = []
X = np.array([])
for filename in os.listdir(input_folder):
    filepath = os.path.join(input_folder, filename)
    # wavfile.read returns (sample_rate, samples).
    sampling_freq, audio = wavfile.read(filepath)
    mfcc_features = mfcc(audio, sampling_freq)
    if len(X) == 0:
        X = mfcc_features
    else:
        # Stack this file's frames under the accumulated matrix
        # (np.append copies X each time — quadratic over the corpus).
        X = np.append(X, mfcc_features, axis=0)
model = GMMHMM(n_components=3, n_mix=45, n_iter=100)
# 70/30 split of individual MFCC frames (rows), not of whole files.
X_train, X_test = train_test_split(X, train_size=0.7)
hmm_models.append(model.fit(X_train))
print(model.score(X_test))
print "checking num", picChecked print '-----------------' try: resultGau0 = model0Gau.score(newTestPictures[picChecked]) resultGau1 = model1Gau.score(newTestPictures[picChecked]) resultGau2 = model2Gau.score(newTestPictures[picChecked]) resultGau3 = model3Gau.score(newTestPictures[picChecked]) resultGau4 = model4Gau.score(newTestPictures[picChecked]) resultGau5 = model5Gau.score(newTestPictures[picChecked]) resultGau6 = model6Gau.score(newTestPictures[picChecked]) except ValueError: print 'err gau' try: resultMix0 = model0GauMix.score(newTestPictures[picChecked]) resultMix1 = model1GauMix.score(newTestPictures[picChecked]) resultMix2 = model2GauMix.score(newTestPictures[picChecked]) resultMix3 = model3GauMix.score(newTestPictures[picChecked]) resultMix4 = model4GauMix.score(newTestPictures[picChecked]) resultMix5 = model5GauMix.score(newTestPictures[picChecked]) resultMix6 = model6GauMix.score(newTestPictures[picChecked]) except ValueError: print 'err gaumix' try: resultMulti0 = model0Multi.score(newTestPictures[picChecked]) resultMulti1 = model1Multi.score(newTestPictures[picChecked]) resultMulti2 = model2Multi.score(newTestPictures[picChecked]) resultMulti3 = model3Multi.score(newTestPictures[picChecked]) resultMulti4 = model4Multi.score(newTestPictures[picChecked])
# Python 2 script fragment: near-duplicate of the earlier activity-HMM
# evaluation — train one HMM per activity class, then compare
# log-likelihood scores of two dining sequences under each model.
# NOTE(review): relies on names defined elsewhere in the file (seq_s,
# model_*, D, F, W, S, np, dataset_dining, seq_d_s, seq_d). Original
# whitespace was collapsed; line structure reconstructed.
print seq_s
model_dining.fit(D)
model_fitness.fit(F)
model_work.fit(W)
model_shop.fit(S)
# Dump the learned initial-state and transition matrices of the dining model.
print model_dining.startprob_.tolist()
print model_dining.transmat_.tolist()
print 'After training'
# Score the short dining sequence ("s") under all four models; the highest
# log-likelihood indicates the predicted class.
print ' - Classification for seq dining s-'
print 'dining result:'
print model_dining.score(
    np.array(dataset_dining._convetNumericalSequence(seq_d_s)))
print 'fitness result:'
print model_fitness.score(
    np.array(dataset_dining._convetNumericalSequence(seq_d_s)))
print 'shop result:'
print model_shop.score(
    np.array(dataset_dining._convetNumericalSequence(seq_d_s)))
print 'work result:'
print model_work.score(
    np.array(dataset_dining._convetNumericalSequence(seq_d_s)))
# Score the long dining sequence ("l") — this fragment ends with the dining
# model only.
print ' - Classification for seq dining l-'
print 'dining result:'
print model_dining.score(
    np.array(dataset_dining._convetNumericalSequence(seq_d)))
class StockPredictor(object):
    """Train a GMM-HMM on a ticker's historical quotes from Elasticsearch
    and bulk-export scored candidate outcomes for the latest (held-out) day.

    NOTE(review): depends on module-level names defined elsewhere in this
    file: `es`, `json_normalize`, `GMMHMM`, `ObjDict`, `INDEX_NAME`,
    `TYPE_NAME`, `np`, `json`, `itertools`. Original whitespace was
    collapsed; structure reconstructed — verify nesting against the
    original file.
    """

    def __init__(self,
                 ticker,
                 n_hidden_states=5,
                 n_latency_days=10,
                 n_steps_frac_change=50,
                 n_steps_frac_high=30,
                 n_steps_frac_low=10,
                 n_iter=1000,
                 verbose=False):
        self.verbose = verbose
        self.ticker = ticker
        self.n_latency_days = n_latency_days
        self.hmm = GMMHMM(n_components=n_hidden_states, n_iter=n_iter)
        self.fetch_training_data()
        self.fetch_latest_data()  # to predict
        self._compute_all_possible_outcomes(n_steps_frac_change,
                                            n_steps_frac_high,
                                            n_steps_frac_low)

    def fetch_latest_data(self):
        """Fetch the newest quote for the ticker — the day to predict."""
        print("Fetching latest data ...")
        res = es.search(index="market",
                        doc_type="quote",
                        size=10000,
                        body={"query": {
                            "match": {
                                "ticker": self.ticker
                            }
                        }})
        latest_data = json_normalize(res['hits']['hits'])
        self.latest_data = latest_data.tail(1)
        if self.verbose:
            print("Latest data:\n%s" % self.latest_data)

    def fetch_training_data(self):
        """Fetch all quotes for the ticker and drop the newest row so it can
        serve as the held-out prediction target."""
        print("Fetching training data ...")
        res = es.search(index="market",
                        doc_type="quote",
                        size=10000,
                        body={"query": {
                            "match": {
                                "ticker": self.ticker
                            }
                        }})
        self.training_data = json_normalize(res['hits']['hits'])
        self.training_data.drop(self.training_data.tail(1).index,
                                inplace=True)
        print("%s records to train %s" %
              (len(self.training_data.index), self.ticker))
        if self.verbose:
            print("Latest record for training:\n%s" %
                  self.training_data.tail(1))

    @staticmethod
    def _extract_features(data):
        """Stack the three fractional-move columns into an (n, 3) array."""
        frac_change = np.array(
            data['_source.change'])  #(close_price - open_price) / open_price
        frac_high = np.array(data['_source.change_high']
                             )  #(high_price - open_price) / open_price
        frac_low = np.array(
            data['_source.change_low'])  #(open_price - low_price) / open_price
        return np.column_stack((frac_change, frac_high, frac_low))

    def fit(self):
        """Train the GMM-HMM on the extracted feature vectors."""
        print('Extracting Features')
        feature_vector = StockPredictor._extract_features(self.training_data)
        if self.verbose:
            print("feature vector %s" % feature_vector)
        print('Training Model with %s features' % feature_vector.size)
        print("Latest date to be used in training is %s" %
              self.training_data.tail(1)['_source.timestamp'].values[0])
        self.hmm.fit(feature_vector)
        print('Model trained')

    def _compute_all_possible_outcomes(self, n_steps_frac_change,
                                       n_steps_frac_high, n_steps_frac_low):
        """Build the Cartesian grid of candidate
        (frac_change, frac_high, frac_low) outcome vectors.

        Renamed from the misspelled `_compute_allall_possible_outcomes`;
        an alias is kept below for backward compatibility.
        """
        frac_change_range = np.linspace(-0.1, 0.1, n_steps_frac_change)
        frac_high_range = np.linspace(0, 0.1, n_steps_frac_high)
        frac_low_range = np.linspace(0, 0.1, n_steps_frac_low)
        self.all_possible_outcomes = np.array(
            list(
                itertools.product(frac_change_range, frac_high_range,
                                  frac_low_range)))

    # Backward-compat alias for the old misspelled method name.
    _compute_allall_possible_outcomes = _compute_all_possible_outcomes

    def json_data_for_outcome(self, day, outcome, score):
        """Build the [meta, data] JSON line pair for one scored outcome of
        `day` (a single-row DataFrame)."""
        rows = list()
        ticker = day['_source.ticker']
        date = day['_source.timestamp']
        vector = outcome
        # Renamed from `id` — shadowed the builtin.
        doc_id = "%s-%s-%s" % (ticker, date, vector)
        meta = {
            "index": {
                "_index": INDEX_NAME,
                "_type": TYPE_NAME,
                "_id": doc_id
            }
        }
        rows.append(json.dumps(meta))
        row = ObjDict()
        row.frac_change = outcome[0]
        row.frac_high_range = outcome[1]
        row.frac_low_range = outcome[2]
        open_price = day['_source.open'].values[0]
        predicted_close = open_price * (1 + outcome[0])
        expected_value = outcome[0] * score
        row.predicted_close = predicted_close
        row.expected_value = expected_value
        row.timestamp = day['_source.timestamp'].values[0]
        row.score = score
        row.ticker = day['_source.ticker'].values[0]
        rows.append(json.dumps(row))
        return rows

    def predict_outcomes(self):
        """Score every candidate outcome appended to the latency window and
        bulk-export the positive-scoring ones to ES."""
        print("predicting outcomes for: %s" %
              self.latest_data['_source.timestamp'].values[0])
        previous_testing_data = self.training_data.tail(
            self.n_latency_days).index
        if self.verbose:
            print("previous_testing_data %s" % previous_testing_data)
        test_data = self.training_data.iloc[previous_testing_data]
        if self.verbose:
            print("Using the following slice of data:")
            print("[%s]" % previous_testing_data)
            print(test_data)
        test_data_features = StockPredictor._extract_features(test_data)
        bulk_data = list()
        for possible_outcome in self.all_possible_outcomes:
            test_feature_vectors = np.row_stack(
                (test_data_features, possible_outcome))
            if self.verbose:
                print("Final test feature set:")
                print("[%s]" % test_feature_vectors)
            score = self.hmm.score(test_feature_vectors)
            # ignoring scores <= 0
            if score > 0:
                rows = self.json_data_for_outcome(self.latest_data,
                                                  possible_outcome, score)
                bulk_data.append(rows)
        # Flatten [meta, data] pairs into the newline-delimited bulk body.
        es_array = ""
        for row in bulk_data:
            es_array += row[0] + "\n" + row[1] + "\n"
        print("Exporting predictions to ES")
        if self.verbose:
            print(es_array)
        es.bulk(index=INDEX_NAME, body=es_array, refresh=True)