def kd_tree_classification(k, lenData, pctTest, params, neightboards):
    clear_csv()
    samples = []
    if params[0] == "PAIS":
        samples = generar_muestra_pais(lenData)
    else:
        samples = generar_muestra_provincia(lenData, params[1])
    quantity_for_testing = int(lenData * pctTest)
    normalizer = Normalizer()
    data = normalizer.prepare_data(samples, quantity_for_testing)
    kdTree = Kd_Tree(neightboards)
    firstRound = cross_validation(k, kdTree, data, lenData,
                                  "trainingFeatures", "testingFeatures", "First")
    secondRound = cross_validation(k, kdTree, data, lenData,
                                   "trainingFeatures", "testingFeatures", "Second")
    secondWithFirst = cross_validation(k, kdTree, data, lenData,
                                       "trainingFeaturesFirstInclude",
                                       "testingFeaturesFirstInclude", "Second")
    normalData = normalizer.get_normal_data()
    predictions = [firstRound, secondRound, secondWithFirst]
    show_accuracy("KD-TREE", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
def verify_address(address, results, borough):
    zips = Normalizer.select_zipcode_class(Normalizer.get_neighborhood(borough))
    for r in results:
        # Integer division keeps only the first three digits of the zip code.
        zip3dig = int(r[2]) // 100
        if zip3dig in zips:
            return r[0], r[1], address + ", " + r[2]
    return None
def desicion_tree(k, lenData, pctTest, params, threshold):
    clear_csv()
    samples = []
    if params[0] == "PAIS":
        samples = generar_muestra_pais(lenData)
    else:
        samples = generar_muestra_provincia(lenData, params[1])
    quantity_for_testing = int(lenData * pctTest)
    normalizer = Normalizer()
    data = normalizer.separate_data_2(samples, quantity_for_testing)
    decisionTree = DecisionTree(threshold)
    firstRound = cross_validation(k, decisionTree, data, lenData,
                                  "trainingFeaturesFirst", "testingFeaturesFirst", "First")
    secondRound = cross_validation(k, decisionTree, data, lenData,
                                   "trainingFeaturesSecond", "testingFeaturesSecond", "Second")
    secondWithFirst = cross_validation(k, decisionTree, data, lenData,
                                       "trainingFeaturesFirstInclude",
                                       "testingFeaturesFirstInclude", "Second")
    normalData = normalizer.get_normal_data()
    predictions = [firstRound, secondRound, secondWithFirst]
    show_accuracy("DT", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
def lr_classification(k, lenData, pctTest, l_regulizer=1):
    clear_csv()
    samples = generar_muestra_pais(lenData)
    quantity_for_testing = int(lenData * pctTest)
    normalizer = Normalizer()
    data = normalizer.prepare_data_tensor(samples, quantity_for_testing)
    lrClassifier = LogisticRegression(1, l_regulizer)
    firstRound = cross_validation(k, lrClassifier, data, lenData,
                                  "trainingFeatures", "testingFeatures", "First")
    lrClassifier = LogisticRegression(2, l_regulizer)
    print("First round done")
    secondRound = cross_validation(k, lrClassifier, data, lenData,
                                   "trainingFeatures", "testingFeatures", "Second")
    print("Second round done")
    secondWithFirst = cross_validation(k, lrClassifier, data, lenData,
                                       "trainingFeaturesFirstInclude",
                                       "testingFeaturesFirstInclude", "Second")
    print("Third round done")
    normalData = normalizer.get_normal_data()
    # predictions = [firstRound, secondRound, secondWithFirst]
    predictions = [secondRound]
    show_accuracy("LR", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
class Predicter(object):
    def __init__(self, thetafile='theta.csv', datafile='data.csv'):
        try:
            df = pd.read_csv(thetafile)
            # The theta file is expected to hold a single row with both coefficients.
            self.theta = [
                float(df['theta0'].iloc[0]),
                float(df['theta1'].iloc[0])
            ]
        except Exception:
            print("the theta file was not found")
            self.theta = None
        self.norm = Normalizer(datafile)

    def predict(self, km):
        """ Model function: f(x) = ax + b """
        return self.theta[0] + (self.theta[1] * km)

    def run(self, km):
        if self.theta is None:
            return None
        norm_km = self.norm.normalize_km(km)
        result = self.predict(norm_km)
        return self.norm.denormalize_price(result)
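# A minimal usage sketch for Predicter, assuming a theta.csv with theta0/theta1
# columns and a data.csv holding the km/price data expected by Normalizer;
# the 150000 km value below is illustrative only.
predicter = Predicter(thetafile='theta.csv', datafile='data.csv')
price = predicter.run(150000)  # None when theta.csv could not be read
print(price)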
def train_data_set():
    normalizer = Normalizer()
    data_set = []
    labels = []
    for i in range(0, 256, 8):
        n = normalizer.norm(int(random.uniform(0, 256)))
        data_set.append(n)
        labels.append(n)
    return labels, data_set
def svm_classification(k, lenData, pctTest, params, C=1, gamma=1, kernel="rbf"):
    clear_csv()
    samples = []
    print(params)
    if params[0] == "PAIS":
        samples = generar_muestra_pais(lenData)
    else:
        samples = generar_muestra_provincia(lenData, params[1])
    quantity_for_testing = int(lenData * pctTest)
    normalizer = Normalizer()
    data = normalizer.prepare_data(samples, quantity_for_testing)
    svmClassifier = SVMClassifier(kernel, C, gamma)
    firstRound = cross_validation(k, svmClassifier, data, lenData,
                                  "trainingFeatures", "testingFeatures", "First")
    secondRound = cross_validation(k, svmClassifier, data, lenData,
                                   "trainingFeatures", "testingFeatures", "Second")
    secondWithFirst = cross_validation(k, svmClassifier, data, lenData,
                                       "trainingFeaturesFirstInclude",
                                       "testingFeaturesFirstInclude", "Second")
    normalData = normalizer.get_normal_data()
    predictions = [firstRound, secondRound, secondWithFirst]
    show_accuracy("SVM", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
def preprocess(self):
    tuples = CsvManager.read(self.input)
    num = CsvManager.read_progress()
    print(num)
    if num == 0:
        CsvManager.write_geo_codes([], self.output)
        CsvManager.write_progress('0')
    self.progress.set_size(len(tuples))
    self.progress.update_progress(num)
    Normalizer.set_tuple(num, tuples)
    return tuples
def __init__(self, datafile, outfile, theta=None, learning_rate=0.1, train_range=1000):
    # Avoid a mutable default argument; [0, 0] is the intended starting theta.
    self.t_history = []
    self.theta = theta if theta is not None else [0, 0]
    self.datafile = datafile
    self.output = outfile
    self.data = pd.read_csv(datafile)
    self.learning_rate = learning_rate
    self.range = train_range
    norm = Normalizer(self.datafile)
    self.km = norm.normalize_km_list(self.data['km'])
    self.price = norm.normalize_price_list(self.data['price'])
def t(network, data):
    '''
    test the model with input data
    :param network:
    :param data:
    :return:
    '''
    normalizer = Normalizer()
    norm_data = normalizer.norm(data)
    predict_data = network.predict(norm_data)
    print('\ttestdata(%u)\tpredict(%u)' % (data, normalizer.denorm(predict_data)))
def writeEntryPointProfiles(outEntryPointHdl, entryPointName, agentTypes):
    epNumber = entryPointName.split("_")[0]
    outEntryPointHdl.write(indent1 + "\n<entrypoint ID=\"ep_" + epNumber +
                           "\" geotype=\"point\" type=\"dynamic\">\n")
    outEntryPointHdl.write(2 * indent2 + "<loader>entry_loader</loader>\n")
    outEntryPointHdl.write(2 * indent2 + "<geometry_id>" + epNumber + "</geometry_id>\n")
    outEntryPointHdl.write(2 * indent2 + "<profiles>\n")
    for agent in agentTypes:
        n = Normalizer()
        agent = n.normalize(agent)
        outEntryPointHdl.write(3 * indent2 + "<profile>\n")
        outEntryPointHdl.write(4 * indent2 +
                               "<agentDistribution pctofentries=\"100.0\" agent_type=\"agent_" +
                               agent + "\"/>\n")
        outEntryPointHdl.write(4 * indent2 + "<timetableReference ref=\"tt_" + agent +
                               "\" scaleFactor=\"1.0\"/>\n")
        outEntryPointHdl.write(3 * indent2 + "</profile>\n")
    outEntryPointHdl.write(2 * indent2 + "</profiles>\n")
    outEntryPointHdl.write(indent1 + "</entrypoint>\n")
def pruebas():
    # svm_classification(1000, 0.2, C=10, gamma=0.00833333333, kernel="rbf")
    lenData = 2500
    print(lenData)
    print("kernel: ", "sigmoid", " C: ", 1, " G: ", 0.000000001)
    pctTest = 0.2
    # samples = generar_muestra_provincia(lenData, "SAN JOSE")
    # quantity_for_testing = int(lenData*pctTest)
    # normalizer = Normalizer()
    # data = normalizer.prepare_data(samples, quantity_for_testing)
    # svm_classification(10, lenData, pctTest, C=1, gamma=1, kernel="rbf")
    time1 = time.time()
    for i in range(0, 30):
        samples = generar_muestra_pais(lenData)
        quantity_for_testing = int(lenData * pctTest)
        normalizer = Normalizer()
        data = normalizer.prepare_data(samples, quantity_for_testing)
        svm_classification(10, lenData, pctTest,
                           C=1, gamma=0.000000001, kernel="sigmoid")
    time2 = time.time()
    print("ms: ", ((time2 - time1) * 1000.0))
    totalacc = 0.0
    for i in range(0, len(accList), 3):
        totalacc += accList[i][1]
    print("ER: ", 1 - (totalacc / 30.0))
    totalacc = 0.0
    for i in range(1, len(accList), 3):
        totalacc += accList[i][1]
    print("ER: ", 1 - (totalacc / 30.0))
    totalacc = 0.0
    for i in range(2, len(accList), 3):
        totalacc += accList[i][1]
    print("ER: ", 1 - (totalacc / 30.0))
def main():
    test = [
        61.19499969, 57.31000137, 56.09249878, 61.72000122, 61.38000107,
        64.61000061, 61.93500137, 63.70249939, 63.57249832, 60.22750092,
        61.23249817, 60.35250092, 65.61750031, 64.85749817, 66.51750183,
        66.99749756, 68.3125, 71.76249695, 71.10749817, 71.67250061,
        70.69999695, 69.23249817, 67.09249878, 69.02500153, 68.75749969,
        70.74250031, 70.79250336, 69.64499664, 71.93250275, 73.44999695,
        72.26750183, 73.29000092, 74.38999939, 75.15750122, 75.93499756,
        77.53250122, 78.75250244, 77.85250092, 76.91249847, 77.38500214,
        76.92749786, 78.73999786, 78.28500366, 79.80750275, 79.21250153,
        79.72250366, 79.18250275, 79.52749634, 79.5625, 79.48500061,
        80.46250153, 80.83499908, 81.27999878, 80.58000183, 82.875,
        83.36499786, 85.99749756, 88.20999908, 83.97499847, 84.69999695,
        85.74749756, 88.01999664, 87.89749908, 87.93250275, 87.43000031,
        89.71749878, 91.63249969, 90.01499939, 91.20999908, 88.40750122,
        90.44499969, 91.19999695, 91.02749634, 91.02749634, 93.46250153,
        93.17250061, 95.34249878, 95.75250244, 95.91999817, 95.47750092,
        97.05750275, 97.72499847, 96.52249908, 96.32749939, 98.35749817,
        97.0, 97.27249908, 92.84500122, 92.61499786, 94.80999756,
        93.25250244, 95.04000092, 96.19000244, 106.26000214, 108.9375,
        109.66500092, 110.0625, 113.90249634, 111.11250305, 112.72750092
    ]
    test = np.array(test)
    test = np.reshape(test, (-1, 1))
    normalizer = Normalizer()
    normalized = normalizer.FeatureScaler.transform(test)
    print(normalized)
def _do_work(self, queue, bbnp, param, action_space, state_space, max_length,
             seed_id, num_workers, env, continuous_action, cma_param, index_start):
    start = time.time()
    np.random.seed(seed_id)
    if self.noise_type == 'Gaussian':
        noises = self.sigma * np.random.randn(num_workers, len(param))
        noisy_param = param + noises
    elif self.noise_type == 'Hadamard':
        h_size = 1 << ((max(num_workers, len(param)) - 1).bit_length())
        h = hadamard(h_size)
        noises = self.sigma * (h @ np.diag(np.random.choice([-1, 1], h_size)))[:num_workers, :len(param)]
        noisy_param = param + noises
    elif self.noise_type == 'CMA':
        noisy_param = cma_param
    elif self.noise_type == 'CDPP':
        noisy_param = self.buffer[index_start:index_start + num_workers]
    fitness = []
    anti_fitness = []
    worker_summary = {}
    for ind in noisy_param:
        # do the roll out
        if self.state_renormalize:
            normal = Normalizer(state_space)
            ind_fit = bbnp.roll_out(ind, env, self.env, normal,
                                    render=False, state_renormalize=True)
            normal = Normalizer(state_space)
            ind_fit_anti = bbnp.roll_out(-ind, env, self.env, normal,
                                         render=False, state_renormalize=True)
        else:
            normal = Normalizer(state_space)
            ind_fit = bbnp.roll_out(ind, env, self.env, normal, render=False, init=False)
            ind_fit_anti = bbnp.roll_out(-ind, env, self.env, normal, render=False, init=False)
        fitness.append(ind_fit)
        anti_fitness.append(ind_fit_anti)
    end = time.time()
    worker_summary['fit'] = fitness
    worker_summary['anti_fit'] = anti_fitness
    worker_summary['seed_id'] = seed_id
    queue.put(worker_summary)
def fix_acris(self):
    tuples = self.preprocess()
    real_estates = []
    while tuples:
        try:
            t = tuples.pop(0)
            bbl = Normalizer.set_bbl(t[0], t[1], t[2])
            address = t[3] + " " + t[4]
            address = Normalizer.set_address(address, bbl)
            date = Normalizer.set_str_to_epoch(t[5])
            price = t[6]
            real_estates.append((bbl, address, date, price))
        except ValueError:
            self.error_log.open()
            self.error_log.write(t[1] + ", " + str(t[0]))
            self.error_log.close()
        except KeyboardInterrupt:
            print("")
            print("Stopped")
            # Save what has been collected so far.
            CsvManager.append_geo_codes(real_estates, self.output)
    CsvManager.append_geo_codes(real_estates, self.output)
def testLogProbabilities(self):
    probs = numpy.asarray([
        [0.01, 0.04],
        [0.1, 0.4],
        [0.4, 0.1],
        [0.00004, 0.00001]])
    expectedProbs = numpy.asarray([
        [0.2, 0.8],
        [0.2, 0.8],
        [0.8, 0.2],
        [0.8, 0.2]])
    logProbs = numpy.log(probs)
    logExpectedProbs = numpy.log(expectedProbs)
    target = Normalizer()
    normalizedLogProbs = target.normalizeLogProbabilities(logProbs)
    normalizedProbs = numpy.exp(normalizedLogProbs)
    numpy.testing.assert_array_almost_equal(normalizedProbs, expectedProbs)
    numpy.testing.assert_array_almost_equal(normalizedLogProbs, logExpectedProbs)
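# Sketch (not this project's implementation): the test above suggests that
# normalizeLogProbabilities rescales each row so the probabilities sum to 1
# while staying in log space, i.e. the standard log-sum-exp normalization below.
import numpy


def normalize_log_probabilities(log_probs):
    # Computes log(p_i / sum_j p_j) per row, using the max trick for stability.
    max_per_row = numpy.max(log_probs, axis=1, keepdims=True)
    log_sum = max_per_row + numpy.log(
        numpy.sum(numpy.exp(log_probs - max_per_row), axis=1, keepdims=True))
    return log_probs - log_sum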
def normalizeTokens(self):
    '''Normalizes each token in the instance and stores the normalized
    tokens in the token dictionary'''
    normalizedTokenDictionary = {}
    normalizedTokenId = 1
    for tokenObject in self.tokenDictionary.values():
        normalizedTokenObjects = Normalizer.getInstance().normalizeToken(tokenObject)
        for normalizedTokenObject in normalizedTokenObjects:
            normalizedTokenDictionary[normalizedTokenId] = normalizedTokenObject
            normalizedTokenId += 1
    return normalizedTokenDictionary
def _get_block_dict(self):
    self._block_dict = {}
    for section in self._config.sections():
        self._block_dict[section] = []
        for option in self._config[section]:
            if option not in ['url_base', 'query_interval', 'tmp', 'output', 'iterations']:
                normalized_option = Normalizer.normalize_name(option)
                output_xml_file = self._output + '/' + section + '/' + normalized_option + ".xml"
                self._block_dict[section].append((option, output_xml_file))
def _process_article(self, doc_key, article):
    normalizer = Normalizer()
    all_terms = []
    for i in article.findall('*'):
        if 'encoded' in i.tag or 'description' in i.tag:
            if i.text:
                i = BeautifulSoup(i.text, "lxml")
            if i.text is not None:
                for y in i.text.split():
                    all_terms.append(y)
    cleaned_terms = []
    for i in all_terms:
        normalized = normalizer.normalize_name(i).strip()
        if (not len(normalized) < 3 and not normalizer.is_stop_word(normalized)
                and not normalizer.is_link(normalized)):
            cleaned_terms.append(normalized)
    for term in cleaned_terms:
        stemmed_term = self._stemmer.stemWord(term)
        term_id = self._get_or_create_term_id(stemmed_term)
        self._ii_dict.setdefault(term_id, set())
        self._ii_dict[term_id].add(self._document_dict.get(doc_key))
def __init__(self):
    self.NaiveBayesClassifier = NaiveBayesClassifier()
    # Sentence Splitters
    self.RuleBasedSentenceSplitter = RuleBasedSentenceSplitter()
    self.MLBasedSentenceSplitter = MLBasedSentenceSplitter()
    # Tokenizers
    self.RuleBasedTokenizer = RuleBasedTokenizer()
    self.MLBasedTokenizer = MLBasedTokenizer()
    # Normalizer
    self.Normalizer = Normalizer()
    # Stemmer
    self.Stemmer = Stemmer()
    # Stopword Eliminators
    self.StaticStopWordEliminator = StaticStopwordRemover()
    self.DynamicStopWordEliminator = DynamicStopWordEliminator()
def search_lat_long(self):
    tuples = self.preprocess()
    count = 1
    nominatim, google, opencage, bing, tiger = self.build_geocodings()
    while tuples:
        t = tuples.pop(0)
        status, found = self.geocode_process(t, nominatim)
        if not found:
            if status == -1:
                status, found = self.geocode_process(t, bing)
                if not found and status == -1:
                    self.geocode_process(t, tiger)
            elif status == -2:
                i = 1
                while i < 3:
                    print("Waiting 45' for the " + Normalizer.set_order(str(i)) + " time")
                    time.sleep(2700)
                    status, found = self.geocode_process(t, nominatim)
                    if found:
                        continue
                    elif status == -2:
                        i += 1
                    elif status == -3:
                        return
        if count % 100 == 0:
            for i in range(3):
                t = tuples.pop(0)
                status, found = self.geocode_process(t, google)
                time.sleep(3)
                if not found:
                    self.geocode_process(t, opencage)
                    time.sleep(3)
        else:
            t = tuples.pop(0)
            self.geocode_process(t, opencage)
            time.sleep(3)
        count += 1
def _apply(self):
    '''
    Apply transformations set
    ...
    '''
    if 'filter' not in self.config:
        raise ValueError('Config map should include "filter."')
    opt = DataParserOpt(**self.config['filter'])
    dr = DataReader(self.map_file, self.data, opt=opt)
    A, y = dr.extract_features()
    if 'sample' in self.config:
        try:
            method = self.config['sample']['method']
            method_args = self.config['sample']['method_args']
            ds = DataSampler(A, y, method, **method_args)
        except Exception as e:
            ds = DataSampler(A, y)
            print('{} or\nSample structure has no method_args'.format(e))
        A, y = ds.sample()
    if 'normalize' in self.config:
        nml = Normalizer(map_file=self.map_file)
        for method_str in self.config['normalize']['methods']:
            method = getattr(nml, method_str)
            A = method(A)
    return A, y
def test_distance(self):
    normalizer = Normalizer()
    self.assertEqual(distance([0, 1, 2], [3, 4, 5]), 5.196152422706632)
print("Accuracy Training:", accuracy.eval({ X: train_x, y: train_y })) ## sess = tf.Session() ## with sess.as_default(): ## return self.toparty(self.y.eval({self.X: test_x, self.y: test_y}).tolist()) ## samples = generar_muestra_pais(100) quantity_for_testing = int(100 * 0.2) normalizer = Normalizer() data = normalizer.prepare_data(samples, quantity_for_testing) classes = np.append(data["trainingClassesFirst"], data["testingClassesFirst"], axis=0) sample = { "trainingFeatures": data["trainingFeatures"], "trainingClasses": data["trainingClassesFirst"], "testingFeatures": data["testingFeatures"], "testingClasses": data["testingClassesFirst"] } sample2 = { "testingFeatures": data["testingFeatures"], "testingClasses": data["testingClassesFirst"] } print(sample2["testingClasses"])
    # training starts here
    _, c = sess.run([optimizer, cost],
                    feed_dict={self.X: train_x, self.y: train_y})
    cost_in_each_epoch += c
    ## # you can uncomment next two lines of code for printing cost when training
    ## if (epoch+1) % display_step == 0:
    ##     print("Epoch: {}".format(epoch + 1), "cost={}".format(cost_in_each_epoch))

print("Accuracy Training:", accuracy.eval({X: train_x, y: train_y}))

## sess = tf.Session()
## with sess.as_default():
##     return self.toparty(self.y.eval({self.X: test_x, self.y: test_y}).tolist())
##

samples = generar_muestra_pais(100)
quantity_for_testing = int(100 * 0.2)
normalizer = Normalizer()
data = normalizer.prepare_data(samples, quantity_for_testing)
classes = np.append(
    data["trainingClassesFirst"],
    data["testingClassesFirst"],
    axis=0
)
sample = {
    "trainingFeatures": data["trainingFeatures"],
    "trainingClasses": data["trainingClassesFirst"],
    "testingFeatures": data["testingFeatures"],
    "testingClasses": data["testingClassesFirst"]
}
sample2 = {
    "testingFeatures": data["testingFeatures"],
    "testingClasses": data["testingClassesFirst"]
}
print(sample2["testingClasses"])
prueba = logistic_regression_classifier(1, classes)
prueba.train(sample)
print(prueba.classify(sample2))
def train(self): #need to put into function if not os.path.exists("./" + param_dir): os.mkdir(param_dir) for test in range(self.num_test): fit_list = [] iteration_list = [] plt.ion() plt.show() #set up environment env = gym.make(self.env) if self.continuous_action == True: action_space = env.action_space.shape[0] else: action_space = env.action_space.n state_space = env.observation_space.low.size param_dict = {} #get param for the len best_reward = 0 bbnp = bb_numpy(action_space,state_space,self.max_length,continuous_action = self.continuous_action,state_renormalize = self.state_renormalize) state = np.zeros(state_space)[None,...] bbnp.forward_propagate(state,init = True) #initialize param param = np.array(bbnp.get_flat_param()) SGD_ = SGD(param, self.lr) cma_es = cma.CMAEvolutionStrategy(param,self.sigma,{'popsize': self.num_perturbations,}) for iteration in range(self.iterations): ts = time.time() cma_param = np.array(cma_es.ask()) if self.method_type == 'CDPP': if iteration == 0: X = np.random.randn(self.num_perturbations*3,len(param)) cond_indices = cond_kdpp([], X, k = self.num_perturbations) self.buffer = X[cond_indices] else: dists = np.linalg.norm(self.buffer-param,axis=1) num_closest= int(self.perc_reuse*self.num_perturbations) closest_indices = dists.argsort()[:num_closest] reused_samples = self.buffer[closest_indices] X = np.random.randn(self.num_perturbations*3,len(param)) + param cond_indices = cond_kdpp(self.buffer[closest_indices],X,k=(self.num_perturbations-num_closest)) self.buffer = np.vstack((self.buffer[closest_indices],X[cond_indices])) all_indices = [] queue = Queue() num_workers = [] num_perts = self.num_perturbations shared_amt = self.num_perturbations//self.num_cpu if self.noise_type == 'CDPP' and iteration!=0: num_perts = self.num_perturbations - int(self.perc_reuse*self.num_perturbations) shared_amt = (self.num_perturbations - int(self.perc_reuse*self.num_perturbations))//self.num_cpu while num_perts > shared_amt: num_perts -= shared_amt num_workers.append(shared_amt) num_workers.append(num_perts) start_indices = [0] # print('num_workers=',num_workers) for i in range(1,len(num_workers)): start_indices.append(start_indices[i-1]+num_workers[i-1]) cma_param_slicer = [0] cma_param_slicer.extend(num_workers) cma_param_slicer = np.cumsum(cma_param_slicer) seed_id = np.random.randint(np.iinfo(np.int32(10)).max, size=len(num_workers)) workers = [Process(target = self._do_work,args = (queue,bbnp,param,action_space,state_space,self.max_length,seed_id[i],num_workers[i],env,self.continuous_action,cma_param[cma_param_slicer[i]:cma_param_slicer[i+1],:],start_indices[i])) for i in range(len(seed_id))] for worker in workers: worker.start() results = [queue.get() for p in workers] # Swapping this with the above line so deadlock is avoided for worker in workers: worker.join() if self.noise_type == 'CDPP': if iteration != 0: old_pert_fitness = pert_fitness[closest_indices] old_anti_pert_fitness = anti_pert_fitness[closest_indices] pert_fitness,anti_pert_fitness,seed_id = get_info_summary(results) pert_fitness = np.array(pert_fitness)[...,None] anti_pert_fitness = np.array(anti_pert_fitness)[...,None] if self.noise_type == 'CDPP': if iteration != 0: pert_fitness = np.vstack((old_pert_fitness,pert_fitness)) anti_pert_fitness = np.vstack((old_anti_pert_fitness,anti_pert_fitness)) if self.noise_type in ['Gaussian','Hadamard']: noises = get_noise_matrices(seed_id,num_workers,len(param),self.sigma,self.noise_type) #record average_fit average_fit = 
np.sum(pert_fitness)/self.num_perturbations fit_list.append(average_fit) iteration_list.append(iteration) #dynamic plot the graph plt.plot(iteration_list,fit_list,'r') plt.draw() plt.pause(0.3) #Ranking best if self.method_type == 'Rank': top_ind = np.sort(np.argsort(pert_fitness,axis =0)[-self.best:][::-1],axis = 0).flatten() pert_fitness = pert_fitness[top_ind] gradient = (1 / len(top_ind) / self.sigma * (noises[top_ind,:].T@pert_fitness)).flatten() SGD_gradient = SGD_.get_gradients(gradient) param = param + SGD_gradient print('param',param) #Vanilla elif self.method_type == 'Vanilla': gradient = (1 / self.num_perturbations / self.sigma * (noises.T@pert_fitness)).flatten() SGD_gradient = SGD_.get_gradients(gradient) param = param + SGD_gradient #CMA elif self.method_type == 'CMA': cma_es.tell(cma_param,-pert_fitness[:,0] - compute_weight_decay(0.01,cma_param)) param = cma_es.result[5] # mean of all perturbations - for render and save - not used to update new space #ARS elif self.method_type == 'ARS': fb_fitness = np.hstack((pert_fitness,anti_pert_fitness)) top_ind = (np.argsort(np.max(fb_fitness,axis = 1,keepdims = True),axis = 0)[-self.best:][::-1]).flatten() fit_diff = pert_fitness - anti_pert_fitness reward_noise = np.std(np.vstack((pert_fitness,anti_pert_fitness))) fit_diff = fit_diff[top_ind] gradient = (1 / len(top_ind) / self.sigma/reward_noise * (noises[top_ind,:].T@fit_diff)).flatten() SGD_gradient = SGD_.get_gradients(gradient) param = param + SGD_gradient #CDPP elif self.method_type == 'CDPP': cond_noise = self.buffer - param if iteration != 0: noises = np.vstack((reused_samples,cond_noise)) else: noises = cond_noise top_ind = np.sort(np.argsort(pert_fitness,axis =0)[-self.best:][::-1],axis = 0).flatten() best_pert_fitness = pert_fitness[top_ind] gradient = (1 / self.num_perturbations / self.sigma * ((self.buffer - param)[top_ind,:].T@best_pert_fitness)).flatten() param = param + self.lr * gradient if iteration % self.video_save_interval == 0 and iteration !=0: normal = Normalizer(state_space) video_env = gym.wrappers.Monitor(env, './videos/' + str(self.env) + '/'+ str(self.method_type) + '_perturbations_' + str(self.num_perturbations) + '_' +'state_renormalize_' + str(self.state_renormalize)+ '_Simga_' + str(self.sigma) + "_best_" + str(self.best) + "_iter_" +str(iteration) + "_test_" + str(self.test)) bbnp.roll_out(param,video_env,self.env,normal,render = True) print("-" * 100) te = time.time() #print the results print('iteration: {} | average_fit: {} | # params: {} | time: {:2.2f}s'.format(iteration,average_fit,len(param),te-ts)) self.save_param(path,name,param,average_fit,iteration)
class MLP: """ This is the class used for processing the data and creating a Multi layered perceptron. ... Attributes ---------- layers : list of the class Layer a list of the layers included in the MLP activations_functions : list of the class Activation a list of the activations included in the MLP loss_function : Loss the loss function defined for the MLP normalizer : Normalizer a normalizer class to normalize the data used in the MLP errors : float [] list for the average error for each training epoch in the MLP predict_errors : float a float for the average error of all data rows in predict Methods ------- add_layer(self, size, activation_function) adds a layer to the MLP _backprop(self, d_loss, learning_rate) backpropagates through the MLP train(self, x, y, learning_rate=0.01, n_epochs=10) method for training the MLP predict(self, x, y) method for predicting given data plot(self, dataset_name, nr_epochs, y_test, y_pred, train_time, pred_time) method for plotting result from MLP """ def __init__(self, loss_function): """ Parameters ---------- loss_function : SquaredErrorLoss loss function for the MLP """ self.layers = [] self.activations_functions = [] self.loss_function = loss_function self.normalizer = Normalizer() # All errors, returned in end for plotting self.errors = [] self.predict_errors = 0 def add_layer(self, size, activation_function): """Adds a layer to the MLP. Parameters ---------- size : int The size of the layer to be added activation_function : Activation The activation function for the layer to be added """ n_inputs = size if self.layers: n_inputs = self.layers[-1].n_nodes self.layers.append(Layer(size, n_inputs)) self.activations_functions.append(activation_function) def _backprop(self, d_loss, learning_rate): """Backpropagates through the MLP. Parameters ---------- d_loss : numpy array The derivated loss learning_rate : float The rate at which the MLP learns """ for i in range(len(self.layers) - 1, 1, -1): input_data = self.layers[i].get_output() loss = self.activations_functions[i].backward(input_data).reshape( len(input_data), -1) * d_loss d_loss = self.layers[i].backprop(loss, learning_rate) def train(self, x, y, learning_rate=0.01, n_epochs=10): """Backpropagates through the MLP. Parameters ---------- n_epochs : int The number of epochs the MLP will run x : floats [][] The input data for training the MLP y : floats [][] The target data for each training data learning_rate : float The rate at which the MLP learns """ # Normalize data self.normalizer.fit(x, y) x, y = self.normalizer.normalize(x, y) # Main loop, handles forward and backprop for _ in range(n_epochs): output = x for j, layer in enumerate(self.layers): # Activation forward, sets the input and output for each layers output = self.activations_functions[j].forward( layer.forward(output)) layer.set_output(output) # Current error for the epoch is saved error = (np.average(self.loss_function.forward(output, y))) self.errors.append(error) # Get the derivative loss from the output node d_loss = self.loss_function.backward(output, y) # Backprop self._backprop(d_loss, learning_rate) def predict(self, x, y): """Predicts a given input with the MLP. 
Parameters ---------- x : floats [][] The input data for training the MLP y : floats [][] The target data for each training data """ # Normalize data x, y = self.normalizer.normalize(x, y) output = x for j, layer in enumerate(self.layers): output = self.activations_functions[j].forward( layer.forward(output)) self.predict_errors = (np.average(self.loss_function.forward( output, y))) return self.normalizer.renormalize(output) def plot(self, dataset_name, nr_epochs, y_test, y_pred, train_time, pred_time): """Plots the results from a dataset trained and predicted with the MLP. Parameters ---------- dataset_name : str The name of the dataset being plot nr_epochs : int The number of epochs the MLP has been trained y_test : floats [][] The target data for each training data y_pred : floats [][] The predicted result from the MLP train_time : float The elapsed time of the MLP training on the dataset pred_time : float The elapsed time of the MLP prediction on the dataset """ xy_max = max(max(y_pred), max(y_test)) xy_min = min(min(y_pred), min(y_test)) plt.figure(figsize=(10, 6)) plt.suptitle("Data: {}".format(dataset_name)) plt.subplot(121) plt.scatter(np.arange(1, nr_epochs + 1), self.errors, label='loss') plt.title("Average Loss by epoch") plt.xlabel('Epochs') plt.ylabel('Loss') pred_info = "Total pred time: {:.6f}\n".format(pred_time) train_info = "Total train time: {:.2f}\n".format(train_time) average_loss = "Pred MSE: {:.2f}\n".format( np.average(self.predict_errors)) epochs = "Number of epochs: {}".format(nr_epochs) text = pred_info + train_info + average_loss + epochs plt.annotate(text, xy=(1, 1), xytext=(-15, -15), fontsize=10, xycoords='axes fraction', textcoords='offset points', bbox=dict(facecolor='white', alpha=0.8), horizontalalignment='right', verticalalignment='top') plt.subplot(122) plt.scatter(y_test, y_pred) plt.xlim(xy_min, xy_max) plt.ylim(xy_min, xy_max) plt.xlabel("Target") plt.ylabel("Predicted") plt.title("Actual Y vs Predicted ") plt.show()
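# A minimal usage sketch for the MLP class above; SquaredErrorLoss and Sigmoid
# are assumed to be this project's loss/activation classes (names not confirmed),
# and the random regression data below is purely illustrative.
import numpy as np

x_train = np.random.rand(100, 8)
y_train = np.sum(x_train, axis=1, keepdims=True)

mlp = MLP(SquaredErrorLoss())
mlp.add_layer(8, Sigmoid())   # first layer sized to the 8 input features
mlp.add_layer(4, Sigmoid())
mlp.add_layer(1, Sigmoid())
mlp.train(x_train, y_train, learning_rate=0.01, n_epochs=100)
y_pred = mlp.predict(x_train, y_train)  # predict also records its average loss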
while line:
    strLine = line.split(delim)
    try:
        entryPointNumber = int(strLine[0])
        epName = genericAgentTypeName(strLine[2])
        activityEntryPointName = epName + "_" + strLine[0].replace(" ", "_") + "_" + strLine[1].replace(" ", "_")
        entryPointName = strLine[0].replace(" ", "_") + "_" + strLine[1].replace(" ", "_")
        totalEntries = int(strLine[16])
        if totalEntries > 0:
            ## Generating list of agenttypes over entry points
            if entryPointName not in agentTypesAtEntryPoints.keys():
                agentTypesAtEntryPoints[entryPointName] = []
            agentTypesAtEntryPoints[entryPointName].append(activityEntryPointName)
            ## Writing Time Tables
            n = Normalizer()
            timetableHeader = "\n<timetableData ID=\"tt_" + epName + "_" + n.normalize(entryPointName) + "\">"
            timetableNote = "<!-- Time Table for Entrypoint ID: " + str(entryPointNumber) + " -->"
            timetableFooter = "</timetableData>"
            outTTHdl.write(indent1 + timetableHeader + "\n")
            outTTHdl.write(indent2 + timetableNote + "\n")
            time1 = 0
            timeIncrement = 2
            for i in range(3, 15):
                try:
                    pctCount = int(strLine[i])
                except (ValueError, IndexError):
                    pctCount = 0
                time2 = time1 + timeIncrement
                count = int(round(totalEntries * (pctCount / 100.0)))
class LDA(object): def __init__(self, vectorizer="tfidf", stopwords_path=None): self.vectorizer = vectorizer self.stopwords_path = stopwords_path self.normalizer = Normalizer(self.stopwords_path) self.normalizer.load_stopwords() @print_run_time def tranform_corpora(self, corpora_dir, corpora_path, id2word_path): """转化语料 1. 从{corpora_dir}文件夹下提取所有.txt文件作为语料 2. 文件总每一行经过预处理后作为一行,存入{corpora_path}文件 3. 保存id2word到{id2word_path}文件 """ self.corpora_dir = corpora_dir self.corpora_path = corpora_path self.id2word_path = id2word_path self._transform_corpora(self.normalizer, self.corpora_dir, self.corpora_path, self.id2word_path) @print_run_time def train_lda(self, model_dir, model_fname, num_topics): self.model_dir = model_dir self.model_fname = model_fname self.num_topics = num_topics self.model = self._train_lda(self.vectorizer, self.corpora_path, self.id2word_path, self.model_dir, self.model_fname, self.num_topics) self.model_path = os.path.join(self.model_dir, self.vectorizer, self.model_fname) @staticmethod def _transform_corpora(normalizer, corpora_dir, corpora_path, id2word_path): """转化语料 1. 从{corpora_dir}文件夹下提取所有.txt文件作为语料 2. 文件总每一行经过预处理后作为一行,存入{corpora_path}文件 3. 保存id2word到{id2word_path}文件 Args: corpora_dir(path) :- 语料文件所在的文件夹 corpora_path(path) :- 汇总所有语料的.txt文件 id2word_path(path) :- gensim的字典文件 """ corpora = [] if not os.path.isdir(corpora_dir): raise OSError(corpora_dir, "doesn't exist") if not os.path.isdir(os.path.dirname(corpora_path)): raise OSError(os.path.dirname(corpora_path), " doesn't exist") if not os.path.isdir(os.path.dirname(os.path.dirname(id2word_path))): raise OSError("the grandparent directory of ", id2word_path, " doesnt't exist") output_tfidf = open(corpora_path, 'a', encoding="utf8") for file in os.listdir(corpora_dir): if file.endswith('txt'): file = os.path.join(corpora_dir, file) print(file+' read') with open(file, encoding="utf8") as f: lines = f.readlines() for line in lines: words = normalizer.tokenize(line) if len(words) > 0: corpora.append(words) output_tfidf.write('{}\n'.format(" ".join(words))) f.close() output_tfidf.close() id2word = gensim.corpora.Dictionary(corpora) parent_dir = os.path.dirname(id2word_path) make_dir(parent_dir) if not os.path.isfile(id2word_path): id2word.save(id2word_path) print('id2word saved') else: print(id2word_path, ' already exists') @staticmethod def _train_lda(vectorizer, corpora_path, id2word_path, model_dir, model_fname=model_fname, num_topics=10): """训练和保存基于tfidf的lda模型 基于{corpora_path}文件保存的语料和{id2word_path}保存的gensim字典来训练lda_tfidf模型, 保存该模型到{model_dir}文件夹下 Args: vectorizer(str) :- 向量化方法, choices=["bow", "tfidf"] corpora_path(path) :- 保存语料的.txt文件 id2word_path(path) :- 保存gensim字典的文件 model_dir(path) :- 保存gensim LDA模型的文件夹 model_fname(path) :- 模型文件名 num_topics(int) :- lda的超参,主题数 """ try: assert vectorizer in ["bow", "tfidf"] except AssertionError: raise AssertionError("vectorizer must be bow or tfidf") if not os.path.isdir(model_dir): raise OSError(model_dir, "doesn't exist") corpora = [] with open(corpora_path, 'r', encoding="utf8") as fp: lines = fp.readlines() for line in lines: corpora.append(line.strip()) id2word = gensim.corpora.Dictionary.load(id2word_path) corpus = [id2word.doc2bow(corpus.split(" ")) for corpus in corpora] # tfidf的话需要计算idf if vectorizer == "tfidf": MmCorpus.serialize(corpus_tfidf_mm, corpus) corpus = MmCorpus(corpus_tfidf_mm) model = gensim.models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics) model_path = os.path.join(model_dir, vectorizer) make_dir(model_path) model_path = 
os.path.join(model_path, model_fname) if not os.path.isfile(model_path): model.save(model_path) print('model saved') else: print(f"{model_path} already exists") return model @staticmethod def analysis_topics(fname): """将各个主题的关键字打印出来 把self.model.print_topics(10)保存到fname后打印出来 """ f = open(fname, 'r') lines = f.readlines() for line in lines: print(re.findall(r'\"([^\"]*)\"', line)) @staticmethod def _short_long_similarity(model_path, normalizer, id2word_path, short, long): """计算长短文本的相似度 Args: model_path(path) :- gensim.models.ldamodel的保存路径 id2word_path(path) :- gensim.corpora.Dictionary的保存路径 short(str) :- 短文本 long(str) :- 长文本 Returns: prob(float) :- 长短文本的匹配度 theta(iterables) :- 长文本在lda模型下的主题分布概率, 每个元素为(主题的序号, 对应主题的概率) """ lda = gensim.models.LdaModel.load(model_path) id2word = gensim.corpora.Dictionary.load(id2word_path) theta = lda[id2word.doc2bow(normalizer.tokenize(long))] short = normalizer.tokenize(short) short = set(short) short = id2word.doc2idx(short) prob = 0 for word in short: prob_w = sum([lda.expElogbeta[k][word]*1000 * p_zk for (k, p_zk) in theta]) prob += math.log(prob_w) prob = prob/len(short) prob -= math.log(1000) return prob, theta def short_long_sim(self, short, long): """用self.model计算长短文本相似度 """ return self._short_long_similarity(self.model_path, self.normalizer, self.id2word_path, short, long)
def GetJointLogProbability(logProbabilities, labels, ids):
    modelAggregator = ModelAggregator()
    normalizer = Normalizer()
    unormalizedLogProbs, labels, ids = modelAggregator.Aggregate(logProbabilities, labels, ids)
    return (normalizer.normalizeLogProbabilities(unormalizedLogProbs), labels, ids)
def __init__(self, vectorizer="tfidf", stopWordsFile=None): self.vectorizer = vectorizer self.stopWordsFile = stopWordsFile self.normalizer = Normalizer(self.stopWordsFile) self.normalizer.load_stopWords()
class LDA(): def __init__(self, vectorizer="tfidf", stopWordsFile=None): self.vectorizer = vectorizer self.stopWordsFile = stopWordsFile self.normalizer = Normalizer(self.stopWordsFile) self.normalizer.load_stopWords() @print_run_time def create_corporaListAndCorporaText(self, corpora_source, corpora_txt, id2word_fname): self.corpora_source = corpora_source self.corpora_txt = corpora_txt self.id2word_fname = id2word_fname self._create_corporaListAndCorporaText(self.normalizer, self.corpora_source, self.corpora_txt, self.id2word_fname) @print_run_time def createAndSave_lda(self, ldaModel_save_repo, num_topics): self.ldaModel_save_repo = ldaModel_save_repo self.num_topics = num_topics if self.vectorizer == "tfidf": self.model = self._createAndSave_lda_tfidf(self.corpora_txt, self.id2word_fname, self.ldaModel_save_repo, self.num_topics) self.model_fname = self.ldaModel_save_repo + '/gensim_tfidf/crawl_news.model' else: self.model = self._createAndSave_lda_bow(self.corpora_txt, self.id2word_fname, self.ldaModel_save_repo, self.num_topics) self.model_fname = self.ldaModel_save_repo + '/gensim_bow/crawl_news.model' @staticmethod def _create_corporaListAndCorporaText(normalizer, corpora_source, corpora_txt, id2word_fname): ''' 从{corpora_source}文件夹下提取所有.txt文件作为语料 文件总每一行经过预处理后作为一行,存入{corpora_txt}文件 并保存id2word到{id2word_fname}文件 Args: corpora_source(path) :- 语料文件所在的文件夹 corpora_txt(path) :- 汇总所有语料的.txt文件 id2word_fname(path) :- gensim的字典文件 ''' corpora = [] if not os.path.isdir(corpora_source): raise OSError(corpora_source, "doesn't exist") if not os.path.isdir(os.path.dirname(corpora_txt)): raise OSError(os.path.dirname(corpora_txt), " doesn't exist") if not os.path.isdir(os.path.dirname(os.path.dirname(id2word_fname))): raise OSError("the grandparent directory of ", id2word_fname, " doesnt't exist") output_tfidf = open(corpora_txt, 'a', encoding="utf8") for file in os.listdir(corpora_source): if file.endswith('txt'): file = os.path.join(corpora_source, file) print(file + ' read') with open(file, encoding="utf8") as f: lines = f.readlines() for line in lines: words = normalizer.tokenize(line) if len(words) > 0: corpora.append(words) output_tfidf.write('{}\n'.format(" ".join(words))) f.close() output_tfidf.close() id2word = gensim.corpora.Dictionary(corpora) parent_dir = os.path.dirname(id2word_fname) make_dir(parent_dir) if not os.path.isfile(id2word_fname): id2word.save(id2word_fname) print('id2word saved') else: print(id2word_fname, ' already exists') @staticmethod def _createAndSave_lda_bow(corpora_txt, id2word_fname, ldaModel_save_repo, num_topics=10): ''' 训练和保存基于bow的lda模型 基于{corpora_txt}文件保存的语料和{id2word_fname}保存的gensim字典来训练lda_bow模型, 主题数为{num_topics} 保存该模型到{ldaModel_save_repo}文件夹下 Args: corpora_txt(path) :- 保存语料的.txt文件 id2word_fname(path) :- 保存gensim字典的文件 ldaModel_save_repo(path) :- 保存gensim LDA模型的文件夹 num_topics(int) :- lda的超参,主题数 ''' if not os.path.isdir(ldaModel_save_repo): raise OSError(ldaModel_save_repo, "doesn't exist") corpora = [] with open(corpora_txt, 'r', encoding="utf8") as fp: lines = fp.readlines() for line in lines: corpora.append(line.strip()) id2word = gensim.corpora.Dictionary.load(id2word_fname) corpus = [id2word.doc2bow(corpus.split(" ")) for corpus in corpora] lda_bow = gensim.models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics) make_dir(ldaModel_save_repo + '/gensim_bow') if not os.path.isfile(ldaModel_save_repo + '/gensim_bow/crawl_news.model'): lda_bow.save(ldaModel_save_repo + '/gensim_bow/crawl_news.model') print('lda_bow saved') else: 
print(ldaModel_save_repo, '/gensim_bow/crawl_news.model already exists') return lda_bow @staticmethod def _createAndSave_lda_tfidf(corpora_txt, id2word_fname, ldaModel_save_repo, num_topics=10): ''' 训练和保存基于tfidf的lda模型 基于{corpora_txt}文件保存的语料和{id2word_fname}保存的gensim字典来训练lda_tfidf模型, 主题数为{num_topics} 保存该模型到{ldaModel_save_repo}文件夹下 Args: corpora_txt(path) :- 保存语料的.txt文件 id2word_fname(path) :- 保存gensim字典的文件 ldaModel_save_repo(path) :- 保存gensim LDA模型的文件夹 num_topics(int) :- lda的超参,主题数 ''' if not os.path.isdir(ldaModel_save_repo): raise OSError(ldaModel_save_repo, "doesn't exist") corpora = [] with open(corpora_txt, 'r', encoding="utf8") as fp: lines = fp.readlines() for line in lines: corpora.append(line.strip()) id2word = gensim.corpora.Dictionary.load(id2word_fname) MmCorpus.serialize( 'corpus_tfidf.mm', [id2word.doc2bow(corpus.split(" ")) for corpus in corpora]) mm = MmCorpus('corpus_tfidf.mm') lda_tfidf = gensim.models.LdaModel(corpus=mm, id2word=id2word, num_topics=num_topics) make_dir(ldaModel_save_repo + '/gensim_tfidf') if not os.path.isfile(ldaModel_save_repo + '/gensim_tfidf/crawl_news.model'): lda_tfidf.save(ldaModel_save_repo + '/gensim_tfidf/crawl_news.model') print('lda_tfidf saved') else: print(ldaModel_save_repo, '/gensim_tfidf/crawl_news.model already exists') return lda_tfidf @staticmethod def analysis_topics(fname): '''将各个主题的关键字打印出来 把self.model.print_topics(10)保存到fname后打印出来 ''' f = open(fname, 'r') lines = f.readlines() for line in lines: print(re.findall(r'\"([^\"]*)\"', line)) @staticmethod def _short_long_similarity(lda_fname, normalizer, id2word_fname, short, long): '''计算长短文本的相似度 Args: lda_fname(path) :- gensim.models.ldamodel的保存路径 id2word_fnmae(path) :- gensim.corpora.Dictionary的保存路径 short(str) :- 短文本 long(str) :- 长文本 Returns: prob(float) :- 长短文本的匹配度 Theta(iterables) :- 长文本在lda模型下的主题分布概率, 每个元素为(主题的序号, 对应主题的概率) ''' lda = gensim.models.LdaModel.load(lda_fname) id2word = gensim.corpora.Dictionary.load(id2word_fname) Theta = lda[id2word.doc2bow(normalizer.tokenize(long))] short = normalizer.tokenize(short) short = set(short) short = id2word.doc2idx(short) prob = 0 for word in short: prob_w = sum([ lda.expElogbeta[k][word] * 1000 * p_zk for (k, p_zk) in Theta ]) prob += math.log(prob_w) prob = prob / len(short) prob -= math.log(1000) return prob, Theta def short_long_sim(self, short, long): return self._short_long_similarity(self.model_fname, self.normalizer, self.id2word_fname, short, long)
def writeStates(self, outStateHdl, entryPointID, entryPointName, wayPointProbabilities, countFunctions): ## Entry-/waypoint probability states if entryPointID == 10101: pass # import pdb;pdb.set_trace() i = 2 propSum = 0 for wayPoint in wayPointProbabilities: if wayPoint[i] > 0: propSum += 1 if 0 == 0: #if propSum > 0: entryPointName = entryPointName.replace(" ", "_") outStateHdl.write("<stateType ID=\"" + "entryState_" + self.genericAgentId + "_" + str(entryPointID) + "_" + entryPointName + "\">\n") outStateHdl.write(indent2 + "<iconcolour>" + self.iconColor + "</iconcolour>\n") outStateHdl.write(indent2 + "<speed>\n") outStateHdl.write(indent2 + indent2 + "<alpha>" + str(self.speedAlpha) + "</alpha>\n") outStateHdl.write(indent2 + indent2 + "<beta>" + str(self.speedBeta) + "</beta>\n") outStateHdl.write(indent2 + "</speed>\n") outStateHdl.write(indent2 + "<headOnFactor>1.0</headOnFactor>\n"); outStateHdl.write(indent2 + "<agentMode>network</agentMode>\n") outStateHdl.write(indent2 + "<type>normal</type>\n") outStateHdl.write(indent2 + "<instantiationFunctions>\n" + indent2 + indent2 + "<function name=\"selectWaypoint\">\n") for wayPoint in wayPointProbabilities: if wayPoint[i] > 0: blanks = " " for cc in range(3 - len(str(wayPoint[i]))): blanks += " " n = Normalizer() outStateHdl.write(indent2 + indent2 + "<waypoint id=\"" + str(wayPoint[0]) + "\" probability=\"" + str(wayPoint[i]) + "\" />" + blanks + "<!-- " + n.normalize(wayPoint[1]) + "-->\n") outStateHdl.write(indent2 + indent2 + "</function>\n" + indent2 + "</instantiationFunctions>\n") countFunctions += 1 outStateHdl.write("</stateType>\n\n") i += 1 ## Pause at waypoint behaviour entryPointName = entryPointName.replace(" ", "_") outStateHdl.write("<stateType ID=\"" + "atWayPointState_" + self.genericAgentId + "_" + str(entryPointID) + "_" + entryPointName + "\">\n") outStateHdl.write(indent2 + "<iconcolour>" + self.iconColorPause + "</iconcolour>\n") outStateHdl.write(indent2 + "<speed>\n") outStateHdl.write(indent2 + indent2 + "<alpha>" + str(self.speedAlpha) + "</alpha>\n") outStateHdl.write(indent2 + indent2 + "<beta>" + str(self.speedBeta) + "</beta>\n") outStateHdl.write(indent2 + "</speed>\n") outStateHdl.write(indent2 + "<headOnFactor>1.0</headOnFactor>\n"); outStateHdl.write(indent2 + "<agentMode>network</agentMode>\n") outStateHdl.write(indent2 + "<type>normal</type>\n") outStateHdl.write(indent2 + "<categoricTransitionFunctions>\n") outStateHdl.write(indent2 + indent2 + "<function name=\"waitTime\">\n") outStateHdl.write(indent2 + indent2 + indent2 + "<parameter>" + str(self.wayPointWait) + "</parameter>\n") outStateHdl.write(indent2 + indent2 + indent2 + "<toState>" + "Exit_" + self.genericAgentId + "_" + str(entryPointID) + "_" + entryPointName + "</toState>\n") outStateHdl.write(indent2 + indent2 + "</function>\n") outStateHdl.write(indent2 + "</categoricTransitionFunctions>\n") outStateHdl.write("</stateType>\n\n") ## Exit behaviour outStateHdl.write("<stateType ID=\"" + "exitState_" + self.genericAgentId + "_" + str(entryPointID) + "_" + entryPointName + "\">\n") outStateHdl.write(indent2 + "<iconcolour>" + self.iconColorExit + "</iconcolour>\n") outStateHdl.write(indent2 + "<speed>\n") outStateHdl.write(indent2 + indent2 + "<alpha>" + str(self.speedAlpha) + "</alpha>\n") outStateHdl.write(indent2 + indent2 + "<beta>" + str(self.speedBeta) + "</beta>\n") outStateHdl.write(indent2 + "</speed>\n") outStateHdl.write(indent2 + "<headOnFactor>1.0</headOnFactor>\n"); outStateHdl.write(indent2 + 
"<agentMode>network</agentMode>\n") outStateHdl.write(indent2 + "<type>normal</type>\n") outStateHdl.write(indent2 + "<instantiationFunctions>\n") outStateHdl.write(indent2 + indent2 + "<function name=\"selectWaypoint\">\n") outStateHdl.write(indent2 + indent2 + indent2 + "<waypoint id=\"" + str(entryPointID) + "\" probability=\"100\"/>\n") outStateHdl.write(indent2 + indent2 + "</function>\n") outStateHdl.write(indent2 + "</instantiationFunctions>\n") outStateHdl.write("</stateType>\n\n")
import os

import gym
import numpy as np
from gym import wrappers

from HyperParameter import HyperParameter
from Normalizer import Normalizer
from AiPolicy import AIPolicy
from Explorator import Explorator
from Trainer import Trainer


# Running the main code
def mkdir(base, name):
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path


workDir = mkdir('exp', 'brs')
monitorDir = mkdir(workDir, 'monitor')
hyperParameter = HyperParameter()
np.random.seed(hyperParameter.seed)
environment = gym.make(hyperParameter.environmentName)
environment = wrappers.Monitor(environment, monitorDir, force=True)
inputsNumber = environment.observation_space.shape[0]
outputsNumber = environment.action_space.shape[0]
policy = AIPolicy(inputsNumber, outputsNumber, hyperParameter)
normalizer = Normalizer(inputsNumber)
explorator = Explorator(hyperParameter, normalizer, policy, environment)
trainer = Trainer(policy, normalizer, hyperParameter, explorator)
trainer.train()
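# Sketch of the running-statistics normalizer that Normalizer(inputsNumber) is
# assumed to implement here (the usual ARS/BRS pattern: observe each state and
# normalize by the online mean and standard deviation). The class and method
# names below are assumptions, not the project's confirmed API.
import numpy as np


class RunningNormalizer:
    def __init__(self, nb_inputs):
        self.n = 0
        self.mean = np.zeros(nb_inputs)
        self.m2 = np.zeros(nb_inputs)   # sum of squared deviations (Welford)

    def observe(self, x):
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    def normalize(self, x):
        std = np.sqrt(self.m2 / max(self.n - 1, 1))
        return (x - self.mean) / np.where(std > 0, std, 1.0)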
from NN import NeuralNetwork
from Normalizer import Normalizer
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X = cancer['data']
y = cancer['target']
length = len(cancer['feature_names'])

nn = NeuralNetwork([length, 30, 20, 10, 5, 1])
X_train, X_test, y_train, y_test = train_test_split(X, y)

normalize = Normalizer()
normalize.fit(X_train)
X_train = normalize.transform(X_train)
X_test = normalize.transform(X_test)

nn.fit(X_train, y_train, epochs=1000, verbose=False)
predictions = nn.predict(X_test)
print(nn.cost(predictions, y_test))
def __init__(self, vectorizer="tfidf", stopwords_path=None): self.vectorizer = vectorizer self.stopwords_path = stopwords_path self.normalizer = Normalizer(self.stopwords_path) self.normalizer.load_stopwords()
def collect_news(self):
    self.output = self.config["DEFAULT"]["output"]
    query_interval = int(self.config["DEFAULT"]["query_interval"])
    iterations = int(self.config["DEFAULT"]["iterations"])
    iterations_counter = 0
    while iterations_counter <= iterations:
        for section in self.config.sections():
            for option in self.config[section]:
                url_base = self.config[section]['url_base']
                if option not in ['url_base', 'query_interval', 'tmp', 'output', 'iterations']:
                    path = url_base + self.config[section][option]
                    try:
                        xml_data = requests.get(path)
                    except requests.exceptions.ChunkedEncodingError:
                        if self._callback:
                            self._callback("DLERR", path)
                        continue
                    try:
                        tree = ET.ElementTree(ET.fromstring(xml_data.content))
                    except ET.ParseError:
                        if self._callback:
                            self._callback("PARSEERR", path)
                        continue
                    root = tree.getroot()
                    news_list = root.findall('./channel/item')
                    normalized_option = Normalizer.normalize_name(option)
                    self.create_dir_if_not_exists(section)
                    output_xml_file = self.create_output_file_if_not_exists(normalized_option, section)
                    tree_output = ET.parse(output_xml_file)
                    root = tree_output.getroot()
                    for article in news_list:
                        article_title = article.find('title')
                        article_date = article.find('pubDate')
                        try:
                            article_title.text = self.normalize_value(article_title.text)
                            article_date.text = self.normalize_value(article_date.text)
                            search_filter = './item[title=' + '"' + article_title.text + '"' + "]" \
                                + '[pubDate=' + '"' + article_date.text + '"' + ']'
                            all_items = root.findall(search_filter)
                            if len(all_items) == 0:
                                if self._callback:
                                    self._callback("NEWARTICLE", section, option, article_title.text)
                                root.append(article)
                        except AttributeError:
                            if self._callback:
                                self._callback("BADFORMAT")
                    tree_output.write(output_xml_file)
        if self._callback:
            self._callback("WAITING", query_interval)
            self._callback("CANINTERR")
        time.sleep(query_interval)
        iterations_counter += 1