def Base(eta, l2):
    params.outfile = 'Base_model'
    params.dataf = 'data/oct27.traindev.proc.cnn'
    params.dev = 'data/oct27.test.proc.cnn'
    params.test = 'data/daily547.proc.cnn'
    params.batchsize = 10
    params.hidden = 100
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.frac = 0.1
    params.emb = 0
    (words, We) = getWordmap('wordvects.tw100w5-m40-it2')
    We = np.asarray(We).astype('float32')
    tagger = getTagger('data/tagger')
    print tagger
    params.outfile = (params.outfile + ".Batchsize" + '_' + str(params.batchsize)
                      + '_' + "LearningRate" + '_' + str(params.eta)
                      + '_' + str(params.hidden) + '_' + str(l2) + '.pickle')
    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata
    tm = base_model(We, params)
    tm.train(traindata, devdata, testdata, params)

def parse_dailykos():
    #JNYTDocument.drop_collection()
    # URL for a search for the term "US Presidential Elections" between 05/01/2011 and 05/31/2013
    base_url = "http://www.dailykos.com/search?submit=Search&time_begin=05%2F01%2F2011&text_type=any&search_type=search_stories&order_by=-time&text_expand=contains&text=US%20Presidential%20Elections&time_type=time_published&usernames=%28usernames%29&tags=%28tags%29&time_end=05%2F31%2F2013&page="
    main_url = base_url + "1"
    main_soup = BeautifulSoup(utils.getData(main_url)).find("div", {"class": "ajax-form-results ajax-delay-load"})
    no_of_items = main_soup.find("h4", {"class": "sub-head"}).get_text()
    no_of_items = no_of_items.replace(" results were found", "")
    no_of_items = int(no_of_items)
    no_of_pages = no_of_items / 50  # computed but unused; the loop below walks a fixed page range
    # They rate-limit requests coming from a bot, so wait before the loop begins.
    # Inside the loop, parsing the other contents introduces sufficient wait time.
    time.sleep(10)
    for page_num in range(1, 30):
        url = base_url + str(page_num)
        soup = BeautifulSoup(utils.getData(url)).find("div", {"class": "ajax-form-results ajax-delay-load"})
        table_list = soup.find("table", {"class": "styled storiesAsGrid"})
        if table_list is not None:
            tbody = table_list.find("tbody")
            if tbody is not None:
                link_rows = tbody.findAll("tr")
                for link_row in link_rows:
                    dailyKosDoc = JNYTDocument()
                    link = link_row.find("td", {"class": "first"}).find("a", {"class": "title"})
                    date = link_row.find("td", {"class": "sm date"})
                    dailyKosDoc.pub_date = datetime.strptime(date.get_text(), '%m/%d/%Y')
                    dailyKosDoc.source = "DailyKos"
                    dailyKosDoc.web_url = "http://www.dailykos.com" + link['href']
                    dailyKosDoc.headline = link.get_text()
                    dailyKosDoc.political_leaning = "Liberal"
                    dailyKosDoc.save()
                    #Getting the social shares for the URL
                    #dailyKosDoc.social_shares = shares.get_social_counts(dailyKosDoc.web_url)
                    #dailyKosDoc.save()
                    # Getting the content of the URL
                    try:
                        content_soup = BeautifulSoup(utils.getData(dailyKosDoc.web_url)).find("div", {"id": "storyWrapper"}).find("div", {"class": "article-body"})
                        dailyKosDoc.content = content_soup.get_text()
                        dailyKosDoc.save()
                    except:
                        pass
                    #break
        #if page_num == 2:
        #    break
    return str(page_num)

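# A standalone sketch (not part of the original scraper): parse_dailykos()
# computes no_of_pages but then loops over a fixed range(1, 30). If the loop
# should cover every result page, ceiling division handles a partial last page.
def page_range(no_of_items, per_page=50):
    no_of_pages = -(-no_of_items // per_page)   # ceil without math.ceil
    return range(1, no_of_pages + 1)            # e.g. 1237 items -> pages 1..25
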
def parse_michelle_malkin():
    #JNYTDocument.drop_collection()
    #http://michellemalkin.com/page/1/?s=presidential+elections+2012
    base_url = "http://michellemalkin.com/page/<<page_num>>/?s=presidential+elections+2012"
    page_num = 1
    while True:
        url = base_url.replace("<<page_num>>", str(page_num))
        soup = BeautifulSoup(utils.getData(url)).find("div", {"id": "content"})
        title = soup.find("h1", {"class": "leadStoryAlt"})
        # The "Not Found" heading marks the page past the last result page.
        if title is not None and title.get_text().strip() == "Not Found":
            break
        article = soup.find("div", {"class": "article"})
        headings = article.findAll("h2")
        author = article.findAll("div", {"class": "author"})
        for index, h2 in enumerate(headings):
            link = h2.find("a")
            meta_data = [string.strip() for string in author[index].get_text().encode('utf-8').split('\xc2\xa0\xc2\xa0')]
            michelleMalkinDoc = JNYTDocument()
            michelleMalkinDoc.web_url = link['href']
            michelleMalkinDoc.political_leaning = "Conservative"
            michelleMalkinDoc.source = "Michelle Malkin"
            michelleMalkinDoc.headline = link.get_text()
            michelleMalkinDoc.pub_date = datetime.strptime(meta_data[2], "%B %d, %Y %I:%M %p")
            michelleMalkinDoc.save()
            #Getting the social shares for the URL
            #michelleMalkinDoc.social_shares = shares.get_social_counts(michelleMalkinDoc.web_url)
            #michelleMalkinDoc.save()
            # Getting the document content.
            content_soup = BeautifulSoup(utils.getData(michelleMalkinDoc.web_url)).find("div", {"class": "blog"}).findAll("p")
            article_content = ""
            for paragraph in content_soup:
                text = paragraph.get_text()
                if text.startswith("**Written by ") or text.startswith("Twitter @"):
                    continue
                article_content += " " + text
            michelleMalkinDoc.content = article_content.strip()
            michelleMalkinDoc.save()
        page_num += 1
    return str(index)

def parse_fivethirtyeight():
    #JNYTDocument.drop_collection()
    base_url = "http://fivethirtyeight.com/page/<<page_num>>/?s=presidential+elections+2012"
    page_num = 1
    while True:
        url = base_url.replace("<<page_num>>", str(page_num))
        url_content = utils.getData(url)
        if url_content is None:
            break
        # if page_num == 3:
        #     break
        # soup = BeautifulSoup(utils.getData(url))
        # print soup
        soup = BeautifulSoup(url_content).find("div", {"id": "main"})
        posts = soup.findAll("div")
        for index, div in enumerate(posts):
            if index == 0:
                continue
            # do something with the individual posts here..
            date_string = div.find("span", {"class": "datetime updated"}).get_text()
            link = div.find("h2", {"class": "article-title entry-title"}).find("a")
            fivethirtyeightDoc = JNYTDocument()
            fivethirtyeightDoc.web_url = link['href'].strip()
            fivethirtyeightDoc.political_leaning = "Liberal"
            fivethirtyeightDoc.source = "FiveThirtyEight"
            fivethirtyeightDoc.headline = link.get_text().strip()
            fivethirtyeightDoc.pub_date = datetime.strptime(date_string.strip(), "%b %d, %Y")
            fivethirtyeightDoc.save()
            #Getting the social shares for the URL
            #fivethirtyeightDoc.social_shares = shares.get_social_counts(fivethirtyeightDoc.web_url)
            #fivethirtyeightDoc.save()
            try:
                content_soup = BeautifulSoup(utils.getData(fivethirtyeightDoc.web_url)).find("div", {"class": "entry-content"})
                fivethirtyeightDoc.content = content_soup.get_text()
                fivethirtyeightDoc.save()
            except:
                pass
            #print date_string.strip(), title_link.get_text().strip(), title_link['href'].strip()
            # break
        # break
        page_num = page_num + 1
    return str(index)

def parse_pj_media():
    # JNYTDocument.drop_collection()
    #http://pjmedia.com/page/1/?s=presidential+elections+2012&submit_x=0&submit_y=0&search_sortby=date
    base_url = "http://pjmedia.com/page/<<page_num>>/?s=presidential+elections+2012&submit_x=0&submit_y=0&search_sortby=date"
    page_num = 1
    while True:
        url = base_url.replace("<<page_num>>", str(page_num))
        html_content = utils.getData(url)
        if html_content is None:
            break
        soup = BeautifulSoup(html_content)
        articles = soup.find("div", {"id": "archive-content"}).findAll("div", {"class": "category-story"})
        for article in articles:
            pjMediaDoc = JNYTDocument()
            link = article.find("h2").find("a")
            meta_data = [string.strip() for string in article.find("div", {"class": "category-author2"}).get_text().split('-')]
            date_str = meta_data[0]
            # Strip ordinal suffixes ("June 5th, 2012" -> "June 5, 2012") so strptime can parse the date.
            date_str = date_str.replace("th,", ",")
            date_str = date_str.replace("st,", ",")
            date_str = date_str.replace("nd,", ",")
            date_str = date_str.replace("rd,", ",")
            pjMediaDoc.web_url = link['href']
            pjMediaDoc.political_leaning = "Conservative"
            pjMediaDoc.source = "PJ Media"
            pjMediaDoc.headline = link.get_text().strip()
            pjMediaDoc.pub_date = datetime.strptime(date_str, "%A, %B %d, %Y")
            pjMediaDoc.save()
            #Getting the social shares for the URL
            #pjMediaDoc.social_shares = shares.get_social_counts(pjMediaDoc.web_url)
            #pjMediaDoc.save()
            # Getting the content of the document
            content_soup = BeautifulSoup(utils.getData(pjMediaDoc.web_url + "?singlepage=true")).find("div", {"class": "post"}).find("div", {"class": "clearingfix"}).findAll("p")
            article_content = ""
            for paragraph in content_soup:
                text = paragraph.get_text()
                article_content += " " + text
            pjMediaDoc.content = article_content.strip()
            pjMediaDoc.save()
        page_num += 1
    return str(page_num)

def parse_talkingpointsmemo():
    #JNYTDocument.drop_collection()
    base_url = "https://www.googleapis.com/customsearch/v1element?key=AIzaSyCVAXiUzRYsML1Pv6RwSG1gunmMikTzQqY&rsz=filtered_cse&num=10&hl=en&prettyPrint=false&source=gcsc&gss=.com&sig=23952f7483f1bca4119a89c020d13def&start=<<start_num>>&cx=partner-pub-7451232131633930:5915231553&q=presidential%20elections%202012&safe=active&googlehost=www.google.com&callback=google.search.Search.apiary272&nocache=1416895033146"
    start_num = 10
    while start_num < 100:
        # if start_num == 30:
        #     break
        url = base_url.replace("<<start_num>>", str(start_num))
        url_content = utils.getData(url)
        # The response is JSONP; strip the comment and callback wrapper to get plain JSON.
        url_content = url_content.replace("// API callback", "")
        url_content = url_content.replace("google.search.Search.apiary272(", "").strip()
        url_content = url_content[:-2]
        data = json.loads(url_content)
        for result in data["results"]:
            # print result["titleNoFormatting"], result["url"]
            talkingPointsMemoDoc = JNYTDocument()
            talkingPointsMemoDoc.web_url = result["url"]
            talkingPointsMemoDoc.headline = result["titleNoFormatting"]
            talkingPointsMemoDoc.political_leaning = "Liberal"
            talkingPointsMemoDoc.source = "Talking Points Memo"
            talkingPointsMemoDoc.save()
            #Getting the social shares for the URL
            talkingPointsMemoDoc.social_shares = shares.get_social_counts(talkingPointsMemoDoc.web_url)
            talkingPointsMemoDoc.save()
            try:
                content_soup = BeautifulSoup(utils.getData(talkingPointsMemoDoc.web_url))
                by_line = content_soup.find("section", {"class": "byline"}).find("time")
                date_string = by_line.get_text().strip().rsplit(",", 1)[0]
                talkingPointsMemoDoc.pub_date = datetime.strptime(date_string.strip(), "%B %d, %Y")
                content = content_soup.find("div", {"class": "story-teaser"})
                body_content = content_soup.find("div", {"class": "story-body"})
                main_content = content.get_text() + " " + body_content.get_text()
                talkingPointsMemoDoc.content = main_content.strip()
                talkingPointsMemoDoc.save()
            except:
                print "Exception occurred"
        start_num += 10
        # break
    return 'Anand'

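# A hedged alternative sketch for the JSONP unwrapping in parse_talkingpointsmemo():
# a regex that keeps whatever sits between the callback's opening "(" and the
# trailing ");" is less brittle than fixed-string replaces if the callback name
# changes. Assumes the same "// API callback\ncallback({...});" shape as above.
import json
import re

def unwrap_jsonp(payload):
    match = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', payload, re.DOTALL)
    return json.loads(match.group(1)) if match else None
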
def Base(eta, l2, morepara, emb, batchsize):
    params.outfile = 'POS_CRF_Bilstm_Viterbi_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'
    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.num_labels = 25
    params.morepara = morepara
    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a = [0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger
    params.outfile = (params.outfile + ".Batchsize" + '_' + str(params.batchsize)
                      + '_dropout_' + str(params.dropout)
                      + "_LearningRate" + '_' + str(params.eta)
                      + '_' + str(l2) + str(morepara) + '_emb_' + str(emb))
    # examples are shuffled data
    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    print 'dev set', len(devx0)
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata
    print 'test set', len(testx0)
    #print Y
    print "Using Training Data " + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)
    tm = CRF_model(We, params)
    tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)

def parse_crooksnliars():
    #JNYTDocument.drop_collection()
    base_url = "http://crooksandliars.com/solr/presidential%20elections%202012?page=<<page_num>>&filters=im_cl_section%3A1"
    page_num = 0
    while page_num < 313:
        url = base_url.replace("<<page_num>>", str(page_num))
        url_content = utils.getData(url)
        soup = BeautifulSoup(url_content).find("div", {"class": "search-results"})
        content_nodes = soup.findAll("div", {"class": "buildmode-teaser"})
        for index, div in enumerate(content_nodes):
            crooksNLiarsDoc = JNYTDocument()
            title = div.find("div", {"class": "field-title"}).find("a")
            field_submitted = div.find("div", {"class": "field field-submitted submitted"})
            author_link = field_submitted.find("a")
            temp_string = field_submitted.get_text()
            temp_string = temp_string.replace("By", "")
            temp_string = temp_string.replace(author_link.get_text(), "").strip()
            date_string = temp_string.split("-")[0].strip().rsplit(" ", 2)[0]
            date_string = date_string.replace("Anonymous", "")
            crooksNLiarsDoc.web_url = "http://crooksandliars.com" + title["href"]
            crooksNLiarsDoc.headline = title.get_text().strip()
            crooksNLiarsDoc.political_leaning = "Liberal"
            crooksNLiarsDoc.source = "Crooks N Liars"
            crooksNLiarsDoc.pub_date = datetime.strptime(date_string.strip(), "%B %d, %Y")
            crooksNLiarsDoc.save()
            #Getting the social shares for the URL
            #crooksNLiarsDoc.social_shares = shares.get_social_counts(crooksNLiarsDoc.web_url)
            #crooksNLiarsDoc.save()
            try:
                content_soup = BeautifulSoup(utils.getData(crooksNLiarsDoc.web_url)).find("div", {"class": "nd-region-middle-wrapper"})
                crooksNLiarsDoc.content = content_soup.get_text()
                crooksNLiarsDoc.save()
            except:
                pass
            # break
        # break
        page_num = page_num + 1
    return str(page_num)

def thread(id, filename, nbInputs, seed=None):
    rs = np.random.RandomState(seed)
    with Session() as sess:
        with createSolver(id, nbInputs, sess, RandomState(rs.randint(1E9))) as solver:
            res = evaluateSolver(solver, getData(filename, nbInputs, 1), rs)
    return res

def test(filename):
    X, Y_, _ = utils.getData(filename)
    Y = XGboost_revalue(Y_)
    dataSet_name = filename.split('/')[1].split('.')[0]
    print("------------------------" + dataSet_name + "------------------------")
    train_data, train_label, test_data, test_label = utils.splitDataSet(
        X, Y, test_size=0.3)
    # Convert to XGBoost's DMatrix format
    dtrain = xgb.DMatrix(train_data, label=train_label)
    dtest = xgb.DMatrix(test_data, label=test_label)
    # Set training parameters
    parameters = {
        'eta': 0.01,
        'subsample': 0.75,
        'objective': 'multi:softmax',  # softmax objective; predict() returns class labels
        'num_class': 2,                # number of classes to predict
        'max_depth': 8                 # depth of the trees in the boosting process
    }
    num_round = 500  # the number of training iterations
    bst = xgb.train(parameters, dtrain, num_round)
    preds = bst.predict(dtest)  # class labels under 'multi:softmax'
    acc, p, r, f1 = utils.calAccuracy(preds, test_label)
    print("Accuracy: {:.2%}\tPrecision: {:.4f}\tRecall: {:.4f}\tF1: {:.4f}".format(
        acc, p, r, f1))

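# A minimal sketch (not from the original): if per-class probabilities are
# wanted instead of the hard labels 'multi:softmax' gives in test() above,
# 'multi:softprob' returns an (n_samples, num_class) array, and argmax
# recovers the same labels. bst_params stands in for the parameters dict.
import numpy as np
import xgboost as xgb

def predict_with_probs(bst_params, dtrain, dtest, num_round=500):
    params = dict(bst_params, objective='multi:softprob')
    bst = xgb.train(params, dtrain, num_round)
    probs = bst.predict(dtest)              # shape: (n_samples, num_class)
    return probs, np.argmax(probs, axis=1)  # probabilities and hard labels
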
def svc():
    X, y = getData()
    clf = make_pipeline(StandardScaler(), SVC(gamma="auto"))
    clf.fit(X, y)
    # Note: this scores on the same data the model was fit on.
    return clf.score(X, y)

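# A sketch (assuming getData() returns (X, y) as in svc() above) that scores
# on a held-out split instead of the training data, which svc() does not.
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

def svc_holdout():
    X, y = getData()
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
    clf = make_pipeline(StandardScaler(), SVC(gamma="auto"))
    clf.fit(X_tr, y_tr)
    return clf.score(X_te, y_te)  # generalization estimate, not training fit
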
def visualize():
    pkl_path = './dataset/full_data.pkl'
    main_transform = transforms.Compose([transforms.ToTensor()])
    trainset, testset = getData(pkl_path,
                                train_transform=main_transform,
                                test_transform=main_transform)
    test_dataloader = DataLoader(testset, batch_size=1, shuffle=True)
    model = custom.CustomNet()
    model.load_state_dict(torch.load('./models/trainedModels/currBest.model'))
    plt.figure(dpi=300)
    curr = 150
    for i, batch in enumerate(test_dataloader):
        if i >= 10 and i < 15:
            plt.subplot(curr + i + 1)
            data = batch
            img = data['data']
            labels = data['labels'][0].numpy()
            outs = model(img)
            _, preds = torch.max(outs, 1)
            outs = outs[0].detach().numpy()
            preds = preds[0].detach().numpy()
            plt.imshow(img.numpy()[0][0], cmap='gray')
            plt.xlabel('Predictions: ' + str(preds[0]) + ' ' + str(preds[1]) + ' ' + str(preds[2])
                       + '\n Ground Truth:' + str(labels[0]) + ' ' + str(labels[1]) + ' ' + str(labels[2]))
        elif i > 15:
            break
    plt.show()

def getX(self):
    samples = getData()
    x_train = np.zeros((BATCH_SIZE, 576))
    conds = np.zeros((BATCH_SIZE, NUM_CONDS))
    for i in range(BATCH_SIZE):
        if i % 25 == 0:
            x = getSingleSample(samples)
            cond = np.array([1, 0, 0, 0])
        else:
            x = synthData((i % 25) / 25, samples)
            if (i % 25) > 17:
                cond = np.array([0, 0, 0, 1])
            elif (i % 25) > 8:
                cond = np.array([0, 0, 1, 0])
            else:
                cond = np.array([0, 1, 0, 0])
        x_train[i, :] = x
        conds[i, :] = cond
    x_train = np.reshape(x_train, (BATCH_SIZE, 576, 1))
    return x_train, conds

def __init__(self, split=0.7, interval='1min', predict_period=1, days=5,
             mins_interval=30, start_date='2020-08-24', end_date='2020-08-29',
             stock_name='SPY', stride=1):
    super(stockGraphGenerator, self).__init__()
    self.__start_date = datetime.datetime.strptime(
        start_date + ' 10:00:00', '%Y-%m-%d %H:%M:%S')
    self.__end_date = datetime.datetime.strptime(
        end_date + ' 20:00:00', '%Y-%m-%d %H:%M:%S')
    self.__mins_interval = mins_interval
    self.__stride = stride
    self.__data_len = self.__calculateLen(days, mins_interval)
    self.__interval = interval
    self.__predict_period = predict_period
    self.__data_raw = utils.getData(stock_name).reset_index()
    self.train_data = torch.utils.data.Subset(
        self, list(range(0, int(split * self.__data_len))))
    self.test_data = torch.utils.data.Subset(
        self, list(range(int(split * self.__data_len), int(self.__data_len))))

def runMLE():
    # Train on 8000, test on 2000
    X, Y = getData()
    Xtrain = X[:8000]
    Ytrain = Y[:8000]
    Xtest = X[8000:]
    Ytest = Y[8000:]
    means, covs = getConditionals(Xtrain, Ytrain)
    priors = getPriors(Ytrain)
    acc = 0
    Probs = np.zeros((10, len(Xtest)))
    for j in range(10):
        # Fix for non-invertible matrices
        A = covs[j]
        A = A + .01 * np.identity(np.shape(covs[j])[0])
        # Use logpdf to avoid overflow
        p = stats.multivariate_normal.logpdf(Xtest, mean=means[j], cov=A)
        # In log space the prior is added, not multiplied
        p = p + np.log(priors[j])
        Probs[j] = p
    ypred = np.zeros(len(Xtest))
    for i in range(len(Xtest)):
        ypred[i] = np.argmax(Probs[:, i])
        if ypred[i] == Ytest[i]:
            acc += 1
    acc = acc / float(len(Xtest))
    return acc

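# A quick numerical check (standalone, not from the original) that adding the
# log prior to a log-density matches multiplying the prior against the raw
# density, which is the correction applied in runMLE() above.
import numpy as np
from scipy import stats

x, mean, prior = 0.3, 0.0, 0.1
lhs = stats.norm.logpdf(x, loc=mean) + np.log(prior)
rhs = np.log(stats.norm.pdf(x, loc=mean) * prior)
assert np.isclose(lhs, rhs)
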
def question3(dataDir, imageName, imageName2):
    time, data = getData(dataDir)
    # part 1
    x = data[:, 0:3]
    x_des = data[:, 3:6]
    data_label = ["x", "y", "z"]
    data_des_label = ["$x_d$", "$y_d$", "$z_d$"]
    fig, plots = plt.subplots(3, figsize=(5, 6))
    plotXYZ(plots, x, x_des, time, data_label, data_des_label)
    title = "Question 3\nx vs x_desired"
    plots[0].set_title(title, fontsize=13, font="monospace")
    fig.supylabel('End effector positions (m)', fontsize=11, font="monospace")
    plots[-1].set_xlabel('Time (seconds)', fontsize=11, font="monospace")
    fig.tight_layout()
    fig.savefig(imageName)
    # part 2
    delta_phi = data[:, 6:9]
    data_label = ["x", "y", "z"]
    fig, plots = plt.subplots(3, figsize=(5, 6))
    plotXYZ(plots, delta_phi, 0, time, data_label, 0)
    title = "Question 3\n" + r"$\delta$$\phi$"
    plots[0].set_title(title, fontsize=13, font="monospace")
    fig.supylabel('Orientation error (rad)', fontsize=11, font="monospace")
    plots[-1].set_xlabel('Time (seconds)', fontsize=11, font="monospace")
    fig.tight_layout()
    fig.savefig(imageName2)
    plt.close('all')

def getData(self):
    X, y = utils.getData()
    trainX, testX, trainY, testY = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=1234
    )
    return (trainX, trainY), (testX, testY)

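# A usage sketch for the getData() method above (model_wrapper is a
# hypothetical owner object): the split comes back as two (features, labels)
# pairs, so both sides unpack symmetrically.
(trainX, trainY), (testX, testY) = model_wrapper.getData()
print(trainX.shape, testX.shape)  # 80% / 20% of the rows, per test_size=0.2
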
def parse_redstate():
    base_url = "http://www.redstate.com/search/presidential+elections+2012/page/"
    for page_num in range(1, 63):
        url = base_url + str(page_num)
        soup = BeautifulSoup(utils.getData(url))
        articles = soup.find("ul", {"class": "story-loop"}).findAll("ul", {"class": "post"})
        for index, article in enumerate(articles):
            title_link = article.find("a")
            date_string = article.find("span", {"class": "byline-italic"})
            date_string = date_string.get_text().split(" at ")[0]
            # Strip ordinal suffixes so strptime can parse the date.
            date_string = date_string.replace("th,", ",")
            date_string = date_string.replace("st,", ",")
            date_string = date_string.replace("nd,", ",")
            date_string = date_string.replace("rd,", ",")
            redStateDoc = JNYTDocument()
            redStateDoc.web_url = title_link['href']
            redStateDoc.political_leaning = "Conservative"
            redStateDoc.source = "RedState"
            redStateDoc.headline = title_link.get_text().strip()
            redStateDoc.pub_date = datetime.strptime(date_string.strip(), "%B %d, %Y")
            redStateDoc.save()
            #Getting the social shares for the URL
            #redStateDoc.social_shares = shares.get_social_counts(redStateDoc.web_url)
            #redStateDoc.save()
            content_soup = BeautifulSoup(utils.getData(redStateDoc.web_url)).find("div", {"class": "the-content"}).findAll("p")
            article_content = ""
            text = ""
            for paragraph in content_soup:
                text = paragraph.get_text()
                article_content += " " + text
            # Remove the final paragraph's text (text still holds it after the loop).
            article_content = article_content.replace(text, "")
            redStateDoc.content = article_content.strip()
            redStateDoc.save()
        # break
    return str(page_num)

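# A hedged alternative to the replace() at the end of parse_redstate():
# slicing off the final <p> before joining avoids also deleting any earlier
# paragraph whose text happens to match the last one. `paragraphs` stands in
# for the findAll("p") result used above.
def join_without_last(paragraphs):
    return " ".join(p.get_text() for p in paragraphs[:-1]).strip()
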
def featurize():
    print("---- Reading Data ----")
    img_paths = glob.glob(DATA_PATH)
    print("len(img_paths):", len(img_paths))
    random.seed(a=13521)
    random.shuffle(img_paths)
    train_test_split = 0.8
    X_test_paths = img_paths[int(train_test_split * len(img_paths)):]
    dims = (448, 448, 3)
    # Loading Data
    X_test = utils.getData(X_test_paths, dims)
    print("X_test:", X_test.shape)
    # To check NaN pixel images
    nan_pixels_per_image = utils.nansInData(X_test)
    # plt.scatter(x=np.arange(0, len(nan_pixels_per_image)), y=nan_pixels_per_image)
    # plt.savefig("nan_scatter.png")
    # Checking min/max to see whether normalization is needed
    print("Before normalization")
    print(np.nanmin(X_test), np.nanmax(X_test))
    X_test = utils.normalize(X_test)
    # Checking min/max after normalization
    print("After normalization")
    print(np.nanmin(X_test), np.nanmax(X_test))
    # Interpolate NaN values
    X_test = utils.interpolateNaNValues(X_test)
    # To check NaN pixel images
    nan_pixels_per_image = utils.nansInData(X_test)
    print("---- Reading Model ----")
    model = load_model(OUTPUT_MODEL_PATH)
    print(model.summary())
    print("---- Featurizing Data ----")
    feature_list = extract_features(img_array=X_test, model=model,
                                    layer_names=['conv2d_8'])
    # layer_name = 'conv2d_8'
    # intermediate_layer_model = Model(inputs=model.input,
    #                                  outputs=model.get_layer(layer_name).output)
    # intermediate_output = intermediate_layer_model.predict(data)
    # feature_list = intermediate_output
    utils.nansInData(feature_list, data_type="feature")
    # Save the features and the file-list order for later use.
    pickle.dump(feature_list, file=open(FEATURES_OUTPUT, mode='wb'))
    pickle.dump(X_test_paths, file=open(PATH_LIST, mode='wb'))

def testSyntheticData(self):
    #A,S,F = [],[],[]
    x_train, y_train, x_control_train, x_control_test, x_test, y_test = ut.getData()
    dist_params, dist_params_train = ut.getDistribution(x_train, y_train, x_control_train)
    mean, cov, meanT, covT = (dist_params["mean"], dist_params["cov"],
                              dist_params_train["mean"], dist_params_train["cov"])
    #print(mean)
    meanN = [0] * len(mean)
    covN = np.identity(len(mean))
    #clf = GaussianMixture(n_components=2, covariance_type='full')
    means = [mean, meanN]
    covariances = [cov, covN]
    lw = float(sys.argv[2])
    weights = [1 - lw, lw]
    #for i in range(0, 4):
    LR, LE = len(y_train), len(y_test)
    train, test = [], []
    # Sample synthetic train/test points from the two-component mixture.
    for i in range(0, LR):
        j = np.random.choice([0, 1], p=weights)
        seed = np.random.randint(10)
        train.append(multivariate_normal(means[j], covariances[j], allow_singular=1).rvs(size=1, random_state=seed))
    for i in range(0, LE):
        j = np.random.choice([0, 1], p=weights)
        seed = np.random.randint(10)
        test.append(multivariate_normal(means[j], covariances[j], allow_singular=1).rvs(size=1, random_state=seed))
    # The last two columns of each sample encode the label and the control (sensitive) attribute.
    x_train, y_train, x_control_train = [], [], []
    for t in train:
        x_train.append(t[:-2])
        if t[len(t) - 2] < 0:
            y_train.append(-1)
        else:
            y_train.append(1)
        #y_train.append(t[len(t)-2])
        if t[len(t) - 1] < 0.5:
            x_control_train.append(0)
        else:
            x_control_train.append(1)
    x_control_test, x_test, y_test = [], [], []
    for t in test:
        x_test.append(t[:-2])
        if t[len(t) - 2] < 0:
            y_test.append(-1)
        else:
            y_test.append(1)
        if t[len(t) - 1] < 0.5:
            x_control_test.append(0)
        else:
            x_control_test.append(1)
    #print(x_train, y_train, x_control_train)
    y_res = self.processGivenData(0.9, x_train, y_train, x_control_train, x_test, y_test,
                                  x_control_test, dist_params, dist_params_train)
    acc, sr, fdr = ut.getStats(y_test, y_res, x_control_test)
    print("Acc: ", acc, " SR: ", sr, " FDR: ", fdr)

def parse_time():
    current_page_url = "http://search.time.com/results.html?Ntt=immigration+reform&Nf=p_date_range%7cBTWN+20110101+20130531"
    while current_page_url is not None:
        soup = BeautifulSoup(utils.getData(current_page_url)).find("div", {"class": "resultsCol"})
        pagination = soup.find("div", {"class": "pagi"}).find("a", {"title": "Next"})
        if pagination is not None:
            current_page_url = pagination["href"]
        else:
            current_page_url = None
        articles = soup.findAll("div", {"class": "tout"})
        for article in articles:
            image_div = article.find("div", {"class": "img"})
            if image_div is not None:
                title_link = article.find("h3").find("a")
                date_string = article.find("span", {"class": "date"}).get_text().strip()
                content_soup = BeautifulSoup(utils.getData(title_link['href'])).find("div", {"class": "entry-content"})
                if content_soup is not None:
                    content_soup = content_soup.findAll("p")
                    article_content = ""
                    for p in content_soup:
                        article_content += p.get_text().strip()
                    print title_link['href'], title_link.get_text(), date_string
                    timeDoc = JNYTDocument()
                    timeDoc.pub_date = datetime.strptime(date_string, '%b %d, %Y')
                    timeDoc.source = "Time"
                    timeDoc.web_url = title_link['href']
                    timeDoc.headline = title_link.get_text()
                    timeDoc.content = article_content
                    timeDoc.save()
                    #Getting the social shares for the URL
                    #timeDoc.social_shares = shares.get_social_counts(timeDoc.web_url)
                    #timeDoc.save()
        #current_page_url = None
    return current_page_url

def testImages():
    pkl_path = './dataset/full_data.pkl'
    main_transform = transforms.Compose([transforms.ToTensor()])
    trainset, testset = getData(pkl_path,
                                train_transform=main_transform,
                                test_transform=main_transform)
    img = trainset[0]['data'][0]
    labels = str(trainset[0]['labels'])
    plt.figure(dpi=300)
    plt.imshow(img, cmap='gray')
    plt.xlabel(labels)
    plt.show()

def LModel(eta, batchsize, dSize, relSize, updatewords):
    trainSize = [50]
    acti = ['relu', 'tanh']
    evaT = ['sum', 'max', 'cause']
    layersize = dSize
    params.frac = 1.0
    params.outfile = ('Model_FA' + '_eta_' + str(eta) + '_dSize_' + str(dSize)
                      + '_batchsize_' + str(batchsize) + '_relSize_' + str(relSize)
                      + '_trainSize_' + str(trainSize[0])
                      + '_updatewords_' + str(updatewords))
    #params.dataf = '../data/conceptnet/AddModelData/omcs_train_new'+str(trainSize[0])+'.txt'
    #params.dataf = '../data/conceptnet/AddModelData/causes_omcs.txt'
    params.dataf = '../data/conceptnet/AddModelData/new_omcs100.txt'
    params.batchsize = batchsize
    params.hiddensize = 25
    params.type = "MAX"
    params.save = True
    params.constraints = False
    params.embedsize = dSize
    params.relsize = relSize
    params.activation = acti[0]
    params.evaType = evaT[0]
    params.usepeep = True
    params.LC = 0.00001
    params.Lw = 0.01
    params.eta = eta
    params.margin = 1
    (words, We) = getWordmap('../data/conceptnet/embeddings/embeddings.skip.newtask.en.d'
                             + str(dSize) + '.m1.w5.s0.it20.txt')
    #print We.shape
    rel = getRelation('../data/conceptnet/rel.txt')
    params.outfile = "../models/" + params.outfile + "_" + str(params.LC) + "_" + str(params.Lw) + ".txt"
    # examples are shuffled data
    examples = getData(params.dataf)
    params.data = examples[0:int(params.frac * len(examples))]
    #print "Using Training Data" + params.dataf
    #print "Using Word Embeddings with Dimension " + str(dSize[0])
    #print "Training on " + str(len(params.data))
    #print "Saving models to: " + params.outfile
    # Initialize each relation matrix near the identity, with small uniform noise.
    Rel_init = np.zeros((35, params.relsize, params.relsize))
    for k in range(35):
        for i in range(params.relsize):
            for j in range(params.relsize):
                if i == j:
                    Rel_init[k][i][j] = 1 + random.uniform(-0.2, 0.2)
                else:
                    Rel_init[k][i][j] = random.uniform(-0.2, 0.2)
    tm = theano_word_model(We, words, layersize, params.embedsize, rel, params.relsize,
                           Rel_init, params.LC, params.Lw, params.eta, params.margin,
                           params.usepeep, updatewords)
    tm.train(params.data, params)

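# A vectorized sketch of the Rel_init construction in LModel(): identity plus
# uniform noise in [-0.2, 0.2), built in one shot instead of three nested loops.
import numpy as np

def init_relations(n_rel=35, rel_size=25, noise=0.2, rng=np.random):
    base = np.eye(rel_size)[None, :, :]                          # (1, d, d) identity
    jitter = rng.uniform(-noise, noise, (n_rel, rel_size, rel_size))
    return base + jitter                                         # near-identity matrices
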
def testTrainingLoop(need_pickle=False):
    if need_pickle:
        pkl_path = dsetToPickle('./dataset/', 'train.csv')
    else:
        pkl_path = './dataset/full_data.pkl'
    main_transform = transforms.Compose([transforms.ToTensor()])
    trainset, testset = getData(pkl_path,
                                train_transform=main_transform,
                                test_transform=main_transform)
    train_dataloader = DataLoader(trainset, batch_size=64, shuffle=True)
    test_dataloader = DataLoader(testset, batch_size=64, shuffle=True)
    trainModel(None, train_dataloader, test_dataloader)

def gridSearchScore():
    X, y = getData()
    scores = ["precision", "recall"]
    for score in scores:
        print(f"# Tuning hyper-parameters for {score}")
        clf = gridSearch(X, y)
        means = clf.cv_results_["mean_test_score"]
        stds = clf.cv_results_["std_test_score"]
        for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
            print("{:0.3f} (+/-{:0.03f}) for {}".format(mean, std * 2, params))

def testPreprocessedData(self):
    x_train, y_train, x_control_train, x_control_test, x_test, y_test = ut.getData()
    #checkNormalFit(x_train, y_train, x_control_train)
    for i in range(1, 11):
        try:
            tau = i / 10.0
            print("Tau : ", tau)
            y_res = self.processGivenData(tau, x_train, y_train, x_control_train,
                                          x_test, y_test, x_control_test, [], [])
            ut.getStats(y_test, y_res, x_control_test)
            print("\n")
        except Exception as e:
            logging.exception(str(tau) + " failed\n" + str(e))

def getAddress(url, predictors):
    '''
    Finds all the addresses on the web-page

    Parameters
    ----------
    url : The url of the page
    predictors : a list of tuples like (parameters, model), where parameters
        is a dictionary of the hyper-parameters of the model

    Returns
    -------
    final : A list of lists, where every list contains the paragraphs that are
        part of the same address.
    '''
    soup, paras, paradict = parsePage(url)
    # print soup
    addresses = []
    if 'tripadvisor' in url:
        final = TripAdAddr(soup)
    else:
        results = set()
        for params, pred in predictors:
            # get the feature vectors for the text on the web-page, as required by the model
            X = getData(paras, params['NUM_FEATURES'], params['BATCH_SIZE'],
                        SEQ_LENGTH=params['SEQ_LENGTH'])
            res = pred(X).flatten()
            addrs = getLabels(res, paras, params['NUM_CLUST'])
            # take the intersection of the results extracted by the classifiers
            # (seeded with the first classifier's results, since intersecting
            # with the initial empty set would always be empty)...
            # success depends heavily on the ability of the classifiers to find all the addresses
            results = set(addrs) if not results else results.intersection(addrs)
            #print getScores(pred, paras, params)
        # the final address extractor is the hard-coded rule-based function, which works when
        # there are telephone numbers in the address
        results = results.union(rulEx(paras))
        # align the addresses based on their position on the page
        addresses = sorted(results, key=lambda x: x[1])
        final = accuAddr(addresses)
    # print final
    return final

def testDataset(need_pickle=False):
    if need_pickle:
        pkl_path = dsetToPickle('./dataset/', 'train.csv')
    else:
        pkl_path = './dataset/full_data.pkl'
    train, test = getData(pkl_path)
    print('Testing...')
    print("train.data:\n", train.data)
    print("train[0]:\n", train[0])
    for i in range(5):
        plt.subplot(150 + i + 1)
        sample = train[i]['data'][0]
        plt.imshow(sample.astype(int), cmap='gray', vmin=0, vmax=255)
    plt.show()

def scrape(stats_file, pokedes_file):
    tr_elements = utils.getData('http://pokemondb.net/pokedex/all', '//tr')
    col = []
    # For each header cell, store its cleaned name and an empty column list
    i = 0
    for t in tr_elements[0]:
        name = t.text_content()
        if name == '#':
            name = 'no'
        col.append((utils.clean_string(name), []))
        i += 1
    # Since our first row is the header, data is stored on the second row onwards
    for j in range(1, len(tr_elements)):
        T = tr_elements[j]
        if len(T) != 10:
            break
        i = 0
        for t in T.iterchildren():
            data = t.text_content()
            if i > 0:
                try:
                    data = int(data)
                except:
                    pass
            col[i][1].append(data)
            i += 1
    # Construct a DataFrame using pandas.
    Dict = {title: column for (title, column) in col}
    df = pd.DataFrame(Dict)
    # Apply clean-up
    df['name'] = df['name'].apply(utils.str_bracket)
    df['type'] = df['type'].apply(utils.str_break)
    df['img_filename'] = df['name']
    df['img_filename'] = df['img_filename'].apply(utils.generate_img_file_name)
    # Save to json
    df.to_json(stats_file, orient='records')
    # Save the image-filename list
    utils.save_df_to_text(df, pokedes_file, 'img_filename')

def handle(self):
    # print('Client: ', self.client_address)
    self.logger = logging.getLogger()
    while True:
        try:
            data = utils.getData(self.connection)
            if data is None:
                break
        except:
            import traceback
            # self.logger.error(traceback.format_exc())
            # traceback.print_exc()
            break
        record = logging.makeLogRecord(data)
        self.handleLogRecord(record)

def handle(self):
    # print('Client: ', self.client_address)
    while True:
        # obj = getData(self.connection)
        # self.protocol(obj)
        try:
            obj = utils.getData(self.connection)
            if obj is None:
                break
            self.protocol(obj)
        except:
            import traceback
            # traceback.print_exc()
            self.resend()
            self._error("exception, exit!")
            break

def main():
    if NEED_PICKLE:
        pkl_path = utils.dsetToPickle('./dataset/', 'train.csv')
    else:
        pkl_path = './dataset/full_data.pkl'

    # set up the data transforms
    train_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.45], std=[0.225]),
    ])
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.45], std=[0.225]),
    ])

    # set up the datasets/loaders
    trainset, testset = utils.getData(pkl_path, split=0.75, drop=0.5,
                                      train_transform=train_transform,
                                      test_transform=test_transform)
    trainloader = DataLoader(trainset, batch_size=64, shuffle=True, drop_last=True)
    testloader = DataLoader(testset, batch_size=64, shuffle=True, drop_last=True)

    # create the model
    model = prnet.PretrainedResnet(TOTAL_ROOTS, TOTAL_VOWELS, TOTAL_CONS)
    # model.load_state_dict(torch.load('./best_model.model'))

    # train the model
    model = train.train(model, trainloader, testloader, epochs=35, lr=0.01)

    # save the model
    torch.save(model.state_dict(), './saved_model.model')

    # validate the model
    # acc = train.validate(model, testloader)
    # print("Validation Accuracy: %.3f" % (acc))
    return

def add3():
    if request.method == 'POST':
        data = request.form
        print(data)
        sql = "INSERT INTO Card (CardNum,CardName,Type,Sex,Workunit,Address,Telephone,Email,RegisterDate) VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s');" \
            % (data['CardNum'], data['CardName'], data['Type'], data['Sex'],
               data['WorkUnit'], data['Address'], data['Telephone'], data['Email'], data['RegisterDate'])
        utils.execu(sql)
        return redirect(url_for('CardIfo'))
    else:
        sql = "select * from %s" % (utils.Table4)
        content, labels = utils.query(sql, utils.Table4)
        sql = "select TypeName from %s" % (utils.Table5)
        TypeName = utils.getData(sql)
        return render_template('CardAdd.html', labels=labels, content=content,
                               TypeName=TypeName)

def sqlmodify3():
    if request.method == 'POST':
        data = request.form
        sql = "update %s set CardNum='%s',CardName='%s',TypeName='%s',Sex='%s',WorkUnit='%s',Address='%s',Telephone='%s',Email='%s',RegisterDate='%s' where CardNum=%s" \
            % (utils.Table4, data['CardNum'], data['CardName'], data['TypeName'], data['Sex'],
               data['WorkUnit'], data['Address'], data['Telephone'], data['Email'],
               data['RegisterDate'], data['uid'])
        utils.execu(sql)
        return redirect(url_for('CardIfo'))
    else:
        uid = int(request.args.get('uid'))
        sql = "select * from %s" % (utils.Table4)
        content, labels = utils.query(sql, utils.Table4)
        sql = "select TypeName from %s" % (utils.Table5)
        TypeName = utils.getData(sql)
        return render_template('CardModify.html', labels=labels, content=content,
                               uid=uid, TypeName=TypeName)

def sqlmodify1():
    if request.method == 'POST':
        data = request.form
        sql = "update %s set BookNum='%s',BookName='%s',Categories='%s',Author='%s',Press='%s',PublicateDate='%s',Price='%s',IsLend='%s' where BookNum=%s" \
            % (utils.Table1, data['BookNum'], data['BookName'], data['Categories'], data['Author'],
               data['Press'], data['PublicateDate'], data['Price'], data['IsLend'], data['uid'])
        utils.execu(sql)
        return redirect(url_for('BooIfo'))
    else:
        uid = int(request.args.get('uid'))
        sql = "select * from %s" % (utils.Table1)
        content, labels = utils.query(sql, utils.Table1)
        sql = "select categories from %s" % (utils.Table3)
        categories = utils.getData(sql)
        return render_template('BooModify.html', labels=labels, content=content,
                               uid=uid, categories=categories)

def question1(dataDir, subNum, imageName):
    time, data = getData(dataDir)
    x = data[:, 0:3]
    x_des = data[:, 3:6]
    data_label = ["x", "y", "z"]
    data_des_label = ["$x_d$", "$y_d$", "$z_d$"]
    fig, plots = plt.subplots(3, figsize=(5, 6))
    plotXYZ(plots, x, x_des, time, data_label, data_des_label)
    title = "Question 1" + subNum + "\nx vs x_desired"
    plots[0].set_title(title, fontsize=13, font="monospace")
    fig.supylabel('End effector positions (m)', fontsize=11, font="monospace")
    plots[-1].set_xlabel('Time (seconds)', fontsize=11, font="monospace")
    fig.tight_layout()
    fig.savefig(imageName)
    plt.close('all')

def question2(dataDir, subNum, imageName, imageName2):
    time, data = getData(dataDir)
    # part 1
    x = data[:, 0:3]
    x_des = data[:, 3:6]
    data_label = ["x", "y", "z"]
    data_des_label = ["$x_d$", "$y_d$", "$z_d$"]
    fig, plots = plt.subplots(3, figsize=(5, 6))
    plotXYZ(plots, x, x_des, time, data_label, data_des_label)
    title = "Question 2" + subNum + "\nx vs x_desired"
    plots[0].set_title(title, fontsize=13, font="monospace")
    fig.supylabel('End effector positions (m)', fontsize=11, font="monospace")
    plots[-1].set_xlabel('Time (seconds)', fontsize=11, font="monospace")
    fig.tight_layout()
    fig.savefig(imageName)
    # part 2
    q4_data = data[:, 6:9]
    q6_data = data[:, 9:12]
    fig, plots = plt.subplots(2, figsize=(5, 6))
    title = "Question 2" + subNum + "\nJoint Angle with Joint Limits"
    data_label = [r"$q_{4}$", r"$q_{4_{low}}$", r"$q_{4_{high}}$"]
    lineType = ["c", "c--", "c-."]
    simpleSubplot(plots[0], q4_data, time, data_label, lineType)
    plots[0].set_title(title, fontsize=13, font="monospace")
    data_label = [r"$q_{6}$", r"$q_{6_{low}}$", r"$q_{6_{high}}$"]
    lineType = ["m", "m--", "m-."]
    simpleSubplot(plots[1], q6_data, time, data_label, lineType)
    fig.supylabel('Joint Angles (rad)', fontsize=11, font="monospace")
    plots[-1].set_xlabel('Time (seconds)', fontsize=11, font="monospace")
    fig.tight_layout()
    fig.savefig(imageName2)
    plt.close('all')

def rf(classes=3):
    X, y, _ = utils.getData(samplingType="1", classes=classes)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        stratify=y, random_state=123456)
    rf = RandomForestClassifier(
        n_estimators=10,
        oob_score=True,
        random_state=123456,
        n_jobs=os.cpu_count(),
        criterion="entropy",
    )
    rf.fit(X_train, y_train)
    predicted = rf.predict(X_test)
    accuracy = accuracy_score(y_test, predicted)
    print(f"Out-of-bag score estimate: {rf.oob_score_:.3}")
    cm = pd.DataFrame(confusion_matrix(y_test, predicted))
    sns.heatmap(cm, annot=True)
    with open(f"./result/rf_result_{classes}.pkl", "wb") as f:
        pkl.dump(
            [
                ["model", "predict", "accuracy", "out-of-bag", "cm"],
                [rf, predicted, accuracy, rf.oob_score_, cm],
            ],
            f,
        )
    print("Mean accuracy score: ", accuracy_score(y_test, predicted))
    # average="macro" so the scores are defined in the multi-class case (classes=3)
    print("precision: ", metrics.precision_score(y_test, predicted, average="macro"))
    print("recall: ", metrics.recall_score(y_test, predicted, average="macro"))
    print("f1: ", metrics.f1_score(y_test, predicted, average="macro"))
    print(classification_report(y_test, predicted))
    return y_test, predicted

def parse_nyt():
    url = "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=stock+market+crash+&begin_date=20070101&end_date=20090101&api-key=318a69b2af97848f66071cb4c1fdc831:15:69992102"
    response = urlopen(url).read()
    response = json.loads(response)
    print "Got response from nytimes"
    articleContent = []
    i = 0
    page = 1
    hits = response["response"]["meta"]["hits"]
    while i < 51 and page < (hits / 10):
        print 'Getting response for page', page
        url = "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=stock+market+crash+&begin_date=20070101&end_date=20090101&page=" + str(page) + "&api-key=318a69b2af97848f66071cb4c1fdc831:15:69992102"
        try:
            response = urlopen(url).read()
            response = json.loads(response)
            for article in response["response"]["docs"]:
                if random.randint(0, 3) == 3:  # sample each article with 1/4 probability
                    print article["web_url"]
                    soup1 = BeautifulSoup(utils.getData(article["web_url"]))
                    # The article body can live under several different markups; try each in turn.
                    soup = soup1.findAll("p", {"itemprop": "articleBody"})
                    if soup is None or len(soup) == 0:
                        soup = soup1.find("div", {"id": "articleBody"})
                        if soup is not None:
                            soup = soup.findAll("p")
                    if soup is None or len(soup) == 0:
                        soup = soup1.find("div", {"class": "articleBody"})
                        if soup is not None:
                            soup = soup.findAll("p")
                    if soup is not None and len(soup) > 0:
                        if article["word_count"] > 200 and article["lead_paragraph"] is not None:
                            articleContent.append({})
                            articleContent[i]["abstract"] = article["abstract"]
                            articleContent[i]["pub_date"] = article["pub_date"]
                            articleContent[i]["headline"] = article["headline"]["main"]
                            articleContent[i]["keywords"] = article["keywords"]
                            articleContent[i]["lead_paragraph"] = article["lead_paragraph"]
                            articleContent[i]["web_url"] = article["web_url"]
                            articleContent[i]["id"] = article["_id"]
                            articleContent[i]["word_count"] = article["word_count"]
                            keywords = getMultiples(article["keywords"], "value")
                            # should probably pull these if/else checks into a module
                            # variables = [article["pub_date"], keywords,
                            #     str(article["headline"]["main"]) if "main" in article["headline"].keys() else "",
                            #     str(article["source"]) if "source" in article.keys() else "",
                            #     str(article["document_type"]) if "document_type" in article.keys() else "",
                            #     article["web_url"] if "web_url" in article.keys() else "",
                            #     str(article["news_desk"]) if "news_desk" in article.keys() else "",
                            #     str(article["section_name"]) if "section_name" in article.keys() else "",
                            #     str(article["lead_paragraph"]).replace("\n", "") if "lead_paragraph" in article.keys() else ""]
                            # line = "\t".join(variables)
                            # articleContent[i]["text"] = line
                            sent = ""
                            if type(soup) is not str:
                                sent = " ".join([str(word) for word in soup])
                            else:
                                sent = soup
                            articleContent[i]["text"] = utils.strip(sent)
                            print articleContent[i]["headline"], article["keywords"], article["lead_paragraph"]
                            i += 1
                            print 'Extracted', i, article["pub_date"]
                if i > 51:
                    break
        except:
            print "Skipped"
        page += 1
    print "Articles Extracted", i
    return articleContent

)
args = parser.parse_args()

params.LW = args.LW
params.outfile = args.outfile
params.batchsize = args.batchsize
params.dim = args.dim
params.wordfile = args.wordfile
params.save = str2bool(args.save)
params.train = args.train
params.margin = args.margin
params.type = args.samplingtype
params.epochs = args.epochs
params.evaluate = str2bool(args.evaluate)
params.learner = str2learner(args.learner)
params.learner = lasagne.updates.adagrad  # overrides the learner parsed just above

(words, We) = getWordmap(params.wordfile)
examples = getData(params.train, words)
if args.num_examples:
    examples = examples[0:args.num_examples]
print "Number of training examples: ", len(examples)
print sys.argv

model = paragram_word_model(We, params)
train(model, examples, words, params)