def run(dest, results_path):
    # open up the database
    store = pd.HDFStore(
        os.path.abspath(os.path.join(results_path, 'model_fall_responses.h5')))
    sigma, phi = util.get_params()
    all_data = pd.DataFrame([])

    for query in store.root._v_children:
        # look up the name of the key for the parameters that we want (will be
        # something like params_0)
        params = store["/{}/param_ref".format(query)]\
            .reset_index()\
            .set_index(['sigma', 'phi'])['index']\
            .ix[(sigma, phi)]

        # load in the data
        data = store["{}/{}".format(query, params)]
        all_data = all_data.append(data)

    all_data = all_data\
        .set_index(['query', 'block', 'stimulus', 'kappa0'])\
        .sortlevel()

    store.close()
    all_data.to_csv(dest)
def svmvec(path, output_filename):
    conn = sqlite3.connect(path)
    c = conn.cursor()
    c_inner = conn.cursor()
    c_inner2 = conn.cursor()
    params = util.get_params(c, path)

    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents')
    num_total_docs = int(c.fetchone()[0])
    c.execute('SELECT ED_ENC_NUM, Score FROM Documents')
    i = 1
    with open(output_filename, 'w') as fout_samples:
        with open(output_filename + ".id", 'w') as fout_ids:
            for doc_id, score in c:
                if i % 100 == 0:
                    print('svmvec(): processing document %s (%d/%d)' %
                          (str(doc_id), i, num_total_docs))
                c_inner.execute(
                    """SELECT DocumentsToDimensions.DimensionId, Count
                    FROM DocumentsToDimensions
                    INNER JOIN Dimensions
                    ON DocumentsToDimensions.DimensionId = Dimensions.DimensionId
                    WHERE DocumentsToDimensions.ED_ENC_NUM = ?
                    AND Dimensions.Exclude = 0 AND Count > 0""", (doc_id, ))
                # Clamp scores to [-100, 100]; a missing score counts as
                # unlabelled.
                if score is None:
                    score = 0
                elif score > 100:
                    score = 100
                elif score < -100:
                    score = -100
                assert -100 <= score <= 100
                print('%d' % (score / 100), end=' ', file=fout_samples)
                print(doc_id, file=fout_ids)
                for dim_id, count in c_inner:
                    c_inner2.execute(
                        """SELECT IDF FROM Dimensions
                        WHERE DimensionId = ?""", (dim_id, ))
                    idf = float(c_inner2.fetchone()[0])
                    #
                    # The SELECT statement above protects us from zero count.
                    # Parenthesized so this is the standard (1 + log tf) * idf
                    # weighting rather than 1 + (log tf * idf).
                    #
                    tfidf = (1 + log10(count)) * idf
                    if params['USE_BINARIZED_TDF']:
                        tfidf = 1 if tfidf > float(params['C_BINARIZE']) else 0
                    # %g handles both the binarized 0/1 case and the float
                    # tf-idf case without %d truncation.
                    print('%d:%g' % (dim_id, tfidf), end=' ',
                          file=fout_samples)
                print(file=fout_samples)
                i += 1
    c_inner.close()
    c.close()
    conn.close()
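# For reference, a standalone sketch of the weighting svmvec() writes out:
# the standard (1 + log10 tf) * idf scheme with an optional binarization
# switch. The helper name and the `threshold` argument (standing in for the
# C_BINARIZE parameter) are illustrative, not part of the original module.
from math import log10


def tfidf_weight(count, idf, binarize=False, threshold=1.0):
    """(1 + log tf) * idf weighting with optional binarization."""
    w = (1 + log10(count)) * idf
    if binarize:
        return 1 if w > threshold else 0
    return w


# e.g. a term appearing 10 times in a document with idf = 2.0 weighs 4.0,
# and binarization collapses that to a 0/1 indicator:
assert abs(tfidf_weight(10, 2.0) - 4.0) < 1e-12
assert tfidf_weight(10, 2.0, binarize=True, threshold=1.0) == 1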
def __init__(self, config):
    """Initialize Trainer

    Args:
        config (dict): Configuration dictionary
    """
    super(Trainer, self).__init__()

    # Define multi-task setting
    dataset = config['dataset']
    dataset_name = dataset['dataset_name']
    self.tasks_weighting = dataset['tasks_weighting']
    self.tasks = [k for k, v in self.tasks_weighting.items()]

    # Setup network
    model_config = config['model']
    self.model = get_module(model_config, dataset_name, self.tasks)
    print('Model constructed for {}'.format(' '.join(self.tasks)))
    if 'grouping' in model_config:
        print('groups = {}'.format(model_config['grouping']['groups']))
        print('grouping method = {}'.format(
            model_config['grouping']['method']))
        self.model = update_module(config, self.model, self.tasks)

    # Setup for a task-conditional setting
    model_params = config['model']['parameters']
    if 'common_mt_params' in model_params:
        self.task_conditional = not model_params['common_mt_params']
    else:
        self.task_conditional = False

    # Setup optimizers
    optimizer_config = config['optimizer']
    optimizer_cls = get_optimizer(optimizer_config['algorithm'])
    model_params = get_params(self.model,
                              optimizer_config['parameters']['lr'],
                              len(self.tasks), self.task_conditional,
                              self.tasks)
    self.optimizer = optimizer_cls(model_params,
                                   **optimizer_config['parameters'])

    # Setup schedulers
    scheduler_config = config['scheduler']
    scheduler_cls = get_scheduler(scheduler_config['lr_policy'])
    self.scheduler = scheduler_cls(self.optimizer,
                                   **scheduler_config['parameters'])

    # Setup loss function
    losses_config = config['loss']
    self.criterions = get_loss_functions(self.tasks, losses_config)

    # Initialise performance meters
    self.best_val_loss = 1e9
    self.train_loss = {}
    self.val_loss = {}
    for task in self.tasks:
        self.train_loss[task] = get_running_meter()
        self.val_loss[task] = get_running_meter()

    # Initialize img logging for visualization
    self.img_logging = get_img_logging(dataset_name, self.tasks)
    self.pred_decoder = get_pred_decoder(dataset_name, self.tasks)
def classify(path, svm_test, svm_classifier, test_results, temporary_dir):
    tmp_file = P.join(temporary_dir, 'svm_classify.txt')
    ids_file = P.join(temporary_dir, 'test-samples.dat.id')
    conn = sqlite3.connect(path)
    p = sub.Popen([SVM_CLASSIFY, svm_test, svm_classifier, tmp_file],
                  stderr=sub.PIPE, stdout=sub.PIPE)
    stdout, stderr = p.communicate()
    if stderr:
        print(stderr, file=sys.stderr)
        return

    precision, recall = -1, -1
    for line in stdout.decode().split('\n'):
        print(line)
        match = REGEX.match(line.strip())
        if match:
            # Convert the captured strings so the numeric comparisons
            # below work.
            precision, recall = map(float, match.groups())

    c = conn.cursor()
    params = util.get_params(c, path)
    scores = [
        int(float(f) / params['CLASSIFY_CLIP'] * 100)
        for f in open(tmp_file).read().strip().split('\n')
    ]

    if test_results:
        assert precision >= 0 and recall >= 0
        print('precision:', precision, 'recall:', recall)
        c.execute('SELECT ED_ENC_NUM, Score FROM Documents')
        for (i, (doc_id, score)) in enumerate(c):
            if i >= len(scores):
                print('Premature end of training file', file=sys.stderr)
                assert False
            if score < 0 and scores[i] < 0:
                continue
            elif score > 0 and scores[i] > 0:
                continue
            print(doc_id, 'expected:', score, 'actual:', scores[i])
    else:
        c_inner = conn.cursor()
        c.execute('SELECT ED_ENC_NUM FROM Documents')
        i = 0
        for doc_id in open(ids_file).read().strip().split('\n'):
            if i >= len(scores):
                print('Premature end of training file', file=sys.stderr)
                assert False
            c_inner.execute(
                'UPDATE Documents SET Score = ? WHERE ED_ENC_NUM = ?',
                (scores[i], doc_id))
            i = i + 1
        c_inner.close()
    c.close()
    conn.commit()
def __init__(self, fimage=None, location="LaSilla"):
    # TODO: load ALL PARAMS
    self.location = location
    self.params = util.get_params(location)

    if fimage is None:
        fimage = "current.JPG"
    self.fimage = fimage
    self.retrieve_image()

    self.im_masked, self.im_original = util.loadallsky(fimage,
                                                       return_complete=True)
    self.mask = util.get_mask(self.im_original)
    self.observability_map = None
def __init__(self):
    param_list = util.get_params()
    self.project_name = param_list[1]
    self.project_list = util.get_comma_seprated_list(param_list[2])
    self.svn_code_repository = param_list[3]
    self.svn_username = param_list[4]
    self.svn_password = param_list[5]
    self.svn_auth = (' --username ' + self.svn_username +
                     ' --password ' + self.svn_password)
    print('svn_auth: ' + self.svn_auth)
    self.workspace = os.getcwd()
    self.war_name = self.project_name + FORMATE_WAR
    print('war name is: ' + self.war_name)
    self.conf_dir = util.get_conf_dir(self.project_name)
    print('conf_dir is: ' + self.conf_dir)
def learn(path, training, svm_classifier):
    conn = sqlite3.connect(path)
    c = conn.cursor()
    param = util.get_params(c, path)
    if param['SVM_LEARN']:
        options = param['SVM_LEARN'].split(' ')
    else:
        options = []
    cmdline = [SVM_LEARN] + options + [training, svm_classifier]
    p = sub.Popen(cmdline, stderr=sub.PIPE, stdout=sub.PIPE)
    stdout, stderr = p.communicate()
    if stderr:
        print(stderr, file=sys.stderr)
        return
    c.close()
def main(self):
    (train_data, train_label, train_seq_len), \
        (dev_data, dev_label, dev_seq_len), \
        (test_data, test_label, test_seq_len), _, _ = dp.data_preprocess()
    params = get_params()

    if DEFINES.train:
        check_and_create_path()
        estimator = tf.estimator.Estimator(
            model_fn=model.model_fn,
            model_dir=DEFINES.ckpt_path,
            params=params,
            config=tf.estimator.RunConfig(save_checkpoints_steps=30,
                                          save_summary_steps=1,
                                          log_step_count_steps=10))
        train_spec = tf.estimator.TrainSpec(
            input_fn=lambda: dp.train_input_fn(
                train_data, train_seq_len, train_label, DEFINES.batch_size),
            max_steps=DEFINES.train_step)
        eval_spec = tf.estimator.EvalSpec(
            input_fn=lambda: dp.eval_input_fn(
                dev_data, dev_seq_len, dev_label, len(dev_data)),
            exporters=[BestCheckpointsExporter()],
            start_delay_secs=0,
            throttle_secs=0)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        print('Training finished')

    print('Evaluate testset')
    assert glob.glob(os.path.join(DEFINES.best_ckpt_path, '*.ckpt*')), \
        'Checkpoint does not exist'
    estimator = tf.estimator.Estimator(model_fn=model.model_fn,
                                       model_dir=DEFINES.best_ckpt_path,
                                       params=params)
    test_result = estimator.evaluate(input_fn=lambda: dp.eval_input_fn(
        test_data, test_seq_len, test_label, len(test_data)))
    print('\nEVAL set accuracy: {accuracy:0.3f}\n'.format(**test_result))
def main():
    elastic = Elastic()
    config = get_params()
    path = config['crawler']['path']

    # download imdb files
    wget.download(BASICS, out=path)
    wget.download(RATINGS, out=path)
    wget.download(EPISODES, out=path)

    # crawl movies and add to elastic
    crawl()
    elastic.insert_elastic()

    # remove files
    os.remove(f"{path}/title.basics.tsv.gz")
    os.remove(f"{path}/title.ratings.tsv.gz")
    os.remove(f"{path}/title.episode.tsv.gz")
from db.factory import BasicFactory
from elasticsearch import Elasticsearch, helpers, exceptions
from util import get_params, set_logger
import json

bf = BasicFactory()
config = get_params()
logger = set_logger('elastic')
es = Elasticsearch(hosts=[{
    'host': config['elastic']['host'],
    'port': config['elastic']['port']
}])


class Elastic:
    INDEX = config['elastic']['index']

    def search(self, name):
        """
        Autocomplete for movies
        :param str name: prefix from search
        :return: movies
        :rtype: dict
        """
        message = {'status': False}
        query = {
            'suggest': {
                'movie': {
                    'prefix': name,
def on_threadSample_newSample(self, sample):
    observability = sample[1]
    rest = sample[0]

    self.matplotlibWidget.figure.tight_layout()
    self.matplotlibWidget.axis.imshow(rest, vmin=0, vmax=255,
                                      cmap=plt.get_cmap('Greys_r'))
    self.matplotlibWidget.axis.imshow(observability,
                                      cmap=plt.get_cmap('RdYlGn'), alpha=0.2)

    #theta_coordinates = np.deg2rad([-146,0,45,90,0,180,170,190,200,0, 270, 315])
    theta_coordinates = np.deg2rad(np.arange(0, 360, 15))

    params = util.get_params(location="LaSilla")
    ff = params['ff']
    k1 = params['k1']
    k2 = params['k2']
    r0 = params['r0']
    cx = params['cx']
    cy = params['cy']
    north = params['north']
    deltatetha = params['deltatetha']
    url_weather = params['url_weather']
    wpl = params['wind_pointing_limit']
    wsl = params['wind_stopping_limit']

    coordinatesx = np.cos(north + theta_coordinates) * r0 + cx
    coordinatesy = np.sin(north + theta_coordinates) * r0 + cy

    northx, northy = util.get_image_coordinates(np.deg2rad(0), np.deg2rad(24))
    eastx, easty = util.get_image_coordinates(np.deg2rad(90), np.deg2rad(20))
    self.matplotlibWidget.axis.annotate('N', xy=(northx, northy),
                                        rotation=deltatetha,
                                        horizontalalignment='center',
                                        verticalalignment='center')
    self.matplotlibWidget.axis.annotate('E', xy=(eastx, easty),
                                        rotation=deltatetha,
                                        horizontalalignment='center',
                                        verticalalignment='center')

    altshow = [15, 30, 45, 60, 75, 90]
    for angle in np.deg2rad(altshow):
        rr = util.get_radius(angle, ff, k1, k2, r0)
        #if angle >= np.pi/2: print rr/330.
        self.matplotlibWidget.figure.gca().add_artist(
            plt.Circle((cx, cy), rr, color='k', fill=False))
        textx = np.cos(north + np.deg2rad(180)) * (rr - 2) + cx
        texty = np.sin(north + np.deg2rad(180)) * (rr - 2) + cy
        self.matplotlibWidget.axis.annotate(
            '%d' % (90 - np.ceil(np.rad2deg(angle))),
            xy=(textx, texty),
            rotation=deltatetha,  #prefered_direction['dir'],
            horizontalalignment='left',
            verticalalignment='center',
            size=10)

    WD, WS = get_wind(url_weather)
    WDd = WD
    WD = np.deg2rad(WD)
    if WS is not None and WS > wpl:
        wdcoordinatesx = np.cos(north - WD) * r0 + cx
        wdcoordinatesy = np.sin(north - WD) * r0 + cy
        Nd = np.rad2deg(north)  # + 90.
        if WS > wsl:
            cw = 'r'
            self.matplotlibWidget.axis.add_patch(
                Wedge([cx, cy], r0, Nd - WDd, Nd - WDd + 360,
                      fill=False, hatch='//', edgecolor=cw))
            self.matplotlibWidget.axis.annotate(
                'WIND LIMIT\nREACHED',
                xy=(cx, cy),
                rotation=0,
                horizontalalignment='center',
                verticalalignment='center',
                color=cw, fontsize=35)
        elif WS > wpl:
            cw = 'darkorange'
            wtcoordinatesx = np.cos(north - WD) * r0 / 2. + cx
            wtcoordinatesy = np.sin(north - WD) * r0 / 2. + cy
            self.matplotlibWidget.axis.add_patch(
                Wedge([cx, cy], r0, -90 + Nd - WDd, 90 + Nd - WDd,
                      fill=False, hatch='//', edgecolor=cw))
            self.matplotlibWidget.axis.annotate(
                'Pointing limit!',
                xy=(wtcoordinatesx, wtcoordinatesy),
                rotation=0,
                horizontalalignment='center',
                verticalalignment='center',
                color=cw, fontsize=25)
        self.matplotlibWidget.axis.plot([cx, wdcoordinatesx],
                                        [cy, wdcoordinatesy], lw=3, color=cw)

    #plt.plot([cx, northx], [cy, northy], lw=2, color='k')
    for ccx, ccy in zip(coordinatesx, coordinatesy):
        self.matplotlibWidget.axis.plot([cx, ccx], [cy, ccy], lw=1, color='k')

    self.matplotlibWidget.axis.set_ylim([np.shape(rest)[0], 0])
    self.matplotlibWidget.axis.set_xlim([0, np.shape(rest)[1]])
    self.matplotlibWidget.axis.set_axis_off()
    self.matplotlibWidget.canvas.draw()
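# The polar geometry used above places a point at azimuth theta on the
# horizon circle of radius r0, rotated by the camera's north offset and
# shifted to the image centre. A standalone check of that mapping; the
# helper name and the numeric values below are illustrative only.
import numpy as np


def azimuth_to_image_xy(theta_deg, north, r0, cx, cy):
    """x = cos(north + theta) * r0 + cx, y = sin(north + theta) * r0 + cy,
    the same rotation-plus-offset convention as the plotting code."""
    theta = np.deg2rad(theta_deg)
    return (np.cos(north + theta) * r0 + cx,
            np.sin(north + theta) * r0 + cy)


# With north = 0 and the circle centred at (cx, cy), azimuth 0 lands
# exactly r0 pixels to the right of the centre:
x, y = azimuth_to_image_xy(0, north=0.0, r0=330.0, cx=320.0, cy=240.0)
assert np.isclose(x, 650.0) and np.isclose(y, 240.0)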
            #print(info)
            agent.send(info.encode())
        except IOError as e:
            if e.errno == errno.EWOULDBLOCK:
                pass


def create_rfid_status():
    rfid_status = {}
    for tag, position in card_id_dict.items():
        rfid_status[position] = 0
    return rfid_status


if __name__ == "__main__":
    params = get_params(sys.argv)
    #download_host = params.get("download_host")
    #download_port = params.get("download_port")
    ip = params.get("ip")
    port = int(params.get("port"))

    # load from db
    card_id_dict = load_card_ids()
    rfid_status = create_rfid_status()
    cars_position = {}
    agents = []

    my_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    bind_connection()
    Thread(target=accept_connection).start()
    Thread(target=receive_request).start()
    #receive_request()
train_feat = train.drop(train_columns_to_drop, axis=1)
test_feat = test.drop(test_columns_to_drop, axis=1)

factorize_category(train_feat)

train_feat.fillna(nan, inplace=True)
test_feat.fillna(nan, inplace=True)

train_feat_final = train_feat
xgtrain = xgb.DMatrix(train_feat_final, train['target'].values)

# grid search
params = get_params()
params["eta"] = 0.01

min_child_weight_list = [1]
subsample_list = [1]
colsample_bytree_list = [0.6]
max_depth_list = [10]
#min_child_weight_list = [1, 5, 10]
#subsample_list = [0.6, 0.8, 1]
#colsample_bytree_list = [0.6, 0.8, 1]
#max_depth_list = [8, 10, 12]

params_list = []
for min_child_weight in min_child_weight_list:
    for subsample in subsample_list:
        for colsample_bytree in colsample_bytree_list:
def index(filename, options):
    """
    Perform indexing.

    Each document is stemmed, and then the non-excluded dimensions are counted
    for that document.  The result is put into the DocumentsToDimensions
    table.
    """
    conn = sqlite3.connect(filename)
    c = conn.cursor()
    params = util.get_params(c, filename)
    stemmer = params['stemmer']
    print 'index(): stemmer: %s' % stemmer
    stemmers = {
        'porter': nltk.PorterStemmer(),
        'lancaster': nltk.LancasterStemmer()
    }
    try:
        stemmer = stemmers[stemmer]
    except KeyError:
        print 'unsupported stemmer:', stemmer
        return 1

    all_dim = util.get_dimensions(c, 0)
    assert all_dim, "You must calculate dimensions prior to indexing."
    all_include = util.get_all_include_regex(c)

    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents')
    num_total_docs = int(c.fetchone()[0])
    c.execute('DELETE FROM DocumentsToDimensions')

    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'bigram'")
    nBigrams = int(c.fetchone()[0])
    print 'Number of bigrams: ', nBigrams
    do_bigrams = nBigrams > 0

    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'trigram'")
    nTrigrams = int(c.fetchone()[0])
    print 'Number of trigrams: ', nTrigrams
    do_trigrams = nTrigrams > 0

    #
    # If the POS column contains "unigram", then it means we didn't perform
    # POS tagging when calculating dimensions.
    #
    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'unigram'")
    pos_tag = int(c.fetchone()[0]) == 0

    cmd = 'SELECT ED_ENC_NUM FROM Documents'
    if options.limit:
        cmd += ' LIMIT %d' % options.limit
        num_total_docs = min(options.limit, num_total_docs)

    #
    # TODO: why is fetchmany not working?
    #
    #document_ids = c.execute(cmd).fetchmany()
    document_ids = []
    for row in c.execute(cmd):
        document_ids.append(row[0])
    print "fetched %d document ids" % len(document_ids)

    #
    # Use float division so Python 2's integer division doesn't silently
    # drop the last, partial batch.
    #
    num_batches = int(math.ceil(len(document_ids) / float(options.batch_size)))

    #
    # Set up multiprocessing.
    #
    #   MAIN_PROCESS -> document_id_queue -> WORKER_PROCESSES
    #
    # Each worker subprocess reads a document from the SQL database,
    # processes it, and writes back to the database.
    #
    document_id_queue = multiprocessing.Queue()
    proc_queue = multiprocessing.Queue()
    for i in xrange(num_batches):
        start = i * options.batch_size
        end = start + options.batch_size
        document_id_queue.put(Batch(start, document_ids[start:end]))
    for i in range(options.subprocesses):
        document_id_queue.put(None)

    #
    # Terminate the SQL connection so that the subprocesses can use it.
    #
    conn.commit()
    conn.close()

    #
    # https://docs.python.org/2/library/array.html#module-array
    #
    counter = multiprocessing.Value("I")
    pr_list = []
    for i in range(options.subprocesses):
        args = (document_id_queue, filename, stemmer, all_include, pos_tag,
                do_bigrams, do_trigrams, all_dim, counter)
        p = multiprocessing.Process(target=worker_subprocess, args=args)
        p.start()
        pr_list.append(p)

    #
    # Wait for all worker subprocesses to complete.
    #
    for i, p in enumerate(pr_list):
        p.join()

    #
    # Calculate IDF weighting.  Use float division to avoid Python 2's
    # integer division truncating the ratio before the log.
    #
    conn = sqlite3.connect(filename)
    c = conn.cursor()
    for dim_id, _, _ in all_dim:
        c.execute("""SELECT COUNT(DimensionId) FROM DocumentsToDimensions
                  WHERE DimensionId = ?""", (dim_id,))
        freq = int(c.fetchone()[0])
        idf = log10(num_total_docs / (1.0 + freq))
        c.execute(
            'UPDATE Dimensions SET IDF = ? WHERE DimensionId = ?',
            (idf, dim_id))

    #
    # Save and exit.
    #
    conn.commit()
    c.close()
test_feat = test.drop(test_columns_to_drop, axis=1)

factorize_category(train_feat)

# handle numeric nan
train_feat.fillna(nan, inplace=True)
test_feat.fillna(nan, inplace=True)

# handle infinite values
train_feat.replace(np.inf, -999, inplace=True)
test_feat.replace(np.inf, -999, inplace=True)

train_feat_final = train_feat
xgtrain = xgb.DMatrix(train_feat_final, train['target'].values)

# grid search
params = get_params()
params["eta"] = 0.05

min_child_weight_list = [1]
subsample_list = [1]
colsample_bytree_list = [0.6]
max_depth_list = [10]
#min_child_weight_list = [1, 5, 10]
#subsample_list = [0.6, 0.8, 1]
#colsample_bytree_list = [0.6, 0.8, 1]
#max_depth_list = [8, 10, 12]

params_list = []
for min_child_weight in min_child_weight_list:
    for subsample in subsample_list:
        for colsample_bytree in colsample_bytree_list:
def mrmr(c, path):
    """
    Perform automatic mRMR feature selection using the specified cursor.
    Changes are persisted to the database using the cursor.
    """
    params = util.get_params(c, path)

    #
    # mRMR feature selection
    #
    include_dim = set()
    exclude_dim = set()
    all_dim = util.get_dimensions(c, 0)

    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents')
    num_total_docs = int(c.fetchone()[0])
    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents WHERE Score > 0')
    num_positive_docs = int(c.fetchone()[0])
    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents WHERE Score < 0')
    num_negative_docs = int(c.fetchone()[0])

    #
    # The part below is ported from filterFeatures() of reference.py
    #
    cu = params['C_UPPERCUTOFF'] * num_total_docs
    ccp = params['C_CLASSCUTOFF'] * num_positive_docs
    ccm = params['C_CLASSCUTOFF'] * num_negative_docs
    lcp = params['C_LOWERCUTOFF'] * num_positive_docs
    lcm = params['C_LOWERCUTOFF'] * num_negative_docs

    #
    # The original script didn't have any comments, so here's my guess of what
    # individual variables represent.
    #
    # cu    Upper cut-off.  If a feature occurs in more than cu documents,
    #       then it should be excluded.
    # ccp   Upper class cut-off for positive documents.
    # lcp   Lower class cut-off for positive documents.
    #       If the frequency of a feature within positive documents
    #       falls within this interval, then it should be excluded.
    # ccm   Upper class cut-off for negative documents.
    # lcm   Lower class cut-off for negative documents.
    #       If the frequency of a feature within negative documents
    #       falls within this interval, then it should be excluded.
    #
    for (dim_id, _, _) in all_dim:
        text_count, plus_count, minus_count = 0, 0, 0
        c.execute(
            """SELECT Score FROM Documents
            INNER JOIN DocumentsToDimensions
            ON Documents.ED_ENC_NUM = DocumentsToDimensions.ED_ENC_NUM
            WHERE DimensionId = ?""", (dim_id, ))
        # The cursor yields single-element tuples, so unpack the score
        # before comparing it to zero.
        for (score, ) in c:
            text_count += 1
            if score > 0:
                plus_count += 1
            elif score < 0:
                minus_count += 1
        if params['USE_UPPERCUTS'] and text_count > cu:
            exclude_dim.add(dim_id)
            if dim_id in include_dim:
                include_dim.remove(dim_id)
        elif params['USE_CLASSCUTS'] and minus_count > ccm and plus_count > ccp:
            exclude_dim.add(dim_id)
            if dim_id in include_dim:
                include_dim.remove(dim_id)
        elif params['USE_LOWERCUTS'] and minus_count < lcm and plus_count < lcp:
            exclude_dim.add(dim_id)
            if dim_id in include_dim:
                include_dim.remove(dim_id)
        else:
            if dim_id in exclude_dim:
                exclude_dim.remove(dim_id)
            include_dim.add(dim_id)
    #
    # end of ported code.
    #
    print('mRMR enabled:', len(include_dim), 'disabled:', len(exclude_dim))
    assert not include_dim.intersection(exclude_dim)
    for dim in include_dim:
        c.execute('UPDATE Dimensions SET Exclude = 0 WHERE DimensionId = ?',
                  (dim, ))
    for dim in exclude_dim:
        c.execute('UPDATE Dimensions SET Exclude = 1 WHERE DimensionId = ?',
                  (dim, ))
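# A minimal sketch of the frequency cut-off rules described in the comments
# of mrmr() above, assuming all three cut-off families (USE_UPPERCUTS,
# USE_CLASSCUTS, USE_LOWERCUTS) are enabled. Function and argument names
# are illustrative, not part of the original module.
def should_exclude(text_count, plus_count, minus_count, cu, ccp, ccm, lcp, lcm):
    """Return True if a feature's document frequencies fall in an
    uninformative region: too common overall, too common in both classes,
    or too rare in both classes."""
    if text_count > cu:
        return True
    if minus_count > ccm and plus_count > ccp:
        return True
    if minus_count < lcm and plus_count < lcp:
        return True
    return False


# A feature seen in 900 of 1000 documents is dropped as uninformative
# under an 80% upper cut-off:
assert should_exclude(900, 500, 400, cu=0.8 * 1000,
                      ccp=0.75 * 600, ccm=0.75 * 400, lcp=3, lcm=3)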
def run(dest, results_path, data_path, version, seed):
    np.random.seed(seed)
    hyps = [-1.0, 1.0]

    # load empirical probabilities
    human_responses = pd.read_csv(os.path.join(
        results_path, "human_fall_responses_raw.csv"))
    empirical = human_responses\
        .groupby(['version', 'block'])\
        .get_group((version, 'B'))\
        .rename(columns={'kappa0': 'kappa'})\
        .set_index(['stimulus', 'kappa', 'pid'])['fall? response']\
        .unstack('kappa')[hyps]\
        .stack()

    # load feedback
    fb = (util.load_fb(data_path)['C']['nfell'] > 1)\
        .unstack('kappa')[hyps]\
        .stack()\
        .to_frame('fb')\
        .reset_index()

    # load ipe probabilites
    old_store = pd.HDFStore(
        os.path.join(results_path, "model_fall_responses_raw.h5"), mode='r')

    # get the parameters we want
    sigma, phi = util.get_params()

    # dataframe to store all the results
    all_llh = pd.DataFrame([])

    # compute empirical likelihood
    print('empirical')
    llh_empirical = bootstrap_llh(compute_llh, empirical, fb)
    llh_empirical['counterfactual'] = False
    llh_empirical['likelihood'] = 'empirical'
    all_llh = all_llh.append(llh_empirical)

    print('empirical cf')
    llh_empirical_cf = bootstrap_llh(compute_llh_counterfactual, empirical, fb)
    llh_empirical_cf['counterfactual'] = True
    llh_empirical_cf['likelihood'] = 'empirical'
    all_llh = all_llh.append(llh_empirical_cf)

    # compute likelihoods for each query type
    for query in old_store.root._v_children:
        # look up the name of the key for the parameters that we want (will be
        # something like params_0)
        param_ref_key = "/{}/param_ref".format(query)
        params = old_store[param_ref_key]\
            .reset_index()\
            .set_index(['sigma', 'phi'])['index']\
            .ix[(sigma, phi)]

        # get the data
        key = "/{}/{}".format(query, params)
        ipe = old_store[key]\
            .groupby('block')\
            .get_group('B')\
            .rename(columns={'kappa0': 'kappa'})\
            .set_index(['stimulus', 'kappa', 'sample'])['response']\
            .unstack('kappa')[hyps]\
            .stack()

        # compute ipe likelihood
        print(query)
        llh_ipe = bootstrap_llh(compute_llh, ipe, fb)
        llh_ipe['counterfactual'] = False
        llh_ipe['likelihood'] = 'ipe_' + query
        all_llh = all_llh.append(llh_ipe)

        print(query + ' cf')
        llh_ipe_cf = bootstrap_llh(compute_llh_counterfactual, ipe, fb)
        llh_ipe_cf['counterfactual'] = True
        llh_ipe_cf['likelihood'] = 'ipe_' + query
        all_llh = all_llh.append(llh_ipe_cf)

    old_store.close()

    results = all_llh\
        .set_index(['likelihood', 'counterfactual', 'stimulus',
                    'kappa0', 'hypothesis'])\
        .sortlevel()

    assert not np.isnan(results['median']).any()
    assert not np.isinf(results['median']).any()

    results.to_csv(dest)
def index(filename, nlp):
    """
    Perform indexing.

    Each document is stemmed, and then the non-excluded dimensions are counted
    for that document.  The result is put into the DocumentsToDimensions
    table.
    """
    conn = sqlite3.connect(filename)
    c = conn.cursor()
    params = util.get_params(c, filename)
    stemmer = params['stemmer']
    print('index(): stemmer: %s' % stemmer)

    all_dim = util.get_dimensions(c, 0)
    assert all_dim, "You must calculate dimensions prior to indexing."
    all_include = util.get_all_include_regex(c)

    c.execute('SELECT COUNT(ED_ENC_NUM) FROM Documents')
    num_total_docs = int(c.fetchone()[0])
    c.execute('DELETE FROM DocumentsToDimensions')

    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'bigram'")
    nBigrams = int(c.fetchone()[0])
    print('Number of bigrams: ', nBigrams)
    do_bigrams = nBigrams > 0

    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'trigram'")
    nTrigrams = int(c.fetchone()[0])
    print('Number of trigrams: ', nTrigrams)
    do_trigrams = nTrigrams > 0

    #
    # If the POS column contains "unigram", then it means we didn't perform
    # POS tagging when calculating dimensions.
    #
    c.execute("SELECT COUNT(*) FROM Dimensions WHERE PartOfSpeech = 'unigram'")
    pos_tag = int(c.fetchone()[0]) == 0

    cmd = 'SELECT ED_ENC_NUM FROM Documents'
    # if options.limit:
    #     cmd += ' LIMIT %d' % options.limit
    #     num_total_docs = min(options.limit, num_total_docs)

    #
    # TODO: why is fetchmany not working?
    #
    #document_ids = c.execute(cmd).fetchmany()
    document_ids = []
    for row in c.execute(cmd):
        document_ids.append(row[0])
    print("fetched %d document ids" % len(document_ids))

    #
    # Terminate the SQL connection so that main_process() can use it.
    #
    conn.commit()
    conn.close()

    main_process(nlp, document_ids, filename, stemmer, all_include, pos_tag,
                 do_bigrams, do_trigrams, all_dim)

    #
    # Calculate IDF weighting.
    #
    conn = sqlite3.connect(filename)
    c = conn.cursor()
    for dim_id, _, _ in all_dim:
        c.execute(
            """SELECT COUNT(DimensionId) FROM DocumentsToDimensions
            WHERE DimensionId = ?""", (dim_id, ))
        freq = int(c.fetchone()[0])
        idf = log10(num_total_docs / (1 + freq))
        c.execute('UPDATE Dimensions SET IDF = ? WHERE DimensionId = ?',
                  (idf, dim_id))

    #
    # Save and exit.
    #
    conn.commit()
    c.close()
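# The IDF update at the end of index() uses a smoothed inverse document
# frequency. A standalone sketch with toy numbers; the helper name is
# illustrative, not part of the original module.
from math import log10


def smoothed_idf(num_total_docs, doc_freq):
    """IDF as computed in index(): log10(N / (1 + df)).  The +1 keeps the
    ratio finite for dimensions that match no documents."""
    return log10(num_total_docs / (1 + doc_freq))


# A term in 9 of 1000 documents gets weight 2; a term in every document
# gets a slightly negative weight because of the smoothing term.
assert abs(smoothed_idf(1000, 9) - 2.0) < 1e-12
assert smoothed_idf(1000, 1000) < 0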
def generate_features(track_ids, audio_paths, ground_truths, params,
                      audio_params, param_file, logdir,
                      feature_path_root='features', normalize=False):
    if not os.path.exists(feature_path_root):
        print(f'Feature root directory does not yet exist. '
              f'Creating {feature_path_root}.')
        os.makedirs(feature_path_root)

    # Go through each directory in feature root path and check if parameters
    # are the same. If a match is found, load those features. If not,
    # generate a new directory and features.
    feature_dirs = [os.path.join(feature_path_root, name)
                    for name in os.listdir(feature_path_root)
                    if os.path.isdir(os.path.join(feature_path_root, name))]
    directory_found = False
    for feature_dir in feature_dirs:
        # Get parameter dictionary from directory
        param_path = f'{feature_dir}/params.json'
        if os.path.exists(param_path):
            param_stored, audio_param_stored, _ = get_params(param_path)
        else:
            print(f'Parameter file missing in {feature_dir}.')
            continue

        # Compare with current parameters
        if audio_param_stored == audio_params:
            directory_found = True
            print(f'Found matching feature directory: {feature_dir}.')

            # Calculate missing feature arrays
            for k, track_id in enumerate(tqdm(track_ids)):
                if not os.path.isfile(f'{feature_dir}/{track_id}.npy'):
                    print(f'[{k}/{len(track_ids)}] Calculating missing '
                          f'features for {track_id}.')
                    features, ground_truth, length = features_and_annotation(
                        track_id, audio_paths, ground_truths, params)
                    np.save(f'{feature_dir}/{track_id}.npy', features)
                    np.save(f'{feature_dir}/{track_id}_truth.npy',
                            ground_truth)
                    np.save(f'{feature_dir}/{track_id}_length.npy', length)
            break

    # If no directory with current parameters has been found, create it and
    # store params
    if not directory_found:
        feature_dir = os.path.join(feature_path_root,
                                   f'features_{int(time.time())}')
        print(f'Creating new feature directory: {feature_dir}.')
        os.makedirs(feature_dir)

        # Copy parameters
        print('Writing parameter file.')
        copyfile(param_file, f'{feature_dir}/params.json')

        # Calculate missing feature arrays
        for k, track_id in enumerate(tqdm(track_ids)):
            if not os.path.isfile(f'{feature_dir}/{track_id}.npy'):
                print(f'[{k}/{len(track_ids)}] Calculating missing features '
                      f'for {track_id}.')
                features, ground_truth, length = features_and_annotation(
                    track_id, audio_paths, ground_truths, params)
                np.save(f'{feature_dir}/{track_id}.npy', features)
                np.save(f'{feature_dir}/{track_id}_truth.npy', ground_truth)
                np.save(f'{feature_dir}/{track_id}_length.npy', length)

    # Find normalisation factors
    norm_file = f'{logdir}/norm.pkl'
    if normalize and not os.path.isfile(norm_file):
        print('Calculating normalisation factors.')
        count = 0
        sums = []
        mins = []
        maxs = []
        for k, track_id in enumerate(tqdm(track_ids)):
            feat = np.load(f'{feature_dir}/{track_id}.npy')
            sums.append(np.sum(feat, axis=1, keepdims=True))
            mins.append(np.min(feat, axis=1, keepdims=True))
            maxs.append(np.max(feat, axis=1, keepdims=True))
            count += feat.shape[1]
        mean = np.sum(np.concatenate(sums, axis=1), axis=1,
                      keepdims=True) / count
        max_val = np.max(np.concatenate(maxs, axis=1), axis=1,
                         keepdims=True) - mean
        min_val = np.min(np.concatenate(mins, axis=1), axis=1,
                         keepdims=True) - mean
        # norm = np.max(np.concatenate([max_val, np.abs(min_val)], axis=1),
        #               axis=1, keepdims=True)

        # Calculate variance
        variances = []
        for k, track_id in enumerate(tqdm(track_ids)):
            feat = np.load(f'{feature_dir}/{track_id}.npy')
            variances.append(np.sum(np.square(feat - mean), axis=1,
                                    keepdims=True))
        var = np.sum(np.concatenate(variances, axis=1), axis=1,
                     keepdims=True) / count

        # Normalize by standard deviation
        norm = np.sqrt(var)
        norm_dict = {'mean': mean, 'norm': norm,
                     'min_val': min_val, 'max_val': max_val}
        joblib.dump(norm_dict, norm_file)

    print('Features complete.')
    return feature_dir
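# The two-pass mean/variance normalisation computed above can be exercised
# standalone. A minimal sketch: the random "tracks" stand in for the saved
# per-track feature arrays (dims x frames); everything here is illustrative.
import numpy as np

np.random.seed(0)
tracks = [np.random.randn(40, n) * 3.0 + 1.5 for n in (100, 250, 80)]

count = sum(t.shape[1] for t in tracks)
mean = sum(np.sum(t, axis=1, keepdims=True) for t in tracks) / count
var = sum(np.sum(np.square(t - mean), axis=1, keepdims=True)
          for t in tracks) / count
norm = np.sqrt(var)

# Applying (x - mean) / norm yields zero mean and unit variance per dimension:
stacked = np.concatenate([(t - mean) / norm for t in tracks], axis=1)
assert np.allclose(stacked.mean(axis=1), 0, atol=1e-9)
assert np.allclose(stacked.std(axis=1), 1, atol=1e-9)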
def calc_dim(path, limit=0, pos_tag=False, language='en'):
    """
    Stems each document.  Determines all possible dimensions.
    Creates dimensions-related tables and populates them.

    path        The path to the database to work with.
    limit       The number of documents to process.  If zero, all documents.
    pos_tag     Perform part-of-speech tagging.
    language    The language of the documents.

    The stemmer and the bigram/trigram switches come from the stored
    parameters.
    """
    conn = sqlite3.connect(path)
    c = conn.cursor()
    params = util.get_params(c, path)
    stemmer = params['stemmer']
    bigrams = params['bigrams']
    trigrams = params['trigrams']
    print 'calc_dim(): stemmer: %s bigrams: %s trigrams: %s' % (
        stemmer, bigrams, trigrams)

    exclude = util.get_all_exclude_regex(c)
    include = util.get_all_include_regex(c)

    stemmers = {
        'porter': nltk.PorterStemmer(),
        'lancaster': nltk.LancasterStemmer()
    }
    try:
        stemmer = stemmers[stemmer]
    except KeyError:
        print 'unsupported stemmer:', stemmer
        return 1

    num_doc = 0
    c.execute('SELECT ED_ENC_NUM FROM Documents')
    for doc in c:
        num_doc += 1

    cmd = 'SELECT ED_ENC_NUM, NOTE_TEXT, Score FROM Documents'
    if limit:
        cmd += ' LIMIT %d' % limit
        num_doc = min(limit, num_doc)
    c.execute(cmd)

    all_words = set()
    all_bigrams = set()
    all_trigrams = set()
    all_inclusions = set()
    all_doc = []
    for i, (num, raw, score) in enumerate(c):
        if i % 100 == 0:
            print 'calc_dim(): processing document %s (%d/%d)' % (
                str(num), i + 1, num_doc)
        proc = process_document(raw, stemmer, include, pos_tag, bigrams,
                                trigrams)
        all_words = all_words.union(set(proc['stemmed']))
        all_inclusions = all_inclusions.union(set(proc['inclusions']))
        if bigrams:
            all_bigrams = all_bigrams.union(set(proc['bigrams']))
        if trigrams:
            all_trigrams = all_trigrams.union(set(proc['trigrams']))
        all_doc.append(num)

    all_words = list(all_words)
    all_words.sort()
    all_bigrams = list(all_bigrams)
    all_bigrams.sort()
    all_trigrams = list(all_trigrams)
    all_trigrams.sort()
    all_inclusions = list(all_inclusions)
    all_inclusions.sort()

    init_dim(c)
    populate_dim(c, all_words, all_bigrams, all_trigrams, all_inclusions,
                 exclude)

    c.execute("SELECT COUNT(*) FROM Dimensions")
    nDims = int(c.fetchone()[0])

    #
    # Save and exit.
    #
    c.close()
    conn.commit()
def run_step(self, prev, params):
    df = prev
    custom_params, url = get_params(params.get('url')), params.get('url')

    names = {
        'municipality_code': 'mun_id',
        'state_code': 'ent_id',
        'foreign_destination_origin': 'partner_country',
        'trade_flow': 'flow_id',
        'product_2d': 'hs2_id',
        'product_4d': 'hs4_id',
        'product': 'hs6_id'
    }
    df.rename(columns=names, inplace=True)

    if 'unanonymized' in params.get('table'):
        logging.debug('Unanonymized values...')
        df.drop(columns=['value'], inplace=True)
        df.rename(columns={'unanonymized_value': 'value'}, inplace=True)
    else:
        logging.debug('Anonymized values...')
        df.drop(columns=['unanonymized_value'], inplace=True)

    # drop censored ('C') and non-positive values
    df.value.replace('C', np.nan, inplace=True)
    df.value = df.value.astype('float')
    df = df.loc[df.value > 0].copy()

    # iso3 names
    df['partner_country'] = df['partner_country'].str.lower()

    # fill columns
    level = ['hs6_id', 'hs4_id', 'hs2_id']
    for i in level:
        if i != custom_params['depth']:
            df[i] = 0

    # drop date, create time dimension
    for k, v in custom_params['datetime'].items():
        df[k] = v
    df.drop(columns='date', inplace=True)

    # hs codes
    df[custom_params['depth']] = df[custom_params['depth']].astype(
        'str').str.zfill(get_number(custom_params['depth']))
    for row in df[custom_params['depth']].unique():
        df[custom_params['depth']].replace(row, hs6_converter(row),
                                           inplace=True)

    for col in df.columns[df.columns != 'partner_country']:
        df[col] = df[col].astype('float').round(0).astype('int')

    # drop null trade values
    df.dropna(subset=['value'], inplace=True)

    # national ent id
    if 'National' in url:
        df['ent_id'] = 0

    # explicit level name
    df['level'] = int(custom_params['level'][2])
    df['product_level'] = int(
        re.findall(r"(\d){1}", custom_params['depth'])[0])

    # debug
    df['url'] = url

    return df
def run_step(self, prev, params):
    df = prev
    custom_params, url = get_params(params.get('url')), params.get('url')

    names = {
        'municipality_code': 'mun_id',
        'state_code': 'ent_id',
        'foreign_destination_origin': 'partner_country',
        'trade_flow': 'flow_id',
        'product_2d': 'hs2_id',
        'product_4d': 'hs4_id',
        'product': 'hs6_id'
    }
    df.rename(columns=names, inplace=True)

    if 'unanonymized' in params.get('table'):
        logging.debug('Unanonymized values...')
        df.drop(columns=['value'], inplace=True)
        df.rename(columns={'unanonymized_value': 'value'}, inplace=True)
    else:
        logging.debug('Anonymized values...')
        df.drop(columns=['unanonymized_value'], inplace=True)

    # drop censored ('C') and non-positive values
    df.value.replace('C', np.nan, inplace=True)
    df.value = df.value.astype('float')
    df = df.loc[df.value > 0].copy()

    # iso3 names
    df['partner_country'] = df['partner_country'].str.lower()

    # fill columns
    level = ['hs6_id', 'hs4_id', 'hs2_id']
    for i in level:
        if i != custom_params['depth']:
            df[i] = 0

    # drop date, create time dimension
    for k, v in custom_params['datetime'].items():
        df[k] = v
    df.drop(columns='date', inplace=True)

    # hs codes
    df[custom_params['depth']] = df[custom_params['depth']].astype(
        'str').str.zfill(get_number(custom_params['depth']))
    for row in df[custom_params['depth']].unique():
        df[custom_params['depth']].replace(row, hs6_converter(row),
                                           inplace=True)

    for col in df.columns[df.columns != 'partner_country']:
        df[col] = df[col].astype('float').round(0).astype('int')

    # drop null trade values
    df.dropna(subset=['value'], inplace=True)

    # national ent id
    if 'National' in url:
        df['nat_id'] = "mex"

    # explicit level name
    df['product_level'] = int(
        re.findall(r"(\d){1}", custom_params['depth'])[0])

    # debug
    df['url'] = url

    for id_ in ["ent_id", "mun_id"]:
        if id_ in df.columns:
            df.loc[df[id_] == 0, id_] = 33000 if id_ == 'mun_id' else 33

    # normalize legacy/non-standard iso3 codes
    df.loc[df.partner_country == "zya", "partner_country"] = "nld"
    df.loc[df.partner_country == "rom", "partner_country"] = "rou"
    df.loc[df.partner_country == "cia", "partner_country"] = "vat"
    df.loc[df.partner_country == "cur", "partner_country"] = "cuw"
    df.loc[df.partner_country == "cxi", "partner_country"] = "cxr"
    df.loc[df.partner_country == "dsm", "partner_country"] = "fsm"
    df.loc[df.partner_country == "fxa", "partner_country"] = "atf"
    df.loc[df.partner_country == "lhm", "partner_country"] = "hmd"
    df.loc[df.partner_country == "pty", "partner_country"] = "pcz"
    df.loc[df.partner_country == "xch", "partner_country"] = "iot"

    # Removing firms variable (requested)
    df.drop("firms", axis=1, inplace=True)

    return df
def mrmr(path, temp_dir):
    conn = sqlite3.connect(path)
    c = conn.cursor()
    c.execute('SELECT DimensionId FROM Dimensions')
    dimension_ids = [d[0] for d in c.fetchall()]

    mrmr_tmp = P.join(temp_dir, "mrmr-in.csv")
    fout = open(mrmr_tmp, 'w')
    fout.write(','.join(['Class'] + map(str, dimension_ids)) + '\n')

    #
    # Output the CSV file for the mRMR utility.
    #
    c.execute('SELECT ED_ENC_NUM, Score FROM Documents')
    num_doc = 0
    for doc_id, score in c:
        #
        # Feature selection can only take place when we have labelled samples.
        #
        assert score in (-100, 100)
        c_inner = conn.cursor()
        nonzero = {}
        #
        # TODO: ignore disabled dimensions?
        #
        c_inner.execute("""SELECT DimensionId, Count
                        FROM DocumentsToDimensions
                        WHERE ED_ENC_NUM = ?""", (doc_id,))
        for dim_id, count in c_inner:
            nonzero[dim_id] = count
        values = [str(score / 100)]
        for dim in dimension_ids:
            if dim in nonzero:
                values.append(nonzero[dim])
            else:
                values.append(0)
        fout.write(','.join(map(str, values)) + '\n')
        num_doc += 1
    fout.close()

    #
    # Run the mRMR utility.
    #
    params = util.get_params(c, path)
    cmd = [MRMR, '-i', mrmr_tmp, '-s', str(num_doc),
           '-v', str(len(dimension_ids))] + params['MRMR'].split(' ')
    print 'command line:', ' '.join(cmd)
    p = sub.Popen(cmd, bufsize=1, stdout=sub.PIPE, stderr=sub.STDOUT)

    #
    # This blocks until the underlying process completes, so can appear
    # unresponsive.  Don't do this.
    #
    #   stdout, stderr = p.communicate()
    #
    # Parse the output, enable/disable the required features.
    # There's a warning about buffers filling up and blocking the process if
    # things are done this way
    # (http://docs.python.org/library/subprocess.html#subprocess.Popen.kill).
    # However, in our case, there isn't THAT much data to worry about -- it's
    # more important to output it as it's coming in so it looks like the
    # application is actually doing something.
    #
    result = {}
    regex = re.compile('(\\d+) \t (\\d+) \t (\\d+) \t (\\d+\\.\\d+)')
    #
    # Argh, stdout is still being buffered...
    # TODO: try https://bitbucket.org/geertj/winpexpect/wiki/Home
    #
    while True:
        line = p.stdout.readline()
        if not line:
            break
        print line,
        match = regex.search(line)
        if not match:
            continue
        order, fea, name, score = match.groups()
        result[int(order)] = (int(fea), int(name), float(score))
    print

    selected = sorted([(result[k][1], result[k][2]) for k in result])
    c.execute('UPDATE Dimensions SET Exclude = 1')
    for dim in selected:
        c.execute(
            'UPDATE Dimensions SET Exclude = 0, MRMR = ? WHERE DimensionId = ?',
            (dim[1], dim[0]))
    conn.commit()
    output_dim_table(c)
    c.close()
import numpy as np
import pylab as plt
import glob
import scipy.ndimage

import util

theta_coordinates = np.deg2rad(
    [-146, 0, 45, 90, 0, 180, 170, 190, 200, 0, 270, 315])  #np.arange(0, 360, 45))
theta_coordinates = np.deg2rad([0, 45, 90, 135, 180, 225, 270, 315])
print theta_coordinates

params = util.get_params(location="LaSilla")
ff = params['ff']
k1 = params['k1']
k2 = params['k2']
r0 = params['r0']
cx = params['cx']
cy = params['cy']
north = params['north']
deltatetha = params['deltatetha']

coordinatesx = np.cos(north + theta_coordinates) * r0 + cx
coordinatesy = np.sin(north + theta_coordinates) * r0 + cy

list_of_image = glob.glob("current*.JPG")
for fnimg in list_of_image:
    im = scipy.ndimage.imread(fnimg)
    ar = np.array(im)
from util import get_params, trade, calculate_lt_returns

#get parameters
param = get_params()

sim_results = []
dist_results = []
day_totals = []
sim_counter = 0

for i in range(param.n_sims):
    sim_counter += 1
    start_value = param.portfolio
    lt_alloc = param.lt_alloc
    st_alloc = param.st_alloc
    cash = param.cash_alloc
    lt_holding_pct = param.lt_holding_pct
    lt_holding_amt = 1 / lt_holding_pct
    rebalance = param.rebalance_period
    lt_holdings = []
    lt_returns = 0
    end_sim = False

    #ensure allocations equal 100% of portfolio
    if lt_alloc + st_alloc + cash != 1:
        print('ERROR: Allocations must equal 100%')
        exit()

    #print initial parameters
    print('\n' + '-' * 61)
    print('Initial Simulation Parameters:')
    print('Starting portfolio value: ${:.2f}'.format(start_value))
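# Note: the exact `lt_alloc + st_alloc + cash != 1` check above can trip on
# floating-point rounding (e.g. 0.6 + 0.3 + 0.1 != 1.0 in binary floats).
# A tolerance-based variant, sketched here with the same field names; the
# helper itself is illustrative, not part of the original script.
import math


def allocations_valid(lt_alloc, st_alloc, cash_alloc):
    return math.isclose(lt_alloc + st_alloc + cash_alloc, 1.0,
                        rel_tol=0.0, abs_tol=1e-9)


assert allocations_valid(0.6, 0.3, 0.1)      # passes despite rounding
assert not allocations_valid(0.5, 0.3, 0.1)  # genuinely mis-specified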