def handle_import_dataset(self):
    msgBox = tk.messagebox.askyesnocancel(
        "Import Dataset", "Does your dataset contain multiple signals?")
    if msgBox is None:
        return

    path, dataset, datasetex = self.select_raw_data(msgBox)
    if path is None:
        return

    # get list of files to import: a single file, or every .csv in the chosen folder
    files_to_import = []
    if not msgBox:
        files_to_import.append(path)
    else:
        folder_contents = os.listdir(path)
        csv_files = [
            os.path.join(path, item) for item in folder_contents
            if item.endswith(".csv")
        ]
        files_to_import += csv_files

    # import and load dataset, unless it has already been imported
    if not datasetex:
        labelfilenames = [
            _folder.file_abspath('common', labelfile) for labelfile in [
                'labels_SWaN.csv', 'labels_MUSS.csv', 'labels_ambsed.csv',
                'labels_goodbad.csv', 'labels_unknown.csv'
            ]
        ]
        import import_dataset
        import_dataset.main(files_to_import,
                            name=dataset,
                            labelfilenames=labelfilenames)
    else:
        print('dataset %s already imported' % dataset)
def main(filenames, *, name=None, labelfilenames=None, zoom=None,
         mag=DEFAULT_MAGNITUDE, sample=None, day=None):
    if len(filenames) > 1 and not name:
        _helper.errorExit(
            'Must specify a custom dataset --name when importing multiple files')

    if mag <= 0:
        _helper.errorExit('magnitude must be positive')

    if sample is not None and day is not None:
        _helper.errorExit('Can only provide one of --sample and --day')

    start_sample, end_sample = None, None
    if sample is not None:
        start_sample, end_sample = parseRange('sample', sample)

    start_day, end_day = None, None
    if day is not None:
        start_day, end_day = parseRange('day', day)

    # load labels
    if not labelfilenames:
        labelfilenames = [
            _folder.file_abspath('common', 'labels_test.csv'),
            _folder.file_abspath('common', 'labels_unknown.csv')
        ]

    labels = []
    labels_names = set()
    for labelfile in labelfilenames:
        print('Reading labels from %s...' % labelfile)
        with open(labelfile, 'rt') as csvfile:
            reader = csv.DictReader(csvfile)
            if set(reader.fieldnames) != set(['label', 'red', 'green', 'blue']):
                _helper.errorExit('Incorrect label csv headers')

            for row in reader:
                label_name = row['label'].strip()
                rr = float(row['red'].strip())
                gg = float(row['green'].strip())
                bb = float(row['blue'].strip())

                if re.search(r'[^\w\- ]', label_name, re.ASCII):
                    _helper.errorExit(
                        'Only alphanumeric, underscore, dash, and space allowed in label names: '
                        + label_name)

                if label_name in labels_names:
                    _helper.errorExit('Duplicate label: ' + label_name)

                labels.append((label_name, rr, gg, bb))
                labels_names.add(label_name)

    # process arguments
    signal_names = []
    for filename in filenames:
        signal_names.append(_helper.makeIdFromFilename(filename))
    if len(signal_names) != len(set(signal_names)):
        _helper.errorExit('Duplicate signal names')

    if name:
        if not _helper.checkId(name, False):
            _helper.errorExit(
                'Only alphanumeric and underscore allowed in dataset names')
        dataset = name
    else:
        dataset = signal_names[0]

    if start_sample is not None or end_sample is not None:
        dataset = dataset + strRange('sample', start_sample, end_sample)
    if start_day is not None or end_day is not None:
        dataset = dataset + strRange('day', start_day, end_day)

    out_folder = _helper.datasetDir(dataset)
    tile_folder = _helper.datasetTileDir(dataset)

    if os.path.exists(out_folder):
        _helper.errorExit('Please remove output folder ' + out_folder)

    print('Using output folder ' + out_folder)
    _helper.ensureDirExists(out_folder, False)
    _helper.ensureDirExists(tile_folder, False)

    # read in data
    print('reading header...')

    # open files (gzipped csv is supported)
    csvfiles = []
    for filename in filenames:
        if filename.endswith('.gz'):
            use_open = gzip.open
        else:
            use_open = open
        csvfiles.append(use_open(filename, 'rt'))

    # read headers; all files must share one sample rate
    files_start_ms = []
    dataset_rate = None
    for filename, csvfile in zip(filenames, csvfiles):
        header_rate, header_start_ms = _helper.process_actigraph_header(csvfile)
        if dataset_rate is None:
            dataset_rate = int(header_rate)
        elif dataset_rate != int(header_rate):
            _helper.errorExit('Multiple sample rates found')
        files_start_ms.append(header_start_ms)

    # determine sample range
    dataset_start_ms = min(files_start_ms)
    dataset_start_date = datetime.datetime.utcfromtimestamp(
        dataset_start_ms / 1000).date()

    if start_sample is not None or end_sample is not None:
        # sample range was given directly; nothing to convert
        pass

    if start_day is not None or end_day is not None:
        # convert the day range into a sample range relative to the dataset start
        if start_day is not None:
            output_min_ms = 1000 * calendar.timegm(
                (dataset_start_date +
                 datetime.timedelta(days=(start_day - 1))).timetuple())
            start_sample = (max(output_min_ms, dataset_start_ms) -
                            dataset_start_ms) * dataset_rate / 1000
            if start_sample != int(start_sample):
                _helper.errorExit('day start sample error')
            start_sample = int(start_sample)
        else:
            start_sample = None

        if end_day is not None:
            output_max_ms = 1000 * calendar.timegm(
                (dataset_start_date +
                 datetime.timedelta(days=(end_day))).timetuple())
            end_sample = (output_max_ms -
                          dataset_start_ms) * dataset_rate / 1000
            if end_sample != int(end_sample):
                _helper.errorExit('day end sample error')
            end_sample = int(end_sample)
        else:
            end_sample = None

    # determine starting day index
    start_day_index = 1
    if start_sample:
        start_day_index = 1 + (datetime.datetime.utcfromtimestamp(
            dataset_start_ms / 1000 + start_sample / dataset_rate).date() -
                               dataset_start_date).days

    # print header summary
    if len(filenames) > 1:
        for filename, signalname, file_start_ms in zip(
                filenames, signal_names, files_start_ms):
            print('file start: ',
                  _helper.timeMillisecondToTimeString(file_start_ms),
                  signalname, filename)
    print('input start: ',
          _helper.timeMillisecondToTimeString(dataset_start_ms), dataset)

    # read data
    sample_len = 3 * len(filenames)
    sample_data = []

    min_smp = 1e100
    max_smp = -1e100

    for fileindex, (filename, file_start_ms, csvfile) in enumerate(
            zip(filenames, files_start_ms, csvfiles)):
        print('reading ' + filename + '...')

        # Check if the csv header is absent and add one if needed
        csvstartpos = csvfile.tell()
        firstrow = next(csvfile)
        csvfile.seek(csvstartpos)

        fieldnames = None
        if 'Accelerometer' not in firstrow:
            # No headers present
            DEFAULT_FIELDNAMES = [
                'Timestamp', 'Accelerometer X', 'Accelerometer Y',
                'Accelerometer Z'
            ]
            no_of_fields = len(firstrow.split(','))
            if no_of_fields == 4:
                fieldnames = DEFAULT_FIELDNAMES
            elif no_of_fields == 3:
                fieldnames = DEFAULT_FIELDNAMES[1:]
            else:
                _helper.errorExit(
                    'missing header has unrecognized number of fields')

        if fieldnames is not None:
            _helper.warning('input file missing field names, using ' +
                            ','.join(fieldnames))

        reader = csv.DictReader(csvfile, fieldnames=fieldnames)

        if 'Timestamp' in reader.fieldnames:
            _helper.warning(
                'input file has Timestamp field, but it will be ignored')

        # process rows
        reader_sample_index = 0

        # offset of this file's first sample relative to the dataset start
        # (shifted by the requested start sample, if any)
        sample_offset = (file_start_ms -
                         dataset_start_ms) * dataset_rate / 1000
        if sample_offset != int(sample_offset):
            _helper.errorExit('sample offset error')
        sample_offset = int(sample_offset)

        if start_sample is not None:
            sample_offset -= start_sample

        for row in reader:
            data_sample_index = reader_sample_index + sample_offset
            reader_sample_index += 1

            if data_sample_index < 0:
                continue
            if end_sample is not None and data_sample_index >= end_sample - (
                    start_sample if start_sample is not None else 0):
                break

            x = float(row['Accelerometer X'])
            y = float(row['Accelerometer Y'])
            z = float(row['Accelerometer Z'])

            min_smp = min(min_smp, x, y, z)
            max_smp = max(max_smp, x, y, z)

            while data_sample_index >= len(sample_data):
                sample_data.append([None] * sample_len)

            sample_data[data_sample_index][3 * fileindex + 0] = x
            sample_data[data_sample_index][3 * fileindex + 1] = y
            sample_data[data_sample_index][3 * fileindex + 2] = z

            if reader_sample_index % (60 * 60 * dataset_rate) == 0:
                print('read %d hours...' %
                      (reader_sample_index / (60 * 60 * dataset_rate)))

    if min_smp < -mag or mag < max_smp:
        _helper.warning('sample exceeds magnitude')

    output_start_ms = dataset_start_ms
    if start_sample is not None:
        output_start_ms_offset = start_sample * 1000 / dataset_rate
        if output_start_ms_offset != int(output_start_ms_offset):
            _helper.errorExit('output start offset sample error')
        output_start_ms += int(output_start_ms_offset)
    output_end_ms = output_start_ms + (len(sample_data) -
                                       1) * 1000 / dataset_rate

    # figure out max zoom level, if needed
    if zoom is None:
        for zz in range(10):
            zoom = zz
            if len(sample_data) / math.pow(SUBSAMPLE, zz + 1) <= 2 * TILE_SIZE:
                break

    # print summary
    print('length: ', len(sample_data))
    print('rate: ', dataset_rate)
    print('max zoom: ', zoom)
    print('output start: ',
          _helper.timeMillisecondToTimeString(output_start_ms))
    print('output end: ', _helper.timeMillisecondToTimeString(output_end_ms))

    # write tiles
    for zoom_level in range(zoom + 1):
        print('writing zoom %d...' % zoom_level)

        zoom_subsample = SUBSAMPLE**zoom_level
        zoom_tile_size = TILE_SIZE * zoom_subsample

        ntiles = int(len(sample_data) / zoom_tile_size)
        if len(sample_data) % zoom_tile_size != 0:
            ntiles += 1

        for tt in range(ntiles):
            tile_id = 'z%02dt%06d' % (zoom_level, tt)
            outfilename = os.path.join(tile_folder, tile_id + '.json')
            with open(outfilename, 'wt') as outfile:
                write_startfile(outfile, zoom_subsample,
                                dataset + ':' + tile_id)

                prev = False
                for ss in range(tt * TILE_SIZE, (tt + 1) * TILE_SIZE + 1):
                    rangesmp = sample_data[ss * zoom_subsample:(ss + 1) *
                                           zoom_subsample]
                    write_sample(outfile, rangesample(rangesmp, sample_len),
                                 prev, sample_len)
                    prev = True

                write_endfile(outfile)

            if (tt + 1) % 1000 == 0:
                print('wrote %d tiles...' % (tt + 1))

    print('writing origin...')
    outfilename = _helper.datasetOriginFilename(dataset)
    with open(outfilename, 'wt') as outfile:
        outfile.write('{\n')
        outfile.write('    "origin": %s\n' % json.dumps(filenames))
        outfile.write('}\n')

    print('writing config...')
    outfilename = _helper.datasetConfigFilename(dataset)
    with open(outfilename, 'wt') as outfile:
        outfile.write('{\n')
        outfile.write('    "title": "%s",\n' % dataset)
        outfile.write('    "tile_size": %d,\n' % TILE_SIZE)
        outfile.write('    "tile_subsample": %d,\n' % SUBSAMPLE)
        outfile.write('    "zoom_max": %d,\n' % zoom)
        outfile.write('    "length": %d,\n' % len(sample_data))
        outfile.write('    "start_time_ms": %s,\n' % output_start_ms)
        outfile.write('    "sample_rate": %d,\n' % dataset_rate)
        outfile.write('    "start_day_idx": %d,\n' % start_day_index)
        outfile.write('    "magnitude": %d,\n' % mag)
        outfile.write('    "signals": ["%s"],\n' % ('", "'.join(signal_names)))
        outfile.write('    "labels": [\n')
        for ii, (ll, rr, gg, bb) in enumerate(labels):
            outfile.write(
                '        { "label": "%s", "color": [ %0.2f, %0.2f, %0.2f ] }%s\n'
                % (ll, rr, gg, bb, ',' if ii + 1 < len(labels) else ''))
        outfile.write('    ]\n')
        outfile.write('}\n')

    print('dataset written to ' + out_folder)
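# Usage sketch (illustrative only, not part of the original source): one way
# the entry point above might be invoked directly. The file paths and dataset
# name below are placeholders that do not exist in this repository.
if __name__ == '__main__':
    # Single ActiGraph export: the dataset name is derived from the filename,
    # and the default label files and magnitude are used.
    main(['data/P01_wrist.csv'])

    # Multiple signals require an explicit dataset name; gzipped csv is accepted.
    main(['data/P01_wrist.csv', 'data/P01_ankle.csv.gz'], name='P01_multi')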
def _process_request(self, path, vars):
    global _debug_delay
    if _debug_delay:
        time.sleep(_debug_delay)

    if path == '/signaligner.html':
        if 'dataset' in vars and ALNUMUN_RE.match(vars['dataset']):
            dataset = vars['dataset']
        else:
            dataset = 'null'

        if 'session' in vars and ALNUMUN_RE.match(vars['session']):
            session = vars['session']
        else:
            session = SESSION_ERROR

        def replace_data(data):
            data = replace_vars(data, session, False)
            return data

        self._send_header_and_file_data(
            _folder.file_abspath('signaligner/signaligner.html'), False,
            CTYPE_HTML, replace_data)

    elif path == '/signaligner.js':
        def replace_data(data):
            data = replace_mode_config(data)
            return data

        self._send_header_and_file_data(
            _folder.file_abspath('signaligner/signaligner.js'), False,
            CTYPE_JS, replace_data)

    elif path == '/fetchdatasetlist':
        datasets = _helper.getDatasetList()
        self._send_header(200, CTYPE_PLAIN)
        self._send_data(json.dumps(datasets), False)

    elif path == '/fetchdataset':
        if 'dataset' in vars and ALNUMUN_RE.match(vars['dataset']):
            dataset_name = vars['dataset']

            if 'type' in vars and vars['type'] == 'config':
                file_path = _helper.datasetConfigFilename(dataset_name)
            elif ('type' in vars and vars['type'] == 'tile' and 'id' in vars
                  and ALNUMUN_RE.match(vars['id'])):
                file_path = os.path.join(
                    _helper.datasetTileDir(dataset_name),
                    vars['id'] + '.json')
            else:
                self._send_header(404, CTYPE_PLAIN)
                return

            if not os.path.exists(file_path):
                self._send_header(404, CTYPE_PLAIN)
                return

            self._send_header_and_file_data(file_path, False, CTYPE_PLAIN)
        else:
            self._send_header(404, CTYPE_PLAIN)

    elif path == '/fetchlabels':
        if 'dataset' in vars and ALNUMUN_RE.match(vars['dataset']):
            dataset = vars['dataset']
            self._send_header(200, CTYPE_PLAIN)
            labels = _helper.getLabelsLatest(dataset)
            if labels:
                self._send_data(json.dumps(labels), False)
        else:
            self._send_header(404, CTYPE_PLAIN)

    elif path == '/reportlabels':
        if 'data' in vars:
            data = json.loads(vars['data'])
            if ('dataset' in data and ALNUMUN_RE.match(data['dataset'])
                    and 'session' in data
                    and ALNUMUN_RE.match(data['session'])):
                dataset = data['dataset']
                session = data['session']

                # append to the session log and overwrite the latest labels
                with open(
                        _helper.ensureDirExists(
                            _helper.logLabelsFilename(dataset, session),
                            True), 'at') as dfile:
                    dfile.write(json.dumps(data) + '\n')
                with open(
                        _helper.ensureDirExists(
                            _helper.latestLabelsFilename(dataset, session),
                            True), 'wt') as dfile:
                    dfile.write(json.dumps(data) + '\n')

                # read the latest labels back and echo them to the client
                with open(
                        _helper.ensureDirExists(
                            _helper.latestLabelsFilename(dataset, session),
                            True), 'rt') as dfile:
                    response = json.loads(dfile.read())

                self._send_header(200, CTYPE_PLAIN)
                self._send_data(json.dumps(response), False)
            else:
                self._send_header(404, CTYPE_PLAIN)
        else:
            self._send_header(404, CTYPE_PLAIN)

    elif path == '/mturksubmit' or path == '/mturksubmissions':
        if 'data' in vars:
            data = json.loads(vars['data'])
            if ('dataset' in data and ALNUMUN_RE.match(data['dataset'])
                    and 'session' in data
                    and ALNUMUN_RE.match(data['session'])):
                dataset = data['dataset']
                session = data['session']

                if path == '/mturksubmit':
                    # record the submission only once per dataset/session
                    mturk_submit = _helper.mturkSubmitLabelsFilename(
                        dataset, session)
                    if not os.path.exists(mturk_submit):
                        with open(
                                _helper.ensureDirExists(mturk_submit, True),
                                'wt') as dfile:
                            dfile.write(json.dumps(data) + '\n')

                submissions = _helper.mturkGetSubmissions(session)

                total = 0
                datasets = []
                for submission in submissions:
                    score = submission['score'] / 100.0
                    score = score**2
                    score *= submission['daysofdata']

                    # minimum of 1 cent for tutorial levels, 20 cents for challenge
                    score = max(score, 0.20)
                    if submission['istutorial']:
                        score *= 0.05

                    total += score
                    datasets.append(submission['dataset'])
                total = int(total * 100)

                if session not in _mturk_session_codes:
                    _mturk_session_codes[session] = _helper.makeId()[:3]
                code = _mturk_session_codes[session]
                code = code + ('%03d' % total).upper()
                code = code + hashlib.md5(
                    code.encode('utf-8')).hexdigest()[:3].upper()

                response = {
                    'amount': '$%d.%02d' % (total // 100, total % 100),
                    'code': code,
                    'datasets': datasets
                }

                self._send_header(200, CTYPE_PLAIN)
                self._send_data(json.dumps(response), False)
            else:
                self._send_header(404, CTYPE_PLAIN)
        else:
            self._send_header(404, CTYPE_PLAIN)

    elif path == '/log':
        if 'data' in vars:
            with open(
                    _helper.ensureDirExists(
                        _folder.data_abspath('playlog'), True),
                    'at') as dfile:
                dfile.write(vars['data'] + '\n')
        self._send_header(200, CTYPE_PLAIN)

    elif HTML_RE.match(path):
        if path == '/mturk_start.html':
            global _mode
            if _mode != 'MTURK':
                self._send_header(200, CTYPE_PLAIN)
                self._send_data(
                    'mode must be MTURK to request mturk_start.html', False)
                return

        if 'session' in vars and ALNUMUN_RE.match(vars['session']):
            session = vars['session']
        else:
            session = SESSION_ERROR

        def replace_data(data):
            return replace_vars(data, session, True)

        self._send_header_and_file_data(
            _folder.file_abspath('static' + path), False, CTYPE_HTML,
            replace_data)

    elif PNG_RE.match(path):
        self._send_header_and_file_data(
            _folder.file_abspath('static' + path), True, CTYPE_PNG)

    elif JS_RE.match(path):
        self._send_header_and_file_data(
            _folder.file_abspath('static' + path), False, CTYPE_JS)

    elif CSS_RE.match(path):
        self._send_header_and_file_data(
            _folder.file_abspath('static' + path), False, CTYPE_CSS)

    else:
        self._send_header(404, CTYPE_PLAIN)
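# Client sketch (illustrative only, not part of the original source): fetching
# dataset JSON from the handler above. The host, port, and dataset name are
# placeholders, and it is assumed here that `vars` is populated from the query
# string of a GET request; neither detail is confirmed by this excerpt.
if __name__ == '__main__':
    import json
    import urllib.request

    BASE = 'http://localhost:3007'

    def fetch(path):
        with urllib.request.urlopen(BASE + path) as resp:
            return resp.read().decode('utf-8')

    # List available datasets, then pull one dataset's config and its first
    # full-resolution tile (tile ids follow the 'z%02dt%06d' pattern above).
    print(fetch('/fetchdatasetlist'))
    config = json.loads(fetch('/fetchdataset?dataset=P01_multi&type=config'))
    tile = json.loads(
        fetch('/fetchdataset?dataset=P01_multi&type=tile&id=z00t000000'))
    print(config['zoom_max'], config['length'], config['sample_rate'])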
def datasetDir(dataset):
    # prefer a dataset bundled under common/, otherwise use the data folder
    common_dir = _folder.file_abspath('common', 'datasets', dataset)
    if os.path.exists(common_dir):
        return common_dir
    return _folder.data_abspath(_get_dataset_folder(), dataset)
def handle_run_algo(self):
    if self.datasetSelected is None:
        tk.messagebox.showerror("Alert", "Please import a dataset first.")
        return

    import import_mhealth
    import main
    import import_labels

    algorithm = self.algorithmSelected.get()
    swan = algorithm == 'SWaN'
    muss = algorithm == 'MUSS'
    qc = algorithm == 'QC'

    dataset = self.datasetSelected.get()
    dataset_raw_csv_paths = get_dataset_raw_file_paths(dataset)
    csv_selected = self.algorithmCSVSelected.get()

    # select the raw csv files the algorithm should run on
    run_algo_csv_list = []
    missing_files = []
    for filepath in dataset_raw_csv_paths:
        if csv_selected == "ALL" or _helper.makeIdFromFilename(
                filepath) == csv_selected:
            run_algo_csv_list.append(filepath)

    for filepath in run_algo_csv_list:
        if not os.path.exists(filepath):
            missing_files.append(filepath)
        else:
            signal_name = _helper.makeIdFromFilename(filepath)
            mhealth_folder = mhealthfolder(dataset, signal_name)
            algo_folder = algofolder(dataset, signal_name)

            # convert the raw csv to mhealth format if not already done
            if not os.path.exists(mhealth_folder):
                import_mhealth.main(filepath, mhealth_folder)

            # run the selected algorithm from inside the mdcas-python folder
            old_cwd = os.path.abspath(os.path.realpath(os.getcwd()))
            os.chdir(_folder.file_abspath('mdcas-python'))
            main.main(mhealth_folder + '/default/',
                      algo_folder + '/default/',
                      80,
                      profiling=False,
                      swan=swan,
                      muss=muss,
                      qc=qc)
            os.chdir(old_cwd)

            # import the algorithm output as labels for this signal
            if swan:
                print("Running SWaN algorithm...")
                import_labels.main(dataset,
                                   algo_folder + '/default/SWaN_output.csv',
                                   source='Algo',
                                   session='SWaN_' + signal_name)
            elif muss:
                print("Running MUSS algorithm...")
                import_labels.main(dataset,
                                   algo_folder + '/default/muss_output.csv',
                                   source='Algo',
                                   session='MUSS_' + signal_name)
            elif qc:
                print("Running QC algorithm...")
                import_labels.main(dataset,
                                   algo_folder + '/default/qc_output.csv',
                                   source='Algo',
                                   session='QC_' + signal_name,
                                   qcfix=True)

    if len(run_algo_csv_list) > 0:
        tk.messagebox.showinfo(
            "Run Algorithm",
            "Algorithm labels successfully added for the following files: " +
            ", ".join([
                _helper.makeIdFromFilename(file) for file in run_algo_csv_list
            ]))
    if len(missing_files) > 0:
        tk.messagebox.showerror(
            "Run Algorithm",
            "The algorithm was not run on the following missing files. Please "
            "move the files back to their locations from when the dataset was "
            "imported: " + ", ".join(missing_files))