def load_data(
    input_dir=None,
    split='train',
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load data for a specific split

    If input_dir is not provided, loads X and y for the given split from the
    default location (S3). If input_dir is provided, loads the entities/targets
    tables from their default table names from the given directory, ignoring
    split. For feature development, only the train split should be used.
    """
    config = load_config()
    tables = config.data.tables
    entities_table_name = config.data.entities_table_name
    entities_config = some(where(tables, name=entities_table_name))
    targets_table_name = config.data.targets_table_name
    targets_config = some(where(tables, name=targets_table_name))
    if input_dir is None:
        bucket = config.data.s3_bucket
        split_path = config.data.splits.get(split)
        input_dir = f's3://{bucket}/{split_path}'
    X = load_table_from_config(input_dir, entities_config)
    y = load_table_from_config(input_dir, targets_config)
    return X, y
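# A minimal usage sketch for the loader above (an assumption, not part of the original
# snippet: it presumes the project config with data.tables / data.splits and S3 access
# are already in place; the local path below is a made-up example).
X_train, y_train = load_data()                    # default: train split from S3
X_dev, y_dev = load_data(input_dir='./data/dev')  # explicit directory, split ignored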
def load_data(input_dir=None):
    """Load data"""
    if input_dir is not None:
        tables = config.get('data.tables')
        entities_table_name = config.get('data.entities_table_name')
        entities_config = some(where(tables, name=entities_table_name))
        X = load_table_from_config(input_dir, entities_config)
        targets_table_name = config.get('data.targets_table_name')
        targets_config = some(where(tables, name=targets_table_name))
        y = load_table_from_config(input_dir, targets_config)
    else:
        raise NotImplementedError
    return X, y
def split_date(spec):
    if spec['is_continuous'] or not spec['end_date']:
        yield spec
        return

    start_date = maybe_parse_date(spec['start_date'])
    end_date = maybe_parse_date(spec['end_date']) or start_date
    days = int((end_date - start_date).total_seconds() / (60 * 60 * 24))

    schedules = spec['schedules'] or [{
        'days_of_week': [0, 1, 2, 3, 4, 5, 6],
        'start_time': spec['start_time'],
        'end_time': spec['end_time'],
    }]

    for day in range(days + 1):
        this_date = start_date + timedelta(days=day)
        schedule = some(
            lambda s: this_date.isoweekday() in s['days_of_week'],
            schedules
        )
        if schedule:
            date_string = datetime.combine(this_date, time(0, 0)).isoformat()
            yield {
                'is_continuous': False,
                'start_date': date_string,
                'end_date': date_string,
                'start_time': schedule['start_time'],
                'end_time': schedule['end_time'],
            }
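# A small illustration of how split_date expands a multi-day spec into per-day specs
# (hedged: the spec values are made-up examples and maybe_parse_date is assumed to
# accept ISO date strings; with no schedules given, the every-day fallback applies).
week_long_spec = {
    'is_continuous': False,
    'start_date': '2021-06-01',
    'end_date': '2021-06-07',
    'start_time': '09:00',
    'end_time': '17:00',
    'schedules': None,
}
for daily_spec in split_date(week_long_spec):
    print(daily_spec['start_date'], daily_spec['start_time'], daily_spec['end_time'])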
def load_data(input_dir=None):
    """Load data"""
    if input_dir is not None:
        tables = config.get('data.tables')
        entities_table_name = config.get('data.entities_table_name')
        entities_config = some(where(tables, name=entities_table_name))
        X = load_table_from_config(input_dir, entities_config)
        targets_table_name = config.get('data.targets_table_name')
        targets_config = some(where(tables, name=targets_table_name))
        y = load_table_from_config(input_dir, targets_config)
    else:
        root = 'https://mit-dai-ballet.s3.amazonaws.com/census'
        X = pd.read_csv(root + '/train/entities.csv.gz')
        y = pd.read_csv(root + '/train/targets.csv.gz')
    return X, y
def load_data(input_dir=None):
    """Load data"""
    if input_dir is not None:
        tables = conf.get("tables")
        entities_table_name = conf.get("data", "entities_table_name")
        entities_config = some(where(tables, name=entities_table_name))
        X_df = load_table_from_config(input_dir, entities_config)
        targets_table_name = conf.get("data", "targets_table_name")
        targets_config = some(where(tables, name=targets_table_name))
        y_df = load_table_from_config(input_dir, targets_config)
    else:
        source = "https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt"
        df = pd.read_csv(source, sep="\t")
        X_df = df.drop("SalePrice", axis=1)
        y_df = df["SalePrice"]
    return X_df, y_df
def main(data, context):
    submission = Submission.from_bucket_trigger(data, context)

    # Make sure this isn't one of the lead files
    if submission.filename.startswith('lead'):
        return f"Ignoring lead audio file: gcs://{data['bucket']}/{data['name']}"

    # Check required files
    audio = submission.audio_extracted
    if not audio.exists():
        return f"Blob {audio.url} does not exist!"
    reference = F.some(lambda x: x.exists(), submission.audio_reference_candidates())
    if not reference:
        urls = [c.url for c in submission.audio_reference_candidates()]
        return f"Unable to find reference audio! Tried {urls}"

    # Load config
    audio_cfg = submission.song_config()['audio']
    loudnorm_cfg = submission.song_config()['loudnorm']
    corr_cfg = submission.song_config()['correlation']

    with TemporaryDirectory() as tempdir:
        os.chdir(tempdir)
        logging.info("In temp dir: %s", tempdir)

        # Download files
        logging.info("Downloading subject %s", audio.url)
        audio.download(audio.filename)
        logging.info("Downloading reference %s", reference.url)
        reference.download(reference.filename)

        # Process
        analysis, _ = align.cross_correlate(
            reference.filename, audio.filename,
            samplerate=corr_cfg.get('samplerate', ANALYSIS_SAMPLERATE),
            preprocess=corr_cfg.get('preprocess'))
        if audio_cfg['loudnorm']:
            analysis['loudnorm'] = loudnorm_analysis(
                audio.filename, submission.singer_count(), loudnorm_cfg)

        # Update firestore
        logging.info('Saving analysis data to firestore')
        submission.firestore_document().set({'analysis': analysis}, merge=True)

        # Output
        out_file = 'tmp_' + audio.filename
        write_aligned_audio(audio.filename, out_file, analysis, audio_cfg)

        # Upload
        logging.info("Uploading to %s", submission.audio_aligned.url)
        submission.audio_aligned.upload(out_file)
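# A hypothetical local smoke test for main() above (hedged: main is written as a
# GCS-triggered background function taking an (event, context) pair; the bucket and
# object names below are invented and a real run needs GCS/Firestore credentials).
fake_event = {'bucket': 'my-submissions-bucket', 'name': 'songs/take_01/audio.wav'}
main(fake_event, None)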
def load_data(split='train', input_dir=None):
    """Load data

    If input dir is not None, then load whatever dataset appears in
    `input_dir`. Otherwise, load the data split indicated by `split`.
    """
    if input_dir is not None:
        config = load_config()
        tables = config.data.tables
        entities_table_name = config.data.entities_table_name
        entities_config = some(where(tables, name=entities_table_name))
        X = load_table_from_config(input_dir, entities_config)
        targets_table_name = config.data.targets_table_name
        targets_config = some(where(tables, name=targets_table_name))
        y = load_table_from_config(input_dir, targets_config)
        return X, y

    raise NotImplementedError
def with_filters(self, query_string, supported_attributes,
                 ignored=['page', 'page_size', 'order_by', 'order_direction']):
    from urllib.parse import parse_qs

    parsed_query = parse_qs(query_string)
    query = self
    for k, v in parsed_query.items():
        if some(_ == k.decode('utf-8'), ignored):
            continue
        attribute, op = k.decode('utf-8').split(':', 1)
        query = _ops[op](query, supported_attributes[attribute],
                         _value(v[0].decode('utf-8')))
    return query
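# A hedged usage sketch for with_filters: query-string keys are expected to look like
# "<attribute>:<op>" and are matched against supported_attributes, while paging keys
# are skipped. The op names and the ItemQuery/Item classes below are hypothetical;
# the real operators depend on the _ops mapping, which is not shown here.
#
#   query = ItemQuery().with_filters(
#       b'title:eq=foo&year:gte=2000&page=1&page_size=20',
#       {'title': Item.title, 'year': Item.year},
#   )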
def get_disqus_user(preferred_channel_id=None):
    """
    If user is logged in, generate an appropriate Disqus user:
    - Same channel as preferred channel if available (for OP)
    - First channel user has
    - Anon account if user has no channels
    - None if not logged in
    """
    if current_user and current_user.is_authenticated:
        channels = db.session.query(Channel).filter_by(user_id=current_user.id).all()
        preferred_channel = some(lambda x: x.id == preferred_channel_id, channels)
        if preferred_channel:
            return to_disqus_user(preferred_channel)
        elif channels:
            return to_disqus_user(channels[0])
        else:
            user = db.session.query(User).filter_by(id=current_user.id).one()
            return to_disqus_user(user)
def parse_frames(self, nframes=0):
    # read and encode gps block (take only the last frame)
    image, gps = None, None
    for _ in range(nframes + 1):
        image = self.tn.read_until(GPS_MARK)
        gps = self.tn.read_until(GPS_MARK)
        self.tn.read_very_eager()

    # remove ascii-codes and store full image
    image = ANSI_ESCAPE.sub("", image.decode("utf8")).split("\n")
    gps = ANSI_ESCAPE.sub("", gps.decode("utf8")).split("\n")
    self.frame = image + gps

    # extract gps part only
    gps = [GPS_ESCAPE.sub("", line)[:-2] for line in gps[:-1]]

    # parse gps block
    raw = [list(map(ord, line)) for line in gps[3:-3]]

    # get facing direction (and grid offset)
    face = some(lambda ch: ch in ORIENTATIONS, flatten(raw))
    offset = [(line.index(face), y) for y, line in enumerate(raw) if face in line][0]

    # get gps view (and transpose for (x, y) coordinate system)
    view = np.array([[ch in WALLS for ch in line] for line in raw], dtype=int)
    view = view.T

    # parse position and end coordinates
    pos = tuple(parse("pos: {:d} {:d}", gps[-2].strip()))
    end = tuple(parse("end: {:d} {:d}", gps[-1].strip()))

    # update internal map
    dx, dy = offset[0] - pos[0], offset[1] - pos[1]
    for (x, y), val in np.ndenumerate(view):
        coords = tup_to_cmpx((x - dx, y - dy))
        if coords not in self.blacklist:
            self.grid[coords] = MARK_WALL if val else MARK_FREE

    # store current state in complex format
    self.pos = tup_to_cmpx(pos)
    self.end = tup_to_cmpx(end)
    self.face = COMPLEX_MAPPING[face]
def list_unmatched(page, per_page, order_by, order_direction):
    include_ignored = some(_ == 'include_ignored', request.args)

    query = UnmatchedItem.query.join(ParsedMediaItem)

    if order_by is not None and order_direction is not None:
        order_clause = _order_by_builder(order_by, order_direction)
        query = query.order_by(order_clause)

    if not include_ignored:
        query = query.filter(ParsedMediaItem.ignored.isnot(True))

    results = query.paginate(page, per_page, False)

    return jsonify(total_items=results.total,
                   pages=total_pages(results.total, per_page),
                   items=results.items,
                   many=True)
def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g.
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    logger.debug("in import stage: %s" % harvest_object.guid)
    if not harvest_object:
        logger.error('No harvest object received')
        self._save_object_error('No harvest object received')
        return False

    try:
        self._set_config(harvest_object.job.source.config)
        context = {'model': model, 'session': Session, 'user': self.user}

        package_dict = json.loads(harvest_object.content)
        package_dict['id'] = munge_title_to_name(harvest_object.guid)
        package_dict['name'] = package_dict['id']

        # add owner_org
        source_dataset = get_action('package_show')(
            {'ignore_auth': True},
            {'id': harvest_object.source.id})
        owner_org = source_dataset.get('owner_org')
        package_dict['owner_org'] = owner_org

        try:
            prev_dict = iffy(json.loads)(_get_content(
                some(
                    compose(partial(eq, package_dict['id']), attrgetter('guid')),
                    harvest_object.source.jobs[-2].objects)))
            if prev_dict and prev_dict.get('integrity') == package_dict['integrity']:
                logger.info('Package not changed. Skip update')
                return False
        except IndexError:
            logger.debug('Skip integrity check. No previous data.')

        # logger.debug('Create/update package using dict: %s' % package_dict)
        self._create_or_update_package(package_dict, harvest_object, 'package_show')

        Session.commit()
        logger.debug("Finished record")
    except:
        logger.exception('Something went wrong!')
        self._save_object_error('Exception in import stage', harvest_object)
        return False

    return True