Example #1
def load_data(
    input_dir=None,
    split='train',
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load data for a specific split

    If input_dir is not provided, loads X and y for the given split from the
    default location (S3). If input_dir is provided, loads the
    entities/targets tables from their default table names from the given
    directory, ignoring split.

    For feature development, only the train split should be used.
    """
    config = load_config()
    tables = config.data.tables
    entities_table_name = config.data.entities_table_name
    entities_config = some(where(tables, name=entities_table_name))
    targets_table_name = config.data.targets_table_name
    targets_config = some(where(tables, name=targets_table_name))

    if input_dir is None:
        bucket = config.data.s3_bucket
        split_path = config.data.splits.get(split)
        input_dir = f's3://{bucket}/{split_path}'

    X = load_table_from_config(input_dir, entities_config)
    y = load_table_from_config(input_dir, targets_config)

    return X, y
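The some(where(tables, name=...)) idiom above selects the single table config whose name matches. A minimal sketch of that selection step, assuming some and where are funcy's helpers (the table dicts below are made up):

from funcy import some, where

# Hypothetical table configs standing in for config.data.tables.
tables = [
    {'name': 'entities', 'path': 'entities.csv.gz'},
    {'name': 'targets', 'path': 'targets.csv.gz'},
]

# where() keeps only the mappings containing the given key/value pairs;
# some() then returns the first one, or None when nothing matches.
entities_config = some(where(tables, name='entities'))
print(entities_config)                      # {'name': 'entities', ...}
print(some(where(tables, name='missing')))  # None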
Example #2
def load_data(input_dir=None):
    """Load data"""
    if input_dir is not None:
        tables = config.get('data.tables')

        entities_table_name = config.get('data.entities_table_name')
        entities_config = some(where(tables, name=entities_table_name))
        X = load_table_from_config(input_dir, entities_config)

        targets_table_name = config.get('data.targets_table_name')
        targets_config = some(where(tables, name=targets_table_name))
        y = load_table_from_config(input_dir, targets_config)
    else:
        raise NotImplementedError

    return X, y
Example #3
def split_date(spec):
    if spec['is_continuous'] or not spec['end_date']:
        yield spec
        return

    start_date = maybe_parse_date(spec['start_date'])
    end_date = maybe_parse_date(spec['end_date']) or start_date

    days = int((end_date - start_date).total_seconds() / (60 * 60 * 24))
    schedules = spec['schedules'] or [{
        'days_of_week': [0, 1, 2, 3, 4, 5, 6],
        'start_time': spec['start_time'],
        'end_time': spec['end_time'],
    }]

    for day in range(days + 1):
        this_date = start_date + timedelta(days=day)
        schedule = some(
            lambda s: this_date.isoweekday() in s['days_of_week'],
            schedules
        )
        if schedule:
            date_string = datetime.combine(this_date, time(0, 0)).isoformat()
            yield {
                'is_continuous': False,
                'start_date': date_string,
                'end_date': date_string,
                'start_time': schedule['start_time'],
                'end_time': schedule['end_time'],
            }
Example #4
def load_data(input_dir=None):
    """Load data"""
    if input_dir is not None:
        tables = config.get('data.tables')

        entities_table_name = config.get('data.entities_table_name')
        entities_config = some(where(tables, name=entities_table_name))
        X = load_table_from_config(input_dir, entities_config)

        targets_table_name = config.get('data.targets_table_name')
        targets_config = some(where(tables, name=targets_table_name))
        y = load_table_from_config(input_dir, targets_config)
    else:
        root = 'https://mit-dai-ballet.s3.amazonaws.com/census'
        X = pd.read_csv(root + '/train/entities.csv.gz')
        y = pd.read_csv(root + '/train/targets.csv.gz')

    return X, y
Example #5
def load_data(input_dir=None):
    """Load data"""
    if input_dir is not None:
        tables = conf.get("tables")

        entities_table_name = conf.get("data", "entities_table_name")
        entities_config = some(where(tables, name=entities_table_name))
        X_df = load_table_from_config(input_dir, entities_config)

        targets_table_name = conf.get("data", "targets_table_name")
        targets_config = some(where(tables, name=targets_table_name))
        y_df = load_table_from_config(input_dir, targets_config)
    else:
        source = "https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt"
        df = pd.read_csv(source, sep="\t")
        X_df = df.drop("SalePrice", axis=1)
        y_df = df["SalePrice"]

    return X_df, y_df
Example #6
def main(data, context):
    submission = Submission.from_bucket_trigger(data, context)

    # Make sure this isn't one of the lead files
    if submission.filename.startswith('lead'):
        return f"Ignoring lead audio file: gcs://{data['bucket']}{data['name']}"

    # Check required files
    audio = submission.audio_extracted
    if not audio.exists():
        return f"Blob {audio.url} does not exist!"

    reference = F.some(lambda x: x.exists(),
                       submission.audio_reference_candidates())
    if not reference:
        urls = [c.url for c in submission.audio_reference_candidates()]
        return f"Unable to find reference audio! Tried {urls}"

    # Load config
    audio_cfg = submission.song_config()['audio']
    loudnorm_cfg = submission.song_config()['loudnorm']
    corr_cfg = submission.song_config()['correlation']

    with TemporaryDirectory() as tempdir:
        os.chdir(tempdir)
        logging.info("In temp dir: %s", tempdir)

        # Download files
        logging.info("Downloading subject %s", audio.url)
        audio.download(audio.filename)
        logging.info("Downloading reference %s", reference.url)
        reference.download(reference.filename)

        # Process
        analysis, _ = align.cross_correlate(
            reference.filename,
            audio.filename,
            samplerate=corr_cfg.get('samplerate', ANALYSIS_SAMPLERATE),
            preprocess=corr_cfg.get('preprocess'))
        if audio_cfg['loudnorm']:
            analysis['loudnorm'] = loudnorm_analysis(audio.filename,
                                                     submission.singer_count(),
                                                     loudnorm_cfg)

        # Update firestore
        logging.info('Saving analysis data to firestore')
        submission.firestore_document().set({'analysis': analysis}, merge=True)

        # Output
        out_file = 'tmp_' + audio.filename
        write_aligned_audio(audio.filename, out_file, analysis, audio_cfg)

        # Upload
        logging.info("Uploading to %s", submission.audio_aligned.url)
        submission.audio_aligned.upload(out_file)
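The reference lookup in Example #6 uses the predicate form of some: the first candidate blob that exists is returned, and None means every candidate is missing. A small sketch of the same pattern with local paths instead of storage blobs (the file names are made up), assuming F is funcy:

import funcy as F
from pathlib import Path

# Hypothetical candidates standing in for
# submission.audio_reference_candidates().
candidates = [Path('reference_44k.wav'), Path('reference.wav')]

# F.some returns the first path that passes the predicate, or None.
reference = F.some(lambda p: p.exists(), candidates)
if reference is None:
    print('Unable to find reference audio! Tried', [str(p) for p in candidates])
else:
    print('Using reference', reference)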
Example #7
def load_data(split='train', input_dir=None):
    """Load data

    If input dir is not None, then load whatever dataset appears in
    `input_dir`. Otherwise, load the data split indicated by `split`.
    """
    if input_dir is not None:
        config = load_config()
        tables = config.data.tables

        entities_table_name = config.data.entities_table_name
        entities_config = some(where(tables, name=entities_table_name))
        X = load_table_from_config(input_dir, entities_config)

        targets_table_name = config.data.targets_table_name
        targets_config = some(where(tables, name=targets_table_name))
        y = load_table_from_config(input_dir, targets_config)
        return X, y

    raise NotImplementedError
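The docstring promises to load the split indicated by split, but the fallback branch is left as NotImplementedError; Examples #1 and #4 show two ways to fill it in. A hedged sketch of that missing branch in the spirit of Example #4 (the bucket URL and file names are hypothetical):

import pandas as pd

def _load_default_split(split='train'):
    # Hypothetical default location; a real project would point this at
    # its own bucket, as Example #4 does for its fixed train split.
    root = 'https://example-bucket.s3.amazonaws.com/dataset'
    X = pd.read_csv(f'{root}/{split}/entities.csv.gz')
    y = pd.read_csv(f'{root}/{split}/targets.csv.gz')
    return X, y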
Example #8
def with_filters(self,
                 query_string,
                 supported_attributes,
                 ignored=['page', 'page_size', 'order_by', 'order_direction']):
    from urllib.parse import parse_qs
    parsed_query = parse_qs(query_string)
    query = self
    for k, v in parsed_query.items():
        if some(_ == k.decode('utf-8'), ignored):
            continue
        attribute, op = k.decode('utf-8').split(':', 1)
        query = _ops[op](query, supported_attributes[attribute],
                         _value(v[0].decode('utf-8')))
    return query
Example #9
def get_disqus_user(preferred_channel_id=None):
    """ If user is logged in, generate an appropriate Disqus user:
      - Same channel as preferred channel if available (for OP)
      - First channel user has
      - Anon account if user has no channels
      - None if not logged in
    """
    if current_user and current_user.is_authenticated:
        channels = db.session.query(Channel).filter_by(user_id=current_user.id).all()
        preferred_channel = some(lambda x: x.id == preferred_channel_id, channels)
        if preferred_channel:
            return to_disqus_user(preferred_channel)
        elif channels:
            return to_disqus_user(channels[0])
        else:
            user = db.session.query(User).filter_by(id=current_user.id).one()
            return to_disqus_user(user)
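Here some performs the preferred-channel lookup: it returns the first channel whose id matches, or None so the code can fall back to the user's first channel or an anonymous account. A minimal sketch with stand-in objects instead of SQLAlchemy models:

from types import SimpleNamespace
from funcy import some

# Hypothetical channels standing in for the Channel query results.
channels = [SimpleNamespace(id=3), SimpleNamespace(id=7)]

print(some(lambda c: c.id == 7, channels))   # namespace(id=7)
print(some(lambda c: c.id == 99, channels))  # None -> fall back to channels[0]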
Example #10
    def parse_frames(self, nframes=0):
        # read and encode gps block (take only the last frame)
        image, gps = None, None
        for _ in range(nframes + 1):
            image = self.tn.read_until(GPS_MARK)
            gps = self.tn.read_until(GPS_MARK)

        self.tn.read_very_eager()

        # remove ascii-codes and store full image
        image = ANSI_ESCAPE.sub("", image.decode("utf8")).split("\n")
        gps = ANSI_ESCAPE.sub("", gps.decode("utf8")).split("\n")

        self.frame = image + gps

        # extract gps part only
        gps = [GPS_ESCAPE.sub("", line)[:-2] for line in gps[:-1]]

        # parse gps block
        raw = [list(map(ord, line)) for line in gps[3:-3]]

        # get facing direction (and grid offset)
        face = some(lambda ch: ch in ORIENTATIONS, flatten(raw))
        offset = [(line.index(face), y) for y, line in enumerate(raw) if face in line][0]

        # get gps view (and transpose for (x, y) coordinate system)
        view = np.array([[ch in WALLS for ch in line] for line in raw], dtype=int)
        view = view.T

        # parse position and end coordinates
        pos = tuple(parse("pos: {:d} {:d}", gps[-2].strip()))
        end = tuple(parse("end: {:d} {:d}", gps[-1].strip()))

        # update internal map
        dx, dy = offset[0] - pos[0], offset[1] - pos[1]
        for (x, y), val in np.ndenumerate(view):
            coords = tup_to_cmpx((x - dx, y - dy))
            if coords not in self.blacklist:
                self.grid[tup_to_cmpx((x - dx, y - dy))] = MARK_WALL if val else MARK_FREE

        # store current state in complex format
        self.pos = tup_to_cmpx(pos)
        self.end = tup_to_cmpx(end)
        self.face = COMPLEX_MAPPING[face]
Example #11
def list_unmatched(page, per_page, order_by, order_direction):

    include_ignored = some(_ == 'include_ignored', request.args)

    query = UnmatchedItem.query.join(ParsedMediaItem)

    if order_by is not None and order_direction is not None:
        order_clause = _order_by_builder(order_by, order_direction)
        query = query.order_by(order_clause)

    if not include_ignored:
        query = query.filter(ParsedMediaItem.ignored.isnot(True))

    results = query.paginate(page, per_page, False)

    return jsonify(total_items=results.total,
                   pages=total_pages(results.total, per_page),
                   items=results.items,
                   many=True)
Example #12
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        logger.debug("in import stage: %s" % harvest_object.guid)
        if not harvest_object:
            logger.error('No harvest object received')
            self._save_object_error('No harvest object received')
            return False

        try:
            self._set_config(harvest_object.job.source.config)
            context = {'model': model, 'session': Session, 'user': self.user}

            package_dict = json.loads(harvest_object.content)

            package_dict['id'] = munge_title_to_name(harvest_object.guid)
            package_dict['name'] = package_dict['id']

            # add owner_org
            source_dataset = get_action('package_show')(
                {
                    'ignore_auth': True
                }, {
                    'id': harvest_object.source.id
                })
            owner_org = source_dataset.get('owner_org')
            package_dict['owner_org'] = owner_org

            try:
                prev_dict = iffy(json.loads)(_get_content(
                    some(
                        compose(partial(eq, package_dict['id']),
                                attrgetter('guid')),
                        harvest_object.source.jobs[-2].objects)))
                if prev_dict and prev_dict.get(
                        'integrity') == package_dict['integrity']:
                    logger.info('Package not changed. Skip update')
                    return False
            except IndexError:
                logger.debug('Skip integrity check. No previous data.')

            # logger.debug('Create/update package using dict: %s' % package_dict)
            self._create_or_update_package(package_dict, harvest_object,
                                           'package_show')

            Session.commit()

            logger.debug("Finished record")
        except:
            logger.exception('Something went wrong!')
            self._save_object_error('Exception in import stage',
                                    harvest_object)
            return False
        return True
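The integrity check above builds its predicate functionally: compose(partial(eq, package_dict['id']), attrgetter('guid')) tests whether a harvest object's guid equals the current package id, some picks the first such object from the previous job, and iffy(json.loads) only parses the content when there actually is some. A minimal sketch with dummy objects standing in for HarvestObject instances, assuming some, compose and iffy come from funcy:

import json
from functools import partial
from operator import eq, attrgetter
from types import SimpleNamespace
from funcy import some, compose, iffy

# Dummy previous-job objects; guid and content values are made up.
objects = [
    SimpleNamespace(guid='abc', content=json.dumps({'integrity': '111'})),
    SimpleNamespace(guid='xyz', content=json.dumps({'integrity': '222'})),
]

# compose applies attrgetter first, then the equality check,
# so the predicate reads "obj.guid == 'xyz'".
match = some(compose(partial(eq, 'xyz'), attrgetter('guid')), objects)

# iffy(json.loads) parses truthy content and passes None through unchanged.
prev_dict = iffy(json.loads)(match.content if match else None)
print(prev_dict)   # {'integrity': '222'}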