Example #1
# Assumed imports: the base classes and NotFittedError match scikit-learn's;
# Timer, FeatureGenerator and Preprocessor are project-local helpers (not shown).
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError


class Pipeline(BaseEstimator, TransformerMixin):
    """Two-stage pipeline: feature generation followed by preprocessing."""

    def __init__(self,
                 numeric,
                 id=None,
                 target=None,
                 categorical=None,
                 verbose=0):
        self.created_features = None
        self.id = id
        self.target = target
        self.categorical = categorical
        self.numeric = numeric
        self.verbose = verbose

        self.feature_generator = None
        self.preprocessor = None

    def fit_transform(self, df, y=None, **fit_params):
        with Timer('pipelines.Pipeline.fit_transform:', self.verbose):
            self.feature_generator = FeatureGenerator(
                id=self.id,
                numeric=self.numeric,
                categorical=self.categorical,
                target=self.target,
                verbose=self.verbose,
            )
            df_features = self.feature_generator.fit_transform(df)

            self.preprocessor = Preprocessor(
                id=self.id,
                numeric=self.numeric,
                categorical=self.categorical,
                target=self.target,
                verbose=self.verbose,
            )
            x = self.preprocessor.fit_transform(df_features)
            return x

    def transform(self, df):
        with Timer('pipelines.Pipeline.transform:', self.verbose):
            if self.feature_generator is None:
                raise NotFittedError(
                    f'feature_generator = {self.feature_generator}')
            if self.preprocessor is None:
                raise NotFittedError(f'preprocessor = {self.preprocessor}')

            df_features = self.feature_generator.transform(df)
            x = self.preprocessor.transform(df_features)
            return x

    def fit(self, x, y=None, **fit_params):
        # Fitting happens in fit_transform; fit on its own is a no-op.
        return self

    def get_feature_names(self):
        # created_features is never populated in this snippet, so this
        # returns None unless it is set elsewhere.
        return self.created_features
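
A minimal usage sketch for the class above. The DataFrame and column names are hypothetical, and FeatureGenerator/Preprocessor are project-local classes not shown here:

import pandas as pd

df = pd.DataFrame({
    'row_id': [1, 2, 3],
    'price': [9.99, 12.50, 7.25],
    'color': ['red', 'blue', 'red'],
    'sold': [0, 1, 0],
})

pipe = Pipeline(numeric=['price'], id='row_id',
                categorical=['color'], target='sold')
x_train = pipe.fit_transform(df)  # fits FeatureGenerator and Preprocessor
x_new = pipe.transform(df)        # reuses the fitted components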
Example #2
def custom_scorer(Y_true, Y_pred, **kwargs):
    """Euclidean distance between the feature encodings of the true and
    predicted (note, duration) pairs; lower is better."""
    note_true, dur_true = TeacherGenerator.y_to_note_dur(
        Y_true.squeeze(), sampler=TeacherGenerator.take_argmax)
    note_pred, dur_pred = TeacherGenerator.y_to_note_dur(
        Y_pred.squeeze(), sampler=TeacherGenerator.take_argmax)

    feat_true = FeatureGenerator.construct_single_feature(note_true, dur_true)
    feat_pred = FeatureGenerator.construct_single_feature(note_pred, dur_pred)
    return np.sqrt(np.sum(np.square(feat_true-feat_pred)))
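
For context, the scorer above computes the Euclidean (L2) distance between the two feature vectors; with plain NumPy arrays the same value can be obtained more directly:

import numpy as np

dist = np.linalg.norm(feat_true - feat_pred)  # sqrt(sum((a - b) ** 2))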
Example #4
def make_validation_datasets():
    # Build three validation sets with increasingly rich features:
    # squares/flags, then + attacked squares, then + pins.
    feature_generator = FeatureGenerator()
    with open("datasets/positions_chunks.json00") as data:
        X, yraw, rnds = make_dataset(data, feature_generator)
    joblib.dump((X, yraw, rnds),
                "datasets/positions_1_squares_flags.joblib",
                compress=2)

    feature_generator = FeatureGenerator(attacked_squares=True)
    with open("datasets/positions_chunks.json00") as data:
        X, yraw, rnds = make_dataset(data, feature_generator)
    joblib.dump((X, yraw, rnds),
                "datasets/positions_1_squares_flags_attackers.joblib",
                compress=2)

    feature_generator = FeatureGenerator(attacked_squares=True, pins=True)
    with open("datasets/positions_chunks.json00") as data:
        X, yraw, rnds = make_dataset(data, feature_generator)
    joblib.dump((X, yraw, rnds),
                "datasets/positions_1_squares_flags_attackers_pins.joblib",
                compress=2)
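
The three blocks above differ only in the FeatureGenerator options and the output path; a behavior-equivalent loop, assuming the same make_dataset helper:

configs = [
    ({}, 'datasets/positions_1_squares_flags.joblib'),
    ({'attacked_squares': True},
     'datasets/positions_1_squares_flags_attackers.joblib'),
    ({'attacked_squares': True, 'pins': True},
     'datasets/positions_1_squares_flags_attackers_pins.joblib'),
]
for kwargs, out_path in configs:
    feature_generator = FeatureGenerator(**kwargs)
    with open('datasets/positions_chunks.json00') as data:
        X, yraw, rnds = make_dataset(data, feature_generator)
    joblib.dump((X, yraw, rnds), out_path, compress=2)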
Example #5
def make_inferences(lr, X, dur_predict, sampler):
    # Autoregressive generation: keep predicting (note, duration) pairs
    # until the produced notes cover the requested duration.
    inferences = []
    while get_inferenced_time(inferences) < dur_predict:
        Y = lr.predict(X.reshape(1, -1)).squeeze()
        inform_output(Y, inferences)
        inference = Inference(
            TeacherGenerator.y_to_note_dur(Y, sampler=sampler))
        inferences.append(inference)
        # Slide the input window: drop the oldest feature values and append
        # the features of the note just generated.
        X = np.hstack(
            (X[6:, ...],
             FeatureGenerator.construct_single_feature(inference.note,
                                                       inference.duration)))
    # out = np.array([])
    # for inf in inferences:
    #     out = np.append(out, np.repeat(inf.note, inf.duration))
    return inferences
Example #6
def make_inferences(lr, X, dur_predict, sampler):
    # Variant of Example #5: after four warm-up inferences, the note
    # probabilities are reweighted using the three preceding notes.
    inferences = []

    while get_inferenced_time(inferences) < dur_predict:
        Y = lr.predict(X.reshape(1, -1))

        if len(inferences) < 4:
            # Warm-up: not enough history yet, sample the raw model output.
            inference = Inference(
                TeacherGenerator.y_to_note_dur(
                    Y.squeeze(), sampler=sampler))
        else:
            # Collect the three preceding notes, shifted to index space.
            prev_notes = []
            min_note = TeacherGenerator._min_note
            for i in range(3):
                prev_notes.append(inferences[-4 + i].note - min_note + 1)

            # Split the output into note probabilities p and the remaining
            # 19 duration slots, which are reset to ones.
            P = Y.squeeze()
            p = P[:-19]
            d = np.ones(len(P) - len(p))

            # Voice ranges: voice 0 = (54, 76), voice 1 = (45, 71),
            # voice 2 = (40, 62), voice 3 = (28, 54).
            if prev_notes[2] > 0:
                # Octave shift per voice, identified by its lowest note.
                if min_note == 28:    # voice 3
                    octave = 0
                elif min_note == 40:  # voice 2
                    octave = 12
                elif min_note == 45:  # voice 1
                    octave = 12
                else:                 # voice 0
                    octave = 12 * 2

                # Bias the note probabilities with a Gaussian bump: the
                # hard-coded pitches appear to encode short melodic patterns
                # relative to the preceding notes; otherwise the bump is
                # centered just above the last note.
                if prev_notes[2] == 38 + octave - min_note:
                    pdf = norm.pdf(np.arange(1, len(p) + 1),
                                   loc=37 + octave - min_note, scale=1)
                elif (prev_notes[2] == 37 + octave - min_note
                      and prev_notes[1] == 38 + octave - min_note):
                    pdf = norm.pdf(np.arange(1, len(p) + 1),
                                   loc=40 + octave - min_note, scale=1)
                elif (prev_notes[2] == 40 + octave - min_note
                      and prev_notes[1] == 37 + octave - min_note
                      and prev_notes[0] == 38 + octave - min_note):
                    pdf = norm.pdf(np.arange(1, len(p) + 1),
                                   loc=39 + octave - min_note, scale=1)
                else:
                    pdf = norm.pdf(np.arange(1, len(p) + 1),
                                   loc=prev_notes[2] + 1, scale=1)

                p = p * (pdf * 100)
                p[prev_notes[2]] = 0  # forbid repeating the previous note

            P = np.concatenate((p, d))

            inference = Inference(
                TeacherGenerator.y_to_note_dur(
                    P, prev_notes, TeacherGenerator._min_note, sampler=sampler))

        inferences.append(inference)
        X = np.hstack((
            X[6:, ...],
            FeatureGenerator.construct_single_feature(
                    inference.note, inference.duration
            )
        ))
    # out = np.array([])
    # for inf in inferences:
    #     out = np.append(out, np.repeat(inf.note, inf.duration))
    return inferences
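
The reweighting above multiplies the model's note probabilities by a Gaussian bump centered on a target pitch index, then zeroes out the previous note. A minimal standalone illustration (the array size and target index are hypothetical):

import numpy as np
from scipy.stats import norm

p = np.full(30, 1.0 / 30)                # uniform note probabilities
pdf = norm.pdf(np.arange(1, 31), loc=12, scale=1)
p = p * (pdf * 100)                      # boost notes near pitch index 12
p[11] = 0                                # forbid repeating the previous note
p = p / p.sum()                          # renormalize if a distribution is needed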
Example #7
import os
from glob import glob

import numpy as np
from tqdm import tqdm

# Note: tensorflow.contrib was removed in TensorFlow 2.x, so this snippet
# requires TensorFlow 1.x.
from tensorflow.contrib.learn.python.learn.learn_io.generator_io import generator_input_fn
from tensorflow.contrib.training import HParams
from tensorflow.contrib.learn import RunConfig

from lib import (create_estimator, model_dir, POSSIBLE_LABELS, params,
                 id2name, FINGERPRINT_KEY, getMfcc, getTransformedAudioLocal)
from features import FeatureGenerator

featureGenerator = FeatureGenerator(params)

TEST_BATCH_SIZE = 64
TEST_DATA_PATHS = glob('../../data/test/audio/*wav')

def test_data_generator():
    for path in TEST_DATA_PATHS:
        fname = os.path.basename(path)
        result = dict(fname=np.string_(fname))
        audio_options = dict(
            fname=path,
            desired_samples=16000,
            fg_vol=1,
            bg_data=[],
            bg_vol=0,
            clip_min=-1.0,
            clip_max=1.0,
            time_shift_samples=0,
Example #8
    out = {sampler: [None for _ in range(no_top)] for sampler in samplers}
    all_voice_inferences = {
        sampler: [[] for _ in range(no_top)] for sampler in samplers}

    log = np.array(
        ['voice', 'experiment', 'alpha', 'window size',
         'mean score']).reshape(1, -1)

    for voice in voices:
        print('\n-------- VOICE %s --------' % voice)

        # Transform data to input and teacher matrices
        notes, durations = transform.encode_duration(raw_input, voice)
        features = FeatureGenerator.construct_features(notes, durations)

        # X, indices = transform.windowed(features, window_size=windows[0])
        # Y = TeacherGenerator.construct_teacher(notes, durations, indices)

        # Train a ridge regression model
        # lr = obtain_optimal_model(X[:-1, ...], Y, alphas)
        top, nlog = obtain_optimal_model(features, notes, durations, alphas,
                                         windows, log, voice)

        log = nlog

        for (idx, model) in enumerate(top):
            no = len(top) - idx
            lr = model[3]
            X = model[4]
Example #9
def create_combined_review_data_set(review_file_name):
    """Build a DataSet of numeric feature vectors (X) and star-rating
    labels (y) from a JSON file of Yelp reviews."""
    data = load_json(review_file_name)
    X = []
    y = []

    # Pre-process all features
    fg = FeatureGenerator(data)

    for idx, datum in enumerate(data):
        # Our labels are the star counts
        y.append(int(datum['stars']))

        business_id = datum['business_id']

        # Our features are everything else we can get our hands on
        feature_vector = []
        # Add as many features as we can think of
        # Features must be NUMERIC -- ints or floats!
        # DO NOT INCLUDE THE 'STARS' CATEGORY
        feature_vector.append(int(datum['votes']['cool']))
        feature_vector.append(int(datum['votes']['funny']))
        feature_vector.append(int(datum['votes']['useful']))

        # TextBlob processing
        blob = fg.get_blob(idx)

        # Lower-cased, singularized tokens (currently unused; see the TODO).
        words = blob.words.lower().singularize()

        # TODO: add features of selected word counts
        #       need to do some processing to figure out which words matter
        #
        #       feature generation is separated into its own class because
        #       some features may need to reference the context of the
        #       entire dataset before a datum's features can be determined
        #       (e.g. counts of words that are most widespread across all data)
        #
        #       feature generation can potentially look up yelp user accounts
        #       which should be done through the FeatureGenerator class

        # Add feature vector to list of feature vectors
        feature_vector.append(fg.generate_subjectivity(idx))
        feature_vector.append(fg.generate_polarity(idx))
        feature_vector.append(fg.generate_length(idx))
        feature_vector.append(fg.generate_num_sentences(idx))
        feature_vector.append(fg.generate_avg_sentence_len(idx))
        feature_vector.append(fg.generate_count_exclamation(idx))
        feature_vector.append(fg.generate_punctuation_to_sentence_ratio(idx))
        feature_vector.append(fg.generate_number_of_all_cap_words(idx))
        feature_vector.append(fg.generate_similarity_between_words(idx, 1))
        feature_vector.append(fg.generate_similarity_between_words(idx, 2))
        feature_vector.append(fg.generate_similarity_between_words(idx, 3))
        feature_vector.append(fg.generate_similarity_between_words(idx, 4))
        feature_vector.append(fg.generate_similarity_between_words(idx, 5))
        feature_vector.append(fg.generate_average_stars_cluster(idx, business_id))
        feature_vector.append(fg.generate_num_businesses_in_area(idx, business_id))

        feature_vector.append(
            fg.generate_number_of_tips(
                idx, datum['user_id'], business_id)
        )
        feature_vector.append(
            fg.generate_business_latitude(
                idx, business_id)
        )
        feature_vector.append(
            fg.generate_business_longitude(
                idx, business_id)
        )

        X.append(feature_vector)
    return DataSet(X, y)
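
A usage sketch, assuming DataSet exposes the feature matrix as X and the labels as y; the file name and model choice are illustrative:

from sklearn.linear_model import LogisticRegression

ds = create_combined_review_data_set('reviews.json')
clf = LogisticRegression(max_iter=1000).fit(ds.X, ds.y)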
Example #10
def getBgVol(background_frequency, background_volume_range):
    """With probability background_frequency, return a random background
    volume in [0, background_volume_range); otherwise return 0."""
    if np.random.uniform(0, 1) < background_frequency:
        return np.random.uniform(0, background_volume_range)
    else:
        return 0


# print('bg vol:', getBgVol(0.5, 0.5))

##=========================================================
## Actual computations start here
##=========================================================
featureGenerator = FeatureGenerator(params, getBgFileNames(DATADIR))

train_meta_list, val_meta_list = get_metadata_lists(DATADIR)
augmentWithSilence(train_meta_list, SILENCE_PCT)
augmentWithSilence(val_meta_list, SILENCE_PCT)
print('Augmented sizes: Train: {}. Val: {}'.format(len(train_meta_list),
                                                   len(val_meta_list)))
train_input_fn = generator_input_fn(
    x=data_generator_fn(train_meta_list, BG_PARAMS, 'train'),
    target_key=TARGET_KEY,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_epochs=None,
    queue_capacity=3 * BATCH_SIZE + 10,
    num_threads=1,
)
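
Downstream, train_input_fn would typically be passed to the tf.contrib.learn estimator's fit; a sketch, where create_estimator is the project-local factory from Example #7 and its exact signature is an assumption:

estimator = create_estimator(params)  # hypothetical signature
estimator.fit(input_fn=train_input_fn, steps=10000)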