Code example #1
0
class ProcessGames(Task):
    """
    Preprocessor that scans the rosters table and enumerates every unique
    (year, team) combination found in the data.
    """
    data = Complex()
    row_data = List()
    speaker_code_dict = Dict()
    speaker_codes = List()
    vectorizer = Complex()

    data_format = SportsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process sports events."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Delegates to predict and stores the result.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        Reads the rosters table from the project database, walks each unique
        (year, team) pair, and returns the input data unchanged.
        """
        # Close the connection when done instead of leaking it; the unused
        # cursor the original created has been dropped.
        con = sqlite3.connect(settings.DB_PATH)
        try:
            rosters = sql.read_frame("select * from rosters", con)
        finally:
            con.close()

        # Collect unique (year, team) pairs in first-seen order.  A set gives
        # O(1) membership tests instead of the original O(n) list scan.
        tys = []
        seen = set()
        for i in xrange(0, rosters.shape[0]):
            year = rosters.iloc[i]['year']
            team = rosters.iloc[i]['team']
            key = (year, team)
            if key not in seen:
                seen.add(key)
                tys.append([year, team])

        for ty in tys:
            year, team = ty
            ros = rosters[((rosters['year'] == year) &
                           (rosters['team'] == team))]
            # NOTE(review): players is computed but never used or returned —
            # looks like unfinished feature extraction; confirm intent.
            players = list(ros['id'])

        return data
Code example #2
0
File: tasks.py — Project: jzeyl/evolve-music
class RandomForestTrain(Train):
    """
    A class to train a random forest
    """
    # Column names of the training frame (populated during training).
    colnames = List()
    # The fitted classifier instance.
    clf = Complex()
    category = RegistryCategories.algorithms
    namespace = get_namespace(__module__)
    # Underlying scikit-learn estimator class used for train/predict.
    algorithm = RandomForestClassifier
    # Default hyperparameters forwarded to the estimator.
    # NOTE(review): 'compute_importances' was deprecated and later removed
    # from scikit-learn (importances are always available via
    # feature_importances_) — confirm the pinned sklearn version accepts it.
    args = {'n_estimators' : 300, 'min_samples_leaf' : 4, 'compute_importances' : True}

    help_text = "Train and predict with Random Forest."
Code example #3
0
class Normalize(Task):
    """
    Normalize values by reducing standard deviation to 1 and mean to 0
    """
    # Per-column means and standard deviations learned during train().
    column_means = List()
    column_stdevs = List()
    category = RegistryCategories.preprocessors
    tester = NormalizationTester
    test_cases = [{'stream' : os.path.abspath(os.path.join(settings.PACKAGE_PATH,'tests/data/csv/1/data.csv')), 'dataformat' : DataFormats.csv}]
    data = Complex()

    help_text = "Example class to normalize input values."

    def train(self, data, **kwargs):
        """
        Calculate the standard deviations and means in the training data
        """
        self.data = data
        for i in xrange(0,data.shape[1]):
            column_mean = np.mean(data.icol(i))
            column_stdev = np.std(data.icol(i))

            #Have to do += or "list" type will fail (ie with append)
            self.column_means += [column_mean]
            self.column_stdevs += [column_stdev]

        self.data = self.predict(data)

    def predict(self, test_data, **kwargs):
        """
        Adjust new input by the values in the training data

        Subtracts the stored column means and divides by the stored column
        standard deviations.  Raises if the column count differs from the
        training data.
        """
        if test_data.shape[1]!=self.data.shape[1]:
            raise Exception("Test data has different number of columns than training data.")
        for i in xrange(0,test_data.shape[1]):
            test_data.loc[:,i] = test_data.icol(i) - self.column_means[i]
            # BUG FIX: the original guard int(stdev) != 0 truncated the stdev
            # to an integer, so any column with 0 < stdev < 1 was silently
            # never scaled.  Only a truly constant column (stdev == 0) should
            # skip the division.
            if self.column_stdevs[i] != 0:
                test_data.loc[:,i] = test_data.icol(i) / self.column_stdevs[i]
        return test_data
Code example #4
0
class Validate(Task):
    """
    Cross-validate a wins-prediction model on team/season data and record
    per-fold results, mean absolute error, and feature importances.
    """
    data = Complex()
    results = Complex()
    error = Float()
    importances = Complex()
    importance = Complex()
    column_names = List()

    data_format = DataFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Validate."

    def cross_validate(self, data, non_predictors, **kwargs):
        """
        Run n-fold cross validation.

        Returns (results, folds) where results is a list of per-fold
        prediction arrays and folds is a list of held-out row-index lists.
        """
        nfolds = kwargs.get('nfolds', 3)
        algo = kwargs.get('algo')
        seed = kwargs.get('seed', 1)
        data_len = data.shape[0]
        counter = 0
        fold_length = int(math.floor(data_len/nfolds))
        folds = []
        data_seq = list(xrange(0,data_len))
        random.seed(seed)
        random.shuffle(data_seq)

        # Partition the shuffled indices into nfolds contiguous slices; the
        # final fold absorbs any remainder rows.
        for fold in xrange(0, nfolds):
            start = counter

            end = counter + fold_length
            if fold == (nfolds-1):
                end = data_len
            folds.append(data_seq[start:end])
            counter += fold_length

        results = []
        data.index = range(data.shape[0])
        self.importances = []
        for (i,fold) in enumerate(folds):
            # Hold out the current fold; train on all the others.
            predict_data = data.iloc[fold,:]
            out_indices = list(chain.from_iterable(folds[:i] + folds[(i + 1):]))
            train_data = data.iloc[out_indices,:]
            alg = algo()
            target = train_data['next_year_wins']
            train_data = train_data[[l for l in list(train_data.columns) if l not in non_predictors]]
            predict_data = predict_data[[l for l in list(predict_data.columns) if l not in non_predictors]]
            clf = alg.train(train_data,target,**algo.args)
            results.append(alg.predict(predict_data))
            self.importances.append(clf.feature_importances_)
        return results, folds

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        # Exclude team-name dummy columns and the target from the predictors.
        non_predictors = [i.replace(" ", "_").lower() for i in list(set(data['team']))] + ["team", "next_year_wins"]
        self.column_names = [l for l in list(data.columns) if l not in non_predictors]
        results, folds = self.cross_validate(data, non_predictors, **kwargs)
        self.gather_results(results, folds, data)

    def gather_results(self, results, folds, data):
        """
        Reassemble the per-fold predictions into a single frame aligned with
        the original rows, then compute error and importances.
        """
        full_results = list(chain.from_iterable(results))
        full_indices = list(chain.from_iterable(folds))
        partial_result_df = make_df([full_results, full_indices], ["result", "index"])
        partial_result_df = partial_result_df.sort(["index"])
        partial_result_df.index = range(partial_result_df.shape[0])
        result_df = pd.concat([partial_result_df, data[['next_year_wins', 'team', 'year', 'total_wins']]], axis=1)
        # BUG FIX: '&' binds tighter than '>', so the original
        # (a > 0) & b > 0 evaluated as ((a > 0) & b) > 0.  Parenthesize both
        # comparisons so each condition is applied as intended.
        result_df = result_df[(result_df['next_year_wins'] > 0) & (result_df['total_wins'] > 0)]
        self.results = result_df
        self.calc_error(result_df)
        self.calc_importance(self.importances, self.column_names)

    def calc_error(self, result_df):
        """
        Mean absolute error over all but the most recent year (whose
        next-year wins are not yet observable).
        """
        filtered_df = result_df[result_df['year']<np.max(result_df['year'])]
        self.error = np.mean(np.abs(filtered_df['result'] - filtered_df['next_year_wins']))

    def calc_importance(self, importances, col_names):
        """
        Average the per-fold feature importances and sort them in place.
        """
        importance_frame = pd.DataFrame(importances)
        importance_frame.columns = col_names
        self.importance = importance_frame.mean(axis=0)
        self.importance.sort(0)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """

        pass
Code example #5
0
class FeatureExtractor(Task):
    """
    Build a training frame from Simpsons scripts: bag-of-words features for
    the previous/current/next lines plus speaker-code meta features.
    """
    data = Complex()
    row_data = List()
    speaker_code_dict = Dict()
    speaker_codes = List()
    vectorizer = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    args = {
        'scriptfile':
        os.path.abspath(os.path.join(settings.DATA_PATH, "script_tasks"))
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override

        Loads the pickled script data, builds per-line context rows
        (previous/current/next line and speaker history), vectorizes the
        text, and returns a dict bundling the vectorizer, code dict,
        training frame, and speaker frame.
        """
        scriptfile = kwargs.get('scriptfile')
        # BUG FIX: close the pickle file instead of leaking the handle.
        with open(scriptfile) as pf:
            script_data = pickle.load(pf)
        script = script_data.tasks[2].voice_lines.value
        speakers = []
        lines = []
        for s in script:
            for (i, l) in enumerate(s):
                # Context window: previous line/speaker, speaker two lines
                # back, and the next line (empty strings at the boundaries).
                if i > 0:
                    previous_line = s[i - 1]['line']
                    previous_speaker = s[i - 1]['speaker']
                else:
                    previous_line = ""
                    previous_speaker = ""

                if i > 1:
                    two_back_speaker = s[i - 2]['speaker']
                else:
                    two_back_speaker = ""

                if len(s) > i + 1:
                    next_line = s[i + 1]['line']
                else:
                    next_line = ""
                current_line = s[i]['line']
                current_speaker = s[i]['speaker']
                lines.append(current_line)
                speakers.append(current_speaker)
                row_data = {
                    'previous_line': previous_line,
                    'previous_speaker': previous_speaker,
                    'next_line': next_line,
                    'current_line': current_line,
                    'current_speaker': current_speaker,
                    'two_back_speaker': two_back_speaker
                }
                self.row_data.append(row_data)
        self.speaker_code_dict = {
            k: i
            for (i, k) in enumerate(list(set(speakers)))
        }
        self.speaker_codes = [self.speaker_code_dict[s] for s in speakers]
        # NOTE(review): this floors MAX_FEATURES first and then divides,
        # producing a float; math.floor(MAX_FEATURES / 3) was probably
        # intended — confirm before changing.
        self.max_features = math.floor(MAX_FEATURES) / 3
        self.vectorizer = Vectorizer()
        self.vectorizer.fit(lines, self.speaker_codes, self.max_features)
        prev_features = self.vectorizer.batch_get_features(
            [rd['previous_line'] for rd in self.row_data])
        cur_features = self.vectorizer.batch_get_features(
            [rd['current_line'] for rd in self.row_data])
        next_features = self.vectorizer.batch_get_features(
            [rd['next_line'] for rd in self.row_data])

        # Map the empty "no speaker" sentinel to -1 for the meta features.
        self.speaker_code_dict.update({'': -1})
        meta_features = make_df(
            [[
                self.speaker_code_dict[s['two_back_speaker']]
                for s in self.row_data
            ],
             [
                 self.speaker_code_dict[s['previous_speaker']]
                 for s in self.row_data
             ], self.speaker_codes],
            ["two_back_speaker", "previous_speaker", "current_speaker"])
        train_frame = pd.concat([
            pd.DataFrame(prev_features),
            pd.DataFrame(cur_features),
            pd.DataFrame(next_features), meta_features
        ],
                                axis=1)
        train_frame.index = range(train_frame.shape[0])
        # Note: 'data' below refers to the original input argument at the
        # moment the dict literal is evaluated, then the name is rebound.
        data = {
            'vectorizer':
            self.vectorizer,
            'speaker_code_dict':
            self.speaker_code_dict,
            'train_frame':
            train_frame,
            'speakers':
            make_df([speakers, self.speaker_codes, lines],
                    ["speaker", "speaker_code", "line"]),
            'data':
            data,
            'current_features':
            cur_features,
        }
        return data
Code example #6
0
class ClusterScriptText(Task):
    """
    Cluster speakers by the text of their lines (KMeans over bag-of-words
    features) and save 2-D PCA projections of the clusters to disk.
    """
    data = Complex()
    predictions = Complex()
    # FIX: the original declared `clusters` twice (Complex(), then List());
    # only the later List() binding took effect, so the shadowed duplicate
    # declaration has been removed.
    clusters = List()
    cl = Complex()
    vec = Complex()
    vec1 = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cluster simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = data
        self.predict(self.data)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override

        Reformats the scripts, concatenates each speaker's lines, clusters
        the speakers with KMeans, and writes 'clusters.png' and
        'all_speakers.png' scatter plots.
        """

        from train import Vectorizer, make_df

        self.vec = Vectorizer()

        reformatter = ReformatScriptText()
        args = reformatter.args
        args['do_replace'] = False
        reformatter.train(data, "", **args)

        script_segments = list(chain.from_iterable(reformatter.voice_lines))
        text = [s['line'] for s in script_segments]
        speaker = [s['speaker'] for s in script_segments]
        unique_speakers = list(set(speaker))
        speaker_code_dict = {k:i for (i,k) in enumerate(unique_speakers)}
        speaker_codes = [speaker_code_dict[k] for k in unique_speakers]
        speaker_list = []
        speaker_frame = make_df([text,speaker],["text","speaker"])
        # One document per speaker: all of their lines joined together.
        for i in unique_speakers:
            s_text = "\n".join(list(speaker_frame[speaker_frame['speaker']==i]['text']))
            speaker_list.append(s_text)

        self.vec.fit(speaker_list, speaker_codes, 200,min_features=2)
        features = self.vec.batch_get_features(speaker_list)

        cl = KMeans()
        self.predictions = cl.fit_predict(features)
        self.cl = cl

        # NOTE(review): xrange(0, max(predictions)) skips the highest-numbered
        # cluster; range(max+1) may have been intended.  The plotting loop
        # below indexes self.clusters the same way, so confirm both together
        # before changing.
        for i in xrange(0,max(self.predictions)):
            clust = []
            for c in xrange(0,len(speaker_codes)):
                if self.predictions[c]==i:
                    clust.append(unique_speakers[c])
            self.clusters.append(clust)

        # Project the per-speaker features to 2-D and plot each cluster,
        # labelled by its first member.
        pca = PCA(n_components=2, whiten=True).fit(features)
        rf = pca.transform(features)
        labels = cl.labels_
        pyplot.clf()
        centroids = cl.cluster_centers_
        pyplot.cla()
        for i in range(max(labels)):
            ds = rf[np.where(labels==i)]
            pyplot.plot(ds[:,0],ds[:,1],'o', label=self.clusters[i][0])
        pyplot.legend(loc=8)
        pyplot.savefig('clusters.png')

        # Second pass: one point per line rather than per speaker.
        self.vec1 = Vectorizer()
        speaker_codes = [speaker_code_dict[k] for k in speaker]

        self.vec1.fit(text, speaker_codes, 200,min_features=2)
        features = self.vec1.batch_get_features(text)

        pca = PCA(n_components=2, whiten=True).fit(features)
        rf = pca.transform(features)
        pyplot.clf()
        pyplot.cla()
        for i in range(len(speaker_codes)):
            pyplot.plot(rf[i,0],rf[i,1],'o', label=speaker[i])
        pyplot.savefig('all_speakers.png')
Code example #7
0
class CrossValidate(Task):
    """
    Cross validate simpsons label data: n-fold CV with a configurable
    algorithm, recording results, error, and feature importances.
    """
    data = Complex()
    results = Complex()
    error = Float()
    importances = Complex()
    importance = Complex()
    column_names = List()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    args = {
        'nfolds': 3,
        'algo': RandomForestTrain,
        'target_name': 'label_code',
        'non_predictors': ["label", "line", "label_code"]
    }

    help_text = "Cross validate simpsons data."

    def cross_validate(self, data, **kwargs):
        """
        Run n-fold cross validation.

        Returns (results, folds) where results is a list of per-fold
        prediction arrays and folds is a list of held-out row-index lists.
        """
        nfolds = kwargs.get('nfolds', 3)
        algo = kwargs.get('algo')
        seed = kwargs.get('seed', 1)
        self.target_name = kwargs.get('target_name')
        non_predictors = kwargs.get('non_predictors')

        self.column_names = [
            l for l in list(data.columns) if l not in non_predictors
        ]
        data_len = data.shape[0]
        counter = 0
        fold_length = int(math.floor(data_len / nfolds))
        folds = []
        data_seq = list(xrange(0, data_len))
        random.seed(seed)
        random.shuffle(data_seq)

        # Partition the shuffled indices into nfolds contiguous slices; the
        # final fold absorbs any remainder rows.
        for fold in xrange(0, nfolds):
            start = counter

            end = counter + fold_length
            if fold == (nfolds - 1):
                end = data_len
            folds.append(data_seq[start:end])
            counter += fold_length

        results = []
        data.index = range(data.shape[0])
        self.importances = []
        for (i, fold) in enumerate(folds):
            # Hold out the current fold; train on all the others.
            predict_data = data.iloc[fold, :]
            out_indices = list(chain.from_iterable(folds[:i] +
                                                   folds[(i + 1):]))
            train_data = data.iloc[out_indices, :]
            alg = algo()
            target = train_data[self.target_name]
            train_data = train_data[[
                l for l in list(train_data.columns) if l not in non_predictors
            ]]
            predict_data = predict_data[[
                l for l in list(predict_data.columns)
                if l not in non_predictors
            ]]
            clf = alg.train(train_data, target, **algo.args)
            results.append(alg.predict(predict_data))
            self.importances.append(clf.feature_importances_)
        return results, folds

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.target_name = kwargs.get('target_name')
        results, folds = self.cross_validate(data, **kwargs)
        self.gather_results(results, folds, data)

    def gather_results(self, results, folds, data):
        """
        Reassemble the per-fold predictions into a single frame aligned with
        the original rows, then compute error and importances.
        """
        full_results = list(chain.from_iterable(results))
        full_indices = list(chain.from_iterable(folds))
        partial_result_df = make_df([full_results, full_indices],
                                    ["result", "index"])
        partial_result_df = partial_result_df.sort(["index"])
        partial_result_df.index = range(partial_result_df.shape[0])
        result_df = pd.concat([partial_result_df, data], axis=1)
        self.results = result_df
        # CONSISTENCY FIX: calc_error was defined (and the `error` field
        # declared) but never invoked; the sibling Validate task calls it at
        # this point, so do the same here to populate self.error.
        self.calc_error(result_df)
        self.calc_importance(self.importances, self.column_names)

    def calc_error(self, result_df):
        """
        Mean absolute error between predictions and the target column.
        """
        self.error = np.mean(
            np.abs(result_df['result'] - result_df[self.target_name]))

    def calc_importance(self, importances, col_names):
        """
        Average the per-fold feature importances and sort them in place.
        """
        importance_frame = pd.DataFrame(importances)
        importance_frame.columns = col_names
        self.importance = importance_frame.mean(axis=0)
        self.importance.sort(0)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """
        pass
Code example #8
0
class LoadAudioFiles(Task):
    """
    Load .ogg episode audio, align clips with subtitle rows, extract audio
    features in a worker pool, and train/predict speaker labels.
    """
    data = Complex()
    all_files = List()
    seq = Complex()
    res = Complex()
    label_codes = Dict()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    args = {
        'audio_dir': settings.AUDIO_DIR,
        'timeout': 600,
        'only_labelled_lines': settings.ONLY_LABELLED_LINES,
        'processed_files_limit': settings.PROCESSED_FILES_LIMIT
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def extract_season(self, name):
        """
        Parse (season, episode) out of a filename.

        Supports the "[SxE]" / "[S.E]" and "S01E02" naming conventions;
        returns (None, None) when neither matches.
        """
        # Raw strings so the regex backslashes are explicit; the patterns are
        # byte-identical to the originals at runtime.
        match1 = re.search(r'\[(\d+)[x\.](\d+)\]', name)
        if match1 is not None:
            season = match1.group(1)
            episode = match1.group(2)
            return int(season), int(episode)

        match2 = re.search(r'S(\d+)E(\d+)', name)
        if match2 is not None:
            season = match2.group(1)
            episode = match2.group(2)
            return int(season), int(episode)

        return None, None

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override

        Walks audio_dir for .ogg files, slices each episode's audio by its
        subtitle timings, extracts features in a process pool, runs
        sequential validation, and returns the feature frame with predicted
        labels attached.
        """
        p = Pool(4, maxtasksperchild=50)
        audio_dir = kwargs['audio_dir']
        timeout = kwargs['timeout']
        oll = kwargs['only_labelled_lines']
        pff = kwargs['processed_files_limit']

        # Collect every file directly in audio_dir or one level below it.
        all_files = []
        for ad in os.listdir(audio_dir):
            ad_path = os.path.abspath(os.path.join(audio_dir, ad))
            if os.path.isdir(ad_path):
                files = os.listdir(ad_path)
                all_files += [
                    os.path.abspath(os.path.join(ad_path, f)) for f in files
                ]
            else:
                all_files += [ad_path]
        self.all_files = [f for f in all_files if f.endswith(".ogg")]
        frames = []
        counter = 0
        for f in self.all_files:
            season, episode = self.extract_season(f)
            # NOTE(review): S11E06 is skipped deliberately — confirm why.
            if season is None or (season == 11 and episode == 6):
                continue
            subtitle_frame = data[((data['season'] == season) &
                                   (data['episode'] == episode))]
            if subtitle_frame.shape[0] == 0:
                continue

            #To cause loop to end early, remove if needed
            if oll:
                label_frame = subtitle_frame[(subtitle_frame['label'] != "")]
                if label_frame.shape[0] == 0:
                    continue
            if pff is not None and isinstance(pff, int) and counter >= pff:
                break

            counter += 1
            log.info("On file {0} Season {1} Episode {2}".format(
                counter, season, episode))
            f_data, fs, enc = oggread(f)
            subtitle_frame = subtitle_frame.sort('start')
            subtitle_frame.index = range(subtitle_frame.shape[0])
            samps = []
            good_rows = []
            for i in xrange(0, subtitle_frame.shape[0]):
                start = subtitle_frame['start'].iloc[i]
                end = subtitle_frame['end'].iloc[i]
                # Skip clips longer than 6 seconds and, when restricted to
                # labelled lines, unlabelled rows.
                if end - start > 6 or (subtitle_frame['label'][i] == ''
                                       and oll):
                    continue
                samp = f_data[(start * fs):(end * fs), :]
                samps.append({'samp': samp, 'fs': fs})
                good_rows.append(i)
            r = p.imap(process_subtitle, samps, chunksize=1)
            sf = subtitle_frame.iloc[good_rows]
            results = []
            for i in range(len(samps)):
                try:
                    results.append(r.next(timeout=timeout))
                except TimeoutError:
                    # A worker exceeding the timeout yields a None row,
                    # filtered out below.
                    results.append(None)
            good_rows = [
                i for i in xrange(0, len(results)) if results[i] is not None
            ]
            audio_features = [i for i in results if i is not None]
            good_sf = sf.iloc[good_rows]
            good_sf.index = range(good_sf.shape[0])
            audio_frame = pd.DataFrame(audio_features)
            audio_frame.index = range(audio_frame.shape[0])
            df = pd.concat([good_sf, audio_frame], axis=1)
            df = df.fillna(-1)
            df.index = range(df.shape[0])
            frames.append(df)
            lab_df_shape = df[df['label'] != ''].shape[0]
            log.info("Processed {0} lines, {1} of which were labelled".format(
                df.shape[0], lab_df_shape))
        p.close()
        p.join()
        log.info("Done processing episodes.")
        data = pd.concat(frames, axis=0)
        # Removed a duplicated `data.index = range(...)` assignment here.
        data.index = range(data.shape[0])

        # Keep only the real part of any complex-valued feature columns.
        for c in list(data.columns):
            data[c] = data[c].real
        # Collapse character aliases onto their canonical names.
        # NOTE(review): chained assignment (data['label'][mask] = k) relies
        # on old pandas write-through behavior; .loc would be safer.
        for k in CHARACTERS:
            for i in CHARACTERS[k]:
                data['label'][data['label'] == i] = k
        self.label_codes = {k: i for (i, k) in enumerate(set(data['label']))}
        reverse_label_codes = {
            self.label_codes[k]: k
            for k in self.label_codes
        }
        data['label_code'] = [self.label_codes[k] for k in data['label']]
        self.seq = SequentialValidate()

        #Do cv to get error estimates
        cv_frame = data[data['label'] != ""]
        self.seq.train(cv_frame, **self.seq.args)
        self.res = self.seq.results
        self.res = self.res[[
            'line', 'label', 'label_code', 'result_code', 'result_label'
        ]]

        exact_percent, adj_percent = compute_error(self.res)
        log.info("Exact match percent: {0}".format(exact_percent))
        log.info("Adjacent match percent: {0}".format(adj_percent))
        #Predict in the frame
        alg = RandomForestTrain()
        target = cv_frame['label_code']
        non_predictors = ["label", "line", "label_code"]
        train_names = [
            l for l in list(cv_frame.columns) if l not in non_predictors
        ]
        train_data = cv_frame[train_names]
        predict_data = data[train_names]
        clf = alg.train(train_data, target, **alg.args)
        data['result_code'] = alg.predict(predict_data)
        data['result_label'] = [
            reverse_label_codes[k] for k in data['result_code']
        ]
        return data