Example 1
class EvolveMusic(Task):
    data = Complex()
    clf = Complex()
    importances = Complex()

    data_format = MusicFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    args = {
        'non_predictors' : ["labels","label_code","fs","enc","fname","Unnamed: 0"],
        'target_var' : 'label_code',
    }

    def train(self, data, target, **kwargs):
        non_predictors = kwargs.get('non_predictors')
        target = kwargs.get('target_var')

        data.index = range(data.shape[0])

        alg = RandomForestTrain()
        good_names = [i for i in data.columns if i not in non_predictors]
        for c in good_names:
            data[c] = data[c].astype(float)

        for c in good_names:
            data[c] = data[c].real

        clf = alg.train(np.asarray(data[good_names]),data[target],**alg.args)
        importances = clf.feature_importances_

        counter = 0
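        # For each classical track, repeatedly splice in material from other
        # songs and log how the classifier's quality estimate evolves.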
        for i in xrange(0,data.shape[0]):
            fname = data['fname'][i]
            vec, fs, enc = read_sound(fname)
            label = data["labels"][i]
            if label=="classical":
                counter+=1
                name = fname.split("/")[-1]
                feats = process_song(vec,fs)
                initial_quality = clf.predict_proba(feats)[0,1]
                headers = "song_index,iteration,quality,distance,splice_song_index,splice_song"
                v2s = [headers,"{0},{1},{2},{3},{4},{5}".format(i,-1,initial_quality,0,0,"N/A")]
                print(headers)
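                # Evolve for 100 iterations, splicing in a new random song every 10.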
                for z in xrange(0,100):
                    if z%10==0:
                        v2ind = random.randint(0,data.shape[0]-1)
                        v2fname = data['fname'][v2ind]
                        vec2, v2fs, v2enc = read_sound(v2fname)
                        feats = process_song(vec,fs)
                        quality = clf.predict_proba(feats)[0,1]
                        nearest_match, min_dist = find_nearest_match(feats, data[good_names])
                        descriptor = "{0},{1},{2},{3},{4},{5}".format(i,z,quality,min_dist,v2ind,v2fname.split("/")[-1])
                        v2s.append(descriptor)
                        print(descriptor)
                        if min_dist>.35 and (abs(quality-0)<=.1 or abs(1-quality)<=.1) and z!=0:
                            write_file(name,vec,fs,enc,v2s)
                    vec = alter(vec,vec2,fs,v2fs,clf)
                write_file(name,vec,fs,enc,v2s)
Example 2
class ReformatScriptText(Task):
    data = Complex()
    voice_lines = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    args = {
        'scriptfile': os.path.abspath(os.path.join(settings.PROJECT_PATH, "data/raw_scripts2.json")),
        'do_replace': True
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = data
        self.predict(self.data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """

        voice_scripts = list(data['voice_script'])
        scriptfile = kwargs['scriptfile']
        do_replace = kwargs['do_replace']
        json_scripts = json.load(open(scriptfile))
        voice_scripts+=[s['script'] for s in json_scripts]
        script_segments = []
        for script in voice_scripts:
            script = script.replace("\"","")
            lines = script.split("\n")
            segment = []
            for line in lines:
                if line.strip()!="":
                    line = line.encode('ascii','ignore')
                    line_split = line.split(":")
                    if do_replace:
                        line_split[0] = find_replacement(line_split[0].strip())
                    line_split[0] = cleanup_name(line_split[0].strip())
                    segment.append({'speaker' : line_split[0],
                                    'line' : ":".join(line_split[1:]).strip()})
                else:
                    if len(segment)>0:
                        script_segments.append(segment)
                        segment = []
            if len(segment)>0:
                script_segments.append(segment)
        self.voice_lines = script_segments
Example 3
class CleanupScriptList(Task):
    data = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """

        script_removal_values = [""]
        for r in script_removal_values:
            data = data[data["script"]!=r]
        log.info(data)
        data['episode_name'] = [i.split('\n')[0].strip() for i in data['episode_name']]
        data['episode_code'] = [i.split('/')[-1].split('.html')[0] for i in data['url']]

        data.index = range(data.shape[0])
        return data
Example 4
class Train(Task):
    """
    A class to train a support vector machine algorithm
    """
    clf = Complex()
    category = RegistryCategories.algorithms
    algorithm = svm.SVC
    args = {'C': 1.0}
    tester = SVMTester
    test_cases = [{
        'data':
        os.path.abspath(
            os.path.join(settings.PACKAGE_PATH, 'tests/data/csv/1/data.csv')),
        'target':
        os.path.abspath(
            os.path.join(settings.PACKAGE_PATH,
                         'tests/data/csv/1/target.csv')),
        'dataformat':
        DataFormats.csv
    }]
    help_text = "Example class to train and predict with SVM."

    def train(self, data, target, **kwargs):
        # When doing self.clf = clf, __set__ is called on the field.
        # But when doing self.clf = self.algorithm() and self.clf.fit(), __set__ is not called.
        # Work around this by fitting a local variable, then assigning it to self.clf.
        clf = self.algorithm(**kwargs)
        clf = clf.fit(data, target)
        self.clf = clf
        return self.clf

    def predict(self, test_data, **kwargs):
        return self.clf.predict(test_data)
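
A minimal usage sketch (not part of the original class), assuming the Task machinery allows no-argument construction, as the algo() calls in the other examples suggest, and that numpy is available:

import numpy as np

# Toy data: two linearly separable classes.
X = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.0], [0.9, 1.1]])
y = np.array([0, 0, 1, 1])

alg = Train()
clf = alg.train(X, y, **Train.args)  # fits svm.SVC(C=1.0) and stores it in alg.clf
print(alg.predict(X))                # -> [0 0 1 1]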
Example 5
class ProcessMusic(Task):
    data = Complex()

    data_format = MusicFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process sports events."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """
        d = []
        labels = []
        encs = []
        fss = []
        fnames = []
        if not os.path.isfile(settings.FEATURE_PATH):
            for (z,p) in enumerate(data):
                log.info("On file {0}".format(z))
                try:
                    vec, fs, enc = read_sound(p['newpath'])
                except Exception:
                    continue
                try:
                    features = process_song(vec, fs)
                except Exception:
                    log.exception("Could not get features")
                    continue
                d.append(features)
                labels.append(p['type'])
                fss.append(fs)
                encs.append(enc)
                fnames.append(p['newpath'])
            frame = pd.DataFrame(d)
            frame['labels'] = labels
            frame['fs'] = fss
            frame['enc'] = encs
            frame['fname'] = fnames
            label_dict = {
                'classical' : 1,
                'electronic' : 0
            }
            frame['label_code'] = [label_dict[i] for i in frame['labels']]
            frame.to_csv(settings.FEATURE_PATH)
        else:
            frame = pd.read_csv(settings.FEATURE_PATH)

        return frame
Example 6
class ProcessGames(Task):
    data = Complex()
    row_data = List()
    speaker_code_dict = Dict()
    speaker_codes = List()
    vectorizer = Complex()

    data_format = SportsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process sports events."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """

        con = sqlite3.connect(settings.DB_PATH)
        c = con.cursor()
        rosters = sql.read_frame("select * from rosters", con)

        tys = []
        for i in xrange(0, rosters.shape[0]):
            year = rosters.iloc[i]['year']
            team = rosters.iloc[i]['team']
            ty = [year, team]
            if ty not in tys:
                tys.append(ty)

        for ty in tys:
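            # Pull the roster (player ids) for this (year, team) pair.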
            year, team = ty
            ros = rosters[((rosters['year'] == year) &
                           (rosters['team'] == team))]
            players = list(ros['id'])

        return data
Example 7
class CleanupScriptText(Task):
    data = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data)

    def check_for_line_split(self, line):
        return check_if_character(line.split(":")[0])

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """

        voice_scripts = []
        for i in xrange(0,data.shape[0]):
            script_lines = data['script'][i].split('\n')
            voice_lines = []
            current_line = ""
            for line in script_lines:
                current_line = current_line.strip()
                line = line.strip()
                if line.startswith("[") and line.endswith("]"):
                    continue
                if line.startswith("-"):
                    continue
                voice_line = re.search('\w+:',line)
                if voice_line is not None:
                    if self.check_for_line_split(current_line):
                        voice_lines.append(current_line)
                    current_line = line
                elif (len(line)==0 or line.startswith("-")) and len(current_line)>0:
                    if self.check_for_line_split(current_line):
                        voice_lines.append(current_line)
                    current_line = ""
                    voice_lines.append(" ")
                elif len(current_line)>0:
                    current_line+=" " + line
            script_text = "\n".join([l for l in voice_lines if len(l)>0 and "{" not in l and "=" not in l])
            script_text = re.sub("\[.+\]","",script_text)
            voice_scripts.append(script_text.strip())

        data['voice_script'] = voice_scripts

        return data
Example 8
class RandomForestTrain(Train):
    """
    A class to train a random forest
    """
    colnames = List()
    clf = Complex()
    category = RegistryCategories.algorithms
    namespace = get_namespace(__module__)
    algorithm = RandomForestClassifier
    args = {'n_estimators' : 300, 'min_samples_leaf' : 4, 'compute_importances' : True}

    help_text = "Train and predict with Random Forest."
Example 9
class PullDownComments(Task):
    data = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Pull down comments and store them."

    def train(self, data, **kwargs):
        try:
            items_done = read_raw_data_from_cache(
                os.path.abspath(
                    os.path.join(settings.DATA_PATH, "items_done.p")))
            comments = [c['comment'] for c in items_done]
            replies = [c['reply'] for c in items_done]
            for subreddit in settings.REPLY_SUBREDDIT_LIST:
                try:
                    comment = get_single_comment(subreddit)
                    print(comment)
                    if comment is None:
                        log.info("Could not get a comment")
                        continue
                    text = comment.body
                    cid = comment.id
                    reply = test_knn_matcher(knn_matcher, text)
                    if text in comments or (reply in replies
                                            and reply is not None):
                        continue
                    data = {'comment': text, 'reply': reply, 'comment_id': cid}
                    items_done.append(data)
                    replies.append(reply)
                    comments.append(text)
                    log.info("Subreddit: {0}".format(subreddit))
                    log.info("Comment: {0} {1}".format(cid, text))
                    log.info("Reply: {0}".format(reply))
                    log.info("-------------------")
                except Exception:
                    log.exception("Cannot get reply for {0}".format(subreddit))
                    continue
                write_data_to_cache(items_done, "items_done.p", "comment_id")
        except Exception:
            log.exception("Could not pull down comment.")
Example 10
class GenerateTransitionMatrix(Task):
    data = Complex()

    data_format = MusicFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process midi files."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """
        tempos = {'tick' : [], 'mpqn' : []}
        notes = {}
        all_instruments = []
        for (z,p) in enumerate(data):
            log.info("On file {0}".format(z))
            try:
                m = midi.read_midifile(p['path'])
            except Exception:
                continue
            try:
                notes, tempos, instruments = process_midifile(m,notes,tempos)
                all_instruments.append(instruments)
            except Exception:
                log.exception("Could not get features")
                continue
        nm, tm = generate_matrices(notes,tempos)

        data = {'files' : data, 'notes' : notes, 'tempos' : tempos, 'nm' : nm, 'tm': tm, 'in' : list(chain.from_iterable(all_instruments))}

        return data
Example 11
class Normalize(Task):
    """
    Normalize values by reducing standard deviation to 1 and mean to 0
    """
    column_means = List()
    column_stdevs = List()
    category = RegistryCategories.preprocessors
    tester = NormalizationTester
    test_cases = [{'stream' : os.path.abspath(os.path.join(settings.PACKAGE_PATH,'tests/data/csv/1/data.csv')), 'dataformat' : DataFormats.csv}]
    data = Complex()

    help_text = "Example class to normalize input values."

    def train(self, data, **kwargs):
        """
        Calculate the standard deviations and means in the training data
        """
        self.data = data
        for i in xrange(0,data.shape[1]):
            column_mean = np.mean(data.icol(i))
            column_stdev = np.std(data.icol(i))

            # Have to use += here; the "list" field type fails with append.
            self.column_means += [column_mean]
            self.column_stdevs += [column_stdev]

        self.data = self.predict(data)

    def predict(self, test_data, **kwargs):
        """
        Adjust new input by the values in the training data
        """
        if test_data.shape[1]!=self.data.shape[1]:
            raise Exception("Test data has different number of columns than training data.")
        for i in xrange(0,test_data.shape[1]):
            test_data.loc[:,i] = test_data.icol(i) - self.column_means[i]
            if self.column_stdevs[i]!=0:
                test_data.loc[:,i] = test_data.icol(i) / self.column_stdevs[i]
        return test_data
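
A short usage sketch, assuming the era-appropriate pandas (with DataFrame.icol) and integer-labelled columns, since predict indexes columns both positionally and by label:

import pandas as pd

frame = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [10.0, 20.0, 30.0]})

norm = Normalize()
norm.train(frame)  # records per-column means/stdevs, then normalizes via predict
print(norm.data)   # each column now has mean 0 and unit (population) stdev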
Example 12
class KNNRF(Task):
    data = Complex()
    predictions = Complex()
    importances = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    args = {'algo': RandomForestTrain}

    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """
        from preprocess import CHARACTERS

        vec_length = math.floor(MAX_FEATURES / 3)

        algo = kwargs.get('algo')
        alg = algo()
        train_data = data['train_frame'].iloc[:, :-1]
        target = data['train_frame']['current_speaker']
        clf = alg.train(train_data, target, **algo.args)
        self.importances = clf.feature_importances_

        test_data = data['data']
        match_data = data['current_features']
        reverse_speaker_code_dict = {
            data['speaker_code_dict'][k]: k
            for k in data['speaker_code_dict']
        }

        speaker_list = []
        speaker_codes = reverse_speaker_code_dict.keys()
        for i in xrange(0, len(speaker_codes)):
            s_text = "\n".join(
                list(data['speakers'][data['speakers']['speaker'] ==
                                      reverse_speaker_code_dict[
                                          speaker_codes[i]]]['line']))
            speaker_list.append(s_text)
        speaker_features = data['vectorizer'].batch_get_features(speaker_list)

        self.predictions = []
        counter = 0
        for script in test_data['voice_script']:
            counter += 1
            log.info("On script {0} out of {1}".format(
                counter, len(test_data['voice_script'])))
            lines = script.split("\n")
            speaker_code = [-1 for i in xrange(0, len(lines))]
            for (i, line) in enumerate(lines):
                if i > 0 and i % RESET_SCENE_EVERY != 0:
                    previous_line = lines[i - 1]
                    previous_speaker = speaker_code[i - 1]
                else:
                    previous_line = ""
                    previous_speaker = -1

                if i > 1 and i % RESET_SCENE_EVERY != 0:
                    two_back_speaker = speaker_code[i - 2]
                else:
                    two_back_speaker = -1

                if i < (len(lines) - 1):
                    next_line = lines[i + 1]
                else:
                    next_line = ""

                prev_features = data['vectorizer'].get_features(previous_line)
                cur_features = data['vectorizer'].get_features(line)
                next_features = data['vectorizer'].get_features(next_line)

                meta_features = make_df(
                    [[two_back_speaker], [previous_speaker]],
                    ["two_back_speaker", "previous_speaker"])
                #meta_features = make_df([[two_back_speaker]],["two_back_speaker"])
                train_frame = pd.concat([
                    pd.DataFrame(prev_features),
                    pd.DataFrame(cur_features),
                    pd.DataFrame(next_features), meta_features
                ],
                                        axis=1)

                speaker_code[i] = alg.predict(train_frame)[0]

                nearest_match, distance = self.find_nearest_match(
                    cur_features, speaker_features)
                if distance < CHARACTER_DISTANCE_MIN:
                    sc = speaker_codes[nearest_match]
                    speaker_code[i] = sc
                    continue

                for k in CHARACTERS:
                    for c in CHARACTERS[k]:
                        if c in previous_line:
                            speaker_code[i] = data['speaker_code_dict'][k]

                nearest_match, distance = self.find_nearest_match(
                    cur_features, match_data)
                if distance < DISTANCE_MIN:
                    sc = data['speakers']['speaker_code'][nearest_match]
                    speaker_code[i] = sc
                    continue

            df = make_df([
                lines, speaker_code,
                [reverse_speaker_code_dict[round(s)] for s in speaker_code]
            ], ["line", "speaker_code", "speaker"])
            self.predictions.append(df)
        return data

    def find_nearest_match(self, features, matrix):
        features = np.asarray(features)
        distances = [self.euclidean(u, features) for u in matrix]
        nearest_match = distances.index(min(distances))
        return nearest_match, min(distances)

    def euclidean(self, v1, v2):
        return np.sqrt(np.sum(np.square(np.subtract(v1, v2))))
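
A standalone check of the distance helper's formula (the plain Euclidean norm), using a hypothetical 3-4-5 example:

import numpy as np

def euclidean(v1, v2):
    # Identical formula to KNNRF.euclidean above.
    return np.sqrt(np.sum(np.square(np.subtract(v1, v2))))

print(euclidean([0.0, 0.0], [3.0, 4.0]))  # -> 5.0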
Example 13
class FeatureExtractor(Task):
    data = Complex()
    row_data = List()
    speaker_code_dict = Dict()
    speaker_codes = List()
    vectorizer = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    args = {
        'scriptfile':
        os.path.abspath(os.path.join(settings.DATA_PATH, "script_tasks"))
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """
        scriptfile = kwargs.get('scriptfile')
        script_data = pickle.load(open(scriptfile))
        script = script_data.tasks[2].voice_lines.value
        speakers = []
        lines = []
        for s in script:
            for (i, l) in enumerate(s):
                if i > 0:
                    previous_line = s[i - 1]['line']
                    previous_speaker = s[i - 1]['speaker']
                else:
                    previous_line = ""
                    previous_speaker = ""

                if i > 1:
                    two_back_speaker = s[i - 2]['speaker']
                else:
                    two_back_speaker = ""

                if len(s) > i + 1:
                    next_line = s[i + 1]['line']
                else:
                    next_line = ""
                current_line = s[i]['line']
                current_speaker = s[i]['speaker']
                lines.append(current_line)
                speakers.append(current_speaker)
                row_data = {
                    'previous_line': previous_line,
                    'previous_speaker': previous_speaker,
                    'next_line': next_line,
                    'current_line': current_line,
                    'current_speaker': current_speaker,
                    'two_back_speaker': two_back_speaker
                }
                self.row_data.append(row_data)
        self.speaker_code_dict = {
            k: i
            for (i, k) in enumerate(list(set(speakers)))
        }
        self.speaker_codes = [self.speaker_code_dict[s] for s in speakers]
        self.max_features = math.floor(MAX_FEATURES / 3)
        self.vectorizer = Vectorizer()
        self.vectorizer.fit(lines, self.speaker_codes, self.max_features)
        prev_features = self.vectorizer.batch_get_features(
            [rd['previous_line'] for rd in self.row_data])
        cur_features = self.vectorizer.batch_get_features(
            [rd['current_line'] for rd in self.row_data])
        next_features = self.vectorizer.batch_get_features(
            [rd['next_line'] for rd in self.row_data])

        self.speaker_code_dict.update({'': -1})
        meta_features = make_df(
            [[
                self.speaker_code_dict[s['two_back_speaker']]
                for s in self.row_data
            ],
             [
                 self.speaker_code_dict[s['previous_speaker']]
                 for s in self.row_data
             ], self.speaker_codes],
            ["two_back_speaker", "previous_speaker", "current_speaker"])
        #meta_features = make_df([[self.speaker_code_dict[s['two_back_speaker']] for s in self.row_data], self.speaker_codes],["two_back_speaker", "current_speaker"])
        train_frame = pd.concat([
            pd.DataFrame(prev_features),
            pd.DataFrame(cur_features),
            pd.DataFrame(next_features), meta_features
        ],
                                axis=1)
        train_frame.index = range(train_frame.shape[0])
        data = {
            'vectorizer':
            self.vectorizer,
            'speaker_code_dict':
            self.speaker_code_dict,
            'train_frame':
            train_frame,
            'speakers':
            make_df([speakers, self.speaker_codes, lines],
                    ["speaker", "speaker_code", "line"]),
            'data':
            data,
            'current_features':
            cur_features,
        }
        return data
Example 14
class ClusterScriptText(Task):
    data = Complex()
    predictions = Complex()
    clusters = List()
    cl = Complex()
    vec = Complex()
    vec1 = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cluster simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = data
        self.predict(self.data)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """

        from train import Vectorizer, make_df

        self.vec = Vectorizer()

        reformatter = ReformatScriptText()
        args = reformatter.args
        args['do_replace'] = False
        reformatter.train(data, "", **args)

        script_segments = list(chain.from_iterable(reformatter.voice_lines))
        text = [s['line'] for s in script_segments]
        speaker = [s['speaker'] for s in script_segments]
        unique_speakers = list(set(speaker))
        speaker_code_dict = {k:i for (i,k) in enumerate(unique_speakers)}
        speaker_codes = [speaker_code_dict[k] for k in unique_speakers]
        speaker_list = []
        speaker_frame = make_df([text,speaker],["text","speaker"])
        for i in unique_speakers:
            s_text = "\n".join(list(speaker_frame[speaker_frame['speaker']==i]['text']))
            speaker_list.append(s_text)

        self.vec.fit(speaker_list, speaker_codes, 200, min_features=2)
        features = self.vec.batch_get_features(speaker_list)

        cl = KMeans()
        self.predictions = cl.fit_predict(features)
        self.cl = cl

        for i in xrange(0,max(self.predictions)+1):
            clust = []
            for c in xrange(0,len(speaker_codes)):
                if self.predictions[c]==i:
                    clust.append(unique_speakers[c])
            self.clusters.append(clust)

        pca = PCA(n_components=2, whiten=True).fit(features)
        rf = pca.transform(features)
        labels = cl.labels_
        pyplot.clf()
        centroids = cl.cluster_centers_
        pyplot.cla()
        for i in range(max(labels)+1):
            ds = rf[np.where(labels==i)]
            pyplot.plot(ds[:,0],ds[:,1],'o', label=self.clusters[i][0])
        pyplot.legend(loc=8)
        pyplot.savefig('clusters.png')

        self.vec1 = Vectorizer()
        speaker_codes = [speaker_code_dict[k] for k in speaker]

        self.vec1.fit(text, speaker_codes, 200, min_features=2)
        features = self.vec1.batch_get_features(text)

        pca = PCA(n_components=2, whiten=True).fit(features)
        rf = pca.transform(features)
        pyplot.clf()
        pyplot.cla()
        for i in range(len(speaker_codes)):
            pyplot.plot(rf[i,0],rf[i,1],'o', label=speaker[i])
        pyplot.savefig('all_speakers.png')
Example 15
class CrossValidate(Task):
    data = Complex()
    results = Complex()
    error = Float()
    importances = Complex()
    importance = Complex()
    column_names = List()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    args = {
        'nfolds': 3,
        'algo': RandomForestTrain,
        'target_name': 'label_code',
        'non_predictors': ["label", "line", "label_code"]
    }

    help_text = "Cross validate simpsons data."

    def cross_validate(self, data, **kwargs):
        nfolds = kwargs.get('nfolds', 3)
        algo = kwargs.get('algo')
        seed = kwargs.get('seed', 1)
        self.target_name = kwargs.get('target_name')
        non_predictors = kwargs.get('non_predictors')

        self.column_names = [
            l for l in list(data.columns) if l not in non_predictors
        ]
        data_len = data.shape[0]
        counter = 0
        fold_length = int(math.floor(data_len / nfolds))
        folds = []
        data_seq = list(xrange(0, data_len))
        random.seed(seed)
        random.shuffle(data_seq)

        for fold in xrange(0, nfolds):
            start = counter

            end = counter + fold_length
            if fold == (nfolds - 1):
                end = data_len
            folds.append(data_seq[start:end])
            counter += fold_length

        results = []
        data.index = range(data.shape[0])
        self.importances = []
        for (i, fold) in enumerate(folds):
            predict_data = data.iloc[fold, :]
            out_indices = list(chain.from_iterable(folds[:i] +
                                                   folds[(i + 1):]))
            train_data = data.iloc[out_indices, :]
            alg = algo()
            target = train_data[self.target_name]
            train_data = train_data[[
                l for l in list(train_data.columns) if l not in non_predictors
            ]]
            predict_data = predict_data[[
                l for l in list(predict_data.columns)
                if l not in non_predictors
            ]]
            clf = alg.train(train_data, target, **algo.args)
            results.append(alg.predict(predict_data))
            self.importances.append(clf.feature_importances_)
        return results, folds

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.target_name = kwargs.get('target_name')
        results, folds = self.cross_validate(data, **kwargs)
        self.gather_results(results, folds, data)

    def gather_results(self, results, folds, data):
        full_results = list(chain.from_iterable(results))
        full_indices = list(chain.from_iterable(folds))
        partial_result_df = make_df([full_results, full_indices],
                                    ["result", "index"])
        partial_result_df = partial_result_df.sort(["index"])
        partial_result_df.index = range(partial_result_df.shape[0])
        result_df = pd.concat([partial_result_df, data], axis=1)
        self.results = result_df
        self.calc_importance(self.importances, self.column_names)

    def calc_error(self, result_df):
        self.error = np.mean(
            np.abs(result_df['result'] - result_df[self.target_name]))

    def calc_importance(self, importances, col_names):
        importance_frame = pd.DataFrame(importances)
        importance_frame.columns = col_names
        self.importance = importance_frame.mean(axis=0)
        self.importance.sort(0)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """
        pass
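
The fold construction in cross_validate is plain index bookkeeping; here is a standalone sketch of the same scheme (shuffle the row indices, cut nfolds contiguous slices, let the last fold absorb the remainder):

import math
import random

def make_folds(data_len, nfolds=3, seed=1):
    # Same splitting scheme as CrossValidate.cross_validate above.
    data_seq = list(range(data_len))
    random.seed(seed)
    random.shuffle(data_seq)
    fold_length = int(math.floor(data_len / nfolds))
    folds = []
    counter = 0
    for fold in range(nfolds):
        end = counter + fold_length
        if fold == nfolds - 1:
            end = data_len  # last fold absorbs any remainder
        folds.append(data_seq[counter:end])
        counter += fold_length
    return folds

print([len(f) for f in make_folds(10)])  # -> [3, 3, 4]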
Example 16
class GenerateMarkovTracks(Task):
    data = Complex()

    data_format = MusicFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process midi files."

    args = {
        'non_predictors' : ["labels","label_code","fs","enc","fname","Unnamed: 0"],
        'target_var' : 'label_code',
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """
        frame1 = pd.read_csv(settings.MIDI_FEATURE_PATH)
        frame2 = pd.read_csv(settings.FEATURE_PATH)

        frame = pd.concat([frame1,frame2],axis=0)
        non_predictors = kwargs.get('non_predictors')
        target = kwargs.get('target_var')

        frame.index = range(frame.shape[0])

        alg = RandomForestTrain()
        good_names = [i for i in frame.columns if i not in non_predictors]
        for c in good_names:
            frame[c] = frame[c].astype(float)

        for c in good_names:
            frame[c] = frame[c].real

        clf = alg.train(np.asarray(frame[good_names]),frame[target],**alg.args)

        evolutions = 2
        track_count = 100
        patterns_to_pick = int(math.floor(track_count/4))
        remixes_to_make = int(math.floor(track_count/4))
        additions_to_make = int(math.floor(track_count/4))
        patterns = generate_patterns(track_count,data)
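        # Each evolution: rank tracks by predicted quality, keep the top quarter,
        # breed remixes from random pairs, de-duplicate, then refill with fresh patterns.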
        for z in xrange(0,evolutions):
            new_quality, quality, patterns = rate_tracks(patterns, clf)
            patterns = patterns[0:patterns_to_pick]
            for i in xrange(0,remixes_to_make):
                patterns.append(remix(random.choice(patterns[:patterns_to_pick]), random.choice(patterns[:patterns_to_pick])))
            #for i in xrange(0,additions_to_make):
            #    patterns.append(add_song(random.choice(patterns[:patterns_to_pick]), random.choice(patterns[:patterns_to_pick])))
            new_patterns = []
            for p in patterns:
                if p not in new_patterns:
                    new_patterns.append(p)
            patterns = new_patterns
            patterns += generate_patterns(track_count - len(patterns), data)
        new_quality, quality, patterns = rate_tracks(patterns, clf)

        feats = []
        for (i,p) in enumerate(patterns):
            time = strftime("%m-%d-%Y-%H%M%S", gmtime())
            fname = time+random.choice(words)+".mid"
            oggpath = write_and_convert(p,fname)
            dat, fs, enc = oggread(oggpath)
            f = process_song(dat[:settings.MUSIC_TIME_LIMIT * fs,:],fs)
            feats.append(f)
        feats = pd.DataFrame(feats)
        feats['label_code'] = [2] * feats.shape[0]
        feats['label'] = ["generated"] * feats.shape[0]
        feats.to_csv(os.path.abspath(os.path.join(settings.DATA_PATH,"generated_midi_features.csv")))

        return data
Example 17
class Validate(Task):
    data = Complex()
    results = Complex()
    error = Float()
    importances = Complex()
    importance = Complex()
    column_names = List()

    data_format = DataFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Validate."

    def cross_validate(self, data, non_predictors, **kwargs):
        nfolds = kwargs.get('nfolds', 3)
        algo = kwargs.get('algo')
        seed = kwargs.get('seed', 1)
        data_len = data.shape[0]
        counter = 0
        fold_length = int(math.floor(data_len/nfolds))
        folds = []
        data_seq = list(xrange(0,data_len))
        random.seed(seed)
        random.shuffle(data_seq)

        for fold in xrange(0, nfolds):
            start = counter

            end = counter + fold_length
            if fold == (nfolds-1):
                end = data_len
            folds.append(data_seq[start:end])
            counter += fold_length

        results = []
        data.index = range(data.shape[0])
        self.importances = []
        for (i,fold) in enumerate(folds):
            predict_data = data.iloc[fold,:]
            out_indices = list(chain.from_iterable(folds[:i] + folds[(i + 1):]))
            train_data = data.iloc[out_indices,:]
            alg = algo()
            target = train_data['next_year_wins']
            train_data = train_data[[l for l in list(train_data.columns) if l not in non_predictors]]
            predict_data = predict_data[[l for l in list(predict_data.columns) if l not in non_predictors]]
            clf = alg.train(train_data,target,**algo.args)
            results.append(alg.predict(predict_data))
            self.importances.append(clf.feature_importances_)
        return results, folds

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        non_predictors = [i.replace(" ", "_").lower() for i in list(set(data['team']))] + ["team", "next_year_wins"]
        self.column_names = [l for l in list(data.columns) if l not in non_predictors]
        results, folds = self.cross_validate(data, non_predictors, **kwargs)
        self.gather_results(results, folds, data)

    def gather_results(self, results, folds, data):
        full_results = list(chain.from_iterable(results))
        full_indices = list(chain.from_iterable(folds))
        partial_result_df = make_df([full_results, full_indices], ["result", "index"])
        partial_result_df = partial_result_df.sort(["index"])
        partial_result_df.index = range(partial_result_df.shape[0])
        result_df = pd.concat([partial_result_df, data[['next_year_wins', 'team', 'year', 'total_wins']]], axis=1)
        result_df = result_df[(result_df['next_year_wins']>0) & (result_df['total_wins']>0)]
        self.results = result_df
        self.calc_error(result_df)
        self.calc_importance(self.importances, self.column_names)

    def calc_error(self, result_df):
        filtered_df = result_df[result_df['year']<np.max(result_df['year'])]
        self.error = np.mean(np.abs(filtered_df['result'] - filtered_df['next_year_wins']))

    def calc_importance(self, importances, col_names):
        importance_frame = pd.DataFrame(importances)
        importance_frame.columns = col_names
        self.importance = importance_frame.mean(axis=0)
        self.importance.sort(0)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """

        pass
Example 18
class LoadAudioFiles(Task):
    data = Complex()
    all_files = List()
    seq = Complex()
    res = Complex()
    label_codes = Dict()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    args = {
        'audio_dir': settings.AUDIO_DIR,
        'timeout': 600,
        'only_labelled_lines': settings.ONLY_LABELLED_LINES,
        'processed_files_limit': settings.PROCESSED_FILES_LIMIT
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def extract_season(self, name):
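        # Handles two filename conventions: "[3x12]" / "[3.12]" and "S03E12".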
        match1 = re.search('\[(\d+)[x\.](\d+)\]', name)
        if match1 is not None:
            season = match1.group(1)
            episode = match1.group(2)
            return int(season), int(episode)

        match2 = re.search('S(\d+)E(\d+)', name)
        if match2 is not None:
            season = match2.group(1)
            episode = match2.group(2)
            return int(season), int(episode)

        return None, None

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """
        p = Pool(4, maxtasksperchild=50)
        audio_dir = kwargs['audio_dir']
        timeout = kwargs['timeout']
        oll = kwargs['only_labelled_lines']
        pff = kwargs['processed_files_limit']

        all_files = []
        for ad in os.listdir(audio_dir):
            ad_path = os.path.abspath(os.path.join(audio_dir, ad))
            if os.path.isdir(ad_path):
                files = os.listdir(ad_path)
                all_files += [
                    os.path.abspath(os.path.join(ad_path, f)) for f in files
                ]
            else:
                all_files += [ad_path]
        self.all_files = [f for f in all_files if f.endswith(".ogg")]
        frames = []
        counter = 0
        for f in self.all_files:
            season, episode = self.extract_season(f)
            if season is None or (season == 11 and episode == 6):
                continue
            subtitle_frame = data[((data['season'] == season) &
                                   (data['episode'] == episode))]
            if subtitle_frame.shape[0] == 0:
                continue

            #To cause loop to end early, remove if needed
            if oll:
                label_frame = subtitle_frame[(subtitle_frame['label'] != "")]
                if label_frame.shape[0] == 0:
                    continue
            if pff is not None and isinstance(pff, int) and counter >= pff:
                break

            counter += 1
            log.info("On file {0} Season {1} Episode {2}".format(
                counter, season, episode))
            f_data, fs, enc = oggread(f)
            subtitle_frame = subtitle_frame.sort('start')
            subtitle_frame.index = range(subtitle_frame.shape[0])
            samps = []
            good_rows = []
            for i in xrange(0, subtitle_frame.shape[0]):
                start = subtitle_frame['start'].iloc[i]
                end = subtitle_frame['end'].iloc[i]
                if end - start > 6 or (subtitle_frame['label'][i] == ''
                                       and oll):
                    continue
                samp = f_data[(start * fs):(end * fs), :]
                samps.append({'samp': samp, 'fs': fs})
                good_rows.append(i)
            r = p.imap(process_subtitle, samps, chunksize=1)
            sf = subtitle_frame.iloc[good_rows]
            results = []
            for i in range(len(samps)):
                try:
                    results.append(r.next(timeout=timeout))
                except TimeoutError:
                    results.append(None)
            good_rows = [
                i for i in xrange(0, len(results)) if results[i] is not None
            ]
            audio_features = [i for i in results if i is not None]
            good_sf = sf.iloc[good_rows]
            good_sf.index = range(good_sf.shape[0])
            audio_frame = pd.DataFrame(audio_features)
            audio_frame.index = range(audio_frame.shape[0])
            df = pd.concat([good_sf, audio_frame], axis=1)
            df = df.fillna(-1)
            df.index = range(df.shape[0])
            frames.append(df)
            lab_df_shape = df[df['label'] != ''].shape[0]
            log.info("Processed {0} lines, {1} of which were labelled".format(
                df.shape[0], lab_df_shape))
        p.close()
        p.join()
        log.info("Done processing episodes.")
        data = pd.concat(frames, axis=0)
        data.index = range(data.shape[0])

        for c in list(data.columns):
            data[c] = data[c].real
        for k in CHARACTERS:
            for i in CHARACTERS[k]:
                data['label'][data['label'] == i] = k
        self.label_codes = {k: i for (i, k) in enumerate(set(data['label']))}
        reverse_label_codes = {
            self.label_codes[k]: k
            for k in self.label_codes
        }
        data['label_code'] = [self.label_codes[k] for k in data['label']]
        self.seq = SequentialValidate()

        #Do cv to get error estimates
        cv_frame = data[data['label'] != ""]
        self.seq.train(cv_frame, **self.seq.args)
        self.res = self.seq.results
        self.res = self.res[[
            'line', 'label', 'label_code', 'result_code', 'result_label'
        ]]

        exact_percent, adj_percent = compute_error(self.res)
        log.info("Exact match percent: {0}".format(exact_percent))
        log.info("Adjacent match percent: {0}".format(adj_percent))
        #Predict in the frame
        alg = RandomForestTrain()
        target = cv_frame['label_code']
        non_predictors = ["label", "line", "label_code"]
        train_names = [
            l for l in list(cv_frame.columns) if l not in non_predictors
        ]
        train_data = cv_frame[train_names]
        predict_data = data[train_names]
        clf = alg.train(train_data, target, **alg.args)
        data['result_code'] = alg.predict(predict_data)
        data['result_label'] = [
            reverse_label_codes[k] for k in data['result_code']
        ]
        return data