class EvolveMusic(Task):
    """Evolve "classical" songs by repeatedly splicing other songs into them.

    A random forest is trained on the feature frame. Each song whose label is
    ``classical`` is then altered for 100 iterations while its predicted
    quality and its distance to the nearest training row are recorded as
    CSV-style rows.
    """
    data = Complex()
    clf = Complex()
    importances = Complex()

    data_format = MusicFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    args = {
        'non_predictors': ["labels", "label_code", "fs", "enc", "fname", "Unnamed: 0"],
        'target_var': 'label_code',
    }

    def train(self, data, target, **kwargs):
        """Train the classifier and evolve every classical song in ``data``.

        :param data: feature frame; the code reads the ``fname`` and
            ``labels`` columns plus the column named by ``target_var``.
        :param target: unused; the target column is taken from
            ``kwargs['target_var']``.
        :param kwargs: ``non_predictors`` (columns excluded from training)
            and ``target_var`` (name of the target column).
        """
        non_predictors = kwargs.get('non_predictors')
        target_var = kwargs.get('target_var')

        data.index = range(data.shape[0])

        alg = RandomForestTrain()
        good_names = [col for col in data.columns if col not in non_predictors]
        for col in good_names:
            data[col] = data[col].astype(float)
            data[col] = data[col].real

        clf = alg.train(np.asarray(data[good_names]), data[target_var], **alg.args)
        # Store the results on the declared fields; previously they were
        # computed into locals and discarded.
        self.clf = clf
        self.importances = clf.feature_importances_

        headers = "song_index,iteration,quality,distance,splice_song_index,splice_song"
        # NOTE(review): the source file had lost its indentation; the nesting
        # below is the most plausible reconstruction - confirm against history.
        for i in range(data.shape[0]):
            fname = data['fname'][i]
            vec, fs, enc = read_sound(fname)
            if data["labels"][i] != "classical":
                continue

            name = fname.split("/")[-1]
            feats = process_song(vec, fs)
            initial_quality = clf.predict_proba(feats)[0, 1]
            v2s = [headers, "{0},{1},{2},{3},{4},{5}".format(i, -1, initial_quality, 0, 0, "N/A")]
            print(headers)
            for z in range(100):
                # Pick a new random song to splice from every tenth iteration.
                if z % 10 == 0:
                    v2ind = random.randint(0, data.shape[0] - 1)
                    v2fname = data['fname'][v2ind]
                    vec2, v2fs, _ = read_sound(v2fname)
                feats = process_song(vec, fs)
                quality = clf.predict_proba(feats)[0, 1]
                _, min_dist = find_nearest_match(feats, data[good_names])
                descriptor = "{0},{1},{2},{3},{4},{5}".format(
                    i, z, quality, min_dist, v2ind, v2fname.split("/")[-1])
                v2s.append(descriptor)
                print(descriptor)
                # Snapshot the song once it is far from every training row
                # and the classifier is confident either way.
                confident = abs(quality - 0) <= .1 or abs(1 - quality) <= .1
                if min_dist > .35 and confident and z != 0:
                    write_file(name, vec, fs, enc, v2s)
                vec = alter(vec, vec2, fs, v2fs, clf)
            write_file(name, vec, fs, enc, v2s)
class ReformatScriptText(Task):
    """Split voice scripts into segments of ``{'speaker', 'line'}`` dicts.

    The result is stored on ``voice_lines`` as a list of segments; blank
    lines in a script mark the boundary between segments.
    """
    data = Complex()
    voice_lines = Complex()

    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    args = {
        'scriptfile': os.path.abspath(os.path.join(settings.PROJECT_PATH, "data/raw_scripts2.json")),
        'do_replace': True,
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Stores ``data`` and runs ``predict``.
        """
        self.data = data
        self.predict(self.data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        :param data: frame with a ``voice_script`` column.
        :param kwargs: ``scriptfile`` (path to a JSON list of objects with a
            ``script`` key) and ``do_replace`` (run ``find_replacement`` on
            each speaker name before ``cleanup_name``).
        """
        scriptfile = kwargs['scriptfile']
        do_replace = kwargs['do_replace']

        voice_scripts = list(data['voice_script'])
        # Close the file deterministically instead of leaking the handle.
        with open(scriptfile) as script_handle:
            json_scripts = json.load(script_handle)
        voice_scripts += [s['script'] for s in json_scripts]

        script_segments = []
        for script in voice_scripts:
            segment = []
            for line in script.replace("\"", "").split("\n"):
                if line.strip() == "":
                    # A blank line ends the current segment.
                    if segment:
                        script_segments.append(segment)
                    segment = []
                    continue
                # NOTE(review): on Python 3 this yields bytes and the
                # str-based split below would fail; the file targets Python 2.
                line = line.encode('ascii', 'ignore')
                line_split = line.split(":")
                if do_replace:
                    line_split[0] = find_replacement(line_split[0].strip())
                line_split[0] = cleanup_name(line_split[0].strip())
                segment.append({
                    'speaker': line_split[0],
                    'line': ":".join(line_split[1:]).strip(),
                })
            if segment:
                script_segments.append(segment)
        self.voice_lines = script_segments
class CleanupScriptList(Task):
    """Drop rows with empty scripts and derive episode name/code columns."""
    data = Complex()

    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Stores the cleaned frame on ``data``.
        """
        self.data = self.predict(data)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        Removes rows whose ``script`` is empty, keeps only the first line of
        ``episode_name``, derives ``episode_code`` from the last path segment
        of ``url`` (text before ``.html``) and resets the index.
        """
        unwanted_scripts = [""]
        kept = data
        for unwanted in unwanted_scripts:
            kept = kept[kept["script"] != unwanted]
        log.info(kept)

        first_lines = [name.split('\n')[0].strip() for name in kept['episode_name']]
        codes = [url.split('/')[-1].split('.html')[0] for url in kept['url']]
        kept['episode_name'] = first_lines
        kept['episode_code'] = codes
        kept.index = range(kept.shape[0])
        return kept
class Train(Task):
    """
    A class to train a support vector machine algorithm
    """
    clf = Complex()
    category = RegistryCategories.algorithms
    algorithm = svm.SVC
    args = {'C': 1.0}
    tester = SVMTester
    test_cases = [{
        'data': os.path.abspath(
            os.path.join(settings.PACKAGE_PATH, 'tests/data/csv/1/data.csv')),
        'target': os.path.abspath(
            os.path.join(settings.PACKAGE_PATH, 'tests/data/csv/1/target.csv')),
        'dataformat': DataFormats.csv
    }]
    help_text = "Example class to train and predict with SVM."

    def train(self, data, target, **kwargs):
        """Fit ``self.algorithm(**kwargs)`` on ``data``/``target``.

        Returns the value of ``self.clf`` after the fitted model is stored.
        """
        # Assigning to self.clf calls __set__ on the field, but mutating the
        # object afterwards (self.clf.fit(...)) does not.  So the model is
        # fitted as a local first and assigned to the field once it is ready.
        fitted = self.algorithm(**kwargs).fit(data, target)
        self.clf = fitted
        return self.clf

    def predict(self, test_data, **kwargs):
        """Return the stored model's predictions for ``test_data``."""
        return self.clf.predict(test_data)
class ProcessMusic(Task):
    """Extract per-song features into a frame cached at ``settings.FEATURE_PATH``."""
    data = Complex()

    data_format = MusicFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process sports events."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Stores the feature frame on ``data``.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        If ``settings.FEATURE_PATH`` already exists it is loaded and
        returned.  Otherwise every item of ``data`` (read via its ``newpath``
        and ``type`` keys) is run through ``read_sound`` and
        ``process_song``, and the resulting frame is written to that path.
        Items that cannot be read or processed are logged and skipped.
        """
        if os.path.isfile(settings.FEATURE_PATH):
            return pd.read_csv(settings.FEATURE_PATH)

        feature_rows = []
        labels = []
        encs = []
        fss = []
        fnames = []
        for (z, p) in enumerate(data):
            log.info("On file {0}".format(z))
            try:
                # Use a separate name: the original rebound ``data`` here
                # while iterating over it.
                sound, fs, enc = read_sound(p['newpath'])
            except Exception:
                # Previously skipped silently; leave a trace of what was dropped.
                log.exception("Could not read sound file")
                continue
            try:
                features = process_song(sound, fs)
            except Exception:
                log.exception("Could not get features")
                continue
            feature_rows.append(features)
            labels.append(p['type'])
            fss.append(fs)
            encs.append(enc)
            fnames.append(p['newpath'])

        frame = pd.DataFrame(feature_rows)
        frame['labels'] = labels
        frame['fs'] = fss
        frame['enc'] = encs
        frame['fname'] = fnames
        label_dict = {
            'classical': 1,
            'electronic': 0,
        }
        frame['label_code'] = [label_dict[label] for label in frame['labels']]
        frame.to_csv(settings.FEATURE_PATH)
        return frame
class ProcessGames(Task):
    """Read team rosters from the sqlite database at ``settings.DB_PATH``."""
    data = Complex()
    row_data = List()
    speaker_code_dict = Dict()
    speaker_codes = List()
    vectorizer = Complex()

    data_format = SportsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process sports events."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Stores the result of ``predict``.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        Loads the ``rosters`` table and walks every distinct (year, team)
        pair.  The per-team player lists are computed but not used, and
        ``data`` is returned unchanged.
        """
        con = sqlite3.connect(settings.DB_PATH)
        try:
            rosters = sql.read_frame("select * from rosters", con)
        finally:
            # The connection used to stay open for the life of the process.
            con.close()

        tys = []
        for i in range(rosters.shape[0]):
            row = rosters.iloc[i]
            ty = [row['year'], row['team']]
            if ty not in tys:
                tys.append(ty)

        for year, team in tys:
            ros = rosters[(rosters['year'] == year) & (rosters['team'] == team)]
            # NOTE(review): ``players`` is never consumed - presumably
            # unfinished work.
            players = list(ros['id'])
        return data
class CleanupScriptText(Task):
    """Reduce raw episode scripts to "Speaker: text" voice lines."""
    data = Complex()

    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Stores the result of ``predict``.
        """
        self.data = self.predict(data)

    def check_for_line_split(self, line):
        """Run ``check_if_character`` on the text before the first colon."""
        return check_if_character(line.split(":")[0])

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        Adds a ``voice_script`` column to ``data``.  For each ``script``,
        lines wrapped in square brackets or starting with ``-`` are skipped,
        continuation lines are joined onto the current speaker line, and a
        blank line emits a ``" "`` separator.
        """
        # Raw strings: the old '\w' and '\[' literals relied on invalid
        # escape sequences, which newer Pythons warn about.
        speaker_pattern = re.compile(r'\w+:')
        bracket_pattern = re.compile(r"\[.+\]")

        voice_scripts = []
        for row in range(data.shape[0]):
            voice_lines = []
            current_line = ""
            # NOTE(review): the source file had lost its indentation; the
            # nesting of the blank-line branch is reconstructed - confirm.
            for raw_line in data['script'][row].split('\n'):
                current_line = current_line.strip()
                line = raw_line.strip()
                if line.startswith("[") and line.endswith("]"):
                    continue
                if line.startswith("-"):
                    continue
                if speaker_pattern.search(line) is not None:
                    # A new speaker starts: flush the pending line first.
                    if self.check_for_line_split(current_line):
                        voice_lines.append(current_line)
                    current_line = line
                elif len(line) == 0 and len(current_line) > 0:
                    if self.check_for_line_split(current_line):
                        voice_lines.append(current_line)
                    current_line = ""
                    voice_lines.append(" ")
                elif len(current_line) > 0:
                    current_line += " " + line
            # NOTE(review): a pending ``current_line`` at the end of the
            # script is not flushed - confirm this is intended.
            script_text = "\n".join(
                [l for l in voice_lines if len(l) > 0 and "{" not in l and "=" not in l])
            script_text = bracket_pattern.sub("", script_text)
            voice_scripts.append(script_text.strip())

        data['voice_script'] = voice_scripts
        return data
class RandomForestTrain(Train):
    """
    A class to train a random forest

    Only the algorithm and its default arguments are set here; ``train`` and
    ``predict`` come from ``Train``.
    """
    colnames = List()
    clf = Complex()
    category = RegistryCategories.algorithms
    namespace = get_namespace(__module__)
    algorithm = RandomForestClassifier
    # Default keyword arguments; callers in this file pass them to ``train``
    # as ``**alg.args``.
    args = {
        'n_estimators': 300,
        'min_samples_leaf': 4,
        'compute_importances': True,
    }

    help_text = "Train and predict with Random Forest."
class PullDownComments(Task):
    """Fetch one comment per subreddit, match a reply, and cache the pairs."""
    data = Complex()

    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Pull down comments and store them."

    def train(self, data, **kwargs):
        """Pull a comment from each subreddit in ``settings.REPLY_SUBREDDIT_LIST``.

        New comment/reply pairs are appended to the cached ``items_done.p``
        list.  A pair is skipped when the comment text is already cached, or
        when the non-``None`` reply has already been used.  Failures are
        logged and never propagate to the caller.

        :param data: unused.
        """
        try:
            cache_path = os.path.abspath(
                os.path.join(settings.DATA_PATH, "items_done.p"))
            items_done = read_raw_data_from_cache(cache_path)
            comments = [c['comment'] for c in items_done]
            replies = [c['reply'] for c in items_done]
            for subreddit in settings.REPLY_SUBREDDIT_LIST:
                try:
                    comment = get_single_comment(subreddit)
                    # Function-call form works on Python 2 and 3 alike.
                    print(comment)
                    if comment is None:
                        log.info("Could not get a comment")
                        continue
                    text = comment.body
                    cid = comment.id
                    reply = test_knn_matcher(knn_matcher, text)
                    if text in comments or (reply in replies and reply is not None):
                        continue
                    item = {'comment': text, 'reply': reply, 'comment_id': cid}
                    items_done.append(item)
                    replies.append(reply)
                    comments.append(text)
                    log.info("Subreddit: {0}".format(subreddit))
                    log.info("Comment: {0} {1}".format(cid, text))
                    log.info("Reply: {0}".format(reply))
                    log.info("-------------------")
                except Exception:
                    # Was a bare ``except:``, which also swallowed
                    # KeyboardInterrupt and SystemExit.
                    log.exception("Cannot get reply for {0}".format(subreddit))
                    continue
            write_data_to_cache(items_done, "items_done.p", "comment_id")
        except Exception:
            log.exception("Could not pull down comment.")
class GenerateTransitionMatrix(Task):
    """Accumulate note/tempo data from MIDI files and build matrices from it."""
    data = Complex()

    data_format = MusicFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process midi files."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Stores the result of ``predict``.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        Each item of ``data`` is read via its ``path`` key and fed to
        ``process_midifile``; files that fail to load or process are logged
        and skipped.

        :returns: dict with keys ``files`` (the input), ``notes``,
            ``tempos``, ``nm`` and ``tm`` (the two values returned by
            ``generate_matrices``) and ``in`` (flattened instrument list).
        """
        tempos = {'tick': [], 'mpqn': []}
        notes = {}
        all_instruments = []
        for (z, p) in enumerate(data):
            log.info("On file {0}".format(z))
            try:
                m = midi.read_midifile(p['path'])
            except Exception:
                # Previously skipped silently; leave a trace of what was dropped.
                log.exception("Could not read midi file")
                continue
            try:
                notes, tempos, instruments = process_midifile(m, notes, tempos)
                all_instruments.append(instruments)
            except Exception:
                log.exception("Could not get features")
                continue

        nm, tm = generate_matrices(notes, tempos)
        return {
            'files': data,
            'notes': notes,
            'tempos': tempos,
            'nm': nm,
            'tm': tm,
            'in': list(chain.from_iterable(all_instruments)),
        }
class Normalize(Task):
    """
    Normalize values by reducing standard deviation to 1 and mean to 0
    """
    column_means = List()
    column_stdevs = List()
    category = RegistryCategories.preprocessors
    tester = NormalizationTester
    test_cases = [{
        'stream': os.path.abspath(
            os.path.join(settings.PACKAGE_PATH, 'tests/data/csv/1/data.csv')),
        'dataformat': DataFormats.csv,
    }]
    data = Complex()

    help_text = "Example class to normalize input values."

    def train(self, data, **kwargs):
        """
        Calculate the standard deviations and means in the training data,
        then store the normalized training data on ``self.data``.
        """
        self.data = data
        for i in range(data.shape[1]):
            column = data.iloc[:, i]
            # Have to do += or "list" type will fail (ie with append)
            self.column_means += [np.mean(column)]
            self.column_stdevs += [np.std(column)]

        self.data = self.predict(data)

    def predict(self, test_data, **kwargs):
        """
        Adjust new input by the values in the training data.

        ``test_data`` is modified in place and also returned.

        :raises Exception: if the column count differs from the training data.
        """
        if test_data.shape[1] != self.data.shape[1]:
            raise Exception("Test data has different number of columns than training data.")
        for i in range(test_data.shape[1]):
            # Address columns by position: ``.loc[:, i]`` is label-based and
            # would add new columns when the labels are not 0..n-1.
            column = test_data.iloc[:, i] - self.column_means[i]
            stdev = self.column_stdevs[i]
            # Compare the stdev itself; ``int(stdev)`` truncated anything
            # below 1 to 0, leaving such columns unscaled.
            if stdev != 0:
                column = column / stdev
            test_data.iloc[:, i] = column
        return test_data
class KNNRF(Task):
    """Label script lines with speakers using a random forest plus nearest-match overrides."""
    data = Complex()
    predictions = Complex()
    importances = Complex()

    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    args = {'algo': RandomForestTrain}

    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Stores the result of ``predict``.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        :param data: dict; the code reads the keys ``train_frame``, ``data``,
            ``current_features``, ``speaker_code_dict``, ``speakers`` and
            ``vectorizer``.
        :param kwargs: ``algo`` - the Task class used as the classifier.
        :returns: ``data`` unchanged.  One frame per script (columns
            ``line``, ``speaker_code``, ``speaker``) is stored on
            ``self.predictions``.
        """
        from preprocess import CHARACTERS

        algo = kwargs.get('algo')
        alg = algo()
        train_data = data['train_frame'].iloc[:, :-1]
        target = data['train_frame']['current_speaker']
        clf = alg.train(train_data, target, **algo.args)
        self.importances = clf.feature_importances_

        test_data = data['data']
        match_data = data['current_features']
        vectorizer = data['vectorizer']
        speakers_frame = data['speakers']
        speaker_code_dict = data['speaker_code_dict']
        reverse_speaker_code_dict = {
            speaker_code_dict[k]: k for k in speaker_code_dict
        }
        # list(): dict views cannot be indexed on Python 3.
        speaker_codes = list(reverse_speaker_code_dict.keys())
        speaker_list = [
            "\n".join(list(
                speakers_frame[speakers_frame['speaker'] ==
                               reverse_speaker_code_dict[code]]['line']))
            for code in speaker_codes
        ]
        speaker_features = vectorizer.batch_get_features(speaker_list)

        # Built locally and assigned once: per the note in Train.train,
        # mutating a field's value after assignment does not call __set__.
        predictions = []
        script_total = len(test_data['voice_script'])
        for counter, script in enumerate(test_data['voice_script'], 1):
            log.info("On script {0} out of {1}".format(counter, script_total))
            lines = script.split("\n")
            speaker_code = [-1] * len(lines)
            for (i, line) in enumerate(lines):
                # Context is reset every RESET_SCENE_EVERY lines.
                in_scene = i % RESET_SCENE_EVERY != 0
                if i > 0 and in_scene:
                    previous_line = lines[i - 1]
                    previous_speaker = speaker_code[i - 1]
                else:
                    previous_line = ""
                    previous_speaker = -1
                two_back_speaker = speaker_code[i - 2] if (i > 1 and in_scene) else -1
                next_line = lines[i + 1] if i < (len(lines) - 1) else ""

                prev_features = vectorizer.get_features(previous_line)
                cur_features = vectorizer.get_features(line)
                next_features = vectorizer.get_features(next_line)
                meta_features = make_df(
                    [[two_back_speaker], [previous_speaker]],
                    ["two_back_speaker", "previous_speaker"])
                train_frame = pd.concat([
                    pd.DataFrame(prev_features),
                    pd.DataFrame(cur_features),
                    pd.DataFrame(next_features),
                    meta_features
                ], axis=1)

                speaker_code[i] = alg.predict(train_frame)[0]

                # Override 1: the line is close to a speaker's whole text.
                nearest_match, distance = self.find_nearest_match(
                    cur_features, speaker_features)
                if distance < CHARACTER_DISTANCE_MIN:
                    speaker_code[i] = speaker_codes[nearest_match]
                    continue

                # Override 2: a known character string occurs in the previous line.
                for k in CHARACTERS:
                    for c in CHARACTERS[k]:
                        if c in previous_line:
                            speaker_code[i] = speaker_code_dict[k]

                # Override 3: the line is close to a known training line.
                nearest_match, distance = self.find_nearest_match(
                    cur_features, match_data)
                if distance < DISTANCE_MIN:
                    speaker_code[i] = speakers_frame['speaker_code'][nearest_match]

            df = make_df([
                lines, speaker_code,
                [reverse_speaker_code_dict[round(s)] for s in speaker_code]
            ], ["line", "speaker_code", "speaker"])
            predictions.append(df)

        self.predictions = predictions
        return data

    def find_nearest_match(self, features, matrix):
        """Return ``(index, distance)`` of the item of ``matrix`` closest to ``features``."""
        features = np.asarray(features)
        distances = [self.euclidean(u, features) for u in matrix]
        smallest = min(distances)
        return distances.index(smallest), smallest

    def euclidean(self, v1, v2):
        """Return the Euclidean distance between ``v1`` and ``v2``."""
        return np.sqrt(np.sum(np.square(np.subtract(v1, v2))))
class FeatureExtractor(Task):
    """Build the speaker-classification training frame from pickled script tasks."""
    data = Complex()
    row_data = List()
    speaker_code_dict = Dict()
    speaker_codes = List()
    vectorizer = Complex()

    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    args = {
        'scriptfile': os.path.abspath(os.path.join(settings.DATA_PATH, "script_tasks"))
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Stores the result of ``predict``.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        :param data: passed through unchanged under the ``data`` key.
        :param kwargs: ``scriptfile`` - path to a pickled object whose
            ``tasks[2].voice_lines.value`` holds the script segments.
        :returns: dict with keys ``vectorizer``, ``speaker_code_dict``,
            ``train_frame``, ``speakers``, ``data`` and ``current_features``.
        """
        scriptfile = kwargs.get('scriptfile')
        # SECURITY: unpickling runs arbitrary code; only load trusted files.
        # Binary mode plus ``with`` closes the handle the original leaked.
        with open(scriptfile, 'rb') as script_handle:
            script_data = pickle.load(script_handle)
        script = script_data.tasks[2].voice_lines.value

        speakers = []
        lines = []
        # Fresh list per call: appending to the field let repeated calls
        # accumulate rows that no longer lined up with ``speaker_codes``.
        rows = []
        for s in script:
            for (i, l) in enumerate(s):
                previous_line = s[i - 1]['line'] if i > 0 else ""
                previous_speaker = s[i - 1]['speaker'] if i > 0 else ""
                two_back_speaker = s[i - 2]['speaker'] if i > 1 else ""
                next_line = s[i + 1]['line'] if len(s) > i + 1 else ""
                current_line = l['line']
                current_speaker = l['speaker']
                lines.append(current_line)
                speakers.append(current_speaker)
                rows.append({
                    'previous_line': previous_line,
                    'previous_speaker': previous_speaker,
                    'next_line': next_line,
                    'current_line': current_line,
                    'current_speaker': current_speaker,
                    'two_back_speaker': two_back_speaker
                })
        self.row_data = rows

        speaker_code_dict = {
            k: i for (i, k) in enumerate(list(set(speakers)))
        }
        speaker_codes = [speaker_code_dict[s] for s in speakers]
        self.speaker_codes = speaker_codes
        # NOTE(review): KNNRF computes floor(MAX_FEATURES / 3); here the
        # floor is applied before dividing - confirm which is intended.
        self.max_features = math.floor(MAX_FEATURES) / 3

        # Fit locally, then assign, so the field's __set__ sees the fitted
        # object (same workaround as noted in Train.train).
        vectorizer = Vectorizer()
        vectorizer.fit(lines, speaker_codes, self.max_features)
        self.vectorizer = vectorizer

        prev_features = vectorizer.batch_get_features(
            [rd['previous_line'] for rd in rows])
        cur_features = vectorizer.batch_get_features(
            [rd['current_line'] for rd in rows])
        next_features = vectorizer.batch_get_features(
            [rd['next_line'] for rd in rows])

        # The empty speaker (no previous / two-back line) maps to -1.
        speaker_code_dict[''] = -1
        self.speaker_code_dict = speaker_code_dict

        meta_features = make_df(
            [[speaker_code_dict[rd['two_back_speaker']] for rd in rows],
             [speaker_code_dict[rd['previous_speaker']] for rd in rows],
             speaker_codes],
            ["two_back_speaker", "previous_speaker", "current_speaker"])
        train_frame = pd.concat([
            pd.DataFrame(prev_features),
            pd.DataFrame(cur_features),
            pd.DataFrame(next_features),
            meta_features
        ], axis=1)
        train_frame.index = range(train_frame.shape[0])

        return {
            'vectorizer': vectorizer,
            'speaker_code_dict': speaker_code_dict,
            'train_frame': train_frame,
            'speakers': make_df([speakers, speaker_codes, lines],
                                ["speaker", "speaker_code", "line"]),
            'data': data,
            'current_features': cur_features,
        }
class ClusterScriptText(Task):
    """Cluster speakers by their lines with KMeans and plot 2-D PCA projections."""
    data = Complex()
    predictions = Complex()
    # ``clusters`` was declared twice (Complex, then List); only the later
    # List declaration ever took effect, so it is the one kept.
    clusters = List()
    cl = Complex()
    vec = Complex()
    vec1 = Complex()

    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cluster simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Stores ``data`` and runs ``predict``.
        """
        self.data = data
        self.predict(self.data)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        Reformats the scripts without name replacement, clusters the
        speakers, and writes ``clusters.png`` (one series per cluster) and
        ``all_speakers.png`` (one point per line) to the working directory.
        """
        from train import Vectorizer, make_df

        reformatter = ReformatScriptText()
        # Copy before overriding: ``reformatter.args`` is the class-level
        # dict, and mutating it changed the default for every later user.
        args = dict(reformatter.args)
        args['do_replace'] = False
        reformatter.train(data, "", **args)

        script_segments = list(chain.from_iterable(reformatter.voice_lines))
        text = [s['line'] for s in script_segments]
        speaker = [s['speaker'] for s in script_segments]
        unique_speakers = list(set(speaker))
        speaker_code_dict = {k: i for (i, k) in enumerate(unique_speakers)}
        speaker_codes = [speaker_code_dict[k] for k in unique_speakers]

        speaker_frame = make_df([text, speaker], ["text", "speaker"])
        speaker_list = [
            "\n".join(list(speaker_frame[speaker_frame['speaker'] == name]['text']))
            for name in unique_speakers
        ]

        # Fit locally, then assign, so the field's __set__ sees the fitted
        # object (same workaround as noted in Train.train).
        vec = Vectorizer()
        vec.fit(speaker_list, speaker_codes, 200, min_features=2)
        self.vec = vec
        features = vec.batch_get_features(speaker_list)

        cl = KMeans()
        predictions = cl.fit_predict(features)
        self.predictions = predictions
        self.cl = cl

        # Labels run 0..max inclusive; the old ``range(max(...))`` dropped
        # the last cluster from both the list and the plot.
        n_clusters = max(predictions) + 1
        clusters = [
            [name for name, label in zip(unique_speakers, predictions) if label == i]
            for i in range(n_clusters)
        ]
        self.clusters = clusters

        pca = PCA(n_components=2, whiten=True).fit(features)
        rf = pca.transform(features)
        labels = cl.labels_
        pyplot.clf()
        pyplot.cla()
        for i in range(n_clusters):
            if not clusters[i]:
                # No speaker in this cluster, so no point or legend entry.
                continue
            ds = rf[np.where(labels == i)]
            pyplot.plot(ds[:, 0], ds[:, 1], 'o', label=clusters[i][0])
        pyplot.legend(loc=8)
        pyplot.savefig('clusters.png')

        vec1 = Vectorizer()
        line_codes = [speaker_code_dict[k] for k in speaker]
        vec1.fit(text, line_codes, 200, min_features=2)
        self.vec1 = vec1
        features = vec1.batch_get_features(text)
        pca = PCA(n_components=2, whiten=True).fit(features)
        rf = pca.transform(features)
        pyplot.clf()
        pyplot.cla()
        for i in range(len(line_codes)):
            pyplot.plot(rf[i, 0], rf[i, 1], 'o', label=speaker[i])
        pyplot.savefig('all_speakers.png')
class CrossValidate(Task):
    """K-fold cross validation that collects out-of-fold predictions and importances."""
    data = Complex()
    results = Complex()
    error = Float()
    importances = Complex()
    importance = Complex()
    column_names = List()

    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    args = {
        'nfolds': 3,
        'algo': RandomForestTrain,
        'target_name': 'label_code',
        'non_predictors': ["label", "line", "label_code"]
    }

    help_text = "Cross validate simpsons data."

    def cross_validate(self, data, **kwargs):
        """Train and predict once per fold.

        :param data: frame; its index is reset to 0..n-1 in place.
        :param kwargs: ``nfolds`` (default 3), ``algo`` (Task class),
            ``seed`` (default 1), ``target_name`` and ``non_predictors``.
        :returns: ``(results, folds)`` - the per-fold prediction arrays and
            the per-fold lists of row positions.
        """
        nfolds = kwargs.get('nfolds', 3)
        algo = kwargs.get('algo')
        seed = kwargs.get('seed', 1)
        self.target_name = kwargs.get('target_name')
        non_predictors = kwargs.get('non_predictors')

        self.column_names = [
            l for l in list(data.columns) if l not in non_predictors
        ]
        data_len = data.shape[0]
        fold_length = data_len // nfolds

        data_seq = list(range(data_len))
        random.seed(seed)
        random.shuffle(data_seq)

        folds = []
        for fold in range(nfolds):
            start = fold * fold_length
            # The last fold absorbs the remainder rows.
            end = data_len if fold == (nfolds - 1) else start + fold_length
            folds.append(data_seq[start:end])

        data.index = range(data.shape[0])
        results = []
        # Built locally and assigned once: per the note in Train.train,
        # mutating a field's value after assignment does not call __set__.
        importances = []
        for (i, fold) in enumerate(folds):
            predict_data = data.iloc[fold, :]
            out_indices = list(chain.from_iterable(folds[:i] + folds[(i + 1):]))
            train_data = data.iloc[out_indices, :]
            alg = algo()
            target = train_data[self.target_name]
            train_data = train_data[[
                l for l in list(train_data.columns) if l not in non_predictors
            ]]
            predict_data = predict_data[[
                l for l in list(predict_data.columns) if l not in non_predictors
            ]]
            clf = alg.train(train_data, target, **algo.args)
            results.append(alg.predict(predict_data))
            importances.append(clf.feature_importances_)
        self.importances = importances
        return results, folds

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Runs the cross validation and gathers
        its results; ``target`` is unused.
        """
        self.target_name = kwargs.get('target_name')
        results, folds = self.cross_validate(data, **kwargs)
        self.gather_results(results, folds, data)

    def gather_results(self, results, folds, data):
        """Re-order fold predictions to row order and join them onto ``data``."""
        full_results = list(chain.from_iterable(results))
        full_indices = list(chain.from_iterable(folds))
        partial_result_df = make_df([full_results, full_indices], ["result", "index"])
        partial_result_df = partial_result_df.sort(["index"])
        partial_result_df.index = range(partial_result_df.shape[0])
        result_df = pd.concat([partial_result_df, data], axis=1)
        self.results = result_df
        self.calc_importance(self.importances, self.column_names)

    def calc_error(self, result_df):
        """Store the mean absolute difference between ``result`` and the target column."""
        self.error = np.mean(
            np.abs(result_df['result'] - result_df[self.target_name]))

    def calc_importance(self, importances, col_names):
        """Average per-fold importances by column and store them sorted."""
        importance_frame = pd.DataFrame(importances)
        importance_frame.columns = col_names
        self.importance = importance_frame.mean(axis=0)
        self.importance.sort(0)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Does nothing.
        """
        pass
class GenerateMarkovTracks(Task):
    """Generate MIDI patterns, keep the best-rated ones, and save their features."""
    data = Complex()

    data_format = MusicFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process midi files."

    args = {
        'non_predictors': ["labels", "label_code", "fs", "enc", "fname", "Unnamed: 0"],
        'target_var': 'label_code',
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Stores the result of ``predict``.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        Trains a random forest on the two cached feature CSVs, runs two
        rounds of rate / select / remix / refill over 100 generated
        patterns, then renders each final pattern and writes its features to
        ``generated_midi_features.csv`` under ``settings.DATA_PATH``.

        :returns: ``data`` unchanged.
        """
        non_predictors = kwargs.get('non_predictors')
        target = kwargs.get('target_var')

        frame = pd.concat(
            [pd.read_csv(settings.MIDI_FEATURE_PATH), pd.read_csv(settings.FEATURE_PATH)],
            axis=0)
        frame.index = range(frame.shape[0])

        alg = RandomForestTrain()
        good_names = [col for col in frame.columns if col not in non_predictors]
        for col in good_names:
            frame[col] = frame[col].astype(float)
        for col in good_names:
            frame[col] = frame[col].real

        clf = alg.train(np.asarray(frame[good_names]), frame[target], **alg.args)

        evolutions = 2
        track_count = 100
        patterns_to_pick = int(math.floor(track_count / 4))
        remixes_to_make = int(math.floor(track_count / 4))

        patterns = generate_patterns(track_count, data)
        for _ in range(evolutions):
            new_quality, quality, patterns = rate_tracks(patterns, clf)
            # Keep the head of the list returned by rate_tracks, then remix
            # random pairs drawn from those survivors.
            patterns = patterns[0:patterns_to_pick]
            for _ in range(remixes_to_make):
                first = random.choice(patterns[:patterns_to_pick])
                second = random.choice(patterns[:patterns_to_pick])
                patterns.append(remix(first, second))

            # Drop duplicates while preserving order.
            unique_patterns = []
            for pattern in patterns:
                if pattern not in unique_patterns:
                    unique_patterns.append(pattern)
            patterns = unique_patterns
            # Top the population back up with fresh patterns.
            patterns += generate_patterns(track_count - len(patterns), data)

        new_quality, quality, patterns = rate_tracks(patterns, clf)

        feats = []
        for pattern in patterns:
            stamp = strftime("%m-%d-%Y-%H%M%S", gmtime())
            fname = stamp + random.choice(words) + ".mid"
            oggpath = write_and_convert(pattern, fname)
            dat, fs, enc = oggread(oggpath)
            feats.append(process_song(dat[:settings.MUSIC_TIME_LIMIT * fs, :], fs))

        feats = pd.DataFrame(feats)
        feats['label_code'] = [2] * feats.shape[0]
        feats['label'] = ["generated"] * feats.shape[0]
        feats.to_csv(os.path.abspath(os.path.join(settings.DATA_PATH, "generated_midi_features.csv")))

        return data
class Validate(Task):
    """K-fold cross validation of next-year win predictions."""
    data = Complex()
    results = Complex()
    error = Float()
    importances = Complex()
    importance = Complex()
    column_names = List()

    data_format = DataFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Validate."

    def cross_validate(self, data, non_predictors, **kwargs):
        """Train and predict ``next_year_wins`` once per fold.

        :param data: frame; its index is reset to 0..n-1 in place.
        :param non_predictors: column names excluded from the features.
        :param kwargs: ``nfolds`` (default 3), ``algo`` (Task class) and
            ``seed`` (default 1).
        :returns: ``(results, folds)`` - the per-fold prediction arrays and
            the per-fold lists of row positions.
        """
        nfolds = kwargs.get('nfolds', 3)
        algo = kwargs.get('algo')
        seed = kwargs.get('seed', 1)
        data_len = data.shape[0]
        fold_length = data_len // nfolds

        data_seq = list(range(data_len))
        random.seed(seed)
        random.shuffle(data_seq)

        folds = []
        for fold in range(nfolds):
            start = fold * fold_length
            # The last fold absorbs the remainder rows.
            end = data_len if fold == (nfolds - 1) else start + fold_length
            folds.append(data_seq[start:end])

        data.index = range(data.shape[0])
        results = []
        # Built locally and assigned once: per the note in Train.train,
        # mutating a field's value after assignment does not call __set__.
        importances = []
        for (i, fold) in enumerate(folds):
            predict_data = data.iloc[fold, :]
            out_indices = list(chain.from_iterable(folds[:i] + folds[(i + 1):]))
            train_data = data.iloc[out_indices, :]
            alg = algo()
            target = train_data['next_year_wins']
            train_data = train_data[[l for l in list(train_data.columns) if l not in non_predictors]]
            predict_data = predict_data[[l for l in list(predict_data.columns) if l not in non_predictors]]
            clf = alg.train(train_data, target, **algo.args)
            results.append(alg.predict(predict_data))
            importances.append(clf.feature_importances_)
        self.importances = importances
        return results, folds

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Excludes the team columns, ``team`` and
        ``next_year_wins`` from the features, then cross validates;
        ``target`` is unused.
        """
        non_predictors = [i.replace(" ", "_").lower() for i in list(set(data['team']))] + ["team", "next_year_wins"]
        self.column_names = [l for l in list(data.columns) if l not in non_predictors]
        results, folds = self.cross_validate(data, non_predictors, **kwargs)
        self.gather_results(results, folds, data)

    def gather_results(self, results, folds, data):
        """Join predictions to the outcome columns, filter, and compute error/importance."""
        full_results = list(chain.from_iterable(results))
        full_indices = list(chain.from_iterable(folds))
        partial_result_df = make_df([full_results, full_indices], ["result", "index"])
        partial_result_df = partial_result_df.sort(["index"])
        partial_result_df.index = range(partial_result_df.shape[0])
        result_df = pd.concat([partial_result_df, data[['next_year_wins', 'team', 'year', 'total_wins']]], axis=1)
        # Each comparison needs its own parentheses: ``&`` binds tighter than
        # ``>``, so the old form evaluated ``((a > 0) & total_wins) > 0`` and
        # dropped rows whose total_wins was even.
        result_df = result_df[(result_df['next_year_wins'] > 0) & (result_df['total_wins'] > 0)]
        self.results = result_df
        self.calc_error(result_df)
        self.calc_importance(self.importances, self.column_names)

    def calc_error(self, result_df):
        """Store the mean absolute error over all rows except the latest year."""
        filtered_df = result_df[result_df['year'] < np.max(result_df['year'])]
        self.error = np.mean(np.abs(filtered_df['result'] - filtered_df['next_year_wins']))

    def calc_importance(self, importances, col_names):
        """Average per-fold importances by column and store them sorted."""
        importance_frame = pd.DataFrame(importances)
        importance_frame.columns = col_names
        self.importance = importance_frame.mean(axis=0)
        self.importance.sort(0)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Does nothing.
        """
        pass
class LoadAudioFiles(Task):
    """Extract audio features per subtitle line and predict speaker labels."""
    data = Complex()
    all_files = List()
    seq = Complex()
    res = Complex()
    label_codes = Dict()

    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    args = {
        'audio_dir': settings.AUDIO_DIR,
        'timeout': 600,
        'only_labelled_lines': settings.ONLY_LABELLED_LINES,
        'processed_files_limit': settings.PROCESSED_FILES_LIMIT
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Stores the result of ``predict``.
        """
        self.data = self.predict(data, **kwargs)

    def extract_season(self, name):
        """Parse ``(season, episode)`` from ``[SxE]``/``[S.E]`` or ``SnnEnn`` in ``name``.

        :returns: two ints, or ``(None, None)`` when neither pattern matches.
        """
        for pattern in (r'\[(\d+)[x\.](\d+)\]', r'S(\d+)E(\d+)'):
            match = re.search(pattern, name)
            if match is not None:
                return int(match.group(1)), int(match.group(2))
        return None, None

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.

        :param data: subtitle frame; the code reads the ``season``,
            ``episode``, ``label``, ``start`` and ``end`` columns.
        :param kwargs: ``audio_dir``, ``timeout`` (seconds allowed per
            sample), ``only_labelled_lines`` and ``processed_files_limit``.
        :returns: frame of subtitle rows plus audio features, with
            ``label_code``, ``result_code`` and ``result_label`` columns.
        """
        audio_dir = kwargs['audio_dir']
        timeout = kwargs['timeout']
        oll = kwargs['only_labelled_lines']
        pff = kwargs['processed_files_limit']

        all_files = []
        for ad in os.listdir(audio_dir):
            ad_path = os.path.abspath(os.path.join(audio_dir, ad))
            if os.path.isdir(ad_path):
                all_files += [
                    os.path.abspath(os.path.join(ad_path, f))
                    for f in os.listdir(ad_path)
                ]
            else:
                all_files += [ad_path]
        ogg_files = [f for f in all_files if f.endswith(".ogg")]
        self.all_files = ogg_files

        frames = []
        counter = 0
        p = Pool(4, maxtasksperchild=50)
        try:
            for f in ogg_files:
                season, episode = self.extract_season(f)
                if season is None or (season == 11 and episode == 6):
                    continue
                subtitle_frame = data[((data['season'] == season) &
                                       (data['episode'] == episode))]
                if subtitle_frame.shape[0] == 0:
                    continue

                if oll:
                    label_frame = subtitle_frame[(subtitle_frame['label'] != "")]
                    if label_frame.shape[0] == 0:
                        continue
                # Stop early once the configured file limit is reached.
                if pff is not None and isinstance(pff, int) and counter >= pff:
                    break

                counter += 1
                log.info("On file {0} Season {1} Episode {2}".format(
                    counter, season, episode))
                f_data, fs, enc = oggread(f)
                subtitle_frame = subtitle_frame.sort('start')
                subtitle_frame.index = range(subtitle_frame.shape[0])

                samps = []
                good_rows = []
                for i in range(subtitle_frame.shape[0]):
                    start = subtitle_frame['start'].iloc[i]
                    end = subtitle_frame['end'].iloc[i]
                    if end - start > 6 or (subtitle_frame['label'][i] == '' and oll):
                        continue
                    # Slice bounds must be integers.
                    samp = f_data[int(start * fs):int(end * fs), :]
                    samps.append({'samp': samp, 'fs': fs})
                    good_rows.append(i)
                sf = subtitle_frame.iloc[good_rows]

                # One AsyncResult per sample.  With ``imap`` a timed-out item
                # was returned by the *next* ``next()`` call, so every later
                # result was attached to the wrong subtitle row.
                pending = [p.apply_async(process_subtitle, (s,)) for s in samps]
                results = []
                for job in pending:
                    try:
                        results.append(job.get(timeout=timeout))
                    except TimeoutError:
                        results.append(None)

                good_rows = [
                    i for (i, res) in enumerate(results) if res is not None
                ]
                audio_features = [res for res in results if res is not None]
                good_sf = sf.iloc[good_rows]
                good_sf.index = range(good_sf.shape[0])
                audio_frame = pd.DataFrame(audio_features)
                audio_frame.index = range(audio_frame.shape[0])
                df = pd.concat([good_sf, audio_frame], axis=1)
                df = df.fillna(-1)
                df.index = range(df.shape[0])
                frames.append(df)
                lab_df_shape = df[df['label'] != ''].shape[0]
                log.info("Processed {0} lines, {1} of which were labelled".format(
                    df.shape[0], lab_df_shape))
            p.close()
        except Exception:
            # Do not leave worker processes behind on failure.
            p.terminate()
            raise
        finally:
            p.join()

        log.info("Done processing episodes.")
        data = pd.concat(frames, axis=0)
        data.index = range(data.shape[0])

        for c in list(data.columns):
            data[c] = data[c].real

        # Collapse each alias listed in CHARACTERS onto its key.
        for k in CHARACTERS:
            for alias in CHARACTERS[k]:
                # .loc writes to the frame itself; the chained
                # ``data['label'][mask] = k`` form may assign to a copy.
                data.loc[data['label'] == alias, 'label'] = k

        label_codes = {k: i for (i, k) in enumerate(set(data['label']))}
        self.label_codes = label_codes
        reverse_label_codes = {label_codes[k]: k for k in label_codes}
        data['label_code'] = [label_codes[k] for k in data['label']]

        self.seq = SequentialValidate()

        # Do cv to get error estimates
        cv_frame = data[data['label'] != ""]
        self.seq.train(cv_frame, **self.seq.args)
        self.res = self.seq.results
        self.res = self.res[[
            'line', 'label', 'label_code', 'result_code', 'result_label'
        ]]

        exact_percent, adj_percent = compute_error(self.res)
        log.info("Exact match percent: {0}".format(exact_percent))
        log.info("Adjacent match percent: {0}".format(adj_percent))

        # Predict in the frame
        alg = RandomForestTrain()
        target = cv_frame['label_code']
        non_predictors = ["label", "line", "label_code"]
        train_names = [
            l for l in list(cv_frame.columns) if l not in non_predictors
        ]
        train_data = cv_frame[train_names]
        predict_data = data[train_names]
        clf = alg.train(train_data, target, **alg.args)
        data['result_code'] = alg.predict(predict_data)
        data['result_label'] = [
            reverse_label_codes[k] for k in data['result_code']
        ]
        return data