class BaseInput(FieldModel):
    """
    Common parent for all input readers.

    A subclass overrides ``read_input`` to fill ``self.data``; the stored
    value is then fetched with ``get_data``.
    """
    # Cached field. See fields.base
    data = Dict()

    # Registry bookkeeping.
    category = RegistryCategories.inputs
    namespace = get_namespace(__module__)

    input_format = "none"
    help_text = "Base class for input. Do not use directly."

    def __init__(self, **kwargs):
        super(BaseInput, self).__init__(**kwargs)

    def read_input(self, stream, **kwargs):
        """
        Read ``stream`` in this class's input format.

        The base implementation is a no-op that returns None; concrete
        input classes override it.
        """
        return None

    def get_data(self):
        """
        Return whatever is currently held in ``self.data``.
        Override if the stored value needs post-processing.
        """
        return self.data
class CleanupScriptList(Task):
    """
    Preprocessor that tidies the frame of scraped scripts: drops rows with
    an empty script and derives clean episode name / code columns.
    """
    data = Complex()

    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Training phase: clean ``data`` and cache the result on ``self.data``.
        ``target`` and ``kwargs`` are not used.
        """
        cleaned = self.predict(data)
        self.data = cleaned

    def predict(self, data, **kwargs):
        """
        Return ``data`` with blank scripts removed, ``episode_name`` reduced
        to its first line (stripped), ``episode_code`` taken from the last
        path component of ``url`` (text before ``.html``), and the index
        reset to 0..n-1.
        """
        script_removal_values = [""]
        for unwanted in script_removal_values:
            keep_mask = data["script"] != unwanted
            data = data[keep_mask]
        log.info(data)

        first_lines = []
        for raw_name in data['episode_name']:
            first_lines.append(raw_name.split('\n')[0].strip())
        data['episode_name'] = first_lines

        codes = []
        for link in data['url']:
            page = link.split('/')[-1]
            codes.append(page.split('.html')[0])
        data['episode_code'] = codes

        row_count = data.shape[0]
        data.index = range(row_count)
        return data
class EvolveMusic(Task):
    """
    Trains a random forest on the music feature frame, then repeatedly
    splices other songs into every "classical" track, logging the
    classifier's quality estimate as the track changes.
    """
    data = Complex()
    clf = Complex()
    importances = Complex()
    data_format = MusicFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    args = {
        'non_predictors' : ["labels","label_code","fs","enc","fname","Unnamed: 0"],
        'target_var' : 'label_code',
    }

    def train(self, data, target, **kwargs):
        """
        Fit the classifier and run the evolution loop.

        data   - feature frame; must contain 'fname' and 'labels' columns
                 plus the column named by kwargs['target_var'].
        target - ignored; the target column name is read from
                 kwargs['target_var'] instead.
        kwargs - 'non_predictors' (columns excluded from the feature matrix)
                 and 'target_var'.

        Side effects: stores the fitted model on ``self.clf`` and its feature
        importances on ``self.importances``, prints a CSV-style trace and
        writes audio through ``write_file``. Returns None.
        """
        non_predictors = kwargs.get('non_predictors')
        target = kwargs.get('target_var')

        data.index = range(data.shape[0])

        alg = RandomForestTrain()
        good_names = [i for i in data.columns if i not in non_predictors]
        for c in good_names:
            data[c] = data[c].astype(float)
        for c in good_names:
            data[c] = data[c].real

        clf = alg.train(np.asarray(data[good_names]), data[target], **alg.args)
        # The class declares ``clf`` and ``importances`` as cached fields;
        # previously the model only lived in locals and was lost on return.
        self.clf = clf
        self.importances = clf.feature_importances_

        # NOTE(review): the nesting below was reconstructed from a
        # whitespace-mangled source -- confirm against the original file.
        for i in xrange(0, data.shape[0]):
            fname = data['fname'][i]
            vec, fs, enc = read_sound(fname)
            label = data["labels"][i]
            if label == "classical":
                name = fname.split("/")[-1]
                feats = process_song(vec, fs)
                initial_quality = clf.predict_proba(feats)[0, 1]
                headers = "song_index,iteration,quality,distance,splice_song_index,splice_song"
                v2s = [headers, "{0},{1},{2},{3},{4},{5}".format(i, -1, initial_quality, 0, 0, "N/A")]
                print(headers)
                for z in xrange(0, 100):
                    # Every tenth iteration (including z == 0) pick a new
                    # splice song and record the current quality.
                    if z % 10 == 0:
                        v2ind = random.randint(0, data.shape[0] - 1)
                        v2fname = data['fname'][v2ind]
                        vec2, v2fs, v2enc = read_sound(v2fname)
                        feats = process_song(vec, fs)
                        quality = clf.predict_proba(feats)[0, 1]
                        nearest_match, min_dist = find_nearest_match(feats, data[good_names])
                        descriptor = "{0},{1},{2},{3},{4},{5}".format(i, z, quality, min_dist, v2ind, v2fname.split("/")[-1])
                        v2s.append(descriptor)
                        print(descriptor)
                        # Snapshot the track when it sits far from every known
                        # song and the classifier is confident either way.
                        if min_dist > .35 and (abs(quality - 0) <= .1 or abs(1 - quality) <= .1) and z != 0:
                            write_file(name, vec, fs, enc, v2s)
                    vec = alter(vec, vec2, fs, v2fs, clf)
                write_file(name, vec, fs, enc, v2s)
class ScriptFormatter(JSONFormat):
    """Formatter for script input, which arrives already parsed as JSON."""
    namespace = get_namespace(__module__)

    def from_script(self, input_data):
        """
        Pass script-format data through untouched; no conversion is needed
        because the input is already in the common JSON form.
        """
        already_json = input_data
        return already_json
class ProcessMusic(Task):
    """
    Extracts audio features for every downloaded song and caches the
    resulting frame as a CSV at ``settings.FEATURE_PATH``.
    """
    data = Complex()
    data_format = MusicFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    # NOTE(review): help text looks copy-pasted from the sports task.
    help_text = "Process sports events."

    def train(self, data, target, **kwargs):
        """
        Training phase: compute (or load) the feature frame and store it on
        ``self.data``. ``target`` is unused.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Return the feature frame.

        data - iterable of dicts with 'newpath' (audio file) and 'type'
               (label) keys.

        If ``settings.FEATURE_PATH`` already exists it is read and returned
        without touching ``data``. Otherwise each file is read and run
        through ``process_song``; files that fail either step are logged and
        skipped. The frame gains 'labels', 'fs', 'enc', 'fname' and
        'label_code' columns and is written to ``settings.FEATURE_PATH``.
        """
        if os.path.isfile(settings.FEATURE_PATH):
            return pd.read_csv(settings.FEATURE_PATH)

        d = []
        labels = []
        encs = []
        fss = []
        fnames = []
        for (z, p) in enumerate(data):
            log.info("On file {0}".format(z))
            try:
                # Use a distinct name: the original rebound ``data`` here,
                # shadowing the collection being iterated.
                samples, fs, enc = read_sound(p['newpath'])
            except Exception:
                # Previously skipped without a trace.
                log.exception("Could not read sound file")
                continue
            try:
                features = process_song(samples, fs)
            except Exception:
                log.exception("Could not get features")
                continue
            d.append(features)
            labels.append(p['type'])
            fss.append(fs)
            encs.append(enc)
            fnames.append(p['newpath'])

        frame = pd.DataFrame(d)
        frame['labels'] = labels
        frame['fs'] = fss
        frame['enc'] = encs
        frame['fname'] = fnames
        label_dict = {
            'classical' : 1,
            'electronic' : 0
        }
        frame['label_code'] = [label_dict[i] for i in frame['labels']]
        frame.to_csv(settings.FEATURE_PATH)
        return frame
class CleanupScriptText(Task):
    """
    Extracts spoken "Speaker: text" lines from each raw script and stores
    them in a new 'voice_script' column.
    """
    data = Complex()
    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Training phase: clean ``data`` and cache it on ``self.data``.
        ``target`` and ``kwargs`` are not used.
        """
        self.data = self.predict(data)

    def check_for_line_split(self, line):
        """
        Return ``check_if_character`` applied to the text before the first
        ':' in ``line`` (the whole line when there is no ':').
        """
        return check_if_character(line.split(":")[0])

    def predict(self, data, **kwargs):
        """
        Add a 'voice_script' column to ``data`` and return it.

        For every row the 'script' text is split into lines. Bracketed
        stage directions and lines starting with '-' are dropped. A line
        matching ``\\w+:`` starts a new voice line, a blank line ends the
        current one, and any other line is appended to it. Voice lines
        containing '{' or '=' are discarded.
        """
        # NOTE(review): nesting reconstructed from a whitespace-mangled
        # source -- confirm against the original file.
        voice_scripts = []
        for row in xrange(0, data.shape[0]):
            script_lines = data['script'][row].split('\n')
            voice_lines = []
            current_line = ""
            # The original reused ``i`` for this inner loop, shadowing the
            # row index; the inner index was never read.
            for line in script_lines:
                current_line = current_line.strip()
                line = line.strip()
                if line.startswith("[") and line.endswith("]"):
                    continue
                if line.startswith("-"):
                    continue
                # Raw string: '\w' in a plain literal is an invalid escape.
                voice_line = re.search(r'\w+:', line)
                if voice_line is not None:
                    if self.check_for_line_split(current_line):
                        voice_lines.append(current_line)
                    current_line = line
                # The original also tested line.startswith("-") here, which
                # can never be true after the ``continue`` above.
                elif len(line) == 0 and len(current_line) > 0:
                    if self.check_for_line_split(current_line):
                        voice_lines.append(current_line)
                    current_line = ""
                    voice_lines.append(" ")
                elif len(current_line) > 0:
                    current_line += " " + line
            # NOTE(review): a trailing ``current_line`` is not flushed once
            # the loop ends -- confirm this is intended.
            script_text = "\n".join([l for l in voice_lines if len(l) > 0 and "{" not in l and "=" not in l])
            script_text = re.sub(r"\[.+\]", "", script_text)
            voice_scripts.append(script_text.strip())
        data['voice_script'] = voice_scripts
        return data
class MusicInput(BaseInput):
    """
    Input that downloads the songs listed in a JSON link file and converts
    each download to Ogg Vorbis.
    """
    input_format = MusicFormats.mjson
    help_text = "Read in music links data."
    namespace = get_namespace(__module__)

    def read_input(self, mfile, has_header=True):
        """
        Download and convert every song listed in ``mfile``.

        mfile      - path to a JSON file holding a list of dicts with 'link'
                     (download URL) and 'ltype' (label) keys.
        has_header - unused; kept for interface compatibility.

        Files are stored under ``settings.MUSIC_PATH/<ltype>/``. Existing
        downloads and existing .ogg files are reused. Download failures are
        logged and the song is skipped. ``self.data`` is set to a list of
        dicts with 'type', 'path' and 'newpath' keys.
        """
        with open(mfile) as handle:
            mjson = json.load(handle)
        for m in mjson:
            # Drop any query string that was scraped into the label.
            m['ltype'] = m['ltype'].split("?")[0]

        ltypes = list(set([m['ltype'] for m in mjson]))
        for l in ltypes:
            jp = join_path(settings.MUSIC_PATH, l)
            if not os.path.isdir(jp):
                os.mkdir(jp)

        fpaths = []
        for m in mjson:
            fname = m['link'].split("/")[-1]
            fpath = join_path(join_path(settings.MUSIC_PATH, m['ltype']), fname)
            try:
                if not os.path.isfile(fpath):
                    r = requests.get(m['link'])
                    with open(fpath, 'wb') as f:
                        f.write(r.content)
                fpaths.append({'type': m['ltype'], 'path': fpath})
            except Exception:
                log.exception("Could not get music file.")

        for p in fpaths:
            # Assumes a three-character extension such as ".mp3".
            newfile = p['path'][:-4] + ".ogg"
            if not os.path.isfile(newfile):
                self._convert_to_ogg_file(p['path'], newfile)
            p['newpath'] = newfile
        self.data = fpaths

    def _convert_to_ogg_file(self, source, destination):
        """Pipe ``source`` through ``mpg123 | oggenc`` into ``destination``."""
        frommp3 = subprocess.Popen(['mpg123', '-w', '-', source], stdout=subprocess.PIPE)
        toogg = subprocess.Popen(['oggenc', '-'], stdin=frommp3.stdout, stdout=subprocess.PIPE)
        # The parent no longer needs its copy of the pipe; closing it lets
        # mpg123 receive SIGPIPE if oggenc exits early.
        frommp3.stdout.close()
        try:
            with open(destination, 'wb') as outfile:
                while True:
                    data = toogg.stdout.read(1024 * 100)
                    if not data:
                        break
                    outfile.write(data)
        finally:
            # Reap both children so they do not linger as zombies.
            toogg.stdout.close()
            toogg.wait()
            frommp3.wait()
class EventFormatter(JSONFormat):
    """Formatter for event input; both conversions are pass-throughs."""
    namespace = get_namespace(__module__)

    def from_events(self, input_data):
        """
        Accept events-format input and return it unchanged as the common
        format.
        """
        unchanged = input_data
        return unchanged

    def to_dataframe(self):
        """Return the stored data as-is for the dataframe output format."""
        stored = self.data
        return stored
class RandomForestTrain(Train):
    """
    Training algorithm wrapper around ``RandomForestClassifier``.
    """
    # Cached fields.
    colnames = List()
    clf = Complex()

    # Registry bookkeeping.
    category = RegistryCategories.algorithms
    namespace = get_namespace(__module__)

    # Estimator class and the keyword arguments callers pass to ``train``.
    algorithm = RandomForestClassifier
    args = dict(
        n_estimators=300,
        min_samples_leaf=4,
        compute_importances=True,
    )

    help_text = "Train and predict with Random Forest."
class BaseFormat(FieldModel):
    """
    Base class to reformat input data.

    Subclasses add ``from_<format>`` methods (input format -> common format)
    and ``to_<format>`` methods (common format -> output format); the
    available formats are discovered from those method names.
    """
    # These fields will be cached. See fields.base
    data = Dict()

    # Set the category and namespace for the registry
    category = RegistryCategories.formatters
    namespace = get_namespace(__module__)

    help_text = "Base class for reformatting input data. Do not use directly."

    def __init__(self, **kwargs):
        super(BaseFormat, self).__init__(**kwargs)
        self.input_formats = []
        self.output_formats = []
        self.setup_formats()

    def setup_formats(self):
        """
        Inspect this object's methods and record which formats it can
        convert from (``from_X``) and to (``to_X``).
        """
        for m in self.get_methods():
            # Strip only the leading prefix. The previous re.sub removed
            # every occurrence, so e.g. "to_auto_format" was registered as
            # "auformat" and the later getattr lookup failed.
            if m.startswith("from_"):
                self.input_formats.append(m[len("from_"):])
            elif m.startswith("to_"):
                self.output_formats.append(m[len("to_"):])

    def read_input(self, input_data, data_format):
        """
        Convert ``input_data`` to the common format and store it in
        ``self.data``.

        input_data  - the output from one of the input classes (ie CSVInput)
        data_format - the format of the data. See utils.input.dataformats

        Raises Exception if ``data_format`` has no ``from_`` method.
        """
        if data_format not in self.input_formats:
            raise Exception("Input format {0} not available with this class. Available formats are {1}.".format(data_format, self.input_formats))
        data_converter = getattr(self, "from_" + data_format)
        self.data = data_converter(input_data)

    def get_data(self, data_format):
        """
        Convert the stored common-format data to ``data_format`` and return
        it.

        data_format - the format of the output data. See
                      utils.input.dataformats

        Raises Exception if ``data_format`` has no ``to_`` method.
        """
        if data_format not in self.output_formats:
            raise Exception("Output format {0} not available with this class. Available formats are {1}.".format(data_format, self.output_formats))
        data_converter = getattr(self, "to_" + data_format)
        return data_converter()
class ReformatScriptText(Task):
    """
    Splits voice scripts into scenes (segments) of
    ``{'speaker': ..., 'line': ...}`` dicts and stores them on
    ``self.voice_lines``.
    """
    data = Complex()
    voice_lines = Complex()
    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    help_text = "Cleanup simpsons scripts."
    args = {'scriptfile' : os.path.abspath(os.path.join(settings.PROJECT_PATH, "data/raw_scripts2.json")), 'do_replace' : True}

    def train(self, data, target, **kwargs):
        """
        Training phase: keep ``data`` on ``self.data`` and build
        ``self.voice_lines`` from it. ``target`` is unused.
        """
        self.data = data
        self.predict(self.data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Build ``self.voice_lines`` from ``data['voice_script']`` plus the
        scripts stored in the JSON file ``kwargs['scriptfile']``.

        kwargs['do_replace'] - when true, speaker names are first passed
                               through ``find_replacement``.

        Each script is split on newlines; a blank line closes the current
        segment. Returns None.
        """
        voice_scripts = list(data['voice_script'])
        scriptfile = kwargs['scriptfile']
        do_replace = kwargs['do_replace']
        # Close the file deterministically instead of leaking the handle.
        with open(scriptfile) as handle:
            json_scripts = json.load(handle)
        voice_scripts += [s['script'] for s in json_scripts]

        # NOTE(review): nesting reconstructed from a whitespace-mangled
        # source -- confirm against the original file.
        script_segments = []
        for script in voice_scripts:
            script = script.replace("\"", "")
            lines = script.split("\n")
            segment = []
            for line in lines:
                if line.strip() != "":
                    line = line.encode('ascii', 'ignore')
                    line_split = line.split(":")
                    if do_replace:
                        line_split[0] = find_replacement(line_split[0].strip())
                    line_split[0] = cleanup_name(line_split[0].strip())
                    # Re-join so colons inside the spoken text survive.
                    segment.append({'speaker' : line_split[0], 'line' : ":".join(line_split[1:]).strip()})
                else:
                    if len(segment) > 0:
                        script_segments.append(segment)
                    segment = []
            # Flush the last segment of this script.
            if len(segment) > 0:
                script_segments.append(segment)
        self.voice_lines = script_segments
class ScriptInput(BaseInput):
    """
    Extends BaseInput to read scripts stored as a JSON file.
    """
    input_format = SimpsonsFormats.script
    help_text = "Reformat simpsons script data."
    namespace = get_namespace(__module__)

    def read_input(self, filename, has_header=True):
        """
        Load the JSON document at ``filename`` into ``self.data``.

        filename   - path to a JSON file.
        has_header - unused; kept for interface compatibility.
        """
        # ``with`` closes the handle even if parsing fails; the previous
        # version never closed it.
        with open(filename) as filestream:
            self.data = json.load(filestream)
class MidiInput(BaseInput):
    """
    Input that downloads the MIDI files listed in a JSON link file and
    converts each one to ogg.
    """
    input_format = MusicFormats.midijson
    help_text = "Read in music links data."
    namespace = get_namespace(__module__)

    def read_input(self, mfile, has_header=True):
        """
        Download and convert every file listed in ``mfile``.

        mfile      - path to a JSON file holding a list of dicts with 'link'
                     (download URL) and 'ltype' (label) keys.
        has_header - unused; kept for interface compatibility.

        Files are stored under ``settings.MIDI_MUSIC_PATH/<ltype>/``;
        existing downloads are reused. Download and conversion failures are
        logged and the file is skipped. ``self.data`` is set to the list of
        successfully converted entries, each a dict with 'type', 'path' and
        'newpath' keys.
        """
        # Close the link file deterministically instead of leaking it.
        with open(mfile) as handle:
            mjson = json.load(handle)

        ltypes = list(set([m['ltype'] for m in mjson]))
        for l in ltypes:
            jp = join_path(settings.MIDI_MUSIC_PATH, l)
            if not os.path.isdir(jp):
                os.mkdir(jp)

        fpaths = []
        for m in mjson:
            fname = m['link'].split("/")[-1]
            fpath = join_path(join_path(settings.MIDI_MUSIC_PATH, m['ltype']), fname)
            log.info(fpath)
            try:
                if not os.path.isfile(fpath):
                    r = requests.get(m['link'])
                    # ``with`` closes the file even if the write fails.
                    with open(fpath, 'wb') as f:
                        f.write(r.content)
                fpaths.append({'type': m['ltype'], 'path': fpath})
            except Exception:
                log.exception("Could not get music file.")

        npaths = []
        for p in fpaths:
            try:
                p['newpath'] = convert_to_ogg(p['path'])
                npaths.append(p)
            except Exception:
                log.exception("Could not convert to ogg")
        self.data = npaths
class PullDownComments(Task):
    """
    Fetches one comment per configured subreddit, generates a reply for it
    and appends new (comment, reply) pairs to the "items_done.p" cache.
    """
    data = Complex()
    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    help_text = "Pull down comments and store them."

    def train(self, data, **kwargs):
        """
        Pull a comment from each subreddit in
        ``settings.REPLY_SUBREDDIT_LIST`` and cache it with its reply.

        ``data`` and ``kwargs`` are not used. A comment is skipped when its
        text was already seen or when its (non-None) reply was already
        produced. All failures are logged, never raised. Returns None.
        """
        try:
            items_done = read_raw_data_from_cache(
                os.path.abspath(
                    os.path.join(settings.DATA_PATH, "items_done.p")))
            comments = [c['comment'] for c in items_done]
            replies = [c['reply'] for c in items_done]
            for subreddit in settings.REPLY_SUBREDDIT_LIST:
                try:
                    comment = get_single_comment(subreddit)
                    # Parenthesised form prints the same under Python 2 and
                    # also parses under Python 3.
                    print(comment)
                    if comment is None:
                        log.info("Could not get a comment")
                        continue
                    text = comment.body
                    cid = comment.id
                    reply = test_knn_matcher(knn_matcher, text)
                    if text in comments or (reply in replies and reply is not None):
                        continue
                    # Named ``item`` so the ``data`` parameter is not
                    # shadowed.
                    item = {'comment': text, 'reply': reply, 'comment_id': cid}
                    items_done.append(item)
                    replies.append(reply)
                    comments.append(text)
                    log.info("Subreddit: {0}".format(subreddit))
                    log.info("Comment: {0} {1}".format(cid, text))
                    log.info("Reply: {0}".format(reply))
                    log.info("-------------------")
                except Exception:
                    # Was a bare ``except:``, which also trapped
                    # KeyboardInterrupt and SystemExit.
                    log.exception("Cannot get reply for {0}".format(subreddit))
                    continue
            # NOTE(review): assumed to run once after the loop; the mangled
            # source does not show the nesting -- confirm.
            write_data_to_cache(items_done, "items_done.p", "comment_id")
        except Exception:
            log.exception("Could not pull down comment.")
class ProcessGames(Task):
    """
    Preprocessor for the baseball data. Currently it only walks the roster
    table by (year, team) and returns its input unchanged.
    """
    data = Complex()
    row_data = List()
    speaker_code_dict = Dict()
    speaker_codes = List()
    vectorizer = Complex()
    data_format = SportsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    help_text = "Process sports events."

    def train(self, data, target, **kwargs):
        """
        Training phase: run ``predict`` and cache its result on
        ``self.data``. ``target`` is unused.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Read the ``rosters`` table from ``settings.DB_PATH``, group it by
        distinct (year, team) pairs, and return ``data`` unchanged.
        """
        con = sqlite3.connect(settings.DB_PATH)
        try:
            rosters = sql.read_frame("select * from rosters", con)
        finally:
            # The connection was previously left open.
            con.close()

        # Distinct (year, team) pairs in first-seen order.
        tys = []
        for i in xrange(0, rosters.shape[0]):
            year = rosters.iloc[i]['year']
            team = rosters.iloc[i]['team']
            ty = [year, team]
            if ty not in tys:
                tys.append(ty)

        for ty in tys:
            year, team = ty
            ros = rosters[((rosters['year'] == year) & (rosters['team'] == team))]
            # NOTE(review): ``players`` is computed but never used; this
            # task looks unfinished.
            players = list(ros['id'])
        return data
class Task(FieldModel):
    """
    Base class for task
    """
    # Used by the registry
    category = RegistryCategories.base
    namespace = get_namespace(__module__)

    # Define dependencies to run before this (results are passed into class
    # before execution)
    dependencies = []
    trained_dependencies = []

    # Additional arguments to pass into train and predict functions
    # (additional data files, etc)
    args = {}

    # Data format accepted
    data_format = DataFormats.dataframe

    # Cached field
    data = Dict()

    help_text = "Base task class. Do not use directly."

    def __init__(self, **kwargs):
        super(Task, self).__init__(**kwargs)

    def train(self, data, **kwargs):
        """
        Used in the training phase. Override.
        """
        pass

    def predict(self, test_data, **kwargs):
        """
        Used in the predict phase, after training. Override
        """
        pass

    def get_data(self):
        """
        Return a dict mapping each name in ``self.stored_values`` to the
        current value of that attribute.
        """
        data_dict = {}
        for key in self.stored_values:
            data_dict[key] = getattr(self, key)
        # The dict was previously built and then discarded, so callers
        # always received None.
        return data_dict
class SubtitleFormatter(JSONFormat):
    """Formatter turning header-plus-rows subtitle data into dicts."""
    namespace = get_namespace(__module__)

    def from_subtitle(self, input_data):
        """
        Convert subtitle-format input to a list of dicts.

        The first item of ``input_data`` is the header row; every later row
        becomes a dict keyed by header. A row shorter than the header gets 0
        for the missing columns. Empty input yields an empty list.
        """
        reformatted_data = []
        rows = iter(input_data)
        try:
            headers = next(rows)
        except StopIteration:
            return reformatted_data

        for row in rows:
            record = {}
            for position, header in enumerate(headers):
                record[header] = row[position] if position < len(row) else 0
            reformatted_data.append(record)
        return reformatted_data
class GenerateTransitionMatrix(Task):
    """
    Reads every MIDI file, accumulates note and tempo data, and builds the
    matrices returned by ``generate_matrices``.
    """
    data = Complex()
    data_format = MusicFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    help_text = "Process midi files."

    def train(self, data, target, **kwargs):
        """
        Training phase: run ``predict`` and cache its result on
        ``self.data``. ``target`` is unused.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Process every MIDI file in ``data`` and return a summary dict.

        data - iterable of dicts with a 'path' key naming a MIDI file.

        Files that cannot be read or processed are logged and skipped.
        Returns a dict with keys 'files' (the input), 'notes', 'tempos',
        'nm' and 'tm' (outputs of ``generate_matrices``) and 'in' (the
        per-file instrument collections flattened into one list).
        """
        tempos = {'tick' : [], 'mpqn' : []}
        notes = {}
        all_instruments = []
        for (z, p) in enumerate(data):
            log.info("On file {0}".format(z))
            try:
                m = midi.read_midifile(p['path'])
            except Exception:
                # Previously skipped without a trace.
                log.exception("Could not read midi file")
                continue
            try:
                notes, tempos, instruments = process_midifile(m, notes, tempos)
                all_instruments.append(instruments)
            except Exception:
                log.exception("Could not get features")
                continue
        nm, tm = generate_matrices(notes, tempos)

        data = {'files' : data, 'notes' : notes, 'tempos' : tempos, 'nm' : nm, 'tm': tm, 'in' : list(chain.from_iterable(all_instruments))}
        return data
class GameInput(BaseInput):
    """
    Input that converts baseball event files to CSV with the Chadwick
    tools and loads rosters, games and events into SQLite.
    """
    input_format = SportsFormats.events
    help_text = "Read in baseball event data."
    namespace = get_namespace(__module__)

    def _year_from_csv(self, path):
        """Return the YYYY part of a ``<prefix>-YYYY.csv`` path."""
        return os.path.basename(path).split('-')[1].split('.')[0]

    def _run_chadwick(self, fold, out_prefix, year, template):
        """
        Run one Chadwick command from inside ``fold`` unless its output CSV
        (``<DATA_PATH>/<out_prefix>-<year>.csv``) already exists.
        """
        target = '{0}/{1}-{2}.csv'.format(settings.DATA_PATH, out_prefix, year)
        if os.path.isfile(target):
            return
        cmd = template.format(
            cp=settings.CHADWICK_PATH, dp=settings.DATA_PATH, y=year)
        # The command globs "{y}*.EV*" relative to the working directory.
        os.chdir(fold)
        subprocess.call(cmd, shell=True)

    def _load_year_csvs(self, paths):
        """Read each per-year CSV, tag it with a 'year' column, and concat."""
        frames = []
        for path in paths:
            with open(path) as filestream:
                df = pd.read_csv(filestream)
            df['year'] = [self._year_from_csv(path)] * df.shape[0]
            frames.append(df)
        return pd.concat(frames, axis=0)

    def read_input(self, directory, has_header=True):
        """
        Build the CSV files and SQLite tables from ``directory``.

        directory  - path whose sub-directories hold .EVN event files and
                     .ROS roster files.
        has_header - unused; kept for interface compatibility.

        For every year found, events/games/boxes CSVs are written to
        ``settings.DATA_PATH`` when missing. The 'rosters', 'games' and
        'events' tables are created in ``settings.DB_PATH`` when missing.
        Changes the process working directory as a side effect.
        """
        efolds = [
            join_path(directory, f) for f in os.listdir(directory)
            if os.path.isdir(os.path.join(directory, f))
        ]
        efiles = []
        for fold in efolds:
            files = [
                i for i in os.listdir(fold)
                if os.path.isfile(join_path(fold, i)) if i.endswith(".EVN")
            ]
            years = list(set([i[:4] for i in files]))
            for y in years:
                self._run_chadwick(
                    fold, 'events', y,
                    "{cp}cwevent -q -n -f 0-96 -x 0-62 -y {y} {y}*.EV* > {dp}/events-{y}.csv")
                # cwgame previously ran without a chdir, so it used whatever
                # directory the process happened to be in.
                self._run_chadwick(
                    fold, 'games', y,
                    "{cp}cwgame -q -n -f 0-83 -y {y} {y}*.EV* > {dp}/games-{y}.csv")
                self._run_chadwick(
                    fold, 'boxes', y,
                    "{cp}cwbox -q -X -y {y} {y}*.EV* > {dp}/boxes-{y}.csv")
            efiles += [
                join_path(fold, i) for i in os.listdir(fold)
                if os.path.isfile(join_path(fold, i))
            ]

        con = sqlite3.connect(settings.DB_PATH)
        try:
            c = con.cursor()
            if not table_exists(c, "rosters"):
                rfiles = [f for f in efiles if f.endswith(".ROS")]
                rosters = []
                for r in rfiles:
                    team, year = get_team_and_year(r)
                    with open(r) as filestream:
                        df = pd.read_csv(filestream, names=[
                            "id", "lastname", "firstname", "pbat", "sbat",
                            "team", "position"
                        ])
                    df['year'] = [year] * df.shape[0]
                    rosters.append(df)
                roster = pd.concat(rosters, axis=0)
                sql.write_frame(roster, name='rosters', con=con)

            game_files = [
                join_path(settings.DATA_PATH, g)
                for g in os.listdir(settings.DATA_PATH)
                if g.startswith('games-')
            ]
            event_files = [
                join_path(settings.DATA_PATH, e)
                for e in os.listdir(settings.DATA_PATH)
                if e.startswith('events-')
            ]
            if not table_exists(c, "games"):
                # The year now comes from each games file itself. It used to
                # be parsed from ``e``, a variable leaked from the
                # event_files comprehension, and split('\.') never split, so
                # the value kept its ".csv" suffix.
                games = self._load_year_csvs(game_files)
                sql.write_frame(games, name='games', con=con)
            if not table_exists(c, "events"):
                events = self._load_year_csvs(event_files)
                sql.write_frame(events, name='events', con=con)
        finally:
            # The connection was previously left open.
            con.close()

        # NOTE(review): 'roster' differs from the table name 'rosters' --
        # confirm whether consumers rely on this value.
        self.data = {
            'rosters': 'roster',
            'games': 'games',
            'events': 'events',
        }
class KNNRF(Task):
    # Labels each line of each test script with a speaker code by combining
    # a trained classifier with nearest-neighbour lookups and a character
    # name check.
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-mangled source -- confirm against the original file.
    data = Complex()
    predictions = Complex()
    importances = Complex()
    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    args = {'algo': RandomForestTrain}
    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Training phase: run ``predict`` and cache its return value on
        ``self.data``. ``target`` is unused.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Train ``kwargs['algo']`` on ``data['train_frame']`` and label every
        line of every script in ``data['data']['voice_script']``.

        Reads these keys of ``data``: 'train_frame', 'data',
        'current_features', 'speaker_code_dict', 'speakers', 'vectorizer'.

        Side effects: sets ``self.importances`` to the classifier's feature
        importances and ``self.predictions`` to a list with one frame per
        script (columns "line", "speaker_code", "speaker").

        Returns ``data`` unchanged.
        """
        from preprocess import CHARACTERS
        # NOTE(review): vec_length is never used below.
        vec_length = math.floor(MAX_FEATURES / 3)
        algo = kwargs.get('algo')
        alg = algo()
        # Every column except the last is a predictor.
        train_data = data['train_frame'].iloc[:, :-1]
        target = data['train_frame']['current_speaker']
        clf = alg.train(train_data, target, **algo.args)
        self.importances = clf.feature_importances_
        test_data = data['data']
        match_data = data['current_features']
        # code -> speaker name
        reverse_speaker_code_dict = {
            data['speaker_code_dict'][k]: k
            for k in data['speaker_code_dict']
        }
        speaker_list = []
        # Indexed below, so this relies on keys() returning a list
        # (Python 2).
        speaker_codes = reverse_speaker_code_dict.keys()
        # One document per speaker: all of that speaker's lines joined.
        for i in xrange(0, len(speaker_codes)):
            s_text = "\n".join(
                list(data['speakers'][data['speakers']['speaker'] ==
                                      reverse_speaker_code_dict[
                                          speaker_codes[i]]]['line']))
            speaker_list.append(s_text)
        speaker_features = data['vectorizer'].batch_get_features(speaker_list)
        self.predictions = []
        counter = 0
        for script in test_data['voice_script']:
            counter += 1
            log.info("On script {0} out of {1}".format(
                counter, len(test_data['voice_script'])))
            lines = script.split("\n")
            speaker_code = [-1 for i in xrange(0, len(lines))]
            for (i, line) in enumerate(lines):
                # Context is reset every RESET_SCENE_EVERY lines: no
                # previous line or speaker is carried across that boundary.
                if i > 0 and i % RESET_SCENE_EVERY != 0:
                    previous_line = lines[i - 1]
                    previous_speaker = speaker_code[i - 1]
                else:
                    previous_line = ""
                    previous_speaker = -1
                if i > 1 and i % RESET_SCENE_EVERY != 0:
                    two_back_speaker = speaker_code[i - 2]
                else:
                    two_back_speaker = -1
                if i < (len(lines) - 1):
                    next_line = lines[i + 1]
                else:
                    next_line = ""
                prev_features = data['vectorizer'].get_features(previous_line)
                cur_features = data['vectorizer'].get_features(line)
                next_features = data['vectorizer'].get_features(next_line)
                meta_features = make_df(
                    [[two_back_speaker], [previous_speaker]],
                    ["two_back_speaker", "previous_speaker"])
                #meta_features = make_df([[two_back_speaker]],["two_back_speaker"])
                train_frame = pd.concat([
                    pd.DataFrame(prev_features),
                    pd.DataFrame(cur_features),
                    pd.DataFrame(next_features), meta_features
                ], axis=1)
                # 1) Classifier prediction is the default label.
                speaker_code[i] = alg.predict(train_frame)[0]
                # 2) Overridden when the line is close enough to one
                #    speaker's combined text.
                nearest_match, distance = self.find_nearest_match(
                    cur_features, speaker_features)
                if distance < CHARACTER_DISTANCE_MIN:
                    sc = speaker_codes[nearest_match]
                    speaker_code[i] = sc
                    continue
                # 3) Overridden when a CHARACTERS entry occurs in the
                #    previous line; the last matching key wins.
                for k in CHARACTERS:
                    for c in CHARACTERS[k]:
                        if c in previous_line:
                            speaker_code[i] = data['speaker_code_dict'][k]
                # 4) Overridden when the line is close enough to a single
                #    known line.
                nearest_match, distance = self.find_nearest_match(
                    cur_features, match_data)
                if distance < DISTANCE_MIN:
                    sc = data['speakers']['speaker_code'][nearest_match]
                    speaker_code[i] = sc
                    continue
            df = make_df([
                lines, speaker_code,
                [reverse_speaker_code_dict[round(s)] for s in speaker_code]
            ], ["line", "speaker_code", "speaker"])
            self.predictions.append(df)
        return data

    def find_nearest_match(self, features, matrix):
        """
        Return ``(index, distance)`` of the item of ``matrix`` with the
        smallest Euclidean distance to ``features``.
        """
        features = np.asarray(features)
        distances = [self.euclidean(u, features) for u in matrix]
        nearest_match = distances.index(min(distances))
        return nearest_match, min(distances)

    def euclidean(self, v1, v2):
        """Return the Euclidean distance between ``v1`` and ``v2``."""
        return np.sqrt(np.sum(np.square(np.subtract(v1, v2))))
class FeatureExtractor(Task):
    """
    Builds the training frame for speaker prediction: text features of the
    previous, current and next line plus speaker-code context columns.
    """
    data = Complex()
    row_data = List()
    speaker_code_dict = Dict()
    speaker_codes = List()
    vectorizer = Complex()
    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    help_text = "Cleanup simpsons scripts."
    args = {
        'scriptfile':
        os.path.abspath(os.path.join(settings.DATA_PATH, "script_tasks"))
    }

    def train(self, data, target, **kwargs):
        """
        Training phase: run ``predict`` and cache its result on
        ``self.data``. ``target`` is unused.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Build features from the pickled task results in
        ``kwargs['scriptfile']``.

        Returns a dict with keys 'vectorizer', 'speaker_code_dict',
        'train_frame', 'speakers', 'data' (the ``data`` argument, passed
        through) and 'current_features'.

        Side effects: appends to ``self.row_data`` and sets
        ``self.speaker_code_dict``, ``self.speaker_codes``,
        ``self.max_features`` and ``self.vectorizer``.
        """
        # NOTE(review): block nesting was reconstructed from a
        # whitespace-mangled source -- confirm against the original file.
        scriptfile = kwargs.get('scriptfile')
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load files produced by this project. The handle is now closed
        # deterministically instead of being leaked.
        with open(scriptfile) as handle:
            script_data = pickle.load(handle)
        script = script_data.tasks[2].voice_lines.value
        speakers = []
        lines = []
        for s in script:
            for (i, l) in enumerate(s):
                if i > 0:
                    previous_line = s[i - 1]['line']
                    previous_speaker = s[i - 1]['speaker']
                else:
                    previous_line = ""
                    previous_speaker = ""
                if i > 1:
                    two_back_speaker = s[i - 2]['speaker']
                else:
                    two_back_speaker = ""
                if len(s) > i + 1:
                    next_line = s[i + 1]['line']
                else:
                    next_line = ""
                current_line = s[i]['line']
                current_speaker = s[i]['speaker']
                lines.append(current_line)
                speakers.append(current_speaker)
                row_data = {
                    'previous_line': previous_line,
                    'previous_speaker': previous_speaker,
                    'next_line': next_line,
                    'current_line': current_line,
                    'current_speaker': current_speaker,
                    'two_back_speaker': two_back_speaker
                }
                # NOTE(review): self.row_data is never reset here, so rows
                # may accumulate across calls -- confirm.
                self.row_data.append(row_data)
        self.speaker_code_dict = {
            k: i
            for (i, k) in enumerate(list(set(speakers)))
        }
        self.speaker_codes = [self.speaker_code_dict[s] for s in speakers]
        self.max_features = math.floor(MAX_FEATURES) / 3
        self.vectorizer = Vectorizer()
        self.vectorizer.fit(lines, self.speaker_codes, self.max_features)
        prev_features = self.vectorizer.batch_get_features(
            [rd['previous_line'] for rd in self.row_data])
        cur_features = self.vectorizer.batch_get_features(
            [rd['current_line'] for rd in self.row_data])
        next_features = self.vectorizer.batch_get_features(
            [rd['next_line'] for rd in self.row_data])
        # "" marks "no previous speaker"; it is encoded as -1.
        self.speaker_code_dict.update({'': -1})
        meta_features = make_df(
            [[
                self.speaker_code_dict[s['two_back_speaker']]
                for s in self.row_data
            ], [
                self.speaker_code_dict[s['previous_speaker']]
                for s in self.row_data
            ], self.speaker_codes],
            ["two_back_speaker", "previous_speaker", "current_speaker"])
        #meta_features = make_df([[self.speaker_code_dict[s['two_back_speaker']] for s in self.row_data], self.speaker_codes],["two_back_speaker", "current_speaker"])
        train_frame = pd.concat([
            pd.DataFrame(prev_features),
            pd.DataFrame(cur_features),
            pd.DataFrame(next_features), meta_features
        ], axis=1)
        train_frame.index = range(train_frame.shape[0])
        data = {
            'vectorizer': self.vectorizer,
            'speaker_code_dict': self.speaker_code_dict,
            'train_frame': train_frame,
            'speakers': make_df([speakers, self.speaker_codes, lines],
                                ["speaker", "speaker_code", "line"]),
            'data': data,
            'current_features': cur_features,
        }
        return data
class GenerateMarkovTracks(Task):
    # Generates candidate tracks, rates them with a random forest trained
    # on the cached feature CSVs, remixes the top-rated ones, and writes
    # features of the final tracks to "generated_midi_features.csv".
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-mangled source -- confirm against the original file.
    data = Complex()
    data_format = MusicFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    help_text = "Process midi files."
    args = {
        'non_predictors' : ["labels","label_code","fs","enc","fname","Unnamed: 0"],
        'target_var' : 'label_code',
    }

    def train(self, data, target, **kwargs):
        """
        Training phase: run ``predict`` and cache its return value on
        ``self.data``. ``target`` is unused.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Evolve generated tracks and save their features.

        data   - passed to ``generate_patterns`` and returned unchanged.
        kwargs - 'non_predictors' (columns excluded from the feature
                 matrix) and 'target_var' (name of the target column).

        Side effects: writes one file per final pattern through
        ``write_and_convert`` and writes "generated_midi_features.csv"
        under ``settings.DATA_PATH``.
        """
        # Training set: MIDI-derived features stacked on audio features.
        frame1 = pd.read_csv(settings.MIDI_FEATURE_PATH)
        frame2 = pd.read_csv(settings.FEATURE_PATH)
        frame = pd.concat([frame1,frame2],axis=0)
        non_predictors = kwargs.get('non_predictors')
        target = kwargs.get('target_var')
        frame.index = range(frame.shape[0])
        alg = RandomForestTrain()
        good_names = [i for i in frame.columns if i not in non_predictors]
        for c in good_names:
            frame[c] = frame[c].astype(float)
        for c in good_names:
            frame[c] = frame[c].real
        clf = alg.train(np.asarray(frame[good_names]),frame[target],**alg.args)
        evolutions = 2
        track_count = 100
        patterns_to_pick = int(math.floor(track_count/4))
        remixes_to_make = int(math.floor(track_count/4))
        # Only referenced by the commented-out loop below.
        additions_to_make = int(math.floor(track_count/4))
        patterns = generate_patterns(track_count,data)
        for z in xrange(0,evolutions):
            # presumably rate_tracks returns patterns ordered best-first,
            # so the slice keeps the top quarter -- TODO confirm
            new_quality, quality, patterns = rate_tracks(patterns, clf)
            patterns = patterns[0:patterns_to_pick]
            # Each remix combines two random survivors.
            for i in xrange(0,remixes_to_make):
                patterns.append(remix(random.choice(patterns[:patterns_to_pick]), random.choice(patterns[:patterns_to_pick])))
            #for i in xrange(0,additions_to_make):
            #    patterns.append(add_song(random.choice(patterns[:patterns_to_pick]), random.choice(patterns[:patterns_to_pick])))
            # Order-preserving de-duplication.
            new_patterns = []
            for p in patterns:
                if p not in new_patterns:
                    new_patterns.append(p)
            patterns = new_patterns
            # Top the population back up to track_count.
            patterns += generate_patterns(track_count - len(patterns), data)
        new_quality, quality, patterns = rate_tracks(patterns, clf)
        feats = []
        for (i,p) in enumerate(patterns):
            # File name: UTC timestamp plus a random word.
            time = strftime("%m-%d-%Y-%H%M%S", gmtime())
            fname = time+random.choice(words)+".mid"
            oggpath = write_and_convert(p,fname)
            dat, fs, enc = oggread(oggpath)
            # Only the first MUSIC_TIME_LIMIT * fs rows are analysed.
            f = process_song(dat[:settings.MUSIC_TIME_LIMIT * fs,:],fs)
            feats.append(f)
        feats = pd.DataFrame(feats)
        feats['label_code'] = [2] * feats.shape[0]
        feats['label'] = ["generated"] * feats.shape[0]
        feats.to_csv(os.path.abspath(os.path.join(settings.DATA_PATH,"generated_midi_features.csv")))
        return data
class SubtitleInput(BaseInput):
    """
    Extends BaseInput to read .sub and .srt subtitle files from a directory.
    """
    input_format = SimpsonsFormats.subtitle
    help_text = "Reformat simpsons script data."
    namespace = get_namespace(__module__)

    def get_episode_metadata(self, name):
        """
        Parse the ``[season.episode]`` tag in a file name.

        Returns ``(season, episode)`` as ints. Raises AttributeError when
        ``name`` has no such tag, because the regex match is None.
        """
        episode_code = re.search(r"\[\d+\.\d+\]", name).group(0).replace("[", "").replace("]", "")
        season, episode = episode_code.split(".")
        return int(season), int(episode)

    def read_input(self, directory, has_header=True):
        """
        Read every .sub and .srt file in ``directory``.

        has_header - unused; kept for interface compatibility.

        ``self.data`` becomes a list of rows whose first row is the header
        ["start","end","line","label","season","episode"]. .sub rows come
        first, then .srt rows.
        """
        all_sub_data = []

        # .sub rows look like "{start}{end}text{label}"; start and end are
        # divided by 24 (presumably frames at 24 fps -> seconds; confirm).
        sub_datafiles = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f)) and f.endswith(".sub")]
        for infile in sub_datafiles:
            season, episode = self.get_episode_metadata(infile)
            # ``with`` closes the file; the handle was previously leaked.
            with open(os.path.join(directory, infile)) as stream:
                data = stream.read()
            row_data = []
            for row in data.split("\n"):
                row = row.replace('\r', '')
                row_split = row.split("}")
                if len(row_split) > 2:
                    start = float(row_split[0].replace("{", "")) / 24
                    end = float(row_split[1].replace("{", "")) / 24
                    line = row_split[2].split("{")[0]
                    if len(row_split[2].split("{")) > 1:
                        label = row_split[2].split("{")[1].replace("}", "")
                    else:
                        label = ""
                    row_data.append([start, end, line, label, season, episode])
            all_sub_data.append(row_data)

        # .srt blocks are separated by blank lines (CRLF line endings).
        srt_datafiles = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f)) and f.endswith(".srt")]
        for infile in srt_datafiles:
            season, episode = self.get_episode_metadata(infile)
            with open(os.path.join(directory, infile)) as stream:
                data = stream.read()
            row_data = []
            for row in data.split("\r\n\r\n"):
                row_split = row.split("\r\n")
                # NOTE(review): ">3" skips blocks with a single text line
                # (index, timing, one line) -- confirm this is intended.
                if len(row_split) > 3:
                    timing = row_split[1]
                    start_timing = timing.split("-->")[0].replace(",", ".").split(":")
                    start = float(start_timing[-1]) + float(start_timing[-2]) * 60 + float(start_timing[-3]) * 3600
                    end_timing = timing.split("-->")[1].replace(",", ".").split(":")
                    end = float(end_timing[-1]) + float(end_timing[-2]) * 60 + float(end_timing[-3]) * 3600
                    line = " ".join(row_split[2:])
                    if len(line.split("{")) > 1:
                        label = line.split("{")[1].replace("}", "")
                        line = line.split("{")[0]
                    else:
                        label = ""
                    row_data.append([start, end, line, label, season, episode])
            all_sub_data.append(row_data)

        sub_data = [["start", "end", "line", "label", "season", "episode"]] + list(chain.from_iterable(all_sub_data))
        self.data = sub_data
class CrossValidate(Task):
    """
    Runs k-fold cross validation of an algorithm over a dataframe and
    collects the predictions and per-fold feature importances.
    """
    data = Complex()
    results = Complex()
    error = Float()
    importances = Complex()
    importance = Complex()
    column_names = List()
    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    args = {
        'nfolds': 3,
        'algo': RandomForestTrain,
        'target_name': 'label_code',
        'non_predictors': ["label", "line", "label_code"]
    }
    help_text = "Cross validate simpsons data."

    def cross_validate(self, data, **kwargs):
        """
        Split the rows of data into shuffled folds, train on all but one fold
        and predict the held-out fold, for every fold.
        data - dataframe; its index is reset to 0..n-1 in place
        kwargs - nfolds (default 3), algo (algorithm class), seed (default 1),
        target_name (target column), non_predictors (columns that are not features)
        Returns (results, folds): one prediction result per fold and the row positions in each fold.
        """
        nfolds = kwargs.get('nfolds', 3)
        algo = kwargs.get('algo')
        seed = kwargs.get('seed', 1)
        self.target_name = kwargs.get('target_name')
        non_predictors = kwargs.get('non_predictors')

        #Feature columns; row selection below never changes the set of columns
        predictor_names = [
            name for name in list(data.columns) if name not in non_predictors
        ]
        self.column_names = predictor_names

        row_count = data.shape[0]
        fold_length = int(math.floor(row_count / nfolds))

        #Seeded shuffle of the row positions keeps the folds reproducible
        shuffled_rows = list(range(0, row_count))
        random.seed(seed)
        random.shuffle(shuffled_rows)

        folds = []
        for fold_number in range(0, nfolds):
            first = fold_number * fold_length
            if fold_number == (nfolds - 1):
                #The last fold absorbs the rows left over by the floor division
                last = row_count
            else:
                last = first + fold_length
            folds.append(shuffled_rows[first:last])

        results = []
        data.index = range(data.shape[0])
        self.importances = []
        for held_out, fold in enumerate(folds):
            predict_data = data.iloc[fold, :]
            train_rows = [
                row
                for position, other_fold in enumerate(folds)
                if position != held_out
                for row in other_fold
            ]
            train_data = data.iloc[train_rows, :]
            alg = algo()
            target = train_data[self.target_name]
            train_data = train_data[predictor_names]
            predict_data = predict_data[predictor_names]
            #Training arguments come from the algorithm class, not from kwargs
            clf = alg.train(train_data, target, **algo.args)
            results.append(alg.predict(predict_data))
            self.importances.append(clf.feature_importances_)
        return results, folds

    def train(self, data, target, **kwargs):
        """
        Used in the training phase. Cross validates data and stores the gathered results.
        """
        self.target_name = kwargs.get('target_name')
        fold_results, folds = self.cross_validate(data, **kwargs)
        self.gather_results(fold_results, folds, data)

    def gather_results(self, results, folds, data):
        """
        Line the per-fold predictions back up with the rows of data, store the
        combined frame in self.results and compute the mean feature importances.
        """
        predictions = list(chain.from_iterable(results))
        positions = list(chain.from_iterable(folds))
        ordered = make_df([predictions, positions], ["result", "index"])
        #NOTE: DataFrame.sort is the legacy pandas API (sort_values in newer releases)
        ordered = ordered.sort(["index"])
        ordered.index = range(ordered.shape[0])
        self.results = pd.concat([ordered, data], axis=1)
        self.calc_importance(self.importances, self.column_names)

    def calc_error(self, result_df):
        """
        Store the mean absolute difference between the result and target columns in self.error.
        """
        difference = result_df['result'] - result_df[self.target_name]
        self.error = np.mean(np.abs(difference))

    def calc_importance(self, importances, col_names):
        """
        Average the per-fold importances by column and store them, sorted, in self.importance.
        """
        frame = pd.DataFrame(importances)
        frame.columns = col_names
        self.importance = frame.mean(axis=0)
        #NOTE: legacy in-place Series.sort
        self.importance.sort(0)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training. Does nothing for this task.
        """
        return None
class ClusterScriptText(Task):
    """
    Clusters speakers by the text of their lines and plots the clusters.
    """
    data = Complex()
    predictions = Complex()
    #clusters was declared twice (Complex, then List); only the List binding took effect
    clusters = List()
    cl = Complex()
    vec = Complex()
    vec1 = Complex()
    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    help_text = "Cluster simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase. Stores data and runs the clustering on it.
        """
        self.data = data
        self.predict(self.data)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.
        Reformats the script data into per-speaker lines, vectorizes the combined
        text of each speaker, clusters the speakers with KMeans and saves two
        plots: 'clusters.png' (speakers by cluster) and 'all_speakers.png'
        (every individual line).
        Sets self.vec, self.predictions, self.cl, self.clusters and self.vec1.
        """
        from train import Vectorizer, make_df
        self.vec = Vectorizer()
        reformatter = ReformatScriptText()
        #Copy the args so the shared dict on the reformatter is not modified
        args = dict(reformatter.args)
        args['do_replace'] = False
        reformatter.train(data, "", **args)

        script_segments = list(chain.from_iterable(reformatter.voice_lines))
        text = [s['line'] for s in script_segments]
        speaker = [s['speaker'] for s in script_segments]
        unique_speakers = list(set(speaker))
        speaker_code_dict = {k: i for (i, k) in enumerate(unique_speakers)}
        speaker_codes = [speaker_code_dict[k] for k in unique_speakers]

        #One document per speaker: all of that speaker's lines joined together
        speaker_frame = make_df([text, speaker], ["text", "speaker"])
        speaker_list = [
            "\n".join(list(speaker_frame[speaker_frame['speaker'] == name]['text']))
            for name in unique_speakers
        ]

        self.vec.fit(speaker_list, speaker_codes, 200, min_features=2)
        features = self.vec.batch_get_features(speaker_list)
        cl = KMeans()
        self.predictions = cl.fit_predict(features)
        self.cl = cl

        #Cluster labels run from 0 to max inclusive, so max + 1 clusters must be collected.
        #The list is rebuilt on every call instead of appended to, so repeated calls
        #do not pile up stale clusters.
        clusters = []
        for i in range(max(self.predictions) + 1):
            clusters.append([
                unique_speakers[c] for c in range(len(speaker_codes))
                if self.predictions[c] == i
            ])
        self.clusters = clusters

        pca = PCA(n_components=2, whiten=True).fit(features)
        rf = pca.transform(features)
        labels = cl.labels_
        pyplot.clf()
        pyplot.cla()
        for i in range(max(labels) + 1):
            members = clusters[i]
            if not members:
                #Nothing to plot or to name the legend entry after
                continue
            ds = rf[np.where(labels == i)]
            #The legend entry is the first speaker of the cluster
            pyplot.plot(ds[:, 0], ds[:, 1], 'o', label=members[0])
        pyplot.legend(loc=8)
        pyplot.savefig('clusters.png')

        #Second pass: vectorize and plot every line on its own
        self.vec1 = Vectorizer()
        speaker_codes = [speaker_code_dict[k] for k in speaker]
        self.vec1.fit(text, speaker_codes, 200, min_features=2)
        features = self.vec1.batch_get_features(text)
        pca = PCA(n_components=2, whiten=True).fit(features)
        rf = pca.transform(features)
        pyplot.clf()
        pyplot.cla()
        for i, name in enumerate(speaker):
            pyplot.plot(rf[i, 0], rf[i, 1], 'o', label=name)
        pyplot.savefig('all_speakers.png')
class BaseWorkflow(object):
    """
    Base workflow class
    """
    #Metaclass needed to register the workflow
    __metaclass__ = MetaFieldModel
    #category, namespace, name for the registry
    category = RegistryCategories.base
    namespace = get_namespace(__module__)
    name = __name__.lower()
    #Defines how tasks are run
    runner = import_from_string(settings.RUNNER)
    input_file = ""
    input_format = DataFormats.csv
    target_file = ""
    target_format = DataFormats.csv
    predict_file = ""
    predict_format = DataFormats.csv
    tasks = []
    run_id = ""
    help_text = "Base class for workflow. Do not use directly."

    def __init__(self, **kwargs):
        #initialize runner. Don't do this at class level to avoid sharing same runner object.
        self.runner = self.runner()
        self.setup_run = False

    def setup(self):
        """
        Reformat the input data and mark the workflow as set up
        """
        #Reformat input data as needed
        self.reformatted_input = self.reformat_input()
        self.setup_run = True

    def find_dependencies(self, task):
        """
        Return the dependencies declared on a task
        """
        return task.dependencies

    def execute_train_task_with_dependencies(self, task_cls, **kwargs):
        """
        Run the training, as well as any dependencies of the training
        task_cls - class of a task
        kwargs - arguments for the task's train method; arguments missing from
        kwargs are filled in from the task's own args
        Returns the trained task instance
        """
        log.info("Task {0}".format(get_task_name(task_cls)))
        #Instantiate the task
        task_inst = task_cls()
        #Grab arguments from the task instance and set them (explicit kwargs win)
        for arg in task_inst.args:
            kwargs.setdefault(arg, task_inst.args[arg])
        #Check for dependencies defined by the task
        if hasattr(task_inst, "dependencies"):
            deps = task_inst.dependencies
            #Run the dependencies through recursion (in case of dependencies of dependencies, etc)
            for dep in deps:
                log.info("Dependency {0}".format(get_task_name(dep)))
                self.execute_train_task_with_dependencies(dep.cls, **dep.args)
            #Add executed dependency to trained_dependencies list on the task
            #NOTE(review): inst receives the dependency descriptor, not the trained
            #instance returned by the recursive call above -- confirm this is intended
            task_inst.trained_dependencies = [
                TrainedDependency(category=dep.category,
                                  namespace=dep.namespace,
                                  name=dep.name,
                                  inst=dep)
                for dep in deps
            ]
        #Finally, run the task
        task_inst.train(**kwargs)
        return task_inst

    def execute_predict_task(self, task_inst, predict_data, **kwargs):
        """
        Do a prediction
        task_inst - instance of a task
        predict_data - data to predict on
        The task is called with its own args; kwargs are not forwarded.
        """
        return task_inst.predict(predict_data, **task_inst.args)

    def train(self, **kwargs):
        """
        Do the workflow training
        Runs setup first if needed, then trains every task in self.tasks in order
        and stores the trained instances in self.trained_tasks.
        Raises Exception if the input data for a task is None.
        """
        log.info("Starting to train...")
        if not self.setup_run:
            self.setup()
        self.trained_tasks = []
        for task in self.tasks:
            task_input = self.reformatted_input[task.data_format]
            data = task_input['data']
            target = task_input['target']
            if data is None:
                raise Exception(
                    "Data cannot be none. Check the config file to make sure the right input is being read."
                )
            kwargs['data'] = data
            kwargs['target'] = target
            trained_task = self.execute_train_task_with_dependencies(
                task, **kwargs)
            self.trained_tasks.append(trained_task)
            #If the trained task alters the data in any way, pass it down the chain to the next task
            if hasattr(trained_task, 'data'):
                self.reformatted_input[
                    task.data_format]['data'] = trained_task.data
        log.info("Finished training.")

    def predict(self, **kwargs):
        """
        Do the workflow prediction (done after training, with new data)
        Returns a dict mapping each trained task's name to its prediction result
        """
        reformatted_predict = self.reformat_predict_data()
        results = {}
        for task_inst in self.trained_tasks:
            predict = reformatted_predict[task_inst.data_format]['predict']
            kwargs['predict'] = predict
            results[get_task_name(task_inst)] = self.execute_predict_task(
                task_inst, predict, **kwargs)
        return results

    def find_input(self, input_format):
        """
        Find an input class for a given format
        input_format - see utils.input.dataformats
        """
        return find_needed_input(input_format)

    def read_input(self, input_cls, filename, **kwargs):
        """
        Read in input and do some minimal preformatting
        input_cls - the class to use to read the input
        filename - input filename
        """
        input_inst = input_cls()
        input_inst.read_input(filename)
        return input_inst.get_data()

    def reformat_file(self, input_file, input_format, output_format):
        """
        Reformat input data files to a format the tasks can use
        input_file - path of the file to read
        input_format - format the file is in
        output_format - format to convert the data to
        Returns the converted data, or None if the file, the format or an input
        class for the format is missing.
        Raises Exception if no formatter can convert input_format to output_format.
        """
        #Return none if input_file or input_format do not exist
        if input_file is None or input_format is None:
            return None
        #Find the needed input class and read the input stream
        try:
            input_cls = self.find_input(input_format)
            input_inst = input_cls()
        except TypeError:
            #Return none if input_cls is a Nonetype
            return None
        #If the input file cannot be found, return None
        try:
            input_inst.read_input(self.absolute_filepath(input_file))
        except IOError:
            return None
        formatter = find_needed_formatter(input_format, output_format)
        if formatter is None:
            #Report the format that was actually requested; self.input_format is
            #wrong when this is called for the target or predict file
            raise Exception(
                "Cannot find a formatter that can convert from {0} to {1}".
                format(input_format, output_format))
        formatter_inst = formatter()
        formatter_inst.read_input(input_inst.get_data(), input_format)
        return formatter_inst.get_data(output_format)

    def absolute_filepath(self, input_file):
        """
        Gets absolute path of a file
        """
        #abspath needed to avoid relative path issues
        return os.path.abspath(input_file)

    def reformat_predict_data(self, **kwargs):
        """
        Reformat the predict file into every format the tasks need
        Relies on self.needed_formats, which is set by reformat_input
        """
        reformatted_predict = {}
        for output_format in self.needed_formats:
            reformatted_predict[output_format] = {
                'predict': self.reformat_file(self.predict_file,
                                              self.predict_format,
                                              output_format),
            }
        return reformatted_predict

    def reformat_input(self, **kwargs):
        """
        Reformat input data
        Converts the input and target files into every data format needed by
        the tasks, and records those formats in self.needed_formats
        """
        needed_formats = [task_cls.data_format for task_cls in self.tasks]
        self.needed_formats = list(set(needed_formats))
        reformatted_input = {}
        for output_format in self.needed_formats:
            reformatted_input[output_format] = {
                'data': self.reformat_file(self.input_file,
                                           self.input_format,
                                           output_format),
                'target': self.reformat_file(self.target_file,
                                             self.target_format,
                                             output_format)
            }
        return reformatted_input
class Validate(Task):
    """
    Cross validates predictions of the 'next_year_wins' column.
    """
    data = Complex()
    results = Complex()
    error = Float()
    importances = Complex()
    importance = Complex()
    column_names = List()
    data_format = DataFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    help_text = "Validate."

    def cross_validate(self, data, non_predictors, **kwargs):
        """
        Split the rows of data into shuffled folds, train on all but one fold
        and predict the held-out fold, for every fold.
        data - dataframe; its index is reset to 0..n-1 in place
        non_predictors - column names that must not be used as features
        kwargs - nfolds (default 3), algo (algorithm class), seed (default 1)
        Returns (results, folds): one prediction result per fold and the row positions in each fold.
        """
        nfolds = kwargs.get('nfolds', 3)
        algo = kwargs.get('algo')
        seed = kwargs.get('seed', 1)
        data_len = data.shape[0]
        counter = 0
        fold_length = int(math.floor(data_len / nfolds))
        folds = []
        #Seeded shuffle of the row positions keeps the folds reproducible
        data_seq = list(range(0, data_len))
        random.seed(seed)
        random.shuffle(data_seq)
        for fold in range(0, nfolds):
            start = counter
            end = counter + fold_length
            if fold == (nfolds - 1):
                #The last fold takes whatever rows are left over
                end = data_len
            folds.append(data_seq[start:end])
            counter += fold_length
        results = []
        data.index = range(data.shape[0])
        self.importances = []
        for (i, fold) in enumerate(folds):
            predict_data = data.iloc[fold, :]
            out_indices = list(chain.from_iterable(folds[:i] + folds[(i + 1):]))
            train_data = data.iloc[out_indices, :]
            alg = algo()
            target = train_data['next_year_wins']
            train_data = train_data[[l for l in list(train_data.columns) if l not in non_predictors]]
            predict_data = predict_data[[l for l in list(predict_data.columns) if l not in non_predictors]]
            #Training arguments come from the algorithm class, not from kwargs
            clf = alg.train(train_data, target, **algo.args)
            results.append(alg.predict(predict_data))
            self.importances.append(clf.feature_importances_)
        return results, folds

    def train(self, data, target, **kwargs):
        """
        Used in the training phase. Cross validates data and stores the gathered results.
        """
        #Excludes the 'team' and target columns, plus any column named after a
        #normalized team name (lower case, spaces replaced by underscores)
        non_predictors = [i.replace(" ", "_").lower() for i in list(set(data['team']))] + ["team", "next_year_wins"]
        self.column_names = [l for l in list(data.columns) if l not in non_predictors]
        results, folds = self.cross_validate(data, non_predictors, **kwargs)
        self.gather_results(results, folds, data)

    def gather_results(self, results, folds, data):
        """
        Line the per-fold predictions back up with the rows of data, keep the rows
        with positive next_year_wins and total_wins, then compute the error and
        the mean feature importances.
        """
        full_results = list(chain.from_iterable(results))
        full_indices = list(chain.from_iterable(folds))
        partial_result_df = make_df([full_results, full_indices], ["result", "index"])
        #NOTE: DataFrame.sort is the legacy pandas API (sort_values in newer releases)
        partial_result_df = partial_result_df.sort(["index"])
        partial_result_df.index = range(partial_result_df.shape[0])
        result_df = pd.concat([partial_result_df, data[['next_year_wins', 'team', 'year', 'total_wins']]], axis=1)
        #Both comparisons need their own parentheses: & binds tighter than >, so
        #without them total_wins was bitwise-anded with the first mask
        result_df = result_df[(result_df['next_year_wins'] > 0) & (result_df['total_wins'] > 0)]
        self.results = result_df
        self.calc_error(result_df)
        self.calc_importance(self.importances, self.column_names)

    def calc_error(self, result_df):
        """
        Store the mean absolute error in self.error, leaving out rows from the latest year.
        """
        filtered_df = result_df[result_df['year'] < np.max(result_df['year'])]
        self.error = np.mean(np.abs(filtered_df['result'] - filtered_df['next_year_wins']))

    def calc_importance(self, importances, col_names):
        """
        Average the per-fold importances by column and store them, sorted, in self.importance.
        """
        importance_frame = pd.DataFrame(importances)
        importance_frame.columns = col_names
        self.importance = importance_frame.mean(axis=0)
        #NOTE: legacy in-place Series.sort
        self.importance.sort(0)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training. Does nothing for this task.
        """
        pass
class LoadAudioFiles(Task):
    """
    Matches .ogg audio files to subtitle rows by season and episode, extracts
    audio features for every subtitle line and predicts a label for each line.
    """
    data = Complex()
    all_files = List()
    seq = Complex()
    res = Complex()
    label_codes = Dict()
    data_format = SimpsonsFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    #NOTE(review): help text looks copied from the script cleanup task -- confirm
    help_text = "Cleanup simpsons scripts."
    args = {
        'audio_dir': settings.AUDIO_DIR,
        'timeout': 600,
        'only_labelled_lines': settings.ONLY_LABELLED_LINES,
        'processed_files_limit': settings.PROCESSED_FILES_LIMIT
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase. Stores the result of predict in self.data.
        """
        self.data = self.predict(data, **kwargs)

    def extract_season(self, name):
        """
        Extract the season and episode numbers from a file name.
        Recognizes "[3x12]" / "[3.12]" tags first, then "S03E12" tags.
        Returns (season, episode) as ints, or (None, None) if neither is found.
        """
        for pattern in (r'\[(\d+)[x\.](\d+)\]', r'S(\d+)E(\d+)'):
            match = re.search(pattern, name)
            if match is not None:
                return int(match.group(1)), int(match.group(2))
        return None, None

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.
        data - dataframe of subtitle rows; the code reads the columns
        'season', 'episode', 'start', 'end', 'label' and 'line'
        kwargs - audio_dir (directory with audio files, one level of
        subdirectories is searched), timeout (seconds to wait for the features
        of one line), only_labelled_lines (skip unlabelled lines and episodes),
        processed_files_limit (maximum number of files to process, or None)
        Returns a dataframe with one row per processed line, holding the subtitle
        columns, the audio features, 'label_code', 'result_code' and 'result_label'.
        """
        audio_dir = kwargs['audio_dir']
        timeout = kwargs['timeout']
        oll = kwargs['only_labelled_lines']
        pff = kwargs['processed_files_limit']
        self.all_files = self._find_audio_files(audio_dir)

        p = Pool(4, maxtasksperchild=50)
        finished = False
        try:
            frames = []
            counter = 0
            for f in self.all_files:
                season, episode = self.extract_season(f)
                #NOTE(review): season 11 episode 6 is always skipped; the reason is not visible here
                if season is None or (season == 11 and episode == 6):
                    continue
                subtitle_frame = data[((data['season'] == season) &
                                       (data['episode'] == episode))]
                if subtitle_frame.shape[0] == 0:
                    continue
                if oll:
                    label_frame = subtitle_frame[(subtitle_frame['label'] != "")]
                    if label_frame.shape[0] == 0:
                        continue
                #To cause loop to end early, remove if needed
                if pff is not None and isinstance(pff, int) and counter >= pff:
                    break
                counter += 1
                log.info("On file {0} Season {1} Episode {2}".format(
                    counter, season, episode))
                f_data, fs, enc = oggread(f)
                #NOTE: DataFrame.sort is the legacy pandas API (sort_values in newer releases)
                subtitle_frame = subtitle_frame.sort('start')
                subtitle_frame.index = range(subtitle_frame.shape[0])
                samps = []
                good_rows = []
                for i in range(0, subtitle_frame.shape[0]):
                    start = subtitle_frame['start'].iloc[i]
                    end = subtitle_frame['end'].iloc[i]
                    #Skip lines longer than 6 seconds and, if requested, unlabelled lines
                    if end - start > 6 or (subtitle_frame['label'][i] == '' and oll):
                        continue
                    #Slice bounds must be integers; start and end are fractional seconds
                    samp = f_data[int(start * fs):int(end * fs), :]
                    samps.append({'samp': samp, 'fs': fs})
                    good_rows.append(i)
                r = p.imap(process_subtitle, samps, chunksize=1)
                sf = subtitle_frame.iloc[good_rows]
                results = []
                for i in range(len(samps)):
                    try:
                        results.append(r.next(timeout=timeout))
                    except TimeoutError:
                        #Keep a placeholder so results stay aligned with the rows of sf
                        results.append(None)
                good_rows = [
                    i for i in range(0, len(results)) if results[i] is not None
                ]
                audio_features = [i for i in results if i is not None]
                good_sf = sf.iloc[good_rows]
                good_sf.index = range(good_sf.shape[0])
                audio_frame = pd.DataFrame(audio_features)
                audio_frame.index = range(audio_frame.shape[0])
                df = pd.concat([good_sf, audio_frame], axis=1)
                df = df.fillna(-1)
                df.index = range(df.shape[0])
                frames.append(df)
                lab_df_shape = df[df['label'] != ''].shape[0]
                log.info("Processed {0} lines, {1} of which were labelled".format(
                    df.shape[0], lab_df_shape))
            finished = True
        finally:
            #Always release the worker processes; after a failure pending work is dropped
            if finished:
                p.close()
            else:
                p.terminate()
            p.join()
        log.info("Done processing episodes.")

        data = pd.concat(frames, axis=0)
        data.index = range(data.shape[0])
        for c in list(data.columns):
            data[c] = data[c].real
        #Map every alias listed in CHARACTERS to its key
        for k in CHARACTERS:
            for i in CHARACTERS[k]:
                #.loc assigns into the frame itself; chained indexing may write to a copy
                data.loc[data['label'] == i, 'label'] = k
        self.label_codes = {k: i for (i, k) in enumerate(set(data['label']))}
        reverse_label_codes = {
            self.label_codes[k]: k for k in self.label_codes
        }
        data['label_code'] = [self.label_codes[k] for k in data['label']]

        self.seq = SequentialValidate()
        #Do cv to get error estimates
        cv_frame = data[data['label'] != ""]
        self.seq.train(cv_frame, **self.seq.args)
        self.res = self.seq.results
        self.res = self.res[[
            'line', 'label', 'label_code', 'result_code', 'result_label'
        ]]
        exact_percent, adj_percent = compute_error(self.res)
        log.info("Exact match percent: {0}".format(exact_percent))
        log.info("Adjacent match percent: {0}".format(adj_percent))

        #Predict in the frame: train on the labelled lines, predict every line
        alg = RandomForestTrain()
        target = cv_frame['label_code']
        non_predictors = ["label", "line", "label_code"]
        train_names = [
            l for l in list(cv_frame.columns) if l not in non_predictors
        ]
        train_data = cv_frame[train_names]
        predict_data = data[train_names]
        clf = alg.train(train_data, target, **alg.args)
        data['result_code'] = alg.predict(predict_data)
        data['result_label'] = [
            reverse_label_codes[k] for k in data['result_code']
        ]
        return data

    def _find_audio_files(self, audio_dir):
        """
        Absolute paths of the .ogg files in audio_dir and in its immediate subdirectories.
        """
        all_files = []
        for ad in os.listdir(audio_dir):
            ad_path = os.path.abspath(os.path.join(audio_dir, ad))
            if os.path.isdir(ad_path):
                all_files += [
                    os.path.abspath(os.path.join(ad_path, f))
                    for f in os.listdir(ad_path)
                ]
            else:
                all_files.append(ad_path)
        return [f for f in all_files if f.endswith(".ogg")]