Example 1
class BaseInput(FieldModel):
    """
    Base for the input class
    """
    #Cached fields.  See fields.base
    data = Dict()

    #Used for the registry
    category = RegistryCategories.inputs
    namespace = get_namespace(__module__)

    input_format = "none"

    help_text = "Base class for input.  Do not use directly."

    def __init__(self, **kwargs):
        super(BaseInput, self).__init__(**kwargs)

    def read_input(self, stream, **kwargs):
        """
        Reads the input in the specified format.  Overridden by specific input functions.
        """
        pass

    def get_data(self):
        """
        After data has been input, returns it.  Override if needed.
        """
        return self.data
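A hedged sketch (not from the original set): a concrete input only needs to override read_input and fill self.data, mirroring ScriptInput further below.

class JSONFileInput(BaseInput):
    """
    Illustrative subclass: reads a json file into self.data
    """
    input_format = "json"  #hypothetical format name
    help_text = "Read in json data."
    namespace = get_namespace(__module__)

    def read_input(self, filename, **kwargs):
        import json
        with open(filename) as filestream:
            self.data = json.load(filestream)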
Example 2
class CleanupScriptList(Task):
    data = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override.
        """

        script_removal_values = [""]
        for r in script_removal_values:
            data = data[data["script"]!=r]
        log.info(data)
        data['episode_name'] = [i.split('\n')[0].strip() for i in data['episode_name']]
        data['episode_code'] = [i.split('/')[-1].split('.html')[0] for i in data['url']]

        data.index = range(data.shape[0])
        return data
Example 3
class EvolveMusic(Task):
    data = Complex()
    clf = Complex()
    importances = Complex()

    data_format = MusicFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    args = {
        'non_predictors' : ["labels","label_code","fs","enc","fname","Unnamed: 0"],
        'target_var' : 'label_code',
    }

    def train(self,data,target, **kwargs):
        non_predictors = kwargs.get('non_predictors')
        target = kwargs.get('target_var')

        data.index = range(data.shape[0])

        alg = RandomForestTrain()
        good_names = [i for i in data.columns if i not in non_predictors]
        for c in good_names:
            data[c] = data[c].astype(float)

        for c in good_names:
            data[c] = data[c].real

        clf = alg.train(np.asarray(data[good_names]),data[target],**alg.args)
        importances = clf.feature_importances_

        counter = 0
        for i in xrange(0,data.shape[0]):
            fname = data['fname'][i]
            vec, fs, enc = read_sound(fname)
            label = data["labels"][i]
            if label=="classical":
                counter+=1
                name = fname.split("/")[-1]
                feats = process_song(vec,fs)
                initial_quality = clf.predict_proba(feats)[0,1]
                headers = "song_index,iteration,quality,distance,splice_song_index,splice_song"
                v2s = [headers,"{0},{1},{2},{3},{4},{5}".format(i,-1,initial_quality,0,0,"N/A")]
                print(headers)
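                #Evolve the track over 100 iterations: every 10th iteration splices in a random song, re-scores with the classifier, and writes the track out when it scores near 0 or 1 and has drifted far enough.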
                for z in xrange(0,100):
                    if z%10==0 or z==0:
                        v2ind = random.randint(0,data.shape[0]-1)
                        v2fname = data['fname'][v2ind]
                        vec2, v2fs, v2enc = read_sound(v2fname)
                        feats = process_song(vec,fs)
                        quality = clf.predict_proba(feats)[0,1]
                        nearest_match, min_dist = find_nearest_match(feats, data[good_names])
                        descriptor = "{0},{1},{2},{3},{4},{5}".format(i,z,quality,min_dist,v2ind,v2fname.split("/")[-1])
                        v2s.append(descriptor)
                        print(descriptor)
                        if min_dist>.35 and (abs(quality-0)<=.1 or abs(1-quality)<=.1) and z!=0:
                            write_file(name,vec,fs,enc,v2s)
                    vec = alter(vec,vec2,fs,v2fs,clf)
                write_file(name,vec,fs,enc,v2s)
Example 4
class ScriptFormatter(JSONFormat):
    namespace = get_namespace(__module__)

    def from_script(self, input_data):
        """
        Reads script format input data, but data is already in json, so return.
        """
        return input_data
Example 5
class ProcessMusic(Task):
    data = Complex()

    data_format = MusicFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process sports events."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override.
        """
        d = []
        labels = []
        encs = []
        fss = []
        fnames = []
        if not os.path.isfile(settings.FEATURE_PATH):
            for (z,p) in enumerate(data):
                log.info("On file {0}".format(z))
                try:
                    vec, fs, enc = read_sound(p['newpath'])
                except Exception:
                    continue
                try:
                    features = process_song(vec, fs)
                except Exception:
                    log.exception("Could not get features")
                    continue
                d.append(features)
                labels.append(p['type'])
                fss.append(fs)
                encs.append(enc)
                fnames.append(p['newpath'])
            frame = pd.DataFrame(d)
            frame['labels']  = labels
            frame['fs'] = fss
            frame['enc'] = encs
            frame['fname'] = fnames
            label_dict = {
                'classical' : 1,
                'electronic' : 0
            }
            frame['label_code'] = [label_dict[i] for i in frame['labels']]
            frame.to_csv(settings.FEATURE_PATH)
        else:
            frame = pd.read_csv(settings.FEATURE_PATH)

        return frame
Example 6
class CleanupScriptText(Task):
    data = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data)

    def check_for_line_split(self, line):
        return check_if_character(line.split(":")[0])

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override.
        """

        voice_scripts = []
        for i in xrange(0,data.shape[0]):
            script_lines = data['script'][i].split('\n')
            voice_lines = []
            current_line = ""
            for line in script_lines:
                current_line = current_line.strip()
                line = line.strip()
                if line.startswith("[") and line.endswith("]"):
                    continue
                if line.startswith("-"):
                    continue
                voice_line = re.search(r'\w+:', line)
                if voice_line is not None:
                    if self.check_for_line_split(current_line):
                        voice_lines.append(current_line)
                    current_line = line
                elif (len(line)==0 or line.startswith("-")) and len(current_line)>0:
                    if self.check_for_line_split(current_line):
                        voice_lines.append(current_line)
                    current_line = ""
                    voice_lines.append(" ")
                elif len(current_line)>0:
                    current_line+=" " + line
            script_text = "\n".join([l for l in voice_lines if len(l)>0 and "{" not in l and "=" not in l])
            script_text = re.sub(r"\[.+\]", "", script_text)
            voice_scripts.append(script_text.strip())

        data['voice_script'] = voice_scripts

        return data
Example 7
class MusicInput(BaseInput):
    """
    Extends baseinput to read music links
    """
    input_format = MusicFormats.mjson
    help_text = "Read in music links data."
    namespace = get_namespace(__module__)

    def read_input(self, mfile, has_header=True):
        """
        mfile is a path to a json file with music links
        """

        mjson = json.load(open(mfile))
        for m in mjson:
            m['ltype'] = m['ltype'].split("?")[0]
        ltypes = list(set([m['ltype'] for m in mjson]))
        for l in ltypes:
            jp = join_path(settings.MUSIC_PATH, l)
            if not os.path.isdir(jp):
                os.mkdir(jp)

        fpaths = []
        for m in mjson:
            fname = m['link'].split("/")[-1]
            fpath = join_path(join_path(settings.MUSIC_PATH, m['ltype']),
                              fname)
            try:
                if not os.path.isfile(fpath):
                    r = requests.get(m['link'])
                    f = open(fpath, 'wb')
                    f.write(r.content)
                    f.close()
                fpaths.append({'type': m['ltype'], 'path': fpath})
            except Exception:
                log.exception("Could not get music file.")

        for p in fpaths:
            newfile = p['path'][:-4] + ".ogg"
            if not os.path.isfile(newfile):
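                #Decode the mp3 to a WAV stream on stdout with mpg123, then pipe it into oggenc to produce the .ogg file.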
                frommp3 = subprocess.Popen(['mpg123', '-w', '-', p['path']],
                                           stdout=subprocess.PIPE)
                toogg = subprocess.Popen(['oggenc', '-'],
                                         stdin=frommp3.stdout,
                                         stdout=subprocess.PIPE)
                with open(newfile, 'wb') as outfile:
                    while True:
                        data = toogg.stdout.read(1024 * 100)
                        if not data:
                            break
                        outfile.write(data)
            p['newpath'] = newfile

        self.data = fpaths
Example 8
class EventFormatter(JSONFormat):
    namespace = get_namespace(__module__)

    def from_events(self, input_data):
        """
        Reads event format input data, but data is already in json, so return.
        """
        return input_data

    def to_dataframe(self):
        return self.data
Example 9
class RandomForestTrain(Train):
    """
    A class to train a random forest
    """
    colnames = List()
    clf = Complex()
    category = RegistryCategories.algorithms
    namespace = get_namespace(__module__)
    algorithm = RandomForestClassifier
    args = {'n_estimators' : 300, 'min_samples_leaf' : 4, 'compute_importances' : True}

    help_text = "Train and predict with Random Forest."
Example 10
class BaseFormat(FieldModel):
    """
    Base class to reformat input data.  If implementing, add in from_ and to_ methods (see read_input and get_data)
    """
    #These fields will be cached.  See fields.base
    data = Dict()

    #Set the category and namespace for the registry
    category = RegistryCategories.formatters
    namespace = get_namespace(__module__)

    help_text = "Base class for reformatting input data.  Do not use directly."

    def __init__(self, **kwargs):
        super(BaseFormat, self).__init__(**kwargs)
        self.input_formats = []
        self.output_formats = []
        self.setup_formats()

    def setup_formats(self):
        """
        Inspects its methods to see what it can convert from and to
        """
        methods = self.get_methods()
        for m in methods:
            #Methods named "from_X" will be assumed to convert from format X to the common format
            if m.startswith("from_"):
                self.input_formats.append(re.sub("from_", "", m))
            #Methods named "to_X" will be assumed to convert from the common format to X
            elif m.startswith("to_"):
                self.output_formats.append(re.sub("to_", "", m))

    def read_input(self, input_data, data_format):
        """
        Reads the input data and converts to common format
        input_data - the output from one of the input classes (ie CSVInput)
        data_format - the format of the data.  See utils.input.dataformats
        """
        if data_format not in self.input_formats:
            raise Exception("Input format {0} not available with this class. Available formats are {1}.".format(data_format, self.input_formats))
        data_converter = getattr(self, "from_" + data_format)
        self.data = data_converter(input_data)

    def get_data(self, data_format):
        """
        Reads the common format and converts to output data
        data_format - the format of the output data.  See utils.input.dataformats
        """
        if data_format not in self.output_formats:
            raise Exception("Output format {0} not available with this class. Available formats are {1}.".format(data_format, self.output_formats))
        data_converter = getattr(self, "to_" + data_format)
        return data_converter()
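A hedged sketch of a subclass (CSVFormat and its methods are illustrative): because setup_formats() inspects method names, defining from_csv and to_dataframe registers "csv" as an input format and "dataframe" as an output format.

class CSVFormat(BaseFormat):
    namespace = get_namespace(__module__)

    def from_csv(self, input_data):
        #input_data is a list of rows; keep it as the common format
        return input_data

    def to_dataframe(self):
        #First row is assumed to be the header
        return pd.DataFrame(self.data[1:], columns=self.data[0])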
Example 11
class ReformatScriptText(Task):
    data = Complex()
    voice_lines = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    args = {'scriptfile' : os.path.abspath(os.path.join(settings.PROJECT_PATH, "data/raw_scripts2.json")), 'do_replace' : True}

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = data
        self.predict(self.data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override.
        """

        voice_scripts = list(data['voice_script'])
        scriptfile = kwargs['scriptfile']
        do_replace = kwargs['do_replace']
        json_scripts = json.load(open(scriptfile))
        voice_scripts+=[s['script'] for s in json_scripts]
        script_segments = []
        for script in voice_scripts:
            script = script.replace("\"","")
            lines = script.split("\n")
            segment = []
            for line in lines:
                if line.strip()!="":
                    line = line.encode('ascii','ignore')
                    line_split = line.split(":")
                    if do_replace:
                        line_split[0] = find_replacement(line_split[0].strip())
                    line_split[0] = cleanup_name(line_split[0].strip())
                    segment.append({'speaker' : line_split[0],
                                    'line' : ":".join(line_split[1:]).strip()})
                else:
                    if len(segment)>0:
                        script_segments.append(segment)
                        segment = []
            if len(segment)>0:
                script_segments.append(segment)
        self.voice_lines = script_segments
Example 12
class ScriptInput(BaseInput):
    """
    Extends baseinput to read simpsons scripts
    """
    input_format = SimpsonsFormats.script
    help_text = "Reformat simpsons script data."
    namespace = get_namespace(__module__)

    def read_input(self, filename, has_header=True):
        """
        filename is a path to a json file of scripts
        """

        filestream = open(filename)
        self.data = json.load(filestream)
Example 13
class MidiInput(BaseInput):
    """
    Extends baseinput to read midi
    """
    input_format = MusicFormats.midijson
    help_text = "Read in music links data."
    namespace = get_namespace(__module__)

    def read_input(self, mfile, has_header=True):
        """
        mfile is a path to a json file with midi links
        """

        mjson = json.load(open(mfile))
        ltypes = list(set([m['ltype'] for m in mjson]))
        for l in ltypes:
            jp = join_path(settings.MIDI_MUSIC_PATH, l)
            if not os.path.isdir(jp):
                os.mkdir(jp)

        fpaths = []
        for m in mjson:
            fname = m['link'].split("/")[-1]
            fpath = join_path(join_path(settings.MIDI_MUSIC_PATH, m['ltype']),
                              fname)
            log.info(fpath)
            try:
                if not os.path.isfile(fpath):
                    r = requests.get(m['link'])
                    f = open(fpath, 'wb')
                    f.write(r.content)
                    f.close()
                fpaths.append({'type': m['ltype'], 'path': fpath})
            except Exception:
                log.exception("Could not get music file.")

        npaths = []
        for p in fpaths:
            try:
                p['newpath'] = convert_to_ogg(p['path'])
                npaths.append(p)
            except Exception:
                log.exception("Could not convert to ogg")
                continue

        self.data = npaths
Example 14
class PullDownComments(Task):
    data = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Pull down comments and store them."

    def train(self, data, **kwargs):
        try:
            items_done = read_raw_data_from_cache(
                os.path.abspath(
                    os.path.join(settings.DATA_PATH, "items_done.p")))
            comments = [c['comment'] for c in items_done]
            replies = [c['reply'] for c in items_done]
            for subreddit in settings.REPLY_SUBREDDIT_LIST:
                try:
                    comment = get_single_comment(subreddit)
                    print(comment)
                    if comment is None:
                        log.info("Could not get a comment")
                        continue
                    text = comment.body
                    cid = comment.id
                    reply = test_knn_matcher(knn_matcher, text)
                    if text in comments or (reply in replies
                                            and reply is not None):
                        continue
                    data = {'comment': text, 'reply': reply, 'comment_id': cid}
                    items_done.append(data)
                    replies.append(reply)
                    comments.append(text)
                    log.info("Subreddit: {0}".format(subreddit))
                    log.info("Comment: {0} {1}".format(cid, text))
                    log.info("Reply: {0}".format(reply))
                    log.info("-------------------")
                except Exception:
                    log.exception("Cannot get reply for {0}".format(subreddit))
                    continue
                write_data_to_cache(items_done, "items_done.p", "comment_id")
        except Exception:
            log.exception("Could not pull down comment.")
Example 15
class ProcessGames(Task):
    data = Complex()
    row_data = List()
    speaker_code_dict = Dict()
    speaker_codes = List()
    vectorizer = Complex()

    data_format = SportsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process sports events."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override.
        """

        con = sqlite3.connect(settings.DB_PATH)
        c = con.cursor()
        rosters = sql.read_frame("select * from rosters", con)

        tys = []
        for i in xrange(0, rosters.shape[0]):
            year = rosters.iloc[i]['year']
            team = rosters.iloc[i]['team']
            ty = [year, team]
            if ty not in tys:
                tys.append(ty)

        for ty in tys:
            year, team = ty
            ros = rosters[((rosters['year'] == year) &
                           (rosters['team'] == team))]
            players = list(ros['id'])

        return data
Example 16
class Task(FieldModel):
    """
    Base class for task
    """
    #Used by the registry
    category = RegistryCategories.base
    namespace = get_namespace(__module__)

    #Define dependencies to run before this (results are passed into class before execution)
    dependencies = []
    trained_dependencies = []

    #Additional arguments to pass into train and predict functions (additional data files, etc)
    args = {}

    #Data format accepted
    data_format = DataFormats.dataframe

    #Cached field
    data = Dict()

    help_text = "Base task class. Do not use directly."

    def __init__(self, **kwargs):
        super(Task, self).__init__(**kwargs)

    def train(self, data, **kwargs):
        """
        Used in the training phase.  Override.
        """
        pass

    def predict(self, test_data, **kwargs):
        """
        Used in the predict phase, after training.  Override.
        """
        pass

    def get_data(self):
        data_dict = {}
        for key in self.stored_values:
            data_dict.update({key: getattr(self, key)})
        return data_dict
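A minimal sketch of a subclass (the class and column names are illustrative): override train and predict, and downstream tasks receive whatever ends up in self.data.

class LowercaseLines(Task):
    data = Complex()
    data_format = DataFormats.dataframe
    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    help_text = "Lowercase a line column."

    def train(self, data, target, **kwargs):
        self.data = self.predict(data)

    def predict(self, data, **kwargs):
        data['line'] = [l.lower() for l in data['line']]
        return data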
Example 17
class SubtitleFormatter(JSONFormat):
    namespace = get_namespace(__module__)

    def from_subtitle(self, input_data):
        """
        Reads subtitle format input data and converts to json.
        """
        reformatted_data = []
        for (i, row) in enumerate(input_data):
            if i == 0:
                headers = row
            else:
                data_row = {}
                for (j, h) in enumerate(headers):
                    if j < len(row):
                        data_row.update({h: row[j]})
                    else:
                        data_row.update({h: 0})
                reformatted_data.append(data_row)
        return reformatted_data
Example 18
class GenerateTransitionMatrix(Task):
    data = Complex()

    data_format = MusicFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process midi files."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override.
        """
        tempos = {'tick' : [], 'mpqn' : []}
        notes = {}
        all_instruments = []
        for (z,p) in enumerate(data):
            log.info("On file {0}".format(z))
            try:
                m = midi.read_midifile(p['path'])
            except Exception:
                continue
            try:
                notes, tempos, instruments = process_midifile(m,notes,tempos)
                all_instruments.append(instruments)
            except Exception:
                log.exception("Could not get features")
                continue
        nm, tm = generate_matrices(notes,tempos)

        data = {'files' : data, 'notes' : notes, 'tempos' : tempos, 'nm' : nm, 'tm': tm, 'in' : list(chain.from_iterable(all_instruments))}

        return data
Example 19
class GameInput(BaseInput):
    """
    Extends baseinput to read baseball event data
    """
    input_format = SportsFormats.events
    help_text = "Read in baseball event data."
    namespace = get_namespace(__module__)

    def read_input(self, directory, has_header=True):
        """
        directory is a path to a directory containing event file folders
        """

        efolds = [
            join_path(directory, f) for f in os.listdir(directory)
            if os.path.isdir(os.path.join(directory, f))
        ]
        efiles = []
        for fold in efolds:
            files = [
                i for i in os.listdir(fold)
                if os.path.isfile(join_path(fold, i)) if i.endswith(".EVN")
            ]
            years = list(set([i[:4] for i in files]))
            for (i, y) in enumerate(years):
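                #Expand the Retrosheet event files into per-year csv files with the Chadwick tools (cwevent, cwgame, cwbox), skipping years already done.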
                if not os.path.isfile('{0}/events-{1}.csv'.format(
                        settings.DATA_PATH, y)):
                    cmd = "{cp}cwevent -q -n -f 0-96 -x 0-62 -y {y} {y}*.EV* > {dp}/events-{y}.csv".format(
                        cp=settings.CHADWICK_PATH, dp=settings.DATA_PATH, y=y)
                    os.chdir(fold)
                    subprocess.call(cmd, shell=True)
                if not os.path.isfile('{0}/games-{1}.csv'.format(
                        settings.DATA_PATH, y)):
                    cmd = "{cp}cwgame -q -n -f 0-83 -y {y} {y}*.EV* > {dp}/games-{y}.csv".format(
                        cp=settings.CHADWICK_PATH, dp=settings.DATA_PATH, y=y)
                    subprocess.call(cmd, shell=True)
                if not os.path.isfile('{0}/boxes-{1}.csv'.format(
                        settings.DATA_PATH, y)):
                    cmd = "{cp}cwbox -q -X -y {y} {y}*.EV* > {dp}/boxes-{y}.csv".format(
                        cp=settings.CHADWICK_PATH, dp=settings.DATA_PATH, y=y)
                    os.chdir(fold)
                    subprocess.call(cmd, shell=True)
            efiles += [
                join_path(fold, i) for i in os.listdir(fold)
                if os.path.isfile(join_path(fold, i))
            ]

        con = sqlite3.connect(settings.DB_PATH)
        c = con.cursor()
        if not table_exists(c, "rosters"):
            rfiles = [f for f in efiles if f.endswith(".ROS")]
            rosters = []
            for r in rfiles:
                filestream = open(r)
                team, year = get_team_and_year(r)
                df = pd.read_csv(filestream,
                                 names=[
                                     "id", "lastname", "firstname", "pbat",
                                     "sbat", "team", "position"
                                 ])
                df['year'] = [year for i in xrange(0, df.shape[0])]
                rosters.append(df)
            roster = pd.concat(rosters, axis=0)
            sql.write_frame(roster, name='rosters', con=con)

        game_files = [
            join_path(settings.DATA_PATH, g)
            for g in os.listdir(settings.DATA_PATH) if g.startswith('games-')
        ]
        event_files = [
            join_path(settings.DATA_PATH, e)
            for e in os.listdir(settings.DATA_PATH) if e.startswith('events-')
        ]

        if not table_exists(c, "games"):
            games = []
            for g in game_files:
                df = pd.read_csv(open(g))
                year = g.split('-')[1].split('.')[0]
                df['year'] = [year for i in xrange(0, df.shape[0])]
                games.append(df)
            games = pd.concat(games, axis=0)
            sql.write_frame(games, name='games', con=con)

        if not table_exists(c, "events"):
            events = []
            for e in event_files:
                df = pd.read_csv(open(e))
                year = e.split('-')[1].split('.')[0]
                df['year'] = [year for i in xrange(0, df.shape[0])]
                events.append(df)
            events = pd.concat(events, axis=0)
            sql.write_frame(events, name='events', con=con)

        self.data = {
            'rosters': 'rosters',
            'games': 'games',
            'events': 'events',
        }
Example 20
class KNNRF(Task):
    data = Complex()
    predictions = Complex()
    importances = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    args = {'algo': RandomForestTrain}

    help_text = "Cleanup simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """
        from preprocess import CHARACTERS

        vec_length = math.floor(MAX_FEATURES / 3)

        algo = kwargs.get('algo')
        alg = algo()
        train_data = data['train_frame'].iloc[:, :-1]
        target = data['train_frame']['current_speaker']
        clf = alg.train(train_data, target, **algo.args)
        self.importances = clf.feature_importances_

        test_data = data['data']
        match_data = data['current_features']
        reverse_speaker_code_dict = {
            data['speaker_code_dict'][k]: k
            for k in data['speaker_code_dict']
        }

        speaker_list = []
        speaker_codes = reverse_speaker_code_dict.keys()
        for i in xrange(0, len(speaker_codes)):
            s_text = "\n".join(
                list(data['speakers'][data['speakers']['speaker'] ==
                                      reverse_speaker_code_dict[
                                          speaker_codes[i]]]['line']))
            speaker_list.append(s_text)
        speaker_features = data['vectorizer'].batch_get_features(speaker_list)

        self.predictions = []
        counter = 0
        for script in test_data['voice_script']:
            counter += 1
            log.info("On script {0} out of {1}".format(
                counter, len(test_data['voice_script'])))
            lines = script.split("\n")
            speaker_code = [-1 for i in xrange(0, len(lines))]
            for (i, line) in enumerate(lines):
                if i > 0 and i % RESET_SCENE_EVERY != 0:
                    previous_line = lines[i - 1]
                    previous_speaker = speaker_code[i - 1]
                else:
                    previous_line = ""
                    previous_speaker = -1

                if i > 1 and i % RESET_SCENE_EVERY != 0:
                    two_back_speaker = speaker_code[i - 2]
                else:
                    two_back_speaker = -1

                if i < (len(lines) - 1):
                    next_line = lines[i + 1]
                else:
                    next_line = ""

                prev_features = data['vectorizer'].get_features(previous_line)
                cur_features = data['vectorizer'].get_features(line)
                next_features = data['vectorizer'].get_features(next_line)

                meta_features = make_df(
                    [[two_back_speaker], [previous_speaker]],
                    ["two_back_speaker", "previous_speaker"])
                #meta_features = make_df([[two_back_speaker]],["two_back_speaker"])
                train_frame = pd.concat([
                    pd.DataFrame(prev_features),
                    pd.DataFrame(cur_features),
                    pd.DataFrame(next_features), meta_features
                ],
                                        axis=1)

                speaker_code[i] = alg.predict(train_frame)[0]

                nearest_match, distance = self.find_nearest_match(
                    cur_features, speaker_features)
                if distance < CHARACTER_DISTANCE_MIN:
                    sc = speaker_codes[nearest_match]
                    speaker_code[i] = sc
                    continue

                for k in CHARACTERS:
                    for c in CHARACTERS[k]:
                        if c in previous_line:
                            speaker_code[i] = data['speaker_code_dict'][k]

                nearest_match, distance = self.find_nearest_match(
                    cur_features, match_data)
                if distance < DISTANCE_MIN:
                    sc = data['speakers']['speaker_code'][nearest_match]
                    speaker_code[i] = sc
                    continue

            df = make_df([
                lines, speaker_code,
                [reverse_speaker_code_dict[round(s)] for s in speaker_code]
            ], ["line", "speaker_code", "speaker"])
            self.predictions.append(df)
        return data

    def find_nearest_match(self, features, matrix):
        features = np.asarray(features)
        distances = [self.euclidean(u, features) for u in matrix]
        nearest_match = distances.index(min(distances))
        return nearest_match, min(distances)

    def euclidean(self, v1, v2):
        return np.sqrt(np.sum(np.square(np.subtract(v1, v2))))
Example 21
class FeatureExtractor(Task):
    data = Complex()
    row_data = List()
    speaker_code_dict = Dict()
    speaker_codes = List()
    vectorizer = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    args = {
        'scriptfile':
        os.path.abspath(os.path.join(settings.DATA_PATH, "script_tasks"))
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override.
        """
        scriptfile = kwargs.get('scriptfile')
        script_data = pickle.load(open(scriptfile))
        script = script_data.tasks[2].voice_lines.value
        speakers = []
        lines = []
        for s in script:
            for (i, l) in enumerate(s):
                if i > 0:
                    previous_line = s[i - 1]['line']
                    previous_speaker = s[i - 1]['speaker']
                else:
                    previous_line = ""
                    previous_speaker = ""

                if i > 1:
                    two_back_speaker = s[i - 2]['speaker']
                else:
                    two_back_speaker = ""

                if len(s) > i + 1:
                    next_line = s[i + 1]['line']
                else:
                    next_line = ""
                current_line = s[i]['line']
                current_speaker = s[i]['speaker']
                lines.append(current_line)
                speakers.append(current_speaker)
                row_data = {
                    'previous_line': previous_line,
                    'previous_speaker': previous_speaker,
                    'next_line': next_line,
                    'current_line': current_line,
                    'current_speaker': current_speaker,
                    'two_back_speaker': two_back_speaker
                }
                self.row_data.append(row_data)
        self.speaker_code_dict = {
            k: i
            for (i, k) in enumerate(list(set(speakers)))
        }
        self.speaker_codes = [self.speaker_code_dict[s] for s in speakers]
        self.max_features = math.floor(MAX_FEATURES / 3)
        self.vectorizer = Vectorizer()
        self.vectorizer.fit(lines, self.speaker_codes, self.max_features)
        prev_features = self.vectorizer.batch_get_features(
            [rd['previous_line'] for rd in self.row_data])
        cur_features = self.vectorizer.batch_get_features(
            [rd['current_line'] for rd in self.row_data])
        next_features = self.vectorizer.batch_get_features(
            [rd['next_line'] for rd in self.row_data])

        self.speaker_code_dict.update({'': -1})
        meta_features = make_df(
            [[
                self.speaker_code_dict[s['two_back_speaker']]
                for s in self.row_data
            ],
             [
                 self.speaker_code_dict[s['previous_speaker']]
                 for s in self.row_data
             ], self.speaker_codes],
            ["two_back_speaker", "previous_speaker", "current_speaker"])
        #meta_features = make_df([[self.speaker_code_dict[s['two_back_speaker']] for s in self.row_data], self.speaker_codes],["two_back_speaker", "current_speaker"])
        train_frame = pd.concat([
            pd.DataFrame(prev_features),
            pd.DataFrame(cur_features),
            pd.DataFrame(next_features), meta_features
        ],
                                axis=1)
        train_frame.index = range(train_frame.shape[0])
        data = {
            'vectorizer': self.vectorizer,
            'speaker_code_dict': self.speaker_code_dict,
            'train_frame': train_frame,
            'speakers': make_df([speakers, self.speaker_codes, lines],
                                ["speaker", "speaker_code", "line"]),
            'data': data,
            'current_features': cur_features,
        }
        return data
Example 22
class GenerateMarkovTracks(Task):
    data = Complex()

    data_format = MusicFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Process midi files."

    args = {
        'non_predictors' : ["labels","label_code","fs","enc","fname","Unnamed: 0"],
        'target_var' : 'label_code',
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override.
        """
        frame1 = pd.read_csv(settings.MIDI_FEATURE_PATH)
        frame2 = pd.read_csv(settings.FEATURE_PATH)

        frame = pd.concat([frame1,frame2],axis=0)
        non_predictors = kwargs.get('non_predictors')
        target = kwargs.get('target_var')

        frame.index = range(frame.shape[0])

        alg = RandomForestTrain()
        good_names = [i for i in frame.columns if i not in non_predictors]
        for c in good_names:
            frame[c] = frame[c].astype(float)

        for c in good_names:
            frame[c] = frame[c].real

        clf = alg.train(np.asarray(frame[good_names]),frame[target],**alg.args)

        evolutions = 2
        track_count = 100
        patterns_to_pick = int(math.floor(track_count/4))
        remixes_to_make = int(math.floor(track_count/4))
        additions_to_make = int(math.floor(track_count/4))
        patterns = generate_patterns(track_count,data)
        for z in xrange(0,evolutions):
            new_quality, quality, patterns = rate_tracks(patterns, clf)
            patterns = patterns[0:patterns_to_pick]
            for i in xrange(0,remixes_to_make):
                patterns.append(remix(random.choice(patterns[:patterns_to_pick]), random.choice(patterns[:patterns_to_pick])))
            #for i in xrange(0,additions_to_make):
            #    patterns.append(add_song(random.choice(patterns[:patterns_to_pick]), random.choice(patterns[:patterns_to_pick])))
            new_patterns = []
            for p in patterns:
                if p not in new_patterns:
                    new_patterns.append(p)
            patterns = new_patterns
            patterns += generate_patterns(track_count - len(patterns), data)
        new_quality, quality, patterns = rate_tracks(patterns, clf)

        feats = []
        for (i,p) in enumerate(patterns):
            time = strftime("%m-%d-%Y-%H%M%S", gmtime())
            fname = time+random.choice(words)+".mid"
            oggpath = write_and_convert(p,fname)
            dat, fs, enc = oggread(oggpath)
            f = process_song(dat[:settings.MUSIC_TIME_LIMIT * fs,:],fs)
            feats.append(f)
        feats = pd.DataFrame(feats)
        feats['label_code'] = [2] * feats.shape[0]
        feats['label'] = ["generated"] * feats.shape[0]
        feats.to_csv(os.path.abspath(os.path.join(settings.DATA_PATH,"generated_midi_features.csv")))

        return data
Example 23
class SubtitleInput(BaseInput):
    """
    Extends baseinput to read simpsons subtitles
    """
    input_format = SimpsonsFormats.subtitle
    help_text = "Reformat simpsons script data."
    namespace = get_namespace(__module__)


    def get_episode_metadata(self, name):
        episode_code = re.search(r"\[\d+\.\d+\]", name).group(0).replace("[","").replace("]","")
        season, episode = episode_code.split(".")
        season = int(season)
        episode = int(episode)
        return season, episode

    def read_input(self, directory, has_header=True):
        """
        directory is a path to a directory with multiple subtitle (.sub/.srt) files
        """

        sub_datafiles = [ f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory,f)) and f.endswith(".sub")]
        all_sub_data = []
        for infile in sub_datafiles:
            stream = open(os.path.join(directory, infile))
            season,episode = self.get_episode_metadata(infile)
            data=stream.read()
            row_data = []
            for (i, row) in enumerate(data.split("\n")):
                row = row.replace('\r','')
                row_split = row.split("}")
                if len(row_split)>2:
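                    #MicroDVD .sub lines store frame numbers; dividing by the frame rate (24 fps here) converts to seconds.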
                    start = float(row_split[0].replace("{",""))/24
                    end = float(row_split[1].replace("{",""))/24
                    line = row_split[2].split("{")[0]
                    if len(row_split[2].split("{"))>1:
                        label = row_split[2].split("{")[1].replace("}","")
                    else:
                        label = ""
                    row_data.append([start,end,line,label,season,episode])
            all_sub_data.append(row_data)
        srt_datafiles = [ f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory,f)) and f.endswith(".srt")]
        for infile in srt_datafiles:
            stream = open(os.path.join(directory, infile))
            season,episode = self.get_episode_metadata(infile)
            data=stream.read()
            row_data = []
            for (i, row) in enumerate(data.split("\r\n\r\n")):
                row_split = row.split("\r\n")
                if len(row_split)>3:
                    timing = row_split[1]
                    start_timing = timing.split("-->")[0].replace(",",".").split(":")
                    start_seconds = float(start_timing[-1]) + float(start_timing[-2])*60 + float(start_timing[-3])*3600
                    start = start_seconds
                    end_timing = timing.split("-->")[1].replace(",",".").split(":")
                    end_seconds = float(end_timing[-1]) + float(end_timing[-2])*60 + float(end_timing[-3])*3600
                    end = end_seconds
                    line = " ".join(row_split[2:])
                    if len(line.split("{"))>1:
                        label = line.split("{")[1].replace("}","")
                        line = line.split("{")[0]
                    else:
                        label = ""
                    row_data.append([start,end,line,label,season,episode])
            all_sub_data.append(row_data)
        sub_data = [["start","end","line","label","season","episode"]] + list(chain.from_iterable(all_sub_data))
        self.data = sub_data
Example 24
class CrossValidate(Task):
    data = Complex()
    results = Complex()
    error = Float()
    importances = Complex()
    importance = Complex()
    column_names = List()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)
    args = {
        'nfolds': 3,
        'algo': RandomForestTrain,
        'target_name': 'label_code',
        'non_predictors': ["label", "line", "label_code"]
    }

    help_text = "Cross validate simpsons data."

    def cross_validate(self, data, **kwargs):
        nfolds = kwargs.get('nfolds', 3)
        algo = kwargs.get('algo')
        seed = kwargs.get('seed', 1)
        self.target_name = kwargs.get('target_name')
        non_predictors = kwargs.get('non_predictors')

        self.column_names = [
            l for l in list(data.columns) if l not in non_predictors
        ]
        data_len = data.shape[0]
        counter = 0
        fold_length = int(math.floor(data_len / nfolds))
        folds = []
        data_seq = list(xrange(0, data_len))
        random.seed(seed)
        random.shuffle(data_seq)

        for fold in xrange(0, nfolds):
            start = counter

            end = counter + fold_length
            if fold == (nfolds - 1):
                end = data_len
            folds.append(data_seq[start:end])
            counter += fold_length

        results = []
        data.index = range(data.shape[0])
        self.importances = []
        for (i, fold) in enumerate(folds):
            predict_data = data.iloc[fold, :]
            out_indices = list(chain.from_iterable(folds[:i] +
                                                   folds[(i + 1):]))
            train_data = data.iloc[out_indices, :]
            alg = algo()
            target = train_data[self.target_name]
            train_data = train_data[[
                l for l in list(train_data.columns) if l not in non_predictors
            ]]
            predict_data = predict_data[[
                l for l in list(predict_data.columns)
                if l not in non_predictors
            ]]
            clf = alg.train(train_data, target, **algo.args)
            results.append(alg.predict(predict_data))
            self.importances.append(clf.feature_importances_)
        return results, folds

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.target_name = kwargs.get('target_name')
        results, folds = self.cross_validate(data, **kwargs)
        self.gather_results(results, folds, data)

    def gather_results(self, results, folds, data):
        full_results = list(chain.from_iterable(results))
        full_indices = list(chain.from_iterable(folds))
        partial_result_df = make_df([full_results, full_indices],
                                    ["result", "index"])
        partial_result_df = partial_result_df.sort(["index"])
        partial_result_df.index = range(partial_result_df.shape[0])
        result_df = pd.concat([partial_result_df, data], axis=1)
        self.results = result_df
        self.calc_importance(self.importances, self.column_names)

    def calc_error(self, result_df):
        self.error = np.mean(
            np.abs(result_df['result'] - result_df[self.target_name]))

    def calc_importance(self, importances, col_names):
        importance_frame = pd.DataFrame(importances)
        importance_frame.columns = col_names
        self.importance = importance_frame.mean(axis=0)
        self.importance.sort(0)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override.
        """
        pass
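A usage sketch (assuming a dataframe frame matching the default args above):

cv = CrossValidate()
cv.train(frame, "", **CrossValidate.args)
print(cv.results.head())  #per-row predictions joined back onto the input
print(cv.importance)      #mean feature importance across folds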
Example 25
class ClusterScriptText(Task):
    data = Complex()
    predictions = Complex()
    clusters = List()
    cl = Complex()
    vec = Complex()
    vec1 = Complex()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cluster simpsons scripts."

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = data
        self.predict(self.data)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override.
        """

        from train import Vectorizer, make_df

        self.vec = Vectorizer()

        reformatter = ReformatScriptText()
        args = reformatter.args
        args['do_replace'] = False
        reformatter.train(data, "", **args)

        script_segments = list(chain.from_iterable(reformatter.voice_lines))
        text = [s['line'] for s in script_segments]
        speaker = [s['speaker'] for s in script_segments]
        unique_speakers = list(set(speaker))
        speaker_code_dict = {k:i for (i,k) in enumerate(unique_speakers)}
        speaker_codes = [speaker_code_dict[k] for k in unique_speakers]
        speaker_list = []
        speaker_frame = make_df([text,speaker],["text","speaker"])
        for i in unique_speakers:
            s_text = "\n".join(list(speaker_frame[speaker_frame['speaker']==i]['text']))
            speaker_list.append(s_text)

        self.vec.fit(speaker_list, speaker_codes, 200,min_features=2)
        features = self.vec.batch_get_features(speaker_list)

        cl = KMeans()
        self.predictions = cl.fit_predict(features)
        self.cl = cl

        for i in xrange(0, max(self.predictions) + 1):
            clust = []
            for c in xrange(0,len(speaker_codes)):
                if self.predictions[c]==i:
                    clust.append(unique_speakers[c])
            self.clusters.append(clust)

        pca = PCA(n_components=2, whiten=True).fit(features)
        rf = pca.transform(features)
        labels = cl.labels_
        pyplot.clf()
        centroids = cl.cluster_centers_
        pyplot.cla()
        for i in range(max(labels) + 1):
            ds = rf[np.where(labels==i)]
            pyplot.plot(ds[:,0],ds[:,1],'o', label=self.clusters[i][0])
        pyplot.legend(loc=8)
        pyplot.savefig('clusters.png')

        self.vec1 = Vectorizer()
        speaker_codes = [speaker_code_dict[k] for k in speaker]

        self.vec1.fit(text, speaker_codes, 200,min_features=2)
        features = self.vec1.batch_get_features(text)

        pca = PCA(n_components=2, whiten=True).fit(features)
        rf = pca.transform(features)
        pyplot.clf()
        pyplot.cla()
        for i in range(len(speaker_codes)):
            pyplot.plot(rf[i,0],rf[i,1],'o', label=speaker[i])
        pyplot.savefig('all_speakers.png')
Example 26
class BaseWorkflow(object):
    """
    Base workflow class
    """
    #Metaclass needed to register the workflow
    __metaclass__ = MetaFieldModel
    #category, namespace, name for the registry
    category = RegistryCategories.base
    namespace = get_namespace(__module__)
    name = __name__.lower()

    #Defines how tasks are run
    runner = import_from_string(settings.RUNNER)
    input_file = ""
    input_format = DataFormats.csv
    target_file = ""
    target_format = DataFormats.csv
    predict_file = ""
    predict_format = DataFormats.csv
    tasks = []
    run_id = ""
    help_text = "Base class for workflow.  Do not use directly."

    def __init__(self, **kwargs):
        #initialize runner.  Don't do this at class level to avoid sharing same runner object.
        self.runner = self.runner()
        self.setup_run = False

    def setup(self):
        #Reformat input data as needed
        self.reformatted_input = self.reformat_input()
        self.setup_run = True

    def find_dependencies(self, task):
        dependencies = task.dependencies
        return dependencies

    def execute_train_task_with_dependencies(self, task_cls, **kwargs):
        """
        Run the training, as well as any dependencies of the training
        task_cls - class of a task
        """
        log.info("Task {0}".format(get_task_name(task_cls)))
        #Instantiate the task
        task_inst = task_cls()
        #Grab arguments from the task instance and set them
        for arg in task_inst.args:
            if arg not in kwargs:
                kwargs[arg] = task_inst.args[arg]
        #Check for dependencies defined by the task
        if hasattr(task_inst, "dependencies"):
            deps = task_inst.dependencies
            dep_results = []
            #Run the dependencies through recursion (in case of dependencies of dependencies, etc)
            for dep in deps:
                log.info("Dependency {0}".format(get_task_name(dep)))
                dep_results.append(
                    self.execute_train_task_with_dependencies(
                        dep.cls, **dep.args))
            trained_dependencies = []
            #Add executed dependency to trained_dependencies list on the task
            for i in xrange(0, len(deps)):
                dep = deps[i]
                dep_result = dep_results[i]
                name = dep.name
                namespace = dep.namespace
                category = dep.category
                trained_dependencies.append(
                    TrainedDependency(category=category,
                                      namespace=namespace,
                                      name=name,
                                      inst=dep))
            task_inst.trained_dependencies = trained_dependencies
        #Finally, run the task
        task_inst.train(**kwargs)
        return task_inst

    def execute_predict_task(self, task_inst, predict_data, **kwargs):
        """
        Do a prediction
        task_inst - instance of a task
        """
        result = task_inst.predict(predict_data, **task_inst.args)
        return result

    def train(self, **kwargs):
        """
        Do the workflow training
        """
        log.info("Starting to train...")
        if not self.setup_run:
            self.setup()
        self.trained_tasks = []
        for task in self.tasks:
            data = self.reformatted_input[task.data_format]['data']
            target = self.reformatted_input[task.data_format]['target']
            if data is None:
                raise Exception(
                    "Data cannot be none.  Check the config file to make sure the right input is being read."
                )
            kwargs['data'] = data
            kwargs['target'] = target
            trained_task = self.execute_train_task_with_dependencies(
                task, **kwargs)
            self.trained_tasks.append(trained_task)
            #If the trained task alters the data in any way, pass it down the chain to the next task
            if hasattr(trained_task, 'data'):
                self.reformatted_input[
                    task.data_format]['data'] = trained_task.data
        log.info("Finished training.")

    def predict(self, **kwargs):
        """
        Do the workflow prediction (done after training, with new data)
        """
        reformatted_predict = self.reformat_predict_data()
        results = {}
        for task_inst in self.trained_tasks:
            predict = reformatted_predict[task_inst.data_format]['predict']
            kwargs['predict'] = predict
            results.update({
                get_task_name(task_inst):
                self.execute_predict_task(task_inst, predict, **kwargs)
            })
        return results

    def find_input(self, input_format):
        """
        Find an input class for a given format
        input_format - see utils.input.dataformats
        """
        input_cls = find_needed_input(input_format)
        return input_cls

    def read_input(self, input_cls, filename, **kwargs):
        """
        Read in input and do some minimal preformatting
        input_cls - the class to use to read the input
        filename - input filename
        """
        input_inst = input_cls()
        input_inst.read_input(filename)
        return input_inst.get_data()

    def reformat_file(self, input_file, input_format, output_format):
        """
        Reformat input data files to a format the tasks can use
        """
        #Return none if input_file or input_format do not exist
        if input_file is None or input_format is None:
            return None
        #Find the needed input class and read the input stream
        try:
            input_cls = self.find_input(input_format)
            input_inst = input_cls()
        except TypeError:
            #Return none if input_cls is a Nonetype
            return None
        #If the input file cannot be found, return None
        try:
            input_inst.read_input(self.absolute_filepath(input_file))
        except IOError:
            return None

        formatter = find_needed_formatter(input_format, output_format)
        if formatter is None:
            raise Exception(
                "Cannot find a formatter that can convert from {0} to {1}".
                format(input_format, output_format))
        formatter_inst = formatter()
        formatter_inst.read_input(input_inst.get_data(), input_format)
        data = formatter_inst.get_data(output_format)
        return data

    def absolute_filepath(self, input_file):
        """
        Gets absolute path of a file
        """
        #abspath needed to avoid relative path issues
        return os.path.abspath(input_file)

    def reformat_predict_data(self, **kwargs):
        reformatted_predict = {}
        for output_format in self.needed_formats:
            reformatted_predict.update({
                output_format: {
                    'predict':
                    self.reformat_file(self.predict_file, self.predict_format,
                                       output_format),
                }
            })
        return reformatted_predict

    def reformat_input(self, **kwargs):
        """
        Reformat input data
        """
        reformatted_input = {}
        needed_formats = []
        for task_cls in self.tasks:
            needed_formats.append(task_cls.data_format)
        self.needed_formats = list(set(needed_formats))

        for output_format in self.needed_formats:
            reformatted_input.update({
                output_format: {
                    'data':
                    self.reformat_file(self.input_file, self.input_format,
                                       output_format),
                    'target':
                    self.reformat_file(self.target_file, self.target_format,
                                       output_format)
                }
            })
        return reformatted_input
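
The train/predict workflow above resolves input readers and formatters from a registry, then converts each file into whatever data formats the tasks declare. A minimal usage sketch of that chain, mirroring reformat_file; the "csv" and "dataframe" format names are illustrative assumptions, not taken from this code:

#Hypothetical sketch: find_needed_input and find_needed_formatter are the
#registry lookups used above; the format names here are assumptions.
input_cls = find_needed_input("csv")
input_inst = input_cls()
input_inst.read_input(os.path.abspath("train.csv"))

formatter_cls = find_needed_formatter("csv", "dataframe")
formatter_inst = formatter_cls()
formatter_inst.read_input(input_inst.get_data(), "csv")
data = formatter_inst.get_data("dataframe")  #Usable by any dataframe task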
Example n. 27
class Validate(Task):
    data = Complex()
    results = Complex()
    error = Float()
    importances = Complex()
    importance = Complex()
    column_names = List()

    data_format = DataFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Validate."

    def cross_validate(self, data, non_predictors, **kwargs):
        nfolds = kwargs.get('nfolds', 3)
        algo = kwargs.get('algo')
        seed = kwargs.get('seed', 1)
        data_len = data.shape[0]
        counter = 0
        fold_length = int(math.floor(data_len/nfolds))
        folds = []
        data_seq = list(xrange(0,data_len))
        random.seed(seed)
        random.shuffle(data_seq)

        for fold in xrange(0, nfolds):
            start = counter

            end = counter + fold_length
            if fold == (nfolds-1):
                end = data_len
            folds.append(data_seq[start:end])
            counter += fold_length

        results = []
        data.index = range(data.shape[0])
        self.importances = []
        for (i,fold) in enumerate(folds):
            predict_data = data.iloc[fold,:]
            out_indices = list(chain.from_iterable(folds[:i] + folds[(i + 1):]))
            train_data = data.iloc[out_indices,:]
            alg = algo()
            target = train_data['next_year_wins']
            train_data = train_data[[l for l in list(train_data.columns) if l not in non_predictors]]
            predict_data = predict_data[[l for l in list(predict_data.columns) if l not in non_predictors]]
            clf = alg.train(train_data,target,**algo.args)
            results.append(alg.predict(predict_data))
            self.importances.append(clf.feature_importances_)
        return results, folds

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        non_predictors = [i.replace(" ", "_").lower() for i in list(set(data['team']))] + ["team", "next_year_wins"]
        self.column_names = [l for l in list(data.columns) if l not in non_predictors]
        results, folds = self.cross_validate(data, non_predictors, **kwargs)
        self.gather_results(results, folds, data)

    def gather_results(self, results, folds, data):
        full_results = list(chain.from_iterable(results))
        full_indices = list(chain.from_iterable(folds))
        partial_result_df = make_df([full_results, full_indices], ["result", "index"])
        partial_result_df = partial_result_df.sort(["index"])
        partial_result_df.index = range(partial_result_df.shape[0])
        result_df = pd.concat([partial_result_df, data[['next_year_wins', 'team', 'year', 'total_wins']]], axis=1)
        result_df = result_df[(result_df['next_year_wins']>0) & (result_df['total_wins']>0)]
        self.results = result_df
        self.calc_error(result_df)
        self.calc_importance(self.importances, self.column_names)

    def calc_error(self, result_df):
        filtered_df = result_df[result_df['year']<np.max(result_df['year'])]
        self.error = np.mean(np.abs(filtered_df['result'] - filtered_df['next_year_wins']))

    def calc_importance(self, importances, col_names):
        importance_frame = pd.DataFrame(importances)
        importance_frame.columns = col_names
        self.importance = importance_frame.mean(axis=0)
        self.importance.sort(0)

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """

        pass
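
The fold construction in cross_validate shuffles the row indices once and slices them into nfolds contiguous chunks, with the final fold absorbing any remainder rows so every row is used exactly once. A standalone sketch of that index logic, outside the task machinery:

import math
import random

def make_folds(data_len, nfolds=3, seed=1):
    #Shuffle row indices once, then cut into nfolds chunks;
    #the last fold takes the remainder
    indices = list(range(data_len))
    random.seed(seed)
    random.shuffle(indices)
    fold_length = int(math.floor(data_len / nfolds))
    folds = []
    for fold in range(nfolds):
        start = fold * fold_length
        end = data_len if fold == (nfolds - 1) else start + fold_length
        folds.append(indices[start:end])
    return folds

#e.g. make_folds(10, 3) yields folds of sizes 3, 3, and 4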
Example n. 28
class LoadAudioFiles(Task):
    data = Complex()
    all_files = List()
    seq = Complex()
    res = Complex()
    label_codes = Dict()

    data_format = SimpsonsFormats.dataframe

    category = RegistryCategories.preprocessors
    namespace = get_namespace(__module__)

    help_text = "Cleanup simpsons scripts."

    args = {
        'audio_dir': settings.AUDIO_DIR,
        'timeout': 600,
        'only_labelled_lines': settings.ONLY_LABELLED_LINES,
        'processed_files_limit': settings.PROCESSED_FILES_LIMIT
    }

    def train(self, data, target, **kwargs):
        """
        Used in the training phase.  Override.
        """
        self.data = self.predict(data, **kwargs)

    def extract_season(self, name):
        #Match filenames like "[3x12]" or "[3.12]"
        match1 = re.search(r'\[(\d+)[x\.](\d+)\]', name)
        if match1 is not None:
            season = match1.group(1)
            episode = match1.group(2)
            return int(season), int(episode)

        #Match filenames like "S03E12"
        match2 = re.search(r'S(\d+)E(\d+)', name)
        if match2 is not None:
            season = match2.group(1)
            episode = match2.group(2)
            return int(season), int(episode)

        return None, None
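
    #Usage sketch (hypothetical filenames, not from this code):
    #  extract_season("The Simpsons [3x12].ogg") -> (3, 12)
    #  extract_season("simpsons.S03E12.ogg") -> (3, 12)
    #  extract_season("no_episode_marker.ogg") -> (None, None)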

    def predict(self, data, **kwargs):
        """
        Used in the predict phase, after training.  Override
        """
        p = Pool(4, maxtasksperchild=50)
        audio_dir = kwargs['audio_dir']
        timeout = kwargs['timeout']
        oll = kwargs['only_labelled_lines']
        pff = kwargs['processed_files_limit']

        all_files = []
        for ad in os.listdir(audio_dir):
            ad_path = os.path.abspath(os.path.join(audio_dir, ad))
            if os.path.isdir(ad_path):
                files = os.listdir(ad_path)
                all_files += [
                    os.path.abspath(os.path.join(ad_path, f)) for f in files
                ]
            else:
                all_files += [ad_path]
        self.all_files = [f for f in all_files if f.endswith(".ogg")]
        frames = []
        counter = 0
        for f in self.all_files:
            season, episode = self.extract_season(f)
            #Skip files with no parseable season/episode, and one specifically excluded episode
            if season is None or (season == 11 and episode == 6):
                continue
            subtitle_frame = data[((data['season'] == season) &
                                   (data['episode'] == episode))]
            if subtitle_frame.shape[0] == 0:
                continue

            #Optional filters that can end the loop early; remove if not needed
            if oll:
                label_frame = subtitle_frame[(subtitle_frame['label'] != "")]
                if label_frame.shape[0] == 0:
                    continue
            if pff is not None and isinstance(pff, int) and counter >= pff:
                break

            counter += 1
            log.info("On file {0} Season {1} Episode {2}".format(
                counter, season, episode))
            f_data, fs, enc = oggread(f)
            subtitle_frame = subtitle_frame.sort('start')
            subtitle_frame.index = range(subtitle_frame.shape[0])
            samps = []
            good_rows = []
            for i in xrange(0, subtitle_frame.shape[0]):
                start = subtitle_frame['start'].iloc[i]
                end = subtitle_frame['end'].iloc[i]
                #Skip clips longer than 6 seconds, and unlabelled lines when only_labelled_lines is set
                if end - start > 6 or (subtitle_frame['label'].iloc[i] == ''
                                       and oll):
                    continue
                #Slice out the audio samples covered by this subtitle line
                samp = f_data[int(start * fs):int(end * fs), :]
                samps.append({'samp': samp, 'fs': fs})
                good_rows.append(i)
            r = p.imap(process_subtitle, samps, chunksize=1)
            sf = subtitle_frame.iloc[good_rows]
            results = []
            for i in range(len(samps)):
                try:
                    results.append(r.next(timeout=timeout))
                except TimeoutError:
                    results.append(None)
            good_rows = [
                i for i in xrange(0, len(results)) if results[i] is not None
            ]
            audio_features = [i for i in results if i is not None]
            good_sf = sf.iloc[good_rows]
            good_sf.index = range(good_sf.shape[0])
            audio_frame = pd.DataFrame(audio_features)
            audio_frame.index = range(audio_frame.shape[0])
            df = pd.concat([good_sf, audio_frame], axis=1)
            df = df.fillna(-1)
            df.index = range(df.shape[0])
            frames.append(df)
            lab_df_shape = df[df['label'] != ''].shape[0]
            log.info("Processed {0} lines, {1} of which were labelled".format(
                df.shape[0], lab_df_shape))
        p.close()
        p.join()
        log.info("Done processing episodes.")
        data = pd.concat(frames, axis=0)
        data.index = range(data.shape[0])

        for c in list(data.columns):
            data[c] = data[c].real
        for k in CHARACTERS:
            for i in CHARACTERS[k]:
                #Map each character alias onto its canonical label
                data.loc[data['label'] == i, 'label'] = k
        self.label_codes = {k: i for (i, k) in enumerate(set(data['label']))}
        reverse_label_codes = {
            self.label_codes[k]: k
            for k in self.label_codes
        }
        data['label_code'] = [self.label_codes[k] for k in data['label']]
        self.seq = SequentialValidate()

        #Do cv to get error estimates
        cv_frame = data[data['label'] != ""]
        self.seq.train(cv_frame, **self.seq.args)
        self.res = self.seq.results
        self.res = self.res[[
            'line', 'label', 'label_code', 'result_code', 'result_label'
        ]]

        exact_percent, adj_percent = compute_error(self.res)
        log.info("Exact match percent: {0}".format(exact_percent))
        log.info("Adjacent match percent: {0}".format(adj_percent))
        #Train on the labelled lines, then predict labels for the full frame
        alg = RandomForestTrain()
        target = cv_frame['label_code']
        non_predictors = ["label", "line", "label_code"]
        train_names = [
            l for l in list(cv_frame.columns) if l not in non_predictors
        ]
        train_data = cv_frame[train_names]
        predict_data = data[train_names]
        clf = alg.train(train_data, target, **alg.args)
        data['result_code'] = alg.predict(predict_data)
        data['result_label'] = [
            reverse_label_codes[k] for k in data['result_code']
        ]
        return data
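
The predict method above relies on Pool.imap plus a per-result timeout so one hung file cannot stall the whole run; timed-out items come back as None and are filtered out while keeping row alignment. A minimal standalone sketch of that pattern (slow_feature is a stand-in for process_subtitle):

from multiprocessing import Pool, TimeoutError

def slow_feature(item):
    #Stand-in for process_subtitle: any per-item feature extraction
    return item * 2

if __name__ == "__main__":
    p = Pool(4, maxtasksperchild=50)
    items = list(range(10))
    r = p.imap(slow_feature, items, chunksize=1)
    results = []
    for _ in items:
        try:
            #next() blocks until the next result arrives or the timeout hits
            results.append(r.next(timeout=600))
        except TimeoutError:
            #Keep a placeholder so positions still line up with the inputs
            results.append(None)
    p.close()
    p.join()
    good = [x for x in results if x is not None]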