Example #1
 def test_spelling_correction(self):
     prep2 = Preprocessor(corpus1, spellcorrect=True)
     corpus = prep2.preprocessed_corpus()[0]
     self.assertEqual(corpus[0], ["sentences", "word", "<positiveemoji>", "butterfly", "<positiveemoji>"])
Example #2
 def test_lemmatization(self):
     prep3 = Preprocessor(corpus1, lemmatize=True)
     corpus = prep3.preprocessed_corpus()[0]
     self.assertEqual(corpus[1], ['this', 'be', 'the', 'longest', 'sentence', 'in', 'the', 'corpus'])
Example #3
    #    return [sent.string.strip() for sent in doc.sents]

    #time.sleep(0.5)
    predict_df[data_columns] = predict_df[data_columns].progress_apply(
        lambda x: sent_tokenize(x))
    predict_df = predict_df.explode(data_columns)
    predict_df = predict_df.reset_index(drop=True)
    predict_df = predict_df.reset_index(drop=False)

    ## do the preprocessing
    print("Preprocess")
    preprocessor = Preprocessor(
        doLower=args["doLower"],
        doLemmatization=args["doLemmatization"],
        removeStopWords=args["removeStopWords"],
        doSpellingCorrection=args["doSpellingCorrection"],
        removeNewLine=args["removeNewLine"],
        removePunctuation=args["removePunctuation"],
        removeHtmlTags=args["removeHtmlTags"],
        minTextLength=args["minTextLength"])
    predict_df["processed"] = preprocessor.fit_transform(
        predict_df["text_german"])
    predict_df = predict_df.dropna(subset=["processed"], axis=0)

    print("Tokenize")
    tokenizer = Tokenizer(tokenizeStr=preperation_technique,
                          ngram=preperation_ngram,
                          fasttextFile=args["fasttext_file"],
                          doLower=args["doLower"])
    predict_df["processed"] = tokenizer.fit_transform(predict_df["processed"])
Example #4
def main(model_name,
         batch_size=64,
         model_params=None,
         preprocessing_params=None,
         test_run=False):
    model_wrapper = model_dict[model_name](*model_params)
    preprocessor = Preprocessor(**preprocessing_params)

    model_dir = os.path.join('test_runs' if test_run else 'output',
                             model_wrapper.path(), preprocessor.path())
    success_path = os.path.join(model_dir, 'history')

    if not os.path.exists(success_path):

        best_model_path = os.path.join(model_dir, 'model-best.hdf5')
        shutil.rmtree(model_dir, ignore_errors=True)
        os.makedirs(model_dir)
        model = model_wrapper.get_model()

        callbacks = [
            ModelCheckpoint(best_model_path,
                            monitor='val_loss',
                            verbose=1,
                            save_best_only=True,
                            mode='min'),
            EarlyStopping(patience=1 if test_run else 5, verbose=1)
        ]

        if test_run:
            print('test run')
            train_generator = preprocessor.get_train_generator(
                vs.TINY_TRAIN_DIR, batch_size)
            val_generator = preprocessor.get_test_generator(
                vs.TINY_VALIDATION_DIR, batch_size)
            test_generator = preprocessor.get_test_generator(
                vs.TINY_TEST_DIR, batch_size)
            print('class indices')
            print(train_generator.class_indices)
        else:
            print('true run')
            train_generator = preprocessor.get_train_generator(
                vs.TRAIN_DIR, batch_size)
            val_generator = preprocessor.get_test_generator(
                vs.VALIDATION_DIR, batch_size)
            test_generator = preprocessor.get_test_generator(
                vs.TEST_DIR, batch_size)

        train_samples = train_generator.samples
        val_samples = val_generator.samples
        history = model.fit_generator(
            train_generator,
            train_samples // batch_size,
            validation_data=val_generator,
            validation_steps=val_samples // batch_size + 1,
            epochs=100,
            callbacks=callbacks).history
        with open(success_path, 'wb') as f:
            pickle.dump(history, f)

        model = load_model(best_model_path)
        make_submission(model, test_generator,
                        os.path.join(model_dir, 'submission.csv'))

    with open(success_path, 'rb') as f:
        history = pickle.load(f)

    print(model_wrapper)
    print(preprocessor)
    print('best loss %.3f' % min(history['val_loss']))
    print('best accuracy %.3f' % max(history['val_acc']))
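
# A hypothetical invocation (not in the original snippet): the available keys of
# model_dict and the expected shapes of model_params / preprocessing_params are
# assumptions here, so adapt them to the actual project.
if __name__ == '__main__':
    main('vgg16',
         batch_size=32,
         model_params=(),
         preprocessing_params={},
         test_run=True)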
Example #5
        -------
        y_pred : ndarray, shape (n_samples,)
            The predicted target.
        """
        return self.model.predict(X)


if __name__ == "__main__":

    # load data
    print('Loading data...')
    data = build_dataset('train')
    train_data, valid_data = train_test_split(data,
                                              test_size=0.2,
                                              random_state=42)
    preprocessor = Preprocessor()
    preprocessor.fit(train_data)
    train_data = preprocessor.transform(train_data)
    valid_data = preprocessor.transform(valid_data)
    save_dataset(pd.concat([train_data, valid_data]), 'train_preprocessed.csv')

    X_train = train_data.drop(['Sales', 'Customers'], axis=1)
    X_valid = valid_data.drop(['Sales', 'Customers'], axis=1)
    y_train = train_data['Sales']
    y_valid = valid_data['Sales']

    print('Training model on', len(X_train), 'samples')
    print('Validating model on', len(X_valid), 'samples')
    print('Training model on features: ', X_train.columns.tolist())

    # model selection with grid search
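
    # The snippet is cut off after this comment. Below is a minimal sketch of what
    # the grid search could look like; RandomForestRegressor and the parameter grid
    # are illustrative assumptions, not the model actually used by the project.
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

    param_grid = {'n_estimators': [100, 300], 'max_depth': [8, 12, None]}
    search = GridSearchCV(RandomForestRegressor(random_state=42),
                          param_grid,
                          scoring='neg_root_mean_squared_error',
                          cv=3,
                          n_jobs=-1)
    search.fit(X_train, y_train)
    print('Best parameters:', search.best_params_)
    print('Validation score (neg RMSE):', search.score(X_valid, y_valid))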
Example #6
import codecs  # used below to read the corpus with an explicit encoding
import numpy as np
from multinomialNB import MultinomialNB
from preprocessing import Preprocessor
from Scorer.scorer import BinaryScorer
from matplotlib import pyplot as plt

# Open file and read content in a variable.
# Couldn't use standard python way of opening files due to ASCII decode errors.
raw = codecs.open('./SMSSpamCollection.txt', 'r', encoding='utf-8').readlines()

# Create a Multinomial Naive Bayes Classifier, in this case we only have 2 classes
model = MultinomialNB()

# Preprocess, Tokenize and Split data in train and test
# IMPORTANT: Unless seed parameter is removed from call, the split will always be the same.
x_tr, y_tr, x_ts, y_ts = Preprocessor(data=raw).preprocess().tokenize().split(
    percentage_train=0.8, seed=555, shuffle=True, functional=False)

# Fit the model
model.fit(x_tr, y_tr)

print('Fit complete')

# Predict train and test values
pred_tr = model.predict(x_tr, alpha=0.1, voc_size=20000)
pred_ts = model.predict(x_ts, alpha=0.1, voc_size=20000)

# Print train and test predictions performance
train_scores = BinaryScorer(y_tr, pred_tr, description='Training').describe()
test_scores = BinaryScorer(y_ts, pred_ts, description='Testing').describe()

# Create list of alphas with different i value
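
# The snippet is truncated here. A hedged sketch of how the alpha sweep might
# continue; it only assumes that model.predict returns labels comparable to y_ts.
alphas = [0.001 * (10 ** i) for i in range(5)]  # 0.001, 0.01, 0.1, 1.0, 10.0
test_accuracies = []
for a in alphas:
    preds = model.predict(x_ts, alpha=a, voc_size=20000)
    correct = sum(1 for p, t in zip(preds, y_ts) if p == t)
    test_accuracies.append(correct / float(len(y_ts)))

# Plot accuracy against the smoothing parameter on a log scale
plt.plot(alphas, test_accuracies, marker='o')
plt.xscale('log')
plt.xlabel('alpha (smoothing)')
plt.ylabel('test accuracy')
plt.show()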
Example #7
    def __init__(self, g):
        self.G = deepcopy(g)
        self.queryCount = 0
        # Do pre-processing and get the initial query set
        p = Preprocessor(self.G)
        self.Q = deepcopy(p.query)
        # Find the lower limit tree
        self.Tl = deepcopy(p.Tl)
        # Find the upper limit tree
        self.Tu = deepcopy(p.Tu)
        g = list(deepcopy(self.Tu))
        g.sort(key=lambda x: -x.upper)
        # g is the set of edges in upper limit tree sorted in descending order of upper limits
        component = {}

        # DFS Function to traverse in the tree
        def dfs(u, par, adj, c):
            component[u] = c
            for v in adj[u]:
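                # v.u + v.v - u yields the endpoint of edge v opposite to u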
                if v.u + v.v - u != par:
                    dfs(v.u + v.v - u, u, adj, c)

        # Function to check whether the edge set contains an always-minimal edge
        def check(edgeSet):
            if len(edgeSet) <= 1:
                return False
            lowers = []
            for edge in edgeSet:
                lowers.append(edge.lower)
            lowers.sort()
            for edge in edgeSet:
                if edge.lower == lowers[0] and (edge.trivial or edge.upper <= lowers[1]):
                    return False
            return True

        # Go through each edge in g
        for edge in g:
            # Construct the graph in adjacency list
            adj = [[] for _ in range(self.G.size + 1)]
            erased = None
            for edge2 in self.Tu:
                if {edge2.u, edge2.v} == {edge.u, edge.v}:
                    erased = edge2
            self.Tu.remove(erased)
            for edge2 in self.Tu:
                adj[edge2.u].append(edge2)
                adj[edge2.v].append(edge2)
            # Do two DFS - once from each component
            component.clear()
            dfs(edge.u, -1, adj, 0)
            dfs(edge.v, -1, adj, 1)
            # C stores the edge set denoting the "cut" created
            C = []
            for edge2 in self.G.edges:
                if component[edge2.u] != component[edge2.v]:
                    C.append(edge2)
            # While C does not contain an always-minimal edge
            while check(C):
                firstLower = 1e9
                firstInd = 0
                # Find the two edges with minimum lower limits
                for i in range(len(C)):
                    if C[i].lower < firstLower:
                        firstLower = C[i].lower
                        firstInd = i
                secondLower = 1e9
                secondInd = 0
                firstEdge = C[firstInd]
                for i in range(len(C)):
                    if i == firstInd:
                        continue
                    if C[i].lower < secondLower:
                        secondLower = C[i].lower
                        secondInd = i
                secondEdge = C[secondInd]
                assert secondEdge.lower <= firstEdge.upper
                # If the first edge is not trivial, then query it
                if not firstEdge.trivial:
                    self.Q.add(deepcopy(firstEdge))
                    self.G.query(firstEdge)
                    C.remove(firstEdge)
                    firstEdge.lower = firstEdge.actual
                    firstEdge.upper = firstEdge.actual
                    firstEdge.trivial = True
                    C.append(firstEdge)
                # If the second edge is not trivial, then query it
                if not secondEdge.trivial:
                    self.Q.add(deepcopy(secondEdge))
                    self.G.query(secondEdge)
                    C.remove(secondEdge)
                    secondEdge.lower = secondEdge.actual
                    secondEdge.upper = secondEdge.actual
                    secondEdge.trivial = True
                    C.append(secondEdge)

            # If an always-minimal edge is found in the cut, add it back to the Upper Limit Tree
            if len(C):
                if len(C) == 1:
                    self.Tu.add(C[0])
                else:
                    lowers = []
                    for edge in C:
                        lowers.append(edge.lower)
                    lowers.sort()
                    for edge in C:
                        if edge.lower == lowers[0] and (edge.trivial or edge.upper <= lowers[1]):
                            self.Tu.add(edge)
                            break
Example #8
from preprocessing import Preprocessor
from nn import train
import torch
import torch.nn as nn

p = Preprocessor(dimensions=2)
data, focus_words, contexts = p.run()

model = train(input_dimension=len(p.vocabulary),
              embedding_dimension=2,
              learning_rate=0.1,
              focus_words=focus_words,
              contexts=contexts)


# check for similarity between batman and wayne vs joker and wayne
def who_is_wayne(m):
    w1 = model.layer1.weight
    # stack to merge rows like row_1 -> col_1 with row_2 -> col_2 as a single row
    # print w1 in case you need an explanation!
    w1 = torch.stack((w1[0], w1[1]), dim=1)

    b1 = model.layer1.bias

    # word vectors we're looking for from word2vec
    # are actually the backprop updated weights and biases of the
    # hidden layer
    vectors = w1 + b1

    print("")
    print("Preparing vectors")
Example #9
class PostprocessingWorker(threading.Thread):
    """ Python script for Postprocessing worker... runs until cancelled or till max waiting time
    """

    pause_time = 2
    max_waiting_time = 60 * 60  # 60seconds * 60min = 1 hour in seconds
    base_path = ""
    saves_path = "mating_progress/"
    pickle_prefix = ""
    get_save = False
    pop_path = "population/"
    traces_path = "traces_afterVox/"
    traces_backup_path = "traces_afterVox_backup/"
    traces_during_pp_path = "traces_duringPP/"
    traces_after_pp_path = "traces_afterPP/"
    debug = False
    db = None
    queue = []
    vox_preamble = 8  # number of lines that voxelyze adds before the actual output in a trace file
    config = ConfigParser.RawConfigParser()
    arena_x = 0
    arena_y = 0
    arena_type = ""
    end_time = 0
    timeTolerance = 0.0  # maximum mating time distance
    spaceTolerance = 0.01  # maximum mating distance radius
    one_child = False
    infertile_birth = False
    infertile_birth_percent = 0.1
    area_birthcontrol = False
    area_birthcontrol_radius = 0.05
    area_birthcontrol_cutoff = 25
    population_cap = False
    pp = Preprocessor()
    indiv_max_age = 0
    indiv_infertile = False
    indiv_infertile_span = 0.25
    random_birth_place = False
    queue_length = 1
    timestep = 0.002865
    pick_from_pool = False

    def readConfig(self, config_path):
        self.config.read(config_path)
        self.exp_name = self.config.get('Experiment', 'name')
        self.path_prefix = self.config.get('Experiment', 'path_prefix')
        self.debug = self.config.get('Experiment', 'debug')
        self.end_time = self.config.getfloat('Experiment', 'end_time')

        self.base_path = os.path.expanduser(self.path_prefix +
                                            self.exp_name) + "/"
        self.queue_length = self.config.getint('Postprocessing', 'queue_len')
        self.pop_path = self.config.get('Postprocessing', 'pop_path')
        self.traces_path = self.config.get('Postprocessing', 'traces_path')
        self.traces_backup_path = self.config.get('Postprocessing',
                                                  'traces_backup_path')
        self.traces_during_pp_path = self.config.get('Postprocessing',
                                                     'traces_during_pp_path')
        self.traces_after_pp_path = self.config.get('Postprocessing',
                                                    'traces_after_pp_path')
        self.vox_preamble = self.config.getint('Postprocessing',
                                               'vox_preamble')
        self.timestep = self.config.getfloat('Postprocessing', 'timestep')

        self.pause_time = self.config.getint('Workers', 'pause_time')
        self.max_waiting_time = self.config.getint('Workers',
                                                   'max_waiting_time')

        self.timeTolerance = self.config.getfloat('Mating', 'timeTolerance')
        self.spaceTolerance = self.config.getfloat('Mating', 'spaceTolerance')
        self.indiv_infertile = self.config.getboolean('Mating',
                                                      'indiv_infertile')
        self.indiv_infertile_span = self.config.getfloat(
            'Mating', 'indiv_infertile_span')
        self.one_child = self.config.getboolean('Mating',
                                                'onlyOneChildPerParents')
        self.infertile_birth = self.config.getboolean('Mating',
                                                      'infertileAfterBirth')
        self.infertile_birth_percent = self.config.getfloat(
            'Mating', 'infertileAfterBirthPercentage')
        self.area_birthcontrol = self.config.getboolean(
            'Mating', 'areaBirthControl')
        self.area_birthcontrol_radius = self.config.getfloat(
            'Mating', 'areaBirthControlRadius')
        self.area_birthcontrol_cutoff = self.config.getfloat(
            'Mating', 'areaBirthControlCutoff')
        self.population_cap = self.config.getboolean('Mating', 'populationCap')
        self.random_birth_place = self.config.getboolean(
            'Mating', 'randomBirthPlace')
        self.pick_from_pool = self.config.getboolean('Mating', 'pickFromPool')

        self.arena_x = self.config.getfloat('Arena', 'x')
        self.arena_y = self.config.getfloat('Arena', 'y')
        self.arena_type = self.config.get('Arena', 'type')

        self.indiv_max_age = self.config.getfloat('Population',
                                                  'indiv_max_age')

    def __init__(self, dbParams, config_path):
        threading.Thread.__init__(self)
        self.db = DB(dbParams[0], dbParams[1], dbParams[2], dbParams[3])
        self.readConfig(config_path)

        self.stopRequest = threading.Event()

    def run(self):
        """ main thread function
        :return: None
        """
        waitCounter = 0
        startTime = time.time()

        obs_path = os.path.normpath(self.base_path + self.traces_path)

        while not self.stopRequest.isSet():  # and waitCounter < self.max_waiting_time
            self.dirCheck(obs_path)

            if (len(self.queue) > 0):
                print('PP:', list(map(self.getIDfromTrace, self.queue)))
                self.queue = sorted(
                    self.queue, key=lambda id: int(self.getIDfromTrace(id)))
                item = self.queue[0]
                self.queue = self.queue[1:]
                if self.debug:
                    print "PP: working on id", item
                self.markAsVoxelyzed(item)
                self.moveFilesToTmp(item)
                self.adjustTraceFile(item)
                self.traceToDatabase(item)
                self.findMates(item)
                babies = self.calculateOffspring(item)
                self.makeBabies(babies)
                self.moveFilesToFinal(item)
                self.markAsPostprocessed(item)
                waitCounter = 0
            else:
                if (self.debug):
                    print("PP: found nothing")
                waitCounter += time.time() - startTime
                startTime = time.time()

                jobsRunning = self.db.getJobsWaitingCount()

                if (self.debug):
                    print("PP: {n} jobs currently waiting in LISA queue...".
                          format(n=jobsRunning))
                    print("PP: sleeping now for " + str(self.pause_time) + "s")

                self.stopRequest.wait(self.pause_time)

        print("PP: got exit signal... cleaning up")

    def join(self, timeout=None):
        """ function to terminate the thread (softly)
        :param timeout: not implemented yet
        :return: None
        """
        if (self.debug):
            print("PP: got kill request for thread")
        self.stopRequest.set()
        super(PostprocessingWorker, self).join(timeout)

    def getIDfromTrace(self, file_path):
        path, filename = os.path.split(file_path)
        name_parts = filename.split(".")
        return name_parts[0]

    def dirCheck(self, path):
        """ upon start check if there are files in the target diretory, because the watcher only notices files being moved there while running
        :return: None
        """
        unprocessed = [
            os.path.join(path, f) for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f)) and f.endswith('.trace')
        ]
        for todo in unprocessed:
            if todo not in self.queue:
                self.addFile(todo)

    def markAsVoxelyzed(self, todo):
        """ mark all the individuals as voxelyzed, i.e. as successfully processed by Voxelyze
        :param todos: list of strings with trace file paths
        :return: None
        """
        id = self.getIDfromTrace(todo)
        self.db.markAsVoxelyzed(id)
        self.db.setJobDone(id)

    def markAsPostprocessed(self, todo):
        """ mark all the individuals as postprocessed, i.e. all offspring has been calculated, files have been moved and the individuals are basically done
        :param todos: list of strings with trace file paths
        :return: None
        """
        id = self.getIDfromTrace(todo)
        self.db.markAsPostprocessed(id)
        self.db.setFinalTime(id)

    def adjustTraceFile(self, todo):
        """ put the individuals into an arena, correct their coordinates, etc.
        :param todos: list of strings with the individual trace filepaths
        :return: None
        """

        id = self.getIDfromTrace(todo)
        # get initial coordinates from DB
        indiv = self.db.getIndividual(id)
        first_trace = self.db.getFirstTrace(id)
        self.pp.addStartingPointArenaAndTime(self.getPathDuringPP(id),
                                             self.vox_preamble, self.arena_x,
                                             self.arena_y, self.arena_type,
                                             first_trace["x"],
                                             first_trace["y"], indiv["born"],
                                             self.end_time, self.timestep)

    def traceToDatabase(self, todo):
        """ put the individuals into the database
        :param todos: list of strings with the individual trace filepaths
        :return: None
        """

        id = self.getIDfromTrace(todo)
        with open(self.getPathDuringPP(id), 'r') as inputFile:
            traces = []

            fileAsList = inputFile.readlines()
            fileLen = len(fileAsList)
            for i in range(0, fileLen):
                fertile = 1
                if (self.infertile_birth):
                    if (i <= self.infertile_birth_percent * fileLen):
                        fertile = 0
                traceLine = fileAsList[i].split()
                traces.append([
                    id, traceLine[1], traceLine[2], traceLine[3], traceLine[4],
                    fertile
                ])
        if (len(traces) == 0):
            print(
                "PP-WARNING: individual {indiv} has 0 traces, so skipping... please check this though!"
                .format(len=len(traces), indiv=id))
        else:
            if (self.debug):
                print("PP: adding {len} traces for individual {indiv} to DB".
                      format(len=len(traces), indiv=id))
            self.db.addTraces(id, traces)

    def getPotentialBirthplace(self, parent1, parent2):
        x = (parent1["x"] + parent2["x"]) / 2
        y = (parent1["y"] + parent2["y"]) / 2
        return [x, y]

    def filterGlobalInfertility(self, id, mates):
        pass

    def filterIncestControl(self, id, mates):
        pass

    def filterAreaBirthControl(self, id, mates):
        pass

    def filterPopulationCap(self, id, mates):
        if len(mates) > 0:
            mate = random.choice(mates)
        else:
            if not self.pick_from_pool:  # then we mate the individual with itself
                lastTrace = self.db.getLastTrace(id)
                mate = {}
                mate["id"] = 0
                mate["indiv_id"] = id
                mate["ltime"] = lastTrace["ltime"]
                mate["x"] = lastTrace["x"]
                mate["y"] = lastTrace["y"]
                mate["z"] = lastTrace["z"]
                mate["mate_id"] = 0
                mate["mate_indiv_id"] = id
                mate["mate_ltime"] = lastTrace["ltime"]
                mate["mate_x"] = lastTrace["x"]
                mate["mate_y"] = lastTrace["y"]
                mate["mate_z"] = lastTrace["z"]
            else:
                return [None]
        return [mate]

    def calculateOffspring(self, todo):
        """ yeah, well... generate offspring, calculate where the new individuals met friends on the way
        :param todos: list of strings with the individual IDs
        :return: list of babies to make
        """

        babies = []

        if (not os.path.exists(todo)) or os.path.getsize(todo) == 0:
            return babies
        id = self.getIDfromTrace(todo)
        if self.debug:
            print("PP: looking for mates for individual {indiv}...".format(
                indiv=id))
        mates = self.db.getMates(id)

        # population cap is exclusive - if it is on, no other control works
        if self.population_cap:
            mates = self.filterPopulationCap(id, mates)
        else:
            if self.indiv_infertile:
                mates = self.filterGlobalInfertility(id, mates)
            if self.one_child:
                mates = self.filterIncestControl(id, mates)
            if self.area_birthcontrol:
                mates = self.filterAreaBirthControl(id, mates)
        if mates != [
                None
        ]:  # this happens only if self.pick_from_pool is True and if no mate was found
            babies += self.matesToBabies(id, mates)
        else:
            randomMate = self.db.getRandomMate(id)
            babies += self.matesToBabies(randomMate["id"], [randomMate])
        return babies

    def close_in_time(self, t1, t2):
        return abs(t1['ltime'] - t2['ltime']) <= self.timeTolerance

    def close_in_space(self, t1, t2):
        return math.sqrt((t1['x'] - t2['x'])**2 +
                         (t1['y'] - t2['y'])**2) <= self.spaceTolerance

    def findMates(self, indiv_path):
        id = self.getIDfromTrace(indiv_path)
        traces = self.db.getTraces(id)
        territory = self.db.getTerritory(id)
        lifetime = self.db.getLifetime(id)
        if not all(territory.values()) or not all(lifetime.values()):
            return
        possibleMates = self.db.getPossibleMates(id, territory, lifetime)
        mates = []
        for t in traces:
            for p in possibleMates:
                if self.close_in_time(t, p) and self.close_in_space(t, p):
                    mates.append((t, p))
        print('PP: found', len(mates), 'possible mates for individual', id)
        self.db.insertMates(mates)

    def matesToBabies(self, id, mates):
        babies = []
        for mate in mates:
            parent2 = {}
            parent2["id"] = mate["mate_id"]
            parent2["indiv_id"] = mate["mate_indiv_id"]
            parent2["ltime"] = mate["mate_ltime"]
            parent2["x"] = mate["mate_x"]
            parent2["y"] = mate["mate_y"]
            parent2["z"] = mate["mate_z"]
            babies.append([mate, parent2, mate["ltime"]])
        return babies

    def makeBabies(self, babies):
        for baby in babies:
            self.db.makeBaby(baby[0], baby[1], baby[2], self.one_child,
                             self.indiv_max_age * self.indiv_infertile_span,
                             self.arena_x, self.arena_y,
                             self.random_birth_place)

    def getPathDuringPP(self, id):
        return self.base_path + self.traces_during_pp_path + str(id) + ".trace"

    def moveFilesToTmp(self, indiv):
        """ once all preprocessing is done, move the files to their target destination
        :param todos: list of strings with the individual IDs
        :return: None
        """
        id = self.getIDfromTrace(indiv)
        try:
            shutil.copy2(
                indiv,
                self.base_path + self.traces_backup_path + str(id) + ".trace")
            shutil.copy2(indiv, self.getPathDuringPP(id))
        except:
            pass

    def moveFilesToFinal(self, indiv):
        """ once all preprocessing is done, move the files to their target destination
        :param todos: list of strings with the individual IDs
        :return: None
        """
        id = self.getIDfromTrace(indiv)
        if os.path.isfile(self.getPathDuringPP(id)):
            shutil.move(
                self.getPathDuringPP(id), self.base_path +
                self.traces_after_pp_path + str(id) + ".trace")
        if os.path.isfile(indiv):
            os.remove(indiv)

    def addFile(self, path):
        self.queue.append(path)
        if args["validation_split"]:
            train_df, val_df = train_test_split(
                train_df, test_size=args["validation_split"], random_state=42)
        else:
            logging.error("vaidation_split needs to be given.")
            sys.exit("vaidation_split needs to be given.")

        ## get data and train columns
        data_column = list(set(train_df.columns) - set(args["targets"]))[0]

        ## do the preprocessing
        print("Preprocess")
        preprocessor = Preprocessor(
            doLower=args["doLower"],
            doLemmatization=args["doLemmatization"],
            removeStopWords=args["removeStopWords"],
            doSpellingCorrection=args["doSpellingCorrection"],
            removeNewLine=args["removeNewLine"],
            removePunctuation=args["removePunctuation"])
        train_df[data_column] = preprocessor.fit_transform(
            train_df[data_column])
        val_df[data_column] = preprocessor.transform(val_df[data_column])
        test_df[data_column] = preprocessor.transform(test_df[data_column])

        ## save the preprocessed data
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        train_df.to_pickle(train_pre_path)
        val_df.to_pickle(val_pre_path)
        test_df.to_pickle(test_pre_path)
    else:
Example #11
import pandas as pd

from preprocessing import Preprocessor
# from attention import AttentionLayer
import tensorflow as tf
keras = tf.keras
from keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

if __name__ == "__main__":
    path = "/Users/seungyoungoh/workspace/text_summarization_project/"
    data = pd.read_csv(path + "/data/sample.csv", error_bad_lines=False)
    data = data.rename({
        'body': 'src',
        'key_point': 'smry'
    }, axis='columns')[['src', 'smry']]
    pr = Preprocessor(data)
    src_max_len, smry_max_len, src_vocab, smry_vocab, X_train, X_test, y_train, y_test = pr.preprocess(
    )

    # ### modeling
    # embedding_dim = 128
    # hidden_size = 256

    # # Encoder
    # encoder_inputs = Input(shape=(src_max_len,))

    # # Encoder embedding layer
    # enc_emb = Embedding(src_vocab, embedding_dim)(encoder_inputs)

    # # Encoder LSTM 1
    # encoder_lstm1 = LSTM(hidden_size, return_sequences=True, return_state=True ,dropout = 0.4, recurrent_dropout = 0.4)
Example #12
    def __init__(self, g):
        self.G = deepcopy(g)
        self.queryCount = 0
        # Do pre-processing and get the initial query set
        p = Preprocessor(g)
        self.Q = deepcopy(p.query)
        # Find the lower limit tree
        self.Tl = deepcopy(p.Tl)
        # Find the upper limit tree
        self.Tu = deepcopy(p.Tu)
        f = self.G.edges
        removed = set()
        for edge in self.Tl:
            removed.add(edge)
        erased = set()
        for edge in f:
            if edge in removed:
                erased.add(edge)
        for edge in erased:
            f.remove(edge)
        # print("f is", f)
        # f is a set of edges not present in lower limit tree
        C = []

        # DFS Function to traverse in the tree
        def dfs(u, par, adj, last, edges):
            if u == last:
                C[:] = edges
            for v in adj[u]:
                if v.u + v.v - u != par:
                    edges.append(v)
                    dfs(v.u + v.v - u, u, adj, last, edges)
                    edges.remove(v)

        # Function to check whether the edge set contains an always-maximal edge
        def check(edgeSet):
            if len(edgeSet) <= 1:
                return False
            uppers = []
            for edge in edgeSet:
                uppers.append(edge.upper)
            uppers.sort()
            for edge in edgeSet:
                if edge.upper == uppers[-1] and (edge.trivial
                                                 or edge.lower >= uppers[-2]):
                    return False
            return True

        # Go through each edge in f
        for edge in f:
            # Construct the graph using adjacency list
            adj = [[] for _ in range(self.G.size + 1)]
            for edge2 in self.Tl:
                adj[edge2.u].append(edge2)
                adj[edge2.v].append(edge2)
            C[:] = []
            dfs(edge.u, -1, adj, edge.v, [])
            C.append(edge)
            # C is the set of edges which denote the cycle formed by adding 'edge'
            self.Tl.add(edge)
            # While C does not contain an always-maximal edge
            while check(C):
                uppers = []
                firstUpper = 0
                firstInd = 0
                # Find the two edges with maximum upper limits
                for i in range(len(C)):
                    if C[i].upper > firstUpper:
                        firstUpper = C[i].upper
                        firstInd = i
                secondUpper = 0
                secondInd = 0
                firstEdge = C[firstInd]
                for i in range(len(C)):
                    if i == firstInd:
                        continue
                    if C[i].upper > secondUpper:
                        secondUpper = C[i].upper
                        secondInd = i
                secondEdge = C[secondInd]
                print("firstEdge ", firstEdge)
                print("secondEdge ", secondEdge)
                # If the first edge is not trivial, then query it
                if (not firstEdge.trivial) and (
                        secondEdge.trivial
                        or firstEdge.cost <= secondEdge.cost):
                    print("adding 1st", firstEdge)
                    self.Q.add(deepcopy(firstEdge))
                    self.Tl.remove(firstEdge)
                    self.G.query(firstEdge)
                    C.remove(firstEdge)
                    firstEdge.lower = firstEdge.actual
                    firstEdge.upper = firstEdge.actual
                    firstEdge.trivial = True
                    self.Tl.add(firstEdge)
                    C.append(firstEdge)
                # If the second edge is not trivial, then query it
                elif (not secondEdge.trivial) and (
                        firstEdge.trivial or secondEdge.cost < firstEdge.cost):
                    print("adding 2nd", secondEdge)
                    self.Q.add(deepcopy(secondEdge))
                    self.Tl.remove(secondEdge)
                    self.G.query(secondEdge)
                    C.remove(secondEdge)
                    secondEdge.lower = secondEdge.actual
                    secondEdge.upper = secondEdge.actual
                    secondEdge.trivial = True
                    self.Tl.add(secondEdge)
                    C.append(secondEdge)

            # If an always maximal edge is found, erase it from Lower Limit Tree
            if len(C):
                uppers = []
                for edge in C:
                    uppers.append(edge.upper)
                uppers.sort()
                uCon, vCon = None, None
                for edge in C:
                    if edge.upper == uppers[-1] and (edge.trivial or
                                                     edge.lower >= uppers[-2]):
                        self.Tl.remove(edge)
                        break
Example #13
        def trainingModel(self):
            # Logging the start of Training
            self.log_writer.log(self.file_object, "ModelTrainingLog",
                                'Start of Training')

            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = Preprocessor(self.file_object, self.log_writer)
            data = preprocessor.remove_columns(
                data, ['Wafer', "Unnamed: 0"]
            )  # remove the unnamed column as it doesn't contribute to prediction.

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Good/Bad')

            is_null_present = preprocessor.is_null_present(X)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X)  # missing value imputation

            # check further which columns do not contribute to predictions
            # if the standard deviation for a column is zero, it means that the column has constant values
            # and they are giving the same output both for good and bad sensors
            # prepare the list of such columns to drop
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)

            # drop the columns obtained above
            X = preprocessor.remove_columns(X, cols_to_drop)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                # getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                # saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, "ModelTrainingLog",
                                'Successful End of Training')
Example #14
 def test_emoji_replacement(self):
     prep4 = Preprocessor(corpus1, remove_short_tweets=False, verbose_emoji=True)
     corpus = prep4.preprocessed_corpus()[0]
     self.assertEqual(corpus[1], ['short', ':fearful_face:'])
Example #15
def main():
    """Main entry point"""

    # Unclear why this is needed but I get BLAS errors otherwise
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # Validate command line args
    parser = argparse.ArgumentParser(
        description="Fit and evaluate all models in `models.py`")
    parser.add_argument('label')
    parser.add_argument('epochs', type=int)
    parser.add_argument('batch_size', type=int)
    opts = parser.parse_args(sys.argv[1:])
    filename = os.path.join("output", opts.label+".json")

    if not os.path.exists(filename):
        parser.error("Could not find JSON preprocessed data: %s" % filename)

    # Create image output directory if it doesn't exist
    if not os.path.exists("images"):
        os.mkdir("images")

    # De-serialize preprocessor
    with open(filename, "r") as this_file:
        json_txt = this_file.read()
    pre_proc = Preprocessor()
    pre_proc.from_json(json_txt)

    # Create the models
    models_dict = models.generate_models(pre_proc)

    # Fitting with cross-validation
    kfolds = KFold(n_splits=models.K_FOLDS)
    results = {}

    # Loop over all models...
    for model_name, fold_models in models_dict.items():

        # Reshape X based on model type... standard neural networks
        # take a different shape than LSTM and have an additional input
        # for position in vector. This is handled automatically for LSTM
        # networks.
        if model_name[0:4] == "DENS":
            x_data = pre_proc.x_train
            y_data = pre_proc.y_train
        elif model_name[0:4] == "LSTM":
            x_data, y_data, _, _ = pre_proc.get_rnn_format()
        else:
            raise ValueError("Uknown model prefix: %s" % model_name[0:4])

        begin_time = time.time()
        train_err = []
        val_err = []
        history = []

        model_idx = 0
        for train_idx, val_idx in kfolds.split(x_data, y_data):

            # Grab the model for this fold
            model = fold_models[model_idx]

            train_dataset = tf.data.Dataset.from_tensor_slices((
                tf.cast(x_data[train_idx], tf.float32),
                tf.cast(y_data[train_idx], tf.float32),
            ))
            train_dataset = train_dataset.batch(opts.batch_size)

            val_dataset = tf.data.Dataset.from_tensor_slices((
                tf.cast(x_data[val_idx], tf.float32),
                tf.cast(y_data[val_idx], tf.float32),
            ))
            val_dataset = val_dataset.batch(opts.batch_size)

            # No shuffle, already done
            hist = model.fit(
                        x=train_dataset,
                        epochs=opts.epochs,
                        shuffle=False,              # Shuffle already done
                        verbose=0,
                        callbacks=callbacks,
                        validation_data=val_dataset,
                        )

            # Store history and cycle to the next fold
            history.append(hist)
            train_err.append(hist.history['loss'][-1])
            val_err.append(hist.history['val_loss'][-1])
            print("{0:10} {1:7.4f} {2:7.4f}".format(
                        model_name, train_err[-1], val_err[-1]))
            model_idx += 1

        # Done with all the folds
        end_time = time.time()
        fit_time = end_time-begin_time
        print("time: {0:7.2f}".format(fit_time))

        results[model_name] = (np.mean(train_err),
                               np.mean(val_err),
                               fit_time)

        fig = go.Figure()
        for i in range(models.K_FOLDS):
            fig.add_trace(go.Scatter(
                x=[t for t in range(len(history[i].history['loss']))],
                y=history[i].history['loss'],
                name="Train Fold {}".format(i+1),
                mode='lines',
            ))
            fig.add_trace(go.Scatter(
                x=[t for t in range(len(history[i].history['val_loss']))],
                y=history[i].history['val_loss'],
                name="Val Fold {}".format(i+1),
                mode='lines',
            ))

        fig.update_xaxes(title="Epoch")
        fig.update_yaxes(title="Loss")
        plot(fig)

    df_summary = pd.DataFrame(results).transpose()
    df_summary.columns = ['train', 'val', 'time']
    df_summary['labels'] = df_summary.index
    df_summary = df_summary[['labels', 'train', 'val', 'time']]

    outfile = os.path.join("output", "{0}_param_search.csv".format(opts.label))
    df_summary.to_csv(outfile)
Example #16
 def test_has_proper_noun(self):
     prep7 = Preprocessor([])
     self.assertTrue(prep7.has_proper_noun("Bernie will win."))
     self.assertFalse(prep7.has_proper_noun("not impeachable!"))
     self.assertFalse(prep7.has_proper_noun("🤕"))
Example #17
  Directory of files used for training.
  """)
parser.add_argument('--architecture',
                    type=str,
                    default='vgg',
                    help="""\
  Model architecture to use.
  """)
FLAGS, unparsed = parser.parse_known_args()

print("Loading model...")

tf.logging.set_verbosity(tf.logging.INFO)
sess = tf.InteractiveSession()
preprocessor = Preprocessor(feature_count=40,
                            window_size_ms=20,
                            window_stride_ms=10)

fingerprint_input = tf.placeholder(tf.float32,
                                   [None, preprocessor.fingerprint_size],
                                   name='fingerprint_input')
fingerprint_input_4d = tf.reshape(
    fingerprint_input,
    [-1, preprocessor.feature_count, preprocessor.window_number, 1])
logits = create_model(FLAGS.architecture,
                      fingerprint_input_4d, {'label_count': len(CLASSES)},
                      is_training=False)
predicted_indices = tf.argmax(logits, 1)

tf.global_variables_initializer().run()
if FLAGS.model:
Example #18
from preprocessing import Preprocessor
import pandas as pd
import numpy as np
import boto

df = pd.read_csv('stackodata.csv')

p = Preprocessor()

cleaned_data = p.transform(df.values)

df_cleaned = pd.DataFrame(cleaned_data,
                          columns=[
                              'id', 'title', 'qscore', 'ascore', 'tags',
                              'q_nocode', 'q_code', 'a_nocode', 'a_code'
                          ])

df_cleaned.to_csv('stack_data_cleaned.csv', encoding='utf-8')
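
# The unused `boto` import above suggests the cleaned CSV is then pushed to S3.
# A minimal sketch using the legacy boto 2 API; the bucket name is a hypothetical
# placeholder and AWS credentials are assumed to be configured in the environment.
conn = boto.connect_s3()
bucket = conn.get_bucket('stackoverflow-data-bucket')  # hypothetical bucket name
key = bucket.new_key('stack_data_cleaned.csv')
key.set_contents_from_filename('stack_data_cleaned.csv')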
Example #19
db, log = DBProvider(), Log(config, 'update')

now_morning = datetime(now.year, now.month, now.day, 4)
start_time = time.time()
log.debug(f"Clear RAM {config.OS.clear_ram()}")
log.debug(f'Current DB is {config.Database}')
log.debug(f"Update current_matches to {now_morning}")
updater = CurrentUpdater(LeagueScraper(),
                         MatchScraper(from_time=None, to_time=now_morning), db,
                         log)
updater.update()
log.debug(f"Updated current_matches for {int(time.time() - start_time)} sec")

next_day = now + timedelta(days=1)
next_day_morning = datetime(next_day.year, next_day.month, next_day.day, 4)
start_time = time.time()
log.debug(f"Update future_matches from {now_morning} to {next_day_morning}")
updater = FutureUpdater(
    FutureLeagueScraper(),
    MatchScraper(from_time=now_morning, to_time=next_day_morning), db, log)
updater.update()
log.debug(f"Updated future_matches for {int(time.time() - start_time)} sec")

start_time = time.time()
log.debug(f"Clear RAM {config.OS.clear_ram()}")
preprocessor = Preprocessor(db, log)
log.debug('Preprocess matches')
num_matches = preprocessor.preprocess()
log.debug(
    f'Preprocessed {num_matches} matches for {int(time.time() - start_time)} sec'
)
Example #20
    def setup_to_train(self, train_data=None, dev_data=None, test_data=None):
        # create a model directory:
        if os.path.isdir(self.model_dir):
            shutil.rmtree(self.model_dir)
        os.mkdir(self.model_dir)

        self.train_tokens = train_data['token']
        if self.include_test:
            self.test_tokens = test_data['token']
        if self.include_dev:
            self.dev_tokens = dev_data['token']

        idx_cnt = 0
        if self.include_lemma:
            self.lemma_out_idx = idx_cnt
            idx_cnt += 1
            self.train_lemmas = train_data['lemma']
            self.known_lemmas = set(self.train_lemmas)
            if self.include_dev:
                self.dev_lemmas = dev_data['lemma']
            if self.include_test:
                self.test_lemmas = test_data['lemma']
        if self.include_pos:
            self.pos_out_idx = idx_cnt
            idx_cnt += 1
            self.train_pos = train_data['pos']
            if self.include_dev:
                self.dev_pos = dev_data['pos']
            if self.include_test:
                self.test_pos = test_data['pos']
        if self.include_morph:
            self.morph_out_idx = idx_cnt
            self.train_morph = train_data['morph']
            if self.include_dev:
                self.dev_morph = dev_data['morph']
            if self.include_test:
                self.test_morph = test_data['morph']

        self.preprocessor = Preprocessor().fit(
            tokens=self.train_tokens,
            lemmas=self.train_lemmas,
            pos=self.train_pos,
            morph=self.train_morph,
            include_lemma=self.include_lemma,
            include_morph=self.include_morph,
            max_token_len=self.max_token_len,
            focus_repr=self.focus_repr,
            min_lem_cnt=self.min_lem_cnt,
        )
        self.pretrainer = Pretrainer(nb_left_tokens=self.nb_left_tokens,
                                     nb_right_tokens=self.nb_right_tokens,
                                     size=self.nb_embedding_dims,
                                     minimum_count=self.min_token_freq_emb)
        self.pretrainer.fit(tokens=self.train_tokens)

        train_transformed = self.preprocessor.transform(
            tokens=self.train_tokens,
            lemmas=self.train_lemmas,
            pos=self.train_pos,
            morph=self.train_morph)
        if self.include_dev:
            dev_transformed = self.preprocessor.transform(
                tokens=self.dev_tokens,
                lemmas=self.dev_lemmas,
                pos=self.dev_pos,
                morph=self.dev_morph)
        if self.include_test:
            test_transformed = self.preprocessor.transform(
                tokens=self.test_tokens,
                lemmas=self.test_lemmas,
                pos=self.test_pos,
                morph=self.test_morph)

        self.train_X_focus = train_transformed['X_focus']
        if self.include_dev:
            self.dev_X_focus = dev_transformed['X_focus']
        if self.include_test:
            self.test_X_focus = test_transformed['X_focus']

        if self.include_lemma:
            self.train_X_lemma = train_transformed['X_lemma']
            if self.include_dev:
                self.dev_X_lemma = dev_transformed['X_lemma']
            if self.include_test:
                self.test_X_lemma = test_transformed['X_lemma']

        if self.include_pos:
            self.train_X_pos = train_transformed['X_pos']
            if self.include_dev:
                self.dev_X_pos = dev_transformed['X_pos']
            if self.include_test:
                self.test_X_pos = test_transformed['X_pos']

        if self.include_morph:
            self.train_X_morph = train_transformed['X_morph']
            if self.include_dev:
                self.dev_X_morph = dev_transformed['X_morph']
            if self.include_test:
                self.test_X_morph = test_transformed['X_morph']

        self.train_contexts = self.pretrainer.transform(
            tokens=self.train_tokens)
        if self.include_dev:
            self.dev_contexts = self.pretrainer.transform(
                tokens=self.dev_tokens)
        if self.include_test:
            self.test_contexts = self.pretrainer.transform(
                tokens=self.test_tokens)

        print('Building model...')
        nb_tags = None
        try:
            nb_tags = len(self.preprocessor.pos_encoder.classes_)
        except AttributeError:
            pass
        nb_morph_cats = None
        try:
            nb_morph_cats = self.preprocessor.nb_morph_cats
        except AttributeError:
            pass
        max_token_len, token_char_dict = None, None
        try:
            max_token_len = self.preprocessor.max_token_len
            token_char_dict = self.preprocessor.token_char_dict
        except AttributeError:
            pass
        max_lemma_len, lemma_char_dict = None, None
        try:
            max_lemma_len = self.preprocessor.max_lemma_len
            lemma_char_dict = self.preprocessor.lemma_char_dict
        except AttributeError:
            pass
        nb_lemmas = None
        try:
            nb_lemmas = len(self.preprocessor.lemma_encoder.classes_)
        except AttributeError:
            pass
        self.model = build_model(
            token_len=max_token_len,
            token_char_vector_dict=token_char_dict,
            lemma_len=max_lemma_len,
            nb_tags=nb_tags,
            nb_morph_cats=nb_morph_cats,
            lemma_char_vector_dict=lemma_char_dict,
            nb_encoding_layers=self.nb_encoding_layers,
            nb_dense_dims=self.nb_dense_dims,
            nb_embedding_dims=self.nb_embedding_dims,
            nb_train_tokens=len(self.pretrainer.train_token_vocab),
            nb_context_tokens=self.nb_context_tokens,
            pretrained_embeddings=self.pretrainer.pretrained_embeddings,
            include_token=self.include_token,
            include_context=self.include_context,
            include_lemma=self.include_lemma,
            include_pos=self.include_pos,
            include_morph=self.include_morph,
            nb_filters=self.nb_filters,
            filter_length=self.filter_length,
            focus_repr=self.focus_repr,
            dropout_level=self.dropout_level,
            nb_lemmas=nb_lemmas,
        )
        self.save()
        self.setup = True
Example #21
    # evaluating SVM using cross validation
    print "Evaluating model with cross validation..."

    if speaker_indipendence:
        k_folds = len(db.test_sets)
        splits = zip(db.train_sets, db.test_sets)
    else:
        k_folds = 10  # cross-validation: split into k subsamples
        sss = StratifiedShuffleSplit(n_splits=k_folds,
                                     test_size=0.2,
                                     random_state=1)
        splits = sss.split(Fglobal, y)

    # setting preprocessing
    pp = Preprocessor('standard', n_components=50)
    n_classes = len(db.classes)
    clf = OneVsRestClassifier(svm.SVC(kernel='rbf', C=10, gamma=0.01))
    # C is the penalty parameter: larger values fit the training data more closely but generalize worse
    # gamma: not sure what this does
    # one-vs-rest, also known as one-vs-all
    prfs = []
    scores = []
    acc = np.zeros(n_classes)
    mi_threshold = 0.0
    for (train, test) in splits:
        # selecting features using mutual information
        Ftrain = Fglobal[train]
        Ftest = Fglobal[test]
        f_subset = pp.mutual_info_select(Ftrain, y[train], mi_threshold)
        # select features via mutual information
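        # The snippet is cut off here. A hedged continuation sketch for each fold;
        # it assumes f_subset can be used directly as a NumPy column index (boolean
        # mask or index array), which mutual_info_select is presumed to return.
        clf.fit(Ftrain[:, f_subset], y[train])
        scores.append(clf.score(Ftest[:, f_subset], y[test]))

    # After the loop: report the mean accuracy across folds
    print("mean CV accuracy: {:.3f}".format(np.mean(scores)))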