# NOTE(review): this top-level __init__ is a byte-for-byte duplicate of
# GenderClassifier.__init__ below; it is unreachable as a method and looks
# like a stray copy-paste remnant that should be deleted.
def __init__(self):

        '''
        Training parameters:
        '''
        self.batch_size=512            # samples per gradient update
        self.minLen=50                 # shortest token sequence kept by prepareData()
        self.maxLen=200                # longest sequence kept; also the padding length
        self.maxWords=50000            # tokenizer vocabulary size
        self.max_features=self.maxWords+3  # +3 presumably reserves special token ids -- TODO confirm against Tokenizer
        self.nb_epoch=5

        # State flags consulted by train()/evaluate()/predict().
        self.trained=False
        self.modelLoaded=False
        self.dataPrepared=False

        self.tokenizer = Tokenizer(maxWords=self.maxWords)

        print('Build model...')

        # Stacked 3-layer LSTM over an embedding; the first two LSTMs return
        # full sequences so the next layer gets one vector per timestep, the
        # last returns a single vector for the sigmoid output unit.
        # Legacy Keras 0.x API (input_dim/output_dim, class_mode).
        self.model = Sequential()
        self.model.add(Embedding(self.max_features, 128))
        self.model.add(LSTM(input_dim=128, output_dim=128,return_sequences=True))
        self.model.add(Dropout(0.5))
        self.model.add(LSTM(input_dim=128, output_dim=128,return_sequences=True))
        self.model.add(Dropout(0.5))
        self.model.add(LSTM(input_dim=128, output_dim=128,return_sequences=False))
        self.model.add(Dense(input_dim=128, output_dim=1))
        self.model.add(Activation('sigmoid'))
        self.model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")

        print('Model has been built!')
class GenderClassifier(object):

    def __init__(self):
        '''
        Configure training hyperparameters, the tokenizer, and the model.

        Builds a stacked 3-layer LSTM binary classifier (sigmoid output)
        over an embedding of the vocabulary, compiled with binary
        cross-entropy and Adam (legacy Keras 0.x API).
        '''
        # Training parameters:
        self.batch_size=512            # samples per gradient update
        self.minLen=50                 # shortest token sequence kept by prepareData()
        self.maxLen=200                # longest sequence kept; also the padding length
        self.maxWords=50000            # tokenizer vocabulary size
        self.max_features=self.maxWords+3  # +3 presumably reserves special token ids -- TODO confirm against Tokenizer
        self.nb_epoch=5

        # State flags consulted by train()/evaluate()/predict().
        self.trained=False
        self.modelLoaded=False
        self.dataPrepared=False

        self.tokenizer = Tokenizer(maxWords=self.maxWords)

        print('Build model...')

        # First two LSTMs return full sequences so the next layer receives
        # one vector per timestep; the last returns a single vector that
        # feeds the one-unit sigmoid output.
        self.model = Sequential()
        self.model.add(Embedding(self.max_features, 128))
        self.model.add(LSTM(input_dim=128, output_dim=128,return_sequences=True))
        self.model.add(Dropout(0.5))
        self.model.add(LSTM(input_dim=128, output_dim=128,return_sequences=True))
        self.model.add(Dropout(0.5))
        self.model.add(LSTM(input_dim=128, output_dim=128,return_sequences=False))
        self.model.add(Dense(input_dim=128, output_dim=1))
        self.model.add(Activation('sigmoid'))
        self.model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")

        print('Model has been built!')

    def load_data(self,dataSize=-1):
        '''
        Read the blogger dataset CSV located next to this file.

        dataSize: number of rows to read; -1 (default) reads everything.
        return: (texts, labels) where texts is a list of post strings and
                labels is the array from the 'gender' column.
        '''
        import pandas as pd
        directory = os.path.dirname(os.path.realpath(__file__))
        fullpath = os.path.join(directory, "blogger_data_2.csv")  # move dataset to examples directory
        # nrows=None makes read_csv consume the whole file, matching -1.
        row_limit = None if dataSize == -1 else dataSize
        frame = pd.read_csv(fullpath, nrows=row_limit)
        texts = [str(entry) for entry in frame['text'].values]
        labels = frame['gender'].values
        return texts, labels

    def prepareData(self,evaluate=False):
        '''
        Load, tokenize, length-filter, pad, and split the data.

        evaluate: when True, reuse the already-fitted tokenizer instead of
                  refitting and saving it (used by evaluate()).

        Side effects: sets self.X_train / self.X_valid / self.y_train /
        self.y_valid and self.dataPrepared.

        Raises ValueError if the length filter leaves no samples (the
        original code would crash with ZeroDivisionError instead).
        '''
        print("Loading data...")
        X,y = self.load_data() # Can increase up to 250K or so
        print("Number of lines:", len(X))

        print("Vectorizing sequence data...")

        if not evaluate:
            self.tokenizer.fit(X)
            self.tokenizer.save("./data/Tokenizer.pkl")
        X=self.tokenizer.transform(X)

        # Keep only sequences whose token length lies in [minLen, maxLen].
        kept = [(seq, label) for seq, label in zip(X, y)
                if self.minLen <= len(seq) <= self.maxLen]
        if not kept:
            raise ValueError("No samples left after length filtering; "
                             "check minLen/maxLen against the data.")
        _X = [seq for seq, _ in kept]
        _y = [label for _, label in kept]

        X=sequence.pad_sequences(_X, maxlen=self.maxLen)
        y=_y

        # Class balance report.
        # NOTE(review): label 1.0 is reported as "male" here -- confirm that
        # mapping against the dataset's gender encoding.
        count_1 = sum(1 for label in y if label == 1.0)
        print("female percentage:",(len(y)-count_1)/len(y))
        print("male percentage:",count_1/len(y))

        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(X, y, test_size=0.10, random_state=100)
        self.dataPrepared=True
        print("Data preparation completed!")

    def evaluate(self):
        '''
        Evaluate the model on the held-out validation split.

        For debugging purpose. Loads saved weights when the model has been
        neither trained nor loaded, and prepares the data when needed
        (with evaluate=True so the saved tokenizer is reused, not refit).
        '''
        if not (self.modelLoaded or self.trained):
            print('Warning: Model data has not been loaded or the model has not been trained.')
            print("Trying to load Model data...")  # fixed typo: "dada" -> "data"
            self.loadModel()
        if not self.dataPrepared:
            self.prepareData(evaluate=True)

        # show_accuracy is the legacy Keras 0.x API; returns (loss, accuracy).
        score, acc = self.model.evaluate(self.X_valid, self.y_valid, batch_size=self.batch_size, show_accuracy=True)
        print("Evaluation results:", score, " ", acc)

    def train(self):
        '''
        Fit the model on the prepared training split.

        Prepares the data first when necessary. Checkpoints the best
        weights to ./weights.hdf5 and stops early when validation loss
        stops improving.
        '''
        if not self.dataPrepared:
            self.prepareData()

        saver = ModelCheckpoint(filepath="./weights.hdf5", verbose=1, save_best_only=True)
        early_stop = EarlyStopping(monitor='val_loss', patience=50, verbose=0)

        print("Training...")

        self.model.fit(
            self.X_train,
            self.y_train,
            batch_size=self.batch_size,
            nb_epoch=self.nb_epoch,
            validation_data=(self.X_valid, self.y_valid),
            show_accuracy=True,
            callbacks=[saver, early_stop],
        )

        self.trained=True
        print('Training completed!')

    def loadModel(self):
        '''
        Load saved model weights and the fitted tokenizer from ./data/.

        NOTE(review): train() checkpoints weights to "./weights.hdf5" but
        this loads "./data/weights.hdf5" -- confirm the intended location;
        a freshly trained checkpoint will not be found here.
        '''
        print('Loading model...')
        self.model.load_weights('./data/weights.hdf5')
        self.tokenizer.load("./data/Tokenizer.pkl")
        self.modelLoaded=True
        print('Model loaded')

    '''
    Predict gender for input text.

    text: a single string or a list of strings
    return: vector of prediction results
    '''
    def predict(self,text):
        '''
        Predict gender probabilities for the given text.

        text: a single string or a list of strings (passed straight to the
              tokenizer).
        return: vector of sigmoid outputs from the model.

        NOTE(review): unlike prepareData(), the transformed sequences are
        not run through sequence.pad_sequences before model.predict --
        confirm that tokenizer.transform already yields fixed-length input.
        '''
        if not (self.modelLoaded or self.trained):
            print('Warning: Model data has not been loaded or the model has not been trained.')
            print("Trying to load Model data...")  # fixed typo: "dada" -> "data"
            self.loadModel()

        X=np.array(self.tokenizer.transform(text))
        y=self.model.predict(X,batch_size=1)
        return y