Example #1
    def __init__(self, *, use_pretrained=True):
        if not use_pretrained:
            data = pd.read_csv("../data/train_clean.csv")
            data = exclude_sents(data)
            train, _ = train_val_split(data)
            all_questions = train['question1'].append(train['question2'])
            sentences = all_questions.fillna('').apply(lambda x: x.split())
            print('Training model...')
            self.model = gs.models.Word2Vec(sentences,
                                            size=c.WORD_EMBED_SIZE,
                                            window=5,
                                            min_count=c.UNKNOWN_MIN_COUNT,
                                            workers=4)
            print('Done.')

            self.model.save(c.WORD2VEC_FILEPATH)
        else:
            self.model = gs.models.Word2Vec.load(c.WORD2VEC_FILEPATH)
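Every example on this page calls a train_val_split helper that is defined elsewhere in its repository. For context, here is a minimal sketch of what such a helper might look like; the split ratio, the random seed, and the shuffling are assumptions, and the real helpers (especially in the time-series examples further down) may split differently:

def train_val_split(data, val_fraction=0.2, seed=42):
    """Hypothetical sketch: shuffle a pandas DataFrame and hold out a validation slice."""
    shuffled = data.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    n_val = int(len(shuffled) * val_fraction)
    return shuffled.iloc[n_val:], shuffled.iloc[:n_val]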
Example #2
    def __init__(self):
        """Loads the cleaned training data, fits a tokenizer on the question
        text, converts the question pairs into padded integer sequences, and
        builds a word2vec-based embedding matrix for the vocabulary.
        """
        data = pd.read_csv('../data/train_clean.csv')
        # dropping low-length and high-length sentences
        data = exclude_sents(data)
        # randomly shuffling data and separating into train/val data
        train_data, val_data = train_val_split(data)
        # pd.Series to ndarray
        train_q1_str = train_data['question1'].values
        train_q2_str = train_data['question2'].values
        val_q1_str = val_data['question1'].values
        val_q2_str = val_data['question2'].values

        print('Fitting tokenizer...')
        self.tokenizer = Tokenizer(filters="", oov_token='!UNK!')
        self.tokenizer.fit_on_texts(np.append(train_q1_str, train_q2_str))
        self.w2v = Word2VecModel()
        unk_embed = self.produce_unk_embed()

        print('Converting strings to int arrays...')
        self.x_train_q1 = pad_sequences(
            self.tokenizer.texts_to_sequences(train_q1_str), maxlen=c.SENT_LEN)
        self.x_train_q2 = pad_sequences(
            self.tokenizer.texts_to_sequences(train_q2_str), maxlen=c.SENT_LEN)
        self.y_train = train_data['is_duplicate'].values
        self.x_val_q1 = pad_sequences(
            self.tokenizer.texts_to_sequences(val_q1_str), maxlen=c.SENT_LEN)
        self.x_val_q2 = pad_sequences(
            self.tokenizer.texts_to_sequences(val_q2_str), maxlen=c.SENT_LEN)
        self.y_val = val_data['is_duplicate'].values

        print('Creating embeddings matrix...')
        # +2: index 0 is reserved for padding, and the OOV token is appended
        num_words = len(self.tokenizer.word_index) + 2
        self.embedding_matrix = np.zeros((num_words, c.WORD_EMBED_SIZE))
        for word, i in self.tokenizer.word_index.items():
            try:
                embedding_vector = self.w2v.model.wv[word]
            except KeyError:
                embedding_vector = unk_embed
            self.embedding_matrix[i] = embedding_vector
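The embedding_matrix built above is usually handed to a Keras Embedding layer as its initial weights. A minimal sketch of that wiring; the dataset variable standing in for an instance of this class is illustrative, not part of the original code:

from tensorflow.keras.layers import Embedding

# dataset is assumed to be an instance of the class above
embedding_layer = Embedding(input_dim=dataset.embedding_matrix.shape[0],
                            output_dim=c.WORD_EMBED_SIZE,
                            weights=[dataset.embedding_matrix],
                            input_length=c.SENT_LEN,
                            trainable=False)  # keep the word2vec vectors fixed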
Example #3
        if ds_type == 'flow':
            result_by_mod_n[mod] = v.drop(
                ['city_code', 'district_code', 'date_dt'],
                axis=1).mean().to_dict()
        else:
            result_by_mod_n[mod] = v.drop(
                ['o_city_code', 'o_district_code', 'date_dt'],
                axis=1).mean().to_dict()

    return result_by_mod_n


if __name__ == '__main__':
    # read data
    flow_train = pd.read_csv('../../data/flow_train.csv')
    total_flow_train, total_flow_val = train_val_split(flow_train)

    # transition_train = pd.read_csv('../data/transition_train.csv')

    # read all sample file paths
    sample_data_path = '../../data/flow/'
    all_sample = os.listdir(sample_data_path)

    # grid search for n
    for i in range(3, 16):
        gt_for_each_sample = []
        result_for_each_sample = []
        for sample in tqdm(all_sample):
            city, district = sample[:-4].split('_')
            # handle the flow data first, then the transition data
            # statistics with mod 7, i.e. a weekly cycle
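The code above is the tail of a helper that averages the value columns for each residue class of the day index modulo n, followed by the grid search that tries n from 3 to 15. A minimal sketch of what the full helper might look like; the function name and the use of the positional index as the day index are assumptions:

def statistic_by_mod_n(df, n, ds_type='flow'):
    """Hypothetical sketch: mean of every value column per (day index mod n)."""
    df = df.sort_values('date_dt').reset_index(drop=True)
    result_by_mod_n = {}
    for mod, v in df.groupby(df.index % n):
        if ds_type == 'flow':
            drop_cols = ['city_code', 'district_code', 'date_dt']
        else:
            drop_cols = ['o_city_code', 'o_district_code', 'date_dt']
        result_by_mod_n[mod] = v.drop(drop_cols, axis=1).mean().to_dict()
    return result_by_mod_n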
Example #4
    sample_data_path = '../../data/flow/'
    all_sample = os.listdir(sample_data_path)

    channels = ['dwell']

    top5_loss_param_each_sample = {}
    # search over each channel
    for channel in channels:
        for sample in tqdm(all_sample):
            loss_table = {}

            city, district = sample[:-4].split('_')

            flow_sample = pd.read_csv(sample_data_path + sample)

            sample_train, sample_val = train_val_split(flow_sample)
            # grid search
            for a in range(12):
                for b in range(3):
                    for c in range(12):
                        loss = 100

                        try:
                            # first consider the 'dwell' channel
                            channel_predict = predict_by_ARIMA(sample_train,
                                                               channel,
                                                               param=(a, b, c),
                                                               offset=0)

                            columns = [
                                'date_dt', 'city_code', 'district_code',
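The grid search above depends on a predict_by_ARIMA helper whose body is not shown in this excerpt. A minimal sketch of what such a helper could look like with statsmodels; the signature matches the call site, but the forecast horizon and the way offset is used are assumptions:

from statsmodels.tsa.arima.model import ARIMA


def predict_by_ARIMA(train_df, channel, param, offset=0, horizon=15):
    """Hypothetical sketch: fit ARIMA(p, d, q) on one channel and forecast ahead."""
    series = train_df[channel].values[offset:]   # e.g. the 'dwell' column
    fitted = ARIMA(series, order=param).fit()    # param is the (p, d, q) tuple
    return fitted.forecast(steps=horizon)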
Example #5
import tensorflow as tf
from tensorflow.keras.layers import (Embedding, Bidirectional, LSTM, Dropout,
                                     Dense)
import config

model = tf.keras.Sequential([
    Embedding(input_dim=config.VOCAB_SIZE,
              output_dim=config.EMBEDDING_DIM,
              input_length=config.MAX_LENGTH),
    Bidirectional(LSTM(128)),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

if __name__ == "__main__":
    import data
    import preprocessing
    df = preprocessing.preprocess(data.load("twitter"))
    train_df, validation_df = preprocessing.train_val_split(df)
    tokenizer = preprocessing.get_tokenizer(train_df)
    train_padded, validation_padded = preprocessing.tokenize(
        tokenizer, train_df, validation_df)
    history = model.fit(x=train_padded, y=train_df.label.to_numpy(), epochs=2)
    eval_loss, eval_acc = model.evaluate(x=validation_padded,
                                         y=validation_df.label.to_numpy())
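For inference after training, new text has to go through the same tokenizer and padding as the training data. A short usage sketch, assuming get_tokenizer returns a Keras Tokenizer and that config.MAX_LENGTH is the padding length used inside preprocessing.tokenize (both are assumptions about the repo's own helpers):

from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = ["an example tweet to classify"]
seqs = tokenizer.texts_to_sequences(texts)           # ints via the fitted tokenizer
padded = pad_sequences(seqs, maxlen=config.MAX_LENGTH)
probs = model.predict(padded)                        # sigmoid scores in [0, 1]
labels = (probs > 0.5).astype(int)                   # hard 0/1 predictions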