def __init__(self, *, use_pretrained=True):
    if not use_pretrained:
        data = pd.read_csv('../data/train_clean.csv')
        data = exclude_sents(data)
        train, _ = train_val_split(data)
        # concatenate both question columns and tokenize on whitespace
        all_questions = train['question1'].append(train['question2'])
        sentences = all_questions.fillna('').apply(lambda x: x.split())
        print('Training model...')
        self.model = gs.models.Word2Vec(sentences,
                                        size=c.WORD_EMBED_SIZE,
                                        window=5,
                                        min_count=c.UNKNOWN_MIN_COUNT,
                                        workers=4)
        print('Done.')
        self.model.save(c.WORD2VEC_FILEPATH)
    else:
        self.model = gs.models.Word2Vec.load(c.WORD2VEC_FILEPATH)
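
# --- Usage sketch (not part of the original file) --------------------------
# A minimal illustration of how the trained embeddings could be queried,
# assuming the class above is the `Word2VecModel` referenced elsewhere in the
# repo and that `c` is the shared config module. Hypothetical, for reference.
if __name__ == '__main__':
    w2v = Word2VecModel()                      # loads c.WORD2VEC_FILEPATH by default
    vector = w2v.model.wv['question']          # one word vector, length c.WORD_EMBED_SIZE
    neighbours = w2v.model.wv.most_similar('question', topn=5)
    print(neighbours)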
def __init__(self):
    """Build the training pipeline: load the cleaned question pairs,
    fit a tokenizer, convert the questions to padded integer sequences,
    and assemble the word2vec embedding matrix.
    """
    data = pd.read_csv('../data/train_clean.csv')
    # drop overly short and overly long sentences
    data = exclude_sents(data)
    # randomly shuffle the data and split into train/val sets
    train_data, val_data = train_val_split(data)
    # pd.Series to ndarray
    train_q1_str = train_data['question1'].values
    train_q2_str = train_data['question2'].values
    val_q1_str = val_data['question1'].values
    val_q2_str = val_data['question2'].values

    print('Fitting tokenizer...')
    self.tokenizer = Tokenizer(filters="", oov_token='!UNK!')
    self.tokenizer.fit_on_texts(np.append(train_q1_str, train_q2_str))

    self.w2v = Word2VecModel()
    unk_embed = self.produce_unk_embed()

    print('Converting strings to int arrays...')
    self.x_train_q1 = pad_sequences(
        self.tokenizer.texts_to_sequences(train_q1_str), maxlen=c.SENT_LEN)
    self.x_train_q2 = pad_sequences(
        self.tokenizer.texts_to_sequences(train_q2_str), maxlen=c.SENT_LEN)
    self.y_train = train_data['is_duplicate'].values
    self.x_val_q1 = pad_sequences(
        self.tokenizer.texts_to_sequences(val_q1_str), maxlen=c.SENT_LEN)
    self.x_val_q2 = pad_sequences(
        self.tokenizer.texts_to_sequences(val_q2_str), maxlen=c.SENT_LEN)
    self.y_val = val_data['is_duplicate'].values

    print('Creating embeddings matrix...')
    # index 0 is reserved for padding; the OOV token is appended
    num_words = len(self.tokenizer.word_index) + 2
    self.embedding_matrix = np.zeros((num_words, c.WORD_EMBED_SIZE))
    for word, i in self.tokenizer.word_index.items():
        try:
            embedding_vector = self.w2v.model.wv[word]
        except KeyError:
            # fall back to the shared "unknown word" embedding
            embedding_vector = unk_embed
        self.embedding_matrix[i] = embedding_vector
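
# --- Usage sketch (not part of the original file) --------------------------
# How the precomputed `embedding_matrix` is typically wired into a Keras
# Embedding layer: the pretrained word2vec vectors become fixed initial
# weights. `pipeline` stands for any instance of the class above; only the
# attributes built in __init__ are assumed.
from tensorflow.keras.layers import Embedding

def build_embedding_layer(pipeline):
    num_words = pipeline.embedding_matrix.shape[0]
    return Embedding(
        input_dim=num_words,                   # rows of the embedding matrix
        output_dim=c.WORD_EMBED_SIZE,          # word2vec vector size
        weights=[pipeline.embedding_matrix],   # initialise with pretrained vectors
        input_length=c.SENT_LEN,
        trainable=False,                       # a common choice: keep vectors frozen
    )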
        if ds_type == 'flow':
            result_by_mod_n[mod] = v.drop(
                ['city_code', 'district_code', 'date_dt'],
                axis=1).mean().to_dict()
        else:
            result_by_mod_n[mod] = v.drop(
                ['o_city_code', 'o_district_code', 'date_dt'],
                axis=1).mean().to_dict()
    return result_by_mod_n


if __name__ == '__main__':
    # read data
    flow_train = pd.read_csv('../../data/flow_train.csv')
    total_flow_train, total_flow_val = train_val_split(flow_train)
    # transition_train = pd.read_csv('../data/transition_train.csv')

    # list all sample files
    sample_data_path = '../../data/flow/'
    all_sample = os.listdir(sample_data_path)

    # grid search over n
    for i in range(3, 16):
        gt_for_each_sample = []
        result_for_each_sample = []
        for sample in tqdm(all_sample):
            city, district = sample[:-4].split('_')
            # handle the flow data first, then the transition data
            ### statistics with mod 7 (weekly period)
sample_data_path = '../../data/flow/'
all_sample = os.listdir(sample_data_path)
channels = ['dwell']
top5_loss_param_each_sample = {}

# search over each channel
for channel in channels:
    for sample in tqdm(all_sample):
        loss_table = {}
        city, district = sample[:-4].split('_')
        flow_sample = pd.read_csv(sample_data_path + sample)
        sample_train, sample_val = train_val_split(flow_sample)
        # grid search over ARIMA parameters (a, b, c)
        for a in range(12):
            for b in range(3):
                for c in range(12):
                    loss = 100
                    try:
                        # first consider the dwell channel
                        channel_predict = predict_by_ARIMA(
                            sample_train, channel, param=(a, b, c), offset=0)
                        columns = [
                            'date_dt', 'city_code', 'district_code',
import tensorflow as tf
from tensorflow.keras.layers import (Embedding, Bidirectional, LSTM, Dropout,
                                     Dense)

import config

# Bidirectional LSTM binary classifier over padded token sequences.
model = tf.keras.Sequential([
    Embedding(input_dim=config.VOCAB_SIZE,
              output_dim=config.EMBEDDING_DIM,
              input_length=config.MAX_LENGTH),
    Bidirectional(LSTM(128)),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

if __name__ == "__main__":
    import data
    import preprocessing

    df = preprocessing.preprocess(data.load("twitter"))
    train_df, validation_df = preprocessing.train_val_split(df)
    tokenizer = preprocessing.get_tokenizer(train_df)
    train_padded, validation_padded = preprocessing.tokenize(
        tokenizer, train_df, validation_df)

    history = model.fit(x=train_padded,
                        y=train_df.label.to_numpy(),
                        epochs=2)
    eval_loss, eval_acc = model.evaluate(x=validation_padded,
                                         y=validation_df.label.to_numpy())
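
# --- Usage sketch (not part of the original file) --------------------------
# Illustrative inference step after training: new tweets must go through the
# same tokenization and padding as the training data before model.predict.
# The project-specific `preprocessing.tokenize` signature is unknown here, so
# plain Keras utilities are used instead; config.MAX_LENGTH is assumed to be
# the padding length used throughout.
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_sentiment(texts, tokenizer, model):
    sequences = tokenizer.texts_to_sequences(texts)              # str -> int ids
    padded = pad_sequences(sequences, maxlen=config.MAX_LENGTH)  # fixed-length input
    probs = model.predict(padded)                                # sigmoid outputs in [0, 1]
    return (probs > 0.5).astype(int)                             # hard 0/1 labels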