def get_strings(FILE_NAMES, b_dep):
    count = 0
    FILE_STRINGS = []
    for files in FILE_NAMES:
        # accumulate every pre-processed token for one user
        string_user = []
        for file in files:
            all_lines = ""
            with open(file, 'r', encoding="utf-8") as f:
                for line in f:
                    all_lines += line
            pre_processed_tokens = ppc.pre_process_data(all_lines)
            for ppt in pre_processed_tokens:
                string_user.append(ppt)
        if len(string_user) < 1000:
            print("User Omitted: Too Few Records")
            continue
        if len(string_user) < MAX_TENSOR_LENGTH:
            # record the raw length, then pad the user up to the fixed tensor size
            if b_dep == 1:
                LENGTHS_DEPRESSION.append(len(string_user))
            else:
                LENGTHS_CONTROL.append(len(string_user))
            string_user += [''] * (MAX_TENSOR_LENGTH - len(string_user))
        count += 1
        FILE_STRINGS.append(string_user)
        if count > 999:
            break
    return FILE_STRINGS
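# Illustrative call sites only, not part of the original listing. It is assumed that
# FILE_NAMES_DEPRESSION and FILE_NAMES_CONTROL hold one list of file paths per user,
# as in the loops further below; b_dep == 1 routes the recorded lengths into
# LENGTHS_DEPRESSION, anything else into LENGTHS_CONTROL.
FILE_STRINGS = get_strings(FILE_NAMES_DEPRESSION, 1)
FILE_STRINGS_CONTROL = get_strings(FILE_NAMES_CONTROL, 0)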
    # tail of build_model(conf): pooling, dense head, compile
    x = tf.keras.layers.GlobalMaxPooling1D()(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dense(conf['targets'].shape[1], activation='sigmoid')(x)
    model = tf.keras.Model(input, x)
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


if __name__ == "__main__":
    config = pre_process_data()
    model = build_model(config)

    print('Training the model...')
    history = model.fit(x=config['data'],
                        y=config['targets'],
                        epochs=EPOCHS,
                        validation_split=VALIDATION_SPLIT,
                        batch_size=BATCH_SIZE)
    print('Training done.')

    model.save('./saved_models/cnn_toxic.h5')

    plt.plot(history.history['loss'], label='Loss')
    plt.plot(history.history['val_loss'], label='Validation loss')
    plt.legend()
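# Hedged sketch, not the original listing: the fragment above is only the tail of
# build_model(conf). A compatible front half would typically create the Input tensor
# and a Conv1D stack over an embedding. Everything below (layer sizes, kernel widths,
# MAX_SEQUENCE_LENGTH, conf['embedding_matrix']) is an assumption for illustration only.
import tensorflow as tf

MAX_SEQUENCE_LENGTH = 100  # assumed padded sequence length


def build_model_front_sketch(conf):
    input = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,))
    # frozen embedding layer initialised from a pre-computed matrix (assumed key)
    x = tf.keras.layers.Embedding(input_dim=conf['embedding_matrix'].shape[0],
                                  output_dim=conf['embedding_matrix'].shape[1],
                                  weights=[conf['embedding_matrix']],
                                  trainable=False)(input)
    x = tf.keras.layers.Conv1D(128, 3, activation='relu')(x)
    x = tf.keras.layers.MaxPooling1D(3)(x)
    x = tf.keras.layers.Conv1D(128, 3, activation='relu')(x)
    # ...from here the tail shown above would apply GlobalMaxPooling1D,
    # the Dense head, and model.compile().
    return input, x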
# all tweets for a user are stored in a single vector (a string at the moment)
depression_lengths = []
control_lengths = []
FILE_STRINGS = []
FILE_STRINGS_CONTROL = []
count = 0
for files in FILE_NAMES_DEPRESSION:
    string_user = []
    for file in files:
        all_lines = ""
        with open(file, 'r', encoding="utf-8") as f:
            for line in f:
                all_lines += line
        pre_processed_tokens = ppc.pre_process_data(all_lines)
        for ppt in pre_processed_tokens:
            string_user.append(ppt)
    if len(string_user) < 1000:
        print("User Omitted: Too Few Records")
        continue
    if len(string_user) < MAX_TENSOR_LENGTH:
        depression_lengths.append(len(string_user))
        string_user += [''] * (MAX_TENSOR_LENGTH - len(string_user))
    count += 1
    FILE_STRINGS.append(string_user)
    if count > 999:
        break
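# Hedged, illustrative sketch only: one plausible use of the collected length lists is
# to check how much padding MAX_TENSOR_LENGTH implies. numpy and the summary statistics
# below are an assumption, not part of the original script.
import numpy as np

all_lengths = depression_lengths + control_lengths
if all_lengths:
    print("mean tokens per user:", np.mean(all_lengths))
    print("95th percentile of tokens per user:", np.percentile(all_lengths, 95))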
FILE_STRINGS = []
FILE_STRINGS_CONTROL = []
count = 0
for files in FILE_NAMES_DEPRESSION:
    for file in files:
        with open(file, 'r', encoding="utf-8") as f:
            for line in f:
                # keep lines that look like real tweets: longer than 5 characters
                # and shorter than the 280-character tweet limit
                if 5 < len(line) < 280:
                    # pre-process the lines as they come in
                    pps = ppc.pre_process_data(line)
                    if len(pps) > 1:
                        # pad each tweet out to 280 tokens
                        pps += [''] * (280 - len(pps))
                        TWEETS_DEPRESSION.append(pps)
                        count += 1
                if DEBUG_MODE == 1 and count > 10000:
                    break

count = 0
print('\n')
print("Importing Control Users")
for files in FILE_NAMES_CONTROL:
    for file in files: