@classmethod
def annotate_tweets_ds(
        cls,
        dict_ds: dict,
        store: bool = False,
        dirpath: str = (r"..\..\Datasets\twitter-sars-cov-2"
                        "\\annotated\\")) -> dict:
    # TODO: find a way to check store once not twice without recoding
    # If we want to store the results and the provided directory is
    # invalid, then exit
    if store:
        if not dtls.isready_dirpath(dirpath):
            raise IOError(
                f"Provided storage destination '{dirpath}' is not valid")
    # Either we don't want to store, or the storage destination is valid
    for k, ds in dict_ds.items():
        ds["five_g"] = cls.label_tweets(ds.text, mode=cls.G5)
        logger.info(f"Dataset {k} annotated with 5G labels")
        if store:
            fpath = f"{dirpath}\\annotated_{k}"
            ds.to_csv(path_or_buf=fpath, sep=",",
                      index=False, encoding="utf-8")
            logger.info(f"Annotated dataset {k} saved to disk")
    return dict_ds
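# A minimal usage sketch for annotate_tweets_ds, mirroring how main() calls
# it elsewhere in this module. The CSV path is a placeholder, not a real
# dataset location; ConspiracyDetector and DataTools come from this project.
dfs = {"200201": DataTools.load_tweets_ds(csv_fpath="path/to/tweets.csv",
                                          already_pruned=True,
                                          hydrator_file=True,
                                          remove_retweets=False)}
# Adds a "five_g" label column to each dataframe; store=False skips disk I/O
dfs = ConspiracyDetector.annotate_tweets_ds(dict_ds=dfs, store=False)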
def main():
    # Prepare args
    args = getArgs()
    print("Train CUDA: {}, Test CUDA: {}, Batch Size: {}".format(
        args.use_cuda_train, args.use_cuda_test, args.batch_size))

    # Init RNG seed and model
    torch.manual_seed(args.seed)
    model = Net()

    # Load data
    with utils.MeasureBlockTime("Data loading (s): "):
        train_loader, test_loader = DataTools.getDataLoaders(args)
    print("Train: {}, Test: {}, Train batches: {}, Test batches: {}".format(
        len(train_loader.dataset), len(test_loader.dataset),
        len(train_loader), len(test_loader)))

    train_test = TrainTest(model, args.lr, args.momentum,
                           args.device_train, args.device_test)
    probe = DebugProbe(train_test, "test_exp")

    # Train model
    train_test.train_model(args.epochs, train_loader, test_loader)
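# A minimal sketch of what utils.MeasureBlockTime could look like: a context
# manager that prints elapsed wall-clock time for the block it wraps. The
# real utils module is not shown here, so this is an assumption based on how
# it is used above, not the project's actual implementation.
import time


class MeasureBlockTime:
    def __init__(self, label: str):
        self.label = label

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Print the label followed by the elapsed time in seconds
        print(self.label, time.perf_counter() - self.start)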
def __init__(self):
    dt = DataTools()
    (self.ys, self.tokenMatrix, self.positionMatrix1,
     self.positionMatrix2, self.sdpMatrix) = dt.get_data("pkl/test.pkl.gz")
args = parse()
print(args)

args.lr_base = args.lr
args.betas = (args.beta1, args.beta2)

os.makedirs(join('output', args.experiment_name), exist_ok=True)
os.makedirs(join('output', args.experiment_name, 'checkpoint'),
            exist_ok=True)
os.makedirs(join('output', args.experiment_name, 'sample_training'),
            exist_ok=True)
with open(join('output', args.experiment_name, 'setting.txt'), 'w') as f:
    f.write(json.dumps(vars(args), indent=4, separators=(',', ':')))

data_tools = DataTools()
transformed_training_dataset = data_tools.transformed_training_data

# Load training data in batches
batch_size = args.batch_size
num_workers = args.num_workers
train_loader = DataLoader(transformed_training_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=num_workers)

# Define the loss and optimizer
lr = args.lr
criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(net.parameters(), lr=lr)
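# A minimal sketch of the training loop these objects feed. Note that `net`
# is referenced above but never defined in this snippet; `args.epochs` and
# the batch keys ('image', 'keypoints') are likewise assumptions made for
# illustration, not part of the original script.
for epoch in range(args.epochs):
    running_loss = 0.0
    for batch in train_loader:
        images, keypoints = batch['image'], batch['keypoints']

        optimizer.zero_grad()                    # reset accumulated gradients
        loss = criterion(net(images), keypoints)
        loss.backward()                          # backprop through SmoothL1Loss
        optimizer.step()                         # Adam update

        running_loss += loss.item()
    print(f"epoch {epoch}: mean loss {running_loss / len(train_loader):.4f}")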
class TopLevel(QWidget):
    """
    Main window containing all GUI components.
    """

    #
    # Used to clean up background worker thread(s) (on exit)
    #
    class Exit(QObject):
        exitClicked = pyqtSignal()

    def closeEvent(self, event):
        self.close_event.exitClicked.emit()

    def __init__(self):
        super().__init__()
        self.close_event = self.Exit()

        # Thread cleanup on exit
        self.close_event.exitClicked.connect(self.stop_background_workers)

        # Configurable
        self.worker_check_period = 1  # seconds

        self.init_ui()

    def init_online_pred_tab(self):
        """
        Initializes UI elements in the "Online Predictions" tab

        :return: (QWidget) online_pred_tab
        """
        online_pred_tab = QWidget()
        return online_pred_tab

    def init_ui(self):
        """
        Initializes the top-level tab widget and all sub-tabs
        ("Data", "Training", "Testing")
        """
        self.setGeometry(0, 0, 1100, 800)
        self.setWindowTitle('Myo Tools')
        self.setObjectName("TopWidget")
        self.setStyleSheet("#TopWidget {background-color: white;}")

        #
        # Top-level layout
        #
        tools_layout = QVBoxLayout()
        self.tool_tabs = QTabWidget()

        # Fancy styling
        tab_widgets = self.tool_tabs.findChild(QStackedWidget)
        tab_widgets.setObjectName("TabWidgets")
        tools_layout.addWidget(self.tool_tabs)
        top_tabs = self.tool_tabs.findChild(QTabBar)
        top_tabs.setObjectName("TopTabs")
        self.tool_tabs.setStyleSheet(
            "QTabBar#TopTabs::tab {font-weight: bold; height: 35px;"
            " width: 150px; border-radius: 3px;"
            " border: 2px solid #bbbbbb; background-color: #dddddd;}"
            "QStackedWidget#TabWidgets {background-color: #eeeeee;}")

        self.tool_tabs.currentChanged.connect(self.on_tab_changed)
        self.cur_index = 0

        self.data_tools_tab = DataTools(self.on_device_connected,
                                        self.on_device_disconnected,
                                        self.is_data_tools_open)
        self.online_training_tab = OnlineTraining(
            self.data_tools_tab.data_collected)
        self.online_pred_tab = OnlineTesting(
            self.data_tools_tab.data_collected)

        self.tool_tabs.addTab(self.data_tools_tab, "Data Collection")
        self.tool_tabs.addTab(self.online_training_tab, "Online Training")
        self.tool_tabs.addTab(self.online_pred_tab, "Online Predictions")

        self.setLayout(tools_layout)
        self.show()

    def is_data_tools_open(self):
        return self.cur_index == 0

    def on_device_connected(self, address, rssi, battery_level):
        """
        Called on a user-initiated connection

        :param address: MAC address of connected Myo device
        :param rssi: Signal strength of the connected Myo device
        :param battery_level: Battery level of the connected Myo device
        """
        self.online_pred_tab.device_connected(address, rssi, battery_level)
        self.online_training_tab.device_connected(address, rssi,
                                                  battery_level)

    def on_device_disconnected(self, address):
        """
        Called on a user-initiated disconnect, or an unexpected disconnect

        :param address: MAC address of disconnected Myo device
        """
        self.online_pred_tab.device_disconnected(address)
        self.online_training_tab.device_disconnected(address)

    def on_tab_changed(self, value):
        """
        Intercepts a user attempting to switch tabs (to ensure a valid tab
        switch is taking place)

        :param value: Desired tab index to switch to
        """
        if self.cur_index == value:
            return

        valid_switch = False

        #
        # Determine if we can switch
        #
        data_tool_idx = 0
        online_train_idx = 1
        online_pred_idx = 2

        if self.cur_index == data_tool_idx:
            #
            # Check for incomplete Myo search workers
            #
            waiting_on_search = False
            for worker in self.data_tools_tab.search_threads:
                if not worker.complete:
                    waiting_on_search = True
                    break

            if not waiting_on_search:
                #
                # Check for background data workers
                #
                # worker_running = False
                # num_widgets = self.data_tools_tab.ports_found.count()
                #
                # for idx in range(num_widgets):
                #     # Ignore port widgets (only interested in Myo device rows)
                #     list_widget = self.data_tools_tab.ports_found.item(idx)
                #     if hasattr(list_widget, "port_idx"):
                #         continue
                #
                #     myo_widget = self.data_tools_tab.ports_found.itemWidget(list_widget)
                #     if not (myo_widget.worker is None):
                #         if not myo_widget.worker.complete:
                #             worker_running = True
                #             break
                worker_running = False

                if not worker_running:
                    #
                    # Close the background video worker if appropriate
                    #
                    # if not self.data_tools_tab.gt_helper_open:
                    #     if not (self.data_tools_tab.gt_helper.worker is None):
                    #         self.data_tools_tab.gt_helper.stop_videos()
                    #
                    #         while not self.data_tools_tab.gt_helper.worker.complete:
                    #             time.sleep(self.worker_check_period)
                    #
                    #     #
                    #     # If we make it here, the switch is valid (for the
                    #     # case of the data tools tab)
                    #     #
                    #     valid_switch = True
                    # else:
                    #     self.warn_user("Please close GT Helper first.")
                    valid_switch = True
                else:
                    self.warn_user(
                        "Please close connection to Myo devices first.")
            else:
                self.warn_user(
                    "Please wait for Myo device search to complete first.")

        #
        # To control switching out of online training / testing
        #
        elif self.cur_index == online_train_idx:
            valid_switch = True
        elif self.cur_index == online_pred_idx:
            valid_switch = True

        if valid_switch:
            self.cur_index = value
        else:
            self.tool_tabs.setCurrentIndex(self.cur_index)

    def stop_background_workers(self):
        """
        Called on (user click-initiated) exit of the main window.
        """
        self.data_tools_tab.stop_data_tools_workers()

    def warn_user(self, message):
        """
        Generates a pop-up warning message

        :param message: The text to display
        """
        self.warning = QErrorMessage()
        self.warning.showMessage(message)
        self.warning.show()
def __init__(self):
    dt = DataTools()
    (self.ys, self.tokenMatrix, self.positionMatrix1,
     self.positionMatrix2, self.sdpMatrix) = dt.get_data(
        "./pkl/final_test.pkl.gz")
    print(self.ys.shape[0])
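# A minimal sketch of what DataTools.get_data might do, inferred from how it
# is used in these snippets (it always yields five arrays from a *.pkl.gz
# file). The real DataTools implementation is not shown; the tuple layout
# inside the pickle is an assumption.
import gzip
import pickle


class DataTools:
    def get_data(self, pkl_path):
        # Assumed on-disk layout: a single pickled tuple of
        # (ys, tokenMatrix, positionMatrix1, positionMatrix2, sdpMatrix)
        with gzip.open(pkl_path, "rb") as f:
            return pickle.load(f)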
def __init__(self):
    self.dt = DataTools()
class KejModel(object):
    WORD_DIM = 250
    CLASS_DIM = 30
    NUM_CLASSES = 11
    MAX_SENTENCE_LEN = 100
    multiple = 1.0
    dt = None
    relationsMapping = {
        'other': 0, 'locaA': 1, 'locAa': 2, 'med-ill': 3, 'ill-med': 4,
        "clsaA": 5, "clsAa": 6, "w-c": 7, "c-w": 8, "cs-ef": 9, "ef-cs": 10
    }
    idx2relation = {
        0: 'other', 1: 'locaA', 2: 'locAa', 3: 'med-ill', 4: 'ill-med',
        5: "clsaA", 6: "clsAa", 7: "w-c", 8: "c-w", 9: "cs-ef", 10: "ef-cs"
    }

    def __init__(self):
        self.dt = DataTools()

    def load_kej_model(self, modelpath):
        model = load_model(modelpath)
        model.summary()  # summary() prints directly; no need to wrap in print()
        return model

    def build_new_model(self, n_out, max_sentence_len, max_position,
                        embedding, printflag=False):
        print("build new model")
        num_filter = 140
        filter_length = 3
        position_dims = 80
        print(max_sentence_len)

        words_input = Input(shape=(max_sentence_len,), dtype='int32',
                            name='words_input')
        # words = Embedding(70000, self.WORD_DIM)(words_input)
        words = embedding(words_input)
        # if printflag:
        #     print(words)

        # sdp_input = Input(shape=(max_sentence_len,), dtype='float32',
        #                   name='sdp_input')
        distance1_input = Input(shape=(max_sentence_len,), dtype='int32',
                                name='distance1_input')
        distance1 = Embedding(max_position, position_dims)(distance1_input)
        print(distance1)
        distance2_input = Input(shape=(max_sentence_len,), dtype='int32',
                                name='distance2_input')
        distance2 = Embedding(max_position, position_dims)(distance2_input)
        print(distance2)

        output = concatenate([words, distance1, distance2])
        output = Convolution1D(filters=num_filter,
                               kernel_size=filter_length,
                               padding='same',
                               activation='relu',
                               strides=1)(output)
        print(output)
        output = GlobalMaxPooling1D()(output)
        print(output)
        output = Dropout(0.25)(output)
        output = Dense(50, activation='relu')(output)
        output = Dropout(0.25)(output)
        output = Dense(n_out, activation='softmax')(output)

        # model = Model(inputs=[sdp_input, words_input, distance1_input,
        #                       distance2_input], outputs=[output])
        model = Model(inputs=[words_input, distance1_input, distance2_input],
                      outputs=[output])
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='Adam',
                      metrics=['accuracy'])
        return model

    def train_model(self, tosave, savepath, num_epoch=20, batch_size=64,
                    load=True):
        ys, tokenMatrix, positionMatrix1, positionMatrix2, sdpMatrix = \
            self.dt.get_data('pkl/train.pkl.gz')
        n_out = max(ys) + 1
        max_sentence_len = tokenMatrix.shape[1]
        print(max_sentence_len)
        max_position = max(np.max(positionMatrix1),
                           np.max(positionMatrix2)) + 1

        if load:
            model = load_model('model/kej_model.h5')
        else:
            genmodel = GenModels.Word2Vec.load("w2vmodel/word2vec2.model")
            embedding = genmodel.wv.get_keras_embedding(False)
            model = self.build_new_model(n_out, max_sentence_len,
                                         max_position, embedding)

        print("Start training")
        max_prec, max_rec, max_acc, max_f1 = 0, 0, 0, 0
        # for epoch in range(num_epoch):
        model.fit([tokenMatrix, positionMatrix1, positionMatrix2], ys,
                  batch_size=batch_size, verbose=True, epochs=num_epoch)

        if tosave:
            model.save(savepath)
            print("Model saved successfully")

    def predict_classes(self, prediction):
        return prediction.argmax(axis=-1)

    def model_predict(self, testpkl_path, model_path='model/kej_model.h5'):
        ys, tokenMatrix, positionMatrix1, positionMatrix2, sdpMatrix = \
            self.dt.get_data(testpkl_path)
        model = load_model(model_path)
        pred_test = self.predict_classes(
            model.predict([tokenMatrix, positionMatrix1, positionMatrix2],
                          verbose=False))
        acc = np.sum(pred_test == ys) / float(len(ys))
        print("Number of test samples: " + str(len(ys)))
        print("Accuracy: " + str(acc))
        print("Misclassified examples:")
        wrong_outputs = []
        for i in range(len(ys)):
            if pred_test[i] != ys[i]:
                wrong_outputs.append(
                    str(i) + " " + self.idx2relation[ys[i]] + " " +
                    self.idx2relation[pred_test[i]])
        for item in wrong_outputs:
            print(item)

    def model_predict_one(self, model, relationidx, positionMatrix1,
                          positionMatrix2, tokenMatrix):
        pred_test = self.predict_classes(
            model.predict([tokenMatrix, positionMatrix1, positionMatrix2],
                          verbose=False))
        print(self.idx2relation[pred_test[0]])
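# A minimal usage sketch for KejModel. The pickle and model paths reuse the
# defaults hard-coded in the class; the actual driver script is not shown.
if __name__ == "__main__":
    km = KejModel()
    # Build a fresh network from the word2vec embedding (load=False),
    # train it, and save the weights
    km.train_model(tosave=True, savepath='model/kej_model.h5',
                   num_epoch=20, batch_size=64, load=False)
    # Evaluate on a held-out test pickle and print misclassified examples
    km.model_predict('pkl/test.pkl.gz', model_path='model/kej_model.h5')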
def main():
    logger.info("Main Module Start")
    # Location where the original tweets are
    master_folder = f"{datasets_folder}\\pruned"

    if prune_tweets:
        # If the tweet CSVs still contain retweets, clean them
        logger.info("You've chosen to prune the files. Pruning..")
        # Make a dictionary of <filepath, whether it uses the Hydrator schema>
        dpaths = {}
        dpaths["200201"] = ((f"{datasets_folder}\\ids_2020-02-01\\"
                             "Rehydrate_tweets_2020-02-01.csv"), True)
        dpaths["200215"] = ((f"{datasets_folder}\\ids_2020-02-15\\"
                             "tweets_2020-02-15.csv"), True)
        dpaths["200301"] = ((f"{datasets_folder}\\ids_2020-03-01\\"
                             "rehydrated_tweets_20200301.csv"), False)
        dpaths["200401"] = ((f"{datasets_folder}\\ids_2020-04-01\\"
                             "rehydrated_tweets_20200401.csv"), True)
        dpaths["200501"] = ((f"{datasets_folder}\\ids_2020-05-01\\"
                             "tweets_20200501.csv"), True)
        dpaths["200315"] = ((f"{datasets_folder}\\ids_2020-03-15\\"
                             "tweets_2020-03-15.csv"), True)
        dpaths["200415"] = ((f"{datasets_folder}\\ids_2020-04-15\\"
                             "tweets_2020-04-15.csv"), True)
        # Add the older tweets
        # Keys are 7 chars long and for older datasets
        # dpaths["200201o"] = ((f"{datasets_folder}\\ids_2020-02-01\\"
        #                       "tweets_20200201.csv"), True)
        # dpaths["200301o"] = ((f"{datasets_folder}\\ids_2020-03-01\\"
        #                       "tweets_20200301.csv"), True)

        try:
            DataTools.prune_retweets_clean_to_csv(csv_files=dpaths,
                                                  dirpath=master_folder,
                                                  only_eng=True)
        except Exception:
            logger.exception("exception raised")
        logger.info("Pruning phase ended.")

    if not resave_5g_tagged_tweets:
        exit(0)

    # Append pruned to the master path and proceed:
    dfs = {}
    # Load the four datasets for pruning and saving
    with DataTools.scan_directory(master_folder) as docs:
        for doc in docs:
            dfs[doc.name] = DataTools.load_tweets_ds(csv_fpath=doc.path,
                                                     already_pruned=True,
                                                     hydrator_file=True,
                                                     remove_retweets=False)
            logger.info(f"File {doc.name} loaded into a dataframe")

    # Annotate tweets with 5G labels in five_g columns
    logger.info("Annotating tweets with 5G labels")
    dfs = ConspiracyDetector.annotate_tweets_ds(
        dict_ds=dfs, store=resave_5g_tagged_tweets)
    logger.info("Annotation of 5G finished")

    logger.info("Main Module End.")
    logger.shutdown()
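# A minimal sketch of what DataTools.scan_directory could be, given that
# main() uses it as a context manager yielding entries with .name and .path
# attributes. os.scandir already satisfies both requirements, so a thin
# wrapper is plausible; the real implementation is not shown.
import os


class DataTools:
    @staticmethod
    def scan_directory(dirpath: str):
        # os.scandir returns an iterator of os.DirEntry objects that also
        # works as a context manager, matching `with ... as docs:` above
        return os.scandir(dirpath)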
class KejModel(object):
    WORD_DIM = 250
    CLASS_DIM = 30
    NUM_CLASSES = 11
    MAX_SENTENCE_LEN = 100
    multiple = 1.0
    dt = None
    relationsMapping = {
        'other': 0, 'locaA': 1, 'locAa': 2, 'med-ill': 3, 'ill-med': 4,
        "clsaA": 5, "clsAa": 6, "w-c": 7, "c-w": 8, "cs-ef": 9, "ef-cs": 10
    }
    idx2relation = {
        0: 'other', 1: 'locaA', 2: 'locAa', 3: 'med-ill', 4: 'ill-med',
        5: "clsaA", 6: "clsAa", 7: "w-c", 8: "c-w", 9: "cs-ef", 10: "ef-cs"
    }

    def __init__(self):
        self.dt = DataTools()

    def load_kej_model(self, modelpath):
        return load_model(modelpath)

    def build_new_model(self, n_out, max_sentence_len, max_position,
                        embedding, printflag=False):
        print("build new model")
        num_filter = 140
        filter_length = 3
        position_dims = 80
        print(max_sentence_len)

        words_input = Input(shape=(max_sentence_len,), dtype='int32',
                            name='words_input')
        # words = Embedding(70000, self.WORD_DIM)(words_input)
        words = embedding(words_input)
        # if printflag:
        #     print(words)

        sdp_input = Input(shape=(max_sentence_len,), dtype='float32',
                          name='sdp_input')
        print(sdp_input)

        # Class-attention weights: U projects word vectors into the class
        # embedding space; classes_matrix holds the class embeddings
        # K.variable(K.random_uniform(self.))
        init_att = math.sqrt(6.0 / (self.WORD_DIM + self.CLASS_DIM))
        U = K.variable(K.random_uniform([self.WORD_DIM, self.CLASS_DIM],
                                        minval=-init_att,
                                        maxval=init_att,
                                        dtype=tf.float32),
                       name="U")
        classes_matrix = K.variable(K.random_uniform(
            [self.CLASS_DIM, self.NUM_CLASSES], dtype=tf.float32),
            name="classmatrix")
        G = tf.matmul(tf.reshape(words, [-1, self.WORD_DIM]), U, name="G")
        print(G)
        G = tf.reshape(tf.matmul(G, classes_matrix),
                       [-1, self.MAX_SENTENCE_LEN, self.NUM_CLASSES],
                       name="G")
        print(G)

        init_m = math.sqrt(6.0 / (self.MAX_SENTENCE_LEN + self.NUM_CLASSES))
        M = tf.Variable(tf.random_uniform(
            [self.MAX_SENTENCE_LEN, self.NUM_CLASSES],
            minval=-init_m, maxval=init_m, dtype=tf.float32),
            name="M")
        alpha = tf.expand_dims(tf.matmul(sdp_input, M), axis=1, name="alpha")
        print(alpha)
        alpha = tf.matmul(alpha, tf.transpose(G, [0, 2, 1]), name="alpha")
        print(alpha)
        alpha = tf.nn.l2_normalize(tf.squeeze(alpha, axis=1), axis=-1,
                                   name="alpha")
        print(alpha)
        # Mix the attention scores back into the SDP weights, then weight
        # each word vector by its (diagonalized) attention value
        alpha_v = tf.add(sdp_input, tf.scalar_mul(self.multiple, alpha),
                         name="alpha_v")
        alpha = tf.matrix_diag(alpha_v, name="alpha")
        print(alpha)
        # NOTE: weighted_data is computed here but not used in the
        # concatenation below; the plain `words` tensor is used instead
        weighted_data = tf.matmul(alpha, words, name="weighted_data")
        print(weighted_data)

        distance1_input = Input(shape=(max_sentence_len,), dtype='int32',
                                name='distance1_input')
        distance1 = Embedding(max_position, position_dims)(distance1_input)
        print(distance1)
        distance2_input = Input(shape=(max_sentence_len,), dtype='int32',
                                name='distance2_input')
        distance2 = Embedding(max_position, position_dims)(distance2_input)
        print(distance2)

        # my_concat = Lambda(lambda x: K.concatenate([x[0], x[1], x[2], x[3]], axis=-1))
        # output = my_concat([words, distance1, distance2, alpha])
        outputtemp = tf.concat([words, distance1, distance2], axis=2)
        # outputtemp = concatenate([words, distance1, distance2])
        print(outputtemp)

        output = Convolution1D(filters=num_filter,
                               kernel_size=filter_length,
                               padding='same',
                               activation='relu',
                               strides=1)(outputtemp)
        print(output)
        output = GlobalMaxPooling1D()(output)
        print(output)
        output = Dropout(0.25)(output)
        output = Dense(50, activation='relu')(output)
        output = Dropout(0.25)(output)
        output = Dense(n_out, activation='softmax')(output)

        model = Model(
            inputs=[sdp_input, words_input, distance1_input, distance2_input],
            outputs=[output])
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='Adam',
                      metrics=['accuracy'])
        return model

    def train_model(self, tosave, savepath, num_epoch=20, batch_size=64,
                    load=True):
        ys, tokenMatrix, positionMatrix1, positionMatrix2, sdpMatrix = \
            self.dt.get_data('pkl/train2.pkl.gz')
        n_out = max(ys) + 1
        max_sentence_len = tokenMatrix.shape[1]
        print(max_sentence_len)
        max_position = max(np.max(positionMatrix1),
                           np.max(positionMatrix2)) + 1

        if load:
            model = load_model('model/kej_model.h5')
        else:
            genmodel = GenModels.Word2Vec.load("w2vmodel/word2vec2.model")
            embedding = genmodel.wv.get_keras_embedding(False)
            model = self.build_new_model(n_out, max_sentence_len,
                                         max_position, embedding)

        print("Start training")
        max_prec, max_rec, max_acc, max_f1 = 0, 0, 0, 0
        # for epoch in range(num_epoch):
        model.fit([sdpMatrix, tokenMatrix, positionMatrix1, positionMatrix2],
                  ys, batch_size=batch_size, verbose=True, epochs=num_epoch)

        if tosave:
            model.save(savepath)
            print("Model saved successfully")

    def predict_classes(self, prediction):
        return prediction.argmax(axis=-1)

    def model_predict(self, testpkl_path, model_path='model/kej_model.h5'):
        ys, tokenMatrix, positionMatrix1, positionMatrix2, sdpMatrix = \
            self.dt.get_data(testpkl_path)
        model = load_model(model_path)
        # Input order must match the fit() call in train_model above
        pred_test = self.predict_classes(
            model.predict(
                [sdpMatrix, tokenMatrix, positionMatrix1, positionMatrix2],
                verbose=False))
        acc = np.sum(pred_test == ys) / float(len(ys))
        print("Number of test samples: " + str(len(ys)))
        print("Accuracy: " + str(acc))
        print("Misclassified examples:")
        wrong_outputs = []
        for i in range(len(ys)):
            if pred_test[i] != ys[i]:
                wrong_outputs.append(
                    str(i) + " " + self.idx2relation[ys[i]] + " " +
                    self.idx2relation[pred_test[i]])
        for item in wrong_outputs:
            print(item)

    def model_predict_one(self, model, relationidx, positionMatrix1,
                          positionMatrix2, tokenMatrix):
        pred_test = self.predict_classes(
            model.predict([tokenMatrix, positionMatrix1, positionMatrix2],
                          verbose=False))
        print(self.idx2relation[pred_test[0]])
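# A shape-level sketch of the attention computation in build_new_model,
# written in plain NumPy to make the tensor algebra explicit. Values are
# random; only the shapes and the order of operations mirror the code above.
import numpy as np

batch, L, D, C, Kc = 2, 100, 250, 11, 30   # batch, MAX_SENTENCE_LEN, WORD_DIM, NUM_CLASSES, CLASS_DIM
words = np.random.rand(batch, L, D)        # embedded sentence
sdp = np.random.rand(batch, L)             # shortest-dependency-path weights
U = np.random.rand(D, Kc)
classes = np.random.rand(Kc, C)
M = np.random.rand(L, C)

G = (words.reshape(-1, D) @ U @ classes).reshape(batch, L, C)  # (batch, L, C)
alpha = np.expand_dims(sdp @ M, 1)                             # (batch, 1, C)
alpha = alpha @ G.transpose(0, 2, 1)                           # (batch, 1, L)
alpha = np.squeeze(alpha, 1)
alpha /= np.linalg.norm(alpha, axis=-1, keepdims=True)         # l2-normalize
alpha_v = sdp + 1.0 * alpha                                    # multiple = 1.0
weighted = np.einsum('bl,bld->bld', alpha_v, words)            # diag(alpha_v) @ words
print(weighted.shape)                                          # (2, 100, 250)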