import pandas as pd
import joblib  # on older scikit-learn: from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# preprocess, Worker and SynonymReplacer are project-local helpers.

def random_forest_generator(data_filename, replacer, train_ratio=0.8):
    # initialize variables
    data = pd.read_csv(data_filename)
    data_labels = data['state'].tolist()
    data_texts = data['content']
    bow_transformer = CountVectorizer()
    # vectorize train data
    word_list = preprocess(data_texts, replacer)
    vectorized_data = bow_transformer.fit_transform(word_list)
    tfidf_transformer = TfidfTransformer().fit(vectorized_data)
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    vectorized_data_list = tfidf_transformer.transform(vectorized_data).toarray()
    all_data = worker.sliceData(vectorized_data_list, data_labels, train_ratio)
    train_data = all_data['train_data']
    train_labels = all_data['train_labels']
    test_data = all_data['test_data']
    test_labels = all_data['test_labels']
    # build model
    forest = RandomForestClassifier(n_estimators=30, n_jobs=-1)
    forest = forest.fit(train_data, train_labels)
    pred = forest.predict(test_data)
    accuracy = worker.calculate_result(test_labels, pred)
    # save data and model
    joblib.dump(forest, './var/model/forest_0', compress=3)
    joblib.dump(bow_transformer, './var/model/bow_0', compress=3)
    joblib.dump(tfidf_transformer, './var/model/tfidf_0', compress=3)
    return forest
def logistic_regression_generator(data_filename, replacer, train_ratio=0.8):
    data = pd.read_csv(data_filename)
    data_labels = data['state'].tolist()
    data_texts = data['content']
    bow_transformer = CountVectorizer()
    # vectorize train data
    word_list = preprocess(data_texts, replacer)
    vectorized_data = bow_transformer.fit_transform(word_list)
    tfidf_transformer = TfidfTransformer().fit(vectorized_data)
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    vectorized_data_list = tfidf_transformer.transform(vectorized_data).toarray()
    all_data = worker.sliceData(vectorized_data_list, data_labels, train_ratio)
    train_data = all_data['train_data']
    train_labels = all_data['train_labels']
    test_data = all_data['test_data']
    test_labels = all_data['test_labels']
    # build model
    lr = LogisticRegression(C=30, penalty='l2', random_state=0)
    lr.fit(train_data, train_labels)
    pred = lr.predict(test_data)
    print("generator prediction:")
    accuracy = worker.calculate_result(test_labels, pred)
    # save data and model
    joblib.dump(lr, './var/model/lr_0', compress=3)
    joblib.dump(bow_transformer, './var/model/bow_0', compress=3)
    joblib.dump(tfidf_transformer, './var/model/tfidf_0', compress=3)
    return lr
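# Usage sketch (illustrative): both generators share the same
# load -> preprocess -> vectorize -> split -> fit -> persist pipeline and
# differ only in the estimator. Assumes a CSV with 'state' and 'content'
# columns; the synonyms.csv path is an assumption, not something the code
# above guarantees.
def _demo_train_models():
    replacer = SynonymReplacer()
    replacer.addSynDict_from_csv('./var/model/synonyms.csv')  # assumed path
    forest = random_forest_generator('./var/model/data.csv', replacer)
    lr = logistic_regression_generator('./var/model/data.csv', replacer, train_ratio=0.7)
    return forest, lr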
def test_with_unlabeled_state(data_filename, replacer, classifier, bow, tfidf):
    data = pd.read_csv(data_filename)
    bow_transformer = joblib.load(bow)
    tfidf_transformer = joblib.load(tfidf)
    classifier = joblib.load(classifier)
    data_texts = data['content']
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    word_list = preprocess(data_texts, replacer)
    predict_data = worker.classifier_predict(word_list, classifier)
    predict_data.to_csv('predict_data.csv')
def test_with_labeled_state(data_filename, replacer, classifier, bow, tfidf):
    data = pd.read_csv(data_filename)
    bow_transformer = joblib.load(bow)
    tfidf_transformer = joblib.load(tfidf)
    classifier = joblib.load(classifier)
    data_labels = data['state'].tolist()
    data_texts = data['content']
    # preprocess data
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    word_list = preprocess(data_texts, replacer)
    text_vectors_list = worker.content_to_vectors(word_list)
    # test model
    pred = classifier.predict(text_vectors_list)
    worker.calculate_result(data_labels, pred)
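# Usage sketch (illustrative): the test helpers load the artifacts persisted
# by the generators instead of retraining. The model paths follow the
# joblib.dump calls above; both CSV filenames are assumptions.
def _demo_test_models(replacer):
    # Score a labeled hold-out set against the persisted random forest.
    test_with_labeled_state('./var/model/model_data.csv', replacer,
                            './var/model/forest_0', './var/model/bow_0',
                            './var/model/tfidf_0')
    # Predict states for unlabeled comments; writes predict_data.csv.
    test_with_unlabeled_state('./var/model/unlabeled.csv', replacer,
                              './var/model/forest_0', './var/model/bow_0',
                              './var/model/tfidf_0')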
def get_idle_worker():
    doc = es.search_one_document(w_idx, es.missing_field_query("current_task_id"))
    if doc:
        worker = Worker.from_es_data(doc['_source'])
        worker_id = doc['_id']
        return worker, worker_id
    return None, None
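# es.missing_field_query is project-local and not shown here. In standard
# Elasticsearch DSL, "field is absent" is expressed as a bool query with
# must_not/exists, so a plausible sketch (not necessarily the project's
# implementation) is:
def missing_field_query(field):
    # Matches documents where `field` does not exist.
    return {"query": {"bool": {"must_not": {"exists": {"field": field}}}}}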
def main():
    print("Content-type: text/html\n\n")
    cgitb.enable()
    # process input
    form = cgi.FieldStorage()
    xml_feedback = form.getfirst('xmlFeedback', 'None')
    token_number = form.getfirst('token', 'None')
    # open database
    session = get_local_db_session()
    try:
        token = session.query(Token).filter_by(number=token_number).one()
    except NoResultFound:
        print("KO")
        return
    # insert the retrieved dialogueID into the xmlFeedback
    xml_feedback = xml_feedback.replace(
        "<dialogueId></dialogueId>",
        "<dialogueId>" + token.submission.dialogue_id + "</dialogueId>")
    worker_id = get_worker_id(xml_feedback)
    phone = token.submission.dialogue_id.rsplit("-", 1)[1]
    try:
        curr_worker = session.query(Worker).filter_by(worker_id=worker_id).one()
    except NoResultFound:
        curr_worker = Worker(worker_id=worker_id, phone_number=phone)
        session.add(curr_worker)
    # save submitted data
    submission = token.submission
    submission.data = xml_feedback
    submission.worker = curr_worker
    submission.timestamp = datetime.datetime.now()
    # free token
    token.submission = None
    session.commit()
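# get_worker_id is referenced above but not defined in this section. A minimal
# sketch, assuming the submitted XML carries the id in a <workerId> element
# (the tag name is an assumption):
import xml.etree.ElementTree as ET

def get_worker_id(xml_feedback):
    # Returns the text of the first <workerId> element, or None if absent.
    root = ET.fromstring(xml_feedback)
    node = root.find('.//workerId')
    return node.text if node is not None else None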
def job(csv_filename=csv_filename, bow_transformer=bow_transformer,
        classifier=classifier, tfidf_transformer=tfidf_transformer,
        host=host, passcode=passcode, charset=charset, usr=usr, db=db,
        time_storage=time_storage):
    # Load synonym pairs into the SynonymReplacer
    logger.info("Start to load synonym pairs to SynonymReplacer:")
    try:
        replacer = SynonymReplacer()
        replacer.addSynDict_from_csv(csv_filename)
        logger.info("Successfully load synonym pairs to SynonymReplacer.")
    except Exception as err:
        logger.error("Fail to load synonym pairs to SynonymReplacer. {}".format(err))
    error_flag = False
    logger.info("Start to load lasttime from time_storage.txt:")
    try:
        with open(time_storage, 'r') as file:
            lasttime = file.readline()
        logger.info("Successfully load lasttime from time_storage.txt.")
    except Exception as err:
        lasttime = time.strftime('%Y-%m-%d %X', time.localtime())
        logger.error("Fail to load lasttime from time_storage.txt. "
                     "Instead, set current time to be lasttime. {}".format(err))
    # Connect to the database
    logger.info("Start to connect to the database:")
    try:
        connection = pymysql.connect(host=host, user=usr, password=passcode,
                                     db=db, charset=charset,
                                     cursorclass=pymysql.cursors.DictCursor)
        logger.info("Successfully connect to the database.")
        with connection.cursor() as cursor:
            # retrieve data from the database
            newtime = str(time.strftime('%Y-%m-%d %X', time.localtime()))
            flag = cursor.execute(
                "SELECT * FROM GC_User_Comment WHERE CreatedTime >= %s AND CreatedTime < %s",
                (lasttime, newtime))
            if flag:
                Recentdata = pd.DataFrame(cursor.fetchall())
                Recentdata = Recentdata[['CMTID', 'Content', 'State', 'AuditState']]
                logger.info("Successfully retrieve {} comments from database.".format(len(Recentdata)))
                logger.info("Start to load models:")
                try:
                    classifier = joblib.load(classifier)
                    bow_transformer = joblib.load(bow_transformer)
                    tfidf_transformer = joblib.load(tfidf_transformer)
                    logger.info("Successfully load models.")
                except Exception as err:
                    logger.error("Fail to load models. {}".format(err))
                    error_flag = True
                logger.info("Start to preprocess Recentdata:")
                try:
                    data_texts = Recentdata['Content']
                    word_list = preprocess(data_texts, replacer)
                    logger.info("Successfully preprocess Recentdata.")
                except Exception as err:
                    logger.error("Fail to preprocess Recentdata. {}".format(err))
                    error_flag = True
                logger.info("Start to predict Recentdata:")
                try:
                    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
                    predict_data = worker.classifier_predict(word_list, classifier)
                    predict_data.to_csv('predict_data.csv')
                    logger.info("Successfully predict Recentdata.")
                    Recentdata['State'] = [state_classify(prob) for prob in predict_data['trash']]
                    Recentdata['AuditState'] = [auditstate_classify(prob) for prob in predict_data['trash']]
                except Exception as err:
                    logger.error("Fail to predict Recentdata. {}".format(err))
                    error_flag = True
                logger.info("Start to update State and AuditState in the database:")
                try:
                    count = 0
                    for CMTID, Content, State, AuditState in Recentdata.values:
                        cursor.execute(
                            "UPDATE GC_User_Comment SET State = %s, AuditState = %s WHERE CMTID = %s",
                            (int(State), int(AuditState), int(CMTID)))
                        count = count + 1
                    connection.commit()
                except Exception as err:
                    logger.error("Fail to update GC_User_Comment State. {0}".format(err))
                    error_flag = True
                if not error_flag:
                    # update lasttime in the txt file
                    logger.info("Start to update time in time_storage.txt:")
                    try:
                        with open(time_storage, 'w') as file:
                            file.writelines(newtime)
                        logger.info("Successfully updated time in time_storage.txt.")
                        logger.info("Successfully updated State and AuditState for {} comments.".format(count))
                    except Exception as err:
                        logger.error("Fail to update time in time_storage.txt. {}".format(err))
            else:
                logger.info("No new comment was created since the last recorded CreatedTime.")
        connection.close()
    except Exception as err:
        logger.error("Fail to connect to the database. {}".format(err))
def register_worker(platform_info, environment):
    return db.register_worker(Worker(platform_info, environment))
def get_worker_by_id(worker_id):
    doc = es.get_document_by_id(w_idx, worker_id)
    if doc:
        return Worker.from_es_data(doc['_source'])
    return None
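# Usage sketch (illustrative): the registry helpers combine into a simple
# "find-or-register" step for a dispatcher. The platform_info/environment
# dicts are placeholders chosen here, not a schema the code above defines.
def _demo_find_or_register():
    worker, worker_id = get_idle_worker()
    if worker is None:
        # No idle worker indexed yet; register a fresh one.
        worker = register_worker({'os': 'linux'}, {'python': '3.8'})
    return worker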
import torch as th
import torch.multiprocessing as mp
import gym

from utils import SharedAdam
from model import Worker, Net

env = gym.make('CartPole-v0')
n_action = env.action_space.n
n_state = env.observation_space.shape[0]

# Global network shared across worker processes (A3C-style training).
global_net = Net(n_state, n_action)
global_net.share_memory()

# Separate shared optimizers for the actor (policy) and critic (value) heads.
optA = SharedAdam(global_net.policy.parameters(), lr=1e-4, betas=(0.92, 0.999))
optC = SharedAdam(global_net.v.parameters(), lr=1e-4, betas=(0.92, 0.999))

workers = [Worker(global_net, optA, optC, str(i)) for i in range(8)]
[w.start() for w in workers]
[w.join() for w in workers]
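# SharedAdam comes from utils and is not shown. The usual A3C trick is an Adam
# subclass that allocates its moment buffers eagerly and moves them into
# shared memory, so every worker process updates the same statistics. A sketch
# of that pattern (not necessarily the utils implementation):
class SharedAdamSketch(th.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999),
                 eps=1e-8, weight_decay=0):
        super().__init__(params, lr=lr, betas=betas, eps=eps,
                         weight_decay=weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                # Pre-create Adam's state so it can be shared before any step.
                # (How 'step' is stored varies across torch versions.)
                state['step'] = th.zeros(1)
                state['exp_avg'] = th.zeros_like(p.data)
                state['exp_avg_sq'] = th.zeros_like(p.data)
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()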