def test_with_unlabeled_state(data_filename, replacer, classifier, bow, tfidf):
    """Run a saved classifier over the texts of a CSV file and save the result.

    Args:
        data_filename: CSV file read with ``pd.read_csv``; its ``content``
            column supplies the texts to classify.
        replacer: Object handed to ``preprocess`` together with the texts.
        classifier: Path of the classifier artifact, loaded with ``joblib.load``.
        bow: Path of the artifact passed to ``Worker`` as ``bow``.
        tfidf: Path of the artifact passed to ``Worker`` as ``tfidf``.

    The prediction returned by ``Worker.classifier_predict`` is written to
    ``predict_data.csv`` in the current working directory; nothing is returned.
    """
    frame = pd.read_csv(data_filename)

    # Load the persisted artifacts in the order: bow, tfidf, classifier.
    bow_model = joblib.load(bow)
    tfidf_model = joblib.load(tfidf)
    model = joblib.load(classifier)

    texts = frame['content']
    predictor = Worker(bow=bow_model, tfidf=tfidf_model)
    tokens = preprocess(texts, replacer)

    prediction = predictor.classifier_predict(tokens, model)
    prediction.to_csv('predict_data.csv')
def _load_synonym_replacer(csv_filename):
    """Build a SynonymReplacer from ``csv_filename``.

    Returns the replacer, or ``None`` when the replacer object itself could
    not be constructed. A failure while loading the CSV is logged and the
    (possibly incomplete) replacer is still returned, as before.
    """
    logger.info("Start to load synonym pairs to SynonymReplacer:")
    replacer = None
    try:
        replacer = SynonymReplacer()
        replacer.addSynDict_from_csv(csv_filename)
        logger.info("Successfully load synonym pairs to SynonymReplacer.")
    except Exception as err:
        logger.error("Fail to load synonym pairs to SynonymReplacer. {}".format(err))
    return replacer


def _read_lasttime(time_storage):
    """Return the timestamp stored on the first line of ``time_storage``.

    Falls back to the current local time when the file cannot be read.
    """
    logger.info("Start to load lasttime from time_storage.txt:")
    try:
        with open(time_storage, 'r') as file:
            # Strip the line terminator so it is not sent to the database as
            # part of the timestamp parameter.
            lasttime = file.readline().strip()
        logger.info("Successfully load lasttime from time_storage.txt.")
    except Exception as err:
        lasttime = time.strftime('%Y-%m-%d %X', time.localtime())
        logger.error(
            "Fail to load lasttime from time_storage.txt. "
            "Instead, set current time to be lasttime. {}".format(err)
        )
    return lasttime


def _classify_recent_comments(Recentdata, replacer, classifier,
                              bow_transformer, tfidf_transformer):
    """Fill ``State`` and ``AuditState`` of ``Recentdata`` from the model output.

    Loads the three artifacts with ``joblib.load``, preprocesses the
    ``Content`` column, predicts, and writes the prediction to
    ``predict_data.csv``. Returns ``True`` on success. On the first failing
    stage the error is logged and ``False`` is returned, so later stages never
    run on missing inputs.
    """
    logger.info("Start to load models:")
    try:
        classifier = joblib.load(classifier)
        bow_transformer = joblib.load(bow_transformer)
        tfidf_transformer = joblib.load(tfidf_transformer)
        logger.info("Successfully load models.")
    except Exception as err:
        logger.error("Fail to load models. {}".format(err))
        return False

    logger.info("Start to preprocecss Recentdata:")
    try:
        if replacer is None:
            # Surface the real cause instead of an unrelated NameError.
            raise RuntimeError("SynonymReplacer is not available")
        data_texts = Recentdata['Content']
        word_list = preprocess(data_texts, replacer)
        logger.info("Successfully preprocess Recentdata.")
    except Exception as err:
        logger.error("Fail to preprocess Recentdata. {}".format(err))
        return False

    logger.info("Start to predict Recentdata:")
    try:
        worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
        predict_data = worker.classifier_predict(word_list, classifier)
        predict_data.to_csv('predict_data.csv')
        logger.info("Successfully predict Recentdata")
        trash_scores = predict_data['trash']
        Recentdata['State'] = [state_classify(prob) for prob in trash_scores]
        Recentdata['AuditState'] = [auditstate_classify(prob) for prob in trash_scores]
    except Exception as err:
        logger.error("Fail to predict Recentdata. {}".format(err))
        return False
    return True


def _write_states_to_db(connection, cursor, Recentdata):
    """UPDATE State/AuditState for every row of ``Recentdata`` and commit.

    Returns the number of rows sent to the database, or ``None`` when an
    UPDATE or the commit failed (the error is logged).
    """
    logger.info("Start to update State and AuditState in the database:")
    try:
        count = 0
        rows = zip(Recentdata['CMTID'], Recentdata['State'], Recentdata['AuditState'])
        for CMTID, State, AuditState in rows:
            cursor.execute(
                "UPDATE GC_User_Comment SET State = %s, AuditState = %s WHERE CMTID = %s",
                (int(State), int(AuditState), int(CMTID)),
            )
            count += 1
        connection.commit()
    except Exception as err:
        logger.error("Fail to update GC_User_Comment State. {0}".format(err))
        return None
    return count


def _save_lasttime(time_storage, newtime, count):
    """Persist ``newtime`` to ``time_storage`` and log the processed row count."""
    logger.info("Start to update time in time_storage.txt:")
    try:
        with open(time_storage, 'w') as file:
            file.write(newtime)
        logger.info("Successfully updated time in time_storage.txt")
        logger.info("Successfully updated State and AuditState for {} comment".format(count))
    except Exception as err:
        logger.error("Fail to update time in time_storage.txt. {}".format(err))


def job(csv_filename=csv_filename, bow_transformer=bow_transformer,
        classifier=classifier, tfidf_transformer=tfidf_transformer,
        host=host, passcode=passcode, charset=charset, usr=usr, db=db,
        time_storage=time_storage):
    """Classify comments created since the last run and store the result.

    Steps:
      1. Load the synonym replacer from ``csv_filename``.
      2. Read the previous run timestamp from ``time_storage``.
      3. Select rows of ``GC_User_Comment`` with ``CreatedTime`` in
         ``[lasttime, now)``.
      4. Predict on their ``Content`` and derive ``State`` / ``AuditState``
         via ``state_classify`` / ``auditstate_classify``.
      5. Write both columns back, commit, and store the new timestamp.

    The timestamp is only advanced when every step succeeded, so a failed run
    is retried over the same time window. All errors are logged; none is
    propagated to the caller.

    Args:
        csv_filename: CSV file with synonym pairs.
        bow_transformer, classifier, tfidf_transformer: Paths of the
            artifacts loaded with ``joblib.load``.
        host, usr, passcode, db, charset: Connection settings forwarded to
            ``pymysql.connect``.
        time_storage: Path of the text file holding the last run timestamp.
    """
    replacer = _load_synonym_replacer(csv_filename)
    lasttime = _read_lasttime(time_storage)

    # Connect in its own try block so that only genuine connection failures
    # are reported as such.
    logger.info("Start to connect to the database:")
    try:
        connection = pymysql.connect(host=host, user=usr, password=passcode,
                                     db=db, charset=charset,
                                     cursorclass=pymysql.cursors.DictCursor)
    except Exception as err:
        logger.error("Fail to connect to the database. {}".format(err))
        return
    logger.info("Successfully connect to the database.")

    try:
        with connection.cursor() as cursor:
            newtime = str(time.strftime('%Y-%m-%d %X', time.localtime()))
            flag = cursor.execute(
                "SELECT * FROM GC_User_Comment WHERE CreatedTime >= %s and CreatedTime < %s",
                (lasttime, newtime),
            )
            if not flag:
                logger.info("No new comment was created from the last recorded CreatedTime")
                return

            Recentdata = pd.DataFrame(cursor.fetchall())
            Recentdata = Recentdata[['CMTID', 'Content', 'State', 'AuditState']]
            logger.info("Successfully retrieve {} comments from database.".format(len(Recentdata)))

            # Stop at the first failing stage: never write unclassified rows
            # back and never advance the stored timestamp after a failure.
            if not _classify_recent_comments(Recentdata, replacer, classifier,
                                             bow_transformer, tfidf_transformer):
                return
            count = _write_states_to_db(connection, cursor, Recentdata)
            if count is None:
                return
            _save_lasttime(time_storage, newtime, count)
    except Exception as err:
        logger.error("Fail to process comments from the database. {}".format(err))
    finally:
        # Always release the connection, including on early return or error.
        try:
            connection.close()
        except Exception as err:
            logger.error("Fail to close the database connection. {}".format(err))