import pandas as pd
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Worker, preprocess, and SynonymReplacer are project-local helpers.

def random_forest_generator(data_filename, replacer, train_ratio=0.8):
    # initialize variables
    data = pd.read_csv(data_filename)
    data_labels = data['state'].tolist()
    data_texts = data['content']
    bow_transformer = CountVectorizer()

    # vectorize the training data
    word_list = preprocess(data_texts, replacer)
    vectorized_data = bow_transformer.fit_transform(word_list)
    tfidf_transformer = TfidfTransformer().fit(vectorized_data)
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    vectorized_data_list = tfidf_transformer.transform(vectorized_data).toarray()
    all_data = worker.sliceData(vectorized_data_list, data_labels, train_ratio)

    train_data = all_data['train_data']
    train_labels = all_data['train_labels']
    test_data = all_data['test_data']
    test_labels = all_data['test_labels']
    
    # build model
    forest = RandomForestClassifier(n_estimators=30, n_jobs=-1)
    forest = forest.fit(train_data, train_labels)
    pred = forest.predict(test_data)
    accuracy = worker.calculate_result(test_labels, pred)
 
    # save data and model
    joblib.dump(forest, './var/model/forest_0', compress=3)
    joblib.dump(bow_transformer, './var/model/bow_0', compress=3)
    joblib.dump(tfidf_transformer, './var/model/tfidf_0', compress=3)
    return forest
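
Both generators rely on a project-local preprocess(texts, replacer) helper that is not shown here. A minimal sketch of the interface they assume, where replacer.replace is a hypothetical per-token synonym lookup (the real SynonymReplacer may expose a different method):

def preprocess(texts, replacer):
    # Tokenize each text, apply synonym replacement, and return
    # whitespace-joined strings that CountVectorizer can consume.
    word_list = []
    for text in texts:
        tokens = [replacer.replace(tok) for tok in str(text).split()]
        word_list.append(' '.join(tokens))
    return word_list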

def logistic_regression_generator(data_filename, replacer, train_ratio=0.8):
    
    data = pd.read_csv(data_filename)
    data_labels = data['state'].tolist()
    data_texts = data['content']
    
    bow_transformer = CountVectorizer()
    
    # vectorize the training data
    word_list = preprocess(data_texts, replacer)
    vectorized_data = bow_transformer.fit_transform(word_list)
    tfidf_transformer = TfidfTransformer().fit(vectorized_data)
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    vectorized_data_list = tfidf_transformer.transform(vectorized_data).toarray()

    all_data = worker.sliceData(vectorized_data_list, data_labels, train_ratio)
    
    train_data = all_data['train_data']
    train_labels = all_data['train_labels']
    test_data = all_data['test_data']
    test_labels = all_data['test_labels']
    
    # build model
    lr = LogisticRegression(C=30, penalty='l2', random_state=0)
    lr.fit(train_data, train_labels)
    pred = lr.predict(test_data)
    print("generator prediction:")
    accuracy = worker.calculate_result(test_labels, pred)
    
    
    # save data and model
    joblib.dump(lr, './var/model/lr_0', compress=3)
    joblib.dump(bow_transformer, './var/model/bow_0', compress=3)
    joblib.dump(tfidf_transformer, './var/model/tfidf_0', compress=3)
    return lr
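
A hedged usage example for the two generators above; the CSV paths are assumptions, and the data file must expose the 'state' and 'content' columns both functions read:

replacer = SynonymReplacer()
replacer.addSynDict_from_csv('./var/model/synonyms.csv')  # hypothetical path
forest = random_forest_generator('./var/model/data.csv', replacer)
lr = logistic_regression_generator('./var/model/data.csv', replacer)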

def test_with_unlabeled_state(data_filename, replacer, classifier, bow, tfidf):
    
    data = pd.read_csv(data_filename)
    bow_transformer = joblib.load(bow)
    tfidf_transformer = joblib.load(tfidf)
    classifier = joblib.load(classifier)
    data_texts = data['content']
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    word_list = preprocess(data_texts, replacer)
    predict_data = worker.classifier_predict(word_list, classifier)
    predict_data.to_csv('predict_data.csv')

def test_with_labeled_state(data_filename, replacer, classifier, bow, tfidf):
    data = pd.read_csv(data_filename)

    bow_transformer = joblib.load(bow)
    tfidf_transformer = joblib.load(tfidf)
    classifier = joblib.load(classifier)
    data_labels = data['state'].tolist()
    data_texts = data['content']
    # preprocess data
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    word_list = preprocess(data_texts, replacer)
    text_vectors_list = worker.content_to_vectors(word_list)
    # test model
    pred = classifier.predict(text_vectors_list)
    worker.calculate_result(data_labels, pred)
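
The text-classification examples above all depend on a project-local Worker helper. A minimal sketch of the interface they assume; the method bodies are illustrative, not the original implementation, and the probability column names (e.g. 'trash', used by job() below) are taken here from classifier.classes_:

import pandas as pd
from sklearn.metrics import accuracy_score

class Worker:
    def __init__(self, bow=None, tfidf=None):
        self.bow = bow
        self.tfidf = tfidf

    def sliceData(self, vectors, labels, train_ratio):
        # Split vectorized data and labels into train/test partitions.
        cut = int(len(vectors) * train_ratio)
        return {'train_data': vectors[:cut], 'train_labels': labels[:cut],
                'test_data': vectors[cut:], 'test_labels': labels[cut:]}

    def content_to_vectors(self, word_list):
        # Apply the already-fitted bag-of-words and tf-idf transforms.
        return self.tfidf.transform(self.bow.transform(word_list)).toarray()

    def classifier_predict(self, word_list, classifier):
        # Return per-class probabilities, one column per class label.
        probs = classifier.predict_proba(self.content_to_vectors(word_list))
        return pd.DataFrame(probs, columns=[str(c) for c in classifier.classes_])

    def calculate_result(self, labels, pred):
        # Print and return simple accuracy.
        acc = accuracy_score(labels, pred)
        print('accuracy: {:.4f}'.format(acc))
        return acc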

def get_idle_worker():
    doc = es.search_one_document(w_idx, es.missing_field_query("current_task_id"))
    if doc:
        worker = Worker.from_es_data(doc['_source'])
        worker_id = doc['_id']
        return worker, worker_id

    return None, None
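
get_idle_worker assumes a small es helper module and an index name w_idx. A hedged sketch of the two helpers it calls, built on elasticsearch-py; the helper names and index layout are assumptions:

from elasticsearch import Elasticsearch

client = Elasticsearch()

def missing_field_query(field):
    # Match documents that lack the given field entirely.
    return {'query': {'bool': {'must_not': {'exists': {'field': field}}}}}

def search_one_document(index, body):
    # Return the first hit for the query, or None if nothing matches.
    hits = client.search(index=index, body=body, size=1)['hits']['hits']
    return hits[0] if hits else None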
Example #6
import cgi
import cgitb
import datetime
from sqlalchemy.orm.exc import NoResultFound
# Token, Worker, get_local_db_session, and get_worker_id are project-local.

def main():
    print("Content-type: text/html\n\n")
    cgitb.enable()

    # process input
    form = cgi.FieldStorage()
    xml_feedback = form.getfirst('xmlFeedback', 'None')
    token_number = form.getfirst('token', 'None')

    # open database
    session = get_local_db_session()

    try:
        token = session.query(Token).filter_by(number=token_number).one()
    except NoResultFound:
        print "KO"
        return

    # insert the retrieved dialogueID into the xmlFeedback
    xml_feedback = xml_feedback.replace(
        "<dialogueId></dialogueId>",
        "<dialogueId>" + token.submission.dialogue_id + "</dialogueId>")
    worker_id = get_worker_id(xml_feedback)
    phone = token.submission.dialogue_id.rsplit("-", 1)[1]

    try:
        curr_worker = session.query(Worker).filter_by(
            worker_id=worker_id).one()
    except NoResultFound:
        curr_worker = Worker(worker_id=worker_id, phone_number=phone)
        session.add(curr_worker)

    # save submitted data
    submission = token.submission
    submission.data = xml_feedback
    submission.worker = curr_worker
    submission.timestamp = datetime.datetime.now()

    # free token
    token.submission = None

    session.commit()
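
main() leans on a project-local get_worker_id helper to pull the worker's ID out of the submitted XML. A minimal sketch, assuming the feedback carries a <workerId> element (the tag name is an assumption):

import xml.etree.ElementTree as ET

def get_worker_id(xml_feedback):
    # Extract the worker ID element from the feedback document.
    node = ET.fromstring(xml_feedback).find('.//workerId')
    return node.text if node is not None else None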

def job(csv_filename=csv_filename, bow_transformer=bow_transformer,
        classifier=classifier, tfidf_transformer=tfidf_transformer,
        host=host, passcode=passcode, charset=charset, usr=usr, db=db,
        time_storage=time_storage):
    # load synonym pairs into the SynonymReplacer
    logger.info("Loading synonym pairs into SynonymReplacer:")
    try:
        replacer = SynonymReplacer()
        replacer.addSynDict_from_csv(csv_filename)
        logger.info("Successfully loaded synonym pairs into SynonymReplacer.")
    except Exception as err:
        logger.error("Failed to load synonym pairs into SynonymReplacer. {}".format(err))
   

    error_flag = False

    logger.info("Start to load lasttime from time_storage.txt:")
    try:
        with open(time_storage,'r') as file:
            lasttime = file.readline()
        logger.info("Successfully load lasttime from time_storage.txt.")
    except Exception as err:
        lasttime = time.strftime( '%Y-%m-%d %X', time.localtime())
        logger.error("Fail to load lasttime from time_storage.txt. Instead, set current time to be lasttime. {}".format(err))




    #Connect to the database
    logger.info("Start to connect to the database:")
    try:
        connection = pymysql.connect(host=host,
                                     user=usr,
                                     password=passcode,
                                     db=db,
                                     charset=charset,
                                     cursorclass=pymysql.cursors.DictCursor)
        logger.info("Successfully connected to the database.")
        
        with connection.cursor() as cursor:
            # retrieve comments created since lasttime
            newtime = time.strftime('%Y-%m-%d %X', time.localtime())
            flag = cursor.execute(
                "SELECT * FROM GC_User_Comment WHERE CreatedTime >= %s AND CreatedTime < %s",
                (lasttime, newtime))
            if flag:
                Recentdata = pd.DataFrame(cursor.fetchall())
                Recentdata = Recentdata[['CMTID', 'Content', 'State', 'AuditState']]
                logger.info("Successfully retrieved {} comments from the database.".format(len(Recentdata)))
                            
                logger.info("Start to load models:")
                try:
                    classifier = joblib.load(classifier)
                    bow_transformer = joblib.load(bow_transformer)
                    tfidf_transformer = joblib.load(tfidf_transformer)
                    logger.info("Successfully load models.")
                
                except Exception as err:
                    logger.error("Fail to load models. {}".format(err))
                    error_flag = True
            
                logger.info("Start to preprocecss Recentdata:")
                try:
                    data_texts = Recentdata['Content']
                    word_list = preprocess(data_texts, replacer)
                    logger.info("Successfully preprocess Recentdata.")
                except Exception as err:
                    logger.error("Fail to preprocess Recentdata. {}".format(err))
                    error_flag = True
                                 
                logger.info("Start to predict Recentdata:")
                try:
                    worker = Worker(bow = bow_transformer,tfidf = tfidf_transformer)
                    predict_data = worker.classifier_predict(word_list,classifier)
                    predict_data.to_csv('predict_data.csv')
                    logger.info("Successfully predict Recentdata")
                    Recentdata['State'] = [state_classify(prob) for prob in predict_data['trash']]
                    Recentdata['AuditState'] = [auditstate_classify(prob) for prob in predict_data['trash']]
                except Exception as err:
                    logger.error("Fail to predict Recentdata. {}".format(err))
                    error_flag = True
                
                logger.info("Start to update State and AuditState in the database:")
                try:
                    count = 0
                    for CMTID, Content, State, AuditState in Recentdata.values:
                        cursor.execute("UPDATE GC_User_Comment SET State = %s, AuditState = %s WHERE CMTID = %s",(int(State), int(AuditState), int(CMTID)))
                        count = count + 1
                    connection.commit()
                except Exception as err:
                    logger.error("Fail to update GC_User_Comment State. {0}".format(err))
                    error_flag = True
                
                if not error_flag:
                    # persist newtime so the next run starts where this one stopped
                    logger.info("Updating time in time_storage.txt:")
                    try:
                        with open(time_storage, 'w') as f:
                            f.write(newtime)
                        logger.info("Successfully updated time in time_storage.txt.")
                        logger.info("Successfully updated State and AuditState for {} comments.".format(count))
                    except Exception as err:
                        logger.error("Failed to update time in time_storage.txt. {}".format(err))

            else:
                logger.info("No new comments were created since the last recorded CreatedTime.")
        connection.close()
    except Exception as err:
        logger.error("Failed to connect to the database. {}".format(err))

def register_worker(platform_info, environment):
    return db.register_worker(Worker(platform_info, environment))

def get_worker_by_id(worker_id):
    doc = es.get_document_by_id(w_idx, worker_id)
    if doc:
        return Worker.from_es_data(doc['_source'])

    return None
Example #10
import torch as th
from utils import SharedAdam
from model import Worker, Net
import gym
import torch.multiprocessing as mp

env = gym.make('CartPole-v0')
n_action = env.action_space.n
n_state = env.observation_space.shape[0]

global_net = Net(n_state, n_action)
global_net.share_memory()
optA = SharedAdam(global_net.policy.parameters(), lr=1e-4, betas=(0.92, 0.999))
optC = SharedAdam(global_net.v.parameters(), lr=1e-4, betas=(0.92, 0.999))
workers = [Worker(global_net, optA, optC, str(i)) for i in range(8)]
for w in workers:
    w.start()

for w in workers:
    w.join()
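
SharedAdam, imported from the project's utils, is the usual A3C trick of moving Adam's per-parameter state into shared memory so all forked workers update one global optimizer. A minimal sketch of that idea; the project's own implementation may differ, and recent PyTorch versions may handle the step counter differently:

import torch

class SharedAdam(torch.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
        super().__init__(params, lr=lr, betas=betas, eps=eps)
        # Pre-create Adam's state and place it in shared memory so every
        # worker process reads and writes the same moment estimates.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()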