import os

import gensim

# NOTE: project modules (query_pool, webserver_query_pool, Trainer) and the
# WORD2VEC_* / PIPE_DUMPING constants are assumed to be defined elsewhere in
# this module.


def task_clustering(word2vec=None):
    # load db from remote
    _ids, tasks = query_pool.get_article_logs()

    # word2vec load
    if word2vec is None:
        word2vec = gensim.models.Word2Vec.load_word2vec_format(WORD2VEC_MODEL, binary=True)

    # create and dump pipe
    # percentage is the share of the total task count that a single cluster
    # takes up after clustering
    print 'Start Clustering'
    pipe, labels = Trainer.decompose_and_cluster(tasks, word2vec, PIPE_DUMPING, n_clusters=3)

    # update task db
    query_pool.attach_task_label(_ids=_ids, labels=labels)
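# Trainer.decompose_and_cluster is opaque from this file alone. The sketch
# below is a guess at its shape, NOT the project's implementation: it assumes
# tasks are plain-text strings embedded by averaging word2vec vectors, reduced
# with TruncatedSVD, clustered with KMeans, and the fitted pipeline dumped to
# the given path. All helper names and hyperparameters here are hypothetical.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.externals import joblib  # old-sklearn joblib, matching this codebase's era
from sklearn.pipeline import Pipeline


def _task_to_vector_sketch(task, word2vec):
    # hypothetical helper: mean of the word vectors of in-vocabulary tokens
    vectors = [word2vec[w] for w in task.split() if w in word2vec]
    if not vectors:
        return np.zeros(word2vec.vector_size)
    return np.mean(vectors, axis=0)


def decompose_and_cluster_sketch(tasks, word2vec, dump_path, n_clusters=3):
    X = np.array([_task_to_vector_sketch(t, word2vec) for t in tasks])
    pipe = Pipeline([
        # decompose; assumes the embedding dimension exceeds n_components
        ('svd', TruncatedSVD(n_components=50)),
        # cluster
        ('kmeans', KMeans(n_clusters=n_clusters)),
    ])
    labels = pipe.fit_predict(X)  # KMeans exposes fit_predict through the pipeline
    joblib.dump(pipe, dump_path)  # "create and dump pipe"
    return pipe, labels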
def task_clustering(method, word2vec=None):
    # load db from remote
    _ids, tasks = webserver_query_pool.get_article_logs()

    # word2vec load
    if word2vec is None:
        word2vec = gensim.models.Word2Vec.load_word2vec_format(WORD2VEC_MODEL, binary=True)

    # create and dump pipe
    # percentage is the share of the total task count that a single cluster
    # takes up after clustering
    print 'Start Clustering'
    log_length = len(tasks)
    cluster_number = determine_cluster_numbers(log_length)
    pipe, labels = Trainer.decompose_and_cluster(tasks, word2vec, PIPE_DUMPING,
                                                 method=method, option=cluster_number)

    # update task db
    webserver_query_pool.attach_task_label(_ids=_ids, labels=labels)
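# determine_cluster_numbers is called above but not defined in this section.
# A minimal sketch under an assumed rule of thumb (k ~ sqrt(n / 2), floored at
# 2) is shown below; the project's actual logic may differ entirely.
import math


def determine_cluster_numbers_sketch(log_length):
    # rule-of-thumb cluster count that grows sublinearly with the log size
    return max(2, int(math.sqrt(log_length / 2.0)))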
def train_or_load_word2vec(delete=False):
    # drop the stale line file first so it gets rebuilt below
    if delete and os.path.exists(WORD2VEC_LINE_TEXT):
        os.remove(WORD2VEC_LINE_TEXT)

    if os.path.exists(WORD2VEC_LINE_TEXT):
        print 'line_file_exists'
    else:
        # load articles from db
        print 'Load Article Contents.....'
        contents = query_pool.get_unparsed_content()
        # do text processing and make the word2vec line file
        print 'Proceed text filtering and mk word2vec_line_text'
        law_contents_to_file(contents, WORD2VEC_LINE_TEXT)

    if os.path.exists(WORD2VEC_MODEL):
        if delete:
            os.remove(WORD2VEC_MODEL)
        else:
            return gensim.models.Word2Vec.load_word2vec_format(WORD2VEC_MODEL, binary=True)

    # training!
    return Trainer.train_and_save_vector(WORD2VEC_LINE_TEXT, WORD2VEC_MODEL)
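# Trainer.train_and_save_vector presumably wraps gensim's standard
# train-from-line-file pattern. The sketch below uses only real (old-)gensim
# API (LineSentence, Word2Vec, save_word2vec_format), but the hyperparameters
# and the function body itself are assumptions, not the project's code.
from gensim.models.word2vec import LineSentence


def train_and_save_vector_sketch(line_text_path, model_path):
    sentences = LineSentence(line_text_path)  # one whitespace-tokenized sentence per line
    model = gensim.models.Word2Vec(sentences, size=300, window=5,
                                   min_count=5, workers=4)
    # binary format, readable by the load_word2vec_format calls above
    model.save_word2vec_format(model_path, binary=True)
    return model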