def data_preprocessing(basepath, topicID, sparse_parameters):
    """Prepare docs/dictionaries for sparse representation, one per parameter combo.

    ------------
    parameters:
    ------------
    basepath: the root path of the data set.
    topicID: the topic id, like "0001".
    sparse_parameters: dict of parameter lists, keys used here:
        dictionary_loss: dictionary optimization methods
            ("lars", "omp", "lasso_lars", "lasso_cd")
        dictionary_rep: dictionary original representation
            (1 - tf score addition, 2 - tf * term_weight)
        dictionary_norm: whether the dictionary is normalized (Y/N)
        document_ori_rep: document original representation
            (D1 - tf/idf, D2 - tf/idf * bm25, D3 - tf/idf * subtopic weight,
             D4 - bm25 * tf/idf * subtopic weight)
        document_rep_norm: whether the document representation is normalized (Y/N)
        mine_method: subtopic mining method(s)

    ------------
    return:
    ------------
    runname_list: list of [runname, dictionary_loss] pairs, one per combination.
    data_prepare_dict: maps a loss-independent run key
        (runname with the loss code at positions 2:4 removed) to
        [subtopic_candidates, documents, word2id_weight, dictionary_A1, query].
    """
    data_prepare_dict = {}
    runname_list = []
    for dictionary_norm in sparse_parameters["dictionary_norm"]:
        for dictionary_rep in sparse_parameters["dictionary_rep"]:
            for document_ori_rep in sparse_parameters["document_ori_rep"]:
                for document_rep_norm in sparse_parameters["document_rep_norm"]:
                    for mine_method in sparse_parameters["mine_method"]:
                        # Debug trace of the combination being prepared.
                        # (print() call: works on Python 2 and 3, unlike the
                        # original Python-2-only print statement.)
                        print("%s preparing the data:  %s %s" % (
                            topicID,
                            dictionary_rep + dictionary_norm
                            + document_ori_rep + document_rep_norm,
                            mine_method))
                        # construct_data is expensive, so it runs once per
                        # data combination; all dictionary_loss values below
                        # reuse the same prepared data.
                        subtopic_candidates, documents, word2id_weight, \
                            dictionary_A1, query = construct_data(
                                basepath, topicID, dictionary_norm,
                                dictionary_rep, document_ori_rep,
                                document_rep_norm, mine_method)
                        for dictionary_loss in sparse_parameters["dictionary_loss"]:
                            runname = build_run_name(
                                dictionary_loss, dictionary_norm,
                                dictionary_rep, document_ori_rep,
                                document_rep_norm, mine_method)
                            runname_list.append([runname, dictionary_loss])
                            # Key is loss-independent: chars 2:4 (the loss
                            # code) are dropped so all losses share one entry.
                            data_key = runname[0:2] + runname[4:]
                            # `in` replaces the removed-in-Python-3 has_key().
                            if data_key not in data_prepare_dict:
                                data_prepare_dict[data_key] = [
                                    subtopic_candidates, documents,
                                    word2id_weight, dictionary_A1, query]
    return runname_list, data_prepare_dict
def run_analysis(sparse_parameters, basepath, collection): runname_list = [] for dictionary_loss in sparse_parameters["dictionary_loss"]: for dictionary_norm in sparse_parameters["dictionary_norm"]: for dictionary_rep in sparse_parameters["dictionary_rep"]: for document_ori_rep in sparse_parameters["document_ori_rep"]: for document_rep_norm in sparse_parameters[ "document_rep_norm"]: for mine_method in sparse_parameters["mine_method"]: runname = build_run_name(dictionary_loss, dictionary_norm, dictionary_rep, document_ori_rep, document_rep_norm, mine_method) for para_1 in sparse_parameters["para_1"]: for subtopic_less in sparse_parameters[ "subtopic_less"]: for dict_learning in sparse_parameters[ "dictionary_learning"]: for para_2 in sparse_parameters[ "para_2"]: for para_3 in sparse_parameters[ "para_3"]: if subtopic_less == "Y": runname_list.append( runname + str(int(para_1 * 10)).zfill(2) + "LY" + dict_learning + str(int(para_2 * 10)).zfill(2) + str(int(para_3 * 10)).zfill(2)) else: runname_list.append( runname + str(int(para_1 * 10)).zfill(2) + "LN" + dict_learning + str(int(para_2 * 10)).zfill(2) + str(int(para_3 * 10)).zfill(2)) for runname in runname_list: doc_str = construct_doc_str(runname) DR_result_analysis_sparse(basepath, doc_str, runname, collection)
# NOTE(review): duplicate of the data_preprocessing defined earlier in this
# file; at import time this later definition wins. Consider removing one copy.
def data_preprocessing(basepath, topicID, sparse_parameters):
    """Prepare docs/dictionaries for sparse representation, one per parameter combo.

    ------------
    parameters:
    ------------
    basepath: the root path of the data set.
    topicID: the topic id, like "0001".
    sparse_parameters: dict of parameter lists, keys used here:
        dictionary_loss: dictionary optimization methods
            ("lars", "omp", "lasso_lars", "lasso_cd")
        dictionary_rep: dictionary original representation
            (1 - tf score addition, 2 - tf * term_weight)
        dictionary_norm: whether the dictionary is normalized (Y/N)
        document_ori_rep: document original representation
            (D1 - tf/idf, D2 - tf/idf * bm25, D3 - tf/idf * subtopic weight,
             D4 - bm25 * tf/idf * subtopic weight)
        document_rep_norm: whether the document representation is normalized (Y/N)
        mine_method: subtopic mining method(s)

    ------------
    return:
    ------------
    runname_list: list of [runname, dictionary_loss] pairs, one per combination.
    data_prepare_dict: maps a loss-independent run key
        (runname with the loss code at positions 2:4 removed) to
        [subtopic_candidates, documents, word2id_weight, dictionary_A1, query].
    """
    data_prepare_dict = {}
    runname_list = []
    for dictionary_norm in sparse_parameters["dictionary_norm"]:
        for dictionary_rep in sparse_parameters["dictionary_rep"]:
            for document_ori_rep in sparse_parameters["document_ori_rep"]:
                for document_rep_norm in sparse_parameters["document_rep_norm"]:
                    for mine_method in sparse_parameters["mine_method"]:
                        # Debug trace; print() works on Python 2 and 3,
                        # unlike the original Python-2-only print statement.
                        print("%s preparing the data:  %s %s" % (
                            topicID,
                            dictionary_rep + dictionary_norm
                            + document_ori_rep + document_rep_norm,
                            mine_method))
                        # Expensive data construction runs once per data
                        # combination; all losses below reuse it.
                        subtopic_candidates, documents, word2id_weight, \
                            dictionary_A1, query = construct_data(
                                basepath, topicID, dictionary_norm,
                                dictionary_rep, document_ori_rep,
                                document_rep_norm, mine_method)
                        for dictionary_loss in sparse_parameters["dictionary_loss"]:
                            runname = build_run_name(
                                dictionary_loss, dictionary_norm,
                                dictionary_rep, document_ori_rep,
                                document_rep_norm, mine_method)
                            runname_list.append([runname, dictionary_loss])
                            # Loss-independent key: drop the loss code at 2:4.
                            data_key = runname[0:2] + runname[4:]
                            # `in` replaces the removed-in-Python-3 has_key().
                            if data_key not in data_prepare_dict:
                                data_prepare_dict[data_key] = [
                                    subtopic_candidates, documents,
                                    word2id_weight, dictionary_A1, query]
    return runname_list, data_prepare_dict
def run_analysis(sparse_parameters,basepath,collection): runname_list = [] for dictionary_loss in sparse_parameters["dictionary_loss"]: for dictionary_norm in sparse_parameters["dictionary_norm"]: for dictionary_rep in sparse_parameters["dictionary_rep"]: for document_ori_rep in sparse_parameters["document_ori_rep"]: for document_rep_norm in sparse_parameters["document_rep_norm"]: for mine_method in sparse_parameters["mine_method"]: runname = build_run_name(dictionary_loss,dictionary_norm,dictionary_rep,document_ori_rep,document_rep_norm,mine_method) for para_1 in sparse_parameters["para_1"]: for subtopic_less in sparse_parameters["subtopic_less"]: for dict_learning in sparse_parameters["dictionary_learning"]: for para_2 in sparse_parameters["para_2"]: for para_3 in sparse_parameters["para_3"]: if subtopic_less=="Y": runname_list.append(runname+str(int(para_1*10)).zfill(2)+"LY"+dict_learning+str(int(para_2*10)).zfill(2)+str(int(para_3*10)).zfill(2)) else: runname_list.append(runname+str(int(para_1*10)).zfill(2)+"LN"+dict_learning+str(int(para_2*10)).zfill(2)+str(int(para_3*10)).zfill(2)) for runname in runname_list: doc_str = construct_doc_str(runname) DR_result_analysis_sparse(basepath,doc_str,runname,collection)