Example #1
def data_preprocessing(basepath, topicID, sparse_parameters):
    """
    data preprocessing for docs sparse representation.
    ------------
    parameters:
    ------------
    base path : the root path of the data set.
    topicID: the data_preparing id like 0001
    sparse_parameters:
        dictionary_loss:dictionary Optimization Methods("lars","omp","lasso_lars","lasso_cd")
        dictionary_rep: the dictionary original representation(1-tf score addition,2-tf*term_weight )
        dictionary_norm: the dictionary original is normalization or not(Y-yes.N-not) 
        document_ori_rep:the document original representation(D1-tf/idf,D2=tf/idf * bm25,D3-tf/idf * weight of subtopic,D4- bm25 * tf/idf * weight of subtopic)
        document_rep_norm: the document original representation is normalization or not(Y-yes,N-not)
    runname: a file store the middle result for debug.    
    
    ------------
    return:
    ------------
    dictionary_A1: unoptimized dictionary for sparse representation
    documents: a list of docs class
    subtopic_candidates: a list of subtopic class
    word2id_weight: dictionary structure for store the term id and frequency informations
    """

    data_prepare_dict = {}
    runname_list = []
    expansion_methods = sparse_parameters["dictionary_rep"]

    for dictionary_norm in sparse_parameters["dictionary_norm"]:
        for dictionary_rep in sparse_parameters["dictionary_rep"]:

            for document_ori_rep in sparse_parameters["document_ori_rep"]:
                for document_rep_norm in sparse_parameters[
                        "document_rep_norm"]:

                    for mine_method in sparse_parameters["mine_method"]:
                        print(topicID, "preparing the data: ",
                              dictionary_rep + dictionary_norm +
                              document_ori_rep + document_rep_norm, mine_method)
                        subtopic_candidates, documents, word2id_weight, dictionary_A1, query = construct_data(
                            basepath, topicID, dictionary_norm, dictionary_rep,
                            document_ori_rep, document_rep_norm, mine_method)
                        for dictionary_loss in sparse_parameters[
                                "dictionary_loss"]:
                            runname = build_run_name(dictionary_loss,
                                                     dictionary_norm,
                                                     dictionary_rep,
                                                     document_ori_rep,
                                                     document_rep_norm,
                                                     mine_method)
                            runname_list.append([runname, dictionary_loss])
                            # The prepared data does not depend on
                            # dictionary_loss, so store it once under the
                            # stripped run-name key.
                            data_key = runname[0:2] + runname[4:]
                            if data_key not in data_prepare_dict:
                                data_prepare_dict[data_key] = [
                                    subtopic_candidates, documents,
                                    word2id_weight, dictionary_A1, query
                                ]
    return runname_list, data_prepare_dict
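
A minimal usage sketch, assuming the sparse_parameters keys read by the loops above; the basepath, the list values, and the "M1" mining-method code are illustrative placeholders, not values taken from the project:

sparse_parameters = {
    "dictionary_loss": ["lars", "omp"],   # dictionary optimization methods
    "dictionary_rep": ["1", "2"],         # 1 - tf score addition, 2 - tf * term_weight
    "dictionary_norm": ["Y", "N"],        # normalize the dictionary or not
    "document_ori_rep": ["D1", "D2"],     # document representation variants
    "document_rep_norm": ["Y", "N"],      # normalize the document representation or not
    "mine_method": ["M1"],                # placeholder mining-method code
}

runname_list, data_prepare_dict = data_preprocessing(
    "/path/to/dataset", "0001", sparse_parameters)
for runname, dictionary_loss in runname_list:
    # The prepared data for this run is stored under the stripped key
    # runname[0:2] + runname[4:], shared across dictionary_loss values.
    subtopic_candidates, documents, word2id_weight, dictionary_A1, query = \
        data_prepare_dict[runname[0:2] + runname[4:]]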
Example #2
def run_analysis(sparse_parameters, basepath, collection):
    # Enumerate every run configuration, build its run name, and then run
    # the result analysis for each generated run.
    runname_list = []
    for dictionary_loss in sparse_parameters["dictionary_loss"]:
        for dictionary_norm in sparse_parameters["dictionary_norm"]:
            for dictionary_rep in sparse_parameters["dictionary_rep"]:
                for document_ori_rep in sparse_parameters["document_ori_rep"]:
                    for document_rep_norm in sparse_parameters[
                            "document_rep_norm"]:
                        for mine_method in sparse_parameters["mine_method"]:
                            runname = build_run_name(dictionary_loss,
                                                     dictionary_norm,
                                                     dictionary_rep,
                                                     document_ori_rep,
                                                     document_rep_norm,
                                                     mine_method)
                            for para_1 in sparse_parameters["para_1"]:
                                for subtopic_less in sparse_parameters[
                                        "subtopic_less"]:
                                    for dict_learning in sparse_parameters[
                                            "dictionary_learning"]:
                                        for para_2 in sparse_parameters[
                                                "para_2"]:
                                            for para_3 in sparse_parameters[
                                                    "para_3"]:
                                                # The two-digit suffixes encode
                                                # each numeric parameter times 10
                                                # (e.g. 0.5 -> "05"); "LY"/"LN"
                                                # records the subtopic_less flag.
                                                less_flag = "LY" if subtopic_less == "Y" else "LN"
                                                runname_list.append(
                                                    runname +
                                                    str(int(para_1 * 10)).zfill(2) +
                                                    less_flag + dict_learning +
                                                    str(int(para_2 * 10)).zfill(2) +
                                                    str(int(para_3 * 10)).zfill(2))
    for runname in runname_list:
        doc_str = construct_doc_str(runname)
        DR_result_analysis_sparse(basepath, doc_str, runname, collection)
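
The nested loops above enumerate the Cartesian product of the parameter lists and append an encoded suffix to every run name. An equivalent sketch of the same enumeration using itertools.product, assuming the key names above and the project's build_run_name, construct_doc_str, and DR_result_analysis_sparse helpers:

from itertools import product

def run_analysis_flat(sparse_parameters, basepath, collection):
    # Same enumeration order as run_analysis: dictionary_loss is the
    # outermost factor and para_3 the innermost.
    keys = ["dictionary_loss", "dictionary_norm", "dictionary_rep",
            "document_ori_rep", "document_rep_norm", "mine_method",
            "para_1", "subtopic_less", "dictionary_learning",
            "para_2", "para_3"]
    for (loss, norm, rep, ori_rep, rep_norm, mine,
         para_1, subtopic_less, dict_learning, para_2, para_3) in product(
            *(sparse_parameters[k] for k in keys)):
        runname = (build_run_name(loss, norm, rep, ori_rep, rep_norm, mine) +
                   str(int(para_1 * 10)).zfill(2) +
                   ("LY" if subtopic_less == "Y" else "LN") + dict_learning +
                   str(int(para_2 * 10)).zfill(2) +
                   str(int(para_3 * 10)).zfill(2))
        doc_str = construct_doc_str(runname)
        DR_result_analysis_sparse(basepath, doc_str, runname, collection)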