def __cpp_sim_matrix_l_generation_routine__(sim_matrix_path, mscmodel, msc2ix, secondary_codes_weight, docid2seccodes):
    """Build (or reuse a cached copy of) the MSC-level aggregated similarity
    matrix and return it loaded into memory.

    The aggregated matrix is cached next to the document-level matrix under a
    name derived from the module-level MIN_COUNT_MSCPRIM and
    similarity_aggregation_method_l settings; the C++ aggregation step runs
    only when that cache file does not exist yet.

    Returns the loaded MSC-level similarity matrix; its rows are validated
    against msc2ix before returning.
    """
    # Cache file name encodes the settings that shaped the aggregation.
    aggregated_path = sim_matrix_path + ".msc" + str(MIN_COUNT_MSCPRIM) + "_" + similarity_aggregation_method_l
    if not aux.exists(aggregated_path):
        # Map each MSC code onto the document ids it covers, then let the C++
        # module collapse the document-level matrix to MSC level.
        wids_per_msc = get_msc2wids_list(msc2ix, mscmodel, secondary_codes_weight, docid2seccodes)
        cpp_wrapper.aggregate_simmatrix(sim_matrix_path, aggregated_path, wids_per_msc, method=similarity_aggregation_method_l)
    logging.info("[build_msc_tree] Loading simmatrix from: " + str(aggregated_path))
    rows, cols, aggregated_matrix = matrix_io.fread_smatrix(aggregated_path)
    # Sanity-check that the C++ output row labels line up with msc2ix.
    ____validate_cpp_output____(msc2ix, rows)
    return aggregated_matrix
def __cpp_sim_matrix_l_generation_routine__(sim_matrix_path, mscmodel, msc2ix, secondary_codes_weight, docid2seccodes):
    """Return the MSC-level similarity matrix, aggregating it on demand.

    A cached aggregated matrix is looked up first; when missing, the
    document-level matrix at sim_matrix_path is aggregated per MSC code by the
    external C++ module. The result is read back, validated against msc2ix,
    and returned.
    """
    # Destination path: document-level path tagged with the active settings.
    dst_path = "".join([sim_matrix_path, ".msc", str(MIN_COUNT_MSCPRIM), "_", similarity_aggregation_method_l])
    already_built = aux.exists(dst_path)
    if not already_built:
        # Per-MSC document-id lists drive the C++ aggregation step.
        per_code_wids = get_msc2wids_list(msc2ix, mscmodel, secondary_codes_weight, docid2seccodes)
        cpp_wrapper.aggregate_simmatrix(sim_matrix_path, dst_path, per_code_wids, method=similarity_aggregation_method_l)
    logging.info("[build_msc_tree] Loading simmatrix from: " + str(dst_path))
    loaded = matrix_io.fread_smatrix(dst_path)
    row_labels = loaded[0]
    # Confirm the rows written by the C++ module match the expected MSC index.
    ____validate_cpp_output____(msc2ix, row_labels)
    return loaded[2]
# NOTE(review): this run of statements uses many free names
# (MIN_COUNT_MSCPRIM, clustering_method, aux, cpp_wrapper, zbl_path, ...),
# so it is presumably the body of a larger build_msc_tree routine whose
# header lies outside this view; the indentation below is reconstructed
# from the collapsed source — TODO confirm against the original file.

# Dump the effective configuration so a run can be reproduced from its log.
print "[build_msc_tree] MIN_COUNT_MSCPRIM =", MIN_COUNT_MSCPRIM
print "[build_msc_tree] MIN_COUNT_MSCSEC =", MIN_COUNT_MSCSEC
print "[build_msc_tree] clustering_method =", clustering_method
print "[build_msc_tree] similarity_aggregation_method_l =", similarity_aggregation_method_l
print "[build_msc_tree] secondary_weight_method =", secondary_weight_method
print "[build_msc_tree] secondary_codes_weight =", secondary_codes_weight
print "[build_msc_tree] similarity_aggregation_method_m =", similarity_aggregation_method_m
print "[build_msc_tree] similarity_aggregator_m =", similarity_aggregator_m
print "[build_msc_tree] numiterations =", numiterations
# Cluster ranges can be huge; only the first 100 characters are printed.
print "[build_msc_tree] l_clusters_range =", str(l_clusters_range)[:100], "..."
print "[build_msc_tree] m_clusters_range =", str(m_clusters_range)[:100], "..."
print "[build_msc_tree] *************************************"
# Build the document-level similarity matrix only when it is missing or empty.
if not aux.exists(sim_matrix_path) or os.stat(sim_matrix_path).st_size <= 0:
    print "[build_msc_tree] ============================================================================================================"
    print "[build_msc_tree] Building similarity matrix: sim_matrix_path = ", sim_matrix_path
    #print "[zbl_build_msc_tree] cpp:",zbl_path, sim_matrix_path, field_name, similarity_calculator
    # External C++ binary computes pairwise document similarities.
    result = cpp_wrapper.run_exec("../cpp_modules/main/zbl_similarity_matrix", args=[field_name, similarity_calculator, zbl_path, sim_matrix_path])
    if result != 0:
        # Non-zero exit status from the C++ module is fatal for the pipeline.
        print "[build_msc_tree][ERROR] Failure while calculating documents' similarity matrix!"
        sys.exit(-10)
    print "[build_msc_tree] ================================================================"
print "[build_msc_tree] ============================================================================================================"
print "[build_msc_tree] Loading ZBL records from zbl_path=", zbl_path
# NOTE(review): this span appears to be a second pasted copy of the
# configuration-dump / matrix-build sequence above (same free names, same
# strings); indentation is reconstructed from the collapsed source — TODO
# confirm against the original file. Like the copy above, it presumably
# belongs to a larger build_msc_tree routine whose header is not visible.

# Dump the effective configuration so a run can be reproduced from its log.
print "[build_msc_tree] sim_matrix_path=", sim_matrix_path
print "[build_msc_tree] MIN_COUNT_MSC =", MIN_COUNT_MSC
print "[build_msc_tree] MIN_COUNT_MSCPRIM =", MIN_COUNT_MSCPRIM
print "[build_msc_tree] MIN_COUNT_MSCSEC =", MIN_COUNT_MSCSEC
print "[build_msc_tree] clustering_method =", clustering_method
print "[build_msc_tree] similarity_aggregation_method_l =", similarity_aggregation_method_l
print "[build_msc_tree] secondary_weight_method =", secondary_weight_method
print "[build_msc_tree] secondary_codes_weight =", secondary_codes_weight
print "[build_msc_tree] similarity_aggregation_method_m =", similarity_aggregation_method_m
print "[build_msc_tree] similarity_aggregator_m =", similarity_aggregator_m
print "[build_msc_tree] numiterations =", numiterations
# Cluster ranges can be huge; only the first 100 characters are printed.
print "[build_msc_tree] l_clusters_range =", str(l_clusters_range)[:100], "..."
print "[build_msc_tree] m_clusters_range =", str(m_clusters_range)[:100], "..."
print "[build_msc_tree] *************************************"
# Build the document-level similarity matrix only when it is missing or empty.
if not aux.exists(sim_matrix_path) or os.stat(sim_matrix_path).st_size <= 0:
    print "[build_msc_tree] ============================================================================================================"
    print "[build_msc_tree] Building similarity matrix: sim_matrix_path = ", sim_matrix_path
    #print "[zbl_build_msc_tree] cpp:",zbl_path, sim_matrix_path, field_name, similarity_calculator
    # External C++ binary computes pairwise document similarities.
    result = cpp_wrapper.run_exec("../cpp_modules/main/zbl_similarity_matrix", args = [field_name, similarity_calculator, zbl_path, sim_matrix_path])
    if result != 0:
        # Non-zero exit status from the C++ module is fatal for the pipeline.
        print "[build_msc_tree][ERROR] Failure while calculating documents' similarity matrix!"
        sys.exit(-10)
    print "[build_msc_tree] ================================================================"
print "[build_msc_tree] ============================================================================================================"
print "[build_msc_tree] Loading ZBL records from zbl_path=", zbl_path
# Index every ZBL record by its id field for fast lookup later on.
zblid2zbl = dict( (zbl[zbl_io.ZBL_ID_FIELD], zbl) for zbl in _get_zbl_generator_(zbl_path) )
# Preview only (truncated to 100 chars); the dict itself may be very large.
print "[build_msc_tree] zblid2zbl [", len(zblid2zbl), "docs loaded] =", str(list(zblid2zbl.iteritems()))[:100]