def extract_profile():
    """Merge all profile lists found under the 'United States' resource folder.

    Iterates over whatever ``ProfileHelper.generate_excel_list`` returns for
    the folder, appends the profiles produced by
    ``ProfileHelper.generate_profile_list`` for each entry, and prints the
    running total after every entry.  The combined list is then written
    twice: through ``StoreHelper.store_data`` to ``profile.dat`` and through
    ``StoreHelper.save_file`` to ``profile.txt``, both inside that folder.
    """
    home = '../resource/United States'
    merged = []
    for workbook in ProfileHelper.generate_excel_list(home):
        merged += ProfileHelper.generate_profile_list(workbook)
        running_total = len(merged)
        print("After merged file(%s) total profile list number is %d" %
              (workbook, running_total))
    # Same payload, two outputs: the .dat via store_data, the .txt via save_file.
    StoreHelper.store_data(merged, home + '/profile.dat')
    StoreHelper.save_file(merged, home + '/profile.txt')
def split_dict():
    """Split the stored phase dictionary by whether a key contains '_'.

    Loads ``phase_dict.dat`` (falling back to an empty dict), routes every
    entry whose key contains an underscore into the "double" bucket and all
    others into the "single" bucket, then saves the result of
    ``DictHelper.get_sorted_list`` for each bucket to
    ``phase_dict_single.txt`` and ``phase_dict_double.txt``.
    """
    source = StoreHelper.load_data("phase_dict.dat", {})
    singles = {}
    doubles = {}
    for phrase, payload in source.items():
        # An underscore in the key is the only criterion used for the split.
        bucket = doubles if '_' in phrase else singles
        bucket[phrase] = payload
    StoreHelper.save_file(DictHelper.get_sorted_list(singles),
                          'phase_dict_single.txt')
    StoreHelper.save_file(DictHelper.get_sorted_list(doubles),
                          'phase_dict_double.txt')
def merge_dict():
    """Flatten every converted profile into a single per-profile dictionary.

    Loads ``../resource/convert_profile.dat`` (default: empty list).  For each
    profile, every item found while iterating each feature's value is passed
    to ``DictHelper.increase_dic_key`` together with that profile's merged
    dict (presumably a counter increment -- confirm against DictHelper).
    The list of merged dicts is written to ``merged_profile.dat`` and
    ``merged_profile.txt`` under ``../resource``.
    """
    profiles = StoreHelper.load_data('../resource/convert_profile.dat', [])
    flattened = []
    for profile in profiles:
        combined = {}
        # Walk feature by feature, feeding each contained item to the helper.
        feature_items = (item
                         for feature in profile
                         for item in profile[feature])
        for item in feature_items:
            DictHelper.increase_dic_key(combined, item)
        flattened.append(combined)
    StoreHelper.store_data(flattened, '../resource/merged_profile.dat')
    StoreHelper.save_file(flattened, '../resource/merged_profile.txt')
def run_lemmatize(src_folder, dst_folder, file_count=8535):
    """Normalize numbered ``.dat`` files line by line into another folder.

    For every index in ``range(file_count)`` the file ``"%04d.dat" % index``
    is looked up in ``src_folder``.  Existing files are read, each line is
    passed through ``SegmentHelper.normalize``, and the results are joined
    with ``os.linesep`` and saved under the same name in ``dst_folder``.
    Missing files are reported on stdout and skipped.

    Args:
        src_folder: Folder containing the numbered input files.
        dst_folder: Folder receiving the normalized files.
        file_count: Number of consecutive indices (starting at 0) to try.
            Defaults to 8535, the value that used to be hard-coded, so
            existing two-argument calls behave exactly as before.
    """
    for index in range(file_count):
        # Input and output share the same zero-padded name; format it once.
        file_name = "%04d.dat" % index
        input_file = path.join(src_folder, file_name)
        if not StoreHelper.is_file_exist(input_file):
            print("%s not exist!" % input_file)
            continue
        output_file = path.join(dst_folder, file_name)
        file_content = StoreHelper.read_file(input_file)
        new_content = [
            SegmentHelper.normalize(line)
            for line in file_content.splitlines()
        ]
        # NOTE(review): if StoreHelper.save_file writes in text mode,
        # os.linesep may be translated a second time on Windows -- confirm.
        StoreHelper.save_file(os.linesep.join(new_content), output_file)
def convert_profile2(debug=False):
    """Convert stored raw profiles into feature dictionaries and persist them.

    Loads the education, discipline and skills dictionaries, the raw profile
    list (``../resource/United States/profile.dat``) and the university name
    conversion dictionary, then builds one dict per profile with the keys
    ``skills``, ``work_change_times``, ``years``, ``university``,
    ``education``, ``company`` and ``major``.  The resulting list is written
    to ``../resource/convert_profile.dat`` and ``convert_profile.txt``.

    Args:
        debug: When true, print a ``current/total`` progress line for every
            profile processed.
    """
    education_phrase_dic = StoreHelper.load_data('../resource/education.dat')
    discipline_phrase_dic = StoreHelper.load_data('../resource/discipline.dat')
    skills_dic = StoreHelper.load_data('../resource/skills.dat')
    profile_vectors = StoreHelper.load_data(
        '../resource/United States/profile.dat', [])
    university_name_convert_dict = StoreHelper.load_data(
        '../university_name_convert.dic', {})

    vector_list = []
    total = len(profile_vectors)
    for count, _profile in enumerate(profile_vectors, 1):
        if debug:
            print("Profile convert progress: %d/%d" % (count, total))
        educations, majors = ProfileHelper.get_highest_education(
            _profile, education_phrase_dic, discipline_phrase_dic)
        skills = ProfileHelper.get_skills(_profile, skills_dic)
        # calculate_years used to be invoked twice per profile just to read
        # element 0 and element 1; call it once and index the single result.
        year_info = ProfileHelper.calculate_years(_profile)
        profile_dict = {
            'skills': skills,
            'work_change_times': year_info[0],
            'years': year_info[1],
            'university': ProfileHelper.convert_university(
                _profile, university_name_convert_dict),
            'education': educations,
            'company': [
                SegmentHelper.normalize(company)
                for company in _profile['company']
            ],
            'major': majors,
        }
        vector_list.append(profile_dict)

    StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
    StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')
def convert_profile():
    """Build a reduced feature dict for every stored profile and persist it.

    Each profile loaded from ``../resource/United States/profile.dat`` is
    turned into a dict with the keys ``skills``, ``years``, ``education`` and
    ``major`` using the ``ProfileHelper`` lookups.  The list of dicts is
    written to ``../resource/convert_profile.dat`` and
    ``../resource/convert_profile.txt``.
    """
    education_lookup = StoreHelper.load_data('../resource/education.dat')
    discipline_lookup = StoreHelper.load_data('../resource/discipline.dat')
    skill_lookup = StoreHelper.load_data('../resource/skills.dat')
    raw_profiles = StoreHelper.load_data(
        '../resource/United States/profile.dat', [])

    def to_vector(raw):
        # Education and major come from one helper call that returns a pair.
        degrees, fields = ProfileHelper.get_highest_education(
            raw, education_lookup, discipline_lookup)
        return {
            'skills': ProfileHelper.get_skills(raw, skill_lookup),
            'years': ProfileHelper.get_years(raw),
            'education': degrees,
            'major': fields,
        }

    converted = [to_vector(raw) for raw in raw_profiles]
    StoreHelper.store_data(converted, '../resource/convert_profile.dat')
    StoreHelper.save_file(converted, '../resource/convert_profile.txt')
def print_label(label, index_list, cluster_number=None):
    """Print a summary of cluster labels, in one of two modes.

    With ``cluster_number`` given: for each cluster id in
    ``range(cluster_number)`` print how many positions of ``label`` carry
    that id, followed by those positions (as strings).  ``index_list`` is
    not used in this mode, and labels outside the range are not reported.

    With ``cluster_number`` left as ``None``: print the labels tab-separated,
    the dict returned by ``DictHelper.dict_from_count_list`` and its largest
    and smallest key, then group ``int(index_list[i])`` under ``label[i]``
    via ``DictHelper.append_dic_key``, print each group, and write the
    grouping to ``position_tag.dat`` and ``position_tag.txt``.

    Args:
        label: Sequence of cluster labels, one per position.
        index_list: Sequence parallel to ``label``; entries are passed
            through ``int``.
        cluster_number: Number of clusters to report, or ``None`` for the
            grouping mode.
    """
    if cluster_number is not None:
        size = len(label)
        clusters = []
        for cluster_id in range(cluster_number):
            members = [str(pos) for pos in range(size)
                       if label[pos] == cluster_id]
            clusters.append(members)
        for cluster_id, members in enumerate(clusters):
            print("Cluster %i has %i position, position: %s" %
                  (cluster_id, len(members), str(members)))
        return

    label_dict = DictHelper.dict_from_count_list(label)
    print("\t".join(map(str, label)))
    print(label_dict)
    print("max cluster number: %i" % max(label_dict))
    print("min cluster number: %i" % min(label_dict))

    position_tag = {}
    for pos in range(len(label)):
        DictHelper.append_dic_key(position_tag, label[pos],
                                  int(index_list[pos]))
    for tag, positions in position_tag.items():
        print("%s: %s" % (tag, positions))
    StoreHelper.store_data(position_tag, 'position_tag.dat')
    StoreHelper.save_file(position_tag, 'position_tag.txt')