Esempio n. 1
0
 def extract_profile():
     """Merge the profile lists of every file found for the US resource folder.

     Walks the files yielded by ``ProfileHelper.generate_excel_list`` for
     ``../resource/United States``, appends each file's
     ``ProfileHelper.generate_profile_list`` result to one combined list and
     prints the running total after every file. The combined list is then
     passed to ``StoreHelper.store_data`` (``profile.dat``) and
     ``StoreHelper.save_file`` (``profile.txt``) inside the same folder.
     """
     resource_dir = '../resource/United States'
     all_profiles = []
     for workbook in ProfileHelper.generate_excel_list(resource_dir):
         all_profiles += ProfileHelper.generate_profile_list(workbook)
         merged_total = len(all_profiles)
         print("After merged file(%s) total profile list number is %d" %
               (workbook, merged_total))
     binary_target = resource_dir + '/profile.dat'
     text_target = resource_dir + '/profile.txt'
     StoreHelper.store_data(all_profiles, binary_target)
     StoreHelper.save_file(all_profiles, text_target)
Esempio n. 2
0
 def split_dict():
     """Split the stored phase dictionary by whether a key contains ``'_'``.

     Loads ``phase_dict.dat`` through ``StoreHelper.load_data`` (falling back
     to the ``{}`` default passed to it), separates entries whose key contains
     an underscore from those whose key does not, and saves the
     ``DictHelper.get_sorted_list`` output of each group: keys without an
     underscore go to ``phase_dict_single.txt``, keys with one go to
     ``phase_dict_double.txt``.
     """
     phases = StoreHelper.load_data("phase_dict.dat", {})
     without_underscore = {
         phrase: count for phrase, count in phases.items() if '_' not in phrase
     }
     with_underscore = {
         phrase: count for phrase, count in phases.items() if '_' in phrase
     }
     single_sorted = DictHelper.get_sorted_list(without_underscore)
     StoreHelper.save_file(single_sorted, 'phase_dict_single.txt')
     double_sorted = DictHelper.get_sorted_list(with_underscore)
     StoreHelper.save_file(double_sorted, 'phase_dict_double.txt')
Esempio n. 3
0
 def merge_dict():
     """Flatten every converted profile into a single per-profile counter dict.

     Loads the list stored in ``../resource/convert_profile.dat`` and, for each
     profile dict in it, calls ``DictHelper.increase_dic_key`` once for every
     item found while iterating each of the profile's feature values. One
     merged dict is produced per profile; the resulting list is written to
     ``../resource/merged_profile.dat`` and ``../resource/merged_profile.txt``.
     """
     converted_profiles = StoreHelper.load_data(
         '../resource/convert_profile.dat', [])
     merged_profiles = []
     for converted in converted_profiles:
         counters = {}
         # Feature names themselves are not needed, only the items they hold.
         for feature_items in converted.values():
             for item in feature_items:
                 DictHelper.increase_dic_key(counters, item)
         merged_profiles.append(counters)
     StoreHelper.store_data(merged_profiles,
                            '../resource/merged_profile.dat')
     StoreHelper.save_file(merged_profiles, '../resource/merged_profile.txt')
 def run_lemmatize(src_folder, dst_folder, file_count=8535):
     """Normalize every line of the numbered ``.dat`` files of a folder.

     For each index ``i`` in ``range(file_count)`` the file named
     ``"%04d.dat" % i`` is looked up in ``src_folder``. When it exists, its
     content is read, split into lines, each line is passed through
     ``SegmentHelper.normalize`` and the result is saved under the same file
     name in ``dst_folder``. Missing input files are reported with ``print``
     and skipped.

     Args:
         src_folder: Folder containing the numbered input files.
         dst_folder: Folder receiving the normalized files.
         file_count: Number of consecutive indices to probe, starting at 0.
             Defaults to 8535, the value that used to be hard-coded, so
             existing two-argument callers behave exactly as before.
     """
     for index in range(file_count):
         file_name = "%04d.dat" % index
         input_file = path.join(src_folder, file_name)
         if not StoreHelper.is_file_exist(input_file):
             print("%s not exist!" % input_file)
             continue
         file_content = StoreHelper.read_file(input_file)
         new_content = [
             SegmentHelper.normalize(line)
             for line in file_content.splitlines()
         ]
         # NOTE(review): os.linesep is kept from the original. If
         # StoreHelper.save_file writes in text mode this may double the
         # carriage return on Windows - confirm against save_file.
         StoreHelper.save_file(os.linesep.join(new_content),
                               path.join(dst_folder, file_name))
Esempio n. 5
0
    def convert_profile2(debug=False):
        """Convert every stored US profile into a feature dict and persist them.

        Loads the education, discipline and skills data, the profile list from
        ``../resource/United States/profile.dat`` and the university name
        conversion dict, then builds one dict per profile with the keys
        ``skills``, ``work_change_times``, ``years``, ``university``,
        ``education``, ``company`` and ``major``. The resulting list is written
        to ``../resource/convert_profile.dat`` and
        ``../resource/convert_profile.txt``.

        Args:
            debug: When true, print a ``count/total`` progress line for every
                profile processed.
        """
        education_phrase_dic = StoreHelper.load_data(
            '../resource/education.dat')
        discipline_phrase_dic = StoreHelper.load_data(
            '../resource/discipline.dat')
        skills_dic = StoreHelper.load_data('../resource/skills.dat')
        profile_vectors = StoreHelper.load_data(
            '../resource/United States/profile.dat', [])
        university_name_convert_dict = StoreHelper.load_data(
            '../university_name_convert.dic', {})
        vector_list = []

        total = len(profile_vectors)
        for count, _profile in enumerate(profile_vectors, start=1):
            if debug:
                print("Profile convert progress: %d/%d" % (count, total))
            educations, majors = ProfileHelper.get_highest_education(
                _profile, education_phrase_dic, discipline_phrase_dic)
            skills = ProfileHelper.get_skills(_profile, skills_dic)
            # Evaluate calculate_years once per profile instead of twice;
            # element 0 feeds 'work_change_times' and element 1 feeds 'years'.
            # NOTE(review): assumes calculate_years is deterministic and has
            # no side effects - confirm against ProfileHelper.
            year_stats = ProfileHelper.calculate_years(_profile)
            profile_dict = {
                'skills': skills,
                'work_change_times': year_stats[0],
                'years': year_stats[1],
                'university':
                ProfileHelper.convert_university(_profile,
                                                 university_name_convert_dict),
                'education': educations,
                'company': [
                    SegmentHelper.normalize(company)
                    for company in _profile['company']
                ],
                'major': majors,
            }
            vector_list.append(profile_dict)
        StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
        StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')
Esempio n. 6
0
    def convert_profile():
        """Build a reduced feature dict for every stored US profile and save them.

        Loads the education, discipline and skills data plus the profile list
        from ``../resource/United States/profile.dat``. Each profile becomes a
        dict with the keys ``skills``, ``years``, ``education`` and ``major``;
        the list of these dicts is written to
        ``../resource/convert_profile.dat`` and
        ``../resource/convert_profile.txt``.
        """
        education_phrases = StoreHelper.load_data('../resource/education.dat')
        discipline_phrases = StoreHelper.load_data(
            '../resource/discipline.dat')
        known_skills = StoreHelper.load_data('../resource/skills.dat')
        raw_profiles = StoreHelper.load_data(
            '../resource/United States/profile.dat', [])

        converted = []
        for raw in raw_profiles:
            # Education lookup runs first, as in the original call order.
            highest = ProfileHelper.get_highest_education(
                raw, education_phrases, discipline_phrases)
            educations, majors = highest
            record = {}
            record['skills'] = ProfileHelper.get_skills(raw, known_skills)
            record['years'] = ProfileHelper.get_years(raw)
            record['education'] = educations
            record['major'] = majors
            converted.append(record)

        StoreHelper.store_data(converted, '../resource/convert_profile.dat')
        StoreHelper.save_file(converted, '../resource/convert_profile.txt')
Esempio n. 7
0
 def print_label(label, index_list, cluster_number=None):
     """Print a summary of cluster labels in one of two formats.

     With ``cluster_number`` given, only ``label`` is used: for every cluster
     id in ``range(cluster_number)`` the positions of ``label`` equal to that
     id are collected (as strings) and one line per cluster is printed.

     Without ``cluster_number``, the labels are printed tab-separated, followed
     by the result of ``DictHelper.dict_from_count_list(label)`` and the
     ``max``/``min`` taken over that result. Each label is then mapped to the
     ``int`` of the ``index_list`` entry at the same position through
     ``DictHelper.append_dic_key``; the mapping is printed and written to
     ``position_tag.dat`` and ``position_tag.txt``.

     Args:
         label: Sequence of cluster labels, one per item.
         index_list: Sequence parallel to ``label``; entries must be
             convertible with ``int``. Unused when ``cluster_number`` is set.
         cluster_number: Number of clusters to report, or ``None`` to select
             the label-summary format.
     """
     if cluster_number is not None:
         size = len(label)
         members_by_cluster = []
         for cluster_id in range(cluster_number):
             members = [
                 str(pos) for pos in range(size) if label[pos] == cluster_id
             ]
             members_by_cluster.append(members)
         for cluster_id, members in enumerate(members_by_cluster):
             print("Cluster %i has %i position, position: %s" %
                   (cluster_id, len(members), str(members)))
         return

     label_dict = DictHelper.dict_from_count_list(label)
     print("\t".join(map(str, label)))
     print(label_dict)
     print("max cluster number: %i" % max(label_dict))
     print("min cluster number: %i" % min(label_dict))
     position_tag = {}
     for pos, tag in enumerate(label):
         DictHelper.append_dic_key(position_tag, tag, int(index_list[pos]))
     for tag, positions in position_tag.items():
         print("%s: %s" % (tag, positions))
     StoreHelper.store_data(position_tag, 'position_tag.dat')
     StoreHelper.save_file(position_tag, 'position_tag.txt')