def get_discipline(major_phrase,
                   prefer_index,
                   discipline_phrase_dic,
                   debug=False):
    """Return the longest discipline phrase matched in a profile's majors.

    The entry ``major_phrase[prefer_index]`` is tried first. If it produces
    no match, every entry of ``major_phrase`` joined with single spaces is
    tried instead.

    Args:
        major_phrase: Indexable collection of major strings (it is both
            indexed and joined with ``' '``).
        prefer_index: Position in ``major_phrase`` of the entry to try first.
        discipline_phrase_dic: Phrase dictionary forwarded to
            ``TextHelper.get_dict_pattern``.
        debug: When true, print a message for each lookup that finds nothing.

    Returns:
        The longest matched key (the first one in iteration order when
        several share the maximum length), or ``None`` if neither lookup
        matched anything.
    """

    def lookup(text):
        # Both attempts share one pipeline: ASCII-fold, normalize, match.
        cleaned = SegmentHelper.normalize(TextHelper.unicode_to_ascii(text))
        return TextHelper.get_dict_pattern(cleaned, discipline_phrase_dic)

    preferred = major_phrase[prefer_index]
    matches = lookup(preferred)
    if len(matches) < 1:
        if debug:
            print("prefer major can not found match phrase in dict: %s" %
                  preferred)
        # Fall back to searching across all majors at once.
        matches = lookup(' '.join(major_phrase))
        if len(matches) < 1:
            if debug:
                print("Can not found major words: %s" % str(major_phrase))
            return None
    # max() keeps the first of several equally long keys.
    return max(matches.keys(), key=len)
 def get_highest_education(profile, education_phrase_dic,
                           discipline_phrase_dic):
     """Find the highest degree in a profile and its matching discipline.

     Every entry of ``profile['education']`` is ASCII-folded, normalized and
     matched against ``education_phrase_dic``. The first matched key of each
     entry is taken as that entry's degree. Degrees are ranked
     ``'Doctor'`` > ``'Master'`` > ``'Bachelor'``.

     Args:
         profile: Mapping with at least the ``'education'`` key; ``'major'``
             is read only when a ranked degree is found.
         education_phrase_dic: Phrase dictionary forwarded to
             ``TextHelper.get_dict_pattern`` for the education entries.
         discipline_phrase_dic: Phrase dictionary forwarded to
             ``ProfileHelper.get_discipline``.

     Returns:
         A pair ``([degree], [discipline])`` for the highest ranked degree
         found, or ``([], [])`` when none of the three degrees is present.
     """
     # Maps degree name -> index of the education entry that produced it.
     # A later entry with the same degree overwrites an earlier one.
     degree_index = {}
     for index, raw_education in enumerate(profile['education']):
         education = SegmentHelper.normalize(
             TextHelper.unicode_to_ascii(raw_education))
         matched = TextHelper.get_dict_pattern(education,
                                               education_phrase_dic)
         if len(matched) > 0:
             # next(iter(...)) rather than keys()[0]: dict views cannot be
             # subscripted on Python 3, and this picks the same first key.
             degree_index[next(iter(matched))] = index

     for degree in ('Doctor', 'Master', 'Bachelor'):
         if degree in degree_index:
             return [degree], [
                 ProfileHelper.get_discipline(profile['major'],
                                              degree_index[degree],
                                              discipline_phrase_dic)
             ]
     return [], []
Example #3
0
 def get_combine_company_dict(store_data_file):
     """Count company names from two resource files and store the result.

     Names come from column 0 of the first two tabs of
     ``../resource/us_list_company2.xlsx`` and from the ``Name`` column of
     ``../resource/us_list_company_1.csv``. A name is counted only when its
     normalized form is non-empty, but the counter is keyed by the raw,
     un-normalized value.

     Args:
         store_data_file: Destination handed to ``StoreHelper.store_data``.
     """
     company_dict = {}

     def count_company(raw_name, text):
         # Normalization is only a filter; the raw name is the dict key.
         if len(SegmentHelper.normalize(text)) > 0:
             DictHelper.increase_dic_key(company_dict, raw_name)

     for tab in range(2):
         _, raw_data = ExcelHelper.read_excel(
             '../resource/us_list_company2.xlsx', tab)
         row_count, _ = raw_data.shape
         for row in range(row_count):
             cell = raw_data[row][0]
             count_company(cell, str(cell).strip())

     frame = pd.read_csv('../resource/us_list_company_1.csv')
     names = frame['Name']
     for position in range(frame.shape[0]):
         count_company(names[position], names[position])

     StoreHelper.store_data(company_dict, store_data_file)
Example #4
0
 def get_discipline_dict(excel_file, dict_file):
     """Build an alias -> discipline dictionary from an Excel sheet and store it.

     For each row, column 0 holds the discipline value and column 1 holds a
     ``'|'``-separated list of aliases. Every non-empty normalized alias, and
     the normalized value itself, is mapped to the raw value. Later rows
     overwrite earlier ones on key collisions.

     Args:
         excel_file: Path forwarded to ``ExcelHelper.read_excel``.
         dict_file: Destination handed to ``StoreHelper.store_data``.
     """
     probability_dict = {}
     _, raw_data = ExcelHelper.read_excel(excel_file)
     row_number, column_number = raw_data.shape
     print(raw_data.shape)
     if column_number != 2:
         print("Attention! Excel file more than two column, please have a check! Use the first two column as dict")

     for row in range(row_number):
         value = raw_data[row][0]
         normalized_aliases = (SegmentHelper.normalize(alias)
                               for alias in raw_data[row][1].split('|'))
         # Skip aliases that are empty (or whitespace only) after normalizing.
         probability_dict.update(
             (alias, value) for alias in normalized_aliases
             if len(alias.strip()) > 0)
         # The discipline's own normalized name always maps to itself.
         probability_dict[SegmentHelper.normalize(value)] = value

     StoreHelper.store_data(probability_dict, dict_file)
     print(probability_dict)
     print("Generalized successfully and store dict(%i) to data file %s!" % (len(probability_dict), dict_file))
 def get_skills(profile, skills_dic, debug=False):
     """Return the skill phrases from ``skills_dic`` found in a profile.

     All entries of ``profile['skills']`` are joined with spaces,
     ASCII-folded, normalized and matched against ``skills_dic``.

     Args:
         profile: Mapping with a ``'skills'`` entry holding strings.
         skills_dic: Phrase dictionary forwarded to
             ``TextHelper.get_dict_pattern``.
         debug: When true, print the normalized skill text.

     Returns:
         A list of the matched keys; empty when nothing matched.
     """
     skill_phrases = SegmentHelper.normalize(
         TextHelper.unicode_to_ascii(' '.join(profile['skills'])))
     if debug:
         print("right after normalize: %s" % skill_phrases)
     skill_phrases_dict = TextHelper.get_dict_pattern(
         skill_phrases, skills_dic)
     # Always return a real list: on Python 3 ``.keys()`` is a view, which
     # made the return type differ between the empty and non-empty cases.
     return list(skill_phrases_dict.keys())
 def run_lemmatize(src_folder, dst_folder, file_count=8535):
     """Normalize every line of the numbered ``.dat`` files in a folder.

     Files are named ``0000.dat``, ``0001.dat``, ... For each index below
     ``file_count`` the source file is read, each of its lines is passed
     through ``SegmentHelper.normalize``, and the lines are joined with
     ``os.linesep`` and saved under the same name in ``dst_folder``.
     Missing source files are reported on stdout and skipped.

     Args:
         src_folder: Folder containing the input files.
         dst_folder: Folder receiving the normalized files.
         file_count: Number of file indices to scan, starting at 0. Defaults
             to 8535, the value that used to be hard-coded.
     """
     for index in range(file_count):
         file_name = "%04d.dat" % index
         input_file = path.join(src_folder, file_name)
         if not StoreHelper.is_file_exist(input_file):
             print("%s not exist!" % input_file)
             continue
         normalized_lines = [
             SegmentHelper.normalize(line)
             for line in StoreHelper.read_file(input_file).splitlines()
         ]
         StoreHelper.save_file(os.linesep.join(normalized_lines),
                               path.join(dst_folder, file_name))
 def convert_university(profile, convert_dict, debug=False):
     """Map a profile's university to its entry in ``convert_dict``.

     Every name in ``profile['university']`` is normalized first. The names
     are then looked up in order with ``DictHelper.find_in_key``, and the
     first hit decides the result.

     Args:
         profile: Mapping with a ``'university'`` entry holding names.
         convert_dict: Conversion dictionary searched by
             ``DictHelper.find_in_key``.
         debug: When true, print the conversion that was applied.

     Returns:
         ``None`` when the profile lists no university; the converted value
         for the first name with a hit; otherwise the first normalized name.
     """
     raw_names = profile['university']
     if len(raw_names) < 1:
         return None

     normalized_names = list(map(SegmentHelper.normalize, raw_names))
     for name in normalized_names:
         matched_key = DictHelper.find_in_key(convert_dict, name)
         if matched_key is None:
             continue
         if debug:
             print("%s ==> %s" % (name, matched_key))
         return convert_dict[matched_key]
     # No name could be converted: fall back to the first normalized one.
     return normalized_names[0]
    def convert_profile2(debug=False):
        """Convert the stored raw profiles into feature dicts and save them.

        Loads the education, discipline and skills dictionaries, the raw
        profiles and the university name conversion table via
        ``StoreHelper.load_data``. One dict is built per profile with the
        keys ``skills``, ``work_change_times``, ``years``, ``university``,
        ``education``, ``company`` and ``major``. The resulting list is
        written to ``../resource/convert_profile.dat`` and
        ``../resource/convert_profile.txt``.

        Args:
            debug: When true, print a progress line for every profile.
        """
        education_phrase_dic = StoreHelper.load_data(
            '../resource/education.dat')
        discipline_phrase_dic = StoreHelper.load_data(
            '../resource/discipline.dat')
        skills_dic = StoreHelper.load_data('../resource/skills.dat')
        # The second argument is presumably the fallback returned when the
        # file is missing -- TODO confirm against StoreHelper.load_data.
        profile_vectors = StoreHelper.load_data(
            '../resource/United States/profile.dat', [])
        university_name_convert_dict = StoreHelper.load_data(
            '../university_name_convert.dic', {})

        vector_list = []
        total = len(profile_vectors)
        for count, _profile in enumerate(profile_vectors, 1):
            if debug:
                print("Profile convert progress: %d/%d" % (count, total))
            educations, majors = ProfileHelper.get_highest_education(
                _profile, education_phrase_dic, discipline_phrase_dic)
            skills = ProfileHelper.get_skills(_profile, skills_dic)
            # Evaluate once per profile; both fields come from the same
            # result (index 0 and index 1).
            year_info = ProfileHelper.calculate_years(_profile)
            vector_list.append({
                'skills': skills,
                'work_change_times': year_info[0],
                'years': year_info[1],
                'university': ProfileHelper.convert_university(
                    _profile, university_name_convert_dict),
                'education': educations,
                'company': [
                    SegmentHelper.normalize(company)
                    for company in _profile['company']
                ],
                'major': majors,
            })

        StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
        # NOTE(review): save_file receives a list here; confirm that it
        # accepts non-string content.
        StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')