def get_discipline(major_phrase, prefer_index, discipline_phrase_dic, debug=False):
    """Pick the longest discipline phrase matched in a profile's major text.

    ``major_phrase[prefer_index]`` is tried first. If it produces no match,
    all entries of ``major_phrase`` joined by single spaces are tried instead.

    Args:
        major_phrase: Sequence of major strings.
        prefer_index: Index into ``major_phrase`` of the entry to try first.
        discipline_phrase_dic: Phrase dictionary passed to
            ``TextHelper.get_dict_pattern``.
        debug: When true, print a message for each lookup that finds nothing.

    Returns:
        The longest key among the matches (the first such key in iteration
        order on a tie), or ``None`` when neither lookup matches anything.
    """

    def lookup(text):
        # Both attempts share one pipeline:
        # unicode_to_ascii -> normalize -> get_dict_pattern.
        cleaned = SegmentHelper.normalize(TextHelper.unicode_to_ascii(text))
        return TextHelper.get_dict_pattern(cleaned, discipline_phrase_dic)

    matches = lookup(major_phrase[prefer_index])
    if len(matches) == 0:
        if debug:
            print("prefer major can not found match phrase in dict: %s"
                  % major_phrase[prefer_index])
        # Fall back to every major phrase at once.
        matches = lookup(' '.join(major_phrase))
        if len(matches) == 0:
            if debug:
                print("Can not found major words: %s" % str(major_phrase))
            return None

    # max() keeps the first maximal item, i.e. the first longest key.
    return max(matches.keys(), key=len)
def get_highest_education(profile, education_phrase_dic, discipline_phrase_dic):
    """Return the highest recognised degree of a profile and its discipline.

    Every entry of ``profile['education']`` is passed through
    ``TextHelper.unicode_to_ascii`` and ``SegmentHelper.normalize`` and then
    matched against ``education_phrase_dic``. For each entry with at least one
    match, the first matched key is taken as that entry's degree. The degrees
    are then checked in the order Doctor, Master, Bachelor.

    Args:
        profile: Mapping with an ``'education'`` sequence and a ``'major'``
            sequence; the index of an education entry is used as the
            preferred index into ``profile['major']``.
        education_phrase_dic: Phrase dictionary for degree detection.
        discipline_phrase_dic: Phrase dictionary forwarded to
            ``ProfileHelper.get_discipline``.

    Returns:
        A 2-tuple ``([degree], [discipline])`` for the highest degree found,
        or ``([], [])`` when none of the three degrees is present.
    """
    degree_to_index = {}
    for index, raw_education in enumerate(profile['education']):
        education = SegmentHelper.normalize(
            TextHelper.unicode_to_ascii(raw_education))
        matched = TextHelper.get_dict_pattern(education, education_phrase_dic)
        if len(matched) > 0:
            # next(iter(...)) yields the same first key as the former
            # ``keys()[0]`` but also works on Python 3, where dict views are
            # not subscriptable. A later entry with the same degree
            # overrides an earlier one, as before.
            degree_to_index[next(iter(matched))] = index

    # Highest degree first; the first one present wins.
    for degree in ('Doctor', 'Master', 'Bachelor'):
        if degree in degree_to_index:
            discipline = ProfileHelper.get_discipline(
                profile['major'], degree_to_index[degree],
                discipline_phrase_dic)
            return [degree], [discipline]
    return [], []
def get_combine_company_dict(store_data_file):
    """Count company names from two resource files and store the counts.

    Names are read from the first column of the first two sheets of
    ``../resource/us_list_company2.xlsx`` and from the ``Name`` column of
    ``../resource/us_list_company_1.csv``. A name is counted only when its
    normalized form is non-empty; the counter key is the raw (un-normalized)
    value.

    Args:
        store_data_file: Destination passed to ``StoreHelper.store_data``.
    """
    company_counts = {}

    # Excel source: column 0 of sheets 0 and 1.
    for sheet_index in range(2):
        _header, sheet_rows = ExcelHelper.read_excel(
            '../resource/us_list_company2.xlsx', sheet_index)
        row_count, _column_count = sheet_rows.shape
        for row_index in range(row_count):
            raw_name = sheet_rows[row_index][0]
            normalized = SegmentHelper.normalize(str(raw_name).strip())
            if len(normalized) == 0:
                continue
            # The raw cell value, not the normalized text, is the key.
            DictHelper.increase_dic_key(company_counts, raw_name)

    # CSV source: the "Name" column.
    frame = pd.read_csv('../resource/us_list_company_1.csv')
    names = frame['Name']
    for position in range(frame.shape[0]):
        raw_name = names[position]
        if len(SegmentHelper.normalize(raw_name)) == 0:
            continue
        DictHelper.increase_dic_key(company_counts, raw_name)

    StoreHelper.store_data(company_counts, store_data_file)
def get_discipline_dict(excel_file, dict_file):
    """Build a phrase -> discipline lookup from an Excel sheet and store it.

    Each row supplies a discipline value in column 0 and a ``'|'``-separated
    list of alias phrases in column 1. Every non-empty normalized alias, and
    the normalized discipline value itself, is mapped to the raw discipline
    value.

    Args:
        excel_file: Path passed to ``ExcelHelper.read_excel``.
        dict_file: Destination passed to ``StoreHelper.store_data``.
    """
    phrase_to_discipline = {}
    _header, table = ExcelHelper.read_excel(excel_file)
    n_rows, n_cols = table.shape
    print(table.shape)
    if n_cols != 2:
        print("Attention! Excel file more than two column, please have a check! Use the first two column as dict")

    for row_index in range(n_rows):
        discipline = table[row_index][0]
        aliases = table[row_index][1].split('|')
        for alias in aliases:
            alias = SegmentHelper.normalize(alias)
            # Keep only aliases that are non-empty after normalization.
            if len(alias.strip()) > 0:
                phrase_to_discipline[alias] = discipline
        # The discipline name itself is also a valid lookup phrase.
        phrase_to_discipline[SegmentHelper.normalize(discipline)] = discipline

    StoreHelper.store_data(phrase_to_discipline, dict_file)
    print(phrase_to_discipline)
    print("Generalized successfully and store dict(%i) to data file %s!"
          % (len(phrase_to_discipline), dict_file))
def get_skills(profile, skills_dic, debug=False):
    """Return the skill phrases from ``skills_dic`` found in a profile.

    The entries of ``profile['skills']`` are joined with spaces, passed
    through ``TextHelper.unicode_to_ascii`` and ``SegmentHelper.normalize``,
    and matched against ``skills_dic``.

    Args:
        profile: Mapping with a ``'skills'`` sequence of strings.
        skills_dic: Phrase dictionary passed to
            ``TextHelper.get_dict_pattern``.
        debug: When true, print the normalized skill text.

    Returns:
        A list of the matched phrases; empty when nothing matches.
    """
    skill_text = SegmentHelper.normalize(
        TextHelper.unicode_to_ascii(' '.join(profile['skills'])))
    if debug:
        print("right after normalize: %s" % skill_text)
    matched = TextHelper.get_dict_pattern(skill_text, skills_dic)
    # Always return a real list. On Python 3 ``dict.keys()`` is a view that
    # cannot be indexed and generally cannot be pickled, and the empty case
    # already returned a list.
    return list(matched.keys())
def run_lemmatize(src_folder, dst_folder, file_count=8535):
    """Normalize numbered ``.dat`` files line by line into another folder.

    For every ``i`` in ``range(file_count)`` the file ``<src_folder>/NNNN.dat``
    (``i`` zero-padded to four digits) is read, each of its lines is passed
    through ``SegmentHelper.normalize``, and the lines are written, joined
    with ``os.linesep``, to the same file name under ``dst_folder``. A
    missing input file is reported on stdout and skipped.

    Args:
        src_folder: Folder containing the input files.
        dst_folder: Folder receiving the normalized files.
        file_count: Number of consecutive file indices to process, starting
            at 0. Defaults to 8535, the value that used to be hard-coded.
    """
    for index in range(file_count):
        file_name = "%04d.dat" % index
        input_file = path.join(src_folder, file_name)
        if not StoreHelper.is_file_exist(input_file):
            print("%s not exist!" % input_file)
            continue
        output_file = path.join(dst_folder, file_name)
        file_content = StoreHelper.read_file(input_file)
        new_content = [
            SegmentHelper.normalize(line)
            for line in file_content.splitlines()
        ]
        StoreHelper.save_file(os.linesep.join(new_content), output_file)
def convert_university(profile, convert_dict, debug=False):
    """Map a profile's university to its entry in ``convert_dict``.

    All names in ``profile['university']`` are normalized first. They are then
    checked in order, and the first one for which
    ``DictHelper.find_in_key`` returns a key decides the result.

    Args:
        profile: Mapping with a ``'university'`` sequence of names.
        convert_dict: Conversion dictionary searched by
            ``DictHelper.find_in_key``.
        debug: When true, print the ``name ==> matched key`` pair on a hit.

    Returns:
        ``None`` when the profile lists no university; the ``convert_dict``
        value for the first matching name; otherwise the first normalized
        name.
    """
    raw_names = profile['university']
    if len(raw_names) == 0:
        return None

    normalized_names = [SegmentHelper.normalize(name) for name in raw_names]
    for name in normalized_names:
        matched_key = DictHelper.find_in_key(convert_dict, name)
        if matched_key is None:
            continue
        if debug:
            print("%s ==> %s" % (name, matched_key))
        return convert_dict[matched_key]

    # Nothing could be converted: fall back to the first normalized name.
    return normalized_names[0]
def convert_profile2(debug=False):
    """Convert the stored raw profiles into feature dictionaries and save them.

    Loads the education, discipline and skills phrase dictionaries, the raw
    profiles and the university-name conversion dictionary from their fixed
    resource paths. One dictionary per profile is built with the keys
    ``skills``, ``work_change_times``, ``years``, ``university``,
    ``education``, ``company`` and ``major``. The resulting list is written
    to ``../resource/convert_profile.dat`` via ``StoreHelper.store_data`` and
    to ``../resource/convert_profile.txt`` via ``StoreHelper.save_file``.

    Args:
        debug: When true, print a progress line for every profile.
    """
    education_phrase_dic = StoreHelper.load_data('../resource/education.dat')
    discipline_phrase_dic = StoreHelper.load_data('../resource/discipline.dat')
    skills_dic = StoreHelper.load_data('../resource/skills.dat')
    profile_vectors = StoreHelper.load_data(
        '../resource/United States/profile.dat', [])
    university_name_convert_dict = StoreHelper.load_data(
        '../university_name_convert.dic', {})

    vector_list = []
    total = len(profile_vectors)
    for count, _profile in enumerate(profile_vectors, 1):
        if debug:
            print("Profile convert progress: %d/%d" % (count, total))
        educations, majors = ProfileHelper.get_highest_education(
            _profile, education_phrase_dic, discipline_phrase_dic)
        skills = ProfileHelper.get_skills(_profile, skills_dic)
        # Computed once and reused for both fields; the original called
        # calculate_years twice per profile.
        # NOTE(review): assumes calculate_years has no side effects - confirm.
        year_info = ProfileHelper.calculate_years(_profile)
        vector_list.append({
            'skills': skills,
            'work_change_times': year_info[0],
            'years': year_info[1],
            'university': ProfileHelper.convert_university(
                _profile, university_name_convert_dict),
            'education': educations,
            'company': [
                SegmentHelper.normalize(company)
                for company in _profile['company']
            ],
            'major': majors,
        })

    StoreHelper.store_data(vector_list, '../resource/convert_profile.dat')
    StoreHelper.save_file(vector_list, '../resource/convert_profile.txt')