def get_discipline(major_phrase, prefer_index, discipline_phrase_dic, debug=False):
    """Resolve the discipline for a preferred major phrase.

    First tries to match the preferred major against the discipline
    dictionary; on failure falls back to matching the whole joined
    ``major_phrase`` list.  Returns the longest matched discipline key,
    or None when nothing matches at all.
    """
    def _lookup(text):
        # Normalize to ASCII, segment, then match against the dictionary.
        ascii_text = TextHelper.unicode_to_ascii(text)
        normalized = SegmentHelper.normalize(ascii_text)
        return TextHelper.get_dict_pattern(normalized, discipline_phrase_dic)

    matches = _lookup(major_phrase[prefer_index])
    if not matches:
        if debug:
            print("prefer major can not found match phrase in dict: %s"
                  % major_phrase[prefer_index])
        # Fall back to the full major-phrase list joined into one string.
        matches = _lookup(' '.join(major_phrase))
    if not matches:
        if debug:
            print("Can not found major words: %s" % str(major_phrase))
        return None
    # Prefer the most specific match: first key with maximal length.
    longest = max(len(key) for key in matches)
    for candidate in matches:
        if len(candidate) == longest:
            return candidate
def get_highest_education(profile, education_phrase_dic, discipline_phrase_dic):
    """Return the highest education level found in ``profile`` plus its discipline.

    Scans every entry of ``profile['education']`` for an education-level
    phrase, then returns ``([level], [discipline])`` for the highest level
    found (Doctor > Master > Bachelor), or ``([], [])`` when none matches.
    """
    education_dic = {}
    for i in range(len(profile['education'])):
        education = SegmentHelper.normalize(
            TextHelper.unicode_to_ascii(profile['education'][i]))
        education_dic[i] = TextHelper.get_dict_pattern(
            education, education_phrase_dic)
    # Map matched education level -> index of the entry it came from.
    # BUG FIX: the original used e_dic.keys()[0], which is Python-2-only;
    # in Python 3 dict.keys() is a view and is not subscriptable.
    education_dic = {
        next(iter(e_dic)): index
        for index, e_dic in education_dic.items() if len(e_dic) > 0
    }
    # Check levels from highest to lowest; the first hit wins.
    for level in ('Doctor', 'Master', 'Bachelor'):
        if level in education_dic:
            return [level], [
                ProfileHelper.get_discipline(profile['major'],
                                             education_dic[level],
                                             discipline_phrase_dic)
            ]
    return [], []
def remove_reduntant_text_part(self, caption: CaptionLine, to_cut_text: str, at_start: bool) -> str:
    """
    Remove only the part that remained redundant and unformatted.
    Returns the substring that still needs to be removed.
    :return:
    """
    # Smallest length shared by the caption's leftover text and the text to cut.
    len_to_cut = TextHelper.define_smallest_len_to_cut(
        caption.left_unformatted_text, to_cut_text)
    # Nothing left on the caption side: everything still needs removing.
    if caption.left_unformatted_text == '':
        return to_cut_text
    if self.is_similar_strings(caption.left_unformatted_text, to_cut_text, at_start):
        # Split the caption's leftover text from the matching end.
        if at_start:
            splitted_text_tuple = TextHelper.cut_text_from_start(
                caption.left_unformatted_text, len_to_cut)
        else:
            splitted_text_tuple = TextHelper.cut_text_from_end(
                caption.left_unformatted_text, len_to_cut)
    else:
        # Not similar: leave the caption untouched, caller keeps the full text.
        return to_cut_text
    caption.left_unformatted_text = splitted_text_tuple[1]
    caption.correct_unformatted_text = splitted_text_tuple[0]
    # Adjust the remaining length that still has to be removed.
    return TextHelper.cut_text_from_start(to_cut_text, len_to_cut)[0]
def get_skills(profile, skills_dic, debug=False):
    """Extract skill keywords from ``profile['skills']``.

    Joins all skill phrases, normalizes them to ASCII, and matches them
    against ``skills_dic``.  Returns a list of matched skill keys
    (possibly empty).
    """
    skill_phrases = ' '.join(profile['skills'])
    skill_phrases = SegmentHelper.normalize(
        TextHelper.unicode_to_ascii(skill_phrases))
    if debug:
        print("right after normalize: %s" % skill_phrases)
    skill_phrases_dict = TextHelper.get_dict_pattern(
        skill_phrases, skills_dic)
    # BUG FIX: the original returned [] on no match but dict.keys() (a
    # non-indexable view in Python 3) otherwise — inconsistent return
    # types for callers.  Always return a real list.
    return list(skill_phrases_dict.keys())
def is_similar_strings(self, str1: str, str2: str, at_start_to_cut=True) -> bool:
    """Check whether one string is a prefix (or suffix) of the other.

    The shorter string — identified via ``define_smallest_len_to_cut`` —
    is tested against the longer one, using ``startswith`` when
    ``at_start_to_cut`` is True and ``endswith`` otherwise.
    """
    len_to_cut = TextHelper.define_smallest_len_to_cut(str1, str2)
    matcher = str.startswith if at_start_to_cut else str.endswith
    if len(str1) == len_to_cut:
        # str1 is the shorter string: check it against str2.
        return matcher(str2, str1)
    # Otherwise str2 is the shorter one.
    return matcher(str1, str2)
def calculate_years(profile):
    """Count and sum the year spans listed in ``profile['years']``.

    Each entry is split on '-' and both halves parsed with
    ``TextHelper.get_year``; entries that do not yield exactly two
    parseable years are reported and skipped.  Returns a tuple of
    (number of valid spans, total years across them).
    """
    durations = []
    for raw_span in profile['years']:
        parts = TextHelper.unicode_to_ascii(raw_span).split('-')
        parsed = [TextHelper.get_year(part) for part in parts]
        parsed = [year for year in parsed if year is not None]
        if len(parsed) != 2:
            print("can not calculate %s" % str(parsed))
            continue
        try:
            durations.append(parsed[1] - parsed[0])
        except TypeError:
            # Values parsed but not subtractable — report and move on.
            print("Can not minus between %s" % str(parsed))
    return len(durations), sum(durations)
def __init__(self, original_sentence: str):
    """Wrap a sentence and precompute its punctuation-free form."""
    # The raw sentence text exactly as supplied by the caller.
    self.original_sentence = original_sentence
    # Sentence with everything except letters stripped out.
    self.without_punctuation = TextHelper.leave_only_letters(
        self.original_sentence)
    self.len_without_punctuation = len(self.without_punctuation)
    # Portion of the sentence not consumed yet (starts as the whole text).
    self.unused_original_sentence = original_sentence
    # Per-character objects built by LetterListManager — presumably one
    # entry per letter; see LetterListManager.create_letters_list.
    self.letters = LetterListManager.create_letters_list(
        self.original_sentence)
def _get_education_words(self, education_dict):
    """Match education phrases from ``education_dict`` in the raw position.

    When nothing matches, a default "Bachelor" requirement is recorded
    in ``self.new_words_list`` and returned with a count of 1.
    """
    matched = TextHelper.get_dict_pattern(self.raw_position, education_dict)
    if matched:
        return matched
    # No education phrase found: fall back to the default requirement.
    default_education_requirement = "Bachelor"
    self.new_words_list.append(default_education_requirement)
    return {default_education_requirement: 1}
def get_prompt_url_from_web(url, start_tag, end_tag, user='******', password='******'):
    """Fetch a page and extract the prompt URL found between two tags.

    Downloads ``url`` with HTTP authentication, pulls the text between
    ``start_tag`` and ``end_tag``, and joins it back onto ``url`` to
    form an absolute prompt URL.
    """
    _, web_content = WebHelper.get_auth_url_content(url, user, password)
    relative_url = TextHelper.find_text_between_tag(
        web_content, start_tag, end_tag)
    prompt_url = WebHelper.join_url(url, relative_url)
    print("get new prompt url: %s" % prompt_url)
    return prompt_url
def _get_working_year_words(self, year_convert_file=None):
    """Collect working-year phrases from the raw position text.

    When no year pattern is found, a default "[0]" requirement is
    recorded in ``self.new_words_list``.  When ``year_convert_file`` is
    given, matched phrases are mapped through its conversion table and
    unknown phrases are dropped.  Returns a phrase -> count dict.
    """
    year_list = TextHelper.get_years_pattern(self.raw_position)
    if not year_list:
        # No year requirement present: record the default placeholder.
        default_year_requirement = "[0]"
        self.new_words_list.append(default_year_requirement)
        year_list = [default_year_requirement]
    elif year_convert_file is not None:
        year_convert_dict = StoreHelper.load_data(year_convert_file, {})
        year_list = [year_convert_dict[item]
                     for item in year_list
                     if item in year_convert_dict]
    return DictHelper.dict_from_count_list(year_list)
def __init__(self, raw_line: str, line_number: int):
    """Wrap one caption line and initialize its formatting state."""
    self.raw_line = raw_line
    # Per-character objects for the raw line; see LetterListManager.
    self.caption_letters = LetterListManager.create_letters_list(raw_line)
    self.line_number = line_number
    # Line with everything except letters stripped, and its length.
    self.without_punctuation = TextHelper.leave_only_letters(self.raw_line)
    self.len_without_punctuation = len(self.without_punctuation)
    self.__start_pos_in_cap_text = 0
    self.__end_pos_in_cap_text = 0
    # Remaining text that still needs to be formatted
    self.left_unformatted_text = self.without_punctuation
    self.correct_unformatted_text = self.without_punctuation
    self.is_formatted = False
    self.formatted_text = ''
    # Bookkeeping flags used when matching this line against sentences.
    self.checked = False
    self.checked_twice = False
def get_post(web_source):
    """Locate the job-post element in a scraped page.

    Tries a series of known page layouts in order and returns
    ``(True, element)`` on a unique match, ``(False, ...)`` otherwise.
    Pages not mentioning 'data scientist' are rejected up front.
    """
    if not TextHelper.contain(web_source, 'data scientist'):
        print("Not contain data scientist")
        return False, None
    soup = BeautifulSoup(web_source, 'lxml')
    # post = soup.find('article', id='jobview') This include header
    # if post is not None:
    #     return True, post
    # Layout 1: div.jobview-section.
    post = soup.find_all('div', class_='jobview-section')
    if len(post) >= 1:
        if len(post) > 1:
            print("Too many jobview-section")
        return (True, post[0]) if len(post) == 1 else (False, post[0])
    # Layout 2: generic bootstrap panels.
    post = soup.findAll(True, {'class': re.compile('^panel panel-default')})
    if len(post) >= 1:
        if len(post) > 1:
            print("Too many panel panel-default")
            # Multiple generic panels: narrow to the specific job-view article.
            post = soup.find_all(
                'article', class_='panel panel-default m-job-view panel-body')
            if len(post) == 1:
                return True, post[0]
            else:
                print("Can not solve multi panel")
                return False, None
        return (True, post[0]) if len(post) == 1 else (False, post[0])
    # Layout 3: scrollable job description container.
    post = soup.find_all('div', class_='job-show-description-scroller')
    if len(post) >= 1:
        if len(post) > 1:
            print("Too many job-show-description-scroller")
        return (True, post[0]) if len(post) == 1 else (False, post[0])
    # Remaining layouts: single-element lookups by id, tried in order.
    post = soup.find('div', id='details-job-content')
    if post is not None:
        return True, post
    post = soup.find('div', id='jobcopy')
    if post is not None:
        return True, post
    post = soup.find('div', id='bodycol')
    if post is not None:
        return True, post
    post = soup.find('div', id='JobDescription')
    return (True, post) if post is not None else (False, None)
def get_part_of_string_by_percent(self, percent: int) -> str:
    """Cut off and return roughly ``percent`` percent of the sentence.

    Mutates ``self.raw_sentence``, leaving only the uncut remainder.
    Percentages of 90 or more are rounded up to the whole sentence.
    """
    if percent >= 90:
        percent = 100
    # Initial cut position, measured against the original sentence length.
    cut_chars_from_start_cnt = math.floor(
        len(self.original_sentence) * percent / 100)
    # The text may get cut in the middle of a word, so extend the cut
    # forward to the next non-letter or punctuation character.
    for i, char in enumerate(self.raw_sentence):
        if i >= cut_chars_from_start_cnt:
            if not TextHelper.is_letter(char) or re.match(
                    r'[.?!,]', char) is not None:
                cut_chars_from_start_cnt = i
                break
    result = self.raw_sentence[:cut_chars_from_start_cnt]
    self.raw_sentence = self.raw_sentence[cut_chars_from_start_cnt:]
    # If the remainder starts with end-of-sentence punctuation, consume
    # it entirely so the sentence is finished in this call.
    if re.match(r'[.?!]', self.raw_sentence) is not None:
        result += self.raw_sentence
        self.raw_sentence = ''
    return result
def start(self):
    """Distribute sentence text across caption lines and save the result."""
    captions_raw = FileManager.get_file_rows(self.captions_filename)
    result = []
    # For each caption line determine the sentence number and character count
    for caption_raw in captions_raw:
        caption_line = CaptionLine(caption_raw)
        # Blank/new-line captions pass through unchanged.
        if TextHelper.is_new_line(caption_raw):
            result.append(caption_line)
            continue
        if caption_line.substrings_cnt == 1:
            # Single-substring caption: take its share of the current
            # English sentence and cut the matching Russian portion.
            eng_sentence = self.get_first_not_handled_english_sentence()
            percent = int(caption_line.get_full_len() /
                          eng_sentence.get_full_len() * 100)
            eng_sentence.add_percent(percent)
            sentence_part = self.get_first_not_empty_russian_sentence(
            ).get_part_of_string_by_percent(percent)
            caption_line.add_formatted_text(sentence_part)
        if caption_line.substrings_cnt > 1:
            # Multi-substring caption: repeat the same allocation once
            # per substring, each against the then-current sentence.
            for i in range(caption_line.substrings_cnt):
                eng_sentence = self.get_first_not_handled_english_sentence(
                )
                len_cap_line = len(caption_line.substrings[i])
                percent = int(len_cap_line /
                              eng_sentence.get_full_len() * 100)
                eng_sentence.add_percent(percent)
                sentence_part = self.get_first_not_empty_russian_sentence(
                ).get_part_of_string_by_percent(percent)
                caption_line.add_formatted_text(sentence_part)
        result.append(caption_line)
    self.save_list_to_file(result)
def get_fortune_500_dict():
    """Load fortune500.csv and map cleaned company names to their row index."""
    companies = pd.read_csv("./resource/fortune500.csv")['company'].to_dict()
    return {TextHelper.company_name_clean(name): row_index
            for row_index, name in companies.items()}
if __name__ == '__main__':
    # Training entry point: load config, build helper, mark adversaries.
    print('Start training')
    time_start_load_everything = time.time()
    parser = argparse.ArgumentParser(description='PPDL')
    parser.add_argument('--params', dest='params', default='params_words.json')
    args = parser.parse_args()
    with open(f'./{args.params}', 'r') as f:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # and unsafe on untrusted files — consider yaml.safe_load.
        params_loaded = yaml.load(f)
    current_time = datetime.datetime.now().strftime('%b.%d_%H.%M.%S')
    # Pick the helper matching the configured task type.
    if params_loaded['type'] == "image":
        helper = ImageHelper(current_time=current_time,
                             params=params_loaded,
                             name=params_loaded.get('name', 'image'))
    else:
        helper = TextHelper(current_time=current_time,
                            params=params_loaded,
                            name=params_loaded.get('name', 'text'))
    helper.load_data()
    helper.create_model()  ### Create models
    if helper.params['is_poison']:
        # Participant 0 plus randomly sampled others act as adversaries.
        helper.params['adversary_list'] = [0]+ \
                random.sample(range(helper.params['number_of_total_participants']),
                              helper.params['number_of_adversaries']-1)
        logger.info(f"Poisoned following participants: {len(helper.params['adversary_list'])}")
    else:
        helper.params['adversary_list'] = list()
    best_loss = float('inf')
    # NOTE(review): this chunk is truncated here — the vis.text call
    # continues beyond the visible source.
    vis.text(text=dict_html(helper.params, current_time=helper.params["current_time"]),
def _get_skill_words(self, skill_dict):
    """Match skill phrases from ``skill_dict`` in the raw position text."""
    matched_skills = TextHelper.get_dict_pattern(self.raw_position, skill_dict)
    return matched_skills
            # NOTE(review): this chunk begins mid-expression — the tail of a
            # regex lookup extracting the LinkedIn job id from the page source.
            'SearchJobJserp":{"jobPosting":"urn:li:fs_normalized_jobPosting:(\d*)"',
            web_source)))
        return job_id

    def get_web_source(self, url):
        """Fetch ``url`` with the authenticated driver and return its UTF-8 source.

        Returns None when not authenticated or when the page fails to load
        within the randomly chosen delay.
        """
        if not self.__has_authentication:
            print("Error! Not have authenticate yet!")
            return None
        self.__driver.get(url)
        delay = random.choice([5, 3, 3, 13, 4, 15, 7, 60, 5, 3, 4, 5, 7,
                               9])  # random delay seconds
        time.sleep(delay)
        try:
            # Wait until the page marker element appears before reading source.
            WebDriverWait(self.__driver, delay).until(
                EC.presence_of_element_located((By.ID, 'clientPageInstance')))
            print("Page is ready!")
            return self.__driver.page_source.encode("utf-8")
        except TimeoutException:
            print("Loading took too much time!")
            return None


if __name__ == "__main__":
    # Manual smoke test: log in, fetch one job page, store prettified HTML.
    chrome = ChromeHelper()
    chrome.authenticate("https://www.linkedin.com/uas/login-cap",
                        ("*****@*****.**", "Linkedin0405"))
    web_content = chrome.get_web_source(
        "https://www.linkedin.com/jobs/view/309059479")
    soup = BeautifulSoup(web_content.decode('utf-8'), 'html.parser')
    TextHelper.store_html(soup.prettify().encode('utf-8'), "309059479.html")
def _get_discipline_words(self, discipline_dict):
    """Match discipline phrases from ``discipline_dict`` in the raw position text."""
    return TextHelper.get_dict_pattern(self.raw_position, discipline_dict)
if __name__ == '__main__':
    # Training entry point: parse CLI, load config, set up helper and logging.
    parser = argparse.ArgumentParser(description='PPDL')
    parser.add_argument('--params', dest='params', default='utils/params.yaml')
    parser.add_argument('--name', dest='name', required=True)
    args = parser.parse_args()
    d = datetime.now().strftime('%b.%d_%H.%M.%S')
    # One TensorBoard run directory per experiment name.
    writer = SummaryWriter(log_dir=f'runs/{args.name}')
    writer.add_custom_scalars(layout)
    with open(args.params) as f:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # and unsafe on untrusted files — consider yaml.safe_load.
        params = yaml.load(f)
    # Choose the text or image helper based on the configured model type.
    if params.get('model', False) == 'word':
        helper = TextHelper(current_time=d, params=params, name='text')
        helper.corpus = torch.load(helper.params['corpus'])
        logger.info(helper.corpus.train.shape)
    else:
        helper = ImageHelper(current_time=d, params=params, name='utk')
    # Log to both a per-run file and stdout.
    logger.addHandler(
        logging.FileHandler(filename=f'{helper.folder_path}/log.txt'))
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)
    logger.info(f'current path: {helper.folder_path}')
    # Training hyper-parameters pulled from the loaded config.
    batch_size = int(helper.params['batch_size'])
    num_microbatches = int(helper.params['num_microbatches'])
    lr = float(helper.params['lr'])
    momentum = float(helper.params['momentum'])
    decay = float(helper.params['decay'])
if __name__ == '__main__':
    # Training entry point (variant with a defaulted --name for local testing).
    parser = argparse.ArgumentParser(description='PPDL')
    parser.add_argument('--params', dest='params', default='utils/params.yaml')
    # parser.add_argument('--name', dest='name', required=True)
    parser.add_argument('--name', dest='name', default='test')
    args = parser.parse_args()
    d = datetime.now().strftime('%b.%d_%H.%M.%S')
    # One TensorBoard run directory per experiment name.
    writer = SummaryWriter(log_dir=f'runs/{args.name}')
    writer.add_custom_scalars(layout)
    with open(args.params) as f:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # and unsafe on untrusted files — consider yaml.safe_load.
        params = yaml.load(f)
    # Choose the text or image helper based on the configured model type.
    if params.get('model', False) == 'word':
        helper = TextHelper(current_time=d, params=params, name='text')
        helper.corpus = torch.load(helper.params['corpus'])
        logger.info(helper.corpus.train.shape)
    else:
        helper = ImageHelper(current_time=d, params=params, name='utk')
    # Log to both a per-run file and stdout.
    logger.addHandler(logging.FileHandler(filename=f'{helper.folder_path}/log.txt'))
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)
    logger.info(f'current path: {helper.folder_path}')
    # Training hyper-parameters pulled from the loaded config.
    batch_size = int(helper.params['batch_size'])
    num_microbatches = int(helper.params['num_microbatches'])
    lr = float(helper.params['lr'])
    momentum = float(helper.params['momentum'])
    decay = float(helper.params['decay'])
    epochs = int(helper.params['epochs'])
def _get_responsibility_words(self, education_dict):
    """Match responsibility phrases in the raw position text.

    NOTE(review): the parameter is named ``education_dict`` but, per the
    method name, presumably carries responsibility phrases — consider
    renaming (kept as-is for caller compatibility).
    """
    matched = TextHelper.get_dict_pattern(self.raw_position, education_dict)
    return matched
def _feature4(element):
    """Feature extractor: the data length of ``element`` per TextHelper."""
    data_length = TextHelper.get_data_length(element)
    return data_length
def find_caption_list_which_best_suit_for_sentence(
        self, punctuation_file_sentence: PunctuationSentence,
        captions_prepared: List[CaptionLine]) -> List[CaptionLine]:
    """
    Returns the list of CaptionLine objects containing the lines
    needed for formatting.
    """
    result = []
    result_string_len = 0
    for caption_prepared in captions_prepared:
        # Candidate lines: not yet used twice, still holding unformatted
        # text, not already formatted, and textually similar to the
        # sentence being placed.
        if not caption_prepared.checked_twice and caption_prepared.left_unformatted_text != '' and not caption_prepared.is_formatted and TextHelper.has_similar_substring(
                caption_prepared.left_unformatted_text,
                punctuation_file_sentence.without_punctuation):
            # Accumulate the string length
            result_string_len += len(
                caption_prepared.left_unformatted_text)
            # Second selection of an already-checked line marks it as
            # checked twice so it will not be picked again.
            if caption_prepared.checked:
                caption_prepared.checked_twice = True
            caption_prepared.checked = True
            result.append(caption_prepared)
            # Stop once enough characters were gathered to cover the sentence.
            if result_string_len > punctuation_file_sentence.len_without_punctuation:
                break
    return result