def extract_education_section(file_path, actual_file_name):
    """
    Read the text file converted from a PDF and run section extraction on it.

    The whole file is read into memory and handed to
    ``extract_text_from_pdf`` together with ``actual_file_name``.

    :param file_path: path of the converted text file to read.
    :param actual_file_name: passed through unchanged to
        ``extract_text_from_pdf``.
    :return: whatever ``extract_text_from_pdf`` returns, or ``None`` when
        reading or extraction raised; in that case ``file_path`` is recorded
        through ``utility.write_to_text_file`` under ``CODEC_FAILURE``.
    :raises IOError: if the file cannot be opened (``open`` runs outside the
        ``try`` block, so that error is not swallowed).
    """
    # Python 2 only: ``site`` removes ``sys.setdefaultencoding`` at startup,
    # so ``sys`` has to be reloaded before the call below is available.
    # NOTE(review): this switches the default codec for the whole process,
    # not just for this call.
    reload(sys)
    sys.setdefaultencoding('latin-1')
    with open(file_path, 'r') as fp:
        try:
            pdf_text = str(fp.read())
            return extract_text_from_pdf(pdf_text, actual_file_name)
        except Exception:
            # print_exc() writes the traceback itself and returns None, so it
            # must not be wrapped in ``print`` (that emitted a stray "None").
            traceback.print_exc()
            # The "','" suffix is presumably the entry separator of the
            # failure log -- TODO confirm against the code that reads it.
            utility.write_to_text_file(CODEC_FAILURE, file_path + "','")
def extract_bold_header_section(file_path, actual_file_name):
    """
    Read an HTML file and extract its bold header contents.

    The whole file is read into memory and handed to
    ``utility.fetch_bold_header_contents`` together with
    ``actual_file_name``.

    :param file_path: path of the HTML file to read.
    :param actual_file_name: passed through unchanged to
        ``utility.fetch_bold_header_contents``.
    :return: whatever ``utility.fetch_bold_header_contents`` returns, or
        ``None`` when reading or extraction raised; in that case
        ``file_path`` is recorded through ``utility.write_to_text_file``
        under ``CODEC_FAILURE``.
    :raises IOError: if the file cannot be opened (``open`` runs outside the
        ``try`` block, so that error is not swallowed).
    """
    # Python 2 only: ``site`` removes ``sys.setdefaultencoding`` at startup,
    # so ``sys`` has to be reloaded before the call below is available.
    # NOTE(review): this switches the default codec for the whole process,
    # not just for this call.
    reload(sys)
    sys.setdefaultencoding('latin-1')
    with open(file_path, 'r') as fp:
        try:
            html_text = str(fp.read())
            return utility.fetch_bold_header_contents(html_text, actual_file_name)
        except Exception:
            # print_exc() writes the traceback itself and returns None, so it
            # must not be wrapped in ``print`` (that emitted a stray "None").
            traceback.print_exc()
            # The "','" suffix is presumably the entry separator of the
            # failure log -- TODO confirm against the code that reads it.
            utility.write_to_text_file(CODEC_FAILURE, file_path + "','")
def categorize_work_info(data, logic_code):
    """
    Parse the work section of one record and store the result.

    ``data["work"]`` is split on ``"|"`` and the pieces are passed to
    ``work_parsing`` along with the file name and ``logic_code``. Whatever
    was parsed is converted with ``utility.convert_to_df`` and handed to
    ``put_parse_data_to_db``. Conversion and storage run even when parsing
    failed, in which case the result list is empty.

    :param data: mapping that provides the ``"work"`` and ``"file_name"``
        entries.
    :param logic_code: passed through unchanged to ``work_parsing``.
    :return: ``None``.
    :raises KeyError: if ``data`` has no ``"file_name"`` entry; the failure
        handler reads that key again in order to log it.
    """
    parsed_data_result = []
    try:
        work_content = str(data["work"])
        file_name = str(data["file_name"])
        word_list = work_content.split("|")
        forward_processed = work_parsing(word_list, file_name, logic_code)
        if not forward_processed:
            # An empty parse is routed through the same failure path as a
            # real error so that it gets logged below.
            raise Exception("No data parsed from work ********************")
        parsed_data_result.extend(forward_processed)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in ``print`` (that emitted a stray "None").
        traceback.print_exc()
        # The "','" suffix is presumably the entry separator of the failure
        # log -- TODO confirm against the code that reads it.
        utility.write_to_text_file(FAILED_WORK_JSON_PARSE, data["file_name"] + "','")
    parsed_data = utility.convert_to_df(parsed_data_result)
    put_parse_data_to_db(parsed_data)