def extract_education_section(file_path, actual_file_name):
    """
    Read the text file at ``file_path`` and hand its contents to
    ``extract_text_from_pdf``.

    :param file_path: path of the text file to read (per the original note,
        a PDF that has already been converted to text).
    :param actual_file_name: passed through unchanged to
        ``extract_text_from_pdf``.
    :return: whatever ``extract_text_from_pdf`` returns, or ``None`` when
        reading or extraction raises (the failure is logged instead).
    :raises IOError: if the file cannot be opened; ``open`` runs outside the
        ``try`` block, so that error is not swallowed here.
    """
    # Python 2 only: reload(sys) brings back sys.setdefaultencoding, which is
    # removed from the sys namespace at interpreter start-up. The call changes
    # the default codec for the WHOLE process, not just for this function.
    reload(sys)
    sys.setdefaultencoding('latin-1')
    with open(file_path, 'r') as fp:
        try:
            pdf_text = fp.read()
            return extract_text_from_pdf(pdf_text, actual_file_name)
        except Exception:
            # print_exc() writes the traceback itself and returns None, so it
            # must not be wrapped in a print statement (that emitted an extra
            # "None" line after every traceback).
            traceback.print_exc()
            # Record the failing path; the "','" suffix is kept exactly as the
            # log consumer expects it.
            utility.write_to_text_file(CODEC_FAILURE, file_path + "','")
    return None
Exemple #2
0
def extract_bold_header_section(file_path, actual_file_name):
    """
    Read the HTML file at ``file_path`` and hand its contents to
    ``utility.fetch_bold_header_contents``.

    :param file_path: path of the HTML file to read.
    :param actual_file_name: passed through unchanged to
        ``utility.fetch_bold_header_contents``.
    :return: whatever ``utility.fetch_bold_header_contents`` returns, or
        ``None`` when reading or extraction raises (the failure is logged
        instead).
    :raises IOError: if the file cannot be opened; ``open`` runs outside the
        ``try`` block, so that error is not swallowed here.
    """
    # Python 2 only: reload(sys) brings back sys.setdefaultencoding, which is
    # removed from the sys namespace at interpreter start-up. The call changes
    # the default codec for the WHOLE process, not just for this function.
    reload(sys)
    sys.setdefaultencoding('latin-1')
    with open(file_path, 'r') as fp:
        try:
            html_text = fp.read()
            return utility.fetch_bold_header_contents(html_text,
                                                      actual_file_name)
        except Exception:
            # print_exc() writes the traceback itself and returns None, so it
            # must not be wrapped in a print statement (that emitted an extra
            # "None" line after every traceback).
            traceback.print_exc()
            # Record the failing path; the "','" suffix is kept exactly as the
            # log consumer expects it.
            utility.write_to_text_file(CODEC_FAILURE, file_path + "','")
    return None
Exemple #3
0
def categorize_work_info(data, logic_code):
    """
    Master function for parsing the different work category.

    Splits ``data["work"]`` on ``"|"``, runs ``work_parsing`` over the pieces,
    converts the collected rows with ``utility.convert_to_df`` and hands the
    result to ``put_parse_data_to_db``.

    :param data: mapping that provides the ``"work"`` and ``"file_name"``
        entries read below.
    :param logic_code: passed through unchanged to ``work_parsing``.
    :return: ``None``; the outcome is handed to ``put_parse_data_to_db``.

    Any failure while parsing (including ``work_parsing`` returning nothing)
    is logged and the file name is appended to ``FAILED_WORK_JSON_PARSE``.
    The conversion and DB hand-off below still run in that case, with an
    empty result list.
    """
    parsed_data_result = []

    try:
        work_content = str(data["work"])
        file_name = str(data["file_name"])

        word_list = work_content.split("|")
        forward_processed = work_parsing(word_list, file_name, logic_code)
        if not forward_processed:
            # An empty parse is routed through the same failure path as a
            # real error so the file is recorded as failed.
            raise Exception("No data parsed from work ********************")
        parsed_data_result.extend(forward_processed)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in a print statement (that emitted an extra
        # "None" line after every traceback).
        traceback.print_exc()
        # NOTE(review): this lookup raises again if "file_name" is missing or
        # is not a string -- confirm callers always supply it.
        utility.write_to_text_file(FAILED_WORK_JSON_PARSE,
                                   data["file_name"] + "','")

    parsed_data = utility.convert_to_df(parsed_data_result)
    put_parse_data_to_db(parsed_data)