def get_record_num(SICNo, s_d, s_m, s_y, e_d, e_m, e_y):
    """Query the OSHA accident search and return the total number of
    matching fall-accident records for a SIC code and date window.

    Args:
        SICNo: SIC industry code to filter on.
        s_d, s_m, s_y: start day / month / year of the search window.
        e_d, e_m, e_y: end day / month / year of the search window.

    Returns:
        int: total record count parsed from the results-page header.
    """
    # Assemble the query string in the exact parameter order the site expects.
    search_url = (
        'https://www.osha.gov/pls/imis/AccidentSearch.search?'
        'p_logger=1&acc_description=fall&acc_Abstract=&acc_keyword=&'
        'sic={sic}&'
        'naics=&Office=All&officetype=All&'
        'endmonth={e_m}&'
        'endday={e_d}&'
        'endyear={e_y}&'
        'startmonth={s_m}&'
        'startday={s_d}&'
        'startyear={s_y}&'
        'InspNr='
    ).format(sic=SICNo, e_m=e_m, e_d=e_d, e_y=e_y, s_m=s_m, s_d=s_d, s_y=s_y)

    browser = BrowserShadow()
    response = browser.open_url(search_url)
    page_soup = BeautifulSoup(response.read(), "html.parser")

    # The total appears as "... of N" in the right-aligned header cell.
    header_cells = (page_soup.select('#maincontain')[0]
                    .select('.row-fluid')[0]
                    .select('.row-fluid')[0]
                    .select('.span3')[0]
                    .select('.text-right'))
    count_text = header_cells[0].get_text()
    return int(count_text[count_text.find('of') + 2:])
def parse_page_obtian_event(page_soup):
    """Walk the results table of an OSHA search page, fetch each accident's
    detail page, and pass the parsed detail soup plus the row's summary
    fields to parse_accident_details for storage.

    Args:
        page_soup: BeautifulSoup of one search-results page.
    """
    rows = (page_soup.select('#maincontain')[0]
            .select('.row-fluid')[0]
            .select('.table-responsive')[1]
            .select('table')[0]
            .find_all('tr'))

    # Skip the header row; every remaining row is one accident record.
    for row in rows[1:]:
        cells = row.find_all('td')
        SummaryNr = cells[2].find('a').get_text()
        EventDate = cells[3].get_text()
        ReportID = cells[4].get_text()
        # An 'X' in the fatality column marks a fatal accident.
        Fat = 1 if 'X' in cells[5].get_text() else 0
        SIC = cells[6].find('a').get_text()
        EventDesc = cells[7].get_text()

        detail_url = 'https://www.osha.gov/pls/imis/' + cells[2].find('a')['href']

        detail_browser = BrowserShadow()
        detail_response = detail_browser.open_url(detail_url)
        detail_soup = BeautifulSoup(detail_response.read(), "html.parser")

        # Parse the detail page together with the summary-row fields.
        abstract_info = {'SummaryNr': SummaryNr, 'EventDate': EventDate,
                         'ReportID': ReportID, 'Fat': str(Fat),
                         'SIC': SIC, 'EventDesc': EventDesc}
        parse_accident_details(detail_soup, abstract_info)
def get_similar_matrix(data_dic):
    """Build a pairwise relatedness matrix for the keys of *data_dic* via
    the metamind relatedness service.

    Every (row, col) key pair is sent to the service; each score is logged
    to 'ocup_comp_result.txt', and each completed row of scores is appended
    to 'ocup_comp_records.csv' (whose header row is the key list).

    Args:
        data_dic: dict whose keys are the phrases to compare pairwise.

    Returns:
        dict mapping each row key to its list of scores (one per column key).
    """
    import json  # local import: the file-level import block is outside this chunk

    comp_element = list(data_dic.keys())
    similar_matrix_dic = {}

    # 'with' guarantees both output files are closed even if a request fails
    # (the original leaked output_f on any exception).
    with open('ocup_comp_result.txt', 'w+') as output_f, \
         open('ocup_comp_records.csv', 'w', newline='') as comp_r_csv:
        spamwriter = csv.writer(comp_r_csv, dialect='excel')
        spamwriter.writerow(comp_element)

        url = "https://www.metamind.io/language/relatedness/test?{test_p_str}"
        brw = BrowserShadow()
        for each_item_r in comp_element:
            r_list = []
            for each_item_c in comp_element:
                test_data = {"text_2": each_item_r, "text_1": each_item_c}
                comp_url = url.format(test_p_str=urlencode(test_data))

                res = brw.open_url(comp_url)
                compar_result = res.read().decode()
                # json.loads replaces eval(): never evaluate an untrusted
                # network response as Python code.
                result = json.loads(compar_result)

                r_list.append(result["score"])
                r_str = "row, col, comp_r, %s, %s, %s" % (
                    each_item_r, each_item_c, result["score"])
                output_f.write(r_str + "\n")
                print(r_str)
            similar_matrix_dic[each_item_r] = r_list
            spamwriter.writerow(r_list)
    return similar_matrix_dic
def get_record_list(SICNo, s_d, s_m, s_y, e_d, e_m, e_y):
    """Page through all OSHA fall-accident search results for a SIC code
    and date window, parsing and storing each page of records.

    Fix: the paged search URL previously hard-coded the date window
    (startmonth=07&startday=24&startyear=2015&...&endyear=1984) and
    ignored this function's date arguments, so the paging query could
    disagree with the count obtained by get_record_num. It now uses the
    same s_/e_ parameters.

    Args:
        SICNo: SIC industry code to filter on.
        s_d, s_m, s_y: start day / month / year of the search window.
        e_d, e_m, e_y: end day / month / year of the search window.

    Returns:
        None. Records are persisted via parse_page_obtian_event.
    """
    record_num = get_record_num(SICNo, s_d, s_m, s_y, e_d, e_m, e_y)

    if record_num <= 0:
        print("No Eligible Record has been retrieved!")
        return

    # p_finish is the number of records already consumed (paging offset);
    # p_show is the page size requested from the site.
    p_finish = 0
    p_show = min(100, record_num)

    checked_num = 0
    while True:
        # Shrink the final page so we never request past the last record.
        if (p_finish + p_show) > record_num:
            p_show = record_num - p_finish
        if p_show == 0:
            break

        search_url = (
            'https://www.osha.gov/pls/imis/accidentsearch.search?'
            'sic={sic}&'
            'sicgroup=&naics=&acc_description=fall&acc_abstract=&acc_keyword=&inspnr=&fatal=&officetype=All&office=All&'
            'startmonth={s_m}&startday={s_d}&startyear={s_y}&'
            'endmonth={e_m}&endday={e_d}&endyear={e_y}&keyword_list=&p_start=&'
            'p_finish={p_finish}&'
            'p_sort=&p_desc=DESC&p_direction=Next&'
            'p_show={p_show}'
        ).format(sic=SICNo, s_m=s_m, s_d=s_d, s_y=s_y,
                 e_m=e_m, e_d=e_d, e_y=e_y,
                 p_finish=p_finish, p_show=p_show)

        brw = BrowserShadow()
        res = brw.open_url(search_url)
        page_content = res.read()
        page_soup = BeautifulSoup(page_content, "html.parser")

        # Parse this page and collect the records into the MySQL database.
        parse_page_obtian_event(page_soup)

        checked_num = checked_num + p_show
        if checked_num == record_num:
            break
        p_finish = p_finish + p_show
def compare_2_sentences(text_1, text_2):
    """Return the relatedness score between two sentences from the
    metamind relatedness service.

    Args:
        text_1: first sentence.
        text_2: second sentence.

    Returns:
        The numeric "score" field of the service's JSON response.
    """
    import json  # local import: the file-level import block is outside this chunk

    url = "https://www.metamind.io/language/relatedness/test?{text}"
    data_dic = {"text_1": text_1, "text_2": text_2}
    url = url.format(text=urlencode(data_dic))
    brw = BrowserShadow()
    res = brw.open_url(url)
    compar_result = res.read().decode()
    # json.loads replaces eval(): never evaluate an untrusted network
    # response as Python code.
    result = json.loads(compar_result)
    return result["score"]
def update_employee_detail_info_from_OSHA():
    """Refresh the injury-detail columns of case_employees rows by scraping
    each employee's OSHA accident-detail page.

    Fixes over the original:
      * The UPDATE statement's adjacent string literals were missing a
        space, producing invalid SQL ("...PartBody = %swhere EID...").
      * The cursor and connection are now closed via try/finally.
      * The loop variable `index` was shadowed by two nested loops; each
        loop now has its own name.

    Returns:
        1 when the sweep completes.
    """
    # NOTE(review): credentials are hard-coded; move to config/env.
    user = '******'
    pwd = '123456'
    host = '127.0.0.1'
    db = 'reported_fall_event'
    cnx = mysql.connector.connect(user=user, password=pwd, host=host, database=db)
    cursor = cnx.cursor()
    try:
        query_sql = "select SummaryNr from case_employees order by ID"
        cursor.execute(query_sql)
        employee_list = cursor.fetchall()

        brw = BrowserShadow()

        # Resume support: skip rows until SummaryNr 202315776 appears, then
        # process everything from that point onward.
        need_update = 0
        for each_employee in employee_list:
            SummaryNr_value = each_employee[0].strip()

            if SummaryNr_value != "202315776" and need_update == 0:
                continue
            else:
                need_update = 1

            print("SummaryNr :", SummaryNr_value)
            employee_detail_info_url = "https://www.osha.gov/pls/imis/accidentsearch.accident_detail?id={SummaryNr}"
            employee_detail_info_url = employee_detail_info_url.format(SummaryNr=SummaryNr_value)

            res = brw.open_url(employee_detail_info_url)
            page_soup = BeautifulSoup(res.read(), "html.parser")

            # Locate the 'Keywords' row; the employee rows follow it.
            event_content_details = page_soup.find_all('tr')
            keyword_position = -1
            for row_idx in range(0, len(event_content_details)):
                if event_content_details[row_idx].get_text().find('Keywords') > -1:
                    keyword_position = row_idx

            # Do not process accidents with no keyword section.
            if keyword_position == -1:
                continue

            # When an 'End Use' (project type) header follows the keywords,
            # the employee rows start two rows later than otherwise.
            if event_content_details[keyword_position + 1].find_all('th')[0].get_text().find('End Use') < 0:
                index_employee_start = keyword_position + 2
            else:
                index_employee_start = keyword_position + 4

            for emp_idx in range(index_employee_start, len(event_content_details)):
                emp_cell = event_content_details[emp_idx].find_all('td')[0]
                current_employee_eid = emp_cell.get_text().strip()
                current_employee_url_str = 'https://www.osha.gov/pls/imis/' + emp_cell.find('a')['href']
                print(current_employee_url_str)

                res_employee_info = brw.open_url(current_employee_url_str)
                page_soup_employee = BeautifulSoup(res_employee_info.read(), "html.parser")
                employee_content_details = page_soup_employee.find_all('tr')

                # Anchor on the 'Inspection' row; the injury fields sit at
                # fixed offsets below it (assumes OSHA's current page
                # layout — TODO confirm offsets if the site changes).
                inspection_Nr_position = -1
                for det_idx in range(0, len(employee_content_details)):
                    if employee_content_details[det_idx].get_text().find('Inspection') > -1:
                        inspection_Nr_position = det_idx
                        break
                if inspection_Nr_position == -1:
                    continue

                TaskAssigned = employee_content_details[inspection_Nr_position + 13].find_all('td')[1].get_text()
                HumanFactor = employee_content_details[inspection_Nr_position + 10].find_all('td')[1].get_text()
                EnvironmentFactor = employee_content_details[inspection_Nr_position + 9].find_all('td')[1].get_text()
                EventType = employee_content_details[inspection_Nr_position + 8].find_all('td')[1].get_text()
                SourceInjury = employee_content_details[inspection_Nr_position + 7].find_all('td')[1].get_text()
                PartBody = employee_content_details[inspection_Nr_position + 6].find_all('td')[1].get_text()

                # Fix: trailing space before "where" — the original literals
                # concatenated to "...PartBody = %swhere EID...".
                update_employee_injury_info_sql = (
                    "update case_employees set "
                    "TaskAssigned = %s, HumanFactor = %s, EnvironmentFactor = %s, "
                    "EventType = %s, SourceInjury = %s, PartBody = %s "
                    "where EID = %s and SummaryNr = %s")
                update_values = (TaskAssigned, HumanFactor, EnvironmentFactor,
                                 EventType, SourceInjury, PartBody,
                                 current_employee_eid, SummaryNr_value)

                cursor.execute(update_employee_injury_info_sql, update_values)
                cnx.commit()
                print(TaskAssigned, '--', HumanFactor, '--', EnvironmentFactor,
                      '--', EventType, '--', SourceInjury, '--', PartBody)
    finally:
        # Fix: the original leaked the cursor and the connection.
        cursor.close()
        cnx.close()
    print("Update has done!")
    return 1