def parse_html(html_str, court_name): try: soup = BeautifulSoup(html_str, "html.parser") div = soup.find_all('div', {'id': 'CPHBody_PanelList'})[0] a_list_soup = BeautifulSoup(str(div), "html.parser") a_list = a_list_soup.find_all('a') a_list_unique = list(set(a_list)) for a in a_list_unique: emergency_exit = select_one_query( "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit is not None: if emergency_exit['emergency_exit'] == 1: break case_no = escape_string(str(str(a.text)[:-10]).replace("-", "")) pdf_data = "NULL" pdf_file = "NULL" # insert_check = False judgment_date = escape_string(str(a.text)[-10:]) # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date): # insert_check = True a_link = a.get('href') pdf_data = escape_string( request_pdf(base_url + a_link, case_no, court_name)) pdf_file = escape_string(base_url + a_link) # if case_no != "NULL" and insert_check: if case_no != "NULL": sql_query = "INSERT INTO " + str(court_name) + " (case_no, judgment_date, pdf_file, pdf_filename) " \ "VALUE ('" + case_no + "', '" + judgment_date + "', '" \ + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')" insert_query(sql_query) update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'") update_query( "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'") return True except Exception as e: traceback.print_exc() logging.error("Failed to parse the html: %s", e) update_query( "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'") return False
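# The Tracker emergency-exit lookup above is repeated verbatim at the top of every parser
# loop in this module. A minimal consolidation sketch, assuming select_one_query keeps the
# signature used above; the helper name should_stop is hypothetical, not part of the
# original code:
def should_stop(court_name):
    """Return True when the Tracker row asks this scraper to halt early."""
    row = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
    return row is not None and row['emergency_exit'] == 1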
def parse_html(html_str, court_name): try: soup = BeautifulSoup(html_str, "html.parser") ul = soup.find_all('ul')[0] ul_soup = BeautifulSoup(str(ul), "html.parser") li_list = ul_soup.find_all('li') # p_list = ul_soup.find_all('p') # p_list = [x for x in p_list if "<p><font" not in str(x)] # print(p_list) # return for li in li_list: emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit is not None: if emergency_exit['emergency_exit'] == 1: break a = BeautifulSoup(str(li), "html.parser").a a_link = a.get('href') case_no = str(a_link[a_link.rfind("/")+1:]).replace('.pdf', '') judgment_date = "NULL" pdf_data = "NULL" pdf_file = "NULL" # insert_check = False # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date): # insert_check = True judgment_date = escape_string(case_no[-10:].replace('(', '').replace(')', '')) pdf_data = escape_string(request_pdf(base_url + a_link, case_no, court_name)) pdf_file = escape_string(base_url + a_link) # if case_no != "NULL" and insert_check: if case_no != "NULL": sql_query = "INSERT INTO " + str(court_name) + " (case_no, judgment_date, pdf_file, pdf_filename) " \ "VALUE ('" + case_no + "', '" + judgment_date + "', '" \ + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')" insert_query(sql_query) update_query("UPDATE " + court_name + " SET text_data = '" + str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'") update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'") return True except Exception as e: traceback.print_exc() logging.error("Failed to parse the html: %s", e) update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'") return False
def parse_html(html_str, court_name, appeal_type):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_soup = BeautifulSoup(str(soup.find_all('table', {'class': 'table table-bordered'})[0]), 'html.parser')
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count == 1:  # header row
                continue
            case_no = "NULL"
            date_of_order = "NULL"
            appellant = "NULL"
            respondent = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue
                if i == 2:
                    case_no = escape_string(str(td.text).strip().replace("\n", ""))
                if i == 3:
                    date_of_order = escape_string(str(td.text).strip().replace("\n", ""))
                    # if select_count_query(str(court_name), str(case_no), 'date_of_order', date_of_order):
                    #     insert_check = True
                if i == 4:
                    party = str(td.decode_contents()).split("V/s")
                    appellant = escape_string(str(party[0]))
                    respondent = escape_string(str(party[1]))
                if i == 5:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_url = str(base_url + a_tag.get('href')).replace('\\', '/')
                    pdf_file = escape_string(pdf_url)
                    pdf_data = escape_string(request_pdf(pdf_url, case_no, court_name))
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, date_of_order, appellant, respondent, " \
                            "pdf_file, appeal_type, pdf_filename) VALUE ('" + case_no + "', '" + date_of_order + \
                            "', '" + appellant + "', '" + respondent + "', '" + pdf_file + "', '" + appeal_type + \
                            "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name, bench_code):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table')
        for table in table_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            text = "NULL"
            text_file = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            table_soup = BeautifulSoup(str(table), "html.parser")
            td_list = table_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(str(td.decode_contents()))
                if i == 2:
                    petitioner = escape_string(str(td.decode_contents()))
                if i == 4:
                    respondent = escape_string(str(td.decode_contents()))
                if i == 6:
                    judgment_date = escape_string(str(td.decode_contents()))
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 7:
                    judge_name = escape_string(str(td.decode_contents()))
                if i == 8:
                    a_link = BeautifulSoup(str(td), "html.parser").a.get('href')
                    text_dir = request_text(base_url + a_link, case_no, court_name)
                    text = escape_string(text_dir['data'])
                    text_file = escape_string(base_url + a_link)
                if i == 9:
                    a_link = BeautifulSoup(str(td), "html.parser").a.get('href')
                    pdf_file = escape_string(base_url + a_link)
                    pdf_data = escape_string(request_pdf(base_url + a_link, case_no, court_name))
            # if case_no != "NULL" and insert_check and petitioner != 'Judgment Information System':
            if case_no != "NULL" and petitioner != 'Judgment Information System':
                sql_query = "INSERT INTO " + str(court_name) + \
                            " (case_no, petitioner, respondent, judgment_date, judge_name, text_data, text_file, " \
                            "pdf_file, bench_code, pdf_filename) VALUE ('" + case_no + "', '" + petitioner + \
                            "', '" + respondent + "', '" + judgment_date + "', '" + judge_name + "', '" + text + \
                            "', '" + text_file + "', '" + pdf_file + "', " + str(bench_code) + ", '" + court_name + \
                            "_" + slugify(case_no) + ".txt')"
                insert_query(sql_query)
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name): try: soup = BeautifulSoup(html_str, "html.parser") tr_list = soup.find_all('tr') case_no = "NULL" diary_number = "NULL" petitioner = "NULL" respondent = "NULL" petitioner_advocate = "NULL" respondent_advocate = "NULL" judgment_date = "NULL" judge_name = "NULL" bench = "NULL" pdf_data = "NULL" pdf_file = "NULL" tr_count = 0 for tr in tr_list: emergency_exit = select_one_query( "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit is not None: if emergency_exit['emergency_exit'] == 1: break tr_count += 1 tr_soup = BeautifulSoup(str(tr), "html.parser") td_list = tr_soup.find_all('td') if tr_count == 1: td_count = 0 for td in td_list: td_count += 1 if td_count == 3: diary_number = escape_string(str(td.decode_contents())) if tr_count == 2: td_count = 0 for td in td_list: td_count += 1 if td_count == 2: case_no = escape_string(str(td.decode_contents())) if td_count == 3: judgment_date = escape_string(str(td.a.string)) a_link = BeautifulSoup(str(td), "html.parser").a.get('href') pdf_data = escape_string( request_pdf(base_url + a_link, case_no, court_name)) pdf_file = escape_string(base_url + a_link) if tr_count == 3: td_count = 0 for td in td_list: td_count += 1 if td_count == 2: petitioner = escape_string(str(td.decode_contents())) if tr_count == 4: td_count = 0 for td in td_list: td_count += 1 if td_count == 2: respondent = escape_string(str(td.decode_contents())) if tr_count == 5: td_count = 0 for td in td_list: td_count += 1 if td_count == 2: petitioner_advocate = escape_string( str(td.decode_contents())) if tr_count == 6: td_count = 0 for td in td_list: td_count += 1 if td_count == 2: respondent_advocate = escape_string( str(td.decode_contents())) if tr_count == 7: td_count = 0 for td in td_list: td_count += 1 if td_count == 2: bench = escape_string(str(td.decode_contents())) if tr_count == 8: td_count = 0 for td in td_list: td_count += 1 if td_count == 2: judge_name = escape_string(str(td.decode_contents())) # if case_no != "NULL" and select_count_query(str(court_name), str(case_no), 'judgment_date', # judgment_date): if case_no != "NULL": sql_query = "INSERT INTO " + str(court_name) + \ " (diary_number, case_no, petitioner, respondent, petitioner_advocate, " \ "respondent_advocate, judgment_date, bench, judge_name, pdf_file, pdf_filename) VALUE "\ "('" + diary_number + "', '" + case_no + "', '" + petitioner + "', '" + respondent + \ "', '" + petitioner_advocate + "', '" + respondent_advocate + "', '" + judgment_date + \ "', '" + bench + "', '" + judge_name + "', '" + pdf_file + "', '" + court_name + "_" \ + slugify(case_no) + ".pdf')" insert_query(sql_query) update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'") update_query( "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'") if tr_count == 9: tr_count = 0 case_no = "NULL" diary_number = "NULL" petitioner = "NULL" respondent = "NULL" petitioner_advocate = "NULL" respondent_advocate = "NULL" judgment_date = "NULL" judge_name = "NULL" bench = "NULL" pdf_data = "NULL" pdf_file = "NULL" return True except Exception as e: logging.error("Failed to parse the html: %s", e) update_query( "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'") return False
def parse_html(html_str, court_name):
    try:
        soup = BeautifulSoup(str(html_str), "html.parser")
        table_soup = BeautifulSoup(str(soup.find_all('table', {"width": "100%"})[0]), "html.parser")
        tr_list = table_soup.select('tr')
        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count <= 2:  # header rows
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.select('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue
                if i == 2 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    case_no = escape_string(str(font_tag.text))
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 3 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    respondent = escape_string(str(font_tag.text))
                if i == 4 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    petitioner = escape_string(str(font_tag.text))
                if i == 5 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    judgment_date = escape_string(str(font_tag.text))
                if td.get('align') == 'left':
                    td_soup1 = BeautifulSoup(str(td), "html.parser")
                    judge_name = escape_string(str(td_soup1.text))
                if td.get('align') == 'center':
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    a_tag = BeautifulSoup(str(font_tag), "html.parser").a
                    pdf_file = escape_string(base_url + "/" + a_tag.get('href'))
                    pdf_data = escape_string(
                        bytes(str(request_pdf(base_url + "/" + a_tag.get('href'), case_no)),
                              'utf-8').decode("utf-8", 'ignore'))
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "judge_name, pdf_file, pdf_filename) VALUE ('" + case_no + "', '" + petitioner + \
                            "', '" + respondent + "', '" + judgment_date + "', '" + judge_name + "', '" + pdf_file + \
                            "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name): try: soup = BeautifulSoup(html_str, "html.parser") select_soup = BeautifulSoup( str(soup.find_all('select', {'id': 'txtlist'})[0]), "html.parser") tr_list = select_soup.find_all('option') for tr in tr_list: emergency_exit = select_one_query( "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit is not None: if emergency_exit['emergency_exit'] == 1: break # insert_check = False pdf_value = tr['value'] res = BeautifulSoup( str(tr['onmouseover']).replace("return overlib('", "").replace("')", ""), "html.parser") [s.extract() for s in res('font')] res = str(res).replace('\n', '').strip().split('<br/>') petitioner = escape_string(res[0]) respondent = escape_string(res[1]) judge = escape_string(res[2]) judgment_date = escape_string(res[3]) mix_data = str(res[4]).replace("', CAPTION, '", '') reportable = mix_data[0:2] case_no = escape_string(mix_data[3:]) if reportable == 'No': continue # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date): # insert_check = True # if case_no != "NULL" and insert_check: if case_no != "NULL": pdf_data = escape_string( request_pdf(case_no, court_name, pdf_value)) sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \ "judge, pdf_file, pdf_filename, reportable) VALUE ('" + \ case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + \ judge + "', '" + pdf_value + "', '" + court_name + "_" + slugify(case_no) + ".pdf', '" + \ reportable + "')" insert_query(sql_query) update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'") update_query( "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'") return True except Exception as e: traceback.print_exc() logging.error("Failed to parse the html: %s", e) update_query( "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'") return False
def parse_html(html_str, court_name, court_id):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table')
        for table in table_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            case_no = "NULL"
            judgment_date = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            table_soup = BeautifulSoup(str(table), "html.parser")
            td_list = table_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(str(td.decode_contents()))
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 3:
                    judgment_date = escape_string(str(td.decode_contents()))
                if i == 4:
                    pdf_file = base_url + BeautifulSoup(str(td), "html.parser").a.get('href')
                    pdf_data = escape_string(request_pdf(pdf_file, case_no, court_name))
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, court_id, judgment_date, pdf_file, " \
                            "pdf_filename) VALUE ('" + case_no + "', " + court_id + ", '" + judgment_date + \
                            "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name, m_sideflg):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_soup = BeautifulSoup(str(soup.find_all('form')[0]), "html.parser")
        table_soup = BeautifulSoup(str(table_soup.find_all('table', {"width": "100%"})[0]), "html.parser")
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            # Skip the four header rows and the spacer row between every record.
            if tr_count <= 4 or tr_count % 2 == 0:
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            coram = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1 or i == 6 or str(td.decode_contents()).replace("\n", "").strip() == \
                        '<font color="blue">LBR : Larger Benches Referred Matter</font>':
                    continue
                if i == 2:
                    coram = escape_string(str(td.decode_contents()))
                if i == 3:
                    data1 = escape_string(str(td.decode_contents()))
                    data1_list = data1.split("<b>")
                    petitioner = data1_list[0]
                    respondent = str(data1_list[1]).split("</b>")[1]
                if i == 4:
                    data2 = escape_string(str(td.decode_contents()))
                    data2_list = data2.split("<br/>")
                    judgment_date = data2_list[0]
                if i == 5:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = base_url + a_tag.get('href')
                    case_no = str(a_tag.text).replace("\n", "")
                    pdf_data = escape_string(request_pdf(pdf_file, case_no, court_name))
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (m_sideflg, case_no, petitioner, respondent, " \
                            "judgment_date, coram, pdf_file, pdf_filename) VALUE ('" + m_sideflg + "', '" + \
                            case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + \
                            coram + "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name): try: soup = BeautifulSoup(html_str, "html.parser") table_list = soup.find_all('table', {'style': 'width:100%; margin-top: 10px; font-size: 12px;'}) table_soup = BeautifulSoup(str(table_list), "html.parser") tr_list = table_soup.find_all('tr') tr_count = 0 for tr in tr_list: emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit is not None: if emergency_exit['emergency_exit'] == 1: break tr_count += 1 if tr_count == 1: continue appeal_no = "NULL" appellant = "NULL" respondent = "NULL" date_of_order = "NULL" filed_by = "NULL" pdf_data = "NULL" pdf_file = "NULL" order_type = "NULL" # insert_check = False tr_soup = BeautifulSoup(str(tr), "html.parser") td_list = tr_soup.find_all('td') i = 0 for td in td_list: i += 1 if i == 1: appeal_no = escape_string(str(td.text).strip().replace("\n", "")) if i == 2: filed_by = escape_string(str(td.text).strip().replace('\n', '')) if i == 3: appellant = escape_string(str(td.text).strip().replace('\n', '')) if i == 4: respondent = escape_string(str(td.text).strip().replace('\n', '')) if i == 5: a_tag = BeautifulSoup(str(td), "html.parser").a details_url = a_tag.get('href') date_of_order, pdf_file, order_type = details_parse(details_url, appeal_no, court_name) # if select_count_query_other(str(court_name), 'appeal_no', str(appeal_no), 'date_of_order', # date_of_order): # insert_check = True pdf_data = escape_string(str(request_pdf(pdf_file, appeal_no, court_name)).replace("'", "")) # if appeal_no != "NULL" and insert_check: if appeal_no != "NULL": sql_query = "INSERT INTO " + str(court_name) + " (appeal_no, appellant, respondent, filed_by, " \ "bench_code, pdf_filename ) VALUE ('" + appeal_no + \ "', '" + appellant + "', '" + respondent + "', '" + filed_by + "', '" + court_name + \ "_" + slugify(appeal_no) + ".pdf')" insert_query(sql_query) update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "', date_of_order ='" + date_of_order + "', pdf_file = '" + pdf_file + "', order_type = '" + order_type + "' WHERE appeal_no = '" + str(appeal_no) + "'") update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'") return True except Exception as e: logging.error("Failed to parse the html: %s", e) update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'") return False
def parse_html(html_str, court_name): try: soup = BeautifulSoup(html_str, "html.parser") table_tag = soup.find_all('table')[1] table_soup = BeautifulSoup(str(table_tag), "html.parser") tr_list = table_soup.find_all('tr') tr_count = 0 for tr in tr_list: emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit is not None: if emergency_exit['emergency_exit'] == 1: break tr_count += 1 if tr_count <= 2 or tr_count > 17: continue case_no = "NULL" judgment_date = "NULL" coram = "NULL" type_ = "NULL" status = "NULL" pdf_data = "NULL" pdf_file = "NULL" # insert_check = False tr_soup = BeautifulSoup(str(tr), "html.parser") td_list = tr_soup.find_all('td') i = 0 for td in td_list: i += 1 if i == 1: case_no = escape_string(str(td.decode_contents())) # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date): # insert_check = True if i == 2: coram = escape_string(str(td.decode_contents())) if i == 3: judgment_date = escape_string(str(td.decode_contents())) if i == 5: type_ = escape_string(str(td.decode_contents())) if i == 6: status = escape_string(str(td.decode_contents())) if i == 4: a_tag = BeautifulSoup(str(td), "html.parser").a pdf_file = escape_string(base_url + a_tag.get('href')) pdf_data = escape_string(request_pdf(base_url + a_tag.get('href'), case_no, court_name)) # if case_no != "NULL" and insert_check and case_no.find("DISCLAIMER") == -1: if case_no != "NULL" and case_no.find("DISCLAIMER") == -1: sql_query = "INSERT INTO " + str(court_name) + " (case_no, judgment_date, coram, type, status, " \ "pdf_file, pdf_filename) VALUE ('" + case_no + "', '" + \ judgment_date + "', '" + coram + "', '" + type_ + "', '" + status + "', '" + pdf_file + \ "', '" + court_name + "_" + slugify(case_no) + ".pdf')" insert_query(sql_query) update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'") update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'") return True except Exception as e: traceback.print_exc() logging.error("Failed to parse the html: %s", e) update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'") return False
def parse_html(html_str, court_name): try: soup = BeautifulSoup(html_str, "html.parser") table_tag = soup.find_all('table', {'class': 'miscTable'})[0] table_soup = BeautifulSoup(str(table_tag), "html.parser") tr_list = table_soup.find_all('tr') tr_count = 0 for tr in tr_list: emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit is not None: if emergency_exit['emergency_exit'] == 1: break tr_count += 1 if tr_count == 1: continue case_no = "NULL" judgment_date = "NULL" judge_name = "NULL" petitioner = "NULL" respondent = "NULL" bench = "NULL" pdf_data = "NULL" pdf_file = "NULL" # insert_check = False tr_soup = BeautifulSoup(str(tr), "html.parser") td_list = tr_soup.find_all('td') i = 0 for td in td_list: i += 1 if i == 1: judgment_date = escape_string(str(td.decode_contents())) if i == 2: a_tag = BeautifulSoup(str(td), "html.parser").a case_no = escape_string(str(a_tag.text).replace("\n", "")) # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date): # insert_check = True new_url = base_url + a_tag.get('href') response = requests.request('GET', new_url, headers=headers, proxies=proxy_dict) new_soup = BeautifulSoup(str(response.text), "html.parser") new_td_tag = new_soup.find_all('td', {'headers': 't1'})[0] new_a_href = BeautifulSoup(str(new_td_tag), "html.parser").a.get('href') pdf_file = escape_string(base_url + new_a_href) pdf_data = escape_string(request_pdf(base_url + new_a_href, case_no, court_name)) if i == 3: judge_name = escape_string(str(td.text)) if i == 4: petitioner = escape_string(str(td.text)) if i == 5: respondent = escape_string(str(td.text)) if i == 6: bench = escape_string(str(td.text)) # if case_no != "NULL" and insert_check: if case_no != "NULL": sql_query = "INSERT INTO " + str(court_name) + "(case_no, judgment_date, judge_name, petitioner, " \ "respondent, bench, pdf_file, pdf_filename) VALUE ('" +\ case_no + "', '" + judgment_date + "', '" + judge_name + "', '" + petitioner + "', '" + \ respondent + "', '" + bench + "', '" + pdf_file + "', '" + court_name + "_" + \ slugify(case_no) + ".pdf')" insert_query(sql_query) update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'") update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'") return True except Exception as e: logging.error("Failed to parse the html: %s", e) update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'") return False
def parse_html(html_str, court_name, bench):
    try:
        # Strip stray formatting tags (including a malformed "<br< p=></br<>" fragment the
        # source pages emit) before parsing.
        soup = BeautifulSoup(
            html_str.replace("<b>", "").replace("</b>", "").replace("<br>", "").replace("</br>", "")
                    .replace("<b", "").replace("<br< p=></br<>", ""),
            "html.parser")
        tr_list = soup.find_all('tr')
        del tr_list[0:7]  # drop the header rows
        case_no = "NULL"
        petitioner = "NULL"
        respondent = "NULL"
        petitioner_advocate = "NULL"
        respondent_advocate = "NULL"
        judgment_date = "NULL"
        disposal_date = "NULL"
        judge_name = "NULL"
        pdf_data = "NULL"
        pdf_file = "NULL"
        tr_count = 0
        for tr in tr_list:
            tr_count += 1
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            # insert_check = False
            if tr_count == 1:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        case_no = escape_string(str(td.text).strip())
                    if td_count == 4:
                        td_text = str(td.text)
                        if td_text.find("NA") == -1:
                            a_tag = BeautifulSoup(str(td), "html.parser").a
                            if a_tag:
                                a_link = a_tag.get('href')
                                pdf_data = escape_string(request_pdf(base_url + a_link, case_no, court_name))
                                pdf_file = base_url + a_link
                        judgment_date = escape_string(
                            td_text.replace("Judgement", "").replace("Orders", "").replace("r", "")
                                   .replace("(AFR)", "").replace("NA", "").strip())
                        # if select_count_query_bench(str(court_name), str(case_no), bench, 'judgment_date',
                        #                             judgment_date):
                        #     insert_check = True
            if tr_count == 2:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        petitioner = escape_string(str(td.text).strip())
            if tr_count == 3:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        respondent = escape_string(str(td.text).strip())
            if tr_count == 4:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        petitioner_advocate = escape_string(str(td.text).strip())
            if tr_count == 5:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        respondent_advocate = escape_string(str(td.text).strip())
            if tr_count == 6:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        judge_name = escape_string(str(td.text).strip())
            if tr_count == 7:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        disposal_date = escape_string(str(td.text).strip())
                # if case_no != "NULL" and insert_check:
                if case_no != "NULL":
                    sql_query = "INSERT INTO " + str(court_name) + \
                                " (case_no, petitioner, respondent, petitioner_advocate, respondent_advocate, " \
                                "judgment_date, disposal_date, bench, judge_name, pdf_file, pdf_filename) " \
                                "VALUE ('" + case_no + "', '" + petitioner + "', '" + respondent + "', '" + \
                                petitioner_advocate + "', '" + respondent_advocate + "', '" + judgment_date + \
                                "', '" + disposal_date + "', '" + bench + "', '" + judge_name + "', '" + pdf_file + \
                                "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                    insert_query(sql_query)
                    update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                                 "' WHERE case_no = '" + str(case_no) + "'")
                    update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" +
                                 str(court_name) + "'")
            if tr_count == 9:
                # Separator row: reset state for the next judgment block.
                tr_count = 0
                case_no = "NULL"
                petitioner = "NULL"
                respondent = "NULL"
                petitioner_advocate = "NULL"
                respondent_advocate = "NULL"
                judgment_date = "NULL"
                judge_name = "NULL"
                pdf_data = "NULL"
                pdf_file = "NULL"
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name, headers):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table', {'id': 'tables11'})
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count <= 3:  # header rows
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            table_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = table_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue
                if i == 2:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    case_no = escape_string(str(a_tag.text))
                if i == 3:
                    party = str(td.decode_contents()).split("Vs")
                    petitioner = escape_string(str(party[0]))
                    respondent = escape_string(str(party[1]))
                if i == 4:
                    judgment_date = escape_string(str(td.decode_contents()))
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 5:
                    # The PDF URL is embedded in a window.open() onclick handler.
                    a_link = BeautifulSoup(str(td), "html.parser").a.get('onclick')
                    a_formatted = str(str(a_link).replace("window.open('", "")).replace("')", "")
                    pdf_file = escape_string(base_url + "/" + a_formatted)
                    # pdf_data = escape_string(request_pdf(
                    #     str(pdf_file).replace(base_url + "download_file.php?auth=", ""), case_no, court_name,
                    #     headers))
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "pdf_file, pdf_filename) VALUE ('" + case_no + "', '" + petitioner + "', '" + \
                            respondent + "', '" + judgment_date + "', '" + pdf_file + "', '" + court_name + "_" + \
                            slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name, bench, child_url):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        div_soup = BeautifulSoup(str(soup.find_all('div', {'id': 'text'})[0]), 'html.parser')
        tr_list = div_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count == 1:
                continue
            case_no = "NULL"
            date_of_order = "NULL"
            description = "NULL"
            section = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(str(td.text).strip().replace("\n", ""))
                if i == 2:
                    date_of_order = escape_string(str(td.text).strip().replace("\n", ""))
                    # if select_count_query(str(court_name), str(case_no), 'date_of_order', date_of_order):
                    #     insert_check = True
                if i == 3:
                    description = escape_string(str(td.text).strip())
                    a_tag = BeautifulSoup(str(td), "html.parser").font.a
                    pdf_url = base_url + child_url + a_tag.get('href')
                    pdf_file = escape_string(pdf_url)
                    pdf_data = escape_string(request_pdf(pdf_url, case_no, court_name))
                if i == 4:
                    section = str(td.text)
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, date_of_order, description, section, " \
                            "pdf_file, bench_code, pdf_filename) VALUE ('" + case_no + "', '" + date_of_order + \
                            "', '" + description + "', '" + section + "', '" + pdf_file + "', '" + str(bench) + \
                            "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name, bench, start_date):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_soup = BeautifulSoup(str(soup.find_all('table', {'class': 'hoverTable'})[0]), 'html.parser')
        tr_list = table_soup.find_all('tr')
        if not tr_list:
            logging.error("No data found for start date: " + str(start_date))
            update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                         str(court_name) + "'")
            return True
        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count == 1:
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue
                if i == 2:
                    case_no = escape_string(str(td.text).strip().replace("\n", ""))
                if i == 3:
                    party = str(td.decode_contents()).split("<br/>")
                    petitioner = escape_string(str(party[0]).strip())
                    respondent = escape_string(str(party[2]).strip())
                if i == 4:
                    judge_name = escape_string(str(td.text).strip())
                if i == 5:
                    judgment_date = escape_string(str(td.text).strip())
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 7:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = base_url + a_tag.get('href')
                    # pdf_data = escape_string(request_pdf(pdf_file, case_no, court_name))
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "judge_name, pdf_file, bench_code, pdf_filename) VALUE ('" + case_no + "', '" + \
                            petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + judge_name + \
                            "', '" + pdf_file + "', '" + str(bench) + "', '" + court_name + "_" + \
                            slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name): try: soup = BeautifulSoup(str(html_str).replace('&', ' '), "html.parser") tr_list = soup.find_all('tr') tr_count = 0 for tr in tr_list: emergency_exit = select_one_query( "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit is not None: if emergency_exit['emergency_exit'] == 1: break tr_count += 1 if tr_count == 1: continue case_no = "NULL" petitioner = "NULL" respondent = "NULL" judgment_date = "NULL" judge_name = "NULL" pdf_data = "NULL" pdf_file = "NULL" # insert_check = False tr_soup = BeautifulSoup(str(tr), "html.parser") td_list = tr_soup.find_all('td') i = 0 for td in td_list: i += 1 if i == 1: judgment_date = escape_string(str(td.decode_contents())) if i == 2: judge_name = escape_string(str(td.decode_contents())) if i == 3: case_no = escape_string(str(td.text)) # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date): # insert_check = True if i == 4: party = str(td.decode_contents()).split("v/s") petitioner = escape_string(str(party[0])) respondent = escape_string(str(party[1])) if i == 5: a_tag = BeautifulSoup(str(td), "html.parser").a pdf_file = escape_string(str(base_url + a_tag.get('href'))) pdf_data = escape_string( request_pdf(base_url + a_tag.get('href'), case_no, court_name)) # if case_no != "NULL" and insert_check: if case_no != "NULL": sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \ "judge_name, pdf_file, pdf_filename) VALUE ('" + \ case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + \ judge_name + "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')" insert_query(sql_query) update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'") update_query( "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'") return True except Exception as e: logging.error("Failed to parse the html: %s", e) update_query( "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'") return False
def parse_html(html_str, court_name, flag):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        soup = BeautifulSoup(str(soup.prettify()), "html.parser")
        date_h4 = soup.find_all('h4', {'align': 'center'})[0]
        month_year = str(date_h4.text).replace('JUDGMENTS FOR THE MONTH OF', '').strip()
        table_list = soup.find_all('table', {'class': 'DISCOVERY3'})[0]
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count <= 1:
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            subject = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            # Some result pages carry an extra leading column; flag shifts the index.
            if flag:
                i = 1
            else:
                i = 0
            for td in td_list:
                i += 1
                if i == 2:
                    judgment_day = escape_string(str(td.decode_contents()))
                    judgment_date = str(re.findall(r'\d+', str(judgment_day))[0]) + ", " + \
                        month_year.replace('JUDGEMENTS FOR THE MONTH OF', '')
                if i == 3:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = escape_string(str(base_url + a_tag.get('href')))
                    case_no = escape_string(str(a_tag.text).replace("\n", "").strip())
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                    pdf_data = escape_string(request_pdf(str(base_url + a_tag.get('href')), case_no, court_name))
                if i == 4:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    if font_tag is not None:
                        span_tag = font_tag.span
                    else:
                        span_tag = BeautifulSoup(str(td), "html.parser").span
                    if span_tag is None:
                        span_tag = BeautifulSoup(str(td), "html.parser")
                    party = str(span_tag.decode_contents()).split("<br/>")
                    petitioner = escape_string(
                        str(party[0]).replace('<td align="center" bgcolor="#FFFFFF" valign="middle" width="30%">',
                                              '').strip())
                    petitioner = re.sub(r'(\\x(.){2})', '', petitioner)
                    respondent = escape_string(str(party[2]).replace('</td>', '').strip())
                    respondent = re.sub(r'(\\x(.){2})', '', respondent)
                if i == 5:
                    subject = escape_string(str(td.decode_contents()).strip())
                if i == 6:
                    judge_name = escape_string(str(td.text).replace(r'\x', '').replace('\\xC2\\x92BLE', '').strip())
                    judge_name = re.sub(r'(\\x(.){2})', '', judge_name)
            # if case_no != "NULL" and insert_check and td_list:
            if case_no != "NULL" and td_list:
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "subject, pdf_file, pdf_filename) VALUE ('" + case_no + "', '" + petitioner + "', '" + \
                            respondent + "', '" + judgment_date + "', '" + subject + "', '" + pdf_file + "', '" + \
                            court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET judge_name = '" + str(judge_name) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name, dc):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        tr_list = soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count == 1:
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            corrigendum = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue
                if i == 2:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = base_url + a_tag.get('href')
                    case_no = str(a_tag.text).replace("\n", "")
                    pdf_data = escape_string(request_pdf(pdf_file, case_no, court_name))
                if i == 3:
                    span_tag = BeautifulSoup(str(td), "html.parser").span
                    judgment_date = escape_string(str(span_tag.decode_contents()))
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 4:
                    td_soup = BeautifulSoup(str(td), "html.parser")
                    span_list = td_soup.find_all('span')
                    j = 0
                    for span in span_list:
                        j += 1
                        if j == 1:
                            petitioner = escape_string(str(span.decode_contents()))
                        if j == 3:
                            respondent = escape_string(str(span.decode_contents()))
                if i == 5:
                    span_tag = BeautifulSoup(str(td), "html.parser").span
                    corrigendum = escape_string(str(span_tag.decode_contents()))
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "corrigendum, pdf_file, bench_code, pdf_filename) VALUE ('" + case_no + "', '" + \
                            petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + corrigendum + \
                            "', '" + pdf_file + "', " + str(dc) + ", '" + court_name + "_" + slugify(case_no) + \
                            ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
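# Every INSERT/UPDATE in this module is assembled by string concatenation, which is why each
# value has to pass through escape_string() first. A minimal sketch of the parameterized
# alternative, assuming a PEP 249 (DB-API) connection such as pymysql provides; the
# connection object `conn` and the helper name insert_case are hypothetical, not part of
# the original code:
def insert_case(conn, court_name, case_no, judgment_date, pdf_file, pdf_filename):
    """Insert one judgment row using placeholders instead of hand-escaped strings."""
    # Placeholders only bind values, not identifiers, so the table name still has to be
    # interpolated; court_name must therefore come from a trusted whitelist.
    sql = "INSERT INTO " + court_name + \
          " (case_no, judgment_date, pdf_file, pdf_filename) VALUES (%s, %s, %s, %s)"
    with conn.cursor() as cursor:
        cursor.execute(sql, (case_no, judgment_date, pdf_file, pdf_filename))
    conn.commit()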