def parse_html(html_str, court_name, m_sideflg):
    """Parse a judgment-listing results page and insert one DB row per case.

    html_str    : raw HTML of the search-results page (first <form>, first
                  width=100% <table> holds the data rows).
    court_name  : also the name of the destination table and the Tracker row.
    m_sideflg   : side flag stored verbatim in the m_sideflg column.
    Returns True on success; on any exception logs it, bumps
    Tracker.No_Error and returns False.

    NOTE(review): SQL is built by string concatenation; cell values pass
    through escape_string but court_name/case_no are interpolated directly
    into UPDATEs — presumably trusted upstream, confirm.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_soup = BeautifulSoup(str(soup.find_all('form')[0]), "html.parser")
        table_soup = BeautifulSoup(
            str(table_soup.find_all('table', {"width": "100%"})[0]),
            "html.parser")
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            # Cooperative abort: an operator can set Tracker.emergency_exit.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            # First 4 rows are header chrome; even rows are spacers.
            if tr_count <= 4 or tr_count % 2 == 0:
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            coram = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                # Skip serial-number cell, trailing cell, and the LBR legend row.
                if i == 1 or i == 6 or str(td.decode_contents()).replace("\n", "").strip() == \
                        '<font color="blue">LBR : Larger Benches Referred Matter</font>':
                    continue
                if i == 2:
                    coram = escape_string(str(td.decode_contents()))
                if i == 3:
                    # Cell is "<petitioner><b>vs</b><respondent>" — split on the tags.
                    data1 = escape_string(str(td.decode_contents()))
                    data1_list = data1.split("<b>")
                    petitioner = data1_list[0]
                    respondent = str(data1_list[1]).split("</b>")[1]
                if i == 4:
                    # Date is the first line before a <br/>.
                    data2 = escape_string(str(td.decode_contents()))
                    data2_list = data2.split("<br/>")
                    judgment_date = data2_list[0]
                if i == 5:
                    # Anchor holds both the case number (text) and the PDF link.
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = base_url + a_tag.get('href')
                    case_no = str(a_tag.text).replace("\n", "")
                    pdf_data = escape_string(
                        request_pdf(pdf_file, case_no, court_name))
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (m_sideflg, case_no, petitioner, respondent, " \
                            "judgment_date, coram, pdf_file, pdf_filename) VALUE " \
                            "('" + m_sideflg + \
                            "', '" + case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + \
                            "', '" + coram + "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                # pdf_data written in a second statement to keep the INSERT small.
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name, bench, child_url):
    """Parse the <div id="text"> orders table and insert one row per order.

    html_str   : page HTML; data rows live in the first div#text.
    court_name : destination table name and Tracker key.
    bench      : stored in the bench_code column.
    child_url  : path segment joined between base_url and the PDF href.
    Returns True on success; logs, bumps Tracker.No_Error and returns
    False on any exception.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        div_soup = BeautifulSoup(str(soup.find_all('div', {'id': 'text'})[0]), 'html.parser')
        tr_list = div_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            # Cooperative abort via Tracker.emergency_exit.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count == 1:  # header row
                continue
            case_no = "NULL"
            date_of_order = "NULL"
            description = "NULL"
            section = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(
                        str(td.text).strip().replace("\n", ""))
                if i == 2:
                    date_of_order = escape_string(
                        str(td.text).strip().replace("\n", ""))
                if i == 3:
                    # Description cell also carries the PDF anchor inside a <font>.
                    description = escape_string(str(td.text).strip())
                    a_tag = BeautifulSoup(str(td), "html.parser").font.a
                    pdf_url = base_url + child_url + a_tag.get('href')
                    pdf_file = escape_string(pdf_url)
                    pdf_data = escape_string(
                        request_pdf(pdf_url, case_no, court_name))
                if i == 4:
                    section = str(td.text)
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, date_of_order, description, section, " \
                            "pdf_file, bench_code, pdf_filename) VALUE ('" + \
                            case_no + "', '" + date_of_order + "', '" + description + "', '" + section + "', '" + \
                            pdf_file + "', '" + str(bench) + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name):
    """Parse the appeals table and insert one row per appeal.

    html_str   : page HTML; rows live in tables with the exact inline style
                 'width:100%; margin-top: 10px; font-size: 12px;'.
    court_name : destination table name and Tracker key.
    Returns True on success; logs, bumps Tracker.No_Error and returns
    False on any exception.

    FIX: the original INSERT listed six columns (…, bench_code,
    pdf_filename) but supplied only five values — MySQL rejects such a
    statement with "Column count doesn't match value count", so no appeal
    was ever inserted. No bench value exists in this function's scope, so
    bench_code is dropped from the column list; the remaining five columns
    match the five values one-for-one.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table', {'style': 'width:100%; margin-top: 10px; font-size: 12px;'})
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            # Cooperative abort via Tracker.emergency_exit.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count == 1:  # header row
                continue
            appeal_no = "NULL"
            appellant = "NULL"
            respondent = "NULL"
            date_of_order = "NULL"
            filed_by = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            order_type = "NULL"
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    appeal_no = escape_string(str(td.text).strip().replace("\n", ""))
                if i == 2:
                    filed_by = escape_string(str(td.text).strip().replace('\n', ''))
                if i == 3:
                    appellant = escape_string(str(td.text).strip().replace('\n', ''))
                if i == 4:
                    respondent = escape_string(str(td.text).strip().replace('\n', ''))
                if i == 5:
                    # Detail page carries the order date, PDF link and order type.
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    details_url = a_tag.get('href')
                    date_of_order, pdf_file, order_type = details_parse(details_url, appeal_no, court_name)
                    # Single quotes stripped so the value is safe inside the SQL string.
                    pdf_data = escape_string(str(request_pdf(pdf_file, appeal_no, court_name)).replace("'", ""))
            if appeal_no != "NULL":
                # Five columns, five values (bench_code removed — see docstring).
                sql_query = "INSERT INTO " + str(court_name) + " (appeal_no, appellant, respondent, filed_by, " \
                            "pdf_filename ) VALUE ('" + appeal_no + \
                            "', '" + appellant + "', '" + respondent + "', '" + filed_by + "', '" + court_name + \
                            "_" + slugify(appeal_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "', date_of_order ='" + date_of_order + "', pdf_file = '" + pdf_file +
                             "', order_type = '" + order_type + "' WHERE appeal_no = '" + str(appeal_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def request_data(court_name, m_sideflg, start_date, end_date_): try: url = base_url + "ordqryrepact_action.php" headers = { 'Content-Type': "application/x-www-form-urlencoded", 'Cache-Control': "no-cache" } i = 0 while True: i += 1 emergency_exit = select_one_query( "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit['emergency_exit'] == 1: update_history_tracker(court_name) return True end_date = ( datetime.datetime.strptime(str(start_date), "%d-%m-%Y") + datetime.timedelta(days=180)).strftime("%d-%m-%Y") if datetime.datetime.strptime(end_date_, "%d-%m-%Y") + datetime.timedelta(days=180) < \ datetime.datetime.strptime(str(end_date), "%d-%m-%Y"): logging.error("DONE") break update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) + "', End_Date = '" + str(end_date) + "' WHERE Name = '" + str(court_name) + "'") payload = "pageno=1" \ "&frmaction=" \ "&m_sideflg=" + str(m_sideflg) + \ "&actcode=0" \ "&frmdate=" + str(start_date) + \ "&todate=" + str(end_date) response = requests.request("POST", url, data=payload, headers=headers) res = response.text if "invalid inputs given" in res.lower(): logging.error("NO data Found for start date: " + str(start_date)) update_query( "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" + str(court_name) + "'") start_date = end_date continue if not parse_html(res, court_name, m_sideflg): logging.error("Failed to parse data from date: " + str(start_date)) start_date = end_date return True except Exception as e: traceback.print_exc() logging.error("Failed to get data from date: " + str(start_date)) logging.error("Failed to request: %s", e) return False
def request_data(court_name, start_date, end_date_): try: if int(start_date[-4:]) < 2010: update_query( "UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', emergency_exit=true WHERE Name = '" + str(court_name) + "'") if int(end_date_[-4:]) < 2010: update_history_tracker(court_name) return True for month_year in month_list_([str(start_date), str(end_date_)]): emergency_exit = select_one_query( "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit['emergency_exit'] == 1: update_history_tracker(court_name) return True data = { 'ctl00$CPHBody$DropDownListYear': str(month_year[-4:]), 'ctl00$CPHBody$DropDownListMonth': str(month_year[:-4]).lstrip("0"), 'ctl00$CPHBody$TextBox1': '', 'ctl00$CPHBody$SM1': 'ctl00$CPHBody$SM1|ctl00$CPHBody$DropDownListMonth' } with requests.Session() as s: page = s.get(base_url + 'judgement.aspx') soup = BeautifulSoup(page.content, "html.parser") data["__VIEWSTATE"] = soup.select_one("#__VIEWSTATE")["value"] data["__VIEWSTATEGENERATOR"] = soup.select_one( "#__VIEWSTATEGENERATOR")["value"] data["__EVENTVALIDATION"] = soup.select_one( "#__EVENTVALIDATION")["value"] update_query("UPDATE Tracker SET Start_Date = '" + str(month_year) + "' WHERE Name = '" + str(court_name) + "'") response = s.post(base_url + 'judgement.aspx', data=data) res = response.text if "no records were found." in res.lower( ) or "application error" in res.lower(): logging.error("NO data Found for start date: " + str(month_year)) update_query( "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" + str(court_name) + "'") continue if not parse_html(res, court_name): logging.error("Failed to parse data") return True except Exception as e: traceback.print_exc() logging.error("Failed to get data from date: " + str(start_date)) logging.error("Failed to request: %s", e) return False
def parse_html(html_str, court_name, dc):
    """Parse a judgments table (all <tr> in the page) and insert rows.

    html_str   : page HTML.
    court_name : destination table name and Tracker key.
    dc         : bench code, interpolated UNQUOTED into the INSERT —
                 presumably numeric; confirm upstream.
    Returns True on success; logs, bumps Tracker.No_Error and returns
    False on any exception.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        tr_list = soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            # Cooperative abort via Tracker.emergency_exit.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count == 1:  # header row
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            corrigendum = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:  # serial number
                    continue
                if i == 2:
                    # Anchor carries the case number (text) and the PDF link.
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = base_url + a_tag.get('href')
                    case_no = str(a_tag.text).replace("\n", "")
                    pdf_data = escape_string(
                        request_pdf(pdf_file, case_no, court_name))
                if i == 3:
                    span_tag = BeautifulSoup(str(td), "html.parser").span
                    judgment_date = escape_string(
                        str(span_tag.decode_contents()))
                if i == 5:
                    span_tag = BeautifulSoup(str(td), "html.parser").span
                    corrigendum = escape_string(str(
                        span_tag.decode_contents()))
                if i == 4:
                    # Parties cell: span 1 = petitioner, span 2 = "vs", span 3 = respondent.
                    td_soup = BeautifulSoup(str(td), "html.parser")
                    span_list = td_soup.find_all('span')
                    j = 0
                    for span in span_list:
                        j += 1
                        if j == 1:
                            petitioner = escape_string(
                                str(span.decode_contents()))
                        if j == 3:
                            respondent = escape_string(
                                str(span.decode_contents()))
            if case_no != "NULL":
                # dc is written without quotes (numeric bench_code column).
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "corrigendum, pdf_file, bench_code, pdf_filename) VALUE" \
                            " ('" + case_no + "', '" + petitioner + "', '" + \
                            respondent + "', '" + judgment_date + "', '" + corrigendum + "', '" + pdf_file + "', " + \
                            str(dc) + ", '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name, bench):
    """Parse a vertically-laid-out case table (one case spans 9 <tr> rows)
    and insert one DB row per case.

    Row layout per case: 1 = case no / judgment link, 2 = petitioner,
    3 = respondent, 4 = petitioner advocate, 5 = respondent advocate,
    6 = judge, 7 = disposal date; row 9 ends the record. The value is
    always in the third <td> of each row.

    FIX: the per-record reset at tr_count == 9 reset every field except
    disposal_date, so a record whose row 7 was missing silently inherited
    the previous case's disposal date; disposal_date is now reset too.

    NOTE(review): the INSERT/UPDATE block runs on every row of a record,
    not just the last — presumably insert_query tolerates duplicate-key
    failures so only the first attempt inserts; confirm.
    """
    try:
        # Strip stray/broken markup the site emits before parsing.
        soup = BeautifulSoup(
            html_str.replace("<b>", "").replace("</b>", "").replace(
                "<br>", "").replace("</br>", "").replace("<b", "").replace("<br< p="
                                                                           "></br<>", ""),
            "html.parser")
        tr_list = soup.find_all('tr')
        del tr_list[0:7]  # drop page header rows
        case_no = "NULL"
        petitioner = "NULL"
        respondent = "NULL"
        petitioner_advocate = "NULL"
        respondent_advocate = "NULL"
        judgment_date = "NULL"
        disposal_date = "NULL"
        judge_name = "NULL"
        pdf_data = "NULL"
        pdf_file = "NULL"
        tr_count = 0
        for tr in tr_list:
            tr_count += 1
            # Cooperative abort via Tracker.emergency_exit.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            if tr_count == 1:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        case_no = escape_string(str(td.text).strip())
                    if td_count == 4:
                        td_text = str(td.text)
                        # "NA" means no PDF is available for this case.
                        if td_text.find("NA") == -1:
                            a_tag = BeautifulSoup(str(td), "html.parser").a
                            if a_tag:
                                a_link = a_tag.get('href')
                                pdf_data = escape_string(
                                    request_pdf(base_url + a_link, case_no, court_name))
                                pdf_file = base_url + a_link
                        # Strip link-label noise to leave only the date text.
                        judgment_date = escape_string(
                            td_text.replace("Judgement", "").replace(
                                "Orders", "").replace("r", "").replace(
                                    "(AFR)", "").replace("NA", "").strip())
            if tr_count == 2:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        petitioner = escape_string(str(td.text).strip())
            if tr_count == 3:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        respondent = escape_string(str(td.text).strip())
            if tr_count == 4:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        petitioner_advocate = escape_string(
                            str(td.text).strip())
            if tr_count == 5:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        respondent_advocate = escape_string(
                            str(td.text).strip())
            if tr_count == 6:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        judge_name = escape_string(str(td.text).strip())
            if tr_count == 7:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        disposal_date = escape_string(str(td.text).strip())
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + \
                            " (case_no, petitioner, respondent, petitioner_advocate, respondent_advocate, " \
                            "judgment_date, disposal_date, bench, judge_name, pdf_file, pdf_filename)" \
                            " VALUE ('" + case_no + "', '" + petitioner + "', '" + respondent + "', '" + \
                            petitioner_advocate + "', '" + respondent_advocate + "', '" + judgment_date + "', '" + \
                            disposal_date + "', '" + bench + "', '" + judge_name + "', '" + pdf_file + "', '" + \
                            court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
            if tr_count == 9:
                # End of one case record: reset state for the next record.
                tr_count = 0
                case_no = "NULL"
                petitioner = "NULL"
                respondent = "NULL"
                petitioner_advocate = "NULL"
                respondent_advocate = "NULL"
                judgment_date = "NULL"
                disposal_date = "NULL"  # FIX: previously not reset
                judge_name = "NULL"
                pdf_data = "NULL"
                pdf_file = "NULL"
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def request_data(headers, start_date, end_date_): try: url = base_url + "/ByDate.php" i = 0 while True: i += 1 emergency_exit = select_one_query( "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit['emergency_exit'] == 1: update_history_tracker(court_name) return True end_date = ( datetime.datetime.strptime(str(start_date), "%d-%m-%Y") + datetime.timedelta(days=180)).strftime("%d-%m-%Y") if datetime.datetime.strptime(str(end_date_), "%d-%m-%Y") + datetime.timedelta(days=180) < \ datetime.datetime.strptime(str(end_date), "%d-%m-%Y"): logging.error("DONE") break update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) + "', End_Date = '" + str(end_date) + "' WHERE Name = '" + str(court_name) + "'") payload = "date_day=" + str(start_date[0:2]).replace("0", "") + \ "&date_month=" + str(start_date[3:5]).replace("0", "") + \ "&date_year=" + str(start_date[6:]) + \ "&date_day1=" + str(end_date[0:2]).replace("0", "") + \ "&date_month1=" + str(end_date[3:5]).replace("0", "") + \ "&date_year1=" + str(end_date[6:]) + \ "&submit=Submit" response = requests.request("POST", url, data=payload, headers=headers, proxies=proxy_dict) res = response.text if "invalid inputs given" in res.lower(): logging.error("NO data Found for start date: " + str(start_date)) update_query( "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" + str(court_name) + "'") start_date = end_date continue if not offset_link(res, headers): logging.error("Failed to parse data from date: " + str(start_date)) start_date = end_date return True except Exception as e: traceback.print_exc() logging.error("Failed to get data from date: " + str(start_date)) logging.error("Failed to request: %s", e) return False
def parse_html(html_str, court_name):
    """Parse a <select id="txtlist"> listing where each <option> is a case.

    Case metadata is embedded in each option's onmouseover overlib()
    tooltip as <br/>-separated fields: petitioner, respondent, judge,
    judgment date, then "reportable|case_no". Non-reportable cases are
    skipped. The option's value attribute identifies the PDF.
    Returns True on success; logs, bumps Tracker.No_Error and returns
    False on any exception.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        select_soup = BeautifulSoup(
            str(soup.find_all('select', {'id': 'txtlist'})[0]), "html.parser")
        tr_list = select_soup.find_all('option')
        for tr in tr_list:
            # Cooperative abort via Tracker.emergency_exit.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            pdf_value = tr['value']
            # Unwrap the overlib('...') tooltip markup and drop <font> noise.
            res = BeautifulSoup(
                str(tr['onmouseover']).replace("return overlib('",
                                               "").replace("')", ""),
                "html.parser")
            [s.extract() for s in res('font')]
            res = str(res).replace('\n', '').strip().split('<br/>')
            petitioner = escape_string(res[0])
            respondent = escape_string(res[1])
            judge = escape_string(res[2])
            judgment_date = escape_string(res[3])
            # Last field: 2-char reportable flag, separator, then the case no.
            mix_data = str(res[4]).replace("', CAPTION, '", '')
            reportable = mix_data[0:2]
            case_no = escape_string(mix_data[3:])
            if reportable == 'No':
                continue
            if case_no != "NULL":
                pdf_data = escape_string(
                    request_pdf(case_no, court_name, pdf_value))
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "judge, pdf_file, pdf_filename, reportable) VALUE ('" + \
                            case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + \
                            judge + "', '" + pdf_value + "', '" + court_name + "_" + slugify(case_no) + ".pdf', '" + \
                            reportable + "')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def request_data(court_name, headers, start_date, end_date_): try: url = base_url + '/php/hc/judgement/judgement_pro_all.php' i = 0 while True: i += 1 end_date = ( datetime.datetime.strptime(str(start_date), "%d-%m-%Y") + datetime.timedelta(days=1)).strftime("%d-%m-%Y") if datetime.datetime.strptime(str(end_date_), "%d-%m-%Y") + datetime.timedelta(days=1) < \ datetime.datetime.strptime(str(end_date), "%d-%m-%Y"): logging.error("END date Exceed.") break benches = ['IND', 'JBP', 'GWL'] for bench in benches: emergency_exit = select_one_query( "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit['emergency_exit'] == 1: update_history_tracker(court_name) return True update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) + "', End_Date = '" + str(end_date) + "' WHERE Name = '" + str(court_name) + "'") payload = "lst_judge=0" \ "&lst_pet=" \ "&txtparty=" \ "&lst_counsel=" \ "&txtcounsel=" \ "&date1=" + str(start_date) + \ "&date2=" + str(end_date) + \ "&court=" + str(bench) + \ "&lst_judge1=0" \ "&lst_judge2=0" \ "&btn_search=is" \ "&bench=" \ "&sort=jo" \ "&ad=DESC" \ "&code=" if int(end_date[-4:]) <= 2014 and int(start_date[-4:]) <= 2014: payload += "&onlyafr=N" else: payload += "&onlyafr=Y" response = requests.request("POST", url, data=payload, headers=headers, proxies=proxy_dict) res = response.text if "no jugdement or order found that you want to search" in res.lower( ): logging.error("NO data Found for start date: " + str(start_date)) update_query( "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" + str(court_name) + "'") sleep(2) start_date = end_date continue if not parse_html(res, court_name, bench): logging.error("Failed to parse data from date: " + str(start_date)) start_date = end_date return True except Exception as e: traceback.print_exc() logging.error("Failed to get data from date: " + str(start_date)) logging.error("Failed to request: %s", e) return False
def request_data(court_name, start_date, end_date_): try: url = base_url + "date_JQ.asp" headers = { 'Content-Type': "application/x-www-form-urlencoded", 'Cache-Control': "no-cache" } i = 0 while True: i += 1 emergency_exit = select_one_query( "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'") if emergency_exit['emergency_exit'] == 1: update_history_tracker(court_name) return True end_date = ( datetime.datetime.strptime(str(start_date), "%d-%m-%Y") + datetime.timedelta(days=1)).strftime("%d-%m-%Y") if datetime.datetime.strptime(end_date_, "%d-%m-%Y") + datetime.timedelta(days=1) < \ datetime.datetime.strptime(str(end_date), "%d-%m-%Y"): logging.error("DONE") break update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) + "', End_Date = '" + str(end_date) + "' WHERE Name = '" + str(court_name) + "'") payload = "txtday=" + str(start_date[0:2]).lstrip('0') + \ "&txtmonth=" + str(start_date[3:5]).lstrip('0') + \ "&txtyear=" + str(start_date[-4:]) response = requests.request("POST", url, data=payload, headers=headers, proxies=proxy_dict) res = response.text if "no judgement found for your search" in res.lower(): logging.error("NO data Found for start date: " + str(start_date)) update_query( "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" + str(court_name) + "'") start_date = end_date continue if not parse_html(res, court_name): logging.error("Failed to parse data from date: " + str(start_date)) start_date = end_date return True except Exception as e: traceback.print_exc() logging.error("Failed to get data from date: " + str(start_date)) logging.error("Failed to request: %s", e) return False
def cancel_json(court_name): return jsonify( update_query( "UPDATE Tracker_JSON SET status='IN_ABORT', emergency_exit=true WHERE Name='" + court_name + "'"))
def cancel_pdf(): return jsonify( update_query( "UPDATE Tracker_pdf SET status='IN_ABORT', emergency_exit=true WHERE 1" ))
def request_data(court_name, start_date, end_date_):
    """Crawl a Drupal ajax judgment search month by month.

    Dates end in a 2-digit offset year; the site's year parameter is that
    value minus 10, and nothing exists before offset 11. The ajax
    response is a JSON array; the element carrying a "data" key holds the
    HTML fragment passed to parse_html.
    Returns True when finished or aborted; False on error.
    """
    try:
        url = base_url + "/hcs/hcourt/hg_judgement_search"
        headers = {
            'Content-Type': "application/x-www-form-urlencoded",
            'Accept': "application/json",
            'Cache-Control': "no-cache"
        }
        # NOTE(review): nesting of this guard was inferred from a
        # whitespace-mangled source — confirm against the original file.
        if int(start_date[-2:]) < 11:
            update_query(
                "UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', emergency_exit=true WHERE Name = '" +
                str(court_name) + "'")
            if int(end_date_[-2:]) < 11:
                update_history_tracker(court_name)
                return True
        for month_year in month_list_([str(start_date), str(end_date_)]):
            # Site year = 2-digit suffix minus 10.
            year = int(month_year[-2:]) - 10
            # Cooperative abort via Tracker.emergency_exit.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            update_query("UPDATE Tracker SET Start_Date = '" + str(month_year) + "', End_Date = '" +
                         str(end_date_) + "' WHERE Name = '" + str(court_name) + "'")
            querystring = {"ajax_form": "1", "_wrapper_format": "drupal_ajax"}
            # form_build_id/ajax_page_state are captured site constants.
            payload = "form_build_id=form-BS37MKVfuGmv9fgHWUqr3U9nFCjolonq-Nnenj3Ks24" \
                      "&form_id=ajax_example_form" \
                      "&ordermonth=" + str(month_year[:-2]).lstrip("0") + \
                      "&orderyear=" + str(year) + \
                      "&_triggering_element_name=op" \
                      "&_triggering_element_value=Search" \
                      "&_drupal_ajax=1" \
                      "&ajax_page_state%5Btheme%5D=mytheme" \
                      "&ajax_page_state%5Btheme_token%5D=%20" \
                      "&ajax_page_state%5Blibraries%5D=asset_injector%2Fcss%2Fanimation_accordin%2Casset_injector" \
                      "%2Fcss%2Fside_bar%2Casset_injector%2Fcss%2Ftable%2Casset_injector%2Fjs%2Fseperate_tab_%2C" \
                      "core%2Fdrupal.ajax%2Ccore%2Fhtml5shiv%2Ccore%2Fjquery.form%2Cmytheme%2Fmylibrarynew%2C" \
                      "system%2Fbase%2Cviews%2Fviews.module"
            response = requests.request("POST",
                                        url,
                                        data=payload,
                                        headers=headers,
                                        params=querystring,
                                        proxies=proxy_dict)
            json_res = json.loads(response.text)
            res = None
            # The ajax reply is a list of commands; find the one with HTML data.
            for json_r in json_res:
                if "data" in json_r:
                    res = BeautifulSoup(str(json_r['data']), "html.parser")
                    break
            if res is None:
                logging.error("NO data Found for start date: " + str(month_year))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                    str(court_name) + "'")
                continue
            if not parse_html(res, court_name):
                logging.error("Failed to parse data from date: " + str(month_year))
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def parse_html(html_str, court_name):
    """Parse the miscTable listing; fetch each case's detail page for the PDF.

    For every data row the case anchor is followed (GET) and the PDF link
    is taken from the first <td headers="t1"> on the detail page.
    NOTE(review): headers and proxy_dict are free variables here
    (module-level, presumably) — confirm they are defined.
    Returns True on success; logs, bumps Tracker.No_Error and returns
    False on any exception.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_tag = soup.find_all('table', {'class': 'miscTable'})[0]
        table_soup = BeautifulSoup(str(table_tag), "html.parser")
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            # Cooperative abort via Tracker.emergency_exit.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count == 1:  # header row
                continue
            case_no = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            bench = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    judgment_date = escape_string(str(td.decode_contents()))
                if i == 2:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    case_no = escape_string(str(a_tag.text).replace("\n", ""))
                    # Follow the case link; the detail page holds the real PDF href.
                    new_url = base_url + a_tag.get('href')
                    response = requests.request('GET', new_url, headers=headers, proxies=proxy_dict)
                    new_soup = BeautifulSoup(str(response.text), "html.parser")
                    new_td_tag = new_soup.find_all('td', {'headers': 't1'})[0]
                    new_a_href = BeautifulSoup(str(new_td_tag), "html.parser").a.get('href')
                    pdf_file = escape_string(base_url + new_a_href)
                    pdf_data = escape_string(request_pdf(base_url + new_a_href, case_no, court_name))
                if i == 3:
                    judge_name = escape_string(str(td.text))
                if i == 4:
                    petitioner = escape_string(str(td.text))
                if i == 5:
                    respondent = escape_string(str(td.text))
                if i == 6:
                    bench = escape_string(str(td.text))
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + "(case_no, judgment_date, judge_name, petitioner, " \
                            "respondent, bench, pdf_file, pdf_filename) VALUE ('" + \
                            case_no + "', '" + judgment_date + "', '" + judge_name + "', '" + petitioner + "', '" + \
                            respondent + "', '" + bench + "', '" + pdf_file + "', '" + court_name + "_" + \
                            slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name):
    """Parse a flat <tr> judgments listing where parties are "A v/s B".

    '&' is blanked before parsing — presumably to dodge malformed entity
    markup in the source pages; confirm.
    Returns True on success; logs, bumps Tracker.No_Error and returns
    False on any exception.
    """
    try:
        soup = BeautifulSoup(str(html_str).replace('&', ' '), "html.parser")
        tr_list = soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            # Cooperative abort via Tracker.emergency_exit.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count == 1:  # header row
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    judgment_date = escape_string(str(td.decode_contents()))
                if i == 2:
                    judge_name = escape_string(str(td.decode_contents()))
                if i == 3:
                    case_no = escape_string(str(td.text))
                if i == 4:
                    # Cell reads "<petitioner> v/s <respondent>".
                    party = str(td.decode_contents()).split("v/s")
                    petitioner = escape_string(str(party[0]))
                    respondent = escape_string(str(party[1]))
                if i == 5:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = escape_string(str(base_url + a_tag.get('href')))
                    pdf_data = escape_string(
                        request_pdf(base_url + a_tag.get('href'), case_no,
                                    court_name))
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "judge_name, pdf_file, pdf_filename) VALUE ('" + \
                            case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + \
                            judge_name + "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name):
    """Parse a judgments listing page (second <table> of the document) and
    insert one DB row per case.

    Only table rows 3..17 are treated as data rows (rows 1-2 are headers,
    rows past 17 are footer/navigation).  Columns are positional:
    1=case no, 2=coram, 3=judgment date, 4=PDF link, 5=type, 6=status.

    :param html_str: raw HTML text of the listing page.
    :param court_name: destination table name (also the Tracker key).
    :return: True when the page was processed; False on any exception
             (Tracker.No_Error is incremented in that case).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_tag = soup.find_all('table')[1]  # results live in the 2nd table
        table_soup = BeautifulSoup(str(table_tag), "html.parser")
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            # Operator kill-switch: abort mid-scrape when Tracker.emergency_exit is set.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count <= 2 or tr_count > 17:  # header rows / trailing rows
                continue
            case_no = "NULL"
            judgment_date = "NULL"
            coram = "NULL"
            type_ = "NULL"
            status = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(str(td.decode_contents()))
                    # Disabled duplicate check — rows are currently always inserted:
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 2:
                    coram = escape_string(str(td.decode_contents()))
                if i == 3:
                    judgment_date = escape_string(str(td.decode_contents()))
                if i == 5:
                    type_ = escape_string(str(td.decode_contents()))
                if i == 6:
                    status = escape_string(str(td.decode_contents()))
                if i == 4:
                    # Column 4 holds the anchor to the judgment PDF; download it now.
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = escape_string(base_url + a_tag.get('href'))
                    pdf_data = escape_string(request_pdf(base_url + a_tag.get('href'), case_no, court_name))
            # Skip disclaimer rows that slip through the positional filter.
            # if case_no != "NULL" and insert_check and case_no.find("DISCLAIMER") == -1:
            if case_no != "NULL" and case_no.find("DISCLAIMER") == -1:
                # NOTE(review): queries are built by string concatenation; values
                # pass through escape_string but table names are interpolated raw.
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, judgment_date, coram, type, status, " \
                            "pdf_file, pdf_filename) VALUE ('" + case_no + "', '" + \
                            judgment_date + "', '" + coram + "', '" + type_ + "', '" + status + "', '" + pdf_file + \
                            "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str):
    """Parse a judgments listing table (width="100%") and insert one DB row
    per case.

    Data cells are identified by a mix of position and the td 'align'
    attribute: positional cells 2-5 (without align) carry case no,
    respondent, petitioner and judgment date; align='left' carries the
    judge name and align='center' the PDF link.

    NOTE(review): `court_name` is referenced throughout but is not a
    parameter — presumably a module-level global in this scraper's file;
    confirm before reuse.

    :param html_str: raw HTML text of the listing page.
    :return: True when the page was processed; False on any exception
             (Tracker.No_Error is incremented in that case).
    """
    try:
        soup = BeautifulSoup(str(html_str), "html.parser")
        table_soup = BeautifulSoup(
            str(soup.find_all('table', {"width": "100%"})[0]), "html.parser")
        tr_list = table_soup.select('tr')
        tr_count = 0
        for tr in tr_list:
            # Operator kill-switch: abort mid-scrape when Tracker.emergency_exit is set.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count <= 2:  # header rows
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.select('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 1:  # serial-number column, not stored
                    continue
                if i == 2 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    case_no = escape_string(str(font_tag.text))
                    # Disabled duplicate check — rows are currently always inserted:
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 3 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    respondent = escape_string(str(font_tag.text))
                if i == 4 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    petitioner = escape_string(str(font_tag.text))
                if i == 5 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    judgment_date = escape_string(str(font_tag.text))
                if td.get('align') == 'left':
                    td_soup1 = BeautifulSoup(str(td), "html.parser")
                    judge_name = escape_string(str(td_soup1.text))
                if td.get('align') == 'center':
                    # Centered cell holds the anchor to the judgment PDF.
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    a_tag = BeautifulSoup(str(font_tag), "html.parser").a
                    pdf_file = escape_string(base_url + "/" + a_tag.get('href'))
                    # NOTE(review): encoding a str to utf-8 and decoding it back
                    # with errors='ignore' is a no-op on str input — likely a
                    # leftover sanitization attempt; confirm before removing.
                    pdf_data = escape_string(
                        bytes(
                            str(
                                request_pdf(base_url + "/" + a_tag.get('href'),
                                            case_no)), 'utf-8').decode("utf-8", 'ignore'))
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "judge_name, pdf_file, pdf_filename) VALUE ('" + \
                            case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + \
                            judge_name + "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def full_details_parse(res, appeal_no, court_name):
    """Parse the case-details page for one appeal and UPDATE its DB row.

    Two areas of the page are read:
      * the striped details table — only its 5th row carries data
        (cells 2-5: filed-on, assessment year, bench, status);
      * the <section id="panel2-3"> hearing table — rows 2-5 carry the
        hearing/order date pairs in fixed positions.
    Any field not found keeps the literal string 'NULL'.

    :param res: raw HTML of the details page.
    :param appeal_no: appeal number used as the WHERE key for the UPDATE.
    :param court_name: table to update.
    :return: None; errors are logged and swallowed (best-effort update).
    """
    try:
        filed_on = 'NULL'
        assessment_year = 'NULL'
        bench_allotted = 'NULL'
        case_status = 'NULL'
        date_of_first_hearing = 'NULL'
        date_of_last_hearing = 'NULL'
        date_of_next_hearing = 'NULL'
        date_of_final_hearing = 'NULL'
        date_of_tribunal_order = 'NULL'
        date_of_pronouncement = 'NULL'
        order_result = 'NULL'
        soup = BeautifulSoup(res, "html.parser")
        table_list = soup.find_all('table', {'class': 'table table-striped table-bordered manage-efects'})
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            tr_count += 1
            if tr_count != 5:  # only row 5 of the details table holds data
                continue
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0
            for td in td_list:
                i += 1
                if i == 2:
                    filed_on = escape_string(str(td.text).strip().replace("\n", ""))
                if i == 3:
                    assessment_year = escape_string(str(td.text).strip().replace("\n", ""))
                if i == 4:
                    bench_allotted = escape_string(str(td.text).strip().replace("\n", ""))
                if i == 5:
                    case_status = escape_string(str(td.text).strip().replace("\n", ""))
        # Second pass: hearing/order dates live in the panel2-3 section.
        soup = BeautifulSoup(res, "html.parser")
        table_list = soup.find_all('section', {'id': 'panel2-3'})
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            tr_count += 1
            if tr_count == 1:  # header row
                continue
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            # Row layout: (row 2) first hearing / tribunal order,
            # (row 3) last hearing / pronouncement,
            # (row 4) next hearing / order result, (row 5) final hearing.
            if tr_count == 2:
                i = 0
                for td in td_list:
                    i += 1
                    if i == 1:
                        date_of_first_hearing = escape_string(str(td.text).strip().replace("\n", ""))
                    if i == 2:
                        date_of_tribunal_order = escape_string(str(td.text).strip().replace("\n", ""))
            if tr_count == 3:
                i = 0
                for td in td_list:
                    i += 1
                    if i == 1:
                        date_of_last_hearing = escape_string(str(td.text).strip().replace("\n", ""))
                    if i == 2:
                        date_of_pronouncement = escape_string(str(td.text).strip().replace("\n", ""))
            if tr_count == 4:
                i = 0
                for td in td_list:
                    i += 1
                    if i == 1:
                        date_of_next_hearing = escape_string(str(td.text).strip().replace("\n", ""))
                    if i == 2:
                        order_result = escape_string(str(td.text).strip().replace("\n", ""))
            if tr_count == 5:
                i = 0
                for td in td_list:
                    i += 1
                    if i == 1:
                        date_of_final_hearing = escape_string(str(td.text).strip().replace("\n", ""))
        # Persist everything in one UPDATE keyed on the appeal number.
        update_query("UPDATE " + court_name + " SET filed_on = '" + str(filed_on) +
                     "', assessment_year = '" + str(assessment_year) +
                     "', bench_allotted = '" + str(bench_allotted) +
                     "', case_status = '" + str(case_status) +
                     "', date_of_first_hearing = '" + str(date_of_first_hearing) +
                     "', date_of_last_hearing = '" + str(date_of_last_hearing) +
                     "', date_of_next_hearing = '" + str(date_of_next_hearing) +
                     "', date_of_final_hearing = '" + str(date_of_final_hearing) +
                     "', date_of_tribunal_order = '" + str(date_of_tribunal_order) +
                     "', date_of_pronouncement = '" + str(date_of_pronouncement) +
                     "', order_result = '" + str(order_result) +
                     "' WHERE appeal_no = '" + str(appeal_no) + "'")
    except Exception as e:
        logging.error("Failed to parse the details html: %s", e)
def request_data(court_name, start_date, end_date_):
    """Fetch judgment listings in 30-day windows and hand each to parse_html.

    Repeatedly POSTs to php/getJBJ.php for the window
    [start_date, start_date + 30 days], advancing until the window would
    pass end_date_.  The current window is persisted in the Tracker table
    so a crashed run can resume; "no data" windows and parse failures are
    counted/logged but do not stop the sweep.

    :param court_name: Tracker key / destination table name.
    :param start_date: first date to fetch, "dd-mm-YYYY".
    :param end_date_: last date to fetch, "dd-mm-YYYY".
    :return: True on normal completion or emergency exit, False on error.
    """
    try:
        url = base_url + 'php/getJBJ.php'
        headers = {
            'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
            'Cache-Control': "no-cache"
        }
        i = 0
        while True:
            i += 1
            # Operator kill-switch, checked once per window.  Guard against a
            # missing Tracker row: select_one_query may return None (the
            # previous code dereferenced it unconditionally).
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None and emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            end_date = (
                datetime.datetime.strptime(str(start_date), "%d-%m-%Y") +
                datetime.timedelta(days=30)).strftime("%d-%m-%Y")
            # Stop once the next window would run past the requested end date.
            if datetime.datetime.strptime(end_date_, "%d-%m-%Y") + datetime.timedelta(days=30) < \
                    datetime.datetime.strptime(str(end_date), "%d-%m-%Y"):
                logging.error("END date Exceed.")
                break
            # Persist progress so an interrupted sweep can resume here.
            update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) +
                         "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")
            payload = "jorrop=J" \
                      "&JBJfrom_date=" + str(start_date) + \
                      "&JBJto_date=" + str(end_date)
            response = requests.request("POST", url, data=payload, headers=headers,
                                        verify=False, proxies=proxy_dict)
            res = response.text
            if "no data found" in res.lower():
                logging.error("NO data Found for start date: " + str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                    str(court_name) + "'")
                start_date = end_date
                continue
            if not parse_html(res, court_name):
                logging.error("Failed to parse data from date: " + str(start_date))
            start_date = end_date
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def request_data(court_name, dc, headers, start_date, end_date_):
    """Fetch judgment listings one day at a time and hand each to offset_link.

    POSTs to /juddt1.php for each single-day window until the day after
    end_date_ is reached.  The current window is persisted in the Tracker
    table so a crashed run can resume; "NO ROWS" days and parse failures
    are counted/logged but do not stop the sweep.

    :param court_name: Tracker key / destination table name.
    :param dc: court/bench code passed through as the `dc` query parameter.
    :param headers: HTTP headers to send with every request.
    :param start_date: first date to fetch, "dd/mm/YYYY".
    :param end_date_: last date to fetch, "dd/mm/YYYY".
    :return: True on normal completion or emergency exit, False on error.
    """
    try:
        url = base_url + "/juddt1.php"
        i = 0
        while True:
            i += 1
            # Operator kill-switch, checked once per day.  Guard against a
            # missing Tracker row: select_one_query may return None (the
            # previous code dereferenced it unconditionally).
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None and emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            end_date = (
                datetime.datetime.strptime(str(start_date), "%d/%m/%Y") +
                datetime.timedelta(days=1)).strftime("%d/%m/%Y")
            # Stop once the next day would run past the requested end date.
            if datetime.datetime.strptime(str(end_date_), "%d/%m/%Y") + datetime.timedelta(days=1) < \
                    datetime.datetime.strptime(str(end_date), "%d/%m/%Y"):
                logging.error("DONE")
                break
            # Persist progress so an interrupted sweep can resume here.
            update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) +
                         "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")
            querystring = {"dc": str(dc), "fflag": "1"}
            payload = "juddt=" + str(start_date) + "&Submit=Submit"
            response = requests.request("POST", url, data=payload, headers=headers,
                                        params=querystring, proxies=proxy_dict)
            res = response.text
            if "NO ROWS" in res.upper():
                logging.error("NO data Found for start date: " + str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                    str(court_name) + "'")
                start_date = end_date
                continue
            if not offset_link(res, headers, court_name, dc):
                logging.error("Failed to parse data from date: " + str(start_date))
            start_date = end_date
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False