def offset_link(html_str, url, querystring, court_name):
    """Parse the first results page, then page through the remaining results.

    The first page's record count ("browse_range" div) gives the total; the
    site serves 200 records per page, addressed via an ``offset`` query param.
    Each page is fetched with GET and handed to parse_html.

    :param html_str: HTML of the first results page (already fetched).
    :param url: results URL to re-request with increasing offsets.
    :param querystring: dict of query params; mutated in place (sort_by,
        etal, offset are set here).
    :param court_name: Tracker row name, also used as the table name.
    :return: True on success or emergency stop, False on any failure.
    """
    try:
        if not parse_html(html_str, court_name):
            return False
        querystring['sort_by'] = "1"
        querystring['etal'] = "-1"
        soup = BeautifulSoup(html_str, "html.parser")
        div_tag = soup.find_all('div', {'class': 'browse_range'})[0]
        # Last number in the "browse_range" text is the total record count.
        # FIX: raw string for the regex — '\d' in a plain string is an invalid
        # escape (DeprecationWarning today, error in future Pythons).
        total_records = int(re.findall(r'\d+', str(div_tag.text))[-1])
        total_calls = ceil(total_records / 200)
        next_num = 0
        for _ in range(0, total_calls):
            next_num += 200
            # Operator kill switch, polled before every page request.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            querystring['offset'] = str(next_num)
            response = requests.request("GET", url, headers=headers,
                                        params=querystring, proxies=proxy_dict)
            res = response.text
            if not parse_html(res, court_name):
                logging.error("Failed for url: " + str(next_num))
                return False
        return True
    except Exception as e:
        logging.error("Error in offset_link. %s", e)
        return False
def request_data(court_name, bench, start_date, end_date_):
    """Crawl tribunal orders for one bench, one day at a time.

    Walks from start_date to end_date_ (both "%d/%m/%Y" strings), POSTing a
    form-encoded query per day to <base_url>/tribunalorders and passing each
    response to parse_html.  Progress is recorded in the Tracker table.

    :return: True on normal completion or emergency stop, False on error.
    """
    try:
        url = base_url + "/tribunalorders"
        headers = {
            'Content-Type': "application/x-www-form-urlencoded",
            'Cache-Control': "no-cache"
        }
        i = 0  # iteration counter; not otherwise used
        while True:
            i += 1
            # Operator kill switch, polled before every request.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            # One-day window: end_date = start_date + 1 day.
            end_date = (datetime.datetime.strptime(str(start_date), "%d/%m/%Y") + datetime.timedelta(days=1)
                        ).strftime("%d/%m/%Y")
            # Stop once the window has moved past the requested end date.
            if datetime.datetime.strptime(str(end_date_), "%d/%m/%Y") + datetime.timedelta(days=1) < \
                    datetime.datetime.strptime(str(end_date), "%d/%m/%Y"):
                logging.error("DONE")
                break
            # NOTE(review): SQL is built by string concatenation throughout —
            # injection-prone; relies on callers/escape_string elsewhere.
            update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) + "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")
            payload = "bench=" + str(bench) + \
                      "&appeal_type=" \
                      "&hearingdate=" \
                      "&pronouncementdate=" \
                      "&orderdate=" + str(start_date) + \
                      "&member=" \
                      "&assesseename="
            response = requests.request("POST", url, data=payload, headers=headers, verify=False,
                                        proxies=proxy_dict)
            res = response.text
            # NOTE(review): response.text is never None for a completed
            # request — this branch is likely dead; confirm intent.
            if res is None:
                logging.error("NO data Found.")
                update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                             str(court_name) + "'")
                start_date = end_date
                continue
            if not parse_html(res, court_name, bench):
                logging.error("Failed to parse data from bench: " + str(bench))
            start_date = end_date
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from bench: " + str(bench))
        logging.error("Failed to request: %s", e)
        return False
def request_data(court_name, headers, start_date, end_date_):
    """Crawl reported judgments day by day via coram-reported-judgment.php.

    Dates use "%d-%m-%Y".  Each day's POST result is handed to offset_link
    (which paginates and parses); Tracker rows record progress and no-data
    days.

    :return: True on normal completion or emergency stop, False on error.
    """
    try:
        url = base_url + "coram-reported-judgment.php"
        i = 0  # iteration counter; not otherwise used
        while True:
            i += 1
            # Operator kill switch, polled before every request.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            # One-day window.
            end_date = (datetime.datetime.strptime(str(start_date), "%d-%m-%Y") + datetime.timedelta(days=1)
                        ).strftime("%d-%m-%Y")
            if datetime.datetime.strptime(str(end_date_), "%d-%m-%Y") + datetime.timedelta(days=1) < \
                    datetime.datetime.strptime(str(end_date), "%d-%m-%Y"):
                logging.error("DONE")
                break
            update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) + "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")
            payload = "coram=0" \
                      "&ojtype=1" \
                      "&bench_type=0" \
                      "&reported=Y" \
                      "&startdate=" + str(start_date) + \
                      "&enddate=" + str(end_date) + \
                      "&coramqueryreported=0"
            response = requests.request("POST", url, data=payload, headers=headers, proxies=proxy_dict)
            res = response.text
            # Site signals an empty result set with a "no rows" message.
            if "NO ROWS" in res.upper():
                logging.error("NO data Found for start date: " + str(start_date))
                update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                             str(court_name) + "'")
                start_date = end_date
                continue
            # offset_link re-POSTs the same payload with &start=N paging.
            if not offset_link(res, payload, court_name, headers):
                logging.error("Failed to parse data from date: " + str(start_date))
            start_date = end_date
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def parse_html(html_str, court_name):
    """Extract judgment links from the CPHBody_PanelList div and store them.

    Each anchor's text is split into case number (all but the last 10 chars)
    and judgment date (last 10 chars); the linked PDF is downloaded via
    request_pdf and the row inserted into the per-court table.

    :return: True when the page was processed, False on a parse error.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        div = soup.find_all('div', {'id': 'CPHBody_PanelList'})[0]
        a_list_soup = BeautifulSoup(str(div), "html.parser")
        a_list = a_list_soup.find_all('a')
        # De-duplicate anchors; NOTE set() loses the original ordering.
        a_list_unique = list(set(a_list))
        for a in a_list_unique:
            # Operator kill switch, polled per record.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            # Anchor text layout: "<case-no><dd-mm-yyyy>" (date = last 10 chars).
            case_no = escape_string(str(str(a.text)[:-10]).replace("-", ""))
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            judgment_date = escape_string(str(a.text)[-10:])
            # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
            #     insert_check = True
            a_link = a.get('href')
            pdf_data = escape_string(
                request_pdf(base_url + a_link, case_no, court_name))
            pdf_file = escape_string(base_url + a_link)
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                # SQL built by concatenation; values pass through escape_string.
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, judgment_date, pdf_file, pdf_filename) " \
                            "VALUE ('" + case_no + "', '" + judgment_date + "', '" \
                            + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name):
    """Extract judgment PDFs from the first <ul> list of the page and store them.

    The case number is derived from the PDF filename in each anchor's href;
    the judgment date is assumed to be the last 10 characters of that name
    (parentheses stripped).  Text is stored in the per-court ``text_data``
    column (unlike sibling parsers, which use ``pdf_data``).

    :return: True when the page was processed, False on a parse error.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        ul = soup.find_all('ul')[0]
        ul_soup = BeautifulSoup(str(ul), "html.parser")
        li_list = ul_soup.find_all('li')
        # p_list = ul_soup.find_all('p')
        # p_list = [x for x in p_list if "<p><font" not in str(x)]
        # print(p_list)
        # return
        for li in li_list:
            # Operator kill switch, polled per record.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            a = BeautifulSoup(str(li), "html.parser").a
            a_link = a.get('href')
            # Case number = the PDF's basename without its extension.
            case_no = str(a_link[a_link.rfind("/")+1:]).replace('.pdf', '')
            judgment_date = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
            #     insert_check = True
            # Assumes filename ends with a (bracketed) 10-char date — TODO confirm.
            judgment_date = escape_string(case_no[-10:].replace('(', '').replace(')', ''))
            pdf_data = escape_string(request_pdf(base_url + a_link, case_no, court_name))
            pdf_file = escape_string(base_url + a_link)
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, judgment_date, pdf_file, pdf_filename) " \
                            "VALUE ('" + case_no + "', '" + judgment_date + "', '" \
                            + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET text_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def request_data(court_name, start_date, end_date_):
    """Crawl monthly judgment pages (JDMT<MonYY>.html) between two dates.

    start_date / end_date_ are month-year strings whose last two characters
    are a 2-digit year; months earlier than '10 are marked as no-data.  Pages
    from 2010 (and Jan11) are parsed with a legacy-layout flag.

    :return: True on normal completion or emergency stop, False on error.
    """
    try:
        headers = {
            'Cache-Control': "no-cache",
        }
        # Data only exists from year '10 onward; flag the Tracker and, if the
        # whole requested range predates it, finish immediately.
        if int(start_date[-2:]) < 10:
            update_query("UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', emergency_exit=true WHERE Name = '" +
                         str(court_name) + "'")
            if int(end_date_[-2:]) < 10:
                update_history_tracker(court_name)
                return True
        for month_year in month_list_([str(start_date), str(end_date_)]):
            month_year = date_fix(month_year)
            # Operator kill switch, polled per month.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            url = base_url + "JDMT" + str(month_year) + ".html"
            update_query("UPDATE Tracker SET Start_Date = '" + str(month_year) + "', End_Date = '" + str(end_date_) +
                         "' WHERE Name = '" + str(court_name) + "'")
            response = requests.request("GET", url, headers=headers, proxies=proxy_dict)
            res = response.text
            # FIX: the needle is lowercase, so it can never occur in
            # res.upper() — the 404 page was being fed to parse_html.
            # Sibling scrapers use res.lower() for this same check.
            if "file or directory not found" in res.lower():
                logging.error("NO data Found for start date: " + str(month_year))
                update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                             str(court_name) + "'")
                continue
            # Presumably 2010 pages (and Jan 2011) use an older layout; the
            # boolean selects the layout in parse_html — TODO confirm.
            if str(month_year[-2:]) == '10' or str(month_year) == 'Jan11':
                if not parse_html(res, court_name, True):
                    logging.error("Failed to parse data from date: " + str(month_year))
            else:
                if not parse_html(res, court_name, False):
                    logging.error("Failed to parse data from date: " + str(month_year))
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def request_data(court_name, bench, start_date, end_date_):
    """Crawl yearly index pages per bench and section type (2010-2016 only).

    start_date / end_date_ are integer years.  For each in-range year and
    each of three section types, fetches <bench>/<year>/<type>/index.html and
    hands it to parse_html.

    :return: True on normal completion or emergency stop, False on error.
    """
    try:
        for year in range(start_date, end_date_ + 1):
            # Site only hosts data for 2010-2016; count the rest as no-data.
            if int(year) < 2010 or int(year) > 2016:
                logging.error("NO data Found for start date: " + str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" + str(court_name) + "'")
                continue
            section_types = ['111_111_A', '397_398', 'Others']
            for section_type in section_types:
                child_url = str(bench) + '/' + str(year) + '/' + str(
                    section_type) + '/'
                url = base_url + child_url + 'index.html'
                # Operator kill switch, polled per request.
                emergency_exit = select_one_query(
                    "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
                if emergency_exit['emergency_exit'] == 1:
                    update_history_tracker(court_name)
                    return True
                update_query("UPDATE Tracker SET Start_Date = '" + str(year) + "', End_Date = '" + str(year) +
                             "' WHERE Name = '" + str(court_name) + "'")
                response = requests.request("GET", url, proxies=proxy_dict)
                res = response.text
                # NOTE(review): response.text is never None for a completed
                # request — likely dead branch; confirm intent.
                if res is None:
                    logging.error("NO data Found for year: " + str(year))
                    update_query(
                        "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                        str(court_name) + "'")
                    continue
                if not parse_html(res, court_name, bench, child_url):
                    logging.error("Failed to parse data for year: " + str(year))
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def offset_link(html_str, headers):
    """Parse the first results page, then follow its pagination links.

    Pagination anchors live in a specific <td> after its nested tables are
    removed; each unique link (except the page=1 link, which duplicates the
    already-parsed first page) is POSTed and parsed.

    NOTE(review): ``court_name`` is referenced below but is not a parameter
    or local — presumably a module-level global in the original module;
    verify, otherwise this always raises and returns False.

    :return: True on success or emergency stop, False on any failure.
    """
    try:
        if not parse_html(html_str):
            return False
        soup = BeautifulSoup(html_str, "html.parser")
        td_tag = soup.find_all('td', {
            'height': '172',
            'align': 'center',
            'valign': 'top'
        })[0]
        td_soup = BeautifulSoup(str(td_tag), "html.parser")
        # Drop nested result tables so only pagination anchors remain.
        for table in td_soup.find_all("table"):
            table.decompose()
        a_tags = td_soup.find_all('a')
        a_link_list = []
        for a_tag in a_tags:
            a_link = base_url + a_tag.get('href')
            a_link_list.append(a_link)
        # De-duplicate; NOTE set() loses page ordering.
        a_link_list_unique = list(set(a_link_list))
        i = 0  # counter; not otherwise used
        for page_link in a_link_list_unique:
            i += 1
            # Operator kill switch, polled per page.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                break
            # Skip page 1 — it is the html_str we already parsed above.
            if page_link != 'http://hcmjudgment.man.nic.in/ByDate.php?page=1':
                response = requests.request("POST", page_link, headers=headers, proxies=proxy_dict)
                res = response.text
                if not parse_html(res):
                    logging.error("Failed for url: " + page_link)
                    return False
        return True
    except Exception as e:
        logging.error("Error in offset_link. %s", e)
        return False
def offset_link(html_str, headers, court_name):
    """Parse the first results page, then follow pagination links (phhc site).

    Pagination anchors sit in the second centered <tr> of table ``tables11``;
    each unique link (except page_no=1, which duplicates the already-parsed
    first page) is POSTed and parsed.

    :return: True on success or emergency stop, False on any failure.
    """
    try:
        if not parse_html(html_str, court_name, headers):
            return False
        soup = BeautifulSoup(html_str, "html.parser")
        table_tag = soup.find_all('table', {'id': 'tables11'})[0]
        table_soup = BeautifulSoup(str(table_tag), "html.parser")
        tr_tag = table_soup.find_all('tr', {'align': 'center'})
        # No pagination row at all → single page, already parsed.
        if len(tr_tag) <= 0:
            return True
        tr_tag = tr_tag[1]
        tr_soup = BeautifulSoup(str(tr_tag), "html.parser")
        a_tags = tr_soup.find_all('a')
        a_link_list = []
        for a_tag in a_tags:
            a_link = base_url + a_tag.get('href')
            a_link_list.append(a_link)
        # De-duplicate; NOTE set() loses page ordering.
        a_link_list_unique = list(set(a_link_list))
        i = 0  # counter; not otherwise used
        for page_link in a_link_list_unique:
            i += 1
            # Operator kill switch, polled per page.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                break
            # Skip page 1 — it is the html_str we already parsed above.
            if page_link != "https://phhc.gov.in./home.php?search_param=free_text_search_judgment&page_no=1":
                response = requests.request("POST", page_link, headers=headers, verify=False, proxies=proxy_dict)
                res = response.text
                if not parse_html(res, court_name, headers):
                    logging.error("Failed for url: " + page_link)
                    return False
        return True
    except Exception as e:
        logging.error("Error in offset_link. %s", e)
        return False
def request_data(court_name, start_date, end_date_):
    """Crawl yearly DecisionsHeadline pages from start_date to end_date_.

    start_date / end_date_ are year strings; years before 2012 have no data.
    The 2018 page is served without a year suffix ("DecisionsHeadline.html").

    :return: True on normal completion or emergency stop, False on error.
    """
    try:
        # Data only exists from 2012 onward; flag the Tracker and, if the
        # whole requested range predates it, finish immediately.
        if int(start_date) < 2012:
            update_query("UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', emergency_exit=true WHERE Name = '" +
                         str(court_name) + "'")
            if int(end_date_) < 2012:
                update_history_tracker(court_name)
                return True
        for year_ in range(int(start_date), int(end_date_) + 1):
            # Operator kill switch, polled per year.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            # Current-year page has no year suffix in its filename.
            if int(year_) == 2018:
                year_ = ''
            url = base_url + "DecisionsHeadline" + str(year_) + ".html"
            update_query("UPDATE Tracker SET Start_Date = '" + str(year_) + "', End_Date = '" + str(end_date_) +
                         "' WHERE Name = '" + str(court_name) + "'")
            response = requests.request("GET", url, proxies=proxy_dict)
            res = response.text
            # Server's 404 page contains this phrase.
            if "file or directory not found" in res.lower():
                logging.error("NO data Found for start date: " + str(year_))
                update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                             str(court_name) + "'")
                continue
            if not parse_html(res, court_name):
                logging.error("Failed to parse data from date: " + str(year_))
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def offset_link(html_str, headers, court_name, dc):
    """Parse the first results page, then follow pagination links (lobis site).

    Pagination anchors sit in the second <p class="style2"> element; each
    unique link (except offset=0, which duplicates the already-parsed first
    page) is POSTed and parsed.

    :param dc: opaque value forwarded to parse_html — semantics not visible
        here; presumably a court/bench code.
    :return: True on success or emergency stop, False on any failure.
    """
    try:
        if not parse_html(html_str, court_name, dc):
            return False
        soup = BeautifulSoup(html_str, "html.parser")
        p_tag = soup.find_all('p', {'class': 'style2'})[1]
        p_soup = BeautifulSoup(str(p_tag), "html.parser")
        a_tags = p_soup.find_all('a')
        a_link_list = []
        for a_tag in a_tags:
            a_link = base_url + a_tag.get('href')
            a_link_list.append(a_link)
        # De-duplicate; NOTE set() loses page ordering.
        a_link_list_unique = list(set(a_link_list))
        i = 0  # counter; not otherwise used
        for page_link in a_link_list_unique:
            i += 1
            # Operator kill switch, polled per page.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                break
            # Skip offset=0 — it is the html_str we already parsed above.
            if page_link != "http://lobis.nic.in/juddt1.php?offset=0":
                response = requests.request("POST", page_link, headers=headers, proxies=proxy_dict)
                res = response.text
                if not parse_html(res, court_name, dc):
                    logging.error("Failed for url: " + page_link)
                    return False
        return True
    except Exception as e:
        logging.error("Error in offset_link. %s", e)
        return False
def offset_link(html_str, o_payload, court_name, headers):
    """Parse the first results page, then page through via &start=N offsets.

    The record count is read from the first <b> tag of the second table; the
    site serves 15 records per page.  Each subsequent page is fetched by
    re-POSTing the original payload with ``&start=<offset>`` appended.

    :param o_payload: the form-encoded payload that produced html_str.
    :return: True on success or emergency stop, False on any failure.
    """
    url = base_url + "coram-reported-judgment.php"
    try:
        if not parse_html(html_str, court_name):
            return False
        soup = BeautifulSoup(html_str, "html.parser")
        table_tag = soup.find_all('table')[1]
        table_soup = BeautifulSoup(str(table_tag), "html.parser")
        b_tag = table_soup.find_all('b')[0]
        if str(b_tag.decode_contents()).lower().find('no record found') != -1:
            return True
        # Last number in the summary text is the total record count.
        # FIX: raw string for the regex — '\d' in a plain string is an invalid
        # escape (DeprecationWarning today, error in future Pythons).
        total_records = int(re.findall(r'\d+', str(b_tag.decode_contents()))[-1])
        total_calls = ceil(total_records / 15)
        next_num = 0
        for _ in range(0, total_calls):
            next_num += 15
            # Operator kill switch, polled per page.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                break
            payload = o_payload + "&start=" + str(next_num)
            response = requests.request("POST", url, data=payload,
                                        headers=headers, proxies=proxy_dict)
            res = response.text
            if not parse_html(res, court_name):
                logging.error("Failed for url: " + str(next_num))
                return False
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Error in offset_link. %s", e)
        return False
def parse_html(html_str, court_name, court_id):
    """Extract one judgment per <table> in the page and store it.

    Cell layout per table: td1 = case number, td3 = judgment date,
    td4 = PDF link (downloaded via request_pdf).

    :param court_id: numeric court id, inserted unquoted into the row.
    :return: True when the page was processed, False on a parse error.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table')
        for table in table_list:
            # Operator kill switch, polled per record.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            case_no = "NULL"
            judgment_date = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            table_soup = BeautifulSoup(str(table), "html.parser")
            td_list = table_soup.find_all('td')
            i = 0  # 1-based cell position within this table
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(str(td.decode_contents()))
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 3:
                    judgment_date = escape_string(str(td.decode_contents()))
                if i == 4:
                    pdf_file = base_url + BeautifulSoup(
                        str(td), "html.parser").a.get('href')
                    pdf_data = escape_string(
                        request_pdf(pdf_file, case_no, court_name))
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                # SQL built by concatenation; court_id is interpolated unquoted.
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, court_id, judgment_date, pdf_file, " \
                            "pdf_filename) VALUE ('" + case_no + "', " + court_id + \
                            ", '" + judgment_date + "', '" + pdf_file + "', '" + court_name + "_" + \
                            slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name, bench, start_date):
    """Extract judgments from the "hoverTable" results table and store them.

    Row layout (after the header row): td2 = case number, td3 = petitioner
    and respondent separated by <br/>, td4 = judge name, td5 = judgment
    date, td7 = PDF link.  PDF download is disabled (commented out).

    :param bench: bench code stored with each row.
    :param start_date: used only in the no-data log message.
    :return: True when processed (or table empty), False on a parse error.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_soup = BeautifulSoup(
            str(soup.find_all('table', {'class': 'hoverTable'})[0]),
            'html.parser')
        tr_list = table_soup.find_all('tr')
        if not tr_list:
            logging.error("NO data Found for start date: " + str(start_date))
            update_query(
                "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" + str(court_name) + "'")
            return True
        tr_count = 0
        for tr in tr_list:
            # Operator kill switch, polled per row.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            # Skip the header row.
            if tr_count == 1:
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            i = 0  # 1-based cell position within this row
            for td in td_list:
                i += 1
                if i == 1:
                    continue
                if i == 2:
                    case_no = escape_string(
                        str(td.text).strip().replace("\n", ""))
                if i == 3:
                    # "<petitioner><br/>Vs<br/><respondent>" — middle part skipped.
                    party = str(td.decode_contents()).split("<br/>")
                    petitioner = escape_string(str(party[0]).strip())
                    respondent = escape_string(str(party[2]).strip())
                if i == 4:
                    judge_name = escape_string(str(td.text).strip())
                if i == 5:
                    judgment_date = escape_string(str(td.text).strip())
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 7:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = base_url + a_tag.get('href')
                    # pdf_data = escape_string(request_pdf(pdf_file, case_no, court_name))
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "judge_name, pdf_file, bench_code, pdf_filename) VALUE" \
                            " ('" + case_no + "', '" + petitioner + "', '" + \
                            respondent + "', '" + judgment_date + "', '" + judge_name + "', '" + pdf_file + "', '" + \
                            str(bench) + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def request_data(court_name, court_id, start_date, end_date_):
    """Crawl judgments in 180-day windows via dtquery_new_v1.asp.

    Dates use "%d/%m/%Y".  Each window's POST result is handed to parse_html
    along with court_id; Tracker rows record progress and no-data windows.

    :return: True on normal completion or emergency stop, False on error.
    """
    try:
        url = base_url + "dtquery_new_v1.asp"
        headers = {
            'Content-Type': "application/x-www-form-urlencoded",
            'Cache-Control': "no-cache"
        }
        i = 0  # iteration counter; not otherwise used
        while True:
            i += 1
            # Operator kill switch, polled before every request.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            # 180-day window.
            end_date = (
                datetime.datetime.strptime(str(start_date), "%d/%m/%Y") +
                datetime.timedelta(days=180)).strftime("%d/%m/%Y")
            if datetime.datetime.strptime(end_date_, "%d/%m/%Y") + datetime.timedelta(days=180) < \
                    datetime.datetime.strptime(str(end_date), "%d/%m/%Y"):
                logging.error("DONE")
                break
            update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) + "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")
            payload = "action=validate_login" \
                      "&Court_Id=" + str(court_id) + \
                      "&party=jus" \
                      "&FromDt=" + str(start_date) + \
                      "&ToDt=" + str(end_date)
            response = requests.request("POST", url, data=payload, headers=headers, proxies=proxy_dict)
            res = response.text
            if "no data found" in res.lower():
                logging.error("NO data Found for start date: " + str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" + str(court_name) + "'")
                start_date = end_date
                continue
            if not parse_html(res, court_name, court_id):
                logging.error("Failed to parse data from date: " + str(start_date))
            start_date = end_date
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def parse_html(html_str, court_name, headers):
    """Extract judgments from table ``tables11`` (rows after the first 3).

    Row layout: td2 = case-number anchor, td3 = parties split on "Vs",
    td4 = judgment date, td5 = PDF link recovered from the anchor's
    window.open onclick handler.  PDF download is disabled (commented out).

    :param headers: kept for the disabled request_pdf call's signature.
    :return: True when the page was processed, False on a parse error.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table', {'id': 'tables11'})
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            # Operator kill switch, polled per row.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            # Skip the three header rows.
            if tr_count <= 3:
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            table_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = table_soup.find_all('td')
            i = 0  # 1-based cell position within this row
            for td in td_list:
                i += 1
                if i == 1:
                    continue
                if i == 2:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    case_no = escape_string(str(a_tag.text))
                if i == 3:
                    party = str(td.decode_contents()).split("Vs")
                    petitioner = escape_string(str(party[0]))
                    respondent = escape_string(str(party[1]))
                if i == 4:
                    judgment_date = escape_string(str(td.decode_contents()))
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 5:
                    # href is delivered via onclick="window.open('...')".
                    a_link = BeautifulSoup(str(td), "html.parser").a.get('onclick')
                    a_formatted = str(
                        str(a_link).replace("window.open('", "")).replace("')", "")
                    pdf_file = escape_string(base_url + "/" + a_formatted)
                    # pdf_data = escape_string(request_pdf(
                    #     str(pdf_file).replace(base_url + "download_file.php?auth=", ""), case_no, court_name,
                    #     headers))
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "pdf_file, pdf_filename) VALUE ('" + case_no + "', '" + \
                            petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + pdf_file + "', '" + \
                            court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def request_data(court_name, bench, headers, start_date, end_date_):
    """Crawl judgments per bench in 30-day windows via judgement_status.php.

    Dates use "%Y-%m-%d".  Each window's POST result is handed to parse_html
    along with bench and start_date; Tracker rows record progress.

    :return: True on normal completion or emergency stop, False on error.
    """
    try:
        url = base_url + '/' + str(bench) + "/services/judgement_status.php"
        i = 0  # iteration counter; not otherwise used
        while True:
            i += 1
            # Operator kill switch, polled before every request.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            # 30-day window.
            end_date = (
                datetime.datetime.strptime(str(start_date), "%Y-%m-%d") +
                datetime.timedelta(days=30)).strftime("%Y-%m-%d")
            if datetime.datetime.strptime(str(end_date_), "%Y-%m-%d") + datetime.timedelta(days=30) < \
                    datetime.datetime.strptime(str(end_date), "%Y-%m-%d"):
                logging.error("DONE")
                break
            update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) + "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")
            # search_type=3 with from_date1/to_date1 — presumably the
            # "by judgment date" search mode; confirm against the site form.
            payload = "case_no=" \
                      "&case_type=0" \
                      "&case_year=" \
                      "&filing_no=" \
                      "&from_date=" \
                      "&from_date1=" + str(start_date) + \
                      "&judge_detail=0" \
                      "&search_type=3" \
                      "&to_date=" \
                      "&to_date1=" + str(end_date) + \
                      "&txtState=" \
                      "&txtSubject="
            response = requests.request("POST", url, data=payload, headers=headers, proxies=proxy_dict)
            res = response.text
            # NOTE(review): response.text is never None for a completed
            # request — likely dead branch; confirm intent.
            if res is None:
                logging.error("NO data Found for start date: " + str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" + str(court_name) + "'")
                start_date = end_date
                continue
            if not parse_html(res, court_name, bench, start_date):
                logging.error("Failed to parse data from date: " + str(start_date))
            start_date = end_date
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def parse_html(html_str):
    """Extract judgments from the first width="100%" table (rows after 2).

    Cells without an ``align`` attribute carry (by position) case number,
    respondent, petitioner, judgment date inside <font> tags; align="left"
    cells carry the judge name and align="center" cells the PDF anchor,
    which is also downloaded via request_pdf.

    NOTE(review): ``court_name`` is referenced below but is not a parameter
    or local — presumably a module-level global in the original module;
    verify, otherwise this always raises and returns False.

    :return: True when the page was processed, False on a parse error.
    """
    try:
        soup = BeautifulSoup(str(html_str), "html.parser")
        table_soup = BeautifulSoup(
            str(soup.find_all('table', {"width": "100%"})[0]), "html.parser")
        tr_list = table_soup.select('tr')
        tr_count = 0
        for tr in tr_list:
            # Operator kill switch, polled per row.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            # Skip the two header rows.
            if tr_count <= 2:
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.select('td')
            i = 0  # 1-based cell position within this row
            for td in td_list:
                i += 1
                if i == 1:
                    continue
                if i == 2 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    case_no = escape_string(str(font_tag.text))
                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                if i == 3 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    respondent = escape_string(str(font_tag.text))
                if i == 4 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    petitioner = escape_string(str(font_tag.text))
                if i == 5 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    judgment_date = escape_string(str(font_tag.text))
                if td.get('align') == 'left':
                    td_soup1 = BeautifulSoup(str(td), "html.parser")
                    judge_name = escape_string(str(td_soup1.text))
                if td.get('align') == 'center':
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    a_tag = BeautifulSoup(str(font_tag), "html.parser").a
                    pdf_file = escape_string(base_url + "/" + a_tag.get('href'))
                    # Scrub non-UTF-8 bytes from the extracted text.
                    pdf_data = escape_string(
                        bytes(
                            str(
                                request_pdf(base_url + "/" + a_tag.get('href'),
                                            case_no)),
                            'utf-8').decode("utf-8", 'ignore'))
            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                            "judge_name, pdf_file, pdf_filename) VALUE ('" + \
                            case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + \
                            judge_name + "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) +
                             "' WHERE case_no = '" + str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def request_data(court_name, headers, start_date, end_date_):
    """Crawl free-text judgment search results day by day (phhc home.php).

    Dates use "%d/%m/%Y".  Each day's POST (free_text=Justice) is handed to
    offset_link, which paginates and parses; Tracker rows record progress.

    :return: True on normal completion or emergency stop, False on error.
    """
    try:
        url = base_url + "/home.php"
        i = 0  # iteration counter; not otherwise used
        while True:
            i += 1
            # Operator kill switch, polled before every request.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            # One-day window.
            end_date = (
                datetime.datetime.strptime(str(start_date), "%d/%m/%Y") +
                datetime.timedelta(days=1)).strftime("%d/%m/%Y")
            if datetime.datetime.strptime(end_date_, "%d/%m/%Y") + datetime.timedelta(days=1) < \
                    datetime.datetime.strptime(str(end_date), "%d/%m/%Y"):
                logging.error("DONE")
                break
            update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) + "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")
            querystring = {"search_param": "free_text_search_judgment"}
            payload = "t_case_type=" \
                      "&t_case_year=" \
                      "&submit=Search%20Case" \
                      "&from_date=" + str(start_date) + \
                      "&to_date=" + str(end_date) + \
                      "&pet_name=" \
                      "&res_name=" \
                      "&free_text=Justice"
            response = requests.request("POST", url, data=payload, headers=headers, params=querystring,
                                        verify=False, proxies=proxy_dict)
            res = response.text
            if "no data found" in res.lower():
                logging.error("NO data Found for start date: " + str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" + str(court_name) + "'")
                # Brief politeness delay before the next day's request.
                sleep(2)
                start_date = end_date
                continue
            if not offset_link(res, headers, court_name):
                logging.error("Failed to parse data from date: " + str(start_date))
            start_date = end_date
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def request_data(court_name, start_date, end_date_):
    """Crawl appellate-tribunal judgments per month and appeal type.

    start_date / end_date_ end with a 4-digit year; data exists from 2013
    onward.  For each month in the range and each appeal type, POSTs to
    judgementsdetails.asp and hands the result to parse_html.

    :return: True on normal completion or emergency stop, False on error.
    """
    try:
        headers = {
            'Content-Type': "application/x-www-form-urlencoded",
            'Cache-Control': "no-cache",
        }
        url = base_url + '/judgementsdetails.asp'
        appeal_types = [
            'NDPS/FPA/ND', 'PMLA/FPA-PMLA', 'SAFEMA/FPA-1', 'FPA/BP',
            'FEMA/FERA/FPA-FE'
        ]
        # Data only exists from 2013 onward; flag the Tracker and, if the
        # whole requested range predates it, finish immediately.
        if int(start_date[-4:]) < 2013:
            update_query(
                "UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', emergency_exit=true WHERE Name = '" +
                str(court_name) + "'")
            if int(end_date_[-4:]) < 2013:
                update_history_tracker(court_name)
                return True
        for month_year in month_list_([str(start_date), str(end_date_)]):
            for appeal_type in appeal_types:
                # Operator kill switch, polled per request.
                emergency_exit = select_one_query(
                    "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
                if emergency_exit['emergency_exit'] == 1:
                    update_history_tracker(court_name)
                    return True
                update_query("UPDATE Tracker SET Start_Date = '" + str(month_year) + "', End_Date = '" +
                             str(month_year) + "' WHERE Name = '" + str(court_name) + "'")
                # month_year = "<month><yyyy>": last 4 chars are the year.
                payload = "ACTAPPEALTYPE=" + appeal_type + \
                          "&DDMONTH=" + str(month_year[:-4]) + \
                          "&DDYEAR=" + str(month_year[-4:])
                response = requests.request("POST", url, data=payload, headers=headers, proxies=proxy_dict)
                res = response.text
                if 'there are no records at present' in res.lower():
                    logging.error("NO data Found for year: " + str(month_year))
                    update_query(
                        "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                        str(court_name) + "'")
                    continue
                if not parse_html(res, court_name, appeal_type):
                    logging.error("Failed to parse data for year: " + str(month_year))
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def parse_html(html_str, court_name, appeal_type):
    """Parse one listing page of tribunal orders and store each row.

    Expects a bootstrap table ('table table-bordered'); for every data row
    the case number, order date, parties and order PDF are extracted and
    inserted into the court's table.  Returns True on success, False when
    parsing fails (the Tracker error counter is bumped).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_soup = BeautifulSoup(
            str(soup.find_all('table', {'class': 'table table-bordered'})[0]),
            'html.parser')
        for row_no, row in enumerate(table_soup.find_all('tr'), 1):
            stop = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if stop is not None and stop['emergency_exit'] == 1:
                break
            if row_no == 1:  # header row
                continue
            case_no = "NULL"
            date_of_order = "NULL"
            appellant = "NULL"
            respondent = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            cells = BeautifulSoup(str(row), "html.parser").find_all('td')
            for col, cell in enumerate(cells, 1):
                if col == 2:
                    case_no = escape_string(str(cell.text).strip().replace("\n", ""))
                elif col == 3:
                    date_of_order = escape_string(
                        str(cell.text).strip().replace("\n", ""))
                elif col == 4:
                    party = str(cell.decode_contents()).split("V/s")
                    appellant = escape_string(str(party[0]))
                    respondent = escape_string(str(party[1]))
                elif col == 5:
                    a_tag = BeautifulSoup(str(cell), "html.parser").a
                    pdf_url = str(base_url + a_tag.get('href')).replace('\\', '/')
                    pdf_file = escape_string(pdf_url)
                    pdf_data = escape_string(request_pdf(pdf_url, case_no, court_name))
            if case_no != "NULL":
                insert_query(
                    "INSERT INTO " + str(court_name) +
                    " (case_no, date_of_order, appellant, respondent, "
                    "pdf_file, appeal_type, pdf_filename) VALUE ('" +
                    case_no + "', '" + date_of_order + "', '" + appellant + "', '" +
                    respondent + "', '" + pdf_file + "', '" + appeal_type + "', '" +
                    court_name + "_" + slugify(case_no) + ".pdf')")
                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" +
                             str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
                     str(court_name) + "'")
        return False
def parse_html(html_str, court_name):
    """Parse judgment records laid out as repeating 9-row <tr> groups.

    Rows 1-8 of each group carry one field apiece (diary number, case no +
    judgment PDF link, parties, advocates, bench, judge); row 9 terminates
    the group, after which the accumulators reset.  Each completed record
    is inserted into the court's table.  Returns True on success, False
    (and bumps the Tracker error counter) on failure.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        case_no = "NULL"
        diary_number = "NULL"
        petitioner = "NULL"
        respondent = "NULL"
        petitioner_advocate = "NULL"
        respondent_advocate = "NULL"
        judgment_date = "NULL"
        judge_name = "NULL"
        bench = "NULL"
        pdf_data = "NULL"
        pdf_file = "NULL"
        row_no = 0
        for tr in soup.find_all('tr'):
            stop = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if stop is not None and stop['emergency_exit'] == 1:
                break
            row_no += 1
            cells = BeautifulSoup(str(tr), "html.parser").find_all('td')
            if row_no == 1:
                for col, cell in enumerate(cells, 1):
                    if col == 3:
                        diary_number = escape_string(str(cell.decode_contents()))
            elif row_no == 2:
                for col, cell in enumerate(cells, 1):
                    if col == 2:
                        case_no = escape_string(str(cell.decode_contents()))
                    elif col == 3:
                        judgment_date = escape_string(str(cell.a.string))
                        a_link = BeautifulSoup(str(cell), "html.parser").a.get('href')
                        # Fetch the judgment PDF as soon as its link is known.
                        pdf_data = escape_string(
                            request_pdf(base_url + a_link, case_no, court_name))
                        pdf_file = escape_string(base_url + a_link)
            elif 3 <= row_no <= 8:
                # Rows 3-8 each carry a single value in their second cell.
                value = None
                for col, cell in enumerate(cells, 1):
                    if col == 2:
                        value = escape_string(str(cell.decode_contents()))
                if value is not None:
                    if row_no == 3:
                        petitioner = value
                    elif row_no == 4:
                        respondent = value
                    elif row_no == 5:
                        petitioner_advocate = value
                    elif row_no == 6:
                        respondent_advocate = value
                    elif row_no == 7:
                        bench = value
                    elif row_no == 8:
                        judge_name = value
                        if case_no != "NULL":
                            insert_query(
                                "INSERT INTO " + str(court_name) +
                                " (diary_number, case_no, petitioner, respondent, "
                                "petitioner_advocate, respondent_advocate, judgment_date, "
                                "bench, judge_name, pdf_file, pdf_filename) VALUE ('" +
                                diary_number + "', '" + case_no + "', '" + petitioner +
                                "', '" + respondent + "', '" + petitioner_advocate +
                                "', '" + respondent_advocate + "', '" + judgment_date +
                                "', '" + bench + "', '" + judge_name + "', '" + pdf_file +
                                "', '" + court_name + "_" + slugify(case_no) + ".pdf')")
                            update_query("UPDATE " + court_name + " SET pdf_data = '" +
                                         str(pdf_data) + "' WHERE case_no = '" +
                                         str(case_no) + "'")
                            update_query(
                                "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" +
                                str(court_name) + "'")
            elif row_no == 9:
                # End of one record group: restart numbering and clear fields.
                row_no = 0
                case_no = "NULL"
                diary_number = "NULL"
                petitioner = "NULL"
                respondent = "NULL"
                petitioner_advocate = "NULL"
                respondent_advocate = "NULL"
                judgment_date = "NULL"
                judge_name = "NULL"
                bench = "NULL"
                pdf_data = "NULL"
                pdf_file = "NULL"
        return True
    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
                     str(court_name) + "'")
        return False
def request_data(court_name, start_date, end_date_):
    """Scrape monthly judgment listings from a Drupal AJAX search form.

    Dates end in a 2-digit year token; values below 11 predate the site's
    data and are flagged IN_NO_DATA_FOUND.  Each month's form postback is
    replayed, the AJAX JSON response scanned for its "data" fragment and
    the embedded HTML handed to parse_html().  Returns True on completion,
    False on error.
    """
    try:
        url = base_url + "/hcs/hcourt/hg_judgement_search"
        headers = {
            'Content-Type': "application/x-www-form-urlencoded",
            'Accept': "application/json",
            'Cache-Control': "no-cache"
        }
        if int(start_date[-2:]) < 11:
            update_query("UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', "
                         "emergency_exit=true WHERE Name = '" + str(court_name) + "'")
            if int(end_date_[-2:]) < 11:
                update_history_tracker(court_name)
                return True
        for month_year in month_list_([str(start_date), str(end_date_)]):
            # The site's year dropdown is offset by 10 from the 2-digit year.
            year = int(month_year[-2:]) - 10
            stop = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if stop['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            update_query("UPDATE Tracker SET Start_Date = '" + str(month_year) +
                         "', End_Date = '" + str(end_date_) + "' WHERE Name = '" +
                         str(court_name) + "'")
            querystring = {"ajax_form": "1", "_wrapper_format": "drupal_ajax"}
            payload = ("form_build_id=form-BS37MKVfuGmv9fgHWUqr3U9nFCjolonq-Nnenj3Ks24"
                       "&form_id=ajax_example_form"
                       "&ordermonth=" + str(month_year[:-2]).lstrip("0") +
                       "&orderyear=" + str(year) +
                       "&_triggering_element_name=op"
                       "&_triggering_element_value=Search"
                       "&_drupal_ajax=1"
                       "&ajax_page_state%5Btheme%5D=mytheme"
                       "&ajax_page_state%5Btheme_token%5D=%20"
                       "&ajax_page_state%5Blibraries%5D=asset_injector%2Fcss%2Fanimation_accordin%2Casset_injector"
                       "%2Fcss%2Fside_bar%2Casset_injector%2Fcss%2Ftable%2Casset_injector%2Fjs%2Fseperate_tab_%2C"
                       "core%2Fdrupal.ajax%2Ccore%2Fhtml5shiv%2Ccore%2Fjquery.form%2Cmytheme%2Fmylibrarynew%2C"
                       "system%2Fbase%2Cviews%2Fviews.module")
            response = requests.request("POST", url, data=payload, headers=headers,
                                        params=querystring, proxies=proxy_dict)
            json_res = json.loads(response.text)
            # The AJAX reply is a list of commands; only one carries markup.
            res = None
            for fragment in json_res:
                if "data" in fragment:
                    res = BeautifulSoup(str(fragment['data']), "html.parser")
                    break
            if res is None:
                logging.error("NO data Found for start date: " + str(month_year))
                update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 "
                             "WHERE Name = '" + str(court_name) + "'")
                continue
            if not parse_html(res, court_name):
                logging.error("Failed to parse data from date: " + str(month_year))
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def request_data(court_name, start_date, end_date_):
    """Fetch judgment listings in rolling 30-day windows (dd-mm-YYYY).

    POSTs the getJBJ date-range form for each window, delegating the HTML
    to parse_html().  Returns True on completion (or external stop),
    False on an unexpected error.
    """
    fmt = "%d-%m-%Y"
    try:
        url = base_url + 'php/getJBJ.php'
        headers = {
            'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
            'Cache-Control': "no-cache"
        }
        iteration = 0
        while True:
            iteration += 1
            stop = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if stop['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            end_date = (datetime.datetime.strptime(str(start_date), fmt)
                        + datetime.timedelta(days=30)).strftime(fmt)
            # Stop once the window has slid past the requested end date.
            if datetime.datetime.strptime(end_date_, fmt) + datetime.timedelta(days=30) \
                    < datetime.datetime.strptime(str(end_date), fmt):
                logging.error("END date Exceed.")
                break
            update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) +
                         "', End_Date = '" + str(end_date) + "' WHERE Name = '" +
                         str(court_name) + "'")
            payload = ("jorrop=J"
                       "&JBJfrom_date=" + str(start_date) +
                       "&JBJto_date=" + str(end_date))
            response = requests.request("POST", url, data=payload, headers=headers,
                                        verify=False, proxies=proxy_dict)
            res = response.text
            if "no data found" in res.lower():
                logging.error("NO data Found for start date: " + str(start_date))
                update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 "
                             "WHERE Name = '" + str(court_name) + "'")
                start_date = end_date
                continue
            if not parse_html(res, court_name):
                logging.error("Failed to parse data from date: " + str(start_date))
            start_date = end_date
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def parse_html(html_str, court_name):
    """Parse a judgments table (one case per <tr>) and insert each record.

    Ampersands are blanked out before parsing because the upstream markup
    embeds them unescaped.  Column order: 1=date, 2=judge, 3=case no,
    4=parties ("v/s"-separated), 5=PDF link.  Returns True on success,
    False on failure (Tracker error counter is incremented).
    """
    try:
        soup = BeautifulSoup(str(html_str).replace('&', ' '), "html.parser")
        for row_no, tr in enumerate(soup.find_all('tr'), 1):
            stop = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if stop is not None and stop['emergency_exit'] == 1:
                break
            if row_no == 1:  # header row
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            cells = BeautifulSoup(str(tr), "html.parser").find_all('td')
            for col, cell in enumerate(cells, 1):
                if col == 1:
                    judgment_date = escape_string(str(cell.decode_contents()))
                elif col == 2:
                    judge_name = escape_string(str(cell.decode_contents()))
                elif col == 3:
                    case_no = escape_string(str(cell.text))
                elif col == 4:
                    party = str(cell.decode_contents()).split("v/s")
                    petitioner = escape_string(str(party[0]))
                    respondent = escape_string(str(party[1]))
                elif col == 5:
                    a_tag = BeautifulSoup(str(cell), "html.parser").a
                    pdf_file = escape_string(str(base_url + a_tag.get('href')))
                    pdf_data = escape_string(
                        request_pdf(base_url + a_tag.get('href'), case_no, court_name))
            if case_no != "NULL":
                insert_query(
                    "INSERT INTO " + str(court_name) +
                    " (case_no, petitioner, respondent, judgment_date, "
                    "judge_name, pdf_file, pdf_filename) VALUE ('" +
                    case_no + "', '" + petitioner + "', '" + respondent + "', '" +
                    judgment_date + "', '" + judge_name + "', '" + pdf_file + "', '" +
                    court_name + "_" + slugify(case_no) + ".pdf')")
                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" +
                             str(court_name) + "'")
        return True
    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
                     str(court_name) + "'")
        return False
def request_data(court_name, start_date, end_date_):
    """Scrape an ASP.NET judgments page month-by-month.

    No records exist before 2010, so earlier ranges are flagged.  For each
    month the page is first GET-ed to harvest the __VIEWSTATE family of
    hidden fields, then the month/year dropdown postback is replayed in
    the same session.  Returns True on completion, False on error.
    """
    try:
        if int(start_date[-4:]) < 2010:
            update_query("UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', "
                         "emergency_exit=true WHERE Name = '" + str(court_name) + "'")
            if int(end_date_[-4:]) < 2010:
                update_history_tracker(court_name)
                return True
        for month_year in month_list_([str(start_date), str(end_date_)]):
            stop = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if stop['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            form_data = {
                'ctl00$CPHBody$DropDownListYear': str(month_year[-4:]),
                'ctl00$CPHBody$DropDownListMonth': str(month_year[:-4]).lstrip("0"),
                'ctl00$CPHBody$TextBox1': '',
                'ctl00$CPHBody$SM1':
                    'ctl00$CPHBody$SM1|ctl00$CPHBody$DropDownListMonth'
            }
            with requests.Session() as session:
                # Harvest the ASP.NET state tokens from a fresh GET of the page.
                page = session.get(base_url + 'judgement.aspx')
                soup = BeautifulSoup(page.content, "html.parser")
                for token in ("__VIEWSTATE", "__VIEWSTATEGENERATOR",
                              "__EVENTVALIDATION"):
                    form_data[token] = soup.select_one("#" + token)["value"]
                update_query("UPDATE Tracker SET Start_Date = '" + str(month_year) +
                             "' WHERE Name = '" + str(court_name) + "'")
                response = session.post(base_url + 'judgement.aspx', data=form_data)
                res = response.text
                if "no records were found." in res.lower() or \
                        "application error" in res.lower():
                    logging.error("NO data Found for start date: " + str(month_year))
                    update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 "
                                 "WHERE Name = '" + str(court_name) + "'")
                    continue
                if not parse_html(res, court_name):
                    logging.error("Failed to parse data")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def parse_html(html_str, court_name, flag):
    """Parse a monthly judgments table (class DISCOVERY3) and store rows.

    The page's centered <h4> carries "JUDGMENTS FOR THE MONTH OF <month
    year>", which is combined with each row's day number to build
    judgment_date.  `flag` shifts the column numbering by one for layouts
    that lack the leading serial column.  Returns True on success, False
    on failure (Tracker error counter is incremented).

    NOTE(review): assumes the party cell splits into >= 3 parts on
    "<br/>"; a row that doesn't aborts the whole parse via the broad
    except below.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        # Re-parse the prettified markup so decode_contents() output is stable.
        soup = BeautifulSoup(str(soup.prettify()), "html.parser")
        date_h4 = soup.find_all('h4', {'align': 'center'})[0]
        month_year = str(date_h4.text).replace('JUDGMENTS FOR THE MONTH OF', '').strip()
        table_list = soup.find_all('table', {'class': 'DISCOVERY3'})[0]
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')
        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break
            tr_count += 1
            if tr_count <= 1:  # header row
                continue
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            subject = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')
            # Some layouts have an extra leading column; `flag` realigns indices.
            if flag:
                i = 1
            else:
                i = 0
            for td in td_list:
                i += 1
                if i == 2:
                    judgment_day = escape_string(str(td.decode_contents()))
                    judgment_date = str(re.findall(r'\d+', str(judgment_day))[0]) + \
                        ", " + month_year.replace('JUDGEMENTS FOR THE MONTH OF', '')
                if i == 3:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = escape_string(str(base_url + a_tag.get('href')))
                    case_no = escape_string(str(a_tag.text).replace("\n", "").strip())
                    pdf_data = escape_string(
                        request_pdf(str(base_url + a_tag.get('href')), case_no,
                                    court_name))
                if i == 4:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    if font_tag is not None:
                        span_tag = font_tag.span
                    else:
                        span_tag = BeautifulSoup(str(td), "html.parser").span
                    if span_tag is None:
                        span_tag = BeautifulSoup(str(td), "html.parser")
                    party = str(span_tag.decode_contents()).split("<br/>")
                    petitioner = escape_string(
                        str(party[0]).replace(
                            '<td align="center" bgcolor="#FFFFFF" valign="middle" width="30%">',
                            '').strip())
                    # Strip literal "\xNN" escape residue left by bad encodings.
                    petitioner = re.sub(r'(\\x(.){2})', '', petitioner)
                    respondent = escape_string(str(party[2]).replace('</td>', '').strip())
                    respondent = re.sub(r'(\\x(.){2})', '', respondent)
                if i == 5:
                    subject = escape_string(str(td.decode_contents()).strip())
                if i == 6:
                    judge_name = escape_string(
                        str(td.text).replace(r'\x', '').replace('\\xC2\\x92BLE',
                                                                '').strip())
                    judge_name = re.sub(r'(\\x(.){2})', '', judge_name)
                    # BUGFIX: removed `re.sub(r'', '', judge_name, re.U)` — it
                    # passed re.U as the *count* argument (not flags) and, with
                    # an empty pattern and empty replacement, returned the
                    # string unchanged: a no-op that misused the re.sub API.
            if case_no != "NULL" and td_list:
                sql_query = "INSERT INTO " + str(court_name) + \
                    " (case_no, petitioner, respondent, judgment_date, " \
                    "subject, pdf_file, pdf_filename) VALUE ('" + case_no + \
                    "', '" + petitioner + "', '" + respondent + "', '" + \
                    judgment_date + "', '" + subject + "', '" + pdf_file + "', '" + \
                    court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)
                update_query("UPDATE " + court_name + " SET judge_name = '" +
                             str(judge_name) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" +
                             str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
                     str(court_name) + "'")
        return False
def parse_html(html_str, court_name, bench, child_url):
    """Parse the order list inside <div id="text"> and store every order.

    `child_url` is the path prefix for the relative PDF links; `bench` is
    recorded with each row as bench_code.  Column order: 1=case no,
    2=order date, 3=description + PDF link, 4=section.  Returns True on
    success, False on failure (Tracker error counter is incremented).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        div_soup = BeautifulSoup(str(soup.find_all('div', {'id': 'text'})[0]),
                                 'html.parser')
        for row_no, tr in enumerate(div_soup.find_all('tr'), 1):
            stop = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if stop is not None and stop['emergency_exit'] == 1:
                break
            if row_no == 1:  # header row
                continue
            case_no = "NULL"
            date_of_order = "NULL"
            description = "NULL"
            section = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            cells = BeautifulSoup(str(tr), "html.parser").find_all('td')
            for col, cell in enumerate(cells, 1):
                if col == 1:
                    case_no = escape_string(str(cell.text).strip().replace("\n", ""))
                elif col == 2:
                    date_of_order = escape_string(
                        str(cell.text).strip().replace("\n", ""))
                elif col == 3:
                    description = escape_string(str(cell.text).strip())
                    a_tag = BeautifulSoup(str(cell), "html.parser").font.a
                    pdf_url = base_url + child_url + a_tag.get('href')
                    pdf_file = escape_string(pdf_url)
                    pdf_data = escape_string(request_pdf(pdf_url, case_no, court_name))
                elif col == 4:
                    section = str(cell.text)
            if case_no != "NULL":
                insert_query(
                    "INSERT INTO " + str(court_name) +
                    " (case_no, date_of_order, description, section, "
                    "pdf_file, bench_code, pdf_filename) VALUE ('" +
                    case_no + "', '" + date_of_order + "', '" + description + "', '" +
                    section + "', '" + pdf_file + "', '" + str(bench) + "', '" +
                    court_name + "_" + slugify(case_no) + ".pdf')")
                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" + str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" +
                             str(court_name) + "'")
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
                     str(court_name) + "'")
        return False
def parse_html(html_str, court_name, bench_code):
    """Parse one-case-per-<table> judgment markup and insert each record.

    Cells are positional: 1=case no, 2=petitioner, 4=respondent,
    6=judgment date, 7=judge, 8=plain-text link, 9=PDF link.  The site
    banner table (petitioner 'Judgment Information System') is skipped.
    Returns True on success, False on failure (Tracker error counter is
    incremented).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        for table in soup.find_all('table'):
            stop = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if stop is not None and stop['emergency_exit'] == 1:
                break
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            text = "NULL"
            text_file = "NULL"
            pdf_file = "NULL"
            cells = BeautifulSoup(str(table), "html.parser").find_all('td')
            for col, cell in enumerate(cells, 1):
                if col == 1:
                    case_no = escape_string(str(cell.decode_contents()))
                elif col == 2:
                    petitioner = escape_string(str(cell.decode_contents()))
                elif col == 4:
                    respondent = escape_string(str(cell.decode_contents()))
                elif col == 6:
                    judgment_date = escape_string(str(cell.decode_contents()))
                elif col == 7:
                    judge_name = escape_string(str(cell.decode_contents()))
                elif col == 8:
                    a_link = BeautifulSoup(str(cell), "html.parser").a.get('href')
                    text_dir = request_text(base_url + a_link, case_no, court_name)
                    text = escape_string(text_dir['data'])
                    text_file = escape_string(base_url + a_link)
                elif col == 9:
                    a_link = BeautifulSoup(str(cell), "html.parser").a.get('href')
                    pdf_file = escape_string(base_url + a_link)
                    # Called for its download side effect; the value is unused.
                    pdf_data = escape_string(
                        request_pdf(base_url + a_link, case_no, court_name))
            if case_no != "NULL" and petitioner != 'Judgment Information System':
                insert_query(
                    "INSERT INTO " + str(court_name) +
                    " (case_no, petitioner, respondent, judgment_date, judge_name, "
                    "text_data, text_file, pdf_file, bench_code, pdf_filename) VALUE ('" +
                    case_no + "', '" + petitioner + "', '" + respondent + "', '" +
                    judgment_date + "', '" + judge_name + "', '" + text + "', '" +
                    text_file + "', '" + pdf_file + "', " + str(bench_code) + ", '" +
                    court_name + "_" + slugify(case_no) + ".txt')")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" +
                             str(court_name) + "'")
        return True
    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
                     str(court_name) + "'")
        return False
def request_data(headers, start_date, end_date_):
    """Fetch judgment listings in 180-day windows via the ByDate form.

    Dates are dd-mm-YYYY; day/month form fields are sent without leading
    zeros.  Each window's result page is handed to offset_link() for
    pagination.  Returns True on completion, False on error.

    NOTE(review): `court_name` is not a parameter here (unlike the sibling
    request_data functions) — it must exist as a module-level global.
    Confirm before calling from new code.
    """
    fmt = "%d-%m-%Y"
    try:
        url = base_url + "/ByDate.php"
        i = 0
        while True:
            i += 1
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True
            end_date = (datetime.datetime.strptime(str(start_date), fmt)
                        + datetime.timedelta(days=180)).strftime(fmt)
            if datetime.datetime.strptime(str(end_date_), fmt) + \
                    datetime.timedelta(days=180) < \
                    datetime.datetime.strptime(str(end_date), fmt):
                logging.error("DONE")
                break
            update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) +
                         "', End_Date = '" + str(end_date) + "' WHERE Name = '" +
                         str(court_name) + "'")
            # BUGFIX: was .replace("0", ""), which stripped *every* zero and so
            # mangled days/months containing one (day "10" -> "1", "20" -> "2",
            # "30" -> "3"); lstrip("0") drops only leading zeros, matching the
            # other scrapers in this module.
            payload = "date_day=" + str(start_date[0:2]).lstrip("0") + \
                      "&date_month=" + str(start_date[3:5]).lstrip("0") + \
                      "&date_year=" + str(start_date[6:]) + \
                      "&date_day1=" + str(end_date[0:2]).lstrip("0") + \
                      "&date_month1=" + str(end_date[3:5]).lstrip("0") + \
                      "&date_year1=" + str(end_date[6:]) + \
                      "&submit=Submit"
            response = requests.request("POST", url, data=payload, headers=headers,
                                        proxies=proxy_dict)
            res = response.text
            if "invalid inputs given" in res.lower():
                logging.error("NO data Found for start date: " + str(start_date))
                update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 "
                             "WHERE Name = '" + str(court_name) + "'")
                start_date = end_date
                continue
            if not offset_link(res, headers):
                logging.error("Failed to parse data from date: " + str(start_date))
            start_date = end_date
        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False