# NOTE: these snippets assume module-level globals and helpers defined elsewhere
# (base_url, headers, proxy_dict, select_one_query, update_query, insert_query,
# update_history_tracker, request_pdf, month_list_, escape_string, slugify, ...).
def offset_link(html_str, url, querystring, court_name):
    try:
        if not parse_html(html_str, court_name):
            return False

        querystring['sort_by'] = "1"
        querystring['etal'] = "-1"

        soup = BeautifulSoup(html_str, "html.parser")
        div_tag = soup.find_all('div', {'class': 'browse_range'})[0]

        total_records = int(re.findall(r'\d+', str(div_tag.text))[-1])
        total_calls = ceil(total_records/200)

        next_num = 0
        for page_link in range(0, total_calls):
            next_num += 200

            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

            querystring['offset'] = str(next_num)
            response = requests.request("GET", url, headers=headers, params=querystring, proxies=proxy_dict)
            res = response.text

            if not parse_html(res, court_name):
                logging.error("Failed for url: " + str(next_num))
                return False

        return True
    except Exception as e:
        logging.error("Error in offset_link. %s", e)
        return False
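
The examples here interpolate court_name straight into SQL via string concatenation and escape_string. A safer shape for the same Tracker lookup is DB-API parameter binding; a minimal sketch assuming PyMySQL (the connection details, the select_one helper, and the court name are hypothetical, not part of the original helpers):

import pymysql

connection = pymysql.connect(host="localhost", user="scraper",
                             password="change-me", database="courts",
                             cursorclass=pymysql.cursors.DictCursor)

def select_one(sql, params=None):
    # execute() binds %s placeholders, so values never need escape_string()
    # and cannot break out of the statement.
    with connection.cursor() as cursor:
        cursor.execute(sql, params)
        return cursor.fetchone()

# The emergency_exit check from offset_link, rewritten without concatenation.
row = select_one("SELECT emergency_exit FROM Tracker WHERE Name = %s",
                 ("Bombay_HC",))
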
Example 2
def request_data(court_name, bench, start_date, end_date_):
    try:
        url = base_url + "/tribunalorders"
        headers = {
            'Content-Type': "application/x-www-form-urlencoded",
            'Cache-Control': "no-cache"
        }
        i = 0
        while True:
            i += 1

            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

            end_date = (datetime.datetime.strptime(str(start_date), "%d/%m/%Y") + datetime.timedelta(days=1)
                        ).strftime("%d/%m/%Y")

            if datetime.datetime.strptime(str(end_date_), "%d/%m/%Y") + datetime.timedelta(days=1) < \
                    datetime.datetime.strptime(str(end_date), "%d/%m/%Y"):
                logging.error("DONE")
                break

            update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) + "', End_Date = '" +
                         str(end_date) + "' WHERE Name = '" + str(court_name) + "'")

            payload = "bench=" + str(bench) + \
                      "&appeal_type=" \
                      "&hearingdate=" \
                      "&pronouncementdate=" \
                      "&orderdate=" + str(start_date) + \
                      "&member=" \
                      "&assesseename="

            response = requests.request("POST", url, data=payload, headers=headers, verify=False, proxies=proxy_dict)
            res = response.text

            if not res:  # response.text is never None; treat an empty body as no data
                logging.error("NO data Found.")
                update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                             str(court_name) + "'")

                start_date = end_date
                continue

            if not parse_html(res, court_name, bench):
                logging.error("Failed to parse data from bench: " + str(bench))

            start_date = end_date

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from bench: " + str(bench))
        logging.error("Failed to request: %s", e)
        return False
def request_data(court_name, headers, start_date, end_date_):
    try:
        url = base_url + "coram-reported-judgment.php"

        i = 0
        while True:
            i += 1

            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

            end_date = (datetime.datetime.strptime(str(start_date), "%d-%m-%Y") + datetime.timedelta(days=1)
                        ).strftime("%d-%m-%Y")

            if datetime.datetime.strptime(str(end_date_), "%d-%m-%Y") + datetime.timedelta(days=1) < \
                    datetime.datetime.strptime(str(end_date), "%d-%m-%Y"):
                logging.error("DONE")
                break

            update_query("UPDATE Tracker SET Start_Date = '" + str(start_date) + "', End_Date = '" +
                         str(end_date) + "' WHERE Name = '" + str(court_name) + "'")

            payload = "coram=0" \
                      "&ojtype=1" \
                      "&bench_type=0" \
                      "&reported=Y" \
                      "&startdate=" + str(start_date) + \
                      "&enddate=" + str(end_date) + \
                      "&coramqueryreported=0"

            response = requests.request("POST", url, data=payload, headers=headers, proxies=proxy_dict)
            res = response.text

            if "NO ROWS" in res.upper():
                logging.error("NO data Found for start date: " + str(start_date))
                update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                             str(court_name) + "'")

                start_date = end_date
                continue

            if not offset_link(res, payload, court_name, headers):
                logging.error("Failed to parse data from date: " + str(start_date))

            start_date = end_date

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
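
Both request_data variants above crawl one day at a time by re-deriving end_date inside a while True loop and breaking once end_date_ is passed. The same traversal reads more directly as a generator; a sketch assuming the dd/mm/YYYY format used above (date_windows is hypothetical):

import datetime

def date_windows(start, end, fmt="%d/%m/%Y", step_days=1):
    # Yield (window_start, window_end) string pairs until `end` is covered.
    cur = datetime.datetime.strptime(start, fmt)
    last = datetime.datetime.strptime(end, fmt)
    step = datetime.timedelta(days=step_days)
    while cur <= last:
        yield cur.strftime(fmt), (cur + step).strftime(fmt)
        cur += step

# Usage: the one-day windows the tribunal loop above requests.
for window_start, window_end in date_windows("01/01/2018", "05/01/2018"):
    print(window_start, "->", window_end)
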
Example 4
def parse_html(html_str, court_name):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        div = soup.find_all('div', {'id': 'CPHBody_PanelList'})[0]
        a_list_soup = BeautifulSoup(str(div), "html.parser")
        a_list = a_list_soup.find_all('a')

        a_list_unique = list(set(a_list))
        for a in a_list_unique:
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            case_no = escape_string(str(a.text)[:-10].replace("-", ""))
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            judgment_date = escape_string(str(a.text)[-10:])

            # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
            #     insert_check = True

            a_link = a.get('href')
            pdf_data = escape_string(
                request_pdf(base_url + a_link, case_no, court_name))
            pdf_file = escape_string(base_url + a_link)

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, judgment_date, pdf_file, pdf_filename) " \
                                                               "VALUE ('" + case_no + "', '" + judgment_date + "', '" \
                            + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def parse_html(html_str, court_name):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        ul = soup.find_all('ul')[0]
        ul_soup = BeautifulSoup(str(ul), "html.parser")
        li_list = ul_soup.find_all('li')

        # p_list = ul_soup.find_all('p')
        # p_list = [x for x in p_list if "<p><font" not in str(x)]
        # print(p_list)
        # return

        for li in li_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            a = BeautifulSoup(str(li), "html.parser").a
            a_link = a.get('href')

            case_no = str(a_link[a_link.rfind("/")+1:]).replace('.pdf', '')
            judgment_date = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
            #     insert_check = True

            judgment_date = escape_string(case_no[-10:].replace('(', '').replace(')', ''))
            pdf_data = escape_string(request_pdf(base_url + a_link, case_no, court_name))
            pdf_file = escape_string(base_url + a_link)

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, judgment_date, pdf_file, pdf_filename) " \
                                                               "VALUE ('" + case_no + "', '" + judgment_date + "', '" \
                            + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET text_data = '" + str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
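
A pattern worth noting in the two parse_html variants above: tags are repeatedly round-tripped through str() and a fresh BeautifulSoup parse (e.g. BeautifulSoup(str(li), "html.parser").a). BeautifulSoup tags are already navigable, so the re-parse is unnecessary; a sketch with a made-up href:

from bs4 import BeautifulSoup

html = '<ul><li><a href="/orders/WP-123-2020(01-02-2020).pdf">WP 123</a></li></ul>'
soup = BeautifulSoup(html, "html.parser")

for li in soup.find_all('li'):
    a = li.a                                   # navigate the existing tag directly
    a_link = a.get('href')
    case_no = a_link[a_link.rfind("/") + 1:].replace('.pdf', '')
    print(case_no)                             # WP-123-2020(01-02-2020)
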
Example 6
def request_data(court_name, start_date, end_date_):
    try:
        headers = {
            'Cache-Control': "no-cache",
        }

        if int(start_date[-2:]) < 10:
            update_query("UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', emergency_exit=true WHERE Name = '" +
                         str(court_name) + "'")
            if int(end_date_[-2:]) < 10:
                update_history_tracker(court_name)
                return True

        for month_year in month_list_([str(start_date), str(end_date_)]):
            month_year = date_fix(month_year)

            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

            url = base_url + "JDMT" + str(month_year) + ".html"

            update_query("UPDATE Tracker SET Start_Date = '" + str(month_year) + "', End_Date = '" +
                         str(end_date_) + "' WHERE Name = '" + str(court_name) + "'")

            response = requests.request("GET", url, headers=headers, proxies=proxy_dict)
            res = response.text

            if "file or directory not found" in res.upper():
                logging.error("NO data Found for start date: " + str(month_year))

                update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                             str(court_name) + "'")

                continue

            if str(month_year[-2:]) == '10' or str(month_year) == 'Jan11':
                if not parse_html(res, court_name, True):
                    logging.error("Failed to parse data from date: " + str(month_year))
            else:
                if not parse_html(res, court_name, False):
                    logging.error("Failed to parse data from date: " + str(month_year))

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)

        return False
Example 7
def request_data(court_name, bench, start_date, end_date_):
    try:
        for year in range(start_date, end_date_ + 1):
            if int(year) < 2010 or int(year) > 2016:
                logging.error("NO data Found for start date: " +
                              str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                    + str(court_name) + "'")
                continue

            section_types = ['111_111_A', '397_398', 'Others']
            for section_type in section_types:

                child_url = str(bench) + '/' + str(year) + '/' + str(
                    section_type) + '/'
                url = base_url + child_url + 'index.html'

                emergency_exit = select_one_query(
                    "SELECT emergency_exit FROM Tracker WHERE Name='" +
                    court_name + "'")
                if emergency_exit['emergency_exit'] == 1:
                    update_history_tracker(court_name)
                    return True

                update_query("UPDATE Tracker SET Start_Date = '" + str(year) +
                             "', End_Date = '" + str(year) +
                             "' WHERE Name = '" + str(court_name) + "'")

                response = requests.request("GET", url, proxies=proxy_dict)
                res = response.text

                if not res:  # .text is never None; check for an empty body
                    logging.error("NO data Found for year: " + str(year))
                    update_query(
                        "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                        + str(court_name) + "'")
                    continue

                if not parse_html(res, court_name, bench, child_url):
                    logging.error("Failed to parse data for year: " +
                                  str(year))

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
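
None of the requests.request(...) calls in these examples set a timeout or retry, so one transient proxy failure sends a whole crawl into the except branch. A shared session with bounded retries is a common hardening step; a sketch using the standard requests/urllib3 API (the proxies argument stands in for the module-level proxy_dict):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(proxies=None):
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1.0,
                  status_forcelist=(500, 502, 503, 504))
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)    # retry both schemes
    session.mount("https://", adapter)
    if proxies:
        session.proxies.update(proxies)
    return session

# Usage: a drop-in for the bare GET/POST calls in the loops above.
session = make_session()
response = session.get("https://example.org/index.html", timeout=30)
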
def offset_link(html_str, headers):
    # NOTE: court_name is read from module scope here, unlike the variants that take it as a parameter.
    try:
        if not parse_html(html_str):
            return False

        soup = BeautifulSoup(html_str, "html.parser")
        td_tag = soup.find_all('td', {
            'height': '172',
            'align': 'center',
            'valign': 'top'
        })[0]
        td_soup = BeautifulSoup(str(td_tag), "html.parser")
        for table in td_soup.find_all("table"):
            table.decompose()

        a_tags = td_soup.find_all('a')
        a_link_list = []

        for a_tag in a_tags:
            a_link = base_url + a_tag.get('href')
            a_link_list.append(a_link)

        a_link_list_unique = list(set(a_link_list))
        i = 0
        for page_link in a_link_list_unique:
            i += 1

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                break

            if page_link != 'http://hcmjudgment.man.nic.in/ByDate.php?page=1':
                response = requests.request("POST",
                                            page_link,
                                            headers=headers,
                                            proxies=proxy_dict)
                res = response.text

                if not parse_html(res):
                    logging.error("Failed for url: " + page_link)
                    return False

        return True
    except Exception as e:
        logging.error("Error in offset_link. %s", e)
        return False
def offset_link(html_str, headers, court_name):
    try:
        if not parse_html(html_str, court_name, headers):
            return False

        soup = BeautifulSoup(html_str, "html.parser")
        table_tag = soup.find_all('table', {'id': 'tables11'})[0]
        table_soup = BeautifulSoup(str(table_tag), "html.parser")
        tr_tag = table_soup.find_all('tr', {'align': 'center'})
        if len(tr_tag) <= 0:
            return True

        tr_tag = tr_tag[1]
        tr_soup = BeautifulSoup(str(tr_tag), "html.parser")
        a_tags = tr_soup.find_all('a')
        a_link_list = []

        for a_tag in a_tags:
            a_link = base_url + a_tag.get('href')
            a_link_list.append(a_link)

        a_link_list_unique = list(set(a_link_list))
        i = 0
        for page_link in a_link_list_unique:
            i += 1

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                break

            if page_link != "https://phhc.gov.in./home.php?search_param=free_text_search_judgment&page_no=1":
                response = requests.request("POST",
                                            page_link,
                                            headers=headers,
                                            verify=False,
                                            proxies=proxy_dict)
                res = response.text

                if not parse_html(res, court_name, headers):
                    logging.error("Failed for url: " + page_link)
                    return False

        return True
    except Exception as e:
        logging.error("Error in offset_link. %s", e)
        return False
def request_data(court_name, start_date, end_date_):
    try:
        if int(start_date) < 2012:
            update_query("UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', emergency_exit=true WHERE Name = '" +
                         str(court_name) + "'")
            if int(end_date_) < 2012:
                update_history_tracker(court_name)
                return True

        for year_ in range(int(start_date), int(end_date_) + 1):
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

            if int(year_) == 2018:
                year_ = ''

            url = base_url + "DecisionsHeadline" + str(year_) + ".html"

            update_query("UPDATE Tracker SET Start_Date = '" + str(year_) + "', End_Date = '" + str(end_date_) +
                         "' WHERE Name = '" + str(court_name) + "'")

            response = requests.request("GET", url, proxies=proxy_dict)
            res = response.text

            if "file or directory not found" in res.lower():
                logging.error("NO data Found for start date: " + str(year_))

                update_query("UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '" +
                             str(court_name) + "'")

                continue

            if not parse_html(res, court_name):
                logging.error("Failed to parse data from date: " + str(year_))

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
Example 11
def offset_link(html_str, headers, court_name, dc):
    try:
        if not parse_html(html_str, court_name, dc):
            return False

        soup = BeautifulSoup(html_str, "html.parser")
        p_tag = soup.find_all('p', {'class': 'style2'})[1]
        p_soup = BeautifulSoup(str(p_tag), "html.parser")
        a_tags = p_soup.find_all('a')
        a_link_list = []

        for a_tag in a_tags:
            a_link = base_url + a_tag.get('href')
            a_link_list.append(a_link)

        a_link_list_unique = list(set(a_link_list))
        i = 0
        for page_link in a_link_list_unique:
            i += 1

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                break

            if page_link != "http://lobis.nic.in/juddt1.php?offset=0":
                response = requests.request("POST",
                                            page_link,
                                            headers=headers,
                                            proxies=proxy_dict)
                res = response.text

                if not parse_html(res, court_name, dc):
                    logging.error("Failed for url: " + page_link)
                    return False

        return True
    except Exception as e:
        logging.error("Error in offset_link. %s", e)
        return False
def offset_link(html_str, o_payload, court_name, headers):
    url = base_url + "coram-reported-judgment.php"
    try:
        if not parse_html(html_str, court_name):
            return False

        soup = BeautifulSoup(html_str, "html.parser")
        table_tag = soup.find_all('table')[1]
        table_soup = BeautifulSoup(str(table_tag), "html.parser")
        b_tag = table_soup.find_all('b')[0]
        if str(b_tag.decode_contents()).lower().find('no record found') != -1:
            return True

        total_records = int(re.findall(r'\d+', str(b_tag.decode_contents()))[-1])
        total_calls = ceil(total_records/15)

        next_num = 0
        for page_link in range(0, total_calls):
            next_num += 15

            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                break

            payload = o_payload + "&start=" + str(next_num)
            response = requests.request("POST", url, data=payload, headers=headers, proxies=proxy_dict)
            res = response.text

            if not parse_html(res, court_name):
                logging.error("Failed for url: " + str(next_num))
                return False

        return True
    except Exception as e:
        traceback.print_exc()
        logging.error("Error in offset_link. %s", e)
        return False
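
The pagination arithmetic in offset_link above is a ceiling division: total_records over a fixed page size (15 rows here, 200 in the first example), with the first page already in hand. The loop above bumps next_num once per iteration and so appears to request one offset past the last page; a sketch of just the intended offsets (page_offsets is hypothetical):

from math import ceil

def page_offsets(total_records, page_size):
    # Offsets for every page after the first (already parsed from html_str).
    total_calls = ceil(total_records / page_size)
    return [page * page_size for page in range(1, total_calls)]

# 47 records at 15 per page -> 4 pages -> remaining offsets 15, 30 and 45.
print(page_offsets(47, 15))    # [15, 30, 45]
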
def parse_html(html_str, court_name, court_id):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table')

        for table in table_list:
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            case_no = "NULL"
            judgment_date = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            table_soup = BeautifulSoup(str(table), "html.parser")
            td_list = table_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(str(td.decode_contents()))

                # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                #     insert_check = True

                if i == 3:
                    judgment_date = escape_string(str(td.decode_contents()))
                if i == 4:
                    pdf_file = base_url + BeautifulSoup(
                        str(td), "html.parser").a.get('href')
                    pdf_data = escape_string(
                        request_pdf(pdf_file, case_no, court_name))

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, court_id, judgment_date, pdf_file, " \
                                                               "pdf_filename) VALUE ('" + case_no + "', " + court_id + \
                            ", '" + judgment_date + "', '" + pdf_file + "', '" + court_name + "_" + \
                            slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def parse_html(html_str, court_name, bench, start_date):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_soup = BeautifulSoup(
            str(soup.find_all('table', {'class': 'hoverTable'})[0]),
            'html.parser')
        tr_list = table_soup.find_all('tr')

        if not tr_list:
            logging.error("NO data Found for start date: " + str(start_date))
            update_query(
                "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                + str(court_name) + "'")
            return True

        tr_count = 0
        for tr in tr_list:

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count == 1:
                continue

            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue

                if i == 2:
                    case_no = escape_string(
                        str(td.text).strip().replace("\n", ""))

                if i == 3:
                    party = str(td.decode_contents()).split("<br/>")
                    petitioner = escape_string(str(party[0]).strip())
                    respondent = escape_string(str(party[2]).strip())

                if i == 4:
                    judge_name = escape_string(str(td.text).strip())

                if i == 5:
                    judgment_date = escape_string(str(td.text).strip())

                # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                #     insert_check = True

                if i == 7:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = base_url + a_tag.get('href')
                    # pdf_data = escape_string(request_pdf(pdf_file, case_no, court_name))

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "judge_name, pdf_file, bench_code, pdf_filename) VALUE"\
                                                               " ('" + case_no + "', '" + petitioner + "', '" + \
                            respondent + "', '" + judgment_date + "', '" + judge_name + "', '" + pdf_file + "', '" + \
                            str(bench) + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def request_data(court_name, court_id, start_date, end_date_):
    try:
        url = base_url + "dtquery_new_v1.asp"
        headers = {
            'Content-Type': "application/x-www-form-urlencoded",
            'Cache-Control': "no-cache"
        }

        i = 0
        while True:
            i += 1

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

            end_date = (
                datetime.datetime.strptime(str(start_date), "%d/%m/%Y") +
                datetime.timedelta(days=180)).strftime("%d/%m/%Y")

            if datetime.datetime.strptime(end_date_, "%d/%m/%Y") + datetime.timedelta(days=180) < \
                    datetime.datetime.strptime(str(end_date), "%d/%m/%Y"):
                logging.error("DONE")
                break

            update_query("UPDATE Tracker SET Start_Date = '" +
                         str(start_date) + "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")

            payload = "action=validate_login" \
                      "&Court_Id=" + str(court_id) + \
                      "&party=jus" \
                      "&FromDt=" + str(start_date) + \
                      "&ToDt=" + str(end_date)

            response = requests.request("POST",
                                        url,
                                        data=payload,
                                        headers=headers,
                                        proxies=proxy_dict)
            res = response.text

            if "no data found" in res.lower():
                logging.error("NO data Found for start date: " +
                              str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                    + str(court_name) + "'")

                start_date = end_date
                continue

            if not parse_html(res, court_name, court_id):
                logging.error("Failed to parse data from date: " +
                              str(start_date))

            start_date = end_date

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def parse_html(html_str, court_name, headers):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table', {'id': 'tables11'})
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count <= 3:
                continue

            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            table_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = table_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue

                if i == 2:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    case_no = escape_string(str(a_tag.text))

                if i == 3:
                    party = str(td.decode_contents()).split("Vs")
                    petitioner = escape_string(str(party[0]))
                    respondent = escape_string(str(party[1]))

                if i == 4:
                    judgment_date = escape_string(str(td.decode_contents()))

                # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                #     insert_check = True

                if i == 5:
                    a_link = BeautifulSoup(str(td),
                                           "html.parser").a.get('onclick')
                    a_formatted = str(
                        str(a_link).replace("window.open('",
                                            "")).replace("')", "")
                    pdf_file = escape_string(base_url + "/" + a_formatted)

                    # pdf_data = escape_string(request_pdf(
                    #     str(pdf_file).replace(base_url + "download_file.php?auth=", ""), case_no, court_name,
                    #     headers))

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "pdf_file, pdf_filename) VALUE ('" + case_no + "', '" + \
                            petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + pdf_file + "', '" + \
                            court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
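
Several parse_html variants here walk each row's cells with a manual counter (i == 2 is the case number, i == 3 the parties, and so on). enumerate plus explicit positions states the same column layout more compactly; a sketch with made-up row HTML matching the layout assumed above:

from bs4 import BeautifulSoup

row_html = ("<tr><td>1</td><td><a>CWP-100-2019</a></td>"
            "<td>State Vs Sharma</td><td>01-02-2019</td></tr>")
td_list = BeautifulSoup(row_html, "html.parser").find_all('td')

record = {}
for position, td in enumerate(td_list, start=1):
    if position == 2:
        record['case_no'] = td.get_text(strip=True)
    elif position == 3:
        petitioner, _, respondent = td.get_text(strip=True).partition("Vs")
        record['petitioner'] = petitioner.strip()
        record['respondent'] = respondent.strip()
    elif position == 4:
        record['judgment_date'] = td.get_text(strip=True)

print(record)
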
def request_data(court_name, bench, headers, start_date, end_date_):
    try:
        url = base_url + '/' + str(bench) + "/services/judgement_status.php"

        i = 0
        while True:
            i += 1

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

            end_date = (
                datetime.datetime.strptime(str(start_date), "%Y-%m-%d") +
                datetime.timedelta(days=30)).strftime("%Y-%m-%d")

            if datetime.datetime.strptime(str(end_date_), "%Y-%m-%d") + datetime.timedelta(days=30) < \
                    datetime.datetime.strptime(str(end_date), "%Y-%m-%d"):
                logging.error("DONE")
                break

            update_query("UPDATE Tracker SET Start_Date = '" +
                         str(start_date) + "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")

            payload = "case_no=" \
                      "&case_type=0" \
                      "&case_year=" \
                      "&filing_no=" \
                      "&from_date=" \
                      "&from_date1=" + str(start_date) + \
                      "&judge_detail=0" \
                      "&search_type=3" \
                      "&to_date=" \
                      "&to_date1=" + str(end_date) + \
                      "&txtState=" \
                      "&txtSubject="

            response = requests.request("POST",
                                        url,
                                        data=payload,
                                        headers=headers,
                                        proxies=proxy_dict)
            res = response.text

            if not res:  # .text is never None; check for an empty body
                logging.error("NO data Found for start date: " +
                              str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                    + str(court_name) + "'")

                start_date = end_date
                continue

            if not parse_html(res, court_name, bench, start_date):
                logging.error("Failed to parse data from date: " +
                              str(start_date))

            start_date = end_date

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
def parse_html(html_str):
    # NOTE: court_name and base_url are read from module scope in this variant.
    try:
        soup = BeautifulSoup(str(html_str), "html.parser")

        table_soup = BeautifulSoup(
            str(soup.find_all('table', {"width": "100%"})[0]), "html.parser")
        tr_list = table_soup.select('tr')

        tr_count = 0
        for tr in tr_list:

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count <= 2:
                continue

            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.select('td')

            i = 0
            for td in td_list:
                i += 1

                if i == 1:
                    continue

                if i == 2 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    case_no = escape_string(str(font_tag.text))

                # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                #     insert_check = True

                if i == 3 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    respondent = escape_string(str(font_tag.text))

                if i == 4 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    petitioner = escape_string(str(font_tag.text))

                if i == 5 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    judgment_date = escape_string(str(font_tag.text))

                if td.get('align') == 'left':
                    td_soup1 = BeautifulSoup(str(td), "html.parser")
                    judge_name = escape_string(str(td_soup1.text))

                if td.get('align') == 'center':
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    a_tag = BeautifulSoup(str(font_tag), "html.parser").a
                    pdf_file = escape_string(base_url + "/" +
                                             a_tag.get('href'))
                    # str -> utf-8 bytes -> str is a no-op round trip; call request_pdf directly
                    pdf_data = escape_string(
                        str(request_pdf(base_url + "/" + a_tag.get('href'),
                                        case_no)))

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "judge_name, pdf_file, pdf_filename) VALUE ('" + \
                            case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + \
                            judge_name + "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def request_data(court_name, headers, start_date, end_date_):
    try:
        url = base_url + "/home.php"

        i = 0
        while True:
            i += 1

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

            end_date = (
                datetime.datetime.strptime(str(start_date), "%d/%m/%Y") +
                datetime.timedelta(days=1)).strftime("%d/%m/%Y")

            if datetime.datetime.strptime(end_date_, "%d/%m/%Y") + datetime.timedelta(days=1) < \
                    datetime.datetime.strptime(str(end_date), "%d/%m/%Y"):
                logging.error("DONE")
                break

            update_query("UPDATE Tracker SET Start_Date = '" +
                         str(start_date) + "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")

            querystring = {"search_param": "free_text_search_judgment"}

            payload = "t_case_type=" \
                      "&t_case_year=" \
                      "&submit=Search%20Case" \
                      "&from_date=" + str(start_date) + \
                      "&to_date=" + str(end_date) + \
                      "&pet_name=" \
                      "&res_name=" \
                      "&free_text=Justice"

            response = requests.request("POST",
                                        url,
                                        data=payload,
                                        headers=headers,
                                        params=querystring,
                                        verify=False,
                                        proxies=proxy_dict)
            res = response.text

            if "no data found" in res.lower():
                logging.error("NO data Found for start date: " +
                              str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                    + str(court_name) + "'")
                sleep(2)

                start_date = end_date
                continue

            if not offset_link(res, headers, court_name):
                logging.error("Failed to parse data from date: " +
                              str(start_date))

            start_date = end_date

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
Example 20
def request_data(court_name, start_date, end_date_):
    try:
        headers = {
            'Content-Type': "application/x-www-form-urlencoded",
            'Cache-Control': "no-cache",
        }
        url = base_url + '/judgementsdetails.asp'

        appeal_types = [
            'NDPS/FPA/ND', 'PMLA/FPA-PMLA', 'SAFEMA/FPA-1', 'FPA/BP',
            'FEMA/FERA/FPA-FE'
        ]

        if int(start_date[-4:]) < 2013:
            update_query(
                "UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', emergency_exit=true WHERE Name = '"
                + str(court_name) + "'")
            if int(end_date_[-4:]) < 2013:
                update_history_tracker(court_name)
                return True

        for month_year in month_list_([str(start_date), str(end_date_)]):
            for appeal_type in appeal_types:
                emergency_exit = select_one_query(
                    "SELECT emergency_exit FROM Tracker WHERE Name='" +
                    court_name + "'")
                if emergency_exit['emergency_exit'] == 1:
                    update_history_tracker(court_name)
                    return True

                update_query("UPDATE Tracker SET Start_Date = '" +
                             str(month_year) + "', End_Date = '" +
                             str(month_year) + "' WHERE Name = '" +
                             str(court_name) + "'")

                payload = "ACTAPPEALTYPE=" + appeal_type + \
                          "&DDMONTH=" + str(month_year[:-4]) + \
                          "&DDYEAR=" + str(month_year[-4:])

                response = requests.request("POST",
                                            url,
                                            data=payload,
                                            headers=headers,
                                            proxies=proxy_dict)
                res = response.text

                if 'there are no records at present' in res.lower():
                    logging.error("NO data Found for year: " + str(month_year))
                    update_query(
                        "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                        + str(court_name) + "'")
                    continue

                if not parse_html(res, court_name, appeal_type):
                    logging.error("Failed to parse data for year: " +
                                  str(month_year))

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
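
month_list_ above is an external helper whose output format isn't shown; the payload in this example slices month_year[:-4] and month_year[-4:], which suggests a month name glued to a four-digit year. A generator producing such tokens, sketched under that assumption (month_tokens is hypothetical; other examples appear to use different month formats):

import datetime

def month_tokens(start, end, fmt="%B%Y"):
    # Yield 'November2013'-style tokens from start to end (inclusive); both are datetime.date.
    cur = datetime.date(start.year, start.month, 1)
    while cur <= end:
        yield cur.strftime(fmt)
        cur = datetime.date(cur.year + (cur.month == 12), cur.month % 12 + 1, 1)

# Usage:
for token in month_tokens(datetime.date(2013, 11, 1), datetime.date(2014, 2, 1)):
    print(token)    # November2013 ... February2014
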
Example 21
def parse_html(html_str, court_name, appeal_type):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_soup = BeautifulSoup(
            str(soup.find_all('table', {'class': 'table table-bordered'})[0]),
            'html.parser')
        tr_list = table_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count == 1:
                continue

            case_no = "NULL"
            date_of_order = "NULL"
            appellant = "NULL"
            respondent = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue

                if i == 2:
                    case_no = escape_string(
                        str(td.text).strip().replace("\n", ""))

                if i == 3:
                    date_of_order = escape_string(
                        str(td.text).strip().replace("\n", ""))

                # if select_count_query(str(court_name), str(case_no), 'date_of_order', date_of_order):
                #     insert_check = True

                if i == 4:
                    party = str(td.decode_contents()).split("V/s")
                    appellant = escape_string(str(party[0]))
                    respondent = escape_string(str(party[1]))

                if i == 5:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_url = str(base_url + a_tag.get('href')).replace(
                        '\\', '/')
                    pdf_file = escape_string(pdf_url)
                    pdf_data = escape_string(
                        request_pdf(pdf_url, case_no, court_name))

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, date_of_order, appellant, respondent, " \
                                                               "pdf_file, appeal_type, pdf_filename) VALUE ('" + \
                            case_no + "', '" + date_of_order + "', '" + appellant + "', '" + respondent + "', '" + \
                            pdf_file + "', '" + appeal_type + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
Example 22
def parse_html(html_str, court_name):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        tr_list = soup.find_all('tr')

        case_no = "NULL"
        diary_number = "NULL"
        petitioner = "NULL"
        respondent = "NULL"
        petitioner_advocate = "NULL"
        respondent_advocate = "NULL"
        judgment_date = "NULL"
        judge_name = "NULL"
        bench = "NULL"
        pdf_data = "NULL"
        pdf_file = "NULL"

        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            if tr_count == 1:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        diary_number = escape_string(str(td.decode_contents()))

            if tr_count == 2:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        case_no = escape_string(str(td.decode_contents()))
                    if td_count == 3:
                        judgment_date = escape_string(str(td.a.string))
                        a_link = BeautifulSoup(str(td),
                                               "html.parser").a.get('href')
                        pdf_data = escape_string(
                            request_pdf(base_url + a_link, case_no,
                                        court_name))
                        pdf_file = escape_string(base_url + a_link)

            if tr_count == 3:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        petitioner = escape_string(str(td.decode_contents()))

            if tr_count == 4:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        respondent = escape_string(str(td.decode_contents()))

            if tr_count == 5:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        petitioner_advocate = escape_string(
                            str(td.decode_contents()))

            if tr_count == 6:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        respondent_advocate = escape_string(
                            str(td.decode_contents()))

            if tr_count == 7:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        bench = escape_string(str(td.decode_contents()))

            if tr_count == 8:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        judge_name = escape_string(str(td.decode_contents()))

                # if case_no != "NULL" and select_count_query(str(court_name), str(case_no), 'judgment_date',
                #                                             judgment_date):
                if case_no != "NULL":
                    sql_query = "INSERT INTO " + str(court_name) + \
                                " (diary_number, case_no, petitioner, respondent, petitioner_advocate, " \
                                "respondent_advocate, judgment_date, bench, judge_name, pdf_file, pdf_filename) VALUE "\
                                "('" + diary_number + "', '" + case_no + "', '" + petitioner + "', '" + respondent + \
                                "', '" + petitioner_advocate + "', '" + respondent_advocate + "', '" + judgment_date + \
                                "', '" + bench + "', '" + judge_name + "', '" + pdf_file + "', '" + court_name + "_" \
                                + slugify(case_no) + ".pdf')"
                    insert_query(sql_query)

                    update_query("UPDATE " + court_name + " SET pdf_data = '" +
                                 str(pdf_data) + "' WHERE case_no = '" +
                                 str(case_no) + "'")
                    update_query(
                        "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                        + str(court_name) + "'")

            if tr_count == 9:
                tr_count = 0
                case_no = "NULL"
                diary_number = "NULL"
                petitioner = "NULL"
                respondent = "NULL"
                petitioner_advocate = "NULL"
                respondent_advocate = "NULL"
                judgment_date = "NULL"
                judge_name = "NULL"
                bench = "NULL"
                pdf_data = "NULL"
                pdf_file = "NULL"

        return True

    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
Example 23
def request_data(court_name, start_date, end_date_):
    try:
        url = base_url + "/hcs/hcourt/hg_judgement_search"
        headers = {
            'Content-Type': "application/x-www-form-urlencoded",
            'Accept': "application/json",
            'Cache-Control': "no-cache"
        }

        if int(start_date[-2:]) < 11:
            update_query(
                "UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', emergency_exit=true WHERE Name = '"
                + str(court_name) + "'")
            if int(end_date_[-2:]) < 11:
                update_history_tracker(court_name)
                return True

        for month_year in month_list_([str(start_date), str(end_date_)]):
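            # The site's order-year dropdown appears to index years from 2010; hence the -10 offset.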
            year = int(month_year[-2:]) - 10

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

            update_query("UPDATE Tracker SET Start_Date = '" +
                         str(month_year) + "', End_Date = '" + str(end_date_) +
                         "' WHERE Name = '" + str(court_name) + "'")

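            # These query parameters mark the request as a Drupal AJAX call so the server
            # returns a JSON command list instead of a full page.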
            querystring = {"ajax_form": "1", "_wrapper_format": "drupal_ajax"}

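            # NOTE: the hard-coded form_build_id belongs to a cached Drupal form and can
            # expire; a fresh one would normally be scraped from the search page first.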
            payload = "form_build_id=form-BS37MKVfuGmv9fgHWUqr3U9nFCjolonq-Nnenj3Ks24" \
                      "&form_id=ajax_example_form" \
                      "&ordermonth=" + str(month_year[:-2]).lstrip("0") + \
                      "&orderyear=" + str(year) + \
                      "&_triggering_element_name=op" \
                      "&_triggering_element_value=Search" \
                      "&_drupal_ajax=1" \
                      "&ajax_page_state%5Btheme%5D=mytheme" \
                      "&ajax_page_state%5Btheme_token%5D=%20" \
                      "&ajax_page_state%5Blibraries%5D=asset_injector%2Fcss%2Fanimation_accordin%2Casset_injector" \
                      "%2Fcss%2Fside_bar%2Casset_injector%2Fcss%2Ftable%2Casset_injector%2Fjs%2Fseperate_tab_%2C" \
                      "core%2Fdrupal.ajax%2Ccore%2Fhtml5shiv%2Ccore%2Fjquery.form%2Cmytheme%2Fmylibrarynew%2C" \
                      "system%2Fbase%2Cviews%2Fviews.module"

            response = requests.request("POST",
                                        url,
                                        data=payload,
                                        headers=headers,
                                        params=querystring,
                                        proxies=proxy_dict)

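            # The Drupal AJAX response is a list of commands; the rendered results table
            # arrives in the 'data' field of one of them.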
            json_res = json.loads(response.text)
            res = None
            for json_r in json_res:
                if "data" in json_r:
                    res = BeautifulSoup(str(json_r['data']), "html.parser")
                    break

            if res is None:
                logging.error("NO data Found for start date: " +
                              str(month_year))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                    + str(court_name) + "'")
                continue

            if not parse_html(res, court_name):
                logging.error("Failed to parse data from date: " +
                              str(month_year))

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
Example no. 24
def request_data(court_name, start_date, end_date_):
    try:
        url = base_url + 'php/getJBJ.php'
        headers = {
            'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
            'Cache-Control': "no-cache"
        }

        i = 0
        while True:
            i += 1

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

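            # Crawl forward in 30-day windows until the requested end date is passed.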
            end_date = (
                datetime.datetime.strptime(str(start_date), "%d-%m-%Y") +
                datetime.timedelta(days=30)).strftime("%d-%m-%Y")

            if datetime.datetime.strptime(end_date_, "%d-%m-%Y") + datetime.timedelta(days=30) < \
                    datetime.datetime.strptime(str(end_date), "%d-%m-%Y"):
                logging.error("END date Exceed.")
                break

            update_query("UPDATE Tracker SET Start_Date = '" +
                         str(start_date) + "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")

            payload = "jorrop=J" \
                      "&JBJfrom_date=" + str(start_date) + \
                      "&JBJto_date=" + str(end_date)

            response = requests.request("POST",
                                        url,
                                        data=payload,
                                        headers=headers,
                                        verify=False,
                                        proxies=proxy_dict)
            res = response.text

            if "no data found" in res.lower():
                logging.error("NO data Found for start date: " +
                              str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                    + str(court_name) + "'")
                start_date = end_date
                continue

            if not parse_html(res, court_name):
                logging.error("Failed to parse data from date: " +
                              str(start_date))

            start_date = end_date

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)

        return False
Example no. 25
def parse_html(html_str, court_name):
    try:
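        # Blank out ampersands up front so stray, unescaped entities don't garble the parse.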
        soup = BeautifulSoup(str(html_str).replace('&', ' '), "html.parser")
        tr_list = soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count == 1:
                continue

            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    judgment_date = escape_string(str(td.decode_contents()))

                if i == 2:
                    judge_name = escape_string(str(td.decode_contents()))

                if i == 3:
                    case_no = escape_string(str(td.text))

                # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                #     insert_check = True

                if i == 4:
                    # The party cell reads "<petitioner> v/s <respondent>"; guard against
                    # rows that lack the separator.
                    party = str(td.decode_contents()).split("v/s")
                    petitioner = escape_string(str(party[0]))
                    if len(party) > 1:
                        respondent = escape_string(str(party[1]))

                if i == 5:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = escape_string(str(base_url + a_tag.get('href')))
                    pdf_data = escape_string(
                        request_pdf(base_url + a_tag.get('href'), case_no,
                                    court_name))

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "judge_name, pdf_file, pdf_filename) VALUE ('" + \
                            case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + \
                            judge_name + "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
Example no. 26
def request_data(court_name, start_date, end_date_):
    try:
        if int(start_date[-4:]) < 2010:
            update_query(
                "UPDATE Tracker SET status = 'IN_NO_DATA_FOUND', emergency_exit=true WHERE Name = '"
                + str(court_name) + "'")
            if int(end_date_[-4:]) < 2010:
                update_history_tracker(court_name)
                return True

        for month_year in month_list_([str(start_date), str(end_date_)]):
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

            data = {
                'ctl00$CPHBody$DropDownListYear': str(month_year[-4:]),
                'ctl00$CPHBody$DropDownListMonth': str(month_year[:-4]).lstrip("0"),
                'ctl00$CPHBody$TextBox1': '',
                'ctl00$CPHBody$SM1': 'ctl00$CPHBody$SM1|ctl00$CPHBody$DropDownListMonth'
            }

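            # ASP.NET WebForms only accepts the POST if the hidden __VIEWSTATE,
            # __VIEWSTATEGENERATOR and __EVENTVALIDATION values from a fresh GET are echoed back.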
            with requests.Session() as s:
                page = s.get(base_url + 'judgement.aspx')
                soup = BeautifulSoup(page.content, "html.parser")

                data["__VIEWSTATE"] = soup.select_one("#__VIEWSTATE")["value"]
                data["__VIEWSTATEGENERATOR"] = soup.select_one(
                    "#__VIEWSTATEGENERATOR")["value"]
                data["__EVENTVALIDATION"] = soup.select_one(
                    "#__EVENTVALIDATION")["value"]

                update_query("UPDATE Tracker SET Start_Date = '" +
                             str(month_year) + "' WHERE Name = '" +
                             str(court_name) + "'")

                response = s.post(base_url + 'judgement.aspx', data=data)
                res = response.text

                if "no records were found." in res.lower(
                ) or "application error" in res.lower():
                    logging.error("NO data Found for start date: " +
                                  str(month_year))
                    update_query(
                        "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                        + str(court_name) + "'")
                    continue

                if not parse_html(res, court_name):
                    logging.error("Failed to parse data")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False
Example no. 27
def parse_html(html_str, court_name, flag):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
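        # Re-parse the prettified output to normalize badly nested markup before walking it.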
        soup = BeautifulSoup(str(soup.prettify()), "html.parser")

        date_h4 = soup.find_all('h4', {'align': 'center'})[0]
        month_year = str(date_h4.text).replace('JUDGMENTS FOR THE MONTH OF', '').strip()

        table_list = soup.find_all('table', {'class': 'DISCOVERY3'})[0]
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count <= 1:
                continue

            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            subject = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

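            # 'flag' appears to signal a layout with one fewer leading cell: starting the
            # counter at 1 shifts every column index by one.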
            if flag:
                i = 1
            else:
                i = 0
            for td in td_list:
                i += 1
                if i == 2:
                    judgment_day = escape_string(str(td.decode_contents()))
                    # Strip the alternate 'JUDGEMENTS' spelling that some pages use in the header.
                    judgment_date = str(re.findall(r'\d+', str(judgment_day))[0]) + ", " + month_year.replace(
                        'JUDGEMENTS FOR THE MONTH OF', '')

                if i == 3:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = escape_string(str(base_url + a_tag.get('href')))
                    case_no = escape_string(str(a_tag.text).replace("\n", "").strip())

                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                    pdf_data = escape_string(request_pdf(str(base_url + a_tag.get('href')), case_no, court_name))

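                # Party names may sit in <font><span>, in a bare <span>, or directly in
                # the cell; fall back progressively to whichever container exists.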
                if i == 4:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    if font_tag is not None:
                        span_tag = font_tag.span
                    else:
                        span_tag = BeautifulSoup(str(td), "html.parser").span
                        if span_tag is None:
                            span_tag = BeautifulSoup(str(td), "html.parser")

                    party = str(span_tag.decode_contents()).split("<br/>")
                    petitioner = escape_string(
                        str(party[0]).replace('<td align="center" bgcolor="#FFFFFF" valign="middle" width="30%">',
                                              '').strip())
                    petitioner = re.sub(r'(\\x(.){2})', '', petitioner)

                    # party[1] is the "Vs" separator; guard rows that lack a respondent line.
                    if len(party) > 2:
                        respondent = escape_string(str(party[2]).replace('</td>', '').strip())
                        respondent = re.sub(r'(\\x(.){2})', '', respondent)

                if i == 5:
                    subject = escape_string(str(td.decode_contents()).strip())

                if i == 6:
                    judge_name = escape_string(str(td.text).replace(r'\x', '').replace('\\xC2\\x92BLE', '').strip())
                    judge_name = re.sub(r'(\\x(.){2})', '', judge_name)
                    judge_name = re.sub(r'', '', judge_name, flags=re.U)

            # if case_no != "NULL" and insert_check and td_list:
            if case_no != "NULL" and td_list:
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "subject, pdf_file, pdf_filename) VALUE ('" + case_no + \
                            "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + subject + \
                            "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET judge_name = '" + str(judge_name) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
Example no. 28
def parse_html(html_str, court_name, bench, child_url):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        div_soup = BeautifulSoup(str(soup.find_all('div', {'id': 'text'})[0]),
                                 'html.parser')
        tr_list = div_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count == 1:
                continue

            case_no = "NULL"
            date_of_order = "NULL"
            description = "NULL"
            section = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(
                        str(td.text).strip().replace("\n", ""))

                if i == 2:
                    date_of_order = escape_string(
                        str(td.text).strip().replace("\n", ""))

                # if select_count_query(str(court_name), str(case_no), 'date_of_order', date_of_order):
                #     insert_check = True

                if i == 3:
                    description = escape_string(str(td.text).strip())
                    a_tag = BeautifulSoup(str(td), "html.parser").font.a
                    pdf_url = base_url + child_url + a_tag.get('href')
                    pdf_file = escape_string(pdf_url)
                    pdf_data = escape_string(
                        request_pdf(pdf_url, case_no, court_name))

                if i == 4:
                    section = str(td.text)

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, date_of_order, description, section, " \
                                                               "pdf_file, bench_code, pdf_filename) VALUE ('" + \
                            case_no + "', '" + date_of_order + "', '" + description + "', '" + section + "', '" + \
                            pdf_file + "', '" + str(bench) + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
Example no. 29
def parse_html(html_str, court_name, bench_code):
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table')

        for table in table_list:
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            text = "NULL"
            text_file = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            table_soup = BeautifulSoup(str(table), "html.parser")
            td_list = table_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(str(td.decode_contents()))

                if i == 2:
                    petitioner = escape_string(str(td.decode_contents()))
                if i == 4:
                    respondent = escape_string(str(td.decode_contents()))
                if i == 6:
                    judgment_date = escape_string(str(td.decode_contents()))

                # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                #     insert_check = True

                if i == 7:
                    judge_name = escape_string(str(td.decode_contents()))
                if i == 8:
                    a_link = BeautifulSoup(str(td),
                                           "html.parser").a.get('href')
                    text_dir = request_text(base_url + a_link, case_no,
                                            court_name)
                    text = escape_string(text_dir['data'])
                    text_file = escape_string(base_url + a_link)
                if i == 9:
                    a_link = BeautifulSoup(str(td),
                                           "html.parser").a.get('href')
                    pdf_file = escape_string(base_url + a_link)
                    pdf_data = escape_string(
                        request_pdf(base_url + a_link, case_no, court_name))

            # if case_no != "NULL" and insert_check and petitioner != 'Judgment Information System':
            if case_no != "NULL" and petitioner != 'Judgment Information System':
                sql_query = "INSERT INTO " + str(court_name) + \
                            " (case_no, petitioner, respondent, judgment_date, judge_name, text_data, text_file, " \
                            "pdf_file, bench_code, pdf_filename) VALUE ('" + case_no + "', '" + petitioner + "', '" + \
                            respondent + "', '" + judgment_date + "', '" + judge_name + "', '" + text + "', '" + \
                            text_file + "', '" + pdf_file + "', " + str(bench_code) + ", '" + court_name + "_" + \
                            slugify(case_no) + ".txt')"
                insert_query(sql_query)

                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def request_data(court_name, headers, start_date, end_date_):
    try:
        url = base_url + "/ByDate.php"

        i = 0
        while True:
            i += 1

            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit['emergency_exit'] == 1:
                update_history_tracker(court_name)
                return True

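            # Crawl forward in 180-day windows until the requested end date is passed.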
            end_date = (
                datetime.datetime.strptime(str(start_date), "%d-%m-%Y") +
                datetime.timedelta(days=180)).strftime("%d-%m-%Y")

            if datetime.datetime.strptime(str(end_date_), "%d-%m-%Y") + datetime.timedelta(days=180) < \
                    datetime.datetime.strptime(str(end_date), "%d-%m-%Y"):
                logging.error("DONE")
                break

            update_query("UPDATE Tracker SET Start_Date = '" +
                         str(start_date) + "', End_Date = '" + str(end_date) +
                         "' WHERE Name = '" + str(court_name) + "'")

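            # The form expects day and month values without leading zeros.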
            payload = "date_day=" + str(start_date[0:2]).replace("0", "") + \
                      "&date_month=" + str(start_date[3:5]).replace("0", "") + \
                      "&date_year=" + str(start_date[6:]) + \
                      "&date_day1=" + str(end_date[0:2]).replace("0", "") + \
                      "&date_month1=" + str(end_date[3:5]).replace("0", "") + \
                      "&date_year1=" + str(end_date[6:]) + \
                      "&submit=Submit"

            response = requests.request("POST",
                                        url,
                                        data=payload,
                                        headers=headers,
                                        proxies=proxy_dict)
            res = response.text

            if "invalid inputs given" in res.lower():
                logging.error("NO data Found for start date: " +
                              str(start_date))
                update_query(
                    "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                    + str(court_name) + "'")

                start_date = end_date
                continue

            if not offset_link(res, headers):
                logging.error("Failed to parse data from date: " +
                              str(start_date))

            start_date = end_date

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to get data from date: " + str(start_date))
        logging.error("Failed to request: %s", e)
        return False