def check_date(get_htmlSource, SegFeild):
    """Store the tender only when its deadline is still in the future.

    ``SegFeild[24]`` is read as the deadline in ``YYYY-MM-DD`` form.

    * empty deadline  -> prints a notice and bumps
      ``global_var.deadline_Not_given``;
    * deadline at least one whole day after today's midnight -> the record
      is handed to ``insert_in_Local(get_htmlSource, SegFeild)``;
    * anything else   -> prints a notice and bumps ``global_var.expired``.

    Errors raised while checking or inserting are printed, not propagated.
    Reading ``SegFeild[24]`` happens before the ``try`` and can still raise.
    """
    deadline = str(SegFeild[24])
    today_text = datetime.now().strftime("%Y-%m-%d")
    try:
        if deadline == '':
            print("Deadline Not Given")
            global_var.deadline_Not_given += 1
            return

        deadline_dt = datetime.strptime(deadline, '%Y-%m-%d')
        # Going through the formatted text drops the time of day, so the
        # difference below is counted in whole calendar days.
        today_dt = datetime.strptime(today_text, '%Y-%m-%d')
        days_left = (deadline_dt - today_dt).days

        if days_left <= 0:
            print("Expired Tender")
            global_var.expired += 1
            return

        insert_in_Local(get_htmlSource, SegFeild)
    except Exception as e:
        trace = e.__traceback__
        fname = os.path.split(trace.tb_frame.f_code.co_filename)[1]
        print("Error ON : ", sys._getframe().f_code.co_name + "--> " + str(e), "\n", type(e), "\n", fname, "\n",
              trace.tb_lineno)
def check_date(get_htmlSource, SegField):
    """Store the tender when its deadline is after today, retrying the insert.

    ``SegField[24]`` is read as the deadline in ``YYYY-MM-DD`` form.

    * empty deadline      -> prints a notice and bumps
      ``Global_var.deadline_Not_given``;
    * malformed deadline  -> the parse error is printed and the function
      returns. A bad date string fails the same way on every attempt, so it
      is never retried (retrying it would loop forever);
    * deadline > today    -> ``insert_in_Local(get_htmlSource, SegField)``;
    * otherwise           -> prints a notice and bumps ``Global_var.expired``.

    An exception raised by the insert/counter step is printed and the step is
    attempted again until it succeeds. Reading ``SegField[24]`` happens
    before any handler and can still raise.
    """

    def report(error):
        # Same diagnostic line as before; frame 1 is check_date itself.
        trace = error.__traceback__
        fname = os.path.split(trace.tb_frame.f_code.co_filename)[1]
        print("Error ON : ", sys._getframe(1).f_code.co_name + "--> " + str(error), "\n", type(error), "\n",
              fname, "\n", trace.tb_lineno)

    tender_date = str(SegField[24])

    deadline = None
    if tender_date != '':
        try:
            deadline = time.strptime(tender_date, "%Y-%m-%d")
        except ValueError as e:
            report(e)
            return

    while True:
        # Today at midnight, rebuilt for every attempt.
        currentdate = time.strptime(datetime.now().strftime("%Y-%m-%d"), "%Y-%m-%d")
        try:
            if deadline is None:
                print("Deadline was not given")
                Global_var.deadline_Not_given += 1
            elif deadline > currentdate:
                insert_in_Local(get_htmlSource, SegField)
            else:
                print("Tender Expired")
                Global_var.expired += 1
            return
        except Exception as e:
            # NOTE(review): unbounded retry is kept from the original;
            # presumably meant for transient insert failures - confirm.
            report(e)
def Check_date(get_htmlSource, browser, SegFeild):
    """Store the tender when its deadline is after today; alert when missing.

    ``SegFeild[24]`` is read as the deadline in ``YYYY-MM-DD`` form.
    ``browser`` is accepted but not used here.

    * empty deadline   -> prints a notice, bumps
      ``global_var.deadline_Not_given`` and shows a Windows message box;
    * deadline > today -> ``insert_in_Local(get_htmlSource, SegFeild)``;
    * otherwise        -> prints a notice and bumps ``global_var.expired``.

    Errors raised inside the check are printed, not propagated.
    """
    tender_date = str(SegFeild[24])
    today_text = datetime.now().strftime("%Y-%m-%d")
    try:
        if tender_date == '':
            print("Deadline was not given")
            global_var.deadline_Not_given += 1
            ctypes.windll.user32.MessageBoxW(0, "Deadline Not Found", "pprasindh.gov.pk", 1)
            return

        deadline = time.strptime(tender_date, "%Y-%m-%d")
        today = time.strptime(today_text, "%Y-%m-%d")

        if not deadline > today:
            print("Expired")
            global_var.expired += 1
            return

        insert_in_Local(get_htmlSource, SegFeild)
    except Exception as e:
        trace = e.__traceback__
        fname = os.path.split(trace.tb_frame.f_code.co_filename)[1]
        print("Error ON : ", sys._getframe().f_code.co_name + "--> " + str(e), "\n", type(e), "\n", fname, "\n",
              trace.tb_lineno)
def Scraping_data(get_htmlSource, purchaser, reference_number, Title, Tender_id, Url):
    """Parse one contract-award notice and store it via ``insert_in_Local``.

    The page HTML is unescaped, whitespace-collapsed and cut apart with
    ``str.partition`` on the Polish labels of the notice. The result is a
    45-slot list of strings (unused slots stay ``''``) passed to
    ``insert_in_Local(get_htmlSource, SegFields)``.

    Args:
        get_htmlSource: Raw HTML of the notice (converted with ``str``).
        purchaser: Purchaser name, stored stripped in slot 12.
        reference_number: Accepted for interface compatibility; not used.
        Title: Short description, stored in slot 19 (cut to 200 chars).
        Tender_id: Reference number, stored stripped in slot 13.
        Url: Notice URL, stored stripped in slot 28.

    Error handling:
        Field extraction runs once. If it raises, the error is printed,
        ``global_var.On_Error`` is incremented and the notice is skipped
        (extraction is deterministic, so repeating it cannot succeed).
        The insert itself is repeated until it succeeds; every failure is
        printed and counted in ``global_var.On_Error``.
    """
    tag_pattern = re.compile('<.*?>')
    cpv_pattern = re.compile(r'\d\d\d\d\d\d\d\d-')

    def between(source, marker, terminator):
        # Text after the first `marker`, up to the next `terminator`.
        return source.partition(marker)[2].partition(terminator)[0]

    def report(error):
        # Diagnostic line plus error counter; frame 1 is Scraping_data.
        trace = error.__traceback__
        fname = os.path.split(trace.tb_frame.f_code.co_filename)[1]
        print("Error ON : ", sys._getframe(1).f_code.co_name + "--> " + str(error), "\n", type(error), "\n",
              fname, "\n", trace.tb_lineno)
        global_var.On_Error += 1

    # Collapsing every whitespace run also removes all newlines.
    page: str = re.sub(r'\s+', ' ', html.unescape(str(get_htmlSource)))

    try:
        SegFields = [''] * 45

        # ---- Purchaser e-mail -------------------------------------------
        address_html = between(page, "NAZWA I ADRES:", "</div>")
        SegFields[1] = between(address_html, "e-mail", ",").replace(':', '').strip()

        # ---- Purchaser address ------------------------------------------
        address = tag_pattern.sub('', address_html.partition("Polska,")[0]).strip()
        address = f'{string.capwords(address)} Polska.'
        tel = between(address_html, "tel.", ",").strip()
        fax = between(address_html, "faks", ".").strip()
        SegFields[2] = address + "<br>\n" + "Tel: " + tel + "<br>\n" + "faks: " + fax.replace('-', '')

        # ---- Contractor -------------------------------------------------
        SegFields[3] = between(page, "Nazwa wykonawcy:", "<br>").strip()  # Contractor_name
        cn_address = between(page, "Adres pocztowy:", "<br>").strip()
        cn_postal = between(page, "Kod pocztowy:", "<br>").strip()
        cn_city = between(page, "Miejscowość:", "<br>").strip()
        cn_country = between(page, "Kraj/woj.:", "<br>").strip()
        contractor_address = ''.join(
            f'{part}, ' for part in (cn_address, cn_city, cn_country) if part != '')
        if cn_postal != '':
            contractor_address += f'Kod pocztowy({cn_postal})'
        SegFields[4] = contractor_address.strip().rstrip(',')  # Contractor_Address

        SegFields[7] = global_var.pur_country_code  # Purchaser_Country
        SegFields[5] = global_var.cont_country_code  # Contractor_Country
        SegFields[6] = between(page, "Email wykonawcy:", "<br>").strip().lower().strip()  # Contractor_EmailID

        # ---- Purchaser URL / name ---------------------------------------
        SegFields[8] = address_html.partition(
            "Adres strony internetowej (url):")[2].replace('<br>', '').strip()
        SegFields[12] = purchaser.strip()

        # ---- Reference number and fixed flags ---------------------------
        SegFields[13] = Tender_id.strip()
        SegFields[14] = "0"  # news_check
        SegFields[16] = "1"  # qc
        SegFields[17] = global_var.exe_no  # CA_exe_number

        # ---- Tender details ---------------------------------------------
        SegFields[19] = Title  # short_descp
        short_desc = between(page, "Krótki opis przedmiotu zamówienia", "</div>")
        # Non-ASCII characters become '?' and are dropped below. Decoding
        # back to str keeps the bytes repr (b'...') out of the stored text.
        short_desc = short_desc.encode('ascii', 'replace').decode('ascii')
        short_desc = tag_pattern.sub('', short_desc)
        short_desc = string.capwords(short_desc.strip().replace('?', ''))

        contract_type = between(page, "Rodzaj zamówienia:</b>", "</div>")
        contract_type = string.capwords(tag_pattern.sub('', contract_type).strip())

        SegFields[18] = (f'Krótki opis przedmiotu zamówienia {short_desc}<br>\n'
                         f'Rodzaj zamówienia: {contract_type}')  # award_details

        # ---- Contract date ----------------------------------------------
        award_date = between(page, "DATA UDZIELENIA ZAMÓWIENIA:", "<br>").replace(' ', '').strip()
        award_date = tag_pattern.sub('', award_date)
        try:
            SegFields[24] = datetime.strptime(award_date, '%d/%m/%Y').strftime("%Y-%m-%d")
        except ValueError:
            # Date missing or not DD/MM/YYYY: slot 24 stays empty.
            pass

        SegFields[20] = '0'  # userid

        # ---- Contract value ---------------------------------------------
        contract_amount = between(page, "Wartość bez VAT</b>", "<br>").strip()
        if contract_amount != '':
            # NOTE(review): commas are removed outright; if the source uses a
            # decimal comma this changes the amount - confirm the format.
            SegFields[21] = contract_amount.replace(',', '')  # contract_value
        SegFields[22] = "PLN"  # contract_currency
        SegFields[28] = Url.strip()  # tender_doc_file_col2
        SegFields[31] = global_var.source  # source_col1

        # ---- CPV codes --------------------------------------------------
        cpv_text = between(page, 'Główny Kod CPV:</b>', '</div> </div>').strip()
        cpv_text = tag_pattern.sub('', cpv_text)
        cpv_codes = []
        found = cpv_pattern.search(cpv_text)
        while found:
            # Keep the eight digits and drop every copy of this code from the
            # text so that each code is reported only once.
            cpv_codes.append(found.group().replace("-", ""))
            cpv_text = cpv_text.replace(found.group(), "")
            found = cpv_pattern.search(cpv_text)
        SegFields[36] = ",".join(cpv_codes)

        # ---- Final clean-up ---------------------------------------------
        for SegIndex, value in enumerate(SegFields):
            print(SegIndex, end=' ')
            print(value)
            # Double single quotes for the downstream insert.
            SegFields[SegIndex] = html.unescape(str(value)).replace("'", "''")

        if len(SegFields[19]) >= 200:
            SegFields[19] = str(SegFields[19])[:200] + '...'
        if len(SegFields[18]) >= 1500:
            SegFields[18] = str(SegFields[18])[:1500] + '...'
    except Exception as e:
        report(e)
        return

    while True:
        try:
            insert_in_Local(get_htmlSource, SegFields)
            return
        except Exception as e:
            # NOTE(review): unbounded retry is kept from the original;
            # presumably meant for transient insert failures - confirm.
            report(e)


# def check_date(get_htmlSource, SegFields):
#     deadline = str(SegFields[24])
#     curdate = datetime.now()
#     curdate_str = curdate.strftime("%Y-%m-%d")
#     try:
#         if deadline != '':
#             datetime_object_deadline = datetime.strptime(deadline, '%Y-%m-%d')
#             datetime_object_curdate = datetime.strptime(curdate_str, '%Y-%m-%d')
#             timedelta_obj = datetime_object_deadline - datetime_object_curdate
#             day = timedelta_obj.days
#             if day > 0:
#                 insert_in_Local(get_htmlSource , SegFields)
#             else:
#                 print("Expired Tender")
#                 global_var.expired += 1
#         else:
#             print("Deadline Not Given")
#             global_var.deadline_Not_given += 1
#     except Exception as e:
#         exc_type , exc_obj , exc_tb = sys.exc_info()
#         fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
#         print("Error ON : " , sys._getframe().f_code.co_name + "--> " + str(e) , "\n" , exc_type , "\n" , fname , "\n" ,exc_tb.tb_lineno)
def Scrap_data(browser, Tender_href):
    """Visit every tender URL, store the open ones, show totals, then exit.

    For each URL the page is loaded and ``Global_var.Total`` is incremented.
    Each ``CLOSING_DATE`` element found on the page is handled as follows:

    * empty text          -> counted in ``Global_var.deadline_Not_given``;
    * date (``MM/DD/YYYY``) not after today -> counted in
      ``Global_var.expired``;
    * otherwise a 45-slot record is built from the page and passed to
      ``insert_in_Local(get_htmlSource, SegFeild)``.

    A running totals line is printed after each outcome. When all URLs are
    done a Windows message box shows the totals, ``Global_var.Process_End()``
    is called, the browser is closed and the process ends with
    ``sys.exit()``. Any exception is printed and also ends the process, so
    this function never returns normally.

    Args:
        browser: WebDriver-like object providing ``get``,
            ``find_elements_by_xpath``, ``current_url`` and ``close``.
        Tender_href: Iterable of tender page URLs.

    Notes:
        * Slot 24 holds the parsed closing date as ``YYYY-MM-DD`` (it used
          to receive today's date, so the real deadline was lost).
        * The module-level flag ``a`` is still written as before.
        * NOTE(review): ``find_elements_by_xpath`` no longer exists in
          Selenium 4 - confirm the pinned Selenium version.
    """
    global a
    a = True

    attachment_href = 'href="viewAttachment'
    absolute_attachment_href = 'href="https://www.ok.gov/dcs/solicit/app/viewAttachment'
    main_menu_button = ('<input type="submit" name="button" id="MAIN_MENU" '
                        'value="Return To Main Menu" tabindex="{}" class="sm_button">')
    back_button = ('<input type="submit" name="button" id="BACK" value="Back" '
                   'tabindex="{}" class="sm_button">')
    code_pattern = re.compile(r'\d\d\d\d\d\d\d\d')

    def first_attribute(xpath, attribute):
        # Attribute of the first element matching `xpath`, '' when none match.
        elements = browser.find_elements_by_xpath(xpath)
        return elements[0].get_attribute(attribute) if elements else ''

    def extract_codes(text):
        # Comma-joined 8-digit codes in order of first appearance, no repeats.
        codes = []
        found = code_pattern.search(text)
        while found:
            codes.append(found.group())
            text = text.replace(found.group(), "")
            found = code_pattern.search(text)
        return ",".join(codes)

    def print_totals():
        # Running counters, printed after every processed tender.
        print(" Total: " + str(Global_var.Total) + " Duplicate: " + str(
            Global_var.duplicate) + " Expired: " + str(Global_var.expired) + " Inserted: " + str(
            Global_var.inserted) + " Skipped: " + str(
            Global_var.skipped) + " Deadline Not given: " + str(
            Global_var.deadline_Not_given) + " QC Tenders: " + str(Global_var.QC_Tender), "\n")

    try:
        for href in Tender_href:
            browser.get(href)
            Global_var.Total += 1

            for closing in browser.find_elements_by_xpath('//*[@id="CLOSING_DATE"]'):
                Submission_date = closing.get_attribute('innerText').strip()

                if Submission_date == '':
                    print("Deadline was not given")
                    Global_var.deadline_Not_given += 1
                    print_totals()
                    a = False
                    continue

                date2 = datetime.now().strftime("%Y-%m-%d")
                deadline = time.strptime(Submission_date, "%m/%d/%Y")
                currentdate = time.strptime(date2, "%Y-%m-%d")

                if not deadline > currentdate:
                    print("Tender Expired")
                    Global_var.expired += 1
                    a = False
                    print_totals()
                    continue

                SegFeild = [''] * 45
                # Store the tender's own closing date, not today's date.
                SegFeild[24] = time.strftime("%Y-%m-%d", deadline)

                # Page fragment kept as the HTML source: attachment links are
                # made absolute and the navigation buttons are removed.
                get_htmlSource = first_attribute(
                    '/html/body/div/div/div[3]/div[2]/div[3]/div/form/div[3]', 'outerHTML')
                get_htmlSource = get_htmlSource.replace(attachment_href, absolute_attachment_href)
                for tabindex in range(10, 21):
                    get_htmlSource = get_htmlSource.replace(main_menu_button.format(tabindex), '')
                for tabindex in range(12, 21):
                    get_htmlSource = get_htmlSource.replace(back_button.format(tabindex), '')

                # Attachment
                SegFeild[4] = first_attribute('//*[@class="table_wrapper"]', 'outerHTML').replace(
                    attachment_href, absolute_attachment_href)

                # Purchaser
                SegFeild[12] = first_attribute('//*[@id="AGENCY_ID"]', 'innerText').upper().strip()

                # Title
                SegFeild[19] = string.capwords(
                    str(first_attribute('//*[@id="DESCRIPTION"]', 'innerText').strip()))

                # tender NO
                SegFeild[13] = first_attribute('//*[@id="SOL_NUMBER"]', 'innerText').strip()

                # Tender Details
                CONTRACT_TYPE = first_attribute(
                    '//*[@id="CONTRACT_TYPE"]', 'innerText').replace(' ', '').strip()
                BUYER_ID = first_attribute('//*[@id="BUYER_ID"]', 'innerText').strip()
                DATE_STATUS = first_attribute('//*[@id="DATE_STATUS"]', 'innerText').strip()
                STATUS = first_attribute('//*[@id="STATUS"]', 'innerText').strip()

                # CPV: one lookup feeds both the code list (slot 36) and the
                # readable text used in the details.
                commodity_text = first_attribute('//*[@class="commodity"]', 'innerText').strip()
                code_source = commodity_text.replace('\n', '')
                if code_source != "":
                    all_string = extract_codes(code_source)
                    print(all_string)
                    SegFeild[36] = all_string
                else:
                    SegFeild[36] = ""
                CPV = commodity_text.replace('\n', ', ')

                SegFeild[18] = "<br>\n".join([
                    "Agency: " + str(SegFeild[12]),
                    "Contract Type: " + CONTRACT_TYPE,
                    "Solicitation Number: " + str(SegFeild[13]),
                    "Status: " + STATUS,
                    "Closing Date Status: " + DATE_STATUS,
                    "DESCRIPTION: " + str(SegFeild[19]),
                    "BUYER: " + BUYER_ID,
                    "CPV: " + CPV,
                ])

                SegFeild[7] = "US"
                SegFeild[14] = "2"  # notice type
                SegFeild[22] = "0"
                SegFeild[26] = "0.0"
                SegFeild[27] = "0"  # Financier
                SegFeild[28] = browser.current_url
                SegFeild[31] = 'ok.gov'  # Source Name
                SegFeild[42] = SegFeild[7]  # project_location
                SegFeild[43] = ''  # set_aside

                for SegIndex, value in enumerate(SegFeild):
                    print(SegIndex, end=' ')
                    print(value)
                    # Double single quotes for the downstream insert.
                    SegFeild[SegIndex] = html.unescape(str(value)).replace("'", "''")

                if len(SegFeild[19]) >= 200:
                    SegFeild[19] = str(SegFeild[19])[:200] + '...'
                if len(SegFeild[18]) >= 1500:
                    SegFeild[18] = str(SegFeild[18])[:1500] + '...'

                a = False
                insert_in_Local(get_htmlSource, SegFeild)
                print_totals()
                a = False

        ctypes.windll.user32.MessageBoxW(0, "Total: " + str(Global_var.Total) + "\n""Duplicate: " + str(
            Global_var.duplicate) + "\n""Expired: " + str(Global_var.expired) + "\n""Inserted: " + str(
            Global_var.inserted) + "\n""Skipped: " + str(
            Global_var.skipped) + "\n""Deadline Not given: " + str(
            Global_var.deadline_Not_given) + "\n""QC Tenders: " + str(Global_var.QC_Tender) + "", "ok.gov", 1)
        a = False
        Global_var.Process_End()
        browser.close()
        # SystemExit is not an Exception subclass, so it passes the handler.
        sys.exit()
    except Exception as e:
        exc_tb = e.__traceback__
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print("Error ON : ", sys._getframe().f_code.co_name + "--> " + str(e), "\n", type(e), "\n", fname, "\n",
              exc_tb.tb_lineno)
        a = True
        sys.exit()