def _update_contract_from_document_cloud(self, document_cloud_id, fields):
    """
    Overwrite the metadata columns of the contract row that matches a
    DocumentCloud ID, then commit the session.

    TODO: compare to add_contract().

    The row is looked up with ``Contract.doc_cloud_id == document_cloud_id``
    and the first match is used. There is no handling for a missing row.

    :param document_cloud_id: The unique ID in DocumentCloud.
    :type document_cloud_id: string
    :param fields: Metadata values. The keys read are 'contractno',
        'vendor', 'department', 'dateadded', 'title', 'purchaseno' and
        'description'.
    :type fields: dict
    """
    log.debug('Updating contract in database that has DocumentCloud ID %s',
              document_cloud_id)

    matches = SESSION.query(Contract).filter(
        Contract.doc_cloud_id == document_cloud_id)
    contract = matches.first()

    # (column on the Contract row, key in ``fields``), applied in order.
    column_sources = (
        ('contractnumber', 'contractno'),
        ('vendorid', 'vendor'),
        ('departmentid', 'department'),
        ('dateadded', 'dateadded'),
        ('title', 'title'),
        ('purchaseordernumber', 'purchaseno'),
        ('description', 'description'),
    )
    for column, key in column_sources:
        setattr(contract, column, fields[key])

    SESSION.add(contract)
    SESSION.commit()
def prepare_then_add_contract(self, purchase_order_object):
    '''
    Prepare and upload one DocumentCloud document per attachment.

    For every attachment the numeric ID is pulled out of its ``href``, the
    matching PDF path under ``DOCUMENTS_DIR`` is built, the purchase order
    is passed through ``prepare_contract`` and the result is handed to
    ``_upload_contract``.

    :param purchase_order_object: A PurchaseOrder object instance.
    '''
    attachments = purchase_order_object.attachments
    log.debug('There are %d attachments to upload', len(attachments))

    # An empty attachment list simply means the loop body never runs.
    for index, attachment in enumerate(attachments):
        href = attachment.get('href')
        attachment_id = re.search('[0-9]+', href).group()
        pdf_path = '%s/%s.pdf' % (DOCUMENTS_DIR, attachment_id)

        # prepare_contract's return value replaces the working object, so
        # later iterations receive the already-adjusted purchase order.
        purchase_order_object = self.prepare_contract(
            purchase_order_object, index)
        self._upload_contract(pdf_path, purchase_order_object)
def __init__(self, purchase_order_number):
    '''
    Collect the details of one purchase order from its HTML page.

    If the purchase order number fails the format check, only
    ``self.purchaseorder`` is set and construction stops early.

    :param purchase_order_number: The purchase order number.
    '''
    self.purchaseorder = purchase_order_number

    is_valid = Utilities().check_if_valid_purchase_order_format(
        self.purchaseorder)
    if is_valid is False:
        log.debug('Purchase order %s is invalid', self.purchaseorder)
        return

    page_html = self._get_html()

    # The vendor profile is fetched before the page is parsed.
    self.vendor_id_city = self._get_city_vendor_id(page_html)
    self._download_vendor_profile(self.vendor_id_city)

    parsed_page = BeautifulSoup(page_html)
    self.description = self._get_description(parsed_page)

    try:
        self.vendor_name = self._get_vendor_name()
    except IOError as error:
        # Fall back to a placeholder name when the lookup hits an IOError.
        log.error(error, exc_info=True)
        self.vendor_name = "unknown"
        log.info('No vendor info for purchase order %s',
                 self.purchaseorder)

    self.department = self._get_department(parsed_page)
    self.k_number = self._get_knumber(parsed_page)
    self.attachments = self._get_attachments(parsed_page)
    self.data = self._get_data()
    self.title = "%s : %s" % (self.vendor_name, self.description)
def process_direct_hit(self, raw_html, vendor_name):
    """
    Record a vendor and link it to each officer listed in the HTML.

    The vendor name has surrounding newlines stripped and periods removed
    before it is stored. For every ``.TableBorder`` element inside the
    officers panel, the text of its first ``span`` is added as a name and
    linked to the vendor.

    :param raw_html: HTML of the vendor's page.
    :param vendor_name: The vendor name as scraped.
    """
    vendor_name = vendor_name.strip("\n").replace(".", "")
    log.debug("Adding vendor %s", vendor_name)
    self.add_vendor(vendor_name)

    page = BeautifulSoup(raw_html)
    try:
        officer_tables = page.find_all(
            id="ctl00_cphContent_pnlOfficers")[0].select(".TableBorder")
    except IndexError:
        # Some places have no listed officers, e.g. 311 networks.
        officer_tables = []

    for officer_table in officer_tables:
        span_texts = [span.text for span in officer_table.select("span")]
        officer_name = span_texts[0]
        self.add_name(officer_name)
        self.link(officer_name, vendor_name)
def _get_city_vendor_id(html):
    '''
    Extract the vendor ID from a contract page's HTML.

    The ID is the run of digits that directly follows
    ``ExternalVendorProfile('`` in the page.

    :param html: The contract page's HTML.
    :type html: string
    :returns: string. The vendor ID, or an empty string if none is found.
    '''
    matches = re.findall(r"(?<=ExternalVendorProfile\(')\d+", html)

    if matches:
        # Always take the first match: later ones can belong to a
        # subcontractor that also appears on the vendor page. Example:
        # http://www.purchasing.cityofno.com/bso/external/purchaseorder/
        # poSummary.sdo?docId=FC154683&releaseNbr=0&parentUrl=contract
        vendor_id = matches[0]
        log.debug('Vendor ID %s', vendor_id)
    else:
        log.error('No vendor ID found')
        vendor_id = ""

    return vendor_id
def _download_attachment(self, attachment):
    '''
    Download one purchase order attachment unless its PDF is already saved.

    :param attachment: The attachment link. It must provide
        ``.get('href')``, and the href must contain the attachment's
        numeric ID.
    '''
    # The first run of digits in the href is used as the attachment's ID.
    # It drives the download and doubles as the local file name, so the
    # files on disk show which attachments we already have.
    href = attachment.get('href')
    city_attachment_id = re.search('[0-9]+', href).group()
    log.debug('Gathering data for attachment %s', city_attachment_id)

    document_path = '%s/%s.pdf' % (DOCUMENTS_DIR, city_attachment_id)
    display_name = self._get_attachment_display_name(city_attachment_id)

    if not os.path.isfile(document_path):
        self._download_attachment_file(
            city_attachment_id, display_name, document_path)
        return

    # The PDF was downloaded on an earlier run.
    log.info('Already have PDF for attachment %s', city_attachment_id)
def _find_number_of_pages(self):
    '''
    Finds how many pages of contracts there are on the city's purchasing
    site.

    The count is read from the first index page: the first run of digits
    in the href of the last link in the pagination row is taken as the
    number of pages.

    :returns: int. The number of pages.
    '''
    html = self._get_index_page(1)
    soup = BeautifulSoup(html)

    main_table = soup.select('.table-01').pop()

    # Fourth direct row of the main table ([3] if zero-based, [4] if not).
    outer_cells = main_table.find_all(
        'tr', recursive=False)[3].findChildren(['td'])

    # Walk down the nested layout tables to the row holding the page links.
    pagination_row = outer_cells[0].findChildren(
        ['table'])[0].findChildren(
            ['tr'])[0].findChildren(
                ['td'])[0].findChildren(
                    ['table'])[0].findChildren(['tr'])[1]

    href = pagination_row.findChildren(
        ['td'])[0].findChildren(['a'])[-1].get('href')

    # Convert before logging: the regex yields a string, and formatting a
    # string with %d makes the log record fail to render.
    number_of_pages = int(re.search('[0-9]+', href).group())

    log.debug("There were %d pages found on the city's purchasing portal",
              number_of_pages)

    return number_of_pages
def check_pages(self):
    '''
    Scan a random sample of index pages on the city's purchasing website.

    Pages 1-10 are treated as "new"; every page from 11 up to the count
    returned by ``_find_number_of_pages`` is treated as "old". Both groups
    are shuffled. A page is skipped when
    ``LensDatabase().check_if_need_to_scrape`` returns False; otherwise it
    is scanned and recorded in the scrape log. Scanning stops after 2 new
    pages and after 13 old pages, with a 10 second pause between scanned
    pages.
    '''
    number_of_pages = self._find_number_of_pages()

    # shuffle() reorders in place and needs a mutable sequence, so build
    # real lists rather than relying on range() returning one.
    new_pages = list(range(1, 11))
    old_pages = list(range(11, number_of_pages + 1))

    shuffle(new_pages)
    shuffle(old_pages)

    new_counter = 0

    for new_page in new_pages:
        log.debug('New page %d', new_page)

        need_to_scrape = LensDatabase().check_if_need_to_scrape(new_page)
        if need_to_scrape is False:
            continue

        self._scan_index_page(new_page)
        LensDatabase().update_scrape_log(new_page)

        new_counter += 1

        # Run five times per day, so break after 2 pages in order to reach
        # 10 pages per day.
        if new_counter == 2:
            break

        time.sleep(10)

    old_counter = 0

    for old_page in old_pages:
        log.debug('Old page %s', old_page)

        need_to_scrape = LensDatabase().check_if_need_to_scrape(old_page)
        if need_to_scrape is False:
            continue

        self._scan_index_page(old_page)
        LensDatabase().update_scrape_log(old_page)

        old_counter += 1

        # Run five times per day, seven days per week, so break after 13
        # pages in order to reach about 450 pages per week.
        if old_counter == 13:
            break

        time.sleep(10)
def _add_department(self, department):
    """
    Insert a new department row and commit it.

    :param department: The department to add to the local database.
    :type department: string
    """
    log.debug('Adding department "%s" to database', department)

    new_row = Department(department)
    SESSION.add(new_row)
    SESSION.commit()
def _add_vendor(self, vendor, vendor_id_city=None):
    """
    Insert a new vendor row and commit it.

    :param vendor: The vendor to add to our database.
    :type vendor: string
    :param vendor_id_city: Passed to ``Vendor`` as its second argument.
        Defaults to None.
    """
    log.debug('Adding vendor "%s" to database', vendor)

    new_row = Vendor(vendor, vendor_id_city)
    SESSION.add(new_row)
    SESSION.commit()
def _get_database_vendor_id(self, vendor):
    """
    Look up the ``id`` of the first vendor row with the given name.

    The session is closed once the row has been fetched. There is no
    handling for a name that matches no row.

    :param vendor: The vendor name.
    :type vendor: string
    :returns: The ``id`` attribute of the matching vendor row.
    """
    log.debug('Fetching database ID for vendor "%s"', vendor)

    name_matches = SESSION.query(Vendor).filter(Vendor.name == vendor)
    vendor_row = name_matches.first()
    SESSION.close()

    return vendor_row.id
def _get_department_id(self, department):
    """
    Look up the ``id`` of the first department row with the given name.

    The session is closed once the row has been fetched. There is no
    handling for a name that matches no row.

    :param department: The department name.
    :type department: string
    :returns: The ``id`` attribute of the matching department row.
    """
    log.debug('Finding ID for department "%s" in database', department)

    name_matches = SESSION.query(Department).filter(
        Department.name == department)
    department_row = name_matches.first()
    SESSION.close()

    return department_row.id
def _get_contract_doc_cloud_id(self, document_cloud_id):
    """
    Fetch the local database contract that has a given DocumentCloud ID.

    :param document_cloud_id: The unique ID in the DocumentCloud project.
    :type document_cloud_id: string
    :returns: The first row returned by the ``Contract`` query filtered on
        ``doc_cloud_id``.
    """
    log.debug('Find contract in database that has DocumentCloud ID %s',
              document_cloud_id)

    id_matches = SESSION.query(Contract).filter(
        Contract.doc_cloud_id == document_cloud_id)
    contract = id_matches.first()
    SESSION.close()

    return contract
def _check_if_need_to_download_contract(purchase_order_number):
    '''
    Push one purchase order through every stage of the pipeline: the local
    HTML repository, its attachments, the DocumentCloud project and the
    local database. Each stage first checks whether work is needed.

    An ``urllib2.HTTPError`` in any stage is logged and the next stage
    still runs. An ``IndexError`` while building the PurchaseOrder or
    downloading its attachments is logged and ends processing.

    :param purchase_order_number: The contract's purchase order number.
    :type purchase_order_number: string
    '''
    log.info('Checking purchase order %s', purchase_order_number)

    # Stage 1: local file repository.
    try:
        log.debug('LensRepository')
        if LensRepository(
                purchase_order_number).check_if_need_to_download():
            LensRepository(purchase_order_number).download_purchase_order()
    except urllib2.HTTPError:
        log.exception('Purchase order %s not posted publically',
                      purchase_order_number)

    # Stage 2: build the PurchaseOrder and fetch its attachments.
    try:
        log.debug('PurchaseOrder')
        purchase_order = PurchaseOrder(purchase_order_number)
        purchase_order.download_attachments()
    except IndexError:
        log.exception(purchase_order_number)
        return

    # Stage 3: DocumentCloud project.
    try:
        log.debug('DocumentCloudProject')
        if DocumentCloudProject().check_if_need_to_upload(
                purchase_order_number):
            DocumentCloudProject().prepare_then_add_contract(
                purchase_order)
    except urllib2.HTTPError:
        log.exception('Purchase order %s not posted publically',
                      purchase_order_number)

    # Stage 4: local database.
    try:
        log.debug('LensDatabase')
        already_stored = LensDatabase().check_if_database_has_contract(
            purchase_order_number)
        if already_stored is False:
            LensDatabase().add_to_database(purchase_order)
    except urllib2.HTTPError:
        log.exception('Purchase order %s is not posted publically.',
                      purchase_order_number)
def _check_if_vendor_exists(self, vendor):
    """
    Report whether at least one vendor row has the given name.

    :param vendor: The vendor name to check for.
    :returns: boolean. True if vendor exists in database, False if not.
    """
    matching_rows = SESSION.query(Vendor).filter(
        Vendor.name == vendor).count()
    SESSION.close()

    if matching_rows != 0:
        return True

    log.debug('Vendor "%s" is missing from database', vendor)
    return False
def _scan_index_page(self, page_number):
    '''
    Process every purchase order listed on one index page.

    Each purchase order number found on the page is handed to
    ``_check_if_need_to_download_contract``, followed by a 2 second pause.

    :param page_number: The page to check on the city's website.
    '''
    index_html = self._get_index_page(page_number)
    purchase_order_numbers = self._get_purchase_order_numbers(index_html)

    for position, number in enumerate(purchase_order_numbers, 1):
        log.debug('Purchase order %s', number)
        log.debug('(%d of %d)', position, len(purchase_order_numbers))

        self._check_if_need_to_download_contract(number)

        time.sleep(2)
def _check_if_department_exists(self, department):
    """
    Report whether at least one department row has the given name.

    :param department: The department name to check for.
    :returns: boolean. True if it exists in the database, False if not.
    """
    matching_rows = SESSION.query(Department).filter(
        Department.name == department).count()
    SESSION.close()

    if matching_rows != 0:
        return True

    log.debug('Department "%s" is missing from database', department)
    return False
def _find_number_of_pages(self):
    '''
    Finds how many pages of contracts there are on the city's purchasing
    site.

    The count is read from the first index page: the first run of digits
    in the href of the last link in the pagination row is taken as the
    number of pages.

    :returns: int. The number of pages.
    '''
    html = self._get_index_page(1)
    soup = BeautifulSoup(html)

    main_table = soup.select('.table-01').pop()

    # Fourth direct row of the main table ([3] if zero-based, [4] if not).
    metadata_row = main_table.find_all(
        'tr', recursive=False
    )[3].findChildren(
        ['td']
    )

    # Walk down the nested layout tables to the row holding the page links.
    metadata_row = metadata_row[0].findChildren(
        ['table']
    )[0].findChildren(
        ['tr']
    )[0].findChildren(
        ['td']
    )[0].findChildren(
        ['table']
    )[0].findChildren(
        ['tr']
    )[1]

    href = metadata_row.findChildren(
        ['td']
    )[0].findChildren(
        ['a']
    )[-1].get('href')

    # Convert before logging: the regex yields a string, and formatting a
    # string with %d makes the log record fail to render.
    number_of_pages = int(re.search('[0-9]+', href).group())

    log.debug("There were %d pages found on the city's purchasing portal",
              number_of_pages)

    return number_of_pages
def check_if_need_to_download(self):
    '''
    Decide whether the purchase order's HTML still has to be downloaded.

    No download is needed when the validity check returns False or when
    ``<PURCHASE_ORDER_DIR>/<purchase order number>.html`` already exists.

    :returns: boolean. True if need to download, False if don't need to.
    '''
    # Is the contract well-formed and public?
    is_valid = Utilities().check_that_contract_is_valid_and_public(
        self.purchase_order_number)

    html_path = '%s/%s.html' % (
        PURCHASE_ORDER_DIR, self.purchase_order_number)
    already_saved = os.path.isfile(html_path)

    if is_valid is not False and not already_saved:
        return True

    log.debug("Don't download. Contract is invalid, private or we "
              "already the HTML")
    return False
def _upload_contract(self, filename, purchase_order_object):
    '''
    Upload one contract file and its metadata to the DocumentCloud project.

    Nothing is uploaded when ``_check_if_contract_number_is_null`` reports
    a null contract number. Otherwise every "/" is removed from the
    purchase order's title (the object itself is modified) before the
    upload.

    :param filename: The path of the contract file to upload.
    :type filename: string
    :param purchase_order_object: A PurchaseOrder object instance. Its
        ``title``, ``purchaseorder``, ``description`` and ``data``
        attributes are read.
    '''
    log.debug('Uploading purchase order %s to DocumentCloud', filename)

    if self._check_if_contract_number_is_null(purchase_order_object):
        return

    # Not sure why stripping slashes is necessary.
    slashless_title = purchase_order_object.title.replace("/", "")
    purchase_order_object.title = slashless_title

    purchase_order_number = str(purchase_order_object.purchaseorder)
    title = str(purchase_order_object.title)

    log.debug('Uploading purchase order %s ("%s") to DocumentCloud...',
              purchase_order_number, title)

    source = 'City of New Orleans'
    related_article = None
    access = 'public'
    secure = False

    self.api_connection.documents.upload(
        filename,
        title,
        source,
        purchase_order_object.description,
        related_article,
        PROJECT_URL,  # Published URL
        access,
        self.project_id,
        purchase_order_object.data,
        secure
    )
def check_if_need_to_download(self):
    '''
    Decide whether the purchase order's HTML still has to be downloaded.

    No download is needed when the validity check returns False or when
    ``<PURCHASE_ORDER_DIR>/<purchase order number>.html`` already exists.

    :returns: boolean. True if need to download, False if don't need to.
    '''
    # Is the contract well-formed and public?
    is_valid = Utilities().check_that_contract_is_valid_and_public(
        self.purchase_order_number)

    html_path = '%s/%s.html' % (
        PURCHASE_ORDER_DIR, self.purchase_order_number)
    already_saved = os.path.isfile(html_path)

    if is_valid is not False and not already_saved:
        return True

    log.debug("Don't download. Contract is invalid, private or we "
              "already the HTML")
    return False
def _check_when_last_scraped(self, page):
    """
    Look up this page in scrape_log table to see when it was last scraped.

    When several rows match, the last one returned by the query is used.

    :param page: The purchasing site's page to check.
    :type page: int.
    :returns: date. When this page was last scraped. None if never.
    """
    rows = SESSION.query(ScrapeLog).filter(ScrapeLog.page == page).all()

    # Close on every path. Previously the "no row" branch returned before
    # the close call and left the session open.
    SESSION.close()

    if not rows:
        # No row yet for this page (total number varies).
        return None

    date_last_scraped = rows.pop().last_scraped
    log.debug('This page was last scraped %s',
              date_last_scraped.strftime('%Y-%m-%d'))

    return date_last_scraped
def check_if_need_to_upload(self, purchase_order_number):
    '''
    Decide whether this contract still has to be uploaded to the
    DocumentCloud project.

    No upload is needed when the validity check returns False or when the
    project already holds a document for this purchase order.

    :param purchase_order_number: The contract's purchase order number.
    :type purchase_order_number: string.
    :returns: boolean. True if need to upload, False if don't need to.
    '''
    is_valid = Utilities().check_that_contract_is_valid_and_public(
        purchase_order_number)

    # Both checks always run; neither short-circuits the other.
    already_uploaded = self._check_if_document_cloud_has_contract(
        "purchase order", purchase_order_number)

    if is_valid is not False and not already_uploaded:
        return True

    log.debug('Not uploading to DocumentCloud')
    log.debug('Purchase order %s is invalid or already there',
              purchase_order_number)
    return False
def add_to_database(self, purchase_order_object):
    """
    Store a purchase order as a new row in the contracts table.

    A fresh Contract is filled from the purchase order and dated today.
    The department and vendor are added first if they are missing, and
    their database IDs are then attached to the contract before it is
    saved.

    :param purchase_order_object: The PurchaseOrder object instance.
    :type purchase_order_object: A PurchaseOrder object instance.
    """
    order = purchase_order_object

    log.debug("Adding purchase order %s to contracts table",
              order.purchaseorder)

    contract = Contract()

    # TODO: Might need a follow-up method that pulls from the DocumentCloud
    # project and inserts its ID into this row (contract.doc_cloud_id).
    contract.contractnumber = order.k_number
    contract.purchaseordernumber = order.purchaseorder
    contract.description = order.description
    contract.title = order.title
    contract.dateadded = date.today()

    # The lookups below rely on these rows existing.
    self._add_department_if_missing(order.department)
    self._add_vendor_if_missing(
        order.vendor_name, vendor_id_city=order.vendor_id_city)

    contract.departmentid = self._get_department_id(order.department)
    contract.vendorid = self._get_database_vendor_id(order.vendor_name)

    self._add_contract_to_local_database(contract)
def _download_attachment(self, attachment):
    '''
    Download one purchase order attachment unless its PDF is already saved.

    :param attachment: The attachment link. It must provide
        ``.get('href')``, and the href must contain the attachment's
        numeric ID.
    '''
    # The first run of digits in the href is used as the attachment's ID.
    # It drives the download and doubles as the local file name, so the
    # files on disk show which attachments we already have.
    href = attachment.get('href')
    city_attachment_id = re.search('[0-9]+', href).group()
    log.debug('Gathering data for attachment %s', city_attachment_id)

    document_path = '%s/%s.pdf' % (DOCUMENTS_DIR, city_attachment_id)
    display_name = self._get_attachment_display_name(city_attachment_id)

    if not os.path.isfile(document_path):
        self._download_attachment_file(
            city_attachment_id, display_name, document_path)
        return

    # The PDF was downloaded on an earlier run.
    log.info('Already have PDF for attachment %s', city_attachment_id)
def _download_attachment_file(self, attachment_id, display_name,
                              document_file_path):
    '''
    Download the attachment file found on contract page.

    The file is fetched by running ``curl`` with a multipart form POST
    against the purchasing site's attachment endpoint; curl writes the
    response to ``document_file_path``. Nothing is done when that path
    already exists.

    :param attachment_id: The city's internal attachment ID.
    :type attachment_id: string
    :param display_name: Sent as the form's ``displayName`` field.
    :type display_name: string
    :param document_file_path: The path for where to save the \
    attachment file.
    :type document_file_path: string
    '''
    log.debug('Saving PDF for attachment "%s" with city ID %s',
              display_name, attachment_id)

    # Guard on the destination file. The earlier check tested
    # ``attachment_id`` as though it were a path, so it did not reflect
    # whether the download had already been saved.
    if os.path.exists(document_file_path):
        return

    detail_url = ('http://www.purchasing.cityofno.com/bso/external/'
                  'document/attachments/attachmentFileDetail.sdo')

    referer = (
        detail_url +
        '?fileNbr=%s&docId=%s' % (attachment_id, self.purchaseorder) +
        '&docType=P&releaseNbr=0&parentUrl=/external/purchaseorder/'
        'poSummary.sdo&external=true'
    )

    # The boundary declared in Content-Type; each part of the body is
    # introduced by '--' followed by this value.
    boundary = '----WebKitFormBoundaryGAY56ngXMDvs6qDP'
    delimiter = '--' + boundary

    # NOTE(review): the session cookie and browser headers are hard-coded.
    headers = [
        'Pragma: no-cache',
        'Origin: http://www.purchasing.cityofno.com',
        'Accept-Encoding: gzip, deflate',
        'Accept-Language: en-US,en;q=0.8',
        'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X '
        '10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/43.0.2357.81 Safari/537.36',
        'Content-Type: multipart/form-data; boundary=' + boundary,
        'Accept: text/html,application/xhtml+xml,application/'
        'xml;q=0.9,image/webp,*/*;q=0.8',
        'Cache-Control: no-cache',
        'Referer: ' + referer,
        'Cookie: JSESSIONID=5FC84DA3EC020E1FC19700761C0EBEB3',
        'Connection: keep-alive',
    ]

    # Form fields in the exact order they are sent.
    form_fields = (
        ('mode', 'download'),
        ('parentUrl', '/external/purchaseorder/poSummary.sdo'),
        ('parentId', ''),
        ('fileNbr', attachment_id),
        ('workingDir', ''),
        ('docId', self.purchaseorder),
        ('docType', 'P'),
        ('docSubType', ''),
        ('releaseNbr', '0'),
        ('downloadFileNbr', attachment_id),
        ('itemNbr', '0'),
        ('currentPage', '1'),
        ('querySql', ''),
        ('sortBy', ''),
        ('sortByIndex', '0'),
        ('sortByDescending', 'false'),
        ('revisionNbr', '0'),
        ('receiptId', ''),
        ('vendorNbr', ''),
        ('vendorGrp', ''),
        ('invoiceNbr', ''),
        ('displayName', display_name),
    )

    parts = [
        '%s\r\nContent-Disposition: form-data; name="%s"\r\n\r\n%s\r\n'
        % (delimiter, field_name, field_value)
        for field_name, field_value in form_fields
    ]

    # NOTE(review): the leading "$'" and trailing "'" look like shell
    # quoting. No shell is involved here, so they are passed to curl as
    # literal characters. They are kept so the request stays unchanged.
    form_body = "$'" + ''.join(parts) + delimiter + "--\r\n'"

    # TODO: convert to Python
    command = ['curl', '-s', '-o', document_file_path, detail_url]
    for header in headers:
        command.extend(['-H', header])
    command.extend(['--data-binary', form_body, '--compressed'])

    call(command)