def save_to_csv(self, response, **meta):
    """Build and return a massage-therapy licence item from ``meta``.

    Every keyword argument read below is required; a missing one raises
    ``KeyError``. The company name is run through ``self._getDBA`` and the
    first two elements of its result are stored as ``company_name`` and
    ``dba_name`` respectively.
    """
    loader = ItemLoader(item=AlMassageTherapyLicensesSpiderItem(),
                        response=response)
    # (field, value) pairs, listed in the order they are added to the loader.
    field_values = (
        ('ingestion_timestamp', Utils.getingestion_timestamp()),
        ('sourceName', 'AL_Massage_Therapy_Licenses'),
        ('url', 'http://www.almtbd.alabama.gov/licensee.aspx'),
        ('category', meta['category']),
        ('company_name', self._getDBA(meta['company_name'])[0]),
        ('dba_name', self._getDBA(meta['company_name'])[1]),
        ('approved by', meta['approved_by']),
        ('permit_lic_no', meta['permit_lic_no']),
        ('renewal date', meta['Renewal_Date']),
        ('permit_lic_status', meta['permit_lic_status']),
        ('location_address_string', meta['location_address_string']),
        ('mail_address_string', meta['mailing_address']),
        ('person_name', meta['person_name']),
        ('person_subtype', meta['person_subtype']),
        ('company_phone', meta['company_phone']),
        ('company_fax', meta['fax']),
        ('permit_lic_eff_date', meta['permit_lic_eff_date']),
        ('permit_lic_exp_date', meta['permit_lic_exp_date']),
        ('approved date', meta['approved_date']),
        ('company_email', meta['email']),
        ('company_website', meta['website_address']),
        ('permit_lic_desc', meta['permit_lic_desc']),
        ('permit_type', 'therapy_license'),
    )
    for field_name, field_value in field_values:
        loader.add_value(field_name, field_value)
    return loader.load_item()
Ejemplo n.º 2
0
 def save_to_csv(self, response, **meta):
     """Build and return a Kankakee food-inspection item from ``meta``.

     All keyword arguments read below are required; a missing one raises
     ``KeyError``.
     """
     loader = ItemLoader(item=IlKankakeeFoodInspectionsSpiderItem(),
                         response=response)
     # (field, value) pairs, listed in the order they are added to the loader.
     field_values = (
         ('ingestion_timestamp', Utils.getingestion_timestamp()),
         ('sourceName', 'IL_Kankakee_Food_Inspections'),
         ('url',
          'http://www.kankakeehealth.org/environmental-health/food-sanitation/food_inspections.html'),
         ('violation_date', meta['violation_date']),
         ('permit_lic_no', meta['permit_lic_no']),
         ('location_address_string', meta['location_address_string']),
         ('inspector_comments', meta['inspector_comments']),
         ('inspection_date', meta['inspection_date']),
         ('company_name', meta['company_name']),
         ('violation_rule_id', meta['rule_id']),
         ('violation_subtype', meta['violation_subtype']),
         ('inspection_pass_fail', meta['inspection_pass_fail']),
         ('violation category', meta['violation_category']),
         ('dba_name', meta['dba_name']),
         ('inspection_type', meta['inspection_type']),
         ('violation_description', meta['violation_description']),
         ('risk category', meta['risk']),
         ('abate_date', meta['abate_date']),
         ('abate_status', meta['abate_status']),
         ('temperature observations-item/location', meta['temperature']),
         ('inspection_subtype', meta['inspection_subtype']),
         ('violation_rule', meta['rule']),
         ('permit_lic_desc', meta['permit_lic_desc']),
         ('permit_type', 'restaurant_license'),
         ('violation_type', meta['violation_type']),
     )
     for field_name, field_value in field_values:
         loader.add_value(field_name, field_value)
     return loader.load_item()
Ejemplo n.º 3
0
 def parse_row(self, response, row):
     """Yield one item built from a single row of the facility dataset.

     ``row`` is read like a mapping: 'End Stage Renal Disease', 'Address',
     'City' and 'Zip' are required keys, the remaining columns are optional.
     ``response`` is not used.
     """
     loader = ItemLoader(item=IlHospitalLicensesSpiderItem())
     loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     loader.add_value(
         'url',
         'https://data.illinois.gov/dataset/410idph_hospital_directory/resource/9bdedb85-77f3-490a-9bbd-2f3f5f227981'
     )
     loader.add_value('sourceName', 'IL_Renal_Desease_Facility_Licenses')
     loader.add_value('permit_type', "medical_license")

     name = self._getDBA(row['End Stage Renal Disease'])
     company_name = name[0]
     if ' -' in str(company_name):
         company_name = str(company_name).replace(' -', '')

     street = row['Address']
     city = row['City']
     # Drop anything from the last '.' onwards (e.g. a zip parsed as a float).
     zip_text = str(row['Zip'])
     if '.' in zip_text:
         zip_text = zip_text[:zip_text.rfind('.')]
     address = self.format__address_4(street, city, 'IL', zip_text)

     loader.add_value('dba_name', name[1])
     loader.add_value('permit_lic_no', row.get('Medicare #', ''))

     raw_expiry = row.get('Exp. Date')
     expiry = self.format_date(raw_expiry) if raw_expiry else ''
     loader.add_value('permit_lic_exp', expiry)

     loader.add_value('company_name', company_name)
     loader.add_value('location_address_string', address)
     loader.add_value('county', row.get('County', ''))

     if name[0]:
         description = "Medical License for " + company_name
     else:
         description = "Medical License"
     loader.add_value('permit_lic_desc', description)

     loader.add_value('company_phone', row.get('Phone', ''))
     loader.add_value('company_subtype',
                      row.get('Type', '') or 'Medical License')
     yield loader.load_item()
 def save_to_csv(self, response, **meta):
     """Build and return an Iowa City building-permit item from ``meta``.

     All keyword arguments read below are required; a missing one raises
     ``KeyError``.
     """
     loader = ItemLoader(item=IaJohnsonIowacityBuildingPermitsSpiderItem(),
                         response=response)
     # (field, value) pairs, listed in the order they are added to the loader.
     field_values = (
         ('ingestion_timestamp', Utils.getingestion_timestamp()),
         ('sourceName', 'IA_Johnson_IowaCity_Building_Permits'),
         ('url', 'http://www.iowa-city.org/IcgovApps/Tidemark/Search'),
         ('permit_lic_no', meta['case_number']),
         ('permit_lic_status', meta['status']),
         ('location_address_string', meta['address']),
         ('permit_lic_desc', meta['description']),
         ('case actions-date', meta['case_action_date']),
         ('case actions-description', meta['case_action_description']),
         ('case actions-status', meta['case_action_status']),
         ('case action-notes', meta['case_action_notes']),
         ('inspection_date', meta['inspection_date']),
         ('inspection_type', meta['inspection_type']),
         ('inspection_pass_fail', meta['inspection_pass_fail']),
         ('inspection_description', meta['inspection_description']),
         ('violation_date', meta['violation_date']),
         ('violation_type', meta['violation_type']),
         ('permit_type', 'building_permit'),
     )
     for field_name, field_value in field_values:
         loader.add_value(field_name, field_value)
     return loader.load_item()
 def save_to_csv(self,response,**meta):
     """Build and return a Kittitas building-permit item from ``meta``.

     A truthy valuation has the literal '$0.00' stripped out. A truthy
     owner name is split with ``self._getDBA`` into the mixed name and the
     DBA name; otherwise the caller-supplied ``dba_name`` is used.
     ``response`` is not used.
     """
     loader = ItemLoader(item=WaKittitasBuildingPermitsSpiderItem())
     loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     loader.add_value('sourceName', 'WA_Kittitas_Building_Permits')
     loader.add_value('url', 'https://www.co.kittitas.wa.us/cds/building/reports.aspx')
     loader.add_value('report date', meta['date'])
     loader.add_value('permit_lic_no', meta['permit_number'])
     loader.add_value('permit_subtype', meta['permit_type'])
     loader.add_value('permit_lic_desc', meta['permit_lic_desc'])
     loader.add_value('location_address_string', meta['address'])

     valuation = meta['valuation']
     if valuation:
         valuation = valuation.replace('$0.00', '')
     loader.add_value('permit_lic_value', valuation)
     loader.add_value('permit_lic_fee', meta['fees'])

     owner = meta['owner_name']
     if owner:
         loader.add_value('mixed_name', self._getDBA(owner)[0])
         loader.add_value('dba_name', self._getDBA(owner)[1])
     else:
         loader.add_value('mixed_name', owner)
         loader.add_value('dba_name', meta['dba_name'])

     trailing_fields = (
         ('mixed_subtype', meta['mixed_subtype']),
         ('mail_address_string', meta['mailing']),
         ('contractor_company', meta['contractor']),
         ('contractor_dba', meta['contractor_dba']),
         ('parcel #', meta['parcel_number']),
         ('permit_lic_eff_date', meta['issue_date']),
         ('permit_type', 'building_permit'),
     )
     for field_name, field_value in trailing_fields:
         loader.add_value(field_name, field_value)
     return loader.load_item()
Ejemplo n.º 6
0
 def spider_opened(self, spider):
     """Open the CSV output file for ``spider`` and start the exporter.

     The file name is ``self.file_name`` or, when that is unset,
     ``'<spider.name>_items.csv'``. If the spider has a truthy ``start``
     attribute, ``_<start>_<end>`` (alphanumeric characters only) is
     inserted before the extension and the result is stored back on
     ``self.file_name``. In append mode the file goes into a
     ``resume_<timestamp>`` sub-directory.

     Sets ``self.file`` and ``self.exporter``.
     """

     def alnum_only(text):
         # Keep letters and digits only; a missing value contributes nothing.
         return ''.join(ch for ch in (text or '') if ch.isalnum())

     # Resolve the default name first so the date-range suffix below also
     # works when no explicit file name was configured.
     file_name = self.file_name or '%s_items.csv' % spider.name

     if getattr(spider, 'start', None):
         stem, extension = os.path.splitext(file_name)
         file_name = "{}_{}_{}{}".format(
             stem, alnum_only(spider.start),
             alnum_only(getattr(spider, 'end', None)), extension)
         self.file_name = file_name

     # NOTE(review): STORAGE_DIR is read from the bare `settings` name while
     # JIRA_ID is read from `self._settings` -- kept as in the original.
     path_parts = [settings.get('STORAGE_DIR'), self._settings.get('JIRA_ID')]
     if self.appendMode:
         path_parts.append('resume_{}'.format(Utils.getingestion_timestamp()))
     path_parts.append(file_name)
     outpath = os.path.join(*path_parts)

     self.createFolder(outpath)
     # Kept open on the instance; this method does not close it.
     self.file = open(outpath, 'w+b')

     kwargs = {'delimiter': self.delimiter}
     if self.fields_to_export:
         kwargs['fields_to_export'] = self.fields_to_export
     if self.null_header:
         kwargs['null_header'] = self.null_header
     self.exporter = CustomCsvItemExporter(self.file, **kwargs)
     self.exporter.start_exporting()
Ejemplo n.º 7
0
 def save_csv(self, response, data_dic):
     """Return an ItemLoader pre-filled with HI_SOS constants and ``data_dic``.

     Note that the loader itself is returned, not a loaded item.
     """
     loader = ItemLoader(item=HiSosSpiderItem(), response=response)
     constants = (
         ('ingestion_timestamp', Utils.getingestion_timestamp()),
         ('sourceName', 'HI_SOS'),
         ('url', 'https://hbe.ehawaii.gov/documents/search.html'),
         ('permit_type', 'business_license'),
     )
     for field_name, field_value in constants:
         loader.add_value(field_name, field_value)
     # Copy every scraped key/value pair onto the loader as-is.
     for key in data_dic:
         value = data_dic[key]
         loader.add_value(key, value)
     return loader
Ejemplo n.º 8
0
 def save_csv(self,response,data_dic):
     """Return an ItemLoader holding IL agriculture constants plus ``data_dic``.

     The loader itself is returned, not a loaded item. ``response`` is not
     used.
     """
     loader = ItemLoader(item=IlAgricultureLicensesSpiderItem())
     constants = (
         ('ingestion_timestamp', Utils.getingestion_timestamp()),
         ('sourceName', 'IL_Agriculture_Licenses'),
         ('permit_type', 'agriculture_license'),
         ('url', 'https://www2.illinois.gov/sites/agr/licenses/Pages/A-Z-License-List.aspx'),
     )
     for field_name, field_value in constants:
         loader.add_value(field_name, field_value)
     # Copy every scraped key/value pair onto the loader as-is.
     for key in data_dic:
         value = data_dic[key]
         loader.add_value(key, value)
     return loader
 def save_to_csv(self, response, **meta):
     """Build and return a Kittitas building-permit item from ``meta``.

     Besides copying values, this normalises three inputs:

     * address -- an address with more than three commas is cut right after
       the first "WA <digits>"; a shorter address containing 'PERMIT' is
       replaced by 'WA'; a leading "...:," / "...:AL," label is dropped; and
       ', WA' is appended when no comma remains.
     * parcel number -- truncated at the first 'T', 'F', 'M' or 'B'.
     * issue date -- when it contains ':', the text between the first and
       second ':' is used.

     A truthy owner name is split with ``self._getDBA`` into the mixed name
     and DBA name; otherwise the caller-supplied ``company_name`` and
     ``dba_name`` are used. ``response`` is not used.
     """
     il = ItemLoader(item=WaKittitasBuildingPermitsSpiderItem())
     il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     il.add_value('sourceName', 'WA_Kittitas_Building_Permits')
     il.add_value(
         'url', 'https://www.co.kittitas.wa.us/cds/building/reports.aspx')
     il.add_value('report date', meta['date'])
     il.add_value('permit_lic_no', meta['permit_number'])
     il.add_value('permit_subtype', meta['permit_type'])
     il.add_value('permit_lic_desc', meta['permit_lic_desc'])

     address = meta['address']
     if len(address.split(',')) > 3:
         # Raw string avoids the invalid "\d" escape; if no state/zip is
         # found the address is left untouched instead of raising.
         state_zip = re.search(r"WA \d+", address)
         if state_zip:
             address = address[:state_zip.start()] + state_zip.group()
     elif 'PERMIT' in address:
         address = 'WA'
     if ':,' in address or ':AL,' in address:
         address = address.replace(':AL,', ':,').split(':,')[1]
     if ',' not in address:
         address = address + ', WA'
     il.add_value(
         'location_address_string',
         address.replace('Address:', 'WA').replace('WA, WA', 'WA'))

     il.add_value('permit_lic_value', meta['valuation'])
     il.add_value('permit_lic_fee', meta['fees'])

     owner = meta['owner_name']
     if owner:
         # Split once and reuse both parts.
         owner_parts = self._getDBA(owner)
         il.add_value('mixed_name', owner_parts[0])
         il.add_value('dba_name', owner_parts[1])
     else:
         il.add_value('mixed_name', meta['company_name'])
         il.add_value('dba_name', meta['dba_name'])

     il.add_value('mixed_subtype', meta['mixed_subtype'])
     il.add_value('mail_address_string', meta['mailing'])
     il.add_value('contractor_company', meta['contractor'])
     il.add_value('contractor_dba', meta['contractor_dba'])

     # Keep only the part before the first T/F/M/B marker, if any.
     il.add_value('parcel #',
                  re.split(r'[TFMB]', meta['parcel_number'], maxsplit=1)[0])

     issue_date = meta['issue_date']
     if ':' in issue_date:
         issue_date = issue_date.split(':')[1]
     il.add_value('permit_lic_eff_date', issue_date)

     il.add_value('permit_type', 'building_permit')
     return il.load_item()
Ejemplo n.º 10
0
 def save_csv(self,response,data_dic):
     """Return an ItemLoader holding AL food-inspection constants plus ``data_dic``.

     Values pass through a default input processor that strips them, removes
     tags, collapses whitespace runs to a single space and replaces escape
     characters. The loader itself is returned, not a loaded item.
     """

     def strip_value(v):
         return v.strip()

     def collapse_whitespace(data):
         return re.sub(r'\s+', ' ', data) if data else ''

     loader = ItemLoader(item=AlFoodInspectionsSpiderItem(), response=response)
     loader.default_input_processor = MapCompose(
         strip_value, remove_tags, collapse_whitespace, replace_escape_chars)
     constants = (
         ('ingestion_timestamp', Utils.getingestion_timestamp()),
         ('sourceName', 'AL_Food_Inspections'),
         ('url', 'http://www.alabamapublichealth.gov/foodscores/index.html'),
     )
     for field_name, field_value in constants:
         loader.add_value(field_name, field_value)
     # Copy every scraped key/value pair onto the loader.
     for key in data_dic:
         value = data_dic[key]
         loader.add_value(key, value)
     return loader
Ejemplo n.º 11
0
 def save_csv(self, response, data_dic):
     """Return an ItemLoader holding ME_SOS constants plus cleaned ``data_dic``.

     Each scraped value is passed through ``self.remove_tag`` before being
     added. The loader itself is returned, not a loaded item.
     """
     # NOTE(review): the response is passed as `link_page=` rather than
     # `response=` like the sibling loaders -- confirm this is intended.
     loader = ItemLoader(item=MeSosSpiderItem(), link_page=response)
     constants = (
         ('ingestion_timestamp', Utils.getingestion_timestamp()),
         ('sourceName', 'ME_SOS'),
         ('permit_type', 'business_license'),
         ('url', 'https://icrs.informe.org/nei-sos-icrs/ICRS?MainPage=x'),
     )
     for field_name, field_value in constants:
         loader.add_value(field_name, field_value)
     for key in data_dic:
         cleaned = self.remove_tag(data_dic[key])
         loader.add_value(key, cleaned)
     return loader
Ejemplo n.º 12
0
 def save_csv(self, response, data_dic):
     """Return an ItemLoader pre-filled with OH_SOS constants and ``data_dic``.

     The loader itself is returned, not a loaded item.
     """
     loader = ItemLoader(item=OhSosSpiderItem(), response=response)
     constants = (
         ('ingestion_timestamp', Utils.getingestion_timestamp()),
         ('sourceName', 'OH_SOS'),
         ('url',
          'https://www5.sos.state.oh.us/ords/f?p=100:1:::NO:1:P1_TYPE:NAME'),
         ('permit_type', 'business_license'),
     )
     for field_name, field_value in constants:
         loader.add_value(field_name, field_value)
     # Copy every scraped key/value pair onto the loader as-is.
     for key in data_dic:
         value = data_dic[key]
         loader.add_value(key, value)
     return loader
Ejemplo n.º 13
0
 def parse_details(self, response):
     """Parse one results page: yield detail requests, items and follow-ups.

     For every result row with a detail link, a request for the detail page
     is yielded and the value handed back by that ``yield`` is parsed into
     a cosmetology-licence item. Afterwards either the next results page is
     requested or, when pending search elements remain, the start URL is
     requested again.
     """
     # NOTE(review): the value of the ``yield`` expression is treated as the
     # detail-page response, so something outside this method must send
     # responses back into the generator (e.g. a decorator not visible
     # here) -- confirm.
     detail_base = 'https://alboc.glsuite.us/GLSuiteWeb/Clients/ALBOC/public/'
     label_xpath = '//*[contains(text(),"%s")]/following-sibling::td/span/text()'

     def labelled(page, label):
         # Text of the span in the cell following the element whose text
         # contains ``label``.
         return page.xpath(label_xpath % label).extract_first()

     result_rows = response.xpath('//*[@id="ctl00_ContentPlaceHolder1_dtgResults"]//tr')[1:]
     for result_row in result_rows:
         link = result_row.xpath('td[10]/a/@href').extract_first()
         company_name = result_row.xpath('td[4]/text()').extract_first()
         first_name = result_row.xpath('td[1]/text()').extract_first()
         middle_name = result_row.xpath('td[2]/text()').extract_first()
         last_name = result_row.xpath('td[3]/text()').extract_first()
         person_name = self.format_name(first_name, middle_name, last_name)
         # Fall back to the person's name when the company cell is empty or
         # too short to be meaningful.
         if not (company_name and len(company_name) > 2):
             company_name = person_name
         if not link:
             continue

         detail_page = yield scrapy.Request(url=detail_base + str(link),
                                            dont_filter=True)
         city = labelled(detail_page, 'City')
         state = labelled(detail_page, 'State')
         location = city + ', ' + state if city and state else state
         licence_no = labelled(detail_page, 'License Number')
         licence_type = labelled(detail_page, 'License Type')
         expiry_date = labelled(detail_page, 'License Expiration Date')
         licence_status = labelled(detail_page, 'License Status')
         discipline = labelled(detail_page, 'Disciplinary Action')

         if company_name and len(company_name) > 2:
             description = 'Cosmetology License for' + ' ' + str(company_name)
         else:
             description = 'Cosmetology License'
         if not (location and len(location) > 2):
             location = 'AL'

         loader = ItemLoader(item=AlCosmetologyLicensesSpiderItem(),
                             response=response)
         field_values = (
             ('ingestion_timestamp', Utils.getingestion_timestamp()),
             ('url', 'https://alboc.glsuite.us/GLSuiteWeb/Clients/ALBOC/public/VerificationSearch.aspx'),
             ('sourceName', 'AL_Cosmetology_Licenses'),
             ('permit_lic_exp_date', expiry_date),
             ('permit_lic_status', licence_status),
             ('person_name', person_name),
             ('violation_type', ''),
             ('disciplinary action', discipline),
             ('permit_lic_desc', description),
             ('permit_type', 'cosmetology_license'),
             ('location_address_string', location),
             ('permit_lic_no', licence_no),
             ('company_name', company_name),
             ('permit_subtype', licence_type),
         )
         for field_name, field_value in field_values:
             loader.add_value(field_name, field_value)
         yield loader.load_item()

     next_href = response.xpath('//td[@colspan="10"]/span/following-sibling::a/@href').extract_first()
     if next_href:
         post_back = JavaScriptUtils.getValuesFromdoPost(next_href)
         form_data = {
             '__EVENTTARGET': post_back['__EVENTTARGET'],
             '__EVENTARGUMENT': post_back['__EVENTARGUMENT'],
         }
         # Carry the page's hidden state fields over to the next POST.
         for hidden_id in ('__VIEWSTATE', '__VIEWSTATEGENERATOR',
                           '__EVENTVALIDATION', '__VIEWSTATEENCRYPTED'):
             form_data[hidden_id] = response.xpath(
                 '//*[@id="%s"]/@value' % hidden_id).extract_first()
         yield scrapy.FormRequest(url=response.url, method='POST',
                                  formdata=form_data,
                                  callback=self.parse_details,
                                  dont_filter=True)
     elif len(self.search_element_a) > 0:
         yield scrapy.Request(url=self.start_urls[0], callback=self.parse,
                              dont_filter=True)
     elif len(self.search_element) > 0:
         self.check_first = True
         yield scrapy.Request(url=self.start_urls[0], callback=self.parse,
                              dont_filter=True)
Ejemplo n.º 14
0
 def save_csv(self, response, data_dic):
     """Return an ItemLoader holding GA Henry permit constants plus ``data_dic``.

     The loader itself is returned, not a loaded item.
     """
     loader = ItemLoader(item=GaHenryBuildingPermitsSpiderItem(),
                         response=response)
     constants = (
         ('ingestion_timestamp', Utils.getingestion_timestamp()),
         ('sourceName', 'GA_Henry_Building_Permits'),
         ('url', 'https://www.sagesgov.com/henrycounty-ga/Portal/Search.aspx'),
         ('permit_type', 'building_permit'),
     )
     for field_name, field_value in constants:
         loader.add_value(field_name, field_value)
     # Copy every scraped key/value pair onto the loader as-is.
     for key in data_dic:
         value = data_dic[key]
         loader.add_value(key, value)
     return loader
 def parse_row(self, response, row):
     """Yield one asbestos-worker licence item built from a dataset row.

     A licence number shorter than nine characters gets a single '0'
     prepended. ``response`` is not used.
     """
     loader = ItemLoader(item=IlAsbestosWorkerLicensesSpiderItem())
     loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     loader.add_value('sourceName', 'IL_Asbestos_Worker_Licenses')
     loader.add_value('url', 'https://data.illinois.gov/dataset/378idph_asbestos_licensed_workers/resource/f3266216-1c0e-4326-acb7-0f4341d1b463')

     person_address = self.format__address_4(
         row['Expr1'], row['tech_city'], row['tech_state'], row['tech_zip'])
     loader.add_value('person_address_string', person_address)

     full_name = row['tech_name'] + ' ' + row['LAST_NAME']
     loader.add_value('person_name', full_name)

     loader.add_value('permit_lic_desc', 'Asbestos Contractor License')
     loader.add_value('dba_name', '')
     loader.add_value('person_phone', row['Expr2'])
     loader.add_value('county', row['COUNTY'])

     licence_no = row['lic_id_number']
     if len(licence_no) < 9:
         licence_no = '0' + licence_no
     loader.add_value('permit_lic_no', licence_no)

     loader.add_value('permit_type', 'asbestos_contractor_license')
     yield loader.load_item()
 def save_to_csv(self, response, **meta):
     """Build and return an alcohol-server-educator item from ``meta``.

     The company name is split with ``self._getDBA`` into company and DBA
     names, and the literal 'Phone:' is removed from the phone value.
     ``response`` is not used.
     """
     loader = ItemLoader(item=OrAlcoholServerEducatorLicensesSpiderItem())
     # NOTE(review): 'location_address_String' has a capital "S", unlike the
     # 'location_address_string' used elsewhere, and 'company_website' is
     # filled from meta['email'] -- kept as-is; confirm against the item.
     field_values = (
         ('ingestion_timestamp', Utils.getingestion_timestamp()),
         ('sourceName', 'OR_Alcohol_Server_Educator_Licenses'),
         ('url', meta['url']),
         ('type', meta['type_val']),
         ('company_name', self._getDBA(meta['company_name'])[0]),
         ('dba_name', self._getDBA(meta['company_name'])[1]),
         ('classes in/online course in', meta['class_in']),
         ('location_address_String', 'OR'),
         ('company_phone', meta['phone'].replace('Phone:', '')),
         ('company_website', meta['email']),
     )
     for field_name, field_value in field_values:
         loader.add_value(field_name, field_value)
     return loader.load_item()
Ejemplo n.º 17
0
    def save_to_csv(self, response, **data_pass):
        """Build and return a Mecklenburg building-permit item.

        Values are taken from ``data_pass``; every key read below is
        required and a missing one raises ``KeyError``. All values go
        through a default input processor that strips them, removes tags
        and replaces escape characters.

        Each field is added exactly once. The original added
        ``permit_subtype`` twice, which made the loader collect the same
        value twice for that field.
        """
        il = ItemLoader(item=VaMecklenburgBuildingPermitsSpiderItem(),
                        response=response)
        il.default_input_processor = MapCompose(lambda v: v.strip(),
                                                remove_tags,
                                                replace_escape_chars)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'VA_Mecklenburg_Building_Permits')
        il.add_value(
            'url',
            'https://webpermit.mecklenburgcountync.gov/Default.aspx?PossePresentation=SearchByAddress'
        )

        # Item fields in output order. A field reads the ``data_pass`` key of
        # the same name unless it is remapped in ``source_key``.
        source_key = {'occupancy_subtype': 'occupancy type'}
        scraped_fields = (
            'person_address_string',
            'permit_lic_no',
            'master #',
            'submittal #',
            'permit_subtype',
            'permit_lic_status',
            'location_address_string',
            'parcel #',
            'occupancy_subtype',
            'occupancy type',
            'usdc code',
            'type of building',
            'equipment type',
            'permit_lic_fee',
            'mixed_name',
            'mixed_subtype',
            'mixed_phone',
            'mixed_contractor_name',
            'contractor id',
            'contractor_phone',
            'contractor_lic_no',
            'contractor_lic_type',
            'contractor_address_string',
            'inspection_id',
            'inspection_subtype',
            'inspection_date',
            'inspection_pass_fail',
            'inspection_type',
        )
        for field_name in scraped_fields:
            il.add_value(field_name,
                         data_pass[source_key.get(field_name, field_name)])

        il.add_value('permit_type', 'building_permit')
        return il.load_item()
Ejemplo n.º 18
0
    def parse_pdf(self, response):
        """Yield one forest-practitioner licence item per extracted record.

        ``self.__extractData(response)`` is iterated as rows of records;
        each record is read like a mapping with the keys 'phone', 'f_name',
        'l_name', 'expiration', 'cert', 'level', 'address', 'city', 'state'
        and 'zip'.

        The expiration text is split on single spaces: the first token is
        the expiry date and the second, if present, the extended-permit
        code. Known level codes are expanded to their full names; unknown
        codes are passed through unchanged.
        """
        level_names = {
            'F': 'FORESTER',
            'SFPH': 'SUPERVISING FOREST PRODUCTS HARVESTER',
            'FPH': 'FOREST PRODUCTS HARVESTER',
        }
        for row in self.__extractData(response):
            for col in row:
                il = ItemLoader(item=CtForestPractitionerLicenseSpiderItem())
                il.default_input_processor = MapCompose(
                    lambda v: v.strip(), remove_tags, replace_escape_chars)
                il.add_value('ingestion_timestamp',
                             Utils.getingestion_timestamp())
                il.add_value(
                    'url',
                    'https://www.depdata.ct.gov/forestry/ForestPractitioner/directry.pdf'
                )
                il.add_value('sourceName', 'CT_Forest_Practitioner_License')
                il.add_value('person_phone', col['phone'])
                il.add_value('person_name',
                             col['f_name'] + ' ' + col['l_name'])

                expiration = col['expiration']
                if ' ' in expiration:
                    # Split once; only the first two tokens are used.
                    tokens = expiration.split(' ')
                    date, e_permit = tokens[0], tokens[1]
                else:
                    date, e_permit = expiration, ''
                il.add_value('permit_lic_exp_date', date)

                if '490' in e_permit:
                    e_permit = "490- permitted to assist landowners seeking classification of their land as 'Forest Land'"
                il.add_value('extended permit', e_permit)
                il.add_value('permit_lic_no', col['cert'])

                level_desc = level_names.get(col['level'], col['level'])
                il.add_value('level', col['level'])
                il.add_value('permit_subtype', level_desc)
                il.add_value('permit_lic_desc', level_desc)
                il.add_value('permit_type', 'forester_license')

                location_address_string = (
                    col['address'] + ', ' + col['city'] + ', ' +
                    col['state'] + ' ' + col['zip'])
                il.add_value('location_address_string',
                             location_address_string)
                yield il.load_item()
Ejemplo n.º 19
0
    def save_to_csv(self, response, data_dic):
        """Return an ItemLoader filled with NH_SOS business data.

        ``data_dic`` supplies the scraped values; every key read below is
        required. The loader itself is returned, not a loaded item.

        Placeholder texts ('NOT-AVAILABLE', 'NONE') are blanked, the DBA
        name falls back to the second element of
        ``self._getDBA(company_name)`` when the scraped one is missing or
        at most three characters long, and the location falls back to 'NH'
        when missing or at most five characters long.
        """
        loader = ItemLoader(item=NhSosSpiderItem(), response=response)
        loader.default_input_processor = MapCompose(lambda v: v.strip(),
                                                    remove_tags,
                                                    replace_escape_chars)
        loader.add_value('ingestion_timestamp',
                         Utils.getingestion_timestamp())
        loader.add_value(
            'url', 'https://quickstart.sos.nh.gov/online/BusinessInquire')
        loader.add_value('sourceName', 'NH_SOS')

        created = data_dic['business_creation_date'].replace(
            'NOT-AVAILABLE', '')
        loader.add_value('creation_date', created)

        dba_name = data_dic['dba_name']
        if not (dba_name and len(dba_name) > 3):
            dba_name = self._getDBA(data_dic['company_name'])[1]
        loader.add_value('dba_name', dba_name)

        loader.add_value('non_profit_indicator',
                         data_dic['non_profit_indicator'])
        loader.add_value('mail_address_string',
                         data_dic['mailing_address_string'])
        loader.add_value('status', data_dic['business_status'])
        loader.add_value('citizenship / state of formation',
                         data_dic['state_of_formation'])
        loader.add_value('duration', data_dic['duration'])

        mixed_name = data_dic['mixed_name']
        if mixed_name is None:
            mixed_name = ''
        loader.add_value('mixed_name', mixed_name)

        loader.add_value('company_name',
                         self._getDBA(data_dic['company_name'])[0])
        loader.add_value('company_phone',
                         data_dic['phone'].replace('NONE', ''))
        loader.add_value('inactive_date', data_dic['inactive_date'])
        loader.add_value('homestate name',
                         self._getDBA(data_dic['host_name'])[0])
        loader.add_value('naics_description', data_dic['naics_description'])
        loader.add_value('permit_type', 'business_license')
        loader.add_value('mixed_subtype', data_dic['mixed_subtype'])
        loader.add_value('previous name', data_dic['previous_name'])
        loader.add_value('company_subtype',
                         self._getDBA(data_dic['business_type'])[0])
        loader.add_value('entity_id', data_dic['business_id'])

        location = data_dic['location_address_string']
        if not (location and len(location) > 5):
            location = 'NH'
        loader.add_value('location_address_string', location)

        loader.add_value('company_email',
                         data_dic['business_mail'].replace('NONE', ''))
        loader.add_value('person_address_string',
                         data_dic['person_address_string'])
        return loader
Ejemplo n.º 20
0
 def save_csv(self,response,main_res,permit_lic_no):
     """Yield one septic-system licence item scraped from ``main_res``.

     ``main_res`` is the detail page queried by XPath, ``response`` supplies
     ``meta['city']``, and ``permit_lic_no`` is stored unchanged.

     The company is the first non-blank of corporate owner, owner name and
     designer. Both the company and the designer are split with
     ``self._getDBA``; the designer's parts are the fallback for an empty
     company or DBA name and also supply ``person_name``.

     Fixes over the previous version: the undefined name ``company`` is
     gone, designer fallbacks use the ``_getDBA`` result rather than
     indexing the raw string, a "Plat ... Sublot ..." value fills ``sublot``
     instead of ``lot``, and unmatched patterns or missing texts give empty
     strings instead of raising.
     """

     def text_after(label):
         # First text node following the <em> whose text contains ``label``.
         found = main_res.xpath(
             "//em[contains(text(),'%s')]/following::text()" % label
         ).extract_first()
         return rem_esc(found) or ''

     def capture(pattern, text):
         # Stripped first group of ``pattern`` in ``text``, or '' if absent.
         match = re.search(pattern, text)
         return match.group(1).strip() if match else ''

     def upper_without_trailing_amp(value):
         # Upper-case the value, dropping one trailing '&' when present.
         if value.endswith('&'):
             return value.upper().strip()[:-1]
         return value.upper()

     location_address_string = text_after('Location')
     plat_lot = rem_esc(''.join(main_res.xpath('//em[contains(text(),"Plat")]/following::text()').extract()[:2])) or ''
     owner_name = text_after('Owner Name')
     corp_owner = text_after('Corp Owner')
     designer = text_after('Designer')
     total = text_after('Total')

     plat = lot = sublot = ''
     if 'Plat' in plat_lot:
         has_lot = 'Lot' in plat_lot
         has_sublot = 'Sublot' in plat_lot
         if has_lot and has_sublot:
             plat = capture(r'Plat(.*)Lot', plat_lot)
             lot = capture(r'Lot(.*)Sublot', plat_lot)
             sublot = capture(r'Sublot(.*)', plat_lot)
         elif has_lot:
             plat = capture(r'Plat(.*)Lot', plat_lot)
             lot = capture(r'Lot(.*)', plat_lot)
         elif has_sublot:
             plat = capture(r'Plat(.*)Sublot', plat_lot)
             sublot = capture(r'Sublot(.*)', plat_lot)
         else:
             plat = capture(r'Plat(.*)', plat_lot)

     il = ItemLoader(item=RiSepticSystemLicensesSpiderItem(),response=response)
     il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     il.add_value('sourceName', 'RI_Septic_System_Licenses')
     il.add_value('url', 'https://www.ri.gov/DEM/isdssearch/')
     il.add_value('permit_lic_no', permit_lic_no)
     il.add_value('city/town', response.meta['city'])
     il.add_value('location_address_string', location_address_string.strip()+", RI")
     il.add_value('plat', upper_without_trailing_amp(plat))
     il.add_value('lot', upper_without_trailing_amp(lot))
     il.add_value('sublot', upper_without_trailing_amp(sublot))

     company_name = next(
         (name for name in (corp_owner, owner_name, designer) if name.strip()),
         '')
     com_name = self._getDBA(company_name)
     designer_dba = self._getDBA(designer)

     permit_lic_desc = 'Septic System Licenses'
     if com_name[0]:
         permit_lic_desc += " For " + com_name[0]

     il.add_value('company_name', com_name[0] if (com_name[0] or '').strip() else designer_dba[0])
     il.add_value('dba_name', com_name[1] if com_name[1] else designer_dba[1])
     il.add_value('person_name', designer_dba[0])
     il.add_value('total flow', '' if 'Not available' in total else total)
     il.add_value('permit_lic_desc', permit_lic_desc)
     il.add_value('permit_type', 'utility_license')
     yield il.load_item()
 def parse_main_page(self, response):
     """Yield one violation item for every record in the lookup response.

     The response body is parsed as JSON.  Its ``d`` member is a string; the
     bracketed list inside it is cut out, split into individual record
     objects, and each record is decoded and loaded into an
     ``AlMedicalLicenseViolationsSpiderItem``.

     :param response: response whose body carries the JSON payload.
     :returns: generator of loaded items, one per record.
     """
     payload = json.loads(response.body_as_unicode())
     # Tag the boundary between adjacent records with '~~' so the list body
     # can be split into records without splitting on commas inside them.
     records_blob = payload['d'].replace('},{', '}~~{').split('[')[1].split(
         ']')[0]
     for raw_record in records_blob.split('~~'):
         # Backslashes are removed and this one double-quoted phrase is
         # re-quoted before decoding -- presumably because both would
         # otherwise break json.loads; confirm against a live payload.
         json_acceptable_string = raw_record.replace("\\", "").replace(
             '"administrative medicine"', "'administrative medicine'")
         d = json.loads(json_acceptable_string)
         person_name = d['FullName']
         permit_subtype = d['LicenseType']
         permit_lic_no = d['License_Number']
         if d['Address1'] and d['City'] and d['Zip']:
             location_address_string = d['Address1'] + ', ' + d[
                 'City'] + ' ' + d['Zip']
         else:
             # Reset for every record: without this an incomplete address
             # raised UnboundLocalError on the first record and silently
             # reused the previous record's address on later ones.
             location_address_string = ''
         violation_description = d['Publicfile']
         permit_lic_desc = 'Medical License for ' + str(person_name)
         violation_type = 'professional_violation'
         vio = d['Action_Date']
         if '-' in vio:
             violation_date = ''
         else:
             # The number between the parentheses is divided by 1000 and
             # handed to time.gmtime, i.e. it is treated as epoch
             # milliseconds.
             epoch_millis = int(re.split(r'\(|\)', vio)[1])
             violation_date = time.strftime('%m/%d/%Y',
                                            time.gmtime(epoch_millis / 1000.))
         violation_subtype = d['ActionTaken']
         il = ItemLoader(item=AlMedicalLicenseViolationsSpiderItem(),
                         response=response)
         il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
         il.add_value('sourceName', 'AL_Medical_License_Violations')
         il.add_value(
             'url',
             'https://abme.igovsolution.com/online/Lookups/Publiclogfile.aspx'
         )
         # Split the name once; index 0 feeds person_name, index 1 dba_name.
         name_parts = self._getDBA(person_name)
         il.add_value('person_name', name_parts[0])
         il.add_value('dba_name', name_parts[1])
         il.add_value('permit_subtype', permit_subtype)
         il.add_value('permit_lic_no', permit_lic_no)
         il.add_value('location_address_string', location_address_string)
         il.add_value('violation_description', violation_description)
         il.add_value('permit_lic_desc', permit_lic_desc)
         il.add_value('violation_type', violation_type)
         il.add_value('violation_date', violation_date)
         il.add_value('violation_subtype', violation_subtype)
         il.add_value('permit_type', 'medical_license')
         yield il.load_item()
 def save_to_csv(self, response, **meta):
     """Load the scraped Champaign permit values into an item and return it.

     :param response: response handed to the ``ItemLoader``.
     :param meta: scraped values keyed by the names read below; a missing
         key raises ``KeyError``.
     :returns: the result of ``ItemLoader.load_item()``.
     """
     loader = ItemLoader(item=IlChampaignBuildingPermitsSpiderItem(),
                         response=response)
     loader.add_value('ingestion_timestamp',
                      Utils.getingestion_timestamp())
     loader.add_value(
         'url',
         'http://etrakit.ci.champaign.il.us/etrakit3/Search/permit.aspx')
     loader.add_value('sourceName', 'IL_Champaign_Building_Permits')

     # (item field, meta key) pairs, loaded in exactly this order.
     field_sources = (
         ('finaled date', 'finaled_date'),
         ('inspection_date', 'inspection_date'),
         ('contractor_dba', 'contractor_dba'),
         ('mixed_contractor_name', 'mixed_contractor_name'),
         ('dba_name', 'dba_name'),
         ('apn', 'apn'),
         ('permit_lic_fee', 'permit_lic_fee'),
         ('location_address_string', 'location_address_string'),
         ('person_address_string', 'person_address_string'),
         ('subtype', 'subtype'),
         ('permit_subtype', 'permit_subtype'),
         ('inspection_subtype', 'inspection_subtype'),
         ('mixed_subtype', 'mixed_subtype'),
         ('contractor_address_string', 'contractor_address_string'),
         ('permit_lic_status', 'permit_lic_status'),
         ('permit_lic_exp_date', 'permit_lic_exp_date'),
         ('permit_lic_no', 'permit_lic_no'),
         ('notes', 'notes'),
         ('property type', 'property_type'),
         ('mixed_name', 'mixed_name'),
         ('inspection_pass_fail', 'inspection_pass_fail'),
         ('approved date', 'approved_date'),
         ('permit_lic_eff_date', 'permit_lic_eff_date'),
         ('permit_applied_date', 'permit_applied_date'),
         ('scheduled date', 'scheduled_date'),
     )
     for field_name, meta_key in field_sources:
         loader.add_value(field_name, meta[meta_key])

     # Description: the scraped one if longer than two characters, else the
     # permit subtype under the same rule, else a fixed label.
     description = meta['permit_lic_desc']
     if not (description and len(description) > 2):
         description = meta['permit_subtype']
         if not (description and len(description) > 2):
             description = 'Building Permit'
     loader.add_value('permit_lic_desc', description)

     loader.add_value('inspection_type', meta['inspection_type'])
     loader.add_value('permit_type', 'building_permit')
     return loader.load_item()
Ejemplo n.º 23
0
 def save_to_csv(self, response, **meta_data):
     """Populate an ``ItemLoader`` with the Clay County permit values.

     :param response: response handed to the ``ItemLoader``.
     :param meta_data: scraped values keyed by the names read below; a
         missing key raises ``KeyError``.
     :returns: the populated ``ItemLoader`` itself (``load_item`` is not
         called here).
     """
     loader = ItemLoader(item=FlClayBuildingPermitsSpiderItem(),
                         response=response)
     loader.default_input_processor = MapCompose(
         lambda value: value.strip(), remove_tags, replace_escape_chars)

     loader.add_value('ingestion_timestamp',
                      Utils.getingestion_timestamp())
     loader.add_value('permit_lic_no', str(meta_data['permit_lic_no']))

     # Fields copied straight from meta_data under the same name.
     for key in ('permit_subtype', 'permit_lic_desc',
                 'location_address_string', 'permit_lic_eff_date', 'notes',
                 'mixed_name', 'mixed_subtype', 'person_address_string',
                 'mixed_contractor_name', 'contractor_lic_no',
                 'contractor_lic_type', 'permit_lic_value'):
         loader.add_value(key, meta_data[key])

     # The literal string 'None' is stored as an empty value.
     for key in ('number_of_stories', 'year_built'):
         raw = meta_data[key]
         loader.add_value(key, '' if raw == 'None' else raw)

     for key in ('inspection_id', 'inspection_date', 'inspection_subtype',
                 'inspection_pass_fail', 'inspector_comments',
                 'inspection_type'):
         loader.add_value(key, meta_data[key])

     loader.add_value('permit_type', "building_permit")
     loader.add_value(
         'url',
         "http://www.claycountygov.com/about-us/local-government/public-records-search/permits"
     )
     loader.add_value('sourceName', 'FL_Clay_Building_Permits')
     return loader
Ejemplo n.º 24
0
    def save_csv(self, response, data_dic):
        """Populate an ``ItemLoader`` with fixed VA SOS values plus ``data_dic``.

        :param response: accepted for signature compatibility; not used here.
        :param data_dic: mapping of item field name to value; every entry is
            added to the loader as-is.
        :returns: the populated ``ItemLoader`` (``load_item`` is not called).
        """
        loader = ItemLoader(item=VaSosSpiderItem())
        constant_fields = (
            ('ingestion_timestamp', Utils.getingestion_timestamp()),
            ('sourceName', 'VA_SOS'),
            ('url', 'http://www.scc.virginia.gov/clk/dwnld.aspx'),
            ('permit_type', 'business_license'),
        )
        for field_name, field_value in constant_fields:
            loader.add_value(field_name, field_value)
        for field_name, field_value in data_dic.items():
            loader.add_value(field_name, field_value)
        return loader

#   def parse_row(self, response, row):
#       il = ItemLoader(item=VaSosSpiderItem())
#       il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
# #il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
#       il.add_value('sourceName', 'va_sos')
#       il.add_value('url', 'http://www.scc.virginia.gov/clk/dwnld.aspx')
#       il.add_value('type', row['Type'])
#       il.add_value('entity_id', row['EntityID'])
#       il.add_value('company_name', row['Name'])
#       il.add_value('dba_name', row[''])
#       il.add_value('status', row['Status'])
#       il.add_value('statusdate', row['StatusDate'])
#       il.add_value('duration', row['Duration'])
#       il.add_value('creation_date', row['IncorpDate'])
#       il.add_value('incorpstate', row['IncorpState'])
#       il.add_value('industrycode', row['IndustryCode'])
#       il.add_value('location_address_string', row['Street1+street2+city+state+zip'])
#       il.add_value('prinoffeffdate', row['PrinOffEffDate'])
#       il.add_value('mixed_name', row['RA-Name'])
#       il.add_value('mixed_subtype', row[''])
#       il.add_value('person_address_string', row['RA-Street1+street2+city+state+zip'])
#       il.add_value('ra-effdate', row['RA-EffDate'])
#       il.add_value('ra-status', row['RA-Status'])
#       il.add_value('ra-loc', row['RA-Loc'])
#       il.add_value('stockind', row['StockInd'])
#       il.add_value('totalshares', row['TotalShares'])
#       il.add_value('mergerind', row['MergerInd'])
#       il.add_value('assessind', row['AssessInd'])
#       il.add_value('stock', row['Stock'])
#       il.add_value('person_name', row['Officer Name'])
#       il.add_value('person_subtype', row['Officer Title'])
#       il.add_value('permit_type', row[''])
#       return il.load_item()
Ejemplo n.º 25
0
 def save_to_csv(self,response,**det_dic):
     """Populate an ``ItemLoader`` with the Alabama forester licence values.

     :param response: response handed to the ``ItemLoader``.
     :param det_dic: scraped values keyed by the names read below; a missing
         key raises ``KeyError``.
     :returns: the populated ``ItemLoader`` (``load_item`` is not called).
     """
     loader = ItemLoader(item=AlForesterLicensesSpiderItem(),
                         response=response)
     loader.default_input_processor = MapCompose(
         lambda value: value.strip(), remove_tags, replace_escape_chars)

     loader.add_value('ingestion_timestamp',
                      Utils.getingestion_timestamp())
     loader.add_value('sourceName', 'AL_Forester_Licenses')
     loader.add_value(
         'url', 'http://asbrf.alabama.gov/vs2k5/rosterofforesters.aspx')
     loader.add_value('permit_type', 'forester_license')

     # These two values are coerced to str before loading.
     loader.add_value('location_address_string',
                      str(det_dic['person_addrs']))
     loader.add_value('county', str(det_dic['person_country']))

     # (item field, det_dic key) pairs copied unchanged, in this order.
     direct_fields = (
         ('company_email', 'person_mail_id'),
         ('person_subtype', 'person_subtype'),
         ('permit_lic_no', 'person_lic_num'),
         ('person_name', 'user_name'),
         ('permit_lic_desc', 'permit_lic_desc'),
         ('dba_name', 'dba_name'),
         ('company_name', 'comny_name'),
         ('company_phone', 'person_phone_num'),
     )
     for field_name, source_key in direct_fields:
         loader.add_value(field_name, det_dic[source_key])
     return loader
Ejemplo n.º 26
0
 def __init__(self,
              settings,
              file_name,
              delimiter,
              fields_to_export,
              null_header,
              customHeader=False,
              topHeader=None):
     """Store the export configuration and reset the chunking state.

     :param settings: object exposing ``get``; ``JOB_DIR_PAUSE_RESUME`` is
         read from it here.
     :param file_name: stored as ``self.file_name``.
     :param delimiter: stored as ``self.delimiter``.
     :param fields_to_export: stored as ``self.fields_to_export``.
     :param null_header: stored as ``self.null_header``.
     :param customHeader: stored as ``self.customHeader``.
     :param topHeader: stored as ``self.topHeader``.
     """
     # Export configuration, kept exactly as passed in.
     self.settings = settings
     self.file_name = file_name
     self.delimiter = delimiter
     self.fields_to_export = fields_to_export
     self.null_header = null_header
     self.customHeader = customHeader
     self.topHeader = topHeader

     # Chunking / resume state.
     self.chunk_folder = "chunk_{}".format(Utils.getingestion_timestamp())
     self.job_dir = settings.get('JOB_DIR_PAUSE_RESUME')
     self.items = []
     self.chunk_number = 0
     self.appendMode = False
 def save_to_csv(self, response, **meta):
     """Load the scraped Bellingham permit values into an item and return it.

     :param response: accepted for signature compatibility; the loader is
         built without it.
     :param meta: scraped values keyed by the names read below; a missing
         key raises ``KeyError``.
     :returns: the result of ``ItemLoader.load_item()``.
     """
     il = ItemLoader(item=WaWhatcomBellinghamBuildingPermitsSpiderItem())
     il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     il.add_value('sourceName', 'WA_Whatcom_Bellingham_Building_Permits')
     il.add_value('url', 'https://www.cob.org/epermits/Search/permit.aspx')
     il.add_value('permit_lic_no', meta['record_number'])
     il.add_value('permit_subtype', meta['permit_lic_type'])
     il.add_value('subtype', meta['permit_subtype'])
     il.add_value('property type', meta["property_type"])
     # Fall back to a fixed label when no description was scraped (replaces
     # an if/else whose first branch assigned the value to itself).
     il.add_value('permit_lic_desc',
                  meta["permit_lic_desc"] or 'Building Permit')
     il.add_value('Status', meta["permit_lic_status"])
     il.add_value('permit_applied_date', meta["permit_applied_date"])
     il.add_value('approved date', meta["approved_date"])
     il.add_value('permit_lic_eff_date', meta["permit_lic_eff_date"])
     il.add_value('finaled date', meta["finaled_date"])
     il.add_value('permit_lic_exp_date', meta["permit_lic_exp_date"])
     il.add_value('location_address_string', meta['address'])
     il.add_value('apn/pin', meta["apn_pin"])
     il.add_value('parcel #', meta['parcel_number'])
     il.add_value('permit_lic_fee', meta['permit_lic_fee'])
     # Split each name once instead of once per output field; index 0 is
     # the name part and index 1 the DBA part.
     owner_parts = self._getDBA(meta['mixed_name'])
     il.add_value('mixed_name', owner_parts[0])
     il.add_value('dba_name', owner_parts[1])
     il.add_value('mixed_subtype', meta["mixed_subtype"])
     il.add_value('person_address_string', meta["person_address_string"])
     contractor_parts = self._getDBA(meta['mixed_contractor_name'])
     il.add_value('mixed_contractor_name', contractor_parts[0])
     il.add_value('contractor_dba', contractor_parts[1])
     il.add_value('contractor_address_string',
                  meta["contractor_address_string"])
     il.add_value('inspection_subtype', meta["inspection_subtype"])
     il.add_value('inspection_date', meta["completed_date"])
     il.add_value('inspection_pass_fail', meta["inspection_pass_fail"])
     il.add_value('inspection_type', meta["inspection_type"])
     il.add_value('permit_type', 'building_permit')
     return il.load_item()
Ejemplo n.º 28
0
 def __createChunkFile(self, spider):
     """Open the output file for the current chunk and start its exporter.

     The file name is ``self.file_name`` with ``_file_<chunk>`` (non-zero
     chunks only) and ``_<start>_<end>`` (when the spider has a truthy
     ``start``) inserted before the extension, or
     ``<spider.name>_file_<chunk>.csv`` when no file name was configured.
     The file is placed under ``STORAGE_DIR/JIRA_ID`` from the settings,
     with an extra ``resume_<timestamp>`` directory in append mode.

     :param spider: spider whose ``name`` and optional ``start``/``end``
         attributes contribute to the file name.
     """
     def alnum_only(text):
         return ''.join(ch for ch in text if ch.isalnum())

     chunk_no = self.chunk_number
     if self.file_name:
         stem, extension = os.path.splitext(self.file_name)
         name_pieces = [stem]
         if chunk_no != 0:
             name_pieces.append("_file_{}".format(str(chunk_no)))
         if hasattr(spider, 'start') and spider.start:
             name_pieces.append("_{}_{}".format(alnum_only(spider.start),
                                                alnum_only(spider.end)))
         name_pieces.append(extension)
         target_name = "".join(name_pieces)
     else:
         target_name = '{}_file_{}.csv'.format(spider.name, str(chunk_no))

     path_parts = [
         self.settings.get('STORAGE_DIR'),
         self.settings.get('JIRA_ID'),
     ]
     if self.appendMode:
         path_parts.append('resume_{}'.format(
             Utils.getingestion_timestamp()))
     path_parts.append(target_name)
     outpath = os.path.join(*path_parts)

     self.createFolder(outpath)
     # Left open on purpose: the exporter keeps writing to this handle.
     self.file = open(outpath, 'w+b')

     exporter_options = {'delimiter': self.delimiter}
     if self.fields_to_export:
         exporter_options['fields_to_export'] = self.fields_to_export
     if self.null_header:
         exporter_options['null_header'] = self.null_header
     self.exporter = CustomCsvItemExporter(self.file, **exporter_options)
     self.exporter.start_exporting()

     if self.customHeader:
         # Extra header row: one topHeader entry per exported field.
         header_row = [
             self.topHeader.get(column) for column in self.fields_to_export
         ]
         self.exporter.csv_writer.writerow(header_row)
Ejemplo n.º 29
0
 def parse_row(self, response, row):
     """Convert one CSV row into a loaded ``NyAlbanyStateItems`` item.

     Latitude and longitude are taken from the last line of the
     ``Location`` column after stripping surrounding parentheses and
     splitting on the comma.  When that column is empty, or its last line
     does not split into exactly two parts, both coordinates are left
     empty instead of raising and losing the whole row.

     :param response: response the row came from; its URL is logged and
         stored on the item.
     :param row: mapping of CSV column name to value.
     :returns: the result of ``ItemLoader.load_item()``.
     """
     self.logger.info("started to extracting CSV data from {}".format(
         response.url))
     il = ItemLoader(item=NyAlbanyStateItems())
     lat, lng = '', ''
     location_lines = (row['Location'] or '').splitlines()
     if location_lines:
         coordinates = location_lines[-1].strip('()').split(',')
         # Only unpack a clean "lat,lng" pair; anything else previously
         # raised ValueError here.
         if len(coordinates) == 2:
             lat, lng = map(str, coordinates)
     il.add_value('permit_lic_no', row['Permit Number'])
     il.add_value('permit_lic_eff_date', row['Date'])
     il.add_value('application_number', row['Application Number'])
     il.add_value('location_address_string', row['Address'])
     il.add_value('person_name', row['Owner'])
     il.add_value('person_subtype', "Owner")
     il.add_value('contractor_name', row['Contractor'])
     il.add_value('permit_lic_value', row['Estimated Cost'])
     il.add_value('permit_lic_fee', row['Fee'])
     il.add_value('permit_lic_desc', row['Description of Work'])
     il.add_value('longitude', lng)
     il.add_value('latitude', lat)
     il.add_value('permit_type', "building_permits")
     il.add_value('url', response.url)
     il.add_value('sourceName', "NY_Albany_Building_Permits")
     il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
     return il.load_item()
 def save_to_csv(self, response, **data_pass):
     """Load the scraped Douglas County permit values into an item.

     :param response: response handed to the ``ItemLoader``.
     :param data_pass: scraped values keyed by item field name; a missing
         key raises ``KeyError``.
     :returns: the result of ``ItemLoader.load_item()``.
     """
     # An empty or missing description is replaced by a fixed label.
     if data_pass['permit_lic_desc'] in ('', None):
         data_pass['permit_lic_desc'] = 'Building Permit'

     loader = ItemLoader(item=WiDouglasBuildingPermitsSpiderItem(),
                         response=response)
     loader.default_input_processor = MapCompose(
         lambda value: value.strip(), remove_tags, replace_escape_chars)

     loader.add_value('ingestion_timestamp',
                      Utils.getingestion_timestamp())
     loader.add_value(
         'url', 'https://gcs.douglascountywi.org/gcswebportal/search.aspx')
     loader.add_value('sourceName', 'WI_Douglas_Building_Permits')

     # Every remaining field is copied from data_pass under the same name,
     # in this order.
     passthrough_fields = (
         'inspector_comments',
         'mixed_name',
         'permit_subtype',
         'permit_lic_desc',
         'mixed_subtype',
         'permit_type',
         'permit_lic_fee',
         'inspection_pass_fail',
         'permit_lic_status',
         'location_address_string',
         'dba_name',
         'inspection_subtype',
         'permit_lic_eff_date',
         'permit_lic_no',
         'prop type',
         'inspection_date',
         'inspection_type',
         'person',
         'municipality',
         'issue #',
         'parcel number',
         'mail_address_string',
     )
     for field_name in passthrough_fields:
         loader.add_value(field_name, data_pass[field_name])
     return loader.load_item()