def parse(self, response):
    """Scrape ward commissioners or the tax collector, depending on the URL.

    URLs containing "officials" yield one COMMISSIONER ``Official`` per
    ``div`` whose underlined heading contains "Ward ".  URLs containing
    "tax" yield a single TAX COLLECTOR ``Official``.  Any other URL yields
    nothing.

    Positional indexes into the ``getAllText`` results are hard-coded;
    assumes ``getAllText`` (defined elsewhere) returns the text fragments
    of a selection as a list -- TODO confirm.
    """
    if "officials" in response.url:
        for ward in response.xpath(
                "//div[contains(h4/strong/u/text(),'Ward ')]"):
            alltext = getAllText(ward)
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="COMMISSIONER",
                name=alltext[2],
                district=alltext[0].upper(),
                termEnd=alltext[1],
                address=alltext[3] + ", " + alltext[4],
                phone=alltext[5],
                url=response.url)
    elif "tax" in response.url:
        # The heading text before the first "-" is used as the name.
        name = response.xpath(
            "//h3[contains(u/text(),'-Tax Collector')]/u/text()"
        ).get().split("-")[0]
        address = getAllText(
            response.xpath("//div[contains(strong/text(),'Tax Office')]"))[1]
        # The leftover debug print of the phone fragments was removed.
        phone = getAllText(
            response.xpath("//div[strong/text()='Phone:']"))[1]
        email = getAllText(
            response.xpath("//div[contains(strong/text(),'Email:')]"))[1]
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="TAX COLLECTOR",
            name=name,
            address=address,
            phone=phone,
            email=email,
            url=response.url)
def parse(self, response):
    """Yield supervisors, the real estate tax collector and three auditors.

    All three groups are read from the same response.  Positional indexes
    into the extracted text are hard-coded.
    """
    table_text = getAllText(response.xpath("//table[@id='Table1']"))
    # Entry idx supplies the name and entry idx + 3 the phone number.
    for idx in range(len(table_text) // 2):
        supervisor = table_text[idx].split(",")[0]
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="SUPERVISOR",
            name=supervisor,
            phone=table_text[idx + 3],
            url=response.url)

    for block in response.xpath(
            "//b[contains(text(),'Real Estate Tax Collector')]/.."):
        taxman = block.xpath("./u/strong/text()").get().strip()
        stripped = [text.strip() for text in block.xpath("./text()").getall()]
        non_empty = [text for text in stripped if len(text) != 0]
        # The third non-empty text node is split into words; address and
        # phone are picked out by word position.
        words = non_empty[2].split(" ")
        street = " ".join(words[10:19])[:-1]
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="TAX COLLECTOR",
            name=taxman,
            address=street,
            phone=words[-7],
            url=response.url)

    auditor_text = getAllText(
        response.xpath("//font[contains(b/text(),'Board of Auditors')]"))
    # The second name has its last character removed.
    auditor_names = [auditor_text[3], auditor_text[5][:-1], auditor_text[7]]
    for auditor in auditor_names:
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="AUDITOR",
            name=auditor,
            url=response.url)
def parse(self, response):
    """Yield one SUPERVISOR ``Official`` per entry in the "Members" list.

    The "Board of Supervisors" sidebar supplies the shared address and
    phone (its first ``li``) and a surname -> link mapping (remaining
    ``li`` items paired with ``getAllLinks`` results by position).  That
    mapping provides each member's ``email`` value.
    """
    links = getAllLinks(
        response.xpath("//h4[contains(text(),'Board of Supervisors')]/../.."))
    rows = [
        getAllText(item) for item in response.xpath(
            "//h4[contains(text(),'Board of Supervisors')]/../../li")
    ]
    # Key: last word of the row's first text fragment.
    link_by_surname = {
        row[0].split(" ")[-1]: links[pos]
        for pos, row in enumerate(rows[1:])
    }

    office_row = rows[0]
    addr = (office_row[2] + ", " + "".join(office_row[3:5]) + " " +
            office_row[5] + " " + office_row[6])
    phone = office_row[7]

    for item in response.xpath(
            "//h2[contains(text(),'Members')]/../../../../../..//li"):
        bits = getAllText(item)
        name = bits[0].split(",")[0]
        termEnd = bits[1]
        email = link_by_surname[name.split(" ")[-1]]
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="SUPERVISOR",
            name=name,
            termEnd=termEnd,
            email=email,
            address=addr,
            phone=phone,
            url=response.url)
def parse(self, response):
    """Scrape elected officials or the tax collector, depending on the URL.

    URLs containing "elected" yield a MAYOR or MEMBER OF COUNCIL
    ``Official`` per ``et_pb_blurb_container`` div.  URLs containing
    "taxes" yield a TAX COLLECTOR ``Official`` from the first
    ``et_pb_text_inner`` div only.
    """
    if "elected" in response.url:
        for blurb in response.xpath("//div[@class='et_pb_blurb_container']"):
            allText = getAllText(blurb)
            alts = blurb.xpath(".//img/@alt").getall()
            # The email comes from the first image's alt text; "mifflon" is
            # corrected to "mifflin" (presumably a typo on the site).
            email = alts[0].replace("mifflon", "mifflin") if alts else None
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="MAYOR" if allText[0] == "Mayor" else "MEMBER OF COUNCIL",
                name=allText[-1],
                email=email,
                url=response.url)
    elif "taxes" in response.url:
        for block in response.xpath("//div[@class='et_pb_text_inner']")[0:1]:
            # The leftover debug print of the extracted text was removed.
            allText = getAllText(block)
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="TAX COLLECTOR",
                phone=allText[3],
                name=allText[0].split(',')[0],
                address=allText[1] + " " + allText[2],
                url=response.url)
def parse(self, response):
    """Yield the mayor, council members and tax collector from one page.

    For each ``div.twocolbig`` the text is extracted and every ``h4``
    heading is located in it; the entries that follow the heading supply
    the officials' details.
    """
    for column in response.xpath("//div[@class='twocolbig']"):
        alltext = getAllText(column)
        # The headings are queried from the whole response on purpose: per
        # the original author, a query relative to the current div does not
        # work here.
        headers = getAllText(response.xpath("//div[@class='twocolbig']/h4"))
        for heading in headers:
            # Position of the first fragment equal to the heading.
            loc = [
                pos for pos, text in enumerate(alltext) if text == heading
            ][0]
            lowered = heading.lower()
            if "mayor" in lowered:
                yield Official(
                    muniName=self.muniName,
                    muniType=self.muniType,
                    office="MAYOR",
                    name=alltext[loc + 1],
                    url=response.url)
            if heading == "MEMBERS OF COUNCIL":
                for member in alltext[loc + 1:loc + 8]:
                    # Anything after a comma (e.g. a president or VP title)
                    # is cut off the name.
                    yield Official(
                        muniName=self.muniName,
                        muniType=self.muniType,
                        office="MEMBER OF COUNCIL",
                        name=member.split(",")[0],
                        url=response.url)
            if "tax" in lowered:
                yield Official(
                    muniName=self.muniName,
                    muniType=self.muniType,
                    office="TAX COLLECTOR",
                    phone=alltext[loc + 3].split(":")[1],
                    name=alltext[loc + 2],
                    url=response.url)
def parse(self, response):
    """Yield the mayor, council members and tax collector from ``pf-content``.

    The first paragraph supplies the mayor, each table row a council
    member, and the fifth paragraph the tax collector.  Mayor and council
    addresses get a fixed city/state/ZIP suffix appended.
    """
    addressSuffix = ", Pittsburgh, PA 15205"

    for para in response.xpath('//div[@class="pf-content"]/p[1]'):
        mayor = getAllText(para)
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="MAYOR",
            name=mayor[1],
            url=response.url,
            address=mayor[2] + addressSuffix,
            phone=mayor[3])

    for row in response.xpath('//div[@class="pf-content"]/table/tbody/tr'):
        member = getAllText(row)
        # Text before the en dash is the name.
        member_name = member[0].split("–")[0]
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="MEMBER OF COUNCIL",
            name=member_name,
            url=response.url,
            address=member[1] + addressSuffix,
            phone=member[2])

    for para in response.xpath('//div[@class="pf-content"]/p[5]'):
        collector = getAllText(para)
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="TAX COLLECTOR",
            name=collector[1],
            url=response.url,
            email=collector[2],
            phone=collector[3])
def parse(self, response):
    """Scrape the tax collector or the commissioners, depending on the URL.

    URLs containing "tax" yield one TAX COLLECTOR ``Official`` per matching
    container; URLs containing "commissioners" yield one COMMISSIONER
    ``Official`` per block that mentions "Ward ".
    """
    if "tax" in response.url:
        for card in response.xpath("//div[p/text()='Tax Collector']/.."):
            collector = card.xpath("div[2]/h1/span/span/span/text()").get()
            number = card.xpath("div[4]/p/text()").get()
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="TAX COLLECTOR",
                name=collector,
                phone=number,
                url=response.url)
    elif "commissioners" in response.url:
        names = [
            getAllText(heading)[0].split(" -")[0]
            for heading in response.xpath("//h1")
        ]
        data = [
            getAllText(card)
            for card in response.xpath("//p[contains(text(),'Ward ')]/../..")
        ]
        for entry in data:
            # Entries with fewer than four fragments get the third h1 name
            # put in front.
            fields = entry if len(entry) >= 4 else [names[2]] + entry
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="COMMISSIONER",
                name=fields[0].split(" -")[0],
                email=fields[1],
                phone=fields[2],
                district=fields[3].upper(),
                url=response.url)
def parse(self, response):
    """Yield every official listed under the "Elected Officials" heading.

    The extracted text of each section is cut into one group per person: a
    new group starts whenever a fragment is one of the section's ``h3``
    texts.  The second fragment of a group decides the office.  The last
    fragment of the section is used as the phone number, but only for
    council members.
    """
    for section in response.xpath(
            "//h1[contains(text(),'Elected Officials')]/../../div"):
        names = getTextOfType(section, "h3")
        allText = getAllText(section)
        phone = allText[-1]

        # The unused `allPeeps` local and the debug print were removed.
        # The accumulator is no longer called `split`, which would shadow
        # an imported `re.split`.
        groups = []
        current = []
        for text in allText:
            if text in names and current:
                groups.append(current)
                current = []
            current.append(text)
        groups.append(current)

        for person in groups:
            if "Tax" in person[1]:
                office = "TAX COLLECTOR"
            elif person[1] == "Mayor":
                office = "MAYOR"
            else:
                office = "MEMBER OF COUNCIL"
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office=office,
                name=person[0],
                phone=phone if office == "MEMBER OF COUNCIL" else None,
                url=response.url)
def parse(self, response):
    """Yield seven commissioners and the tax collector.

    Commissioner data comes from the editor ``span``: its text is joined,
    split on runs of two or more whitespace characters, the first four
    pieces and every "Vice President" piece are dropped, and the rest is
    read in strides of six.  The tax collector comes from the first
    ``tr.textContent`` row.
    """
    for span in response.xpath(
            '//div[@id="divEditor909176e4-b373-45cd-af7f-deacc7efb43e"]/span'):
        joined = " ".join(getAllText(span))
        pieces = split(r"\s{2,}", joined)[4:]
        parts = [piece for piece in pieces if piece != "Vice President"]
        for base in range(0, 7 * 6, 6):
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="COMMISSIONER",
                name=parts[base].split(",")[0],
                district=parts[base + 1].upper(),
                termEnd=parts[base + 3],
                phone=parts[base + 5],
                address=parts[base + 2] + ", " + parts[base + 4],
                url=response.url)

    for row in response.xpath('//tr[@class="textContent"]')[0:1]:
        collector = row.xpath("td[1]/text()").get()
        term = row.xpath("td[4]/text()").get()
        number = row.xpath("td[3]/div[3]/text()").get()
        location = (row.xpath("td[3]/text()").get() + ", " +
                    row.xpath("td[3]/div[1]/text()").get())
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="TAX COLLECTOR",
            name=collector,
            termEnd=term,
            phone=number,
            address=location,
            url=response.url)
def parse(self, response):
    """Scrape council members or the mayor, chosen by the URL's penultimate
    character.

    'l' -> nine hard-coded groups of fragments from article ``post-346``,
    each passed to ``self._member`` and the result yielded.
    'e' -> a MAYOR ``Official`` from article ``post-343``.
    """
    marker = response.url[-2]
    if marker == 'l':
        for article in response.xpath('//article[@id="post-346"]'):
            bits = getAllText(article)
            # Several groups reuse an earlier fragment (bits[15], bits[24],
            # bits[31]) as their first element.
            peeps = [
                bits[8:10],
                [bits[10], bits[14]],
                bits[15:17],
                [bits[15], bits[20], bits[23]],
                bits[24:26],
                [bits[24]] + bits[27:30],
                bits[31:34],
                [bits[31], bits[34]],
                [bits[31]] + bits[-2:],
            ]
            for group in peeps:
                yield self._member(group, response)
    elif marker == 'e':
        for article in response.xpath('//article[@id="post-343"]'):
            line = article.xpath("p[16]/text()").get()
            # The last space-separated token of the stripped line is the
            # email; the unmodified line is stored as the phone.
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="MAYOR",
                email=line.strip().split(" ")[-1],
                name=article.xpath("h3[2]/text()").get(),
                phone=line,
                url=response.url)
def parse(self, response):
    """Scrape supervisors or the tax collector, chosen by the URL's last
    character.

    'p' -> five SUPERVISOR ``Official`` items built from the first two
    ``ul.listnone`` lists (first one read as names, second as term ends)
    and the "…ail:" list items (emails).
    '/' -> a TAX COLLECTOR ``Official`` per ``#mainContent`` div, with the
    address taken from the last footer paragraph.
    """
    last_char = response.url[-1]
    if last_char == 'p':
        # The first fragment of each list is skipped.
        namesdates = [
            getAllText(listing)[1:]
            for listing in response.xpath("//ul[@class='listnone']")[0:2]
        ]
        emails = [
            item.xpath('text()').get().strip()
            for item in response.xpath("//li[contains(strong/text(),'ail:')]")
        ]
        for idx in range(5):
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="SUPERVISOR",
                name=namesdates[0][idx],
                email=emails[idx],
                termEnd=namesdates[1][idx],
                url=response.url)
    elif last_char == '/':
        for content in response.xpath('//div[@id="mainContent"]'):
            # Words 4 and 5 of the first paragraph form the name.
            intro_words = content.xpath('p[1]/text()').get().strip().split(" ")
            collector = " ".join(intro_words[3:5])
            number = content.xpath('text()').get()
            footer = response.xpath(
                "//div[@id='footer']/div/p/text()").getall()[-1]
            location = ", ".join(footer.split(' · ')[1:3])
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="TAX COLLECTOR",
                name=collector,
                phone=number,
                address=location,
                url=response.url)
    # INCOMPLETE
    # Expected offices: 3 auditors, unable to be found on website
def parse(self, response):
    """Scrape supervisors or the tax collector, depending on the URL.

    URLs containing "supervisors" yield one SUPERVISOR ``Official`` per
    list item, with the district obtained from ``self._district(name)``.
    URLs containing "tax" yield a TAX COLLECTOR ``Official`` per
    ``div.entry``.
    """
    if "supervisors" in response.url:
        for item in response.xpath(
                "//div[contains(h2/text(),'Current Board of Supervisors')]/ul/li"):
            # Text before the en dash is the name.
            name = item.xpath("text()").get().split("–")[0]
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="SUPERVISOR",
                name=name,
                district=self._district(name),
                url=response.url)
    elif "tax" in response.url:
        for entry in response.xpath("//div[@class='entry']"):
            lines = getAllText(entry.xpath('p[13]'))
            email = entry.xpath('p[14]/a/@href').get()
            # Rebuild the locality line with "PA" in place of its middle
            # word.
            locality = lines[3].split(" ")
            addr = " ".join([locality[0], "PA", locality[2]])
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="TAX COLLECTOR",
                name=lines[0],
                address=lines[2] + ", " + addr,
                phone=lines[4].replace(".", ""),
                email=email,
                url=response.url)
def parse(self, response):
    """Scrape the mayor and council, or the treasurer and controller.

    URLs containing "elected": the extracted text is grouped so that every
    fragment containing "Mayor" or "City Council" starts a new group; each
    group yields a MAYOR or MEMBER OF COUNCIL ``Official`` (the third
    fragment, when present, is the email).

    URLs containing "city": yields the TREASURER, plus a hard-coded
    CONTROLLER while the page does not mention "Controller".
    """
    if "elected" in response.url:
        bits = getAllText(response.xpath('//td[@id="esbCr2x1"]/..'))
        folks = []
        current = []
        for text in bits:
            if "Mayor" in text or "City Council" in text:
                if current:
                    folks.append(current)
                current = [text]
            else:
                current.append(text)
        folks.append(current)

        for folk in folks:
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="MEMBER OF COUNCIL"
                if folk[0] == "City Council" else "MAYOR",
                name=folk[1],
                email=None if len(folk) < 3 else folk[2],
                url=response.url)
    elif "city" in response.url:
        bits = getAllText(
            response.xpath('//span[contains(text(),"Treasurer")]/../../..'))
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="TREASURER",
            name=bits[1],
            url=response.url)
        if response.xpath('//*[contains(text(),"Controller")]').get() is None:
            # Emailed Duquesne Manager concerning the Controller, who I
            # could not find listed on the website.
            # On 11/15/20 he responded that the Duquesne Controller was
            # Maureen Strahl.
            # While ideally we would scrape this, this will be returned as
            # long as 'Controller' is not on this webpage
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="CONTROLLER",
                name="Maureen Strahl",
                url=None)
def parse(self, response):
    """Scrape the tax collector, or council members plus a mayor follow-up.

    URLs containing "taxcollector" yield a TAX COLLECTOR ``Official`` per
    matching paragraph.  URLs containing "contact" yield one MEMBER OF
    COUNCIL ``Official`` per (label, email) pair, then one
    ``scrapy.Request`` per "Mayor" paragraph that hands the mayor's email
    to ``self.mayorParse`` via ``cb_kwargs``.
    """
    if "taxcollector" in response.url:
        addr = getAllText(
            response.xpath(
                "//p[text()[contains(.,'Tax Office Address')]]"))[-2:]
        # Normalise the second address line: "Pa" -> "PA", and a comma
        # after "McKeesport".
        words = []
        for word in addr[1].split(" "):
            if word == "Pa":
                words.append(word.upper())
            elif word == "McKeesport":
                words.append(word + ",")
            else:
                words.append(word)
        addr[1] = " ".join(words)
        email = response.xpath(
            "//p[text()[contains(.,'E-mail')]]/a/@href").get()
        phone = response.xpath(
            "//p[contains(text(),'Phone')]/text()").get()
        for para in response.xpath('//p[contains(text(), "Tax Collector")]'):
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="TAX COLLECTOR",
                name=para.xpath("./text()").get().split("-")[0],
                address=", ".join(addr),
                email=email,
                phone=phone,
                url=response.url)
    elif "contact" in response.url:
        parts = getAllText(
            response.xpath('//p[contains(text(), "Council")]'))[:-2]
        # Fragments alternate between "<label>:<name>-<suffix>" and email.
        for label, address in zip(parts[0::2], parts[1::2]):
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="MEMBER OF COUNCIL",
                name=label.split(":")[1].split("-")[0],
                email=address,
                url=response.url)
        for para in response.xpath('//p[contains(text(), "Mayor")]'):
            # The part of the href after the first ":" is the address.
            kwargs = {"email": para.xpath("a/@href").get().split(":")[1]}
            yield scrapy.Request(
                url="http://eastmckeesportboro.com/leaders.htm",
                callback=self.mayorParse,
                cb_kwargs=kwargs)
def parse(self, response):
    """Yield one ``Official`` per body row of the table with a "Name" header.

    The second fragment of a row decides the office (TREASURER when it
    contains "Treasurer", otherwise MEMBER OF COUNCIL) and is also passed
    to ``self._district``.  The phone is the third fragment when present.
    """
    for row in response.xpath(
            "//th[contains(text(),'Name')]/../../../tbody/tr"):
        allText = getAllText(row)
        if "Treasurer" in allText[1]:
            office = "TREASURER"
        else:
            office = "MEMBER OF COUNCIL"
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office=office,
            district=self._district(allText[1]),
            name=allText[0],
            phone=allText[2] if len(allText) >= 3 else None,
            url=response.url)
def parse(self, response):
    """Scrape the mayor and council, or the tax collector, by URL.

    URLs containing "contact" yield an ``Official`` for every list item
    whose second fragment mentions "Council", "President" or "Mayor".
    URLs containing "taxes" yield a TAX COLLECTOR ``Official`` per
    "Mercantile Tax Collector" span.
    """
    if "contact" in response.url:
        for item in response.xpath("//form[@id='adminForm']/ul/li"):
            alltext = getAllText(item)
            title = alltext[1]
            if not any(word in title
                       for word in ("Council", "President", "Mayor")):
                continue
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                # Only an exact "Mayor" title maps to MAYOR.
                office="MAYOR" if "Mayor" == title else "MEMBER OF COUNCIL",
                name=alltext[0],
                url=response.url)
    elif "taxes" in response.url:
        for span in response.xpath(
                "//span[contains(strong/text(),'Mercantile Tax Collector')]"):
            alltext = getAllText(span)
            # Comma-separated line: the first piece is the name, the middle
            # pieces the address; the last piece is discarded.
            pieces = alltext[1].split(",")
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="TAX COLLECTOR",
                name=pieces[0],
                address=", ".join([piece.strip() for piece in pieces[1:-1]]),
                phone=alltext[2],
                url=response.url)
def parse(self, response):
    """Scrape council members, the mayor or the tax collector, by URL.

    "Council" -> list items of the 6th and 7th ``div.fr-view``, skipping
    any whose first fragment mentions "Junior Council Person".
    "Mayor"   -> second paragraph of the "Responsibilities" block.
    "Tax"     -> the "Real Estate Tax Collector" ordered list.
    """
    if "Council" in response.url:
        for view in response.xpath("//div[@class='fr-view']")[5:7]:
            for item in view.xpath('.//li'):
                fields = getAllText(item)
                # Four fragments: the first two are glued into one.
                if len(fields) == 4:
                    fields = [fields[0] + fields[1]] + fields[2:]
                if 'Junior Council Person' in fields[0]:
                    continue
                heading = fields[0]
                yield Official(
                    muniName=self.muniName,
                    muniType=self.muniType,
                    office="MEMBER OF COUNCIL",
                    name=heading.split(",")[0],
                    district=heading.split(",")[-1].strip().upper(),
                    termEnd=fields[1],
                    email=fields[2],
                    url=response.url)
    elif "Mayor" in response.url:
        for para in response.xpath(
                "//div[contains(h2/text(),'Responsibilities')]/p[2]"):
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="MAYOR",
                name=para.xpath("text()").get().split(",")[0],
                termEnd=para.xpath("text()[2]").get(),
                email=para.xpath("a/@href").get(),
                url=response.url)
    elif "Tax" in response.url:
        for listing in response.xpath(
                "//ol[contains(li/div/text(),'Real Estate Tax Collector')]"):
            lines = getAllText(listing.xpath("li[2]/div[1]"))[2:]
            # The second line and the space-joined remainder are
            # concatenated with no separator between them.
            address = lines[0] + ", " + lines[1] + " ".join(lines[2:])
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="TAX COLLECTOR",
                name=listing.xpath("li[1]/h4/text()").get(),
                phone=listing.xpath("li[2]/div[3]/text()").get(),
                address=address,
                email=listing.xpath("li[1]/div/a/@href").get(),
                url=response.url)
def parse(self, response):
    """Yield one council member, then follow every link in the caption.

    For each ``div`` whose ``strong`` text contains "Sitting left to
    right", the second text fragment (cut at the first comma, with
    surrounding colons stripped) becomes a MEMBER OF COUNCIL ``Official``.
    Each ``a`` element in that div is then requested with
    ``self.linkParse`` as the callback.
    """
    for caption in response.xpath(
            '//div[contains(strong/text(),"Sitting left to right")]'):
        fragments = getAllText(caption)
        member = fragments[1].split(",")[0].strip(":")
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="MEMBER OF COUNCIL",
            name=member,
            url=response.url)
        for anchor in caption.xpath(".//a"):
            target = anchor.xpath("./@href").get()
            yield scrapy.Request(url=target, callback=self.linkParse)
def parse(self, response):
    """Scrape commissioners or the tax collector, depending on the URL.

    URLs containing 'commissioners' yield one COMMISSIONER ``Official`` per
    complete run of four text fragments (first = name, fourth = email).
    URLs containing 'tax' yield a TAX COLLECTOR ``Official`` per matching
    div.
    """
    if 'commissioners' in response.url:
        for block in response.xpath(
                '//h4[contains(text(),"Township Commissioners")]/../div'):
            bits = getAllText(block)
            # Pair fragment 0 with 3, 4 with 7, and so on; an incomplete
            # trailing run is ignored.
            for who, mail in zip(bits[0::4], bits[3::4]):
                yield Official(
                    muniName=self.muniName,
                    muniType=self.muniType,
                    office="COMMISSIONER",
                    name=who,
                    email=mail,
                    url=response.url)
    elif 'tax' in response.url:
        for block in response.xpath(
                '//h4[contains(text(),"Tax Collector")]/../div'):
            bits = getAllText(block)
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="TAX COLLECTOR",
                name=bits[0],
                phone=bits[2],
                email=bits[3],
                url=response.url)
def parse(self, response):
    """Scrape the mayor or the council, chosen by the URL's last character.

    "7" -> a MAYOR ``Official`` per table row whose cell text is "Mayor".
    "9" -> a MEMBER OF COUNCIL ``Official`` per directory row whose second
    cell mentions "Council".

    In both cases the address comes from the "Physical Address" span(s).
    Names go through ``self._name`` and the mayor's email through
    ``self._email`` (both defined elsewhere).
    """
    page = response.url[-1]
    if page == "7":
        allText = getAllText(
            response.xpath(
                "//span[@class='DirectoryNormalText' and contains(label/text(),'Physical Address')]"
            ))
        address = ", ".join(allText[1:4])
        for row in response.xpath("//tr[td/span/text()='Mayor']"):
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                address=address,
                office="MAYOR",
                email=self._email(
                    row.xpath('td[3]/span/script/text()').get()),
                phone=row.xpath('td[4]/span/text()').get(),
                name=self._name(row.xpath('td[1]/span/a/text()').get()),
                url=response.url)
    elif page == "9":
        allText = getAllText(
            response.xpath(
                "//span[@class='DirectoryNormalText' and contains(label/text(),'Physical Address')]"
            ))
        address = ", ".join(allText[5:8])
        # The shared council phone is the second-to-last fragment.
        phone = allText[-2]
        for row in response.xpath(
                "//table[@id='cityDirectoryDepartmentDetails']//tr[contains(td[2]/span/text(), 'Council')]"
        ):
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                address=address,
                office="MEMBER OF COUNCIL",
                phone=phone,
                name=self._name(row.xpath('td[1]/span/a/text()').get()),
                url=response.url)
def parse(self, response):
    """Yield a MAYOR or MEMBER OF COUNCIL ``Official`` per table body row.

    Only URLs containing "council" are handled; any other URL yields
    nothing.  Row fragments are read as name, title, term end, email.
    """
    if "council" not in response.url:
        return
    for row in response.xpath('//tbody/tr'):
        bits = getAllText(row)
        office = "MAYOR" if bits[1] == "Mayor" else "MEMBER OF COUNCIL"
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office=office,
            name=bits[0],
            email=bits[3],
            termEnd=bits[2],
            url=response.url)
    ##EXPECTED: 1 tax collector
    ## It may be Lorraine Rehtoric given section 8(h)
    ##in the September 14, 2020 agenda of the borough council
def parse(self, response):
    """Yield officials from every table inside the 609-wide cell.

    Each table's text fragments (whitespace runs collapsed to one space)
    are read in strides of four: name, title, phone, and an unused fourth.
    Emails are that table's ``mailto:`` hrefs, matched by position.  In the
    first table only fragments 1-4 are kept.
    """
    for table_no, table in enumerate(
            response.xpath('//td[@width="609"]/table')):
        names = [sub(r"\s+"," ",text) for text in getAllText(table)]
        if table_no == 0:
            names = names[1:5]
        emails = [
            href for href in table.xpath('.//a/@href').getall()
            if "mailto:" in href
        ]
        for x in range(len(names) // 4):
            base = x * 4
            title = names[base + 1]
            # Titles without "Council" are used verbatim (upper-cased) as
            # the office.
            office = "MEMBER OF COUNCIL" if "Council" in title else title.upper()
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office=office,
                name=names[base],
                email=emails[x],
                # The first six characters of a "Ward" title form the
                # district.
                district="AT-LARGE" if "Ward" not in title else title[:6].upper(),
                phone=names[base + 2],
                url=response.url)
def parse(self, response):
    """Scrape the mayor and council, or the tax collector, by URL.

    URLs containing "Officials" yield a MAYOR or MEMBER OF COUNCIL
    ``Official`` per table row.  URLs containing "Taxes" yield a TAX
    COLLECTOR ``Official`` per "Real Estate Tax Collector" row.
    """
    if "Officials" in response.url:
        for row in response.xpath(
                "//span[@id='ContentPage1_ctl04_lblText']/table//tr"):
            # The leftover debug print of the row text was removed.
            allText = getAllText(row)
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="MAYOR" if allText[0] == "Mayor" else "MEMBER OF COUNCIL",
                name=allText[1] + " " + allText[2],
                # email has to be manual because the site has weird
                # server-side protections: first initial + last name.
                email=(allText[1][0] + allText[2] +
                       "@jeffersonhills.net").lower(),
                url=response.url)
    elif "Taxes" in response.url:
        for row in response.xpath(
                "//tr[td/text()='Real Estate Tax Collector\xa0']"):
            # The cell is read once; it was previously queried twice.
            cell = row.xpath("td[2]/text()").get()
            # The name is the text after the first "(", stripped, minus its
            # last character; the whole cell is stored as the phone.
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="TAX COLLECTOR",
                name=cell.split("(")[1].strip()[:-1],
                phone=cell,
                url=response.url)
def parse(self, response):
    """Yield officials grouped under upper-case headings in the grid.

    The container's text is cut into groups, a new one starting at every
    fragment for which ``str.isupper()`` is true.  The heading is used as
    the office when it is "MAYOR" or "TAX COLLECTOR", otherwise MEMBER OF
    COUNCIL.  The second fragment is the name.  The third is the phone
    only when it contains "(".
    """
    for container in response.xpath(
            '//div[@data-mesh-id="Containerc1qrainlineContent-gridContainer"]'):
        texto = getAllText(container)
        folks = []
        current = []
        for text in texto:
            if not text.isupper():
                current.append(text)
                continue
            if current:
                folks.append(current)
            current = [text]
        # NOTE(review): the group still held in `current` is never added to
        # `folks`, so the entries under the final heading yield nothing --
        # confirm this is intended.
        for person in folks:
            heading = person[0]
            if heading in ["MAYOR", "TAX COLLECTOR"]:
                office = heading
            else:
                office = "MEMBER OF COUNCIL"
            if len(person) >= 3 and "(" in person[2]:
                phone = person[2]
            else:
                phone = None
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office=office,
                phone=phone,
                name=person[1],
                url=response.url)
def parse(self, response):
    """Scrape the mayor and council, or the tax collector, chosen by the
    URL's last character.

    't' -> one ``Official`` per row of the ``views-table cols-3`` table.
    'x' -> a TAX COLLECTOR ``Official`` from the third paragraph of the
    "Real Estate Property Tax" block.
    """
    page = response.url[-1]
    if page == 't':
        for row in response.xpath(
                '//table[@class="views-table cols-3"]/tbody/tr'):
            title = row.xpath("td[2]/text()").get().strip()
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="MAYOR" if title == "Mayor" else "MEMBER OF COUNCIL",
                name=row.xpath("td[1]/a/text()").get(),
                phone=row.xpath("td[3]/text()").get(),
                url=response.url)
    elif page == 'x':
        for para in response.xpath(
                '//div[contains(h5/text(),"Real Estate Property Tax")]/p[3]'):
            lines = getAllText(para)
            # Text before the first comma of the first line is the name.
            collector = lines[0].split(",")[0]
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="TAX COLLECTOR",
                name=collector,
                phone=lines[3],
                address=lines[1] + ", " + lines[2],
                url=response.url)
def parse(self, response):
    """Scrape the mayor or the council, chosen by the URL's penultimate
    character.

    "r" -> a MAYOR ``Official`` from article ``post-137``.
    "l" -> one MEMBER OF COUNCIL ``Official`` per paragraph link in
    ``div.entry-content``.  Members are keyed by the last word of the link
    text.  Full names are filled in from the div text where the last word
    matches.
    """
    marker = response.url[-2]
    if marker == "r":
        for article in response.xpath('//article[@id="post-137"]'):
            yield Official(
                muniName=self.muniName,
                muniType=self.muniType,
                office="MAYOR",
                # Text after the last en dash in the heading is the name.
                name=article.xpath("h1/text()").get().split("–")[-1],
                email=article.xpath('div/div[2]/p[2]/a/@href').get(),
                url=response.url)
    elif marker == "l":
        for content in response.xpath('//div[@class="entry-content"]'):
            fullNames = getAllText(content.xpath("div"))
            links = [(para.xpath("a/text()").get(),
                      para.xpath("a/@href").get())
                     for para in content.xpath("p")]
            members = {
                text.split(" ")[-1]: {"name": None, "email": href}
                for text, href in links
            }
            for full in fullNames:
                words = full.split(" ")
                if words[-1] in members:
                    # The last two words are kept as the name.
                    members[words[-1]]["name"] = " ".join(words[-2:])
            # first name taken from 2019 election returns; was not on
            # website when spider was first made.  .get() keeps the spider
            # running once "Moore" is no longer listed; a direct lookup
            # raised KeyError in that case.
            moore = members.get("Moore")
            if moore is not None and moore["name"] is None:
                moore["name"] = "John Moore"
            for member in members.values():
                yield Official(
                    muniName=self.muniName,
                    muniType=self.muniType,
                    office="MEMBER OF COUNCIL",
                    name=member["name"],
                    email=member["email"],
                    url=response.url)
def parse(self, response):
    """Scrape commissioners or the tax collector, depending on the URL.

    URLs containing "board" yield one COMMISSIONER ``Official`` per
    complete run of five text fragments under "Board Members".  The first
    fragment, bare commas and "Chairman" fragments are dropped beforehand.
    URLs containing 'taxes' yield a single TAX COLLECTOR ``Official`` from
    the first four text nodes after the "Delinquent Earned Income Tax
    Collector:" paragraph.
    """
    if "board" in response.url:
        for section in response.xpath(
                "//h2[contains(text(),'Board Members')]/.."):
            alltext = [
                text for text in getAllText(section)[1:]
                if text != ',' and "Chairman" not in text
            ]
            for i in range(len(alltext) // 5):
                base = i * 5
                # The second fragment is an email if it contains "@", a
                # phone if it contains "(".
                contact = alltext[base + 1]
                yield Official(
                    muniName=self.muniName,
                    muniType=self.muniType,
                    office="COMMISSIONER",
                    name=alltext[base],
                    district="WARD " + alltext[base + 2],
                    email=contact if "@" in contact else None,
                    phone=contact if "(" in contact else None,
                    # The last four characters are kept as the term end.
                    termEnd=alltext[base + 4][-4:],
                    url=response.url)
    elif 'taxes' in response.url:
        raw = response.xpath(
            "//p[contains(strong/text(),'Delinquent Earned Income Tax Collector:')]/following-sibling::p/text()"
        ).getall()[0:4]
        # Drop non-breaking spaces; the debug print of the cleaned lines
        # was removed.
        lines = [text.replace("\xa0", "").strip() for text in raw]
        yield Official(
            muniName=self.muniName,
            muniType=self.muniType,
            office="TAX COLLECTOR",
            name=lines[0],
            phone=lines[1],
            address=lines[2] + ", " + lines[3],
            url=response.url)