def get_station_data(self, resp):
    row_rows = resp.xpath(
        u"/html/body/div[7]/div/div[1]/div[2]/div/div[3]/table/tbody/tr[2]"
    )
    station_name = row_rows.xpath(u"td[1]/b/text()").extract_first()
    station_id = station_name
    aqi = row_rows.xpath(u"td[2]/p/text()").extract_first()
    pm25 = row_rows.xpath(u"td[3]/p/text()").extract_first()
    row_data_time = row_rows.xpath(u"td[4]/span/text()").extract_first()
    data_time = self.parse_date(row_data_time)
    _tmp_dict = Kind(self.name).get_dict(r_key=u"pm25", r_val=pm25)
    station_data = dict()
    if _tmp_dict:
        station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
    _tmp_dict = Kind(self.name).get_dict(r_key=u"aqi", r_val=aqi)
    if _tmp_dict:
        station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source_name
        items[u"source_id"] = str(station_id)
        yield items

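# parse_date is not shown in this section; a minimal sketch of what it
# plausibly does, modeled on the timezone handling the other callbacks
# here use (the body below is an assumption, not the source's code):
from dateutil.parser import parse
from pytz import timezone

def parse_date(self, row_data_time):  # method sketch; self.tz is the spider's timezone name
    """Parse a scraped timestamp string into a tz-aware datetime."""
    if not row_data_time:
        # missing cell on the page -> no data_time
        return None
    return parse(row_data_time).replace(tzinfo=timezone(self.tz))
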
def get_st_data(self, resp): regex = u"Details/(.+)\?type=1" st_id = findall(regex, resp.url) st_id = str(st_id[0]) row_data = resp.xpath(u'//*[@id="recentResults"]/tbody/tr') row_dt_hour = resp.xpath( u'//*[@id="recentResults"]/thead/tr[2]/th[1]/text()').re_first( u"Current \((.+)\)") new_dt = self.check_date(row_dt_hour) st_data = dict() for data in row_data: _name = data.xpath(u"td[1]/text()").re_first(u"(.+[\S])") _val = data.xpath(u"td[2]/text()").re_first(u"(.+[\S])") _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val) if _tmp_dict: st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"] if st_data: items = AppItem() items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE)) items[u"data_time"] = new_dt items[u"data_value"] = st_data items[u"source"] = self.source items[u"source_id"] = st_id yield items
def get_st_data(self, resp):
    row_json = resp.body.decode(u"utf-8")
    row_json = row_json.replace(u"\ufeff", u"")
    json = js_loads(row_json)
    data_time = self.get_date(json[u"DateTime"])
    stations = json[u"Stations"]
    for st in stations:
        name = st[u"Station"]
        params = st[u"ParameterValueList"]
        st_data = dict()
        for p in params:
            pol_name = p[u"Id"]
            pol_value = p[u"Value"]
            _tmp_dict = Kind(self.name).get_dict(r_key=pol_name,
                                                 r_val=pol_value)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = name
            yield items

def get_station_data(self, resp):
    body = resp.body
    root = etree.fromstring(body)
    row_data = root.xpath(u"//Measurement")
    station = dict()
    date = root.xpath(u"//Measurement[1]/Data/E/T/text()")
    data_time = parse(date[0]).replace(tzinfo=timezone(self.tz))
    for st in row_data:
        name = st.xpath(u"@SiteName")
        pol_name = st.xpath(u"DataSource/@Name")
        pol_val = st.xpath(u"Data/E/I1/text()")
        if name[0] not in station:
            station[name[0]] = dict()
        _tmp_dict = Kind(self.name).get_dict(r_key=pol_name[0],
                                             r_val=pol_val[0])
        if _tmp_dict:
            station[name[0]][_tmp_dict[u"key"]] = _tmp_dict[u"val"]
    for st_name in station:
        # yield only stations that produced at least one recognized value
        if station[st_name]:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station[st_name]
            items[u"source"] = self.source
            items[u"source_id"] = st_name
            yield items

def get_station_data(self, resp):
    _station_id = Selector(text=resp.url).re(u"site_id=(\d+)")
    station_id = _station_id[0]
    data_time = self.get_date(resp)
    table = resp.xpath(u'//*[@id="tab1"]/table/tr')
    station_data = dict()
    for row in table:
        col = row.xpath(u"td/text()").extract()
        col = col[:2]
        _name = col[0]
        _val = col[1]
        _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
        if _tmp_dict:
            station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = station_id
        yield items

def get_st_data(self, resp): regex = u".*ST_ID=(.+)" st_id = findall(regex, resp.url) st_id = str(st_id[0]) table_name = resp.xpath( u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td/span') table_val = resp.xpath( u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td/span') names = list() for row in table_name: # print(row) row_name = row.xpath(u"@title").extract_first() regex_name = u"(.+)" _name = findall(regex_name, row_name) try: names.append(_name[0]) except IndexError: names.append(None) values = list() for row in table_val: row_val = row.xpath(u"@title").extract_first() regex_val = u"\) (.+)" _val = findall(regex_val, row_val) try: values.append(_val[0]) except IndexError: values.append(None) # for n in names: # self.tmp_set.add(n) # open("manitoba_names.txt", "a").write(str(self.tmp_set) + "\n") try: new_dt = self.check_date(values[0]) except IndexError: new_dt = None data = zip(names, values) data.pop(0) # print(data) st_data = dict() for val in data: _name = val[0] _val = val[1] _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val) if _tmp_dict: st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"] if st_data: items = AppItem() items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE)) items[u"data_time"] = new_dt items[u"data_value"] = st_data items[u"source"] = self.source items[u"source_id"] = st_id yield items
def aqi_to_value(self, row_aqi):
    pollutant_name = None
    if row_aqi is not None:
        if u"*" in row_aqi:
            pollutant_name = u"pm10"
        elif u"a" in row_aqi:
            pollutant_name = u"so2"
        elif u"b" in row_aqi:
            pollutant_name = u"no2"
        elif u"c" in row_aqi:
            pollutant_name = u"o3"
        elif u"d" in row_aqi:
            pollutant_name = u"co"
    station_data = dict()
    if pollutant_name is not None:
        val = row_aqi[:-1]
        # convert the AQI to a concentration value if possible
        val = Aqi().aqi_to_val(val, pollutant_name)
        _tmp_dict = Kind(self.name).get_dict(r_key=pollutant_name,
                                             r_val=val)
        if _tmp_dict:
            station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
    return station_data

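# Hedged sketch, not from the source: the one-letter suffix convention in
# aqi_to_value is a fixed mapping, so it can live in a lookup table.
# SUFFIX_TO_POLLUTANT and pollutant_from_suffix are hypothetical names;
# this assumes each AQI string carries at most one marker, as the
# if/elif ladder above does.
SUFFIX_TO_POLLUTANT = {
    u"*": u"pm10",
    u"a": u"so2",
    u"b": u"no2",
    u"c": u"o3",
    u"d": u"co",
}

def pollutant_from_suffix(row_aqi):
    """Return the pollutant encoded by the AQI suffix marker, or None."""
    if not row_aqi:
        return None
    for marker, pollutant in SUFFIX_TO_POLLUTANT.items():
        if marker in row_aqi:
            return pollutant
    return None
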
def get_st_data(self, resp):
    json = self.get_page(resp)
    for obj in json:
        body = Selector(text=obj[u"description"])
        rows = body.xpath(u"//html/body/table/tr[3]/td/table/tr")
        st_id = str(
            body.xpath(u"//html/body/table/tr[1]/td/text()").extract_first())
        data_date = body.xpath(
            u"//html/body/table/tr[2]/td/text()").extract_first()
        # rstrip(u" (IST)") strips any of those characters from the end
        # of the string, so remove the suffix explicitly instead
        data_date = str(data_date.replace(u" (IST)", u""))
        data_date = parse(data_date)
        new_dt = data_date.replace(tzinfo=pytz.timezone(self.tz))
        st_data = dict()
        for row in rows:
            col = row.xpath(u"td/text()").extract()
            try:
                _name = col[0]
                _val = col[1]
                _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                if _tmp_dict:
                    st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
            except IndexError:
                pass
        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.datetime.now(
                tz=pytz.timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id
            yield items

def read_st_data(self, resp):
    st_data = resp.xpath(u"StationMeasurement")
    stations = dict()
    dt = resp.xpath(
        u"StationMeasurement[1]/ReadingDate/text()").extract_first()
    new_dt = self.get_date(dt)
    for data in st_data:
        name = data.xpath(u"StationName/text()").extract_first()
        poll_name = data.xpath(u"ParameterName/text()").extract_first()
        poll_val = data.xpath(u"Value/text()").extract_first()
        _name = str(poll_name)
        _val = str(poll_val)
        if name not in stations:
            name = str(name)
            stations[name] = dict()
        _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
        if _tmp_dict:
            stations[name][_tmp_dict[u"key"]] = _tmp_dict[u"val"]
    result = (stations, new_dt)
    return result

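# read_st_data returns (stations, new_dt) instead of yielding items; a
# hedged sketch of how a caller could turn that tuple into AppItem
# objects, mirroring the other callbacks in this section (the method
# name emit_station_items is hypothetical):
def emit_station_items(self, resp):
    stations, new_dt = self.read_st_data(resp)
    for st_name, st_data in stations.items():
        if not st_data:
            # skip stations with no recognized pollutant values
            continue
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = new_dt
        items[u"data_value"] = st_data
        items[u"source"] = self.source
        items[u"source_id"] = st_name
        yield items
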
def get_st_data(self, resp):
    body = resp.body
    body = body.replace(u"\n", u"")
    json = js_parse(body)
    exception = (u"name", u"gps", u"date", u"datetime", u"time")
    for station in json:
        station_name = station[u"name"]
        data_time = self.get_date(station[u"datetime"])
        station_data = dict()
        for attr_name in station:
            if attr_name not in exception:
                pol_name = attr_name
                pol_value = station[attr_name]
                _tmp_dict = Kind(self.name).get_dict(r_key=pol_name,
                                                     r_val=pol_value)
                if _tmp_dict:
                    station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_name
            yield items

def get_station_data(self, resp): data_time = resp.xpath(u'//*[@id="main"]/div[1]/div[2]/p[3]/text()').re(u"(\d\d\/\d\d\/\d\d\d\d\s\d\d:\d\d)") data_time = parser.parse(data_time[0]).replace(tzinfo=timezone(self.tz)) if data_time else None table = resp.xpath(u'//*[@id="tabs-content-data"]/table/tbody/tr') station_data = dict() for row in table: pollutant_index = row.xpath(u"td[1]/sub/text()").extract_first() if row.xpath(u"td[1]/sub/text()").extract_first() != None else u"" pollutant_name = u" ".join(( row.xpath(u"td[1]/text()").extract_first().split(u" (")[0], pollutant_index, row.xpath(u"td[4]/text()").extract_first() )).replace(u" ", u" ") pollutant_value = row.xpath(u"td[3]/text()").extract_first().split(u" ")[0] if row.xpath(u"td[3]/text()").extract_first().split(u" ")[0] != u"No" else None pollutant = Kind(self.name).get_dict(r_key=pollutant_name, r_val=pollutant_value) if pollutant: station_data[pollutant[u"key"]] = pollutant[u"val"] if station_data: items = AppItem() items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE)) items[u"data_time"] = data_time items[u"data_value"] = station_data items[u"source"] = self.source items[u"source_id"] = resp.meta[u"code"] yield items
def get_station_data(self, resp):
    data_time = resp.xpath(
        u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td[1]/span/text()'
    ).extract_first()
    data_time = parser.parse(data_time)
    pollutant_name = resp.xpath(
        u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td')[1:]
    pollutant_name = [
        el.xpath(u"span/text()").extract_first() for el in pollutant_name
    ]
    pollutant_data = resp.xpath(
        u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td')[1:]
    pollutant_data = [
        el.xpath(u"span/text()").extract_first() for el in pollutant_data
    ]
    data = zip(pollutant_name, pollutant_data)
    station_data = dict()
    for record in data:
        pollutant = Kind(self.name).get_dict(r_key=record[0],
                                             r_val=record[1])
        if pollutant:
            station_data[pollutant[u"key"]] = pollutant[u"val"]
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time.replace(tzinfo=timezone(self.tz))
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items

def get_st_data(self, resp): regex = u"Data/(.+)_Line\.xml" st_id = findall(regex, resp.url) st_id = str(st_id[0]) row_data = resp.xpath(u"tname[1]/child::*") row_dt = resp.xpath(u"tname[1]/DATE_TIME/text()").extract_first() new_dt = self.check_date(row_dt) st_data = dict() for el in row_data: tag_name = el.xpath(u"name()").extract_first() tag_val = el.xpath(u"text()").extract_first() _name = tag_name _val = tag_val _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val) if _tmp_dict: st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"] if st_data: items = AppItem() items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE)) items[u"data_time"] = new_dt items[u"data_value"] = st_data items[u"source"] = self.source items[u"source_id"] = st_id yield items
def get_st_data(self, resp):
    date = resp.xpath(
        u'//*[@id="MainContent"]/div[2]/div[2]/ul/li/span/a/text()'
    ).extract_first()
    date = date.replace(u"\t", u"")
    date = date.replace(u"\nAir Quality Index ", u"")
    all_tables = resp.xpath(
        u'//*[@class="table table-alternate table-condensed"]')
    tables = [x for i, x in enumerate(all_tables) if i % 2 == 0]
    tables_date = [x for i, x in enumerate(all_tables) if i % 2 != 0]
    tables = tables[:-1]
    for index, table in enumerate(tables):
        hour = tables_date[index].xpath(
            u"tbody/tr/td/text()").re_first(u"AQI at (\d+) hrs")
        row_data_time = date + u" " + hour
        data_time = parse(row_data_time).replace(tzinfo=timezone(self.tz))
        station_id = table.xpath(
            u"thead/tr/th[1]/text()").re_first(u"(.+) A.Q.M.S.")
        if u" Particles" in station_id:
            station_id = station_id.replace(u" Particles", u"")
        if u" Mobile" in station_id:
            station_id = station_id.replace(u" Mobile", u"")
        rows = table.xpath(u"tbody/tr")
        station_data = dict()
        for row in rows:
            name = row.xpath(u"td[1]/text()").extract_first()
            name = name.replace(u" ", u"")
            aqi = float(row.xpath(u"td[3]/text()").extract_first())
            if name == u"SO2":
                val_name = u"so2"
            elif name == u"NO2":
                val_name = u"no2"
            elif name == u"O3":
                val_name = u"o3"
            elif name == u"PM10":
                val_name = u"pm10"
            elif name == u"PM2.5":
                val_name = u"pm25"
            elif name == u"CO":
                val_name = u"co"
            else:
                val_name = None
            if val_name:
                val = Aqi().aqi_to_val(aqi, val_name)
                _tmp_dict = Kind(self.name).get_dict(r_key=name, r_val=val)
                if _tmp_dict:
                    station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items

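# Hedged sketch, not from the source: the if/elif ladder above encodes a
# fixed name mapping; a module-level dict expresses the same thing
# (NAME_TO_KEY is a hypothetical name):
NAME_TO_KEY = {
    u"SO2": u"so2",
    u"NO2": u"no2",
    u"O3": u"o3",
    u"PM10": u"pm10",
    u"PM2.5": u"pm25",
    u"CO": u"co",
}
# val_name = NAME_TO_KEY.get(name)  # None for unknown names, as before
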
def validate_station_data(self, meta):
    """validate dictionary from response META attribute"""
    results = meta.get(u"results")
    res1 = results.get(u"res1")
    res2 = results.get(u"res2")
    res3 = results.get(u"res3")
    df1 = DataFrame.from_dict(res1).sort_values(by=u"date")
    df2 = DataFrame.from_dict(res2).sort_values(by=u"date")
    df3 = DataFrame.from_dict(res3).sort_values(by=u"date")
    df = merge(df1, df2, on=u"date")
    df = merge(df, df3, on=u"date")
    source_code = meta.get(u"station_code")
    for obs in df.itertuples():
        st_data = dict()
        co = Kind(self.name).get_dict(r_key=u"co", r_val=obs.co)
        pm10 = Kind(self.name).get_dict(r_key=u"pm10", r_val=obs.pm10)
        no2 = Kind(self.name).get_dict(r_key=u"no2", r_val=obs.no2)
        if co:
            st_data[co[u"key"]] = co[u"val"]
        if pm10:
            st_data[pm10[u"key"]] = pm10[u"val"]
        if no2:
            st_data[no2[u"key"]] = no2[u"val"]
        data_time = obs.date.to_datetime()
        data_time = data_time.replace(tzinfo=timezone(self.tz))
        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = source_code
            yield items

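# A minimal, self-contained illustration of the three-way merge above
# (the column names follow from the obs.co / obs.pm10 / obs.no2 access;
# the values are invented for the example):
from pandas import DataFrame, merge

df1 = DataFrame({u"date": [1, 2], u"co": [0.4, 0.5]})
df2 = DataFrame({u"date": [1, 2], u"pm10": [20, 22]})
df3 = DataFrame({u"date": [1, 2], u"no2": [11, 13]})
df = merge(merge(df1, df2, on=u"date"), df3, on=u"date")
# df now has one row per shared date with co, pm10 and no2 columns,
# which is exactly the shape the itertuples() loop consumes.
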
def get_additional_data(self, resp):
    weather_data = resp.meta[u"data"]
    body = resp.body
    body = body.split(u"\r\n")
    col_names = body[8].lstrip(u"#")
    col_names = col_names.split(u", ")
    col_names = col_names[1:]
    data_time = self.get_date(body[1])
    table = body[9:-1]
    for row in table:
        col = row.split(u",")
        col_values = list()
        for el in col:
            if u" " in el:
                el = el.replace(u" ", u"")
            if u"/" in el:
                el = None
            if u"-99" == el:
                el = None
            if u"-999" == el:
                el = None
            if u"-9999" == el:
                el = None
            col_values.append(el)
        station_id = col_values[0]
        col_values = col_values[1:]
        data = zip(col_names, col_values)
        station_data = dict()
        for st in data:
            _name = st[0]
            _val = st[1]
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        # append the weather data collected from the first response
        station_data.update(weather_data[station_id])
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items

def get_station_data(self, resp):
    row_names = ('Date', 'HOP', 'HGC', 'BSY', 'MEX', 'MTC', 'HEW', 'CLY')
    # process the PDF document
    stream = StringIO.StringIO(resp.body)
    rsrcmgr = PDFResourceManager()
    retstr = StringIO.StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    process_pdf(rsrcmgr, device, stream)
    device.close()
    doc_str = retstr.getvalue()
    retstr.close()
    # split the extracted text line by line
    row_data = doc_str.split('\n')
    first_row = row_data[17:25]
    data = zip(row_names, first_row)
    data_time = self.get_date(data[0][1])
    data = data[1:]
    for st in data:
        station_id = st[0]
        _name = 'PM10_24HR'
        _val = st[1]
        if '*' in _val:
            _val = _val.replace('*', '')
        _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
        station_data = dict()
        if _tmp_dict:
            station_data[_tmp_dict['key']] = _tmp_dict['val']
        if station_data:
            items = AppItem()
            items['scrap_time'] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items['data_time'] = data_time
            items['data_value'] = station_data
            items['source'] = 'http://superpit.com.au'
            items['source_id'] = station_id
            yield items

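# Hedged note: process_pdf belongs to the legacy pdfminer API; with
# pdfminer.six the same text extraction is a single call (a sketch,
# assuming pdfminer.six is installed; pdf_body_to_text is a hypothetical
# helper name):
from io import BytesIO
from pdfminer.high_level import extract_text

def pdf_body_to_text(body):
    """Extract plain text from the PDF bytes in a response body."""
    return extract_text(BytesIO(body))
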
def get_st_data(self, resp): regex = u"AP=(.+)" st_id = findall(regex, resp.url) st_id = str(st_id[0]) row_names = resp.xpath(u'//*[@id="apTable"]/table/tr[1]/th/span') row_data = resp.xpath(u'//*[@id="apTable"]/table/tr[2]/td') new_dt = self.check_date(resp) print(new_dt) names = list() for name in row_names: _name = name.xpath(u"text()").extract() try: names.append(_name[0]) except IndexError: names.append(None) # print(_name) vals = list() for val in row_data: _val = val.xpath(u"text()").extract() try: vals.append(_val[0]) except IndexError: vals.append(None) data = zip(names, vals) data.pop(0) st_data = dict() for val in data: _name = val[0] _val = val[1] _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val) if _tmp_dict: st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"] if st_data: items = AppItem() items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE)) items[u"data_time"] = new_dt items[u"data_value"] = st_data items[u"source"] = self.source items[u"source_id"] = st_id yield items
def get_st_data(self, resp): regex = u".*ST_ID=(.+)" station_id = findall(regex, resp.url) station_id = str(station_id[0]) # формужмо значення назв забрудників таблиці row_col_names = resp.xpath( u'//*[@id="C1WebGrid1"]/tbody/tr[1]/td/div/text()').extract() col_names = list() for col_name in row_col_names: col_name = col_name.lstrip(u'\n\t') col_name = col_name.rstrip(u'\n\t') col_names.append(col_name) col_names = col_names[1:] # витягуємо значення забрудників row_data = resp.xpath(u'//*[@id="C1WebGrid1"]/tbody/tr[5]/td') data_values = list() for data in row_data: row_value = data.xpath(u"div/text()").re(u"([\S].+[\S])") try: data_values.append(row_value[0]) except IndexError: data_values.append(None) # значення дати даних data_date = parse(data_values[0]) data_date = data_date.replace(tzinfo=timezone(self.tz)) data_values = data_values[1:] data = zip(col_names, data_values) station_data = dict() for st in data: _name = st[0] _val = st[1] _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val) if _tmp_dict: station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"] if station_data: items = AppItem() items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE)) items[u"data_time"] = data_date items[u"data_value"] = station_data items[u"source"] = self.source items[u"source_id"] = station_id return items
def get_station_data(self, resp):
    body = resp.body
    body = body.split(u"\r\n")
    col_names = body[8].lstrip(u"#")
    col_names = col_names.split(u", ")
    col_names = col_names[1:]
    table = body[9:-1]
    weather_data = dict()
    for row in table:
        col = row.split(u",")
        col_values = list()
        for el in col:
            if u" " in el:
                el = el.replace(u" ", u"")
            if u"/" in el:
                el = None
            if u"-99" == el:
                el = None
            if u"-999" == el:
                el = None
            if u"-9999" == el:
                el = None
            col_values.append(el)
        station_id = col_values[0]
        col_values = col_values[1:]
        data = zip(col_names, col_values)
        station_data = dict()
        for st in data:
            _name = st[0]
            _val = st[1]
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        weather_data[station_id] = station_data
    return Request(
        u"http://epa.tas.gov.au/air/live/epa_tas_latest_particle_data.txt",
        callback=self.get_additional_data,
        meta=dict(data=weather_data)
    )

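# The sentinel cleanup above is duplicated in get_additional_data; a
# hedged sketch of a shared helper both callbacks could call
# (clean_column is a hypothetical name):
def clean_column(raw_cells):
    """Strip spaces and map the feed's no-data sentinels to None."""
    cleaned = list()
    for el in raw_cells:
        el = el.replace(u" ", u"")
        if u"/" in el or el in (u"-99", u"-999", u"-9999"):
            el = None
        cleaned.append(el)
    return cleaned
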
def get_st_data(self, resp):
    table = resp.xpath(u'//*[@id="right_column"]/div/table/tbody/tr')
    col_names = resp.xpath(
        u'//*[@id="right_column"]/div/table/thead//th/@abbr').extract()
    data_date = resp.xpath(
        u'//*[@id="right_column"]/div/h1/text()').extract()
    # lstrip() removes a character set, not a prefix, so drop the
    # leading label explicitly
    data_date = str(data_date[0]).replace(
        u"Pollutant Concentrations for ", u"")
    dt = parse(data_date)
    new_dt = dt.replace(tzinfo=timezone(self.tz))
    # get correct values; if there is no value, append None
    for row in table:
        cols = row.xpath(u"td")
        # get the station id from the href
        url = cols[0].xpath(u"div/a/@href").extract()
        url = str(url[0])
        regex_station = u"stationid=(.+)"
        st_id = re.findall(regex_station, url)
        st_id = str(st_id[0])
        row_data = []
        for col in cols:
            text = col.xpath(u"div[1]/text()").extract()
            try:
                row_data.append(text[0])
            except IndexError:
                row_data.append(None)
        gen_data = zip(col_names, row_data)
        st_data = dict()
        for data in gen_data:
            _key = data[0]
            _val = data[1]
            _tmp_dict = Kind(self.name).get_dict(r_key=_key, r_val=_val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id
            yield items

def get_st_data(self, resp):
    # get the local id from the url
    regex_state = u"StateId=(.+?)&"
    regex_city = u"CityId=(.+)"
    regex_station = u"StationName=(.+?)&"
    _state_id = re.findall(regex_state, resp.url)
    _city_id = re.findall(regex_city, resp.url)
    _station_id = re.findall(regex_station, resp.url)
    st_id = u"".join(
        (_station_id[0].replace(u"%20", u" "), _state_id[0], _city_id[0]))
    table = resp.xpath(u'//*[@id="lblReportCurrentData"]/table/child::*')
    data_date = resp.xpath(
        u'//*[@id="lblCurrentDateTime"]/text()').extract_first()
    st_data = {}
    for el in table:
        col = el.xpath(u"child::td")
        try:
            name = col[0].xpath(u"text()").extract()
            _name = name[0]
            _val = col[3].xpath(u"span/text()").extract_first()
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        except IndexError:
            pass
    if data_date:
        data_date = data_date.replace(u"Date Time : ", u"")
        new_dt = parse(data_date)
        new_dt = new_dt.replace(tzinfo=timezone(self.tz))
    else:
        new_dt = None
    if st_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.datetime.now(
            tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = new_dt
        items[u"data_value"] = st_data
        items[u"source"] = self.source
        items[u"source_id"] = st_id
        return items

def get_station_data(self, resp): data_time = resp.xpath(u'//*[@id="tab_content"]/p[1]/strong/text()' ).re(u"(\d\d\/\d\d\/\d\d\d\d\s\d\d:\d\d)") data_time = parser.parse(data_time[0]).replace( tzinfo=timezone(self.tz)) if data_time else None table = resp.xpath(u'//*[@id="tab_content"]/table/tr')[1:] station_data = dict() for row in table: pollutant_name = row.xpath(u"td[1]/text()").extract_first() pollutant_name_ind = row.xpath( u"td[1]/sub/text()").extract_first() if row.xpath( u"td[1]/sub/text()").extract_first() != None else u"" pollutant_name_time = row.xpath( u"td[last()]/text()").extract_first() pollutant_name = (u" ".join( (pollutant_name, pollutant_name_ind, pollutant_name_time))).replace(u" ", u" ") pollutant_value = row.xpath( u"td[last() - 1]/text()").extract_first() if u"\xa0" in pollutant_value: pollutant_value = pollutant_value.split(u"\xa0")[0] else: pollutant_value = pollutant_value.split(u" ")[0] pollutant_value = pollutant_value if pollutant_value != u"No" else None pollutant = Kind(self.name).get_dict(r_key=pollutant_name, r_val=pollutant_value) if pollutant: station_data[pollutant[u"key"]] = pollutant[u"val"] if station_data: items = AppItem() items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE)) items[u"data_time"] = data_time items[u"data_value"] = station_data items[u"source"] = self.source items[u"source_id"] = resp.meta[u"code"] yield items
def get_station_data(self, resp):
    # get the data time
    row_data_time = resp.xpath(u'//*[@id="titulo"]').re(
        u"<h4>(.+)</h4>")[0]
    data_time = self.check_data_time(row_data_time)
    # get the data rows
    row_data_table = resp.xpath(u'//*[@id="dados_estacoes"]/table/tr')[2:]
    for row in row_data_table:
        colspan_check = row.xpath(u"td/@colspan").extract()
        if not colspan_check:
            cols = row.xpath(u"td/text()").extract()
            station_name = cols[0]
            station_id = station_name
            so2 = (u"so2", cols[1].strip(u" "))
            co = (u"co", cols[2].strip(u" "))
            pm10 = (u"pm10", cols[3].strip(u" "))
            o3 = (u"o3", cols[4].strip(u" "))
            no2 = (u"no2", cols[5].strip(u" "))
            # full pollution data
            data = (so2, co, pm10, o3, no2)
            station_data = dict()
            for val in data:
                _name = val[0]
                _val = val[1]
                _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                if _tmp_dict:
                    station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id
                yield items

def get_station_data(self, resp):
    tabs = resp.xpath(u'//*[@id="Form1"]/div[3]/div[2]/article')[1:]
    for tab in tabs:
        row_data_time = tab.xpath(u"header/span/text()").extract_first()
        data_time = self.check_data_time(row_data_time)
        row_data_table = tab.xpath(u"section/ul/li")
        for row in row_data_table:
            station_name = row.xpath(u"a/h3/text()").extract_first()
            station_id = station_name
            pollution_table = row.xpath(u"div/div[3]/table/tbody/tr")
            station_data = dict()
            for pollution in pollution_table:
                # pollution names
                pollution_name = pollution.xpath(
                    u"td[1]/text()").extract_first()
                sub_pollution_name = pollution.xpath(
                    u"td[1]/sub/text()").extract_first()
                if sub_pollution_name is not None:
                    pollution_name += sub_pollution_name
                pollution_name = pollution_name.split(u" \u2013 ")[1]
                pollution_name = clean(pollution_name)
                # pollution values
                pollution_value = pollution.xpath(
                    u"td[2]/text()").extract_first()
                pollution_value = clean(pollution_value, (u"\xcdndice:",))
                _tmp_dict = Kind(self.name).get_dict(r_key=pollution_name,
                                                     r_val=pollution_value)
                if _tmp_dict:
                    station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id
                yield items

def get_st_data(self, resp):
    data_date = self.get_date(resp)
    col_names = resp.xpath(
        u'//*[@id="column-main"]/table/tr[1]/th/text()').extract()
    value_names = list()
    for name in col_names:
        value_names.append(name)
    value_names = value_names[2:]
    table = resp.xpath(u'//*[@id="column-main"]/table/tr')
    table = table[1:]
    for row in table:
        col = row.xpath(u"td/text()").extract()
        st_id = col[0].strip(u"\xa0")
        col = col[2:]
        data_values = list()
        for value in col:
            data_values.append(value)
        data = zip(value_names, data_values)
        st_data = dict()
        for st in data:
            _name = st[0]
            _val = st[1]
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_date
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id
            yield items

def get_st_data(self, resp):
    stations = resp.xpath(u"STATIONS/STRD")
    for station in stations:
        st_id = station.xpath(u"@ID").extract_first()
        st_id = str(st_id)
        row_st_data = station.xpath(u"READS/RD[1]")
        dt = row_st_data.xpath(u"@DT").extract()
        if dt:
            # @DT comes as u"Y,m,d,H,M,S"; rebuild it into a string the
            # generic date parser accepts
            dt_arr = str(dt[0]).split(u",")
            dt_str = (dt_arr[1] + u"." + dt_arr[2] + u"." + dt_arr[0] +
                      u" " + dt_arr[3] + u":" + dt_arr[4] + u":" + dt_arr[5])
            new_dt = parse(str(dt_str))
            new_dt = new_dt.replace(tzinfo=timezone(self.tz))
        else:
            new_dt = None
        row_params = row_st_data.xpath(u"PARAMS/PV")
        st_data = {}
        for pv in row_params:
            # recognize the key/value pair from the table
            _name = pv.xpath(u"@NM").extract_first()
            _val = pv.xpath(u"@VL").extract_first()
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id
            yield items

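# The string surgery above rebuilds the comma-separated @DT attribute
# into something the generic parser accepts; a hedged alternative that
# parses the format directly (parse_rd_timestamp is a hypothetical name;
# the format string follows from the u"Y,m,d,H,M,S" layout handled above):
from datetime import datetime

def parse_rd_timestamp(dt_raw, tz):
    """u'2018,01,05,13,00,00'-style @DT string -> tz-aware datetime."""
    naive = datetime.strptime(dt_raw, u"%Y,%m,%d,%H,%M,%S")
    return naive.replace(tzinfo=tz)
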
def get_st_data(self, resp):
    col_names = (u"SO2", u"NO", u"NO2", u"NOX", u"CO", u"OX", u"NMHC",
                 u"CH4", u"THC", u"SPM", u"PM2.5", u"SP", u"WS", u"TEMP",
                 u"HUM")
    data_time = parse(
        self.date[u"date"] + u" " + self.date[u"hour"]
    ).replace(tzinfo=timezone(self.tz))
    rows = resp.xpath(u"//table/tr")
    for row in rows:
        col = row.xpath(u"td/text()").extract()
        # drop the two cells that have no corresponding column name
        col.pop(1)
        col.pop(13)
        station_id = col[0]
        col = col[1:]
        data = zip(col_names, col)
        station_data = dict()
        for st in data:
            _name = st[0]
            _val = st[1]
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items

def get_value(self, string):
    regex = u"<h2>(\d*.?)</h2>"
    substr = findall(regex, string)
    try:
        _substr = substr[0]
        aqi = _substr[:-1]
        names = [u"aqi"]
        if u"*" in _substr:
            names.append(u"pm10")
        elif u"a" in _substr:
            names.append(u"so2")
        elif u"b" in _substr:
            names.append(u"no2")
        elif u"c" in _substr:
            names.append(u"o3")
        elif u"d" in _substr:
            names.append(u"co")
        st_data = dict()
        for name in names:
            val = aqi
            if name != u"aqi":
                # convert the AQI to a concentration value if possible
                val = Aqi().aqi_to_val(float(aqi), name)
            _tmp_dict = Kind(self.name).get_dict(r_key=name, r_val=val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        return st_data
    except IndexError:
        return None

def get_station_data(self, resp):
    stations = resp.xpath(u"//station")
    data_date = resp.xpath(u"category/@measurementdate").extract_first()
    data_hour = resp.xpath(u"category/@measurementhour").extract_first()
    if u"24" == data_hour:
        data_hour = u"00"
    data_time = data_date + u" " + data_hour
    data_time = parse(data_time).replace(tzinfo=timezone(self.tz))
    for st in stations:
        station_id = st.xpath(u"@name").extract_first()
        if u"'" in station_id:
            station_id = station_id.replace(u"'", u"")
        measurements = st.xpath(u"measurement")
        station_data = dict()
        for meas in measurements:
            pol_name = meas.xpath(u"@name").extract_first()
            pol_val = meas.xpath(u"text()").extract_first()
            _tmp_dict = Kind(self.name).get_dict(r_key=pol_name,
                                                 r_val=pol_val)
            if _tmp_dict:
                station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items