def get_station_data(self, resp):
    """Parse a CSV response of (station, time, pollutant, value) rows and
    yield one AppItem holding the latest measurement per pollutant."""
    # print(resp.text)
    all_data = pd.read_csv(
        StringIO(resp.text),
        names=(u"station_name", u"str_date", u"pollutant_name",
               u"pollutant value")).dropna(axis=0)
    all_data[u"date_time"] = [
        parser.parse(x) for x in all_data[u'str_date']
    ]
    current_data_time = all_data[u"date_time"].max()
    # print(current_data_time)
    curr_all_data = all_data[all_data[u"date_time"] == current_data_time]
    # Boolean mask: per pollutant, keep the rows carrying that
    # pollutant's newest timestamp within the current slice.
    idx = curr_all_data.groupby(
        by=u"pollutant_name")[u"date_time"].transform(
            max) == curr_all_data[u"date_time"]
    # Units for the raw pollutant names this source is known to emit.
    units = {
        u"PM2.5": u"ug/m3",
        u"PM10-NEW": u"ug/m3",
        u"Sulfur Dioxide": u"ppb",
        u"Carbon Monoxide": u"ppb",
        u"Ozone 1 hour": u"ppb",
        u"Nitrogen Dioxide": u"ppb",
    }
    data = curr_all_data[idx].copy()
    station_data = dict()
    for el in data.itertuples():
        # itertuples() keeps the index at el[0]; columns follow in
        # declaration order, so el[3] is pollutant_name, el[4] the value.
        pollutant_name = el[3]
        pollutant_value = el[4]
        pollutant_units = units.get(pollutant_name)
        # print(pollutant_name, pollutant_value, pollutant_units)
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(pollutant_name)
        pollutant.set_raw_value(pollutant_value)
        pollutant.set_raw_units(pollutant_units)
        # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
        # Only keep measurements the Feature normalizer recognized.
        if pollutant.get_name() is not None and pollutant.get_value(
        ) is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()
    # print(station_data)
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = pd.to_datetime(current_data_time).replace(
            tzinfo=timezone(self.tz))
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Extract pollutant series from the inline chart script on the
    station page and yield an AppItem for the newest timestamp."""
    raw_text = resp.xpath(u'/html/head/script[last()]/text()')
    raw_pollutant_data = raw_text.re(u"var data = (.+);")
    # The embedded JS arrays use single quotes and a trailing comma;
    # normalize so ujson can parse the text as JSON.
    raw_pollutant_data = [
        el.replace(u"'", u'"').replace(u"],]", u"]]")
        for el in raw_pollutant_data
    ]
    pollutant_data = [ujson.loads(el) for el in raw_pollutant_data]
    # Keep only the last [date, value] pair of every series.
    pollutant_data = [el[-1] for el in pollutant_data]
    pollutant_value = [el[1] for el in pollutant_data]
    # pollutant_date = [el[0] for el in pollutant_data]
    pollutant_date = [
        parser.parse(el[0]).replace(tzinfo=timezone(self.tz))
        for el in pollutant_data
    ]
    # max value as current date
    current_data_time = max(pollutant_date)
    # print(current_data_time)
    # data_time = parser.parse(raw_data_time).replace(tzinfo=timezone(self.tz))
    # Series titles look like "<name>(<unit>)" inside HTML; split them
    # into name and unit with two regexes over the same text node.
    raw_pollution_name = raw_text.re(u"series: \[\s+\{\s+name: '(.+)',?")
    pollutant_name = [
        Selector(text=el).xpath(u"/html/body/p/text()").re(u"(.+)\(")[0]
        for el in raw_pollution_name
    ]
    pollutant_units = [
        Selector(text=el).xpath(u"/html/body/p/text()").re(u"\((.+)\)")[0]
        for el in raw_pollution_name
    ]
    data = zip(pollutant_name, pollutant_value, pollutant_units,
               pollutant_date)
    station_data = dict()
    for record in data:
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        # print("record", record)
        pollutant.set_raw_name(record[0])
        pollutant.set_raw_value(record[1])
        pollutant.set_raw_units(record[2])
        # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
        # Keep recognized pollutants measured at the newest time only.
        if pollutant.get_name() is not None and pollutant.get_value(
        ) is not None and record[3] == current_data_time:
            station_data[pollutant.get_name()] = pollutant.get_value()
    if station_data and current_data_time:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = current_data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Parse the hydro/meteo observation table and yield an AppItem.

    Reads the header row for column names, reshapes the flat <td> list
    into rows, takes the newest (first) data row, and converts every
    measurement into a normalized Feature.
    """
    raw_col_names = resp.xpath(
        u"/html/body/div[1]/div[2]/table/tr[1]/td").extract()
    # Strip HTML tags to get plain-text column names.
    col_names = [re.sub(u"<.+?>", u"", el) for el in raw_col_names]
    table = resp.xpath(u'/html/body/div[1]/div[2]/table//td')
    # A cell matching "<td>...</td>" yields its inner text, else None.
    table_data = [
        el.xpath(u".").re(u"<td>(.+)<\/td>")[0]
        if el.xpath(u".").re(u"<td>(.+)<\/td>") else None
        for el in table
    ]
    # Floor division: under Python 3 "/" returns a float, which numpy
    # rejects as a reshape dimension; "//" works on both 2 and 3.
    table_data = np.asarray(table_data).reshape(
        len(table_data) // len(col_names), len(col_names))
    df = pd.DataFrame(table_data[1:, ], columns=col_names)
    raw_data = df.iloc[0].to_dict()
    raw_data_time = raw_data.pop(u"Дата і час", None)
    data_time = parser.parse(
        raw_data_time, dayfirst=True).replace(tzinfo=timezone(self.tz))
    data = raw_data
    # Units for the raw (Ukrainian) column names of this source.
    units = {
        u"Температура повітря": u"degc",
        u"Опади": u"mm",
        u"Рівень №2": u"NA",
        u"Рівень №1": u"NA",
        u"Рівень": u"NA",
        u"Температура води": u"degc",
    }
    station_data = dict()
    for key, val in data.items():
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(key)
        pollutant.set_raw_value(val)
        try:
            pollutant.set_raw_units(units[key])
        except KeyError:
            # Same diagnostic style as the sibling scrapers: report an
            # unmapped column instead of aborting the whole parse.
            print(
                u"There is no such pollutant in local units list "
                u"<<<<<<<<{0}>>>>>>".format(key))
        if pollutant.get_name() is not None and pollutant.get_value(
        ) is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Parse <record> XML elements into per-pollutant rows, keep only the
    newest timestamp, and return a single AppItem for it.

    Fix: hour "2400" denotes the end of the day, i.e. midnight of the
    NEXT day; previously it was rewritten to "0000" of the same day,
    shifting the timestamp 24 hours early.
    """
    records = resp.xpath(u"//record")
    table = list()
    for rec in records:
        date = rec.xpath(u"./@date").extract_first()
        hour = rec.xpath(u"./@hour").extract_first()
        # "2400" means end-of-day; parse as 00:00 then roll forward.
        rolled = False
        if hour == u"2400":
            hour = u"0000"
            rolled = True
        raw_data_date = u" ".join((date, hour))
        data_date = parser.parse(raw_data_date)
        if rolled:
            data_date = data_date + timedelta(days=1)
        values = rec.xpath(u"child::node()")
        for val in values:
            pollutant_name = val.xpath(u"name(.)").extract_first()
            pollutant_value = val.xpath(u"./text()").extract_first()
            pollutant_unit = val.xpath(u"./@unit").extract_first()
            row = {
                u"date": data_date,
                u"pollutant_name": pollutant_name,
                u"pollutant_value": pollutant_value,
                u"pollutant_unit": pollutant_unit,
            }
            table.append(row)
    data = pd.DataFrame(table)
    data = data.dropna(axis=0)
    current_data_time = data[u"date"].max()
    curr_data = data[data[u"date"] == current_data_time]
    station_data = dict()
    for el in curr_data[[u"pollutant_name", u"pollutant_value",
                         u"pollutant_unit"]].itertuples(index=False):
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(el[0])
        pollutant.set_raw_value(el[1])
        pollutant.set_raw_units(el[2])
        # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
        if pollutant.get_name() is not None and pollutant.get_value(
        ) is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = pd.to_datetime(current_data_time).replace(
            tzinfo=timezone(self.tz))
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        # NOTE(review): siblings use "yield"; kept "return" to preserve
        # this callback's non-generator interface.
        return items
def get_station_data(self, resp):
    """Read the C1WebGrid table via raw markup: strip tags from the header
    and unit rows, take the last row's values, and yield an AppItem."""

    def strip_cell(markup):
        # Remove HTML tags, then pull the "\r\n\t<text>\r\n" payload.
        return re.findall(u"\r\n\t(.+)\r\n", re.sub(u"<.+?>", u"", markup))[0]

    name_markup = resp.xpath(
        u'//*[@id="C1WebGrid1"]/tr[1]/td')[1:].extract()
    unit_markup = resp.xpath(
        u'//*[@id="C1WebGrid1"]/tr[2]/td')[1:].extract()
    poll_name = [strip_cell(m) for m in name_markup]
    poll_unit = [strip_cell(m) for m in unit_markup]

    raw_data = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[last()]/td')
    stamp = raw_data[0].xpath(u'.//div[1]/text()').extract_first()
    # "24:00" is not parseable by dateutil; treat it as midnight.
    stamp = stamp.replace(u"24:00", u"00:00")
    data_time = parser.parse(stamp).replace(tzinfo=timezone(self.tz))
    # print(data_time)

    pollutant_value = list()
    for cell in raw_data[1:]:
        value = clean(cell.xpath(u'.//div/text()').extract_first())
        # A non-breaking space marks an empty measurement cell.
        pollutant_value.append(None if u"\xa0" in value else value)

    station_data = dict()
    for raw_name, raw_value, raw_unit in zip(poll_name, pollutant_value,
                                             poll_unit):
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(raw_name)
        pollutant.set_raw_value(raw_value)
        pollutant.set_raw_units(raw_unit)
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Scrape the C1WebGrid measurement table: first row gives pollutant
    names, second row gives units, last row the newest values. Yields an
    AppItem when at least one measurement normalizes successfully."""

    def cell_text(cell):
        # Grid cells wrap their payload as "\r\n\t<text>\r\n".
        return cell.xpath(u".//div[1]/text()").re(u"\r\n\t(.+)\r\n")[0]

    header_cells = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[1]/td')[1:]
    unit_cells = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[2]/td')[1:]
    last_row = resp.xpath(u'//*[@id="C1WebGrid1"]/tr[last()]/td')

    # First cell of the data row holds the observation timestamp.
    data_time = parser.parse(cell_text(last_row[0])).replace(
        tzinfo=timezone(self.tz))
    names = [cell_text(c) for c in header_cells]
    units = [cell_text(c) for c in unit_cells]
    values = [cell_text(c) for c in last_row[1:]]

    station_data = dict()
    for raw_name, raw_value, raw_unit in zip(names, values, units):
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(raw_name)
        pollutant.set_raw_value(raw_value)
        pollutant.set_raw_units(raw_unit)
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Group the pre-cleaned data by station and yield one AppItem per
    station at the newest timestamp found in the whole dataset."""
    all_data = self.get_clean_data(resp)
    current_data_time = all_data[u"date"].max()
    curr_all_data = all_data[all_data[u"date"] == current_data_time]
    # idx = curr_all_data.groupby(by=u"pollutant_name")[u"date"].transform(max) == curr_all_data[u"date"]
    # data = curr_all_data[idx].copy()
    data = curr_all_data
    data = data[[
        u"station_name", u"pollutant_name", u"pollutant_value", u"unit"
    ]]
    grouped = data.groupby(by=u"station_name")
    for name, gr in grouped:
        station_data = dict()
        station_id = None
        for record in gr.itertuples(index=False):
            # First column of every row is the station name; remember it
            # once per group to use as source_id.
            if station_id is None:
                station_id = record[0]
            pollutant_name = record[1]
            pollutant_value = record[2]
            pollutant_units = record[3]
            # print(station_id, pollutant_name, pollutant_value, pollutant_units)
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(pollutant_name)
            pollutant.set_raw_value(pollutant_value)
            pollutant.set_raw_units(pollutant_units)
            # # print("answare", station_id, pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
            # Only keep measurements the Feature normalizer recognized.
            if pollutant.get_name() is not None and pollutant.get_value(
            ) is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = pd.to_datetime(
                current_data_time).replace(tzinfo=timezone(self.tz))
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items
def get_station_data(self, resp):
    """Scrape the EnvitechGrid table: header-cell tooltips carry
    "<name>\\n\\n<unit>", the second row carries the timestamp and the
    values. Yields one AppItem with the recognized measurements.
    """
    raw_pollutant_name = resp.xpath(
        u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td')[1:]
    # Each header tooltip is "<pollutant name>\n\n<unit>".
    pollutant_name_data = [
        el.xpath(u".//span[1]/@title").extract_first().split(u"\n\n")
        for el in raw_pollutant_name
    ]
    pollutant_name = [el[0] for el in pollutant_name_data]
    pollutant_units = [el[1] for el in pollutant_name_data]
    # Collapse internal whitespace; a comprehension (instead of the old
    # map+lambda) keeps this a real list under Python 3 as well.
    pollutant_name = [u" ".join(x.split()) for x in pollutant_name]
    raw_data = resp.xpath(u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td')
    data_time = raw_data[0].xpath(u".//span[1]/text()").extract_first()
    # "24:00" is not parseable by dateutil; treat it as midnight.
    data_time = data_time.replace(u"24:00", u"00:00")
    data_time = parser.parse(data_time).replace(tzinfo=timezone(self.tz))
    raw_pollutant_value = raw_data[1:]
    pollutant_value = [
        el.xpath(u'.//span[1]/text()').extract_first()
        for el in raw_pollutant_value
    ]
    data = zip(pollutant_name, pollutant_value, pollutant_units)
    # print(data)
    station_data = dict()
    for record in data:
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        # print("record", record)
        pollutant.set_raw_name(record[0])
        pollutant.set_raw_value(record[1])
        pollutant.set_raw_units(record[2])
        # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
        if pollutant.get_name() is not None and pollutant.get_value(
        ) is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def push_data(self, resp):
    """Normalize the accumulated station rows (resp.meta["data"]) and
    yield one AppItem per station at the newest valid timestamp.

    Fix: removed a leftover `print(current_data)` debug statement (the
    other diagnostics in this module are kept commented out).
    """
    # print("PUSH!!!!!!!!!!!!!!!!!!!!!!")
    data = resp.meta["data"]
    df = pd.DataFrame(data)
    # print(df.groupby(by=["unit", "name"]).size())
    df["value"] = pd.to_numeric(df["value"])
    # Drop rows whose value is missing after the numeric conversion.
    df = df[pd.notnull(df["value"])]
    current_time = self.get_max_valid_date(df)
    # print(current_time)
    current_data = df[df["time"] == current_time]
    # print(current_data)
    grouped = current_data.groupby(by="station_id")
    # print(df.groupby(by="name").size())
    for station_id, gr in grouped:
        station_data = dict()
        for poll in gr[["name", "value", "unit"]].itertuples(index=False):
            pollutant_name = poll[0]
            pollutant_value = poll[1]
            pollutant_units = poll[2]
            # print(pollutant_name, pollutant_value, pollutant_units)
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(pollutant_name)
            pollutant.set_raw_value(pollutant_value)
            pollutant.set_raw_units(pollutant_units)
            # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = pd.to_datetime(current_time).replace(tzinfo=timezone(self.tz))
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items
def get_station_data(self, resp):
    """Decode the JSON payload embedded in a <pre> element and yield one
    AppItem per station found under the "d" key."""
    payload = ujson.loads(resp.xpath(u"//pre/text()").extract_first())
    for station in payload[u"d"]:
        station_id = station[u"SourceSiteID"]
        measured_at = parser.parse(station[u"AqiTimeString"])
        station_data = dict()
        for layer in station[u"LayerInfos"]:
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(layer.get(u"ParameterName"))
            pollutant.set_raw_value(layer.get(u"Concentration"))
            pollutant.set_raw_units(layer.get(u"UnitName"))
            norm_name = pollutant.get_name()
            norm_value = pollutant.get_value()
            if norm_name is not None and norm_value is not None:
                station_data[norm_name] = norm_value
        # Attach the source timezone before emitting.
        measured_at = measured_at.replace(tzinfo=timezone(self.tz))
        if station_data and measured_at:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = measured_at
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items
def get_station_data(self, resp):
    """Read the latest table row: <th>/abbr elements carry pollutant names
    and units, the last <tbody> row carries the values. The timestamp is
    supplied by the requesting callback via resp.meta."""
    header = resp.xpath(
        u'//*[@id="main"]/div[1]/div[1]/table/thead/tr/th')[1:]
    names = [th.xpath(u"abbr/text()").extract_first() for th in header]
    units = [th.xpath(u"span/abbr/text()").extract_first() for th in header]
    value_cells = resp.xpath(
        u"//*[@id='main']/div[1]/div[1]/table/tbody/tr[last()]/td")[1:]
    values = [td.xpath(u"text()").extract_first() for td in value_cells]

    station_data = dict()
    for raw_name, raw_value, raw_unit in zip(names, values, units):
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(raw_name)
        pollutant.set_raw_value(raw_value)
        pollutant.set_raw_units(raw_unit)
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = resp.meta[u"date_time"]
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Single-pollutant (PM2.5) table: take the last row, combine the
    requested date with the row's end hour, and yield the measurement."""
    last_row = resp.xpath(
        u"//*[@id='ContentPlaceHolder1_tablebody']/tr[last()]/td")
    # The first cell holds an "hh:mm-hh:mm" range; keep the end hour.
    hour = last_row[0].xpath(u"./text()").re(u"-(\d?\d:\d\d)")[0]
    value = last_row[1].xpath(u"./text()").extract_first()

    day = resp.meta["date"].strftime("%d-%m-%Y")
    date_time = parser.parse(
        " ".join((day, hour)),
        dayfirst=True).replace(tzinfo=timezone(self.tz))
    # print(date_time, hour, value)

    # only one pollutant
    pollutant = Feature(self.name)
    pollutant.set_source(self.source)
    pollutant.set_raw_name(u"PM25")
    pollutant.set_raw_value(value)
    pollutant.set_raw_units(u"µg/m")

    station_data = dict()
    if pollutant.get_name() is not None and pollutant.get_value() is not None:
        station_data[pollutant.get_name()] = pollutant.get_value()

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = date_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Convert the response into a DataFrame, keep the rows at the newest
    "time" value, and yield them as one AppItem."""
    frame = self.data_to_df(resp).dropna(axis=0)
    # print(frame)
    latest = frame[u"time"].max()
    newest_rows = frame[frame[u"time"] == latest]

    station_data = dict()
    cols = newest_rows[[u"name", u"value", u"unit"]]
    for raw_name, raw_value, raw_unit in cols.itertuples(index=False):
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(raw_name)
        pollutant.set_raw_value(raw_value)
        pollutant.set_raw_units(raw_unit)
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = pd.to_datetime(latest).replace(
            tzinfo=timezone(self.tz))
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta["code"]
        yield items
def get_station_data(self, resp):
    """Extract per-station pollutant arrays from the page's inline
    initSiteList() script and yield one AppItem per station, all stamped
    with the page-level "Current Max Pollution Level" time."""
    raw_data_time = resp.xpath(u'//*[@id="ctl00_ContentPlaceHolder2_lbCaption"]/text()').re(u"Current Max Pollution Level \((\d\d?/\d\d?/\d\d\d\d \d\d?..)")[0]
    data_time = parser.parse(raw_data_time).replace(tzinfo=timezone(self.tz))
    res = resp.xpath(u'//*/script/text()').extract()
    # The third inline script holds the site list initializer.
    script = res[2]
    raw_part = re.findall(u"^.+?function initSiteList\(\)\s+\{(.+?)}\s+/\*", script, re.DOTALL)
    # Each site's measurements are assigned as "<site>.params = [...]"
    # and its id comes from the SiteInfo(...) constructor call.
    station_datas = re.findall(u"\.params = (\[.+?\]);", raw_part[0])
    station_ids = re.findall(u"new SiteInfo\(\"(.+?)\"", raw_part[0])
    stations = zip(station_ids, station_datas)
    for st in stations:
        station_id = st[0]
        data_json = ujson.loads(st[1])
        data = [(el[u"name"], el[u"val"], el[u"unit"]) for el in data_json]
        station_data = dict()
        for record in data:
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            # print(record)
            pollutant.set_raw_name(record[0])
            pollutant.set_raw_value(record[1])
            pollutant.set_raw_units(record[2])
            # Only keep measurements the Feature normalizer recognized.
            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]
            yield items
def get_station_data(self, resp):
    """Parse the hourly observation table whose header spans two rows
    with colspan'd cells, align names/units/values positionally, pick
    the newest hour with more than one reading, and return an AppItem.

    Fixes: leftover `print(curr_data)` debug output removed; the three
    near-identical colspan-expansion loops are folded into one helper.
    """

    def expand_cells(cells, pad_empty):
        # Expand <td colspan=N> into N repeated entries so that header,
        # unit and value columns line up positionally. When pad_empty is
        # true an empty cell still contributes placeholder entries.
        out = list()
        for cell in cells:
            text = cell.xpath(u"./text()").extract()
            if not text and pad_empty:
                text = [u""]
            dup_count = cell.xpath(u"./@colspan").extract_first(default=1)
            out.extend(text * int(dup_count))
        return [u" ".join(el.split()) for el in out]

    raw_date = resp.xpath(
        u"id('col2')/div[1]/center[1]/h3/text()").extract_first()
    raw__all_data = resp.xpath(
        u"//*[@id='col2']/div[1]/center[2]/table/tr")
    raw_poll_name_p1 = raw__all_data[0].xpath(u"td")[1:]
    raw_poll_name_p2 = raw__all_data[1].xpath(u"td")[1:]
    raw_units = raw__all_data[2].xpath(u"td")[1:]

    poll_name_p1 = expand_cells(raw_poll_name_p1, pad_empty=True)
    poll_name_p2 = expand_cells(raw_poll_name_p2, pad_empty=True)
    # Join the two header rows into one name per column; blank names
    # become None so dropna() can discard those columns' rows.
    pollutant_name = [u" ".join(pair)
                      for pair in zip(poll_name_p1, poll_name_p2)]
    pollutant_name = [u" ".join(el.split()) for el in pollutant_name]
    pollutant_name = [None if el == u"" else el for el in pollutant_name]

    units = expand_cells(raw_units, pad_empty=False)
    units = [None if el == u"" else el for el in units]
    # print(units)
    # print(pollutant_name)

    # Skip the three header rows and the page footer rows.
    raw_table = raw__all_data[3:-18]
    records = list()
    for row in raw_table:
        col = row.xpath(u"td")
        hour = col[0].xpath(u"center/text()").extract_first()
        raw_values = [
            el.xpath(u"./text()").extract_first(default=u"")
            for el in col[1:]
        ]
        values = [
            u" ".join(el.replace(u"\n", u"").split()) for el in raw_values
        ]
        values = [None if el == u"" else el for el in values]
        raw_data_date = u" ".join((raw_date, hour))
        raw_data_date = u" ".join(raw_data_date.split())
        data_time = parser.parse(raw_data_date)
        for rec in zip(pollutant_name, values, units):
            records.append({
                u"name": rec[0],
                u"value": rec[1],
                u"unit": rec[2],
                u"date": data_time,
            })

    df = pd.DataFrame(records)
    df = df.dropna(axis=0)
    # Newest hour that still carries more than one named reading.
    grouped = df.groupby(by="date", as_index=False).count()
    current_date = grouped[grouped["name"] > 1]["date"].max()
    curr_data = df[df[u"date"] == current_date]
    # print(curr_data)

    station_data = dict()
    for el in curr_data[[u"name", u"value",
                         u"unit"]].itertuples(index=False):
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(el[0])
        pollutant.set_raw_value(el[1])
        pollutant.set_raw_units(el[2])
        if pollutant.get_name() is not None and pollutant.get_value(
        ) is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()

    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = pd.to_datetime(current_date).replace(
            tzinfo=timezone(self.tz))
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        # NOTE(review): siblings use "yield"; kept "return" to preserve
        # this callback's non-generator interface.
        return items
def get_station_data(self, resp):
    """Parse a multi-station hourly table where a pollutant name in the
    first column applies to the following unnamed rows; keep only the
    site-name column plus the last hour column, and yield one AppItem
    per station."""
    raw_data_time = self.get_date(resp)
    table = resp.xpath(u"/html/body/div[1]/left/table/tbody/tr")
    # print(table)
    # hours = table.pop(0).xpath(u"td")[2:-3]
    col_names = table.pop(0).xpath(u"td")
    col_names = [el.xpath(u"p/b/text()").extract_first() for el in col_names]
    # A pollutant name appears only on the first row of its section;
    # carry it forward to the following rows.
    global_pollutant_name = None
    data = list()
    for row in table:
        col = row.xpath(u"td")
        pollutant_name = col.pop(0).xpath(u"tt/text()").extract_first()
        if pollutant_name:
            global_pollutant_name = pollutant_name
        pollutant_name = global_pollutant_name if pollutant_name is None else pollutant_name
        row_values = [el.xpath(u"tt/text()").extract_first() for el in col]
        row_values.insert(0, pollutant_name)
        data.append(row_values)
    df = pd.DataFrame(data, columns=col_names)
    # Drop columns that are entirely empty.
    df = df.dropna(thresh=1, axis=1)
    # df = df.dropna(thresh=1, axis=0)
    # Positionally trim the frame down to: pollutant, site name and the
    # last (newest) hour column.
    del_cols = list(df.columns.values)
    df.drop(labels=del_cols[-3:], axis=1, inplace=True)
    del_mid_cols = list(df.columns.values)
    df.drop(labels=del_mid_cols[2:-1], axis=1, inplace=True)
    # print(df)
    # Units for the normalized pollutant names of this source.
    units = {
        u"o3": u"ppb",
        u"pm25": u"ug/m3",
        u"pm10": u"ug/m3",
        u"co": u"ppm",
        u"so2": u"ppb",
        u"no2": u"ppb",
    }
    grouped = df.groupby(by=u"Site Name")
    for name, gr in grouped:
        station_id = name
        # The last remaining column header is the hour of the reading.
        hour = list(gr.columns.values)[-1]
        data_time = "{0} {1}:00".format(raw_data_time, hour)
        data_time = parser.parse(data_time)
        station_data = dict()
        for rec in gr.itertuples(index=False):
            pollutant_name = rec[0]
            pollutant_value = rec[2]
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(pollutant_name)
            pollutant.set_raw_value(pollutant_value)
            try:
                pollutant.set_raw_units(units[pollutant.get_name()])
            except KeyError:
                # Unmapped pollutant: report and continue without units.
                print(
                    "There is no such pollutant in local units list <<<<<<<<{0}>>>>>>".format(pollutant.get_name()))
            # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
            if pollutant.get_name() is not None and pollutant.get_value() is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time.replace(tzinfo=timezone(self.tz))
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items
def get_test(self, resp):
    """Build a forecast AppItem from an hourly-forecast JSON feed: each
    hour becomes a layer of normalized features, keyed by its time."""
    # Units for the raw forecast field names of this feed.
    local_units = {
        u"temp": u"degc",
        u"snow": u"",
        u"wd": u"deg",
        u"temp_dew_p": u"degc",
        u"hum": u"%",
        u"sn_d": u"mm",
        u"pres": u"gpa",
        u"sky": u"%",
    }
    json = ujson.loads(resp.text)
    lc = LayerContainer()
    # Track the earliest forecast hour; the item's data_time is set one
    # hour before it.
    min_forecast_date = None
    for rec in json["hourly_forecast"]:
        # pop() removes the timestamp so the remaining keys are fields.
        fdate = parser.parse(rec.pop("FCTTIME")["pretty"])
        if min_forecast_date is None:
            min_forecast_date = fdate
        if fdate < min_forecast_date:
            min_forecast_date = fdate
        layer = dict()
        for key, val in rec.items():
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            # print("record", record)
            pollutant.set_raw_name(key)
            pollutant.set_raw_value(self.get_value(val))
            try:
                pollutant.set_raw_units(local_units[pollutant.get_name()])
            except KeyError:
                # Unmapped field: report and continue without units.
                if pollutant.get_name() is not None:
                    print(
                        "There is no such pollutant in local units list <<<<<<<<{0}>>>>>>"
                        .format(pollutant.get_name()))
                else:
                    print("Name is None: <<<<<<<<{0}>>>>>>".format(
                        pollutant.get_name()))
            # print(
            #     "answare",
            #     pollutant.get_name(),
            #     # pollutant.get_value(),
            #     pollutant.get_units()
            # )
            if pollutant.get_name() is not None and pollutant.get_value(
            ) is not None:
                layer[pollutant.get_name()] = pollutant.get_value()
        lc.add_layer(fdate, layer)
    curr_date = min_forecast_date - timedelta(hours=1)
    forecast_data = lc.get_layers()
    if forecast_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = curr_date
        items[u"forecast_data"] = forecast_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Parse the meteostar station table: one row per pollutant with
    hourly columns; read the value from the fourth-from-last column and
    stamp it with the matching hour header."""
    table = resp.xpath(
        u'//*[@id="meteostar_wrapper"]/div[2]/table/tr')[2:-4]
    pollutants_hour = resp.xpath(
        u'//*[@id="meteostar_wrapper"]/div[2]/table/tr[2]/th/text()'
    ).extract()
    pollutants_hour = self.validate_hours(pollutants_hour)
    # Use the second-to-last hour header when more than one is present.
    hour = pollutants_hour[
        len(pollutants_hour) -
        2] if len(pollutants_hour) > 1 else pollutants_hour[0]
    data_time = resp.xpath(
        u'//*[@id="meteostar_wrapper"]/p[5]/b/text()').extract_first()
    data_time = u" ".join((data_time, hour))
    data_time = parser.parse(
        data_time, dayfirst=True).replace(tzinfo=timezone(self.tz))
    # Units for the normalized pollutant names of this source.
    units = {
        u"o3": u"ppb",
        u"pm25": u"ug/m3",
        u"co": u"ppm",
        u"so2": u"ppb",
        u"no2": u"ppb",
        u"no": u"ppb",
        u"n2o": u"ppb",
        u"ws": u"mph",
        u"wd": u"deg",
        u"temp": u"degf",
        u"pres": u"mbar",
    }
    station_data = dict()
    for row in table:
        pollutant_name = row.xpath(u"td[1]/a/b/text()").extract_first()
        # print(pollutant_name)
        # The regex alternation returns ("", match) / (match, "") pairs;
        # filter out the empty halves and keep the first real number.
        pollutants_data = row.xpath(u"td[last()-3]").re(
            u">(\d+?)<|>(\d+?\.\d+)<")
        pollutant_value = [el for el in pollutants_data if el != u""]
        pollutant_value = pollutant_value[0] if pollutant_value else None
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        # print("record", record)
        pollutant.set_raw_name(pollutant_name)
        pollutant.set_raw_value(pollutant_value)
        try:
            pollutant.set_raw_units(units[pollutant.get_name()])
        except KeyError:
            # Unmapped pollutant: report and continue without units.
            print(
                "There is no such pollutant in local units list <<<<<<<<{0}>>>>>>"
                .format(pollutant.get_name()))
        # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
        if pollutant.get_name() is not None and pollutant.get_value(
        ) is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Walk the per-pollutant API: accumulate one DataFrame per paramId
    in resp.meta["res_df"], re-requesting itself with the next code until
    all_codes is exhausted (IndexError from pop()), then emit one AppItem
    per station.

    Fix: the `if station_data:` guard had been commented out, so stations
    with no recognized measurement emitted items with an empty
    data_value, unlike every sibling scraper.
    """
    try:
        # Raw paramId -> display name / unit for this source.
        pollutant_name = {
            u"2": u"PM25",
            u"1": u"Ozone",
            u"5": u"CO",
            u"4": u"NO2",
            u"9": u"SO2"
        }
        pollutant_unit = {
            u"2": u"ug/m3",
            u"1": u"ppb",
            u"5": u"ppm",
            u"4": u"ppb",
            u"9": u"ppb"
        }
        json = ujson.loads(resp.text)
        one_pollutant_data = list()
        for station in json:
            raw_station_name = station[0]
            station_name = raw_station_name["siteName"]
            raw_data = station[1]
            station_data = pd.DataFrame(raw_data)
            station_data[u"date"] = [
                parser.parse(x) for x in station_data[u'date']
            ]
            station_data[u"station_name"] = station_name
            station_data[u"pollutant_name"] = pollutant_name.get(
                resp.meta[u"code"])
            if "hexColor" in station_data.columns.values:
                del station_data[u"hexColor"]
            station_data[u"unit"] = pollutant_unit.get(resp.meta[u"code"])
            one_pollutant_data.append(station_data)
        one_pollutant_data = pd.concat(one_pollutant_data,
                                       ignore_index=True)
        res_def = resp.meta["res_df"]
        res_def.append(one_pollutant_data)
        # pop() raises IndexError once every code has been fetched,
        # which drops us into the finalization branch below.
        new_code = resp.meta[u"all_codes"].pop()
        url = add_or_replace_parameter(resp.meta["href"], "paramId",
                                       new_code)
        yield Request(
            url=url,
            callback=self.get_station_data,
            meta={
                u"code": new_code,
                u"all_codes": resp.meta[u"all_codes"],
                u"res_df": res_def,
                u"href": url
            })
        # print(one_pollutant_data)
    except IndexError:
        # NOTE(review): an IndexError from the parsing above also lands
        # here, not only the intended "all_codes exhausted" signal.
        all_data = pd.concat(resp.meta["res_df"], ignore_index=True)
        # -999 is the feed's missing-value sentinel; blank those rows so
        # dropna() removes them.
        all_data[all_data["aqi"] == -999.0] = np.nan
        all_data = all_data.dropna(axis=0)
        current_date = all_data["date"].max() - timedelta(hours=2)
        current_data = all_data[all_data["date"] == current_date]
        grouped = current_data[[
            "pollutant_name", "aqi", "unit", "station_name"
        ]].groupby(by="station_name")
        # print(grouped)
        for name, gr in grouped:
            station_id = name
            station_data = dict()
            for rec in gr.itertuples(index=False):
                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(rec[0])
                pollutant.set_raw_value(rec[1])
                pollutant.set_raw_units(rec[2])
                # print("answare", pollutant.get_name(), pollutant.get_value(), pollutant.get_units())
                if pollutant.get_name() is not None and pollutant.get_value() is not None:
                    station_data[pollutant.get_name()] = pollutant.get_value()
            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = pd.to_datetime(current_date).replace(
                    tzinfo=timezone(self.tz))
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id
                yield items
def get_station_data(self, resp):
    """Parse the last row of the last #datatable table and yield one AppItem
    with the newest reading per column.

    Layout assumed from the xpaths: header row 1 carries the observation
    date, header row 2 carries "<name> (<unit>)" column captions, and the
    last body row carries the values; the first column is the hour.
    """
    raw_data = resp.xpath(u"//*[@id='datatable'][last()]/tr[last()]/td")
    raw_date = resp.xpath(u"//*[@id='datatable'][last()]/tr[1]/th/text()").extract_first()
    # The date is the last four whitespace-separated tokens of the caption.
    date = " ".join(raw_date.split(" ")[-4:])
    raw_col_names = resp.xpath(u"//*[@id='datatable'][last()]/tr[2]/th")
    poll_values = [el.xpath(u"./text()").extract_first() for el in raw_data]
    # get_name_and_unit() splits a caption into (name, unit) where the unit
    # is False when the caption carries none — see fallback table below.
    raw_poll_names = [
        self.get_name_and_unit(" ".join(el.xpath(u"./text()").extract()))
        for el in raw_col_names
    ]
    poll_names = [el[0] for el in raw_poll_names]
    poll_units = [el[1] for el in raw_poll_names]
    # BUG FIX: zip() returns an iterator on Python 3, which has no pop();
    # materialize it so the hour column can be split off on both versions.
    data = list(zip(poll_names, poll_values, poll_units))
    hour = data.pop(0)
    hour = hour[1]
    data_time = " ".join((date, hour))
    data_time = parser.parse(data_time).replace(
        tzinfo=timezone(self.get_tz(resp.meta[u"code"])))
    # Local fallback units for captions that carry no unit of their own.
    units = {
        u"wd": u"cardinals",
    }
    station_data = dict()
    for el in data:
        pollutant_name = el[0]
        pollutant_value = el[1]
        pollutant_units = el[2]
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(pollutant_name)
        pollutant.set_raw_value(pollutant_value)
        if pollutant_units is not False:
            pollutant.set_raw_units(pollutant_units)
        else:
            try:
                pollutant.set_raw_units(units[pollutant.get_name()])
            except KeyError:
                # Best-effort: log and continue with whatever units Feature has.
                print(
                    u"There is no such pollutant in local units list <<<<<<<<{0}>>>>>>".format(
                        pollutant.get_name()))
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Scrape the weather table under #content1 and yield one AppItem with
    the most recent row's measurements.

    The whole flat td-list is reshaped into (rows, columns) using the header
    row's width; the first data row is taken as the latest observation.
    """
    raw_col_names = resp.xpath(
        u'//*[@id="content1"]/div[1]/table/tr[1]/td').extract()
    # Strip every HTML tag from the header cells to get plain column names.
    col_names = [re.sub(u"<.+?>", u"", el) for el in raw_col_names]
    table = resp.xpath(u'//*[@id="content1"]/div[1]/table//td')
    table_data = [
        el.xpath(u".").re(u"<td>(.+)<\/td>")[0]
        if el.xpath(u".").re(u"<td>(.+)<\/td>") else None for el in table
    ]
    # BUG FIX: use floor division — plain "/" produces a float on Python 3
    # and numpy's reshape rejects non-integer dimensions. Behavior on
    # Python 2 is unchanged.
    table_data = np.asarray(table_data).reshape(
        len(table_data) // len(col_names), len(col_names))
    df = pd.DataFrame(table_data[1:, ], columns=col_names)
    # The precipitation cell looks like "<amount> (<period>)": keep the amount.
    df[u"Опади"] = df[u"Опади"].apply(lambda x: re.search(
        u"(.+) \s*\(", x).group(1) if x is not None else 0)
    # First row = latest observation; pull its timestamp out of the record.
    raw_data = df.iloc[0].to_dict()
    raw_data_time = raw_data.pop(u"Дата і час", None)
    data_time = parser.parse(
        raw_data_time, dayfirst=True).replace(tzinfo=timezone(self.tz))
    data = raw_data
    # Units keyed by the (Ukrainian) column captions of this source.
    units = {
        u"Температура повітря": u"degc",
        u"Температура точки роси": u"degc",
        u"Опади": u"mm",
        u"Атмосферний тиск": u"mbar",
        u"Напрямок вітру": u"deg",
        u"Швидкість вітру": u"ms",
    }
    station_data = dict()
    for key, val in data.items():
        poll_name = key
        poll_value = val
        poll_units = units[key]
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(poll_name)
        pollutant.set_raw_value(poll_value)
        pollutant.set_raw_units(poll_units)
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    # Parse the "meteostar" wrapper table: one pollutant per row, with the
    # unit hidden in an onmouseover tooltip and the value taken from a fixed
    # column near the end of the row. Yields a single AppItem per page.
    table = resp.xpath(
        u'//*[@id="meteostar_wrapper"]/div[3]/table/tr')[2:-4]
    pollutants_hour = resp.xpath(
        u'//*[@id="meteostar_wrapper"]//table[1]/tr[2]/th/text()').extract(
        )
    pollutants_hour = self.validate_hours(pollutants_hour)
    # Use the second-to-last hour when several are present — presumably the
    # last column is still being filled; confirm against the live page.
    hour = pollutants_hour[
        len(pollutants_hour) - 2] if len(pollutants_hour) > 1 else pollutants_hour[0]
    data_time = resp.xpath(
        u'//*[@id="meteostar_wrapper"]/p[5]/b/text()').extract_first()
    data_time = u" ".join((data_time, hour))
    data_time = parser.parse(data_time).replace(tzinfo=timezone(self.tz))
    station_data = dict()
    for row in table:
        raw_pollutant_name = row.xpath(u"td[1]/a/b/text()")
        pollutant_name = raw_pollutant_name.extract()[0] if len(
            raw_pollutant_name) == 1 else None
        # Unit lives inside the link's tooltip, e.g. "... Measured in ppb',".
        raw_pollutant_unit = row.xpath(u"td[1]/a/@onmouseover")
        pollutant_unit = raw_pollutant_unit.re(u"\d*\.\s+(\D+)',")
        pollutant_unit = pollutant_unit[0].replace(
            "Measured in ", "") if len(pollutant_unit) == 1 else None
        # Fixed column (5th from the end); the alternation regex yields empty
        # strings for the non-matching branch, filtered out below.
        pollutants_data = row.xpath(u"td[last()-4]").re(
            u">(\d+?)<|>(\d+?\.\d+)<")
        pollutant_value = [el for el in pollutants_data if el != u""]
        pollutant_value = pollutant_value[0] if pollutant_value else None
        # Feature normalizes raw name/value/units; invalid readings -> None.
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(pollutant_name)
        pollutant.set_raw_value(pollutant_value)
        pollutant.set_raw_units(pollutant_unit)
        if pollutant.get_name() is not None and pollutant.get_value(
        ) is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Flatten a JSON feed of per-site readings, keep only the newest hour,
    and yield one AppItem per station.

    Every reading contributes a flat record of the four fields named in
    ``wanted_keys`` plus its station id; the records with the maximal
    "AverageHour" are grouped by station and normalized through Feature.
    """
    payload = ujson.loads(resp.text)
    wanted_keys = ("ParameterDescription", "Average", "Units", "AverageHour")
    records = []
    for site in payload["Sites"]:
        site_id = site["AQSSiteId"]
        readings = site.get("Readings")
        if readings is None:
            # Some sites publish no readings at all — skip them.
            continue
        for reading in readings:
            row = {key: reading[key] for key in wanted_keys}
            row["station_id"] = site_id
            records.append(row)
    frame = pd.DataFrame(records)
    # Only the most recent hour of data is of interest.
    latest_hour = frame["AverageHour"].max()
    latest = frame[frame["AverageHour"] == latest_hour]
    by_station = latest[[
        "station_id", "ParameterDescription", "Average", "Units"
    ]].groupby(by="station_id")
    for station_id, rows in by_station:
        measurements = dict()
        for row in rows.itertuples(index=False):
            # Tuple layout follows the column selection above:
            # (station_id, name, value, units).
            feature = Feature(self.name)
            feature.set_source(self.source)
            feature.set_raw_name(row[1])
            feature.set_raw_value(row[2])
            feature.set_raw_units(row[3])
            # Invalid readings normalize to None and are dropped.
            if feature.get_name() is not None and feature.get_value() is not None:
                measurements[feature.get_name()] = feature.get_value()
        if measurements:
            item = AppItem()
            item[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            item[u"data_time"] = parser.parse(latest_hour).replace(
                tzinfo=timezone(self.tz))
            item[u"data_value"] = measurements
            item[u"source"] = self.source
            item[u"source_id"] = station_id
            yield item
def get_station_data(self, resp):
    # Scrape one pollutant's table per request (the pollutant is encoded in
    # the URL), accumulate a DataFrame in resp.meta["global_data"], and chain
    # to the next URL; when resp.meta["urls"].pop() raises IndexError the
    # accumulated data is grouped by station and emitted.
    try:
        pollutant_unit = {u"pm": u"ug/m3", u"ozone": u"ppb"}
        # URL tail like ".../pm_monitors.aspx" -> pollutant "pm".
        pollutant_name = resp.url.split(u"/")[-1].replace(
            u"_monitors.aspx", u"")
        # Normalize the table markup so a single td-based xpath works:
        # strip tbody/thead and rewrite header th cells as td.
        raw_table = resp.xpath(u"//*[@id='tblGrid'][1]").extract_first()
        raw_table = re.sub(u"</?tbody>", u"", raw_table)
        raw_table = re.sub(u"</?thead>", u"", raw_table)
        raw_table = re.sub(u"</th>", u"</td>", raw_table)
        raw_table = re.sub(u"<th ", u"<td ", raw_table)
        table = Selector(text=raw_table)
        # First row, minus the leading hour column, holds the station names.
        raw_station_names = table.xpath(u"//tr[1]/td")[1:]
        station_names = [
            u" ".join(el.xpath(u"./b/text()").extract())
            for el in raw_station_names
        ]
        station_names = [u" ".join(el.split()) for el in station_names]
        # Last row: first cell is the hour, remaining cells the readings.
        raw_poll_data = table.xpath(u"//*[@id='tblGrid'][1]/tr[last()]/td")
        raw_hour = raw_poll_data[0].xpath(u"./text()").extract_first()
        try:
            raw_date = resp.xpath(
                u"//*[@id='mainContent']/div[1]/p[2]/text()").re(
                    u"(\d\d?/\d\d?/\d\d\d\d)")[0]
        except IndexError:
            # No date on the page; join() below will fail on None and the
            # outer IndexError handler will NOT catch that (TypeError).
            raw_date = None
        raw_data_time = u" ".join((raw_date, raw_hour))
        data_time = parser.parse(raw_data_time).replace(
            tzinfo=timezone(self.tz))
        raw_poll_value = raw_poll_data[1:]
        poll_values = [
            el.xpath(u"font/text()").extract_first()
            for el in raw_poll_value
        ]
        data = zip(station_names, poll_values)
        table_data = [{
            u"station_id": el[0],
            u"pollutant_name": pollutant_name,
            u"pollutant_value": el[1],
            u"pollutant_unit": pollutant_unit.get(pollutant_name),
            u"date": data_time
        } for el in data]
        df = pd.DataFrame(table_data)
        # Append this page's rows to the running accumulator.
        if resp.meta.get(u"global_data") is not None:
            new_global_data = pd.concat(
                [resp.meta.get(u"global_data"), df], ignore_index=True)
        else:
            new_global_data = df
        resp.meta[u"global_data"] = new_global_data
        # pop() on an empty url list raises IndexError -> emission branch.
        yield Request(url=resp.meta[u"urls"].pop(),
                      callback=self.get_station_data,
                      meta={
                          u"urls": resp.meta[u"urls"],
                          u"global_data": resp.meta[u"global_data"]
                      })
    except IndexError:
        # All pollutant pages visited: keep only the newest timestamp and
        # emit one item per station.
        data = resp.meta[u"global_data"]
        current_data_time = data[u"date"].max()
        data = data[data[u"date"] == current_data_time]
        data = data[[
            u"station_id", u"pollutant_name", u"pollutant_value",
            u"pollutant_unit"
        ]]
        grouped = data.groupby(by=u"station_id")
        for name, gr in grouped:
            station_data = dict()
            station_id = None
            for record in gr.itertuples(index=False):
                if station_id is None:
                    station_id = record[0]
                pollutant_name = record[1]
                pollutant_value = record[2]
                pollutant_units = record[3]
                # Feature normalizes raw readings; invalid ones -> None.
                pollutant = Feature(self.name)
                pollutant.set_source(self.source)
                pollutant.set_raw_name(pollutant_name)
                pollutant.set_raw_value(pollutant_value)
                pollutant.set_raw_units(pollutant_units)
                if pollutant.get_name(
                ) is not None and pollutant.get_value() is not None:
                    station_data[
                        pollutant.get_name()] = pollutant.get_value()
            data_time = pd.to_datetime(current_data_time).replace(
                tzinfo=timezone(self.tz))
            if station_data and data_time:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id
                yield items
def get_station_data(self, resp):
    """Parse air-quality parameters plus the meteorology table of a station
    page and yield a single AppItem.

    Pollutant cells carry "<value><unit>" strings; wind direction cardinals
    are translated to degrees via a lookup table before value/unit splitting.
    """
    try:
        data_time = resp.xpath(u".//*[@id='Content_cphMain_pnLastUpdate']/p/text()").re(u"Last Updated on (.+)")[0]
        data_time = parser.parse(data_time).replace(tzinfo=timezone(self.tz))
    except IndexError:
        # Page carries no "Last Updated" stamp.
        data_time = None
    raw_poll_data = resp.xpath(u".//*[@id='Content_cphMain_dlAirQualityParameters']/tr/td")
    data = list()
    for el in raw_poll_data:
        # Pollutant name may have a <sub> part (e.g. chemical formulas).
        poll_name_part = el.xpath(u"h6/text()").extract_first()
        poll_name_part_add = el.xpath(u"h6/sub/text()").extract_first()
        poll_name_part = u"".join((poll_name_part, poll_name_part_add)) if poll_name_part_add is not None else poll_name_part
        poll_name_part = u" ".join(poll_name_part.split())
        # Alternating <div>s: sub-name, then "<value><unit>" reading.
        _data = el.xpath(u'div[not(@class="clearfix")]')
        _data = [el.xpath(u"text()").extract_first() for el in _data]
        _data = [u" ".join(el.split()) for el in _data]
        _data = [el for el in _data if el != u""]
        poll_subnames = _data[::2]
        poll_names = [" ".join((poll_name_part, el)) for el in poll_subnames]
        raw_poll_values = _data[1::2]
        poll_values = list()
        poll_units = list()
        for el in raw_poll_values:
            # Leading (possibly signed/decimal) number is the value; the rest
            # of the string is the unit.
            value = re.findall(u"^([-]?\d*[.]?\d+|\d+[.]?\d*)", el)[0]
            unit = re.sub(u"^([-]?\d*[.]?\d+|\d+[.]?\d*)", u"", el)
            if unit is not None:
                unit = u" ".join(unit.split())
            poll_values.append(value)
            poll_units.append(unit)
        subdata = zip(poll_names, poll_values, poll_units)
        data.extend(subdata)
    raw_weather_data = resp.xpath(u".//*[@id='Content_cphMain_gvMeteorology']/tr")
    # Compass cardinals -> degrees, so wind direction parses like a number.
    wind_dir = {
        u"N": u"0",
        u"NNE": u"22.5",
        u"NE": u"45",
        u"ENE": u"68.5",
        u"E": u"90",
        u"ESE": u"112.5",
        u"SE": u"135",
        u"SSE": u"157.5",
        u"S": u"180",
        u"SSW": u"202.5",
        u"SW": u"225",
        u"WSW": u"247.5",
        u"W": u"270",
        u"WNW": u"292.5",
        u"NW": u"315",
        u"NNW": u"337.5",
    }
    w_data = list()
    for el in raw_weather_data:
        name = el.xpath(u"td[1]/text()").extract_first()
        name = u" ".join(name.split()) if name is not None else None
        # The reading may be split between plain text and a <sup> element.
        raw_poll_val_1 = u" ".join(el.xpath(u"td[2]/text()").extract())
        raw_poll_val_2 = u" ".join(el.xpath(u"td[2]/sup/text()").extract())
        raw_poll_val = u" ".join((raw_poll_val_1, raw_poll_val_2))
        raw_poll_val = u" ".join(raw_poll_val.split())
        if wind_dir.get(raw_poll_val) is not None:
            raw_poll_val = wind_dir.get(raw_poll_val) + u" deg"
        value = re.findall(u"^([-]?\d*[.]?\d+|\d+[.]?\d*)", raw_poll_val)[0]
        unit = re.sub(u"^([-]?\d*[.]?\d+|\d+[.]?\d*)", u"", raw_poll_val)
        unit = u" ".join(unit.split())
        res = (name, value, unit)
        w_data.append(res)
    data.extend(w_data)
    station_data = dict()
    for el in data:
        pollutant_name = el[0]
        pollutant_value = el[1]
        pollutant_units = el[2]
        pollutant = Feature(self.name)
        pollutant.set_source(self.source)
        pollutant.set_raw_name(pollutant_name)
        pollutant.set_raw_value(pollutant_value)
        pollutant.set_raw_units(pollutant_units)
        if pollutant.get_name() is not None and pollutant.get_value() is not None:
            station_data[pollutant.get_name()] = pollutant.get_value()
    # BUG FIX: previously items were yielded even when the "Last Updated"
    # regex missed and data_time was None; guard on the timestamp as the
    # sibling scrapers in this file do.
    if station_data and data_time is not None:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = data_time
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    """Parse the meteostar history table (one pollutant per row, one hour per
    column), build a long-format DataFrame of normalized readings, and yield
    one AppItem for the most recent timestamp.

    Cleanup: removed leftover debug ``print()`` calls and a dead
    ``station_data = dict()`` assignment that was immediately shadowed.
    """
    raw_table = resp.xpath(
        u"//*[@id='meteostar_wrapper']/div[3]/table/tr")[2:-3]
    # Date comes from three <select> dropdowns (month / day / year).
    raw_data_time = resp.xpath(
        u"//*[@id='meteostar_wrapper']/form/table[2]/tr[2]/td")
    month = raw_data_time[0].xpath(
        u"select/option[@selected='selected']/text()").extract_first()
    day = raw_data_time[1].xpath(
        u"select/option[@selected='selected']/text()").extract_first()
    year = raw_data_time[2].xpath(
        u"select/option[@selected='selected']/text()").extract_first()
    pollutants_hour = resp.xpath(
        u"//*[@id='meteostar_wrapper']/div[3]/table/tr[2]/th/text()"
    ).extract()
    pollutants_hour = self.validate_hours(pollutants_hour)
    # One naive datetime per hour column; tz is attached when emitting.
    data_times = [
        parser.parse(u" ".join((month, day, year, hour)))
        for hour in pollutants_hour
    ]
    # Units keyed by the canonical (normalized) pollutant name.
    units = {
        u"o3": u"ppb",
        u"ws": u"mph",
        u"wd": u"deg",
        u"temp": u"degf",
        u"pm10": u"ug/m3",
        u"pm25": u"ug/m3",
        u"pm": u"ug/m3",
        u"co": u"ppm",
        u"rain": u"in",
        u"no": u"ppb",
        u"no2": u"ppb",
        u"pres": u"mbar",
        u"hum_rel": u"%",
        u"no_y": u"ppb",
        u"so2": u"ppb",
    }
    table = list()
    for row in raw_table:
        try:
            pollutant_name = row.xpath(u"td[1]/a/b/text()").extract()[0]
        except IndexError:
            pollutant_name = None
        # One value cell per hour column (skip name + trailing cells).
        pollutants_data = row.xpath(u"td")[1:-2]
        pollutants_data = [
            "".join(el.xpath(u".//text()").extract())
            for el in pollutants_data
        ]
        pollutants_data = map(self.coerce_float, pollutants_data)
        records = list()
        for el in zip([pollutant_name] * len(data_times), pollutants_data,
                      data_times):
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(el[0])
            pollutant.set_raw_value(el[1])
            pollutant.set_raw_units(units.get(pollutant.get_name()))
            res = {
                "name": pollutant.get_name(),
                "value": pollutant.get_value(),
                "date": el[2],
                "unit": pollutant.get_units()
            }
            records.append(res)
        table.extend(records)
    df = pd.DataFrame(table)
    df["value"] = df["value"].astype(float)
    df = df.dropna(axis=0)
    # Newest timestamp wins; emit only its readings.
    current_data_time = df["date"].max()
    current_data = df[df["date"] == current_data_time]
    station_data = dict()
    for el in current_data[["name", "value"]].itertuples(index=False):
        station_data[el[0]] = el[1]
    if station_data:
        items = AppItem()
        items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
        items[u"data_time"] = pd.to_datetime(current_data_time).replace(
            tzinfo=timezone(self.tz))
        items[u"data_value"] = station_data
        items[u"source"] = self.source
        items[u"source_id"] = resp.meta[u"code"]
        yield items
def get_station_data(self, resp):
    # Parse a deeply nested hourly table: rows are (site, parameter, units,
    # hour-columns); the rightmost hour column is the latest reading. Yields
    # one AppItem per site.
    raw_date = resp.xpath(
        u"/html/body/table[3]/tr/td/table/tr/td/table[2]/tr/td/form/table/tr[1]/td/text()"
    ).extract()[1]
    raw_date = u" ".join(raw_date.split())
    curr_date = parser.parse(raw_date)
    # Rebuilt as "dd-mm-yyyy"; parsed again below with dayfirst=True.
    str_date = u"{dd}-{mm}-{yyyy}".format(dd=curr_date.day,
                                          mm=curr_date.month,
                                          yyyy=curr_date.year)
    # Header row, minus the first and last decorative cells.
    col_names = resp.xpath(
        u"/html/body/table[3]/tr/td/table/tr/td/table[2]/tr/td/form/table/tr[last()]/td[1]/table/tr[1]/th"
    )[1:-1]
    col_names = [el.xpath(u"./text()").extract_first() for el in col_names]
    # Data rows are the ones carrying an @id attribute.
    table = resp.xpath(
        u"/html/body/table[3]/tr/td/table/tr/td/table[2]/tr/td/form/table/tr[last()]/td[1]/table/tr[@id]"
    )
    df_records = list()
    for row in table:
        record = row.xpath(u"./td")[1:-1]
        record = [el.xpath(u"./text()").extract_first() for el in record]
        data = zip(col_names, record)
        df_record = dict(data)
        df_records.append(df_record)
    df = pd.DataFrame(df_records, columns=col_names)
    # could be one value in column
    df = df.dropna(axis=1, thresh=1)
    df = df.dropna(axis=0, how=u'all')
    # Last remaining column is the most recent hour of data.
    latest_data = df[[u"Site", u"Param", u"UNITS", df.columns[-1]]]
    hour = latest_data.columns[-1]
    raw_data_time = u"{0} {1}:00".format(str_date, hour)
    data_time = parser.parse(
        raw_data_time, dayfirst=True).replace(tzinfo=timezone(self.tz))
    grouped = latest_data.groupby(by=u"Site")
    for name, gr in grouped:
        station_data = dict()
        station_id = None
        for record in gr.itertuples(index=False):
            # Tuple layout: Site, Param, UNITS, <latest hour value>.
            if station_id is None:
                station_id = record[0]
            pollutant_name = record[1]
            pollutant_value = record[3]
            pollutant_units = record[2]
            # Feature normalizes raw readings; invalid ones come back None.
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(pollutant_name)
            pollutant.set_raw_value(pollutant_value)
            pollutant.set_raw_units(pollutant_units)
            if pollutant.get_name() is not None and pollutant.get_value(
            ) is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items
def get_station_data(self, resp):
    # Extract per-station payloads from createMultiStation(...) JS calls in
    # the page's last <script>, coerce each argument list into JSON by string
    # surgery, then pull name/value/unit and the timestamp out of embedded
    # HTML fragments. Yields one AppItem per station.
    raw_stations_data = resp.xpath(u"/html/head/script[last()]/text()").re(
        u"createMultiStation\((.*)\);")
    for station in raw_stations_data:
        # Wrap the raw JS argument list so it parses as a single JSON array;
        # escape embedded double quotes, promote single quotes, and quote the
        # bare "role" key. Fragile — depends on the page's exact JS shape.
        test_data = u"[{0}]".format(station)
        test_data = test_data.replace(u'"', u'\\"')
        test_data = test_data.replace(u"'", u'"')
        test_data = test_data.replace(u"role", u'"role"')
        json = ujson.loads(test_data)
        # Fourth argument's first element: [station_id, ..., html fragments].
        raw_data = json[3][0]
        station_id = raw_data[0]
        # Second-to-last fragment holds the observation timestamp.
        row_data_time = Selector(text=raw_data[len(raw_data) - 2])
        data_time = row_data_time.xpath(u"//td[2]/text()").extract_first()
        data_time = parser.parse(data_time).replace(
            tzinfo=timezone(self.tz))
        # Fragments -4 and -3 together form the pollutant table.
        pollutant_data = raw_data[len(raw_data) - 4] + raw_data[len(raw_data) - 3]
        pollutant_data = Selector(text=pollutant_data)
        # Pollutant names are split across <u> (base) and <sub> (subscript).
        pollutants_name_p1 = pollutant_data.xpath(u"//table/tr[1]/td")[1:3]
        pollutants_name_p1 = [
            el.xpath(u"u/text()").extract_first()
            for el in pollutants_name_p1
        ]
        pollutants_name_p2 = pollutant_data.xpath(u"//table/tr[1]/td")[1:3]
        pollutants_name_p2 = [
            el.xpath(u"sub/text()").extract_first()
            for el in pollutants_name_p2
        ]
        pollutants_name = [
            u" ".join(x)
            for x in zip(pollutants_name_p1, pollutants_name_p2)
        ]
        # Second row cells look like "<value> <unit>".
        pollutant_value_data = pollutant_data.xpath(
            u"//table/tr[2]/td")[1:3]
        pollutant_value_data = [
            val.xpath(u"text()").extract_first().split(u" ")
            for val in pollutant_value_data
        ]
        pollutant_value = [el[0] for el in pollutant_value_data]
        pollutant_units = [el[1] for el in pollutant_value_data]
        data = zip(pollutants_name, pollutant_value, pollutant_units)
        station_data = dict()
        for record in data:
            # Feature normalizes raw name/value/units; invalid -> None.
            pollutant = Feature(self.name)
            pollutant.set_source(self.source)
            pollutant.set_raw_name(record[0])
            pollutant.set_raw_value(record[1])
            pollutant.set_raw_units(record[2])
            if pollutant.get_name() is not None and pollutant.get_value(
            ) is not None:
                station_data[pollutant.get_name()] = pollutant.get_value()
        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id
            yield items