Example 1
    def get_station_data(self, resp):
        row_rows = resp.xpath(
            u"/html/body/div[7]/div/div[1]/div[2]/div/div[3]/table/tbody/tr[2]"
        )
        print(row_rows)
        station_name = row_rows.xpath(u"td[1]/b/text()").extract_first()
        station_id = station_name
        aqi = row_rows.xpath(u"td[2]/p/text()").extract_first()
        pm25 = row_rows.xpath(u"td[3]/p/text()").extract_first()
        row_data_time = row_rows.xpath(u"td[4]/span/text()").extract_first()
        print(row_data_time)
        data_time = self.parse_date(row_data_time)

        _tmp_dict = Kind(self.name).get_dict(r_key=u"pm25", r_val=pm25)
        station_data = dict()
        if _tmp_dict:
            station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        _tmp_dict = Kind(self.name).get_dict(r_key=u"aqi", r_val=aqi)
        if _tmp_dict:
            station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source_name
            items[u"source_id"] = str(station_id)

            yield items
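
This example, like all the ones below, funnels each raw (name, value) pair through Kind(self.name).get_dict(r_key=..., r_val=...) and copies the result into an AppItem when it is truthy. The stub below is only a sketch of the contract those calls appear to assume (a canonical pollutant key plus a parsed numeric value, or a falsy result when the pair is not recognised); the mapping table and parsing rules here are illustrative assumptions, not the project's actual Kind implementation.

# Hypothetical sketch of the get_dict() contract these examples rely on;
# the real mapping table and parsing rules live in the project's Kind class.
KNOWN_KEYS = {u"PM2.5": u"pm25", u"pm25": u"pm25", u"NO2": u"no2", u"aqi": u"aqi"}

def get_dict_sketch(r_key, r_val):
    key = KNOWN_KEYS.get(r_key)               # normalise the raw column name
    if key is None or r_val in (None, u""):   # unknown name or empty value
        return None                           # falsy result -> caller skips the pair
    try:
        val = float(r_val)                    # raw values usually arrive as strings
    except (TypeError, ValueError):
        return None
    return {u"key": key, u"val": val}         # read back as _tmp_dict[u"key"] / [u"val"]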
Example 2
    def get_st_data(self, resp):
        regex = u"Details/(.+)\?type=1"
        st_id = findall(regex, resp.url)
        st_id = str(st_id[0])

        row_data = resp.xpath(u'//*[@id="recentResults"]/tbody/tr')
        row_dt_hour = resp.xpath(
            u'//*[@id="recentResults"]/thead/tr[2]/th[1]/text()').re_first(
                u"Current \((.+)\)")
        new_dt = self.check_date(row_dt_hour)

        st_data = dict()
        for data in row_data:
            _name = data.xpath(u"td[1]/text()").re_first(u"(.+[\S])")
            _val = data.xpath(u"td[2]/text()").re_first(u"(.+[\S])")

            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id

            yield items
Example 3
    def get_st_data(self, resp):
        row_json = resp.body.decode(u"utf-8")
        row_json = row_json.replace(u"\ufeff", u"")
        json = js_loads(row_json)

        data_time = self.get_date(json[u"DateTime"])
        stations = json[u"Stations"]

        for st in stations:
            name = st[u"Station"]
            params = st[u"ParameterValueList"]

            st_data = dict()
            for p in params:
                pol_name = p[u"Id"]
                pol_value = p[u"Value"]

                _tmp_dict = Kind(self.name).get_dict(r_key=pol_name, r_val=pol_value)
                if _tmp_dict:
                    st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            if st_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = st_data
                items[u"source"] = self.source
                items[u"source_id"] = name

                yield items
Example 4
    def get_station_data(self, resp):
        body = resp.body
        root = etree.fromstring(body)
        row_data = root.xpath(u"//Measurement")

        station = dict()
        date = root.xpath(u"//Measurement[1]/Data/E/T/text()")
        data_time = parse(date[0]).replace(tzinfo=timezone(self.tz))

        for st in row_data:
            name = st.xpath(u"@SiteName")
            pol_name = st.xpath(u"DataSource/@Name")
            pol_val = st.xpath(u"Data/E/I1/text()")
            pol_time = st.xpath(u"Data/E/T/text()")
            print(name, pol_name, pol_val, pol_time)

            if name[0] not in station:
                station[name[0]] = dict()

            _tmp_dict = Kind(self.name).get_dict(r_key=pol_name[0],
                                                 r_val=pol_val[0])
            if _tmp_dict:
                station[name[0]][_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        for st_data in station:
            if st_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station[st_data]
                items[u"source"] = self.source
                items[u"source_id"] = st_data

                yield items
Example 5
    def get_station_data(self, resp):
        _station_id = Selector(text=resp.url).re(u"site_id=(\d+)")
        station_id = _station_id[0]

        data_time = self.get_date(resp)
        table = resp.xpath(u'//*[@id="tab1"]/table/tr')

        station_data = dict()
        for row in table:
            col = row.xpath(u"td/text()").extract()
            col = col[:2]

            _name = col[0]
            _val = col[1]
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id

            yield items
Example 6
    def get_st_data(self, resp):
        regex = u".*ST_ID=(.+)"
        st_id = findall(regex, resp.url)
        st_id = str(st_id[0])

        table_name = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td/span')
        table_val = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td/span')
        names = list()
        for row in table_name:
            # print(row)
            row_name = row.xpath(u"@title").extract_first()
            regex_name = u"(.+)"
            _name = findall(regex_name, row_name)
            try:
                names.append(_name[0])
            except IndexError:
                names.append(None)

        values = list()
        for row in table_val:
            row_val = row.xpath(u"@title").extract_first()
            regex_val = u"\) (.+)"
            _val = findall(regex_val, row_val)
            try:
                values.append(_val[0])
            except IndexError:
                values.append(None)

        # for n in names:
        #     self.tmp_set.add(n)
        #     open("manitoba_names.txt", "a").write(str(self.tmp_set) + "\n")

        try:
            new_dt = self.check_date(values[0])
        except IndexError:
            new_dt = None

        data = zip(names, values)
        data.pop(0)
        # print(data)

        st_data = dict()
        for val in data:
            _name = val[0]
            _val = val[1]
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id

            yield items
Example 7
    def aqi_to_value(self, row_aqi):
        pollutant_name = None
        if row_aqi is not None:
            if u"*" in row_aqi:
                pollutant_name = u"pm10"
            elif u"a" in row_aqi:
                pollutant_name = u"so2"
            elif u"b" in row_aqi:
                pollutant_name = u"no2"
            elif u"c" in row_aqi:
                pollutant_name = u"o3"
            elif u"d" in row_aqi:
                pollutant_name = u"co"

        station_data = dict()
        if pollutant_name is not None:
            val = row_aqi[:-1]
            # convert the AQI to a concentration value if possible
            val = Aqi().aqi_to_val(val, pollutant_name)

            _tmp_dict = Kind(self.name).get_dict(r_key=pollutant_name,
                                                 r_val=val)
            # print("res", _tmp_dict)
            if _tmp_dict:
                station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        # print station_data
        return station_data
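
Examples 7 and 29 recognise the pollutant from a single-letter suffix appended to the AQI figure and then convert it with Aqi().aqi_to_val(value, pollutant_name). A table-driven version of that suffix mapping is sketched below with the same letter codes; it is illustrative only and does not call the project's Aqi or Kind helpers.

# Suffix letter -> pollutant, mirroring the branches in aqi_to_value()
# ("*" marks pm10); purely illustrative, no project helpers involved.
SUFFIX_TO_POLLUTANT = {u"*": u"pm10", u"a": u"so2", u"b": u"no2",
                       u"c": u"o3", u"d": u"co"}

def split_aqi_cell(row_aqi):
    # Return (aqi_value, pollutant_name) for a cell such as u"42*",
    # or (None, None) when the suffix is missing or unknown.
    if not row_aqi:
        return None, None
    pollutant = SUFFIX_TO_POLLUTANT.get(row_aqi[-1])
    if pollutant is None:
        return None, None
    return row_aqi[:-1], pollutant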
Example 8
    def get_st_data(self, resp):
        json = self.get_page(resp)

        for obj in json:
            body = Selector(text=obj[u"description"])
            rows = body.xpath(u"//html/body/table/tr[3]/td/table/tr")
            st_id = str(body.xpath(u"//html/body/table/tr[1]/td/text()").extract_first())

            data_date = body.xpath(u"//html/body/table/tr[2]/td/text()").extract_first()
            data_date = str(data_date.rstrip(u" (IST)"))
            data_date = parse(data_date)
            # print(data_date)
            new_dt = data_date.replace(tzinfo=pytz.timezone(self.tz))

            st_data = dict()
            for row in rows:
                col = row.xpath(u"td/text()").extract()
                try:
                    _name = col[0]
                    _val = col[1]
                    _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                    if _tmp_dict:
                        st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]
                except IndexError:
                    pass

            if st_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.datetime.now(tz=pytz.timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = new_dt
                items[u"data_value"] = st_data
                items[u"source"] = self.source
                items[u"source_id"] = st_id
                yield items
Example 9
    def read_st_data(self, resp):
        st_data = resp.xpath(u"StationMeasurement")
        stations = dict()
        dt = resp.xpath(
            u"StationMeasurement[1]/ReadingDate/text()").extract_first()
        new_dt = self.get_date(dt)
        print(new_dt)

        for data in st_data:
            name = data.xpath(u"StationName/text()").extract_first()
            poll_name = data.xpath(u"ParameterName/text()").extract_first()
            poll_val = data.xpath(u"Value/text()").extract_first()
            _name = str(poll_name)
            _val = str(poll_val)
            # print(_name, _val)
            if name not in stations:
                name = str(name)
                stations[name] = dict()

            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            # print(_tmp_dict)
            if _tmp_dict:
                stations[name][_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        result = (stations, new_dt)
        return result
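
Unlike the other examples, this one only collects the per-station dictionaries and returns them together with the timestamp instead of yielding AppItem objects itself. A caller following the same pattern as the surrounding examples might look like the sketch below; the method name emit_items and the wiring to the spider are assumptions for illustration.

    def emit_items(self, resp):
        # Hypothetical caller: turn the (stations, new_dt) tuple returned by
        # read_st_data() into AppItem objects, mirroring the other examples.
        stations, new_dt = self.read_st_data(resp)
        for st_id, st_data in stations.items():
            if st_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = new_dt
                items[u"data_value"] = st_data
                items[u"source"] = self.source
                items[u"source_id"] = st_id
                yield items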
Example 10
    def get_st_data(self, resp):
        body = resp.body
        body = body.replace(u"\n", u"")
        json = js_parse(body)
        exception = (u"name", u"gps", u"date", u"datetime", u"time")
        for station in json:
            station_name = station[u"name"]
            data_time = self.get_date(station[u"datetime"])

            station_data = dict()
            for attr_name in station:
                if attr_name not in exception:
                    pol_name = attr_name
                    pol_value = station[attr_name]

                    _tmp_dict = Kind(self.name).get_dict(r_key=pol_name, r_val=pol_value)
                    if _tmp_dict:
                        station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_name

                yield items
Example 11
    def get_station_data(self, resp):
        data_time = resp.xpath(u'//*[@id="main"]/div[1]/div[2]/p[3]/text()').re(u"(\d\d\/\d\d\/\d\d\d\d\s\d\d:\d\d)")
        data_time = parser.parse(data_time[0]).replace(tzinfo=timezone(self.tz)) if data_time else None

        table = resp.xpath(u'//*[@id="tabs-content-data"]/table/tbody/tr')

        station_data = dict()
        for row in table:
            _sub = row.xpath(u"td[1]/sub/text()").extract_first()
            pollutant_index = _sub if _sub is not None else u""
            pollutant_name = u" ".join((
                row.xpath(u"td[1]/text()").extract_first().split(u" (")[0],
                pollutant_index,
                row.xpath(u"td[4]/text()").extract_first()
            )).replace(u"  ", u" ")

            _val = row.xpath(u"td[3]/text()").extract_first().split(u" ")[0]
            pollutant_value = _val if _val != u"No" else None

            pollutant = Kind(self.name).get_dict(r_key=pollutant_name, r_val=pollutant_value)
            if pollutant:
                station_data[pollutant[u"key"]] = pollutant[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
Example 12
    def get_station_data(self, resp):
        data_time = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td[1]/span/text()'
        ).extract_first()
        data_time = parser.parse(data_time)
        #
        pollutant_name = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[1]/td')[1:]
        pollutant_name = [
            el.xpath(u"span/text()").extract_first() for el in pollutant_name
        ]
        pollutant_data = resp.xpath(
            u'//*[@id="EnvitechGrid1_GridTable"]/tr[2]/td')[1:]
        pollutant_data = [
            el.xpath(u"span/text()").extract_first() for el in pollutant_data
        ]

        data = zip(pollutant_name, pollutant_data)

        station_data = dict()
        for record in data:
            pollutant = Kind(self.name).get_dict(r_key=record[0],
                                                 r_val=record[1])
            if pollutant:
                station_data[pollutant[u"key"]] = pollutant[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time.replace(tzinfo=timezone(self.tz))
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
Example 13
    def get_st_data(self, resp):
        regex = u"Data/(.+)_Line\.xml"
        st_id = findall(regex, resp.url)
        st_id = str(st_id[0])

        row_data = resp.xpath(u"tname[1]/child::*")
        row_dt = resp.xpath(u"tname[1]/DATE_TIME/text()").extract_first()
        new_dt = self.check_date(row_dt)

        st_data = dict()
        for el in row_data:
            tag_name = el.xpath(u"name()").extract_first()
            tag_val = el.xpath(u"text()").extract_first()
            _name = tag_name
            _val = tag_val

            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id

            yield items
Example 14
    def get_st_data(self, resp):
        date = resp.xpath(u'//*[@id="MainContent"]/div[2]/div[2]/ul/li/span/a/text()').extract_first()
        date = date.replace(u"\t", u"")
        date = date.replace(u"\nAir Quality Index ", u"")

        all_tables = resp.xpath(u'//*[@class="table table-alternate table-condensed"]')
        tables = [x for i, x in enumerate(all_tables) if i % 2 == 0]
        tables_date = [x for i, x in enumerate(all_tables) if i % 2 != 0]
        tables = tables[:len(tables) - 1]
        for index, table in enumerate(tables):
            hour = tables_date[index].xpath(u"tbody/tr/td/text()").re_first(u"AQI at (\d+) hrs")
            row_data_time = date + u" " + hour
            data_time = parse(row_data_time).replace(tzinfo=timezone(self.tz))

            station_id = table.xpath(u"thead/tr/th[1]/text()").re_first(u"(.+) A.Q.M.S.")
            if u" Particles" in station_id:
                station_id = station_id.replace(u" Particles", u"")
            if u" Mobile" in station_id:
                station_id = station_id.replace(u" Mobile", u"")

            rows = table.xpath(u"tbody/tr")

            station_data = dict()
            for row in rows:
                name = row.xpath(u"td[1]/text()").extract_first()
                name = name.replace(u" ", u"")
                aqi = float(row.xpath(u"td[3]/text()").extract_first())

                if name == u"SO2":
                    val_name = u"so2"
                elif name == u"NO2":
                    val_name = u"no2"
                elif name == u"O3":
                    val_name = u"o3"
                elif name == u"PM10":
                    val_name = u"pm10"
                elif name == u"PM2.5":
                    val_name = u"pm25"
                elif name == u"CO":
                    val_name = u"co"
                else:
                    val_name = None

                if val_name:
                    val = Aqi().aqi_to_val(aqi, val_name)

                    _tmp_dict = Kind(self.name).get_dict(r_key=name, r_val=val)
                    if _tmp_dict:
                        station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id
                yield items
Example 15
    def validate_station_data(self, meta):
        """validate dictionary from response META attribute"""
        results = meta.get(u"results")

        res1 = results.get(u"res1")
        res2 = results.get(u"res2")
        res3 = results.get(u"res3")

        df1 = DataFrame.from_dict(res1).sort_values(by=u"date")
        df2 = DataFrame.from_dict(res2).sort_values(by=u"date")
        df3 = DataFrame.from_dict(res3).sort_values(by=u"date")

        df = merge(df1, df2, on=u"date")
        df = merge(df, df3, on=u"date")

        # print(df)

        source_code = meta.get(u"station_code")

        for obs in df.itertuples():
            st_data = dict()
            co = Kind(self.name).get_dict(r_key=u"co", r_val=obs.co)
            pm10 = Kind(self.name).get_dict(r_key=u"pm10", r_val=obs.pm10)
            no2 = Kind(self.name).get_dict(r_key=u"no2", r_val=obs.no2)

            if co:
                st_data[co[u"key"]] = co[u"val"]
            if pm10:
                st_data[pm10[u"key"]] = pm10[u"val"]
            if no2:
                st_data[no2[u"key"]] = no2[u"val"]

            data_time = obs.date.to_datetime()
            data_time = data_time.replace(tzinfo=timezone(self.tz))

            if st_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = st_data
                items[u"source"] = self.source
                items[u"source_id"] = source_code

                yield items
Example 16
    def get_additional_data(self, resp):
        weather_data = resp.meta[u"data"]
        body = resp.body
        body = body.split(u"\r\n")
        col_names = body[8].lstrip(u"#")
        col_names = col_names.split(u", ")
        col_names = col_names[1:]
        # print(col_names)

        data_time = self.get_date(body[1])

        table = body[9:len(body) - 1]
        for row in table:
            col = row.split(u",")

            col_values = list()
            for el in col:
                if u" " in el:
                    el = el.replace(u" ", u"")
                if u"/" in el:
                    el = None
                if u"-99" == el:
                    el = None
                if u"-999" == el:
                    el = None
                if u"-9999" == el:
                    el = None
                col_values.append(el)

            station_id = col_values[0]
            col_values = col_values[1:]
            # print(col_values, station_id)

            data = zip(col_names, col_values)

            station_data = dict()
            for st in data:
                _name = st[0]
                _val = st[1]
                _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                if _tmp_dict:
                    station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            # merge in the data from the first response
            station_data.update(weather_data[station_id])

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id

                yield items
Example 17
    def get_station_data(self, resp):
        row_names = ('Date', 'HOP', 'HGC', 'BSY', 'MEX', 'MTC', 'HEW', 'CLY')

        # process the PDF document
        stream = StringIO.StringIO(resp.body)

        rsrcmgr = PDFResourceManager()
        retstr = StringIO.StringIO()
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)

        process_pdf(rsrcmgr, device, stream)
        device.close()

        doc_str = retstr.getvalue()
        retstr.close()

        # split the extracted text line by line
        row_data = doc_str.split('\n')

        # data_time = row_data[len(row_data)-7]
        # print(data_time)

        first_row = row_data[17:25]
        data = zip(row_names, first_row)

        data_time = self.get_date(data[0][1])
        data = data[1:]

        for st in data:
            station_id = st[0]
            _name = 'PM10_24HR'
            _val = st[1]
            if '*' in _val:
                _val = _val.replace('*', '')

            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)

            station_data = dict()
            if _tmp_dict:
                station_data[_tmp_dict['key']] = _tmp_dict['val']

            # print(station_data)

            if station_data:
                items = AppItem()
                items['scrap_time'] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items['data_time'] = data_time
                items['data_value'] = station_data
                items['source'] = 'http://superpit.com.au'
                items['source_id'] = station_id

                yield items
Example 18
    def get_st_data(self, resp):
        regex = u"AP=(.+)"
        st_id = findall(regex, resp.url)
        st_id = str(st_id[0])

        row_names = resp.xpath(u'//*[@id="apTable"]/table/tr[1]/th/span')
        row_data = resp.xpath(u'//*[@id="apTable"]/table/tr[2]/td')

        new_dt = self.check_date(resp)
        print(new_dt)

        names = list()
        for name in row_names:
            _name = name.xpath(u"text()").extract()

            try:
                names.append(_name[0])

            except IndexError:
                names.append(None)
            # print(_name)

        vals = list()
        for val in row_data:
            _val = val.xpath(u"text()").extract()
            try:
                vals.append(_val[0])
            except IndexError:
                vals.append(None)

        data = zip(names, vals)
        data.pop(0)

        st_data = dict()
        for val in data:
            _name = val[0]
            _val = val[1]
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id

            yield items
Example 19
    def get_st_data(self, resp):
        regex = u".*ST_ID=(.+)"
        station_id = findall(regex, resp.url)
        station_id = str(station_id[0])

        # build the pollutant column names from the table header
        row_col_names = resp.xpath(
            u'//*[@id="C1WebGrid1"]/tbody/tr[1]/td/div/text()').extract()
        col_names = list()
        for col_name in row_col_names:
            col_name = col_name.lstrip(u'\n\t')
            col_name = col_name.rstrip(u'\n\t')
            col_names.append(col_name)

        col_names = col_names[1:]

        # extract the pollutant values
        row_data = resp.xpath(u'//*[@id="C1WebGrid1"]/tbody/tr[5]/td')
        data_values = list()
        for data in row_data:
            row_value = data.xpath(u"div/text()").re(u"([\S].+[\S])")
            try:
                data_values.append(row_value[0])
            except IndexError:
                data_values.append(None)

        # the data timestamp
        data_date = parse(data_values[0])
        data_date = data_date.replace(tzinfo=timezone(self.tz))

        data_values = data_values[1:]
        data = zip(col_names, data_values)

        station_data = dict()
        for st in data:
            _name = st[0]
            _val = st[1]
            _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
            if _tmp_dict:
                station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_date
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = station_id

            return items
Example 20
    def get_station_data(self, resp):
        body = resp.body
        body = body.split(u"\r\n")
        col_names = body[8].lstrip(u"#")
        col_names = col_names.split(u", ")
        col_names = col_names[1:]
        # print(col_names)

        table = body[9:len(body)-1]
        weather_data = dict()
        for row in table:
            col = row.split(u",")
            col_values = list()

            for el in col:
                if u" " in el:
                    el = el.replace(u" ", u"")
                if u"/" in el:
                    el = None
                if u"-99" == el:
                    el = None
                if u"-999" == el:
                    el = None
                if u"-9999" == el:
                    el = None
                col_values.append(el)
            station_id = col_values[0]

            col_values = col_values[1:]
            # print(col_values)
            data = zip(col_names, col_values)

            station_data = dict()
            for st in data:
                _name = st[0]
                _val = st[1]
                _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                if _tmp_dict:
                    station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            weather_data[station_id] = station_data
            # print(station_data)

        # print(weather_data)
        return Request(
            u"http://epa.tas.gov.au/air/live/epa_tas_latest_particle_data.txt",
            callback=self.get_additional_data,
            meta=dict(data=weather_data)
        )
Example 21
    def get_st_data(self, resp):
        table = resp.xpath(u'//*[@id="right_column"]/div/table/tbody/tr')
        col_names = resp.xpath(u'//*[@id="right_column"]/div/table/thead//th/@abbr').extract()

        data_date = resp.xpath(u'//*[@id="right_column"]/div/h1/text()').extract()
        data_date = str(data_date[0]).lstrip(u"Pollutant Concentrations for ")
        dt = parse(data_date)
        new_dt = dt.replace(tzinfo=timezone(self.tz))

        #  get correct values, if there is no value add ""
        for row in table:
            cols = row.xpath(u"td")

            #  get id from href
            url = cols[0].xpath(u"div/a/@href").extract()
            url = str(url[0])
            regex_station = u"stationid=(.+)"
            st_id = re.findall(regex_station, url)
            st_id = str(st_id[0])

            row_data = []
            for col in cols:
                text = col.xpath(u"div[1]/text()").extract()
                try:
                    row_data.append(text[0])
                except IndexError:
                    row_data.append(None)

            gen_data = zip(col_names, row_data)
            st_data = dict()
            for data in gen_data:
                _key = data[0]
                _val = data[1]

                _tmp_dict = Kind(self.name).get_dict(r_key=_key, r_val=_val)

                if _tmp_dict:
                    st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            if st_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = new_dt
                items[u"data_value"] = st_data
                items[u"source"] = self.source
                items[u"source_id"] = st_id

                yield items
Example 22
    def get_st_data(self, resp):
        #  get local id from url
        regex_state = u"StateId=(.+?)&"
        regex_city = u"CityId=(.+)"
        regex_station = u"StationName=(.+?)&"
        _state_id = re.findall(regex_state, resp.url)
        _city_id = re.findall(regex_city, resp.url)
        _station_id = re.findall(regex_station, resp.url)
        st_id = u"".join(
            (_station_id[0].replace(u"%20", u" "), _state_id[0], _city_id[0]))

        table = resp.xpath(u'//*[@id="lblReportCurrentData"]/table/child::*')
        data_date = resp.xpath(
            u'//*[@id="lblCurrentDateTime"]/text()').extract_first()

        st_data = {}
        for el in table:
            col = el.xpath(u"child::td")
            try:
                name = col[0].xpath(u"text()").extract()
                _name = name[0]
                _val = col[3].xpath(u"span/text()").extract_first()

                _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                if _tmp_dict:
                    st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            except IndexError:
                pass

        if data_date:
            data_date = data_date.replace(u"Date Time : ", u"")
            new_dt = parse(data_date)
            new_dt = new_dt.replace(tzinfo=timezone(self.tz))
        else:
            new_dt = None

        if st_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.datetime.now(
                tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = new_dt
            items[u"data_value"] = st_data
            items[u"source"] = self.source
            items[u"source_id"] = st_id
            return items
Example 23
    def get_station_data(self, resp):
        data_time = resp.xpath(u'//*[@id="tab_content"]/p[1]/strong/text()'
                               ).re(u"(\d\d\/\d\d\/\d\d\d\d\s\d\d:\d\d)")
        data_time = parser.parse(data_time[0]).replace(
            tzinfo=timezone(self.tz)) if data_time else None

        table = resp.xpath(u'//*[@id="tab_content"]/table/tr')[1:]

        station_data = dict()
        for row in table:
            pollutant_name = row.xpath(u"td[1]/text()").extract_first()
            _sub = row.xpath(u"td[1]/sub/text()").extract_first()
            pollutant_name_ind = _sub if _sub is not None else u""
            pollutant_name_time = row.xpath(
                u"td[last()]/text()").extract_first()

            pollutant_name = (u" ".join(
                (pollutant_name, pollutant_name_ind,
                 pollutant_name_time))).replace(u"  ", u" ")

            pollutant_value = row.xpath(
                u"td[last() - 1]/text()").extract_first()
            if u"\xa0" in pollutant_value:
                pollutant_value = pollutant_value.split(u"\xa0")[0]
            else:
                pollutant_value = pollutant_value.split(u" ")[0]

            pollutant_value = pollutant_value if pollutant_value != u"No" else None

            pollutant = Kind(self.name).get_dict(r_key=pollutant_name,
                                                 r_val=pollutant_value)
            if pollutant:
                station_data[pollutant[u"key"]] = pollutant[u"val"]

        if station_data:
            items = AppItem()
            items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
            items[u"data_time"] = data_time
            items[u"data_value"] = station_data
            items[u"source"] = self.source
            items[u"source_id"] = resp.meta[u"code"]

            yield items
Example 24
    def get_station_data(self, resp):
        # get data time
        row_data_time = resp.xpath(u'//*[@id="titulo"]').re(
            u"<h4>(.+)</h4>")[0]
        data_time = self.check_data_time(row_data_time)

        # get data
        row_data_table = resp.xpath(u'//*[@id="dados_estacoes"]/table/tr')[2:]

        for row in row_data_table:
            colspan_check = row.xpath(u"td/@colspan").extract()
            if not colspan_check:
                cols = row.xpath(u"td/text()").extract()
                station_name = cols[0]
                station_id = station_name
                print(cols)
                so2 = (u"so2", cols[1].strip(u" "))
                co = (u"co", cols[2].strip(u" "))
                pm10 = (u"pm10", cols[3].strip(u" "))
                o3 = (u"o3", cols[4].strip(u" "))
                no2 = (u"no2", cols[5].strip(u" "))

                # full pollution data
                data = (so2, co, pm10, o3, no2)

                station_data = dict()
                for val in data:
                    _name = val[0]
                    _val = val[1]
                    _tmp_dict = Kind(self.name).get_dict(r_key=_name,
                                                         r_val=_val)
                    if _tmp_dict:
                        station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

                if station_data:
                    items = AppItem()
                    items[u"scrap_time"] = datetime.now(
                        tz=timezone(SCRAPER_TIMEZONE))
                    items[u"data_time"] = data_time
                    items[u"data_value"] = station_data
                    items[u"source"] = self.source
                    items[u"source_id"] = station_id

                    yield items
Example 25
    def get_station_data(self, resp):
        tabs = resp.xpath(u'//*[@id="Form1"]/div[3]/div[2]/article')[1:]
        for tab in tabs:
            row_data_time = tab.xpath(u"header/span/text()").extract_first()
            data_time = self.check_data_time(row_data_time)

            row_data_table = tab.xpath(u"section/ul/li")
            for row in row_data_table:
                station_name = row.xpath(u"a/h3/text()").extract_first()
                station_id = station_name
                pollution_table = row.xpath(u"div/div[3]/table/tbody/tr")

                station_data = dict()
                for pollution in pollution_table:
                    # pollution names
                    pollution_name = pollution.xpath(u"td[1]/text()").extract_first()

                    sub_pollution_name = pollution.xpath(u"td[1]/sub/text()").extract_first()
                    if sub_pollution_name is not None:
                        pollution_name += sub_pollution_name

                    pollution_name = pollution_name.split(u" \u2013 ")[1]
                    pollution_name = clean(pollution_name)
                    # print(repr(pollution_name))
                    # pollution values
                    pollution_value = pollution.xpath(u"td[2]/text()").extract_first()
                    pollution_value = clean(pollution_value, (u"\xcdndice:",))

                    # print(repr(pollution_value))

                    _tmp_dict = Kind(self.name).get_dict(r_key=pollution_name, r_val=pollution_value)
                    if _tmp_dict:
                        station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

                if station_data:
                    items = AppItem()
                    items[u"scrap_time"] = datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                    items[u"data_time"] = data_time
                    items[u"data_value"] = station_data
                    items[u"source"] = self.source
                    items[u"source_id"] = station_id

                    yield items
Example 26
    def get_st_data(self, resp):
        data_date = self.get_date(resp)

        col_names = resp.xpath(
            u'//*[@id="column-main"]/table/tr[1]/th/text()').extract()
        value_names = list()
        for name in col_names:
            value_names.append(name)
        value_names = value_names[2:]

        table = resp.xpath(u'//*[@id="column-main"]/table/tr')
        table = table[1:]
        for row in table:
            col = row.xpath(u"td/text()").extract()
            st_id = col[0].strip(u"\xa0")

            col = col[2:]
            data_values = list()
            for value in col:
                data_values.append(value)
            # print(data_values)
            # print len(data_values)

            data = zip(value_names, data_values)

            st_data = dict()
            for st in data:
                _name = st[0]
                _val = st[1]
                _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                if _tmp_dict:
                    st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            if st_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_date
                items[u"data_value"] = st_data
                items[u"source"] = self.source
                items[u"source_id"] = st_id

                yield items
Example 27
    def get_st_data(self, resp):
        stations = resp.xpath(u"STATIONS/STRD")

        for station in stations:
            st_id = station.xpath(u"@ID").extract_first()
            st_id = str(st_id)
            row_st_data = station.xpath(u"READS/RD[1]")

            dt = row_st_data.xpath(u"@DT").extract()
            if dt:
                dt_arr = str(dt[0]).split(u",")
                dt_str = u"{1}.{2}.{0} {3}:{4}:{5}".format(*dt_arr)
                new_dt = parse(str(dt_str))
                # open("testtt.txt", "a").write(str(dt_str) + "\n")
                # new_dt = time.strptime(str(dt[0]), "%Y,%m,%d,%H,%M,%S")
                # new_dt = parse(new_dt)
                new_dt = new_dt.replace(tzinfo=timezone(self.tz))
            else:
                new_dt = None

            row_params = row_st_data.xpath(u"PARAMS/PV")
            st_data = {}
            for pv in row_params:
                _name = pv.xpath(u"@NM").extract_first()

                #  recognize key val from table
                _val = pv.xpath(u"@VL").extract_first()

                _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                if _tmp_dict:
                    st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            if st_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.datetime.now(tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = new_dt
                items[u"data_value"] = st_data
                items[u"source"] = self.source
                items[u"source_id"] = st_id

                yield items
Example 28
    def get_st_data(self, resp):
        col_names = (u"SO2", u"NO", u"NO2", u"NOX", u"CO", u"OX", u"NMHC",
                     u"CH4", u"THC", u"SPM", u"PM2.5", u"SP", u"WS", u"TEMP",
                     u"HUM")
        data_time = parse(self.date[u"date"] + u" " +
                          self.date[u"hour"]).replace(tzinfo=timezone(self.tz))

        rows = resp.xpath(u"//table/tr")
        for row in rows:
            col = row.xpath(u"td/text()").extract()
            col.pop(1)
            col.pop(13)

            station_id = col[0]

            col = col[1:]
            data = zip(col_names, col)

            station_data = dict()
            for st in data:
                _name = st[0]
                _val = st[1]
                _tmp_dict = Kind(self.name).get_dict(r_key=_name, r_val=_val)
                if _tmp_dict:
                    station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            # print(station_data)

            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id

                yield items
Example 29
    def get_value(self, string):
        regex = u"<h2>(\d*.?)</h2>"
        substr = findall(regex, string)
        try:
            _substr = substr[0]
            aqi = _substr[:-1]
            # print("aqi", aqi)

            names = [u"aqi"]
            if u"*" in _substr:
                names.append(u"pm10")
            elif u"a" in _substr:
                names.append(u"so2")
            elif u"b" in _substr:
                names.append(u"no2")
            elif u"c" in _substr:
                names.append(u"o3")
            elif u"d" in _substr:
                names.append(u"co")

            # print("names", names)

            st_data = dict()
            for name in names:
                val = aqi
                if name != u"aqi":
                    # convert the AQI to a concentration value if possible
                    val = Aqi().aqi_to_val(float(aqi), name)

                _tmp_dict = Kind(self.name).get_dict(r_key=name, r_val=val)
                # print("res", _tmp_dict)
                if _tmp_dict:
                    st_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            return st_data

        except IndexError:
            return None
Example 30
    def get_station_data(self, resp):
        stations = resp.xpath(u"//station")
        data_date = resp.xpath(u"category/@measurementdate").extract_first()
        data_hour = resp.xpath(u"category/@measurementhour").extract_first()
        if u"24" == data_hour:
            data_hour = u"00"
        data_time = data_date + u" " + data_hour
        data_time = parse(data_time).replace(tzinfo=timezone(self.tz))

        for st in stations:
            station_id = st.xpath(u"@name").extract_first()
            if u"'" in station_id:
                station_id = station_id.replace(u"'", u"")

            measurements = st.xpath(u"measurement")

            station_data = dict()
            for meas in measurements:
                pol_name = meas.xpath(u"@name").extract_first()
                pol_val = meas.xpath(u"text()").extract_first()
                # print(pol_name, pol_val)

                _tmp_dict = Kind(self.name).get_dict(r_key=pol_name,
                                                     r_val=pol_val)
                if _tmp_dict:
                    station_data[_tmp_dict[u"key"]] = _tmp_dict[u"val"]

            # print(station_data)
            if station_data:
                items = AppItem()
                items[u"scrap_time"] = datetime.now(
                    tz=timezone(SCRAPER_TIMEZONE))
                items[u"data_time"] = data_time
                items[u"data_value"] = station_data
                items[u"source"] = self.source
                items[u"source_id"] = station_id

                yield items