Example #1
    def __init__(cls, **retrieval_kwargs):
        """Retrieve sensor information from the InfluencAir project.

        Args:
            retrieval_kwargs: keyword arguments to pass to retrieve
                function

        Raises:
            KeyError if sheet structure does not match listed columns
        """
        sensor_info = retrieve(SENSOR_INFO_CACHE_FILE,
                               SENSOR_SHEET_DOWNLOAD_URL,
                               "InfluencAir sensor information",
                               read_func=pd.read_csv,
                               read_func_kwargs={"header": 1,
                                                 "dtype": "object"},
                               **retrieval_kwargs)
        try:
            sensor_info = (sensor_info[["Chip ID", "PM Sensor ID",
                                        "Hum/Temp Sensor ID", "Label",
                                        "Address", "Floor",
                                        "Side (Street/Garden)"]]
                           .rename(columns={"Side (Street/Garden)": "Side"}))
        except KeyError:
            raise KeyError("Could not get columns. Check if the structure or "
                           "labels of the InfluencAir sensor Google Sheet "
                           "have changed.")
        cls.sensors = sensor_info
        cls.initialized = True
Example #2
class PricerServer(object):
    _reg_progid_ = "ppf.pricer"
    _reg_clsid_ = "{08632905-0B63-45B5-B388-30C73CAE611C}"
    _public_methods_ = [
        "CreateHullWhiteLatticePricer",
        "InvokePricer",
    ]
    _pricers = {}

    retrieve = staticmethod(lambda tag, which: utils.retrieve(
        'pricer_server', 'PricerServer', tag, which))

    def CreateHullWhiteLatticePricer(self, tag, trade_id, env_id, num_states,
                                     num_std_dev):
        try:
            from trade_server import TradeServer
            from market_server import MarketServer
            trade = TradeServer.retrieve(trade_id, 'trades')
            env = MarketServer.retrieve(env_id, 'environments')
            model_args = {"num states": num_states, "num std dev": num_std_dev}
            factory = ppf.model.hull_white_lattice_model_factory()
            model = factory(trade, env, model_args)
            pricer = ppf.pricer.lattice_pricer(trade, model, env, None)
            PricerServer._pricers[tag] = pricer
            return tag
        except RuntimeError as e:
            ppf.com.utils.raise_com_exception(e)
Example #3
    def get_stations(cls, **retrieval_kwargs):
        """Retrieve a list of measuring stations.

        Args:
            retrieval_kwargs: keyword arguments to pass to retrieve
                function
        """

        # Retrieve and reshape data
        stations = retrieve(STATIONS_CACHE_FILE, STATIONS_URL,
                            "station metadata", **retrieval_kwargs)
        stations = (stations.drop(columns=["geometry.type", "type"]).rename(
            columns={
                "properties.id": "id",
                "properties.label": "label"
            }).set_index("id"))

        # Split coordinates into columns
        coords = pd.DataFrame(
            [row for row in stations["geometry.coordinates"]],
            index=stations.index)
        stations[["lon", "lat", "alt"]] = coords
        stations.drop(columns=["geometry.coordinates", "alt"], inplace=True)

        cls.stations = stations
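
The coordinate-splitting step above expands a column of [lon, lat, alt] lists into separate columns. A self-contained sketch of the same idea on made-up station data (the IDs, labels, and coordinates below are illustrative only, not values from the API):

import pandas as pd

# Hypothetical stations with [lon, lat, alt] coordinate lists
stations = pd.DataFrame(
    {"label": ["Station A", "Station B"],
     "geometry.coordinates": [[4.35, 50.85, 20.0], [4.38, 50.87, 35.0]]},
    index=pd.Index([1030, 1031], name="id"))

# Expand each list into named columns, aligned on the station index
coords = pd.DataFrame(stations["geometry.coordinates"].tolist(),
                      index=stations.index, columns=["lon", "lat", "alt"])
stations = stations.join(coords).drop(columns=["geometry.coordinates", "alt"])
print(stations)
#           label   lon    lat
# id
# 1030  Station A  4.35  50.85
# 1031  Station B  4.38  50.87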
Example #4
def _add_arch_linux_libcs():
    def _find_packages_urls(architecture):
        url = "https://archive.archlinux.org/packages/g/glibc/"
        try:
            packages_filenames = utils.findall(
                fr"['\"](?P<filename>glibc-(?:.*?)-{architecture}\.pkg\.tar\.[gx]z)['\"]",
                url,
            )
        except AttributeError:
            print(utils.make_warning(f"Problems: {utils.make_bright(url)}"))
            return []
        else:
            packages_urls = [
                os.path.join(url, package_filename)
                for package_filename in packages_filenames
            ]
            return packages_urls

    distro_dirpath = os.path.join(utils.get_libcs_dirpath(), "arch")
    os.makedirs(distro_dirpath, exist_ok=True)
    for architecture in ("i686", "x86_64"):
        for package_url in _find_packages_urls(architecture):
            if _already_in_db(package_url):
                print(f"Skipping: {utils.make_bright(package_url)}")
                continue
            with tempfile.TemporaryDirectory() as tmp_dirpath:
                print(f"Downloading: {utils.make_bright(package_url)}")
                package_filepath = utils.retrieve(package_url, tmp_dirpath)
                add(package_filepath, dest_dirpath=distro_dirpath)
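
utils.retrieve and utils.findall are project helpers, but the filename regex itself can be exercised on its own. A standalone sketch against a made-up fragment of the archive's HTML listing (the package names are invented for illustration):

import re

# Hypothetical excerpt of the glibc package listing page
listing = '''
<a href="glibc-2.26-8-x86_64.pkg.tar.xz">glibc-2.26-8-x86_64.pkg.tar.xz</a>
<a href="glibc-2.28-5-x86_64.pkg.tar.gz">glibc-2.28-5-x86_64.pkg.tar.gz</a>
<a href="glibc-2.26-8-i686.pkg.tar.xz">glibc-2.26-8-i686.pkg.tar.xz</a>
'''

architecture = "x86_64"
pattern = (fr"['\"](?P<filename>glibc-(?:.*?)-{architecture}"
           fr"\.pkg\.tar\.[gx]z)['\"]")
print(re.findall(pattern, listing))
# ['glibc-2.26-8-x86_64.pkg.tar.xz', 'glibc-2.28-5-x86_64.pkg.tar.gz']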
Example #5
def _add_debian_libcs():
    def _find_packages_urls(release, architecture, package):
        url = f"https://packages.debian.org/{release}/{architecture}/{package}/download"
        try:
            package_url = utils.search(
                r"['\"](?P<url>https?.*?libc6.*?.deb)['\"]", url).group("url")
        except AttributeError:
            print(utils.make_warning(f"Problems: {utils.make_bright(url)}"))
            return []
        else:
            return [package_url]

    distro_dirpath = os.path.join(utils.get_libcs_dirpath(), "debian")
    os.makedirs(distro_dirpath, exist_ok=True)
    for release in ("squeeze", "wheezy", "jessie", "stretch", "buster"):
        release_dirpath = os.path.join(distro_dirpath, release)
        os.makedirs(release_dirpath, exist_ok=True)
        for architecture in ("i386", "amd64"):
            for package in ("libc6", "libc6-dbg"):
                for package_url in _find_packages_urls(release, architecture,
                                                       package):
                    if _already_in_db(package_url):
                        print(f"Skipping: {utils.make_bright(package_url)}")
                        continue
                    with tempfile.TemporaryDirectory() as tmp_dirpath:
                        print(f"Downloading: {utils.make_bright(package_url)}")
                        package_filepath = utils.retrieve(
                            package_url, tmp_dirpath)
                        add(package_filepath, dest_dirpath=release_dirpath)
Example #6
def text(i, url, params):
    """
    Fetch and save the article body text.
    """
    content = retrieve(url, params)
    tree = etree.HTML(content)
    t = tree.xpath("//div[@id='article_content']/*")
    _content = ''.join(map(lambda x: etree.tostring(x, encoding='utf-8'), t))
    data[i].append(_content)
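
The same lxml pattern (parse HTML, select nodes by XPath, serialize them back to markup) can be tried on an inline string. A minimal Python 3 sketch; note that etree.tostring returns bytes unless encoding='unicode' is requested, which is why the sketch differs slightly from the snippet above:

from lxml import etree

html = """
<html><body>
  <div id="article_content">
    <p>First paragraph.</p>
    <p>Second paragraph.</p>
  </div>
</body></html>
"""

tree = etree.HTML(html)
nodes = tree.xpath("//div[@id='article_content']/*")
# encoding='unicode' makes tostring() return str rather than bytes
content = ''.join(etree.tostring(node, encoding='unicode') for node in nodes)
print(content)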
Example #7
def text(i, url, params):
    """
    Fetch and save the article body text.
    """
    content = retrieve(url, params)
    tree = etree.HTML(content)
    t = tree.xpath("//section[@class='article']/*")
    _content = ''.join(map(lambda x: etree.tostring(x, encoding='utf-8'), t))
    print(len(_content))
    data[i].append(_content)
Example #8
def all_words():
    data = retrieve('ordmyndalisti')
    if data is not None:
        return data
    else:
        data1 = to_unicode_or_bust(open(os.path.join(os.path.dirname(__file__), 'ordmyndalisti.txt'), 'r').read())
        data2 = to_unicode_or_bust(open(os.path.join(os.path.dirname(__file__), 'ordmyndalisti2.txt'), 'r').read())
        data = data1+data2
        store('ordmyndalisti', data)
        return data
Example #9
    def get_time_series(cls, **retrieval_kwargs):
        """Retrieve information on available time series: a collection
        of station & phenomenon combinations.

        Args:
            retrieval_kwargs: keyword arguments to pass to retrieve
                function
        """

        def get_phenomenon_name(label):
            """Extract phenomenon name from time series label."""
            phenomenon_name_series_id = (label
                                         .split(sep=" - ", maxsplit=1)[0])
            phenomenon_name = phenomenon_name_series_id.rsplit(maxsplit=1)[0]
            return phenomenon_name

        # Retrieve and reshape data
        time_series = retrieve(TIME_SERIES_CACHE_FILE,
                               API_ENDPOINTS["timeseries"],
                               "time series metadata", **retrieval_kwargs)
        time_series["id"] = time_series["id"].astype("int")
        time_series = (time_series
                       .set_index("id")
                       .drop(columns=["station.geometry.type",
                                      "station.type"])
                       .rename(columns={"station.properties.id": "station_id",
                                        "station.properties.label":
                                            "station_label",
                                        "uom": "unit"}))

        # Extract phenomenon names from labels
        labels = time_series["label"]
        time_series["phenomenon"] = labels.apply(get_phenomenon_name)

        # Split coordinates into columns
        coords = pd.DataFrame([row
                               for row
                               in time_series["station.geometry.coordinates"]],
                              index=time_series.index)
        time_series[["station_lat", "station_lon"]] = coords[[1, 0]]

        # Sort and drop columns
        time_series = time_series[["label", "phenomenon", "unit",
                                   "station_id", "station_label",
                                   "station_lat", "station_lon"]]

        # Clean unit descriptors
        time_series["unit"] = (time_series["unit"]
                               .str.replace("m3", "m³")
                               .str.replace("ug", "µg"))
        (time_series
         .loc[time_series["phenomenon"] == "temperature", "unit"]) = "°C"

        cls.time_series = time_series
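
The label parsing in get_phenomenon_name assumes labels of the shape "<phenomenon> <series id> - <station>". A standalone sketch of the two-step split with a made-up label of that shape:

def get_phenomenon_name(label):
    """Extract phenomenon name from time series label."""
    # Keep the part before the first " - ", then drop the trailing series id
    phenomenon_name_series_id = label.split(sep=" - ", maxsplit=1)[0]
    return phenomenon_name_series_id.rsplit(maxsplit=1)[0]

# Hypothetical label, for illustration only
print(get_phenomenon_name("Particulate Matter < 10 µm 6522 - 41R801 - Borgerhout"))
# Particulate Matter < 10 µm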
Example #10
    def get_phenomena(cls, **retrieval_kwargs):
        """Retrieve a list of measured phenomena.

        Args:
            retrieval_kwargs: keyword arguments to pass to retrieve
                function
        """
        phenomena = retrieve(PHENOMENA_CACHE_FILE, API_ENDPOINTS["phenomena"],
                             "phenomenon metadata", **retrieval_kwargs)
        phenomena["id"] = phenomena["id"].astype("int")
        phenomena = phenomena.set_index("id").sort_index()
        cls.phenomena = phenomena
Example #11
    def profile(self, url, params):
        tree = etree.HTML(retrieve(url, params))

        for x in tree.xpath("//ul[@class='daily-list']/li"):
            cover = x.find('a/img').attrib['src']
            title = x.find("div[@class='daily-cont']/h2/a").text
            url = 'http://zhidao.baidu.com' + x.find("div[@class='daily-cont']/h2/a").attrib['href']
            brief = x.find("div[@class='daily-cont']/div[@class='summer']/a").text
            author = '知道日报'
            code = 'zdrb'

            self.data.append([url, cover, title, brief, author, code])
Example #12
    def get_phenomena(cls, **retrieval_kwargs):
        """Retrieve a list of measured phenomena.

        Args:
            retrieval_kwargs: keyword arguments to pass to retrieve
                function
        """
        phenomena = retrieve(PHENOMENA_CACHE_FILE, PHENOMENA_URL,
                             "phenomenon metadata", **retrieval_kwargs)
        # FIXME: id not converted to int
        phenomena.set_index("id", inplace=True)
        phenomena.sort_index(inplace=True)
        cls.phenomena = phenomena
Example #13
def post():
    title = 'Artist Search Result'
    if request.method == 'POST':
        kwds = request.form['keyword']
        try:
            type = request.form['search_type']
        except KeyError:
            message = '検索手法を選択してください'  # "Please select a search method."
            return render_template('index.html', title=title, message=message)

        result = retrieve(kwds, type)
        return render_template('index.html', title=title, keyword=kwds, result=result)
    else:
        return redirect(url_for('index'))
Example #14
    def get_metadata(self, **retrieval_kwargs):
        """Get sensor metadata and current measurements from cache or
        luftdaten.info API.

        Args:
            retrieval_kwargs: keyword arguments to pass to retrieve
                function

        Warns:
            UserWarning if sensor does not appear to be online
        """

        # Get and cache metadata and measurements of past five minutes
        filename = os.path.basename(self.metadata_url.rstrip("/")) + ".json"
        filepath = os.path.join(CACHE_DIR, filename)
        parsed = retrieve(
            filepath, self.metadata_url,
            "sensor {} metadata from luftdaten.info".format(self.sensor_id),
            **retrieval_kwargs)

        try:
            metadata = (parsed.drop(
                columns=["sensordatavalues", "timestamp"]).iloc[0])
        except ValueError:
            warnings.warn("Sensor metadata could not be retrieved")
        else:
            metadata.name = "metadata"
            self.metadata = metadata

            # Extract metadata into corresponding properties
            self.sensor_type = metadata["sensor.sensor_type.name"]
            self.lat = float(metadata["location.latitude"])
            self.lon = float(metadata["location.longitude"])
            self.label = "at " + label_coordinates(self.lat, self.lon)

            # Extract most current measurements
            current = parsed["sensordatavalues"].iloc[-1]
            current = (json_normalize(current).replace({
                "P1": "pm10",
                "P2": "pm2.5"
            }).set_index("value_type")["value"])
            current = (pd.to_numeric(current).replace([999.9, 1999.9],
                                                      pd.np.nan))
            self.current_measurements = dict(current)
            self.phenomena = list(current.index)
            self.units = {
                phenomenon: UNITS[phenomenon]
                for phenomenon in UNITS if phenomenon in self.phenomena
            }
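
The measurement-cleaning step at the end (json_normalize, replace, set_index, to_numeric) can be exercised on a small synthetic payload. A sketch assuming sensordatavalues is a list of {"value_type", "value"} dicts, as the code above implies; pd.json_normalize is the current spelling of the json_normalize helper used there, and float("nan") stands in for pd.np.nan, since pd.np has been removed from recent pandas:

import pandas as pd

# Hypothetical luftdaten.info-style payload for a single SDS011 reading
sensordatavalues = [
    {"value_type": "P1", "value": "24.3"},
    {"value_type": "P2", "value": "11.8"},
]

current = (pd.json_normalize(sensordatavalues)
           .replace({"P1": "pm10", "P2": "pm2.5"})
           .set_index("value_type")["value"])
current = pd.to_numeric(current).replace([999.9, 1999.9], float("nan"))
print(dict(current))
# {'pm10': 24.3, 'pm2.5': 11.8}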
Example #15
def profile(url, params):
    """
    Fetch and save the cover image, title, and summary.
    """
    content = retrieve(url, params)
    tree = etree.HTML(content)

    for x in tree.xpath("//article[@class='posts post-1 cf']"):
        cover = x.find("div[@class='left left-col']/div[2]/a/img").attrib['data-src']
        title = x.find("div[@class='right-col']/h1/a").text
        url = 'http://www.36kr.com' + x.find("div[@class='right-col']/h1/a").attrib['href']
        brief = x.find("div[@class='right-col']/p").text
        author = '36氪'
        code = '36kr'

        data.append([url, cover, title, brief, author, code])
Example #16
def profile(url, params):
    """
    Fetch and save the cover image, title, and summary.
    """
    content = retrieve(url, params)
    tree = etree.HTML(content)

    for x in tree.xpath("//div[@class='clearfix mod-b mod-art']"):
        cover = x.find('a/img').attrib['src']
        title = x.find("div/h3/a").text
        url = 'http://www.huxiu.com' + x.find("div/h3/a").attrib['href']
        brief = x.find("div/div[2]").text
        author = '虎嗅网'
        code = 'hx'

        data.append([url, cover, title, brief, author, code])
Example #17
def process_source(source):
    fname = None
    _buffer = utils.retrieve(source)
    (_, tmpfname) = tempfile.mkstemp()
    with open(tmpfname, "w") as tmpf:
        tmpf.write(_buffer.getvalue())

    archive = utils.get_archive_type(tmpfname)
    if archive == "gzip":
        fname = utils.ungzip(tmpfname)
        os.remove(tmpfname)
    elif archive == "bzip":
        fname = utils.unbzip(tmpfname)
        os.remove(tmpfname)
    elif archive is False:
        fname = tmpfname
    return (fname, True)
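
utils.get_archive_type is a project helper that is not shown here. Purely as an assumption about how such a helper might look, the sketch below guesses the archive type from the file's magic bytes (gzip files start with 0x1f 0x8b, bzip2 files with b'BZh'); it is a hypothetical stand-in, not the project's actual implementation:

def guess_archive_type(path):
    """Hypothetical stand-in for utils.get_archive_type."""
    with open(path, "rb") as f:
        head = f.read(3)
    if head[:2] == b"\x1f\x8b":
        return "gzip"
    if head == b"BZh":
        return "bzip"
    return False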
Example #18
    def get_time_series(cls, **retrieval_kwargs):
        """Retrieve information on available time series: a collection
        of station & phenomenon combinations.

        Args:
            retrieval_kwargs: keyword arguments to pass to retrieve
                function
        """
        def get_phenomenon_name(label):
            """Extract phenomenon name from time series label."""
            phenomenon_name_series_id = (label.split(sep=" - ", maxsplit=1)[0])
            phenomenon_name = phenomenon_name_series_id.rsplit(maxsplit=1)[0]
            return phenomenon_name

        # Retrieve and reshape data
        time_series = retrieve(TIME_SERIES_CACHE_FILE, TIME_SERIES_URL,
                               "time series metadata", **retrieval_kwargs)
        time_series.set_index("id", inplace=True)
        time_series.drop(columns=["station.geometry.type", "station.type"],
                         inplace=True)
        time_series.rename(columns={"station.properties.id": "station_id",
                                    "station.properties.label":
                                        "station_label",
                                    "uom": "unit"},
                           inplace=True)

        # Extract phenomenon names from labels
        labels = time_series["label"]
        time_series["phenomenon"] = labels.apply(get_phenomenon_name)

        # Split coordinates into columns
        coords = pd.DataFrame(
            [row for row in time_series["station.geometry.coordinates"]],
            index=time_series.index)
        time_series[["station_lon", "station_lat", "station_alt"]] = coords

        # Sort and drop columns
        time_series = time_series[[
            "label", "phenomenon", "unit", "station_id", "station_label",
            "station_lon", "station_lat"
        ]]

        cls.time_series = time_series
Example #19
def _add_ubuntu_libcs():
    def _find_packages_urls(release, architecture, package):
        url = f"https://launchpad.net/ubuntu/{release}/{architecture}/{package}"
        packages_versions = set(
            utils.findall(
                fr'"/ubuntu/.+?/{package}/(?P<version>.+?)(?:\.\d+)?"', url))
        if not packages_versions:
            print(utils.make_warning(f"Problems: {utils.make_bright(url)}"))
            return []
        n = 3
        most_recent_packages_versions = sorted(packages_versions,
                                               reverse=True)[:n]

        packages_urls = [
            utils.search(
                r"['\"](?P<url>https?.*?libc6.*?.deb)['\"]",
                f"https://launchpad.net/ubuntu/{release}/{architecture}/{package}/{package_filename}",
            ).group("url")
            for package_filename in most_recent_packages_versions
        ]
        if not packages_urls:
            print(utils.make_warning(f"Problems: {utils.make_bright(url)}"))
            return []
        return packages_urls

    distro_dirpath = os.path.join(utils.get_libcs_dirpath(), "ubuntu")
    os.makedirs(distro_dirpath, exist_ok=True)
    for release in ("trusty", "xenial", "artful", "bionic"):
        release_dirpath = os.path.join(distro_dirpath, release)
        os.makedirs(release_dirpath, exist_ok=True)
        for architecture in ("i386", "amd64"):
            for package in ("libc6", "libc6-dbg"):
                for package_url in _find_packages_urls(release, architecture,
                                                       package):
                    if _already_in_db(package_url):
                        print(f"Skipping: {utils.make_bright(package_url)}")
                        continue
                    with tempfile.TemporaryDirectory() as tmp_dirpath:
                        print(f"Downloading: {utils.make_bright(package_url)}")
                        package_filepath = utils.retrieve(
                            package_url, tmp_dirpath)
                        add(package_filepath, dest_dirpath=release_dirpath)
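
Note that the three "most recent" versions above are chosen with a plain lexicographic sort of the version strings rather than a true version comparison (so, for example, "2.9-..." would sort above "2.19-..."). A tiny standalone illustration with made-up version strings:

# Hypothetical glibc version strings scraped from Launchpad
packages_versions = {"2.23-0ubuntu10", "2.26-0ubuntu2", "2.27-3ubuntu1"}

n = 3
most_recent_packages_versions = sorted(packages_versions, reverse=True)[:n]
print(most_recent_packages_versions)
# ['2.27-3ubuntu1', '2.26-0ubuntu2', '2.23-0ubuntu10']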
Example #20
class TradeServer(object):
    _reg_progid_ = "ppf.trade"
    _reg_clsid_ = "{E33DA322-B011-4FE9-8AB9-87A964EDD046}"
    _public_methods_ = [
        "GenerateFixedCouponObservables",
        "GenerateLiborObservables",
        "GenerateFlows",
        "GenerateAdjuvantTable",
        "GenerateExerciseSchedule",
        "CreateLeg",
        "CreateTrade",
    ]
    _observables = {}
    _flows = {}
    _adjuvants = {}
    _legs = {}
    _exercises = {}
    _trades = {}

    retrieve = staticmethod(lambda tag, which: utils.retrieve(
        'trade_server', 'TradeServer', tag, which))

    def GenerateFixedCouponObservables(self, tag, start, end, roll_period,
                                       roll_duration, reset_currency,
                                       coupon_shift_method, coupon_rate):
        try:
            observables = ppf.core.generate_fixed_coupon_observables(
                start=utils.to_ppf_date(start),
                end=utils.to_ppf_date(end),
                roll_period=roll_period,
                roll_duration=eval("ppf.date_time." + roll_duration),
                reset_currency=reset_currency,
                coupon_shift_method=eval(
                    "ppf.date_time.shift_convention." + coupon_shift_method),
                coupon_rate=coupon_rate)
            TradeServer._observables[tag] = observables
            return tag
        except RuntimeError as e:
            utils.raise_com_exception(e)
Example #21
class MarketServer(object):
    _reg_progid_ = "ppf.market"
    _reg_clsid_ = "{CAFAEEDF-E876-4DD6-9B6F-7038EDA25BCD}"
    _public_methods_ = [
        "CreateEnvironment",
        "EraseEnvironment",
        "AddCurve",
        "AddSurface",
        "AddConstant",
        "ListKeys",
    ]
    _environments = {}

    retrieve = staticmethod(lambda tag, which: utils.retrieve(
        'market_server', 'MarketServer', tag, which))

    def CreateEnvironment(self, tag, t):
        try:
            MarketServer._environments[tag] = \
               ppf.market.environment(utils.to_ppf_date(t))
            return tag
        except RuntimeError as e:
            utils.raise_com_exception(e)
Example #22
    ret.add_argument("repo_dir2", type=str, help="source filename 2")
    ret.add_argument("--destdir", type=str, help="result filename",
                     required=True)
    return ret


if __name__ == "__main__":
    ap = make_arg_parser()
    args = ap.parse_args()

    fst_repo_files = {}
    snd_repo_files = {}
    for target, source in zip((fst_repo_files, snd_repo_files),
                              (args.repo_dir1, args.repo_dir2)):
        p = parser.Parser()
        strbuffer = utils.retrieve(os.path.join(source,
                                                "repodata", "repomd.xml"))
        parsed = p.parse_str(strbuffer.getvalue())
        for data in parsed.get("repomd.data").objects:
            url = data.get("location.href")
            _type = data.get("type")
            target[_type] = os.path.join(source, url)

    missing_in_1 = set(fst_repo_files) - set(snd_repo_files)
    missing_in_2 = set(snd_repo_files) - set(fst_repo_files)
    print "missing types in 1 repo %s" % ",".join(missing_in_1)
    print "missing types in 2 repo %s" % ",".join(missing_in_2)
    common = set(fst_repo_files) & set(snd_repo_files)
    if not os.path.exists(args.destdir):
        os.mkdir(args.destdir)

    for _type in common:
Example #23
    def get_measurements(self, start_date, end_date, **retrieval_kwargs):
        """Get measurement data of the sensor in a given period.

        Data are read from cache if available, or downloaded from
        luftdaten.info and saved to cache as retrieved, and then
        cleaned for self.measurements. If the instance already has data
        associated with it, calling this method replaces them.

        Args:
            start_date: first date of data to retrieve, in ISO 8601
                (YYYY-MM-DD) format
            end_date: last date of data to retrieve, in ISO 8601
                (YYYY-MM-DD) format
            retrieval_kwargs: keyword arguments to pass to retrieve
                function
        """
        sid = self.sensor_id
        if self.sensor_type is None:
            self.sensor_type = input("Type of sensor {} has not been set yet. "
                                     "Enter sensor type: ".format(sid))
        stype = self.sensor_type.lower()

        # Get and process the data file for each date in the requested range
        daily_data = []
        for date in pd.date_range(start_date, end_date):
            date_iso = date.strftime("%Y-%m-%d")
            filename = ARCHIVE_FILENAME_PATTERN.format(date=date_iso,
                                                       sensor_type=stype,
                                                       sensor_id=sid)
            filepath = os.path.join(CACHE_DIR, filename)
            url = ARCHIVE_URL_PATTERN.format(date=date_iso, filename=filename)
            data = retrieve(filepath,
                            url,
                            "luftdaten.info data for sensor {} on {}".format(
                                sid, date_iso),
                            read_func=pd.read_csv,
                            read_func_kwargs={"sep": ";"},
                            **retrieval_kwargs)
            if data is None:
                continue

            # Parse timestamps and make them timezone aware
            timestamps = pd.to_datetime(data["timestamp"], utc=True)

            # Reformat data according to sensor type
            data.set_index(timestamps, inplace=True)
            if self.sensor_type in ("SDS011", "HPM"):
                data = (data[["P1", "P2"]].rename(columns={
                    "P1": "pm10",
                    "P2": "pm2.5"
                }))
            elif self.sensor_type == "DHT22":
                data = data[["temperature", "humidity"]]
            else:
                raise NotImplementedError("No data parsing method implemented "
                                          "for sensor type {}".format(
                                              self.sensor_type))

            daily_data.append(data)

        # If daily data were retrieved, concatenate them to a single dataframe
        if daily_data:
            self.measurements = pd.concat(daily_data)
        else:
            self.measurements = None
            print("No data for sensor", sid)
            return

        # Remove duplicates
        duplicates = self.measurements.index.duplicated(keep="last")
        self.measurements = self.measurements[~duplicates]

        self.measurements.sort_index(inplace=True)
        self.clean_measurements()
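
The deduplication at the end keeps only the last record for each timestamp. A minimal runnable illustration of the index.duplicated(keep="last") pattern on synthetic measurements:

import pandas as pd

# Synthetic measurements with one duplicated timestamp
index = pd.to_datetime(["2019-01-01 00:00", "2019-01-01 00:05",
                        "2019-01-01 00:05"], utc=True)
measurements = pd.DataFrame({"pm10": [21.0, 35.0, 34.5]}, index=index)

# Keep only the last row for each duplicated timestamp, then sort
duplicates = measurements.index.duplicated(keep="last")
measurements = measurements[~duplicates].sort_index()
print(measurements)
#                            pm10
# 2019-01-01 00:00:00+00:00  21.0
# 2019-01-01 00:05:00+00:00  34.5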
Example #24
    def text(self, i, url, params):
        content = retrieve(url, params)
        tree = etree.HTML(content)
        t = tree.xpath("//div[@id='daily-cont']/*")
        _content = ''.join(map(lambda x: etree.tostring(x, encoding='utf-8'), t))
        self.data[i].append(_content)
Example #25
def get_data(time_series, start_date, end_date, **retrieval_kwargs):
    """Retrieve time series data.

    Args:
        time_series: time series ID as listed in Metadata.time_series
        start_date: date string in ISO 8601 format. Interpreted as UTC.
        end_date: date string like start. If the current date or a
            future date is entered, end will be truncated so that only
            complete days are downloaded.
        retrieval_kwargs: keyword arguments to pass to retrieve function

    Returns:
        Dataframe of values, indexed by hourly periods

    Raises:
        ValueError if start_date is later than end_date
    """

    # Make start and end timezone aware and truncate time values
    query_start_date = pd.to_datetime(start_date, format="%Y-%m-%d",
                                      utc=True).normalize()
    query_end_date = pd.to_datetime(end_date, format="%Y-%m-%d",
                                    utc=True).normalize()

    # Check validity of input and truncate end date if needed
    today = pd.to_datetime("today", utc=True)
    yesterday = today - pd.Timedelta(days=1)
    if query_end_date > yesterday:
        # TODO: Raise warning
        query_end_date = yesterday
        end_date = query_end_date.strftime("%Y-%m-%d")
    if query_start_date > query_end_date:
        raise ValueError("end_date must be greater than or equal to "
                         "start_date")

    # IRCELINE API takes local times. Convert start and end accordingly.
    query_start_dt = query_start_date.tz_convert("Europe/Brussels")
    query_start_dt_formatted = query_start_dt.strftime("%Y-%m-%dT%H")
    query_end_dt = query_end_date.tz_convert("Europe/Brussels")
    query_end_dt = (query_end_dt - pd.Timedelta(1, "s"))
    query_end_dt_formatted = query_end_dt.strftime("%Y-%m-%dT%H:%M:%S")

    url = DATA_URL_PATTERN.format(time_series_id=time_series,
                                  start=query_start_dt_formatted,
                                  end=query_end_dt_formatted)

    # TODO: Split response into days and cache as daily files. Also check cache
    #       day by day. Find longest missing intervals to make as few requests
    #       as possible.
    filename = (
        "irceline_{time_series_id}_{start_date}_{end_date}.json".format(
            time_series_id=time_series,
            start_date=start_date,
            end_date=end_date))
    filepath = os.path.join(CACHE_DIR, filename)

    # TODO: Check day by day if data are cached
    # Retrieve and parse data
    data = retrieve(filepath, url, "IRCELINE timeseries data",
                    **retrieval_kwargs)
    data = pd.DataFrame.from_dict(data.loc[0, "values"])

    # Convert Unix timestamps to datetimes and then to periods for index
    timestamps = pd.to_datetime(data["timestamp"], unit="ms", utc=True)
    periods = timestamps.dt.to_period(freq="h")
    data = pd.Series(data["value"].values, index=periods, dtype="float")

    return data
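
The final step converts Unix millisecond timestamps to an hourly PeriodIndex. A self-contained sketch of that conversion with made-up values (1546300800000 ms is 2019-01-01 00:00 UTC):

import pandas as pd

# Hypothetical response values: Unix ms timestamps plus readings as strings
data = pd.DataFrame({"timestamp": [1546300800000, 1546304400000],
                     "value": ["12.5", "14.0"]})

timestamps = pd.to_datetime(data["timestamp"], unit="ms", utc=True)
periods = timestamps.dt.to_period(freq="h")
series = pd.Series(data["value"].values, index=periods, dtype="float")
print(series)
# 2019-01-01 00:00    12.5
# 2019-01-01 01:00    14.0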
Example #26
    def get_measurements(self, start_date, end_date, **retrieval_kwargs):
        """Retrieve time series data.

        Args:
            start_date: date string in ISO 8601 (YYYY-MM-DD) format.
                Interpreted as UTC.
            end_date: date string like start_date. If the current date
                or a future date is entered, end will be truncated so
                that only complete days are downloaded.
            retrieval_kwargs: keyword arguments to pass to retrieve
                function

        Raises:
            ValueError if start_date is later than end_date
        """

        # Make start and end timezone aware and truncate time values
        query_start_date = pd.to_datetime(start_date, format="%Y-%m-%d",
                                          utc=True).normalize()
        query_end_date = (pd.to_datetime(end_date, format="%Y-%m-%d",
                                         utc=True).normalize()
                          + pd.Timedelta(days=1))  # To include end_date data

        # Check validity of input and truncate end date if needed
        today = pd.to_datetime("today", utc=True)
        if query_end_date > today:
            warnings.warn("Resetting end_date to yesterday")
            yesterday = today - pd.Timedelta(days=1)
            end_date = yesterday.strftime("%Y-%m-%d")
            query_end_date = today  # 00:00, to include yesterday's data
        if query_start_date > query_end_date:
            raise ValueError("end_date must be greater than or equal to "
                             "start_date")

        # IRCELINE API takes local times. Convert start and end accordingly.
        query_start_local = query_start_date.tz_convert("Europe/Brussels")
        query_start_local_str = query_start_local.strftime("%Y-%m-%dT%H")
        query_end_local = query_end_date.tz_convert("Europe/Brussels")
        query_end_local -= pd.Timedelta(1, "s")
        query_end_local_str = query_end_local.strftime("%Y-%m-%dT%H:%M:%S")

        url = (API_ENDPOINTS["data pattern"]
               .format(time_series_id=self.sensor_id,
                       start=query_start_local_str,
                       end=query_end_local_str))

        # TODO: Split response into days and cache as daily files; check cache
        #       day by day. Find longest missing intervals to make as few
        #       requests as possible.
        filename = ("irceline_{time_series_id}_{start_date}_{end_date}.json"
                    .format(time_series_id=self.sensor_id,
                            start_date=start_date, end_date=end_date))
        filepath = os.path.join(CACHE_DIR, filename)

        # TODO: Check day by day if data are cached
        # Retrieve and parse data
        data = retrieve(filepath, url, "IRCELINE timeseries data",
                        **retrieval_kwargs)
        data = pd.DataFrame.from_dict(data.loc[0, "values"])
        if len(data) == 0:
            return
        data["value"] = data["value"].astype("float")
        data = data.rename(columns={"value": self.metadata["phenomenon"]})

        # Convert Unix timestamps to datetimes and then to periods for index
        data.index = (pd.to_datetime(data["timestamp"], unit="ms", utc=True)
                      .dt.to_period(freq="h"))
        data.index.name = "Period"
        data = data.drop(columns=["timestamp"])

        self.measurements = data
Example #27
        if not window2:
            window2 = make_win2()

        # Tells cprint which widget element to print the colored text in
        sg.cprint_set_output_destination(window2, '-WIN2 TEXT-')

        #Clear output box
        window2['-WIN2 TEXT-']('')

        #Get user query
        query = values['-QUERY-']

        #Get the list of pdfs/txt_files user has chosen
        files = values['-FILE LIST-']

        output_sents_zipped, num_sents_found = retrieve(files, query, pdf_obj, txt_obj)

        #Display the legend
        for i, file in enumerate(files):
            # Create legend denoting which color corresponds to which document
            window2['-LEGEND-'].print(file, end='', background_color=background_colors[i])
            window2['-LEGEND-'].print('\n', end='')

        #Print the color-coded output sentences
        print_output_sents(output_sents_zipped, files, background_colors, cprint)

        #Print the number of sentences that were found
        window2['-WIN2 NUM SENTS-'].update(f"Found {num_sents_found} sentences containing '{query}'.")


        #For word2vec
Example #28
                     type=str,
                     help="result filename",
                     required=True)
    return ret


if __name__ == "__main__":
    ap = make_arg_parser()
    args = ap.parse_args()

    fst_repo_files = {}
    snd_repo_files = {}
    for target, source in zip((fst_repo_files, snd_repo_files),
                              (args.repo_dir1, args.repo_dir2)):
        p = parser.Parser()
        strbuffer = utils.retrieve(
            os.path.join(source, "repodata", "repomd.xml"))
        parsed = p.parse_str(strbuffer.getvalue())
        for data in parsed.get("repomd.data").objects:
            url = data.get("location.href")
            _type = data.get("type")
            target[_type] = os.path.join(source, url)

    missing_in_1 = set(fst_repo_files) - set(snd_repo_files)
    missing_in_2 = set(snd_repo_files) - set(fst_repo_files)
    print "missing types in 1 repo %s" % ",".join(missing_in_1)
    print "missing types in 2 repo %s" % ",".join(missing_in_2)
    common = set(fst_repo_files) & set(snd_repo_files)
    if not os.path.exists(args.destdir):
        os.mkdir(args.destdir)

    for _type in common: