Example #1
def get_indicators():
    '''Download information about all World Bank data series
    '''
    url = 'http://api.worldbank.org/indicators?per_page=50000&format=json'
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    # Clean fields
    data.source = [x['value'] for x in data.source]
    fun = lambda x: x.encode('ascii', 'ignore')
    data.sourceOrganization = data.sourceOrganization.apply(fun)
    # Clean topic field

    def get_value(x):
        try:
            return x['value']
        except:
            return ''
    fun = lambda x: [get_value(y) for y in x]
    data.topics = data.topics.apply(fun)
    data.topics = data.topics.apply(lambda x: ' ; '.join(x))
    # Clean output
    data = data.sort(columns='id')
    data.index = pandas.Index(lrange(data.shape[0]))
    return data
Example #2
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if _is_url(io):
        try:
            with urlopen(io) as url:
                raw_text = url.read()
        except urllib2.URLError:
            raise ValueError('Invalid URL: "{0}"'.format(io))
    elif hasattr(io, "read"):
        raw_text = io.read()
    elif os.path.isfile(io):
        with open(io) as f:
            raw_text = f.read()
    elif isinstance(io, basestring):
        raw_text = io
    else:
        raise TypeError("Cannot read object of type " "'{0.__class__.__name__!r}'".format(io))
    return raw_text
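A minimal sketch of how the dispatch above resolves, assuming _read and its imports are in scope; the raw-string branch is the only one that needs no network or file access.

# Hypothetical inputs; the file name and URL are placeholders.
html_blob = "<table><tr><td>1</td></tr></table>"
print(_read(html_blob))                  # hits the isinstance(io, basestring) branch
# _read("tables.html") would hit the os.path.isfile branch if that file exists,
# and _read("http://example.com/t.html") the _is_url branch (network required).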
Example #3
def get_elements_from_url(url, element='table', base_url="file://"):
    _skip_if_none_of(('bs4', 'html5lib'))
    url = "".join([base_url, url])
    from bs4 import BeautifulSoup
    with urlopen(url) as f:
        soup = BeautifulSoup(f, features='html5lib')
    return soup.find_all(element)
Example #4
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if _is_url(io):
        with urlopen(io) as url:
            raw_text = url.read()
    elif hasattr(io, 'read'):
        raw_text = io.read()
    elif os.path.isfile(io):
        with open(io) as f:
            raw_text = f.read()
    elif isinstance(io, compat.string_types):
        raw_text = io
    else:
        raise TypeError("Cannot read object of type "
                        "'{0.__class__.__name__!r}'".format(io))
    return raw_text
Example #5
def dump_as_gist(data, desc="The Commit", njobs=None):
    host, njobs2 = get_travis_data()[:2]

    if njobs:  # be slightly more reliable
        njobs = max(njobs, njobs2)

    content = dict(version="0.1.1",
                   timings=data,
                   datetime=get_utcdatetime(),   # added in 0.1.1
                   hostname=host,   # added in 0.1.1
                   njobs=njobs    # added in 0.1.1, a measure of load on the travis box
                   )

    payload = dict(description=desc,
                   public=True,
                   files={'results.json': dict(content=json.dumps(content))})
    try:
        with closing(urlopen("https://api.github.com/gists",
                             json.dumps(payload), timeout=WEB_TIMEOUT)) as r:
            if 200 <= r.getcode() < 300:
                print("\n\n" + "-" * 80)

                gist = json.loads(r.read())
                file_raw_url = list(gist['files'].items())[0][1]['raw_url']
                print("[vbench-gist-raw_url] %s" % file_raw_url)
                print("[vbench-html-url] %s" % gist['html_url'])
                print("[vbench-api-url] %s" % gist['url'])

                print("-" * 80 + "\n\n")
            else:
                print("api.github.com returned status %d" % r.getcode())
    except:
        print("Error occured while dumping to gist")
Example #6
def _read(obj):
    """Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if _is_url(obj):
        with urlopen(obj) as url:
            text = url.read()
    elif hasattr(obj, 'read'):
        text = obj.read()
    elif isinstance(obj, char_types):
        text = obj
        try:
            if os.path.isfile(text):
                with open(text, 'rb') as f:
                    return f.read()
        except (TypeError, ValueError):
            pass
    else:
        raise TypeError("Cannot read object of type %r" % type(obj).__name__)
    return text
Example #7
def _retry_read_url(url, retry_count, pause, name):
    """
    Open url (and retry)
    """
    for _ in range(retry_count):
        time.sleep(pause)

        # kludge to close the socket ASAP
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                          parse_dates=True, na_values='-')[::-1]
            # Yahoo! Finance sometimes does this awesome thing where they
            # return 2 rows for the most recent business day
            if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
                rs = rs[:-1]

            #Get rid of unicode characters in index name.
            try:
                rs.index.name = rs.index.name.decode('unicode_escape').encode('ascii', 'ignore')
            except AttributeError:
                #Python 3 string has no decode method.
                rs.index.name = rs.index.name.encode('ascii', 'ignore').decode()

            return rs

    raise IOError("after %d tries, %s did not "
                  "return a 200 for url %r" % (retry_count, name, url))
Example #8
def get_indicators():
    """Download information about all World Bank data series
    """
    url = "http://api.worldbank.org/indicators?per_page=50000&format=json"
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    # Clean fields
    data.source = [x["value"] for x in data.source]
    fun = lambda x: x.encode("ascii", "ignore")
    data.sourceOrganization = data.sourceOrganization.apply(fun)
    # Clean topic field

    def get_value(x):
        try:
            return x["value"]
        except:
            return ""

    fun = lambda x: [get_value(y) for y in x]
    data.topics = data.topics.apply(fun)
    data.topics = data.topics.apply(lambda x: " ; ".join(x))
    # Clean output
    data = data.sort(columns="id")
    data.index = pandas.Index(lrange(data.shape[0]))
    return data
Example #9
def get_elements_from_file(url, element='table'):
    _skip_if_none_of(('bs4', 'html5lib'))
    url = file_path_to_url(url)
    from bs4 import BeautifulSoup
    with urlopen(url) as f:
        soup = BeautifulSoup(f, features='html5lib')
    return soup.find_all(element)
Example #10
def get_data_famafrench(name):
    # path of zip files
    zip_file_path = '{0}/{1}.zip'.format(_FAMAFRENCH_URL, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).readlines()

    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

    datasets = {}
    edges = zip(file_edges + 1, file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(lmap(len, dataset))
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
                                                                     start=1)]
            index = np.array([d[0] for d in ds_header], dtype=int)
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
Example #11
def get_dividends_yahoo(sid, start, end):
    # Taken from get_data_yahoo in the pandas library, with a single parameter adjusted to get dividends
    from pandas.compat import StringIO, bytes_to_str
    from pandas.io.common import urlopen

    start, end = pd.to_datetime(start), pd.to_datetime(end)
    url = ('http://ichart.finance.yahoo.com/table.csv?' + 's=%s' % sid +
           '&a=%s' % (start.month - 1) +
           '&b=%s' % start.day +
           '&c=%s' % start.year +
           '&d=%s' % (end.month - 1) +
           '&e=%s' % end.day +
           '&f=%s' % end.year +
           '&g=v' +  # THE CHANGE
           '&ignore=.csv')

    with urlopen(url) as resp:
        lines = resp.read()
    rs = pd.read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                     parse_dates=True, na_values='-')[::-1]
    # Yahoo! Finance sometimes does this awesome thing where they
    # return 2 rows for the most recent business day
    if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
        rs = rs[:-1]
    return rs
Example #12
def get_vbench_log(build_url):
    with urlopen(build_url) as r:
        if not (200 <= r.getcode() < 300):
            return

        s = json.loads(r.read())
        s = [x for x in s['matrix'] if "VBENCH" in ((x.get('config', {})
                                                    or {}).get('env', {}) or {})]
        # s = [x for x in s['matrix']]
        if not s:
            return
        id = s[0]['id']  # should be just one for now
        with urlopen("https://api.travis-ci.org/jobs/%s" % id) as r2:
            if not 200 <= r.getcode() < 300:
                return
            s2 = json.loads(r2.read())
            return s2.get('log')
Example #13
def get_elements_from_file(url, element="table"):
    _skip_if_none_of(("bs4", "html5lib"))
    url = file_path_to_url(url)
    from bs4 import BeautifulSoup

    with urlopen(url) as f:
        soup = BeautifulSoup(f, features="html5lib")
    return soup.find_all(element)
Example #14
def _get_page(page_number):
    gh_url = ('https://api.github.com/repos/pydata/pandas/issues?'
              'milestone=*&state=closed&assignee=*&page=%d') % page_number
    with urlopen(gh_url) as resp:
        rs = resp.readlines()[0]
    jsondata = json.loads(rs)
    issues = [Issue(x['title'], x['labels'], x['number'],
                    get_milestone(x['milestone']), x['body'], x['state'])
              for x in jsondata]
    return issues
Example #15
def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country="US", start=2002, end=2005):

    if type(country) == str:
        country = [country]

    countries = ";".join(country)

    # Build URL for api call
    url = (
        "http://api.worldbank.org/countries/"
        + countries
        + "/indicators/"
        + indicator
        + "?date="
        + str(start)
        + ":"
        + str(end)
        + "&per_page=25000&format=json"
    )

    # Download
    with urlopen(url) as response:
        data = response.read()

    # Check to see if there is a possible problem
    possible_message = json.loads(data)[0]
    if "message" in possible_message.keys():
        msg = possible_message["message"][0]
        try:
            msg = msg["key"].split() + ["\n "] + msg["value"].split()
            wb_err = " ".join(msg)
        except:
            wb_err = ""
            if "key" in msg.keys():
                wb_err = msg["key"] + "\n "
            if "value" in msg.keys():
                wb_err += msg["value"]
        error_msg = "Problem with a World Bank Query \n %s"
        return None, error_msg % wb_err

    if "total" in possible_message.keys():
        if possible_message["total"] == 0:
            return None, "No results from world bank."

    # Parse JSON file
    data = json.loads(data)[1]
    country = [x["country"]["value"] for x in data]
    iso_code = [x["country"]["id"] for x in data]
    year = [x["date"] for x in data]
    value = [x["value"] for x in data]
    # Prepare output
    out = pandas.DataFrame([country, iso_code, year, value]).T
    out.columns = ["country", "iso_code", "year", indicator]
    return out, "Success"
Example #16
def fetch_data(url, name):
    with urlopen(url) as resp:
        data = read_csv(
            resp, index_col=0, parse_dates=True, header=None, skiprows=1, names=["DATE", name], na_values="."
        )
    try:
        return data.truncate(start, end)
    except KeyError:
        if data.ix[3].name[7:12] == "Error":
            raise IOError("Failed to get the data. Check that {0!r} is " "a valid FRED series.".format(name))
        raise
Example #17
def _download_data_famafrench(name):
    url = "".join([_URL, _URL_PREFIX, name, _URL_SUFFIX])
    with urlopen(url) as socket:
        raw = socket.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, "r") as zf:
            data = zf.open(zf.namelist()[0]).read().decode()

    return data
Example #18
def get_components_yahoo(idx_sym):
    """
    Returns DataFrame containing list of component information for
    index represented in idx_sym from yahoo. Includes component symbol
    (ticker), exchange, and name.

    Parameters
    ----------
    idx_sym : str
        Stock index symbol
        Examples:
        '^DJI' (Dow Jones Industrial Average)
        '^NYA' (NYSE Composite)
        '^IXIC' (NASDAQ Composite)

        See: http://finance.yahoo.com/indices for other index symbols

    Returns
    -------
    idx_df : DataFrame
    """
    stats = 'snx'
    # URL of form:
    # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
    url = ('http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}'
           '&e=.csv&h={2}')

    idx_mod = idx_sym.replace('^', '@%5E')
    url_str = url.format(idx_mod, stats, 1)

    idx_df = DataFrame()
    mask = [True]
    comp_idx = 1

    # LOOP across component index structure,
    # break when no new components are found
    while True in mask:
        url_str = url.format(idx_mod, stats,  comp_idx)
        with urlopen(url_str) as resp:
            raw = resp.read()
        lines = raw.decode('utf-8').strip().strip('"').split('"\r\n"')
        lines = [line.strip().split('","') for line in lines]

        temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange'])
        temp_df = temp_df.drop_duplicates()
        temp_df = temp_df.set_index('ticker')
        mask = ~temp_df.index.isin(idx_df.index)

        comp_idx = comp_idx + 50
        idx_df = idx_df.append(temp_df[mask])

    return idx_df
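A hedged usage sketch for the function above; the download.finance.yahoo.com CSV endpoint it pages through was retired long ago, so treat this as an illustration of the return shape only.

# '^DJI' is the Dow Jones Industrial Average, as listed in the docstring.
try:
    dow = get_components_yahoo('^DJI')
    print(dow.head())            # index: ticker, columns: name, exchange
except Exception as exc:         # the endpoint may simply no longer respond
    print("component lookup failed:", exc)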
Example #19
File: wb.py Project: jaidevd/pandas
def get_countries():
    """Query information about countries
    """
    url = "http://api.worldbank.org/countries/?per_page=1000&format=json"
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x["value"] for x in data.adminregion]
    data.incomeLevel = [x["value"] for x in data.incomeLevel]
    data.lendingType = [x["value"] for x in data.lendingType]
    data.region = [x["value"] for x in data.region]
    data = data.rename(columns={"id": "iso3c", "iso2Code": "iso2c"})
    return data
Example #20
File: wb.py Project: APWaldo/pandas
def get_countries():
    '''Query information about countries
    '''
    url = 'http://api.worldbank.org/countries/all?format=json'
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x['value'] for x in data.adminregion]
    data.incomeLevel = [x['value'] for x in data.incomeLevel]
    data.lendingType = [x['value'] for x in data.lendingType]
    data.region = [x['value'] for x in data.region]
    data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'})
    return data
Example #21
def xsg_data(year=None, month=None, 
            retry_count=3, pause=0.001):
    """
    获取限售股解禁数据
    Parameters
    --------
    year:年份,默认为当前年
    month:解禁月份,默认为当前月
    retry_count : int, 默认 3
                 如遇网络等问题重复执行的次数 
    pause : int, 默认 0
                重复请求数据过程中暂停的秒数,防止请求间隔时间太短出现的问题
    
    Return
    ------
    DataFrame
    code:股票代码
    name:名称
    date:解禁日期
    count:解禁数量(万股)
    ratio:占总盘比率
    """
    year = dt.get_year() if year is None else year
    month = dt.get_month() if month is None else month
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            with urlopen(rv.XSG_URL%(ct.P_TYPE['http'], ct.DOMAINS['em'],
                                     ct.PAGES['emxsg'], year, month)) as resp:
                lines = resp.read()
                lines = lines.decode('utf-8') if ct.PY3 else lines
        except _network_error_classes:
            pass
        else:
            da = lines[3:len(lines)-3]
            rows = []
            for row in da.split('","'):
                rows.append([data for data in row.split(',')])
            df = pd.DataFrame(rows)
            df = df[[1, 3, 4, 5, 6]]
            for col in [5, 6]:
                df[col] = df[col].astype(float)
            df[5] = df[5]/10000
            df[6] = df[6]*100
            df[5] = df[5].map(ct.FORMAT)
            df[6] = df[6].map(ct.FORMAT)
            df.columns = rv.XSG_COLS
            return df
    raise IOError("获取失败,请检查网络和URL")   
Example #22
def convert_json_to_df(results_url):
    """retrieve json results file from url and return df

    df contains timings for all successful vbenchmarks
    """

    with urlopen(results_url) as resp:
        res = json.loads(resp.read())
    timings = res.get("timings")
    if not timings:
        return
    res = [x for x in timings if x.get('succeeded')]
    df = pd.DataFrame(res)
    df = df.set_index("name")
    return df
Example #23
def guba_sina(show_content=False):
    from pandas.io.common import urlopen
    try:
        html = lxml.html.parse(nv.GUBA_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['sina']))
#         res = html.xpath('//div[@class=\"topNav\"]/div')
#         print res
#         return ''
    
        with urlopen(nv.GUBA_SINA_URL%(ct.P_TYPE['http'],
                                       ct.DOMAINS['sina'])) as resp:
            lines = resp.read()
        print(lines)
    except Exception as er:
        print(str(er))
    
    pass
Example #24
def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US',
              start=2002, end=2005):

    if type(country) == str:
        country = [country]

    countries = ';'.join(country)

    # Build URL for api call
    url = ("http://api.worldbank.org/countries/" + countries + "/indicators/" +
           indicator + "?date=" + str(start) + ":" + str(end) +
           "&per_page=25000&format=json")

    # Download
    with urlopen(url) as response:
        data = response.read()

    # Check to see if there is a possible problem
    possible_message = json.loads(data)[0]
    if 'message' in possible_message.keys():
        msg = possible_message['message'][0]
        try:
            msg = msg['key'].split() + ["\n "] + msg['value'].split()
            wb_err = ' '.join(msg)
        except:
            wb_err = ""
            if 'key' in msg.keys():
                wb_err = msg['key'] + "\n "
            if 'value' in msg.keys():
                wb_err += msg['value']
        error_msg = "Problem with a World Bank Query \n %s"
        return None, error_msg % wb_err

    if 'total' in possible_message.keys():
        if possible_message['total'] == 0:
            return None, "No results from world bank."

    # Parse JSON file
    data = json.loads(data)[1]
    country = [x['country']['value'] for x in data]
    iso_code = [x['country']['id'] for x in data]
    year = [x['date'] for x in data]
    value = [x['value'] for x in data]
    # Prepare output
    out = pandas.DataFrame([country, iso_code, year, value]).T
    out.columns = ['country', 'iso_code', 'year', indicator]
    return out,"Success"
Example #25
    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r
Example #26
def _read(obj):
    if _is_url(obj):
        with urlopen(obj) as url:
            text = url.read()
    elif hasattr(obj, 'read'):
        text = obj.read()
    elif isinstance(obj, char_types):
        text = obj
        try:
            if os.path.isfile(text):
                with open(text, 'rb') as f:
                    return f.read()
        except (TypeError, ValueError):
            pass
    else:
        raise TypeError("Cannot read object of type %r" % type(obj).__name__)
    return text
Example #27
    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError
        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if _is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError) as e:
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r
Example #28
def guba_sina(show_content=False):
    from pandas.io.common import urlopen
    try:
        html = lxml.html.parse(nv.GUBA_SINA_URL %
                               (ct.P_TYPE['http'], ct.DOMAINS['sina']))
        #         res = html.xpath('//div[@class=\"topNav\"]/div')
        #         print res
        #         return ''

        with urlopen(nv.GUBA_SINA_URL %
                     (ct.P_TYPE['http'], ct.DOMAINS['sina'])) as resp:
            lines = resp.read()
        print(lines)
    except Exception as er:
        print(str(er))

    pass
Example #29
def _get_data(symbols):
    """
    Get current yahoo quote

    Returns a DataFrame
    """
    if isinstance(symbols, compat.string_types):
        sym_list = symbols
    else:
        sym_list = '+'.join(symbols)

    # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
    request = ''.join(compat.itervalues(_yahoo_codes))  # code request string
    header = list(_yahoo_codes.keys())

    data = defaultdict(list)

    params = {
        's': sym_list,
        'f': request
    }
    url = _encode_url(_URL, params)

    with urlopen(url) as response:
        lines = response.readlines()

    def line_gen(lines):
        for line in lines:
            yield line.decode('utf-8').strip()

    for line in csv.reader(line_gen(lines)):
        for i, field in enumerate(line):
            if field[-2:] == '%"':
                v = float(field.strip('"%'))
            elif field[0] == '"':
                v = field.strip('"')
            else:
                try:
                    v = float(field)
                except ValueError:
                    v = field
            data[header[i]].append(v)

    idx = data.pop('symbol')
    return DataFrame(data, index=idx)
Example #30
File: wb.py Project: APWaldo/pandas
def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US',
              start=2002, end=2005):
    # Build URL for api call
    url = "http://api.worldbank.org/countries/" + country + "/indicators/" + \
        indicator + "?date=" + str(start) + ":" + str(end) + "&per_page=25000" + \
        "&format=json"
    # Download
    with urlopen(url) as response:
        data = response.read()
    # Parse JSON file
    data = json.loads(data)[1]
    country = [x['country']['value'] for x in data]
    iso2c = [x['country']['id'] for x in data]
    year = [x['date'] for x in data]
    value = [x['value'] for x in data]
    # Prepare output
    out = pandas.DataFrame([country, iso2c, year, value]).T
    return out
Example #31
def get_hist_data(code=None, start=None, end=None, retry_count=3,
                   pause=0.001):
    """
        获取个股历史交易记录
    Parameters
    ------
      code:string
                  股票代码 e.g. 600848
      start:string
                  开始日期 format:YYYY-MM-DD 为空时取到API所提供的最早日期数据
      end:string
                  结束日期 format:YYYY-MM-DD 为空时取到最近一个交易日数据
      retry_count : int, 默认 3
                 如遇网络等问题重复执行的次数 Number of times to retry query request.
      pause : int, 默认 0
                重复请求数据过程中暂停的秒数,防止请求间隔时间太短出现的问题
    return
    -------
      DataFrame
          属性:日期 ,开盘价, 最高价, 收盘价, 最低价, 成交量, 价格变动 ,涨跌幅,5日均价,10日均价,20日均价,5日均量,10日均量,20日均量,换手率
    """
    if code is None or len(code)!=6:
        return None
    symbol = code_to_symbol(code)
    url = ct.DAY_PRICE_URL%(ct.P_TYPE['http'],ct.DOMAINS['ifeng'],symbol)
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            js = json.loads(lines)
            df = pd.DataFrame(js['record'],columns=ct.DAY_PRICE_COLUMNS)
            df = df.applymap(lambda x: x.replace(u',', u''))  # strip thousands separators
            df = df.drop('price_change',axis=1)
            df = df.set_index(['date']) 
            if start is not None:
                df = df.ix[df.index>=start]
            if end is not None:
                df = df.ix[df.index<=end]
            return df
    raise IOError("%s获取失败,请检查网络和URL:%s" % (code, url))
Example #32
def get_countries():
    '''Query information about countries
    
    Provides information such as: 
        country code, region, income level, capital city, latitude and longitude
    '''
    url = 'http://api.worldbank.org/countries/?per_page=1000&format=json'
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x['value'] for x in data.adminregion]
    data.incomeLevel = [x['value'] for x in data.incomeLevel]
    data.lendingType = [x['value'] for x in data.lendingType]
    data.region = [x['value'] for x in data.region]
    data.latitude = [float(x) if x != "" else np.nan for x in data.latitude]
    data.longitude = [float(x) if x != "" else np.nan for x in data.longitude]
    data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'})
    return data
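The cleaned frame above behaves like any other DataFrame; a small sketch, assuming the World Bank endpoint responds and that the column and label names below match its payload.

countries = get_countries()
# Keep the high-income group and look at capital coordinates
# ('High income' label prefix and the name/capitalCity columns are assumptions).
rich = countries[countries.incomeLevel.str.startswith('High income')]
print(rich[['iso2c', 'name', 'capitalCity', 'latitude', 'longitude']].head())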
Example #33
def _holding_cotent(start, end, pageNo, retry_count, pause):
    url = rv.FUND_HOLDS_URL % (ct.P_TYPE['http'], ct.DOMAINS['163'],
                               ct.PAGES['163fh'], ct.PAGES['163fh'], pageNo,
                               start, end, _random(5))
    for _ in range(retry_count):
        time.sleep(pause)
        if pageNo > 0:
            print(rv.DP_MSG % pageNo)
        try:
            with urlopen(url) as resp:
                lines = resp.read()
                lines = lines.replace('--', '0')
                lines = json.loads(lines)
                data = lines['list']
                df = pd.DataFrame(data)
                df = df.drop([
                    'CODE', 'ESYMBOL', 'EXCHANGE', 'NAME', 'RN',
                    'SHANGQIGUSHU', 'SHANGQISHIZHI', 'SHANGQISHULIANG'
                ],
                             axis=1)
                for col in ['GUSHU', 'GUSHUBIJIAO', 'SHIZHI', 'SCSTC27']:
                    df[col] = df[col].astype(float)
                df['SCSTC27'] = df['SCSTC27'] * 100
                df['GUSHU'] = df['GUSHU'] / 10000
                df['GUSHUBIJIAO'] = df['GUSHUBIJIAO'] / 10000
                df['SHIZHI'] = df['SHIZHI'] / 10000
                df['GUSHU'] = df['GUSHU'].map(ct.FORMAT)
                df['GUSHUBIJIAO'] = df['GUSHUBIJIAO'].map(ct.FORMAT)
                df['SHIZHI'] = df['SHIZHI'].map(ct.FORMAT)
                df['SCSTC27'] = df['SCSTC27'].map(ct.FORMAT)
                df.columns = rv.FUND_HOLDS_COLS
                df = df[[
                    'code', 'name', 'date', 'nums', 'nlast', 'count', 'clast',
                    'amount', 'ratio'
                ]]
        except _network_error_classes:
            pass
        else:
            if pageNo == 0:
                return df, int(lines['pagecount'])
            else:
                return df
    raise IOError("获取失败,请检查网络和URL:%s" % url)
Example #34
def _retry_read_url(url, retry_count, pause, name):
    for _ in range(retry_count):
        time.sleep(pause)

        # kludge to close the socket ASAP
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0, parse_dates=True)[::-1]
            # Yahoo! Finance sometimes does this awesome thing where they
            # return 2 rows for the most recent business day
            if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
                rs = rs[:-1]
            return rs

    raise IOError("after %d tries, %s did not " "return a 200 for url %r" % (retry_count, name, url))
Example #35
def get_countries():
    """Query information about countries
    
    Provides information such as: 
        country code, region, income level, capital city, latitude and longitude
    """
    url = "http://api.worldbank.org/countries/?per_page=1000&format=json"
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x["value"] for x in data.adminregion]
    data.incomeLevel = [x["value"] for x in data.incomeLevel]
    data.lendingType = [x["value"] for x in data.lendingType]
    data.region = [x["value"] for x in data.region]
    data.latitude = [float(x) if x != "" else np.nan for x in data.latitude]
    data.longitude = [float(x) if x != "" else np.nan for x in data.longitude]
    data = data.rename(columns={"id": "iso3c", "iso2Code": "iso2c"})
    return data
Example #36
    def __init__(self, filepath_or_buffer):
        # If filepath_or_buffer is a url, load the data into a BytesIO
        if _is_url(filepath_or_buffer):
            filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
        elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)

        if isinstance(filepath_or_buffer, self._workbook_class):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            filepath_or_buffer.seek(0)
            self.book = self.load_workbook(filepath_or_buffer)
        elif isinstance(filepath_or_buffer, str):
            self.book = self.load_workbook(filepath_or_buffer)
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )
Example #37
def guba_sina(show_content=False):
    """
       获取sina财经股吧首页的重点消息
    Parameter
    --------
        show_content:是否显示内容,默认False
    
    Return
    --------
    DataFrame
        title, 消息标题
        content, 消息内容(show_content=True的情况下)
        ptime, 发布时间
        rcounts,阅读次数
    """

    from pandas.io.common import urlopen
    try:
        with urlopen(nv.GUBA_SINA_URL %
                     (ct.P_TYPE['http'], ct.DOMAINS['sina'])) as resp:
            lines = resp.read()
        html = lxml.html.document_fromstring(lines)
        res = html.xpath('//ul[@class=\"list_05\"]/li')
        heads = html.xpath('//div[@class=\"tit_04\"]')
        data = []
        for head in heads[:1]:
            title = head.xpath('a/text()')[0]
            url = head.xpath('a/@href')[0]
            ds = [title]
            ds.extend(_guba_content(url))
            data.append(ds)
        for row in res:
            title = row.xpath('a[2]/text()')[0]
            url = row.xpath('a[2]/@href')[0]
            ds = [title]
            ds.extend(_guba_content(url))
            data.append(ds)
        df = pd.DataFrame(data, columns=nv.GUBA_SINA_COLS)
        df['rcounts'] = df['rcounts'].astype(float)
        return df if show_content is True else df.drop('content', axis=1)
    except Exception as er:
        print(str(er))
Example #38
def guba_sina(show_content=False):
    """
       获取sina财经股吧首页的重点消息
    Parameter
    --------
        show_content:是否显示内容,默认False
    
    Return
    --------
    DataFrame
        title, 消息标题
        content, 消息内容(show_content=True的情况下)
        ptime, 发布时间
        rcounts,阅读次数
    """
    
    from pandas.io.common import urlopen
    try:
        with urlopen(nv.GUBA_SINA_URL%(ct.P_TYPE['http'],
                                       ct.DOMAINS['sina'])) as resp:
            lines = resp.read()
        html = lxml.html.document_fromstring(lines)
        res = html.xpath('//ul[@class=\"list_05\"]/li')
        heads = html.xpath('//div[@class=\"tit_04\"]')
        data = []
        for head in heads[:1]:
            title = head.xpath('a/text()')[0]
            url = head.xpath('a/@href')[0]
            ds = [title]
            ds.extend(_guba_content(url))
            data.append(ds)
        for row in res:
            title = row.xpath('a[2]/text()')[0]
            url = row.xpath('a[2]/@href')[0]
            ds = [title]
            ds.extend(_guba_content(url))
            data.append(ds)
        df = pd.DataFrame(data, columns=nv.GUBA_SINA_COLS)
        df['rcounts'] = df['rcounts'].astype(float)
        return df if show_content is True else df.drop('content', axis=1)
    except Exception as er:
        print(str(er))
Example #39
def _get_data(indicator="NY.GNS.ICTR.GN.ZS",
              country='US',
              start=2002,
              end=2005):
    # Build URL for api call
    url = ("http://api.worldbank.org/countries/" + country + "/indicators/" +
           indicator + "?date=" + str(start) + ":" + str(end) +
           "&per_page=25000&format=json")
    # Download
    with urlopen(url) as response:
        data = response.read()
    # Parse JSON file
    data = json.loads(data)[1]
    country = [x['country']['value'] for x in data]
    iso2c = [x['country']['id'] for x in data]
    year = [x['date'] for x in data]
    value = [x['value'] for x in data]
    # Prepare output
    out = pandas.DataFrame([country, iso2c, year, value]).T
    return out
Example #40
def _retry_read_url(url, retry_count, pause, name):
    for _ in range(retry_count):
        time.sleep(pause)

        # kludge to close the socket ASAP
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                          parse_dates=True)[::-1]
            # Yahoo! Finance sometimes does this awesome thing where they
            # return 2 rows for the most recent business day
            if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
                rs = rs[:-1]
            return rs

    raise IOError("after %d tries, %s did not "
                  "return a 200 for url %r" % (retry_count, name, url))
Example #41
def can_connect(url, error_classes=_network_error_classes):
    """Try to connect to the given url. True if succeeds, False if IOError
    raised

    Parameters
    ----------
    url : basestring
        The URL to try to connect to

    Returns
    -------
    connectable : bool
        Return True if no IOError (unable to connect) or URLError (bad url) was
        raised
    """
    try:
        with urlopen(url):
            pass
    except error_classes:
        return False
    else:
        return True
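A sketch of the guard pattern this helper enables in network-dependent tests; the unittest-based skip mechanism here is an assumption, not part of the original module.

import unittest

class RemoteCSVTest(unittest.TestCase):
    def test_remote_csv(self):
        url = "http://example.com/data.csv"          # placeholder URL
        if not can_connect(url):                     # skip rather than fail when offline
            raise unittest.SkipTest("cannot connect to %s" % url)
        # ... network-dependent assertions would go here ...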
Example #42
def get_quote_yahoo(symbols):
    """
    Get current yahoo quote

    Returns a DataFrame
    """
    if isinstance(symbols, basestring):
        sym_list = symbols
    else:
        sym_list = '+'.join(symbols)

    # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
    request = ''.join(_yahoo_codes.itervalues())  # code request string
    header = _yahoo_codes.keys()

    data = defaultdict(list)

    url_str = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % (sym_list,
                                                                   request)

    with urlopen(url_str) as url:
        lines = url.readlines()

    for line in lines:
        fields = line.decode('utf-8').strip().split(',')
        for i, field in enumerate(fields):
            if field[-2:] == '%"':
                v = float(field.strip('"%'))
            elif field[0] == '"':
                v = field.strip('"')
            else:
                try:
                    v = float(field)
                except ValueError:
                    v = np.nan
            data[header[i]].append(v)

    idx = data.pop('symbol')
    return DataFrame(data, index=idx)
Example #43
def get_travis_data():
    """figure out what worker we're running on,  and the number of jobs it's running
    """
    import os
    jobid = os.environ.get("TRAVIS_JOB_ID")
    if not jobid:
        return None, None

    with urlopen("https://api.travis-ci.org/workers/") as resp:
        workers = json.loads(resp.read())

    host = njobs = None
    for item in workers:
        host = item.get("host")
        id = ((item.get("payload") or {}).get("job") or {}).get("id")
        if id and str(id) == str(jobid):
            break
        if host:
            njobs = len(
                [x for x in workers if host in x['host'] and x['payload']])

    return host, njobs
Example #44
def can_connect(url, error_classes=_network_error_classes):
    """Try to connect to the given url. True if succeeds, False if IOError
    raised

    Parameters
    ----------
    url : basestring
        The URL to try to connect to

    Returns
    -------
    connectable : bool
        Return True if no IOError (unable to connect) or URLError (bad url) was
        raised
    """
    try:
        with urlopen(url):
            pass
    except error_classes:
        return False
    else:
        return True
Example #45
    def __init__(self,
                 filepath_or_buffer,
                 storage_options: StorageOptions = None):
        self.ioargs = IOArgs(
            filepath_or_buffer=filepath_or_buffer,
            encoding=None,
            mode=None,
            compression={"method": None},
        )
        # If filepath_or_buffer is a url, load the data into a BytesIO
        if is_url(filepath_or_buffer):
            self.ioargs = IOArgs(
                filepath_or_buffer=BytesIO(urlopen(filepath_or_buffer).read()),
                should_close=True,
                encoding=None,
                mode=None,
                compression={"method": None},
            )
        elif not isinstance(filepath_or_buffer,
                            (ExcelFile, self._workbook_class)):
            self.ioargs = get_filepath_or_buffer(
                filepath_or_buffer, storage_options=storage_options)

        if isinstance(self.ioargs.filepath_or_buffer, self._workbook_class):
            self.book = self.ioargs.filepath_or_buffer
        elif hasattr(self.ioargs.filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            assert not isinstance(self.ioargs.filepath_or_buffer, str)
            self.ioargs.filepath_or_buffer.seek(0)
            self.book = self.load_workbook(self.ioargs.filepath_or_buffer)
        elif isinstance(self.ioargs.filepath_or_buffer, str):
            self.book = self.load_workbook(self.ioargs.filepath_or_buffer)
        elif isinstance(self.ioargs.filepath_or_buffer, bytes):
            self.book = self.load_workbook(
                BytesIO(self.ioargs.filepath_or_buffer))
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )
Example #46
def _retry_read_url(url, retry_count, pause, name):
    """
    Open url (and retry)
    """
    for _ in range(retry_count):

        # kludge to close the socket ASAP
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            rs = read_csv(StringIO(bytes_to_str(lines)),
                          index_col=0,
                          parse_dates=True,
                          na_values='-')[::-1]
            # Yahoo! Finance sometimes does this awesome thing where they
            # return 2 rows for the most recent business day
            if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
                rs = rs[:-1]

            #Get rid of unicode characters in index name.
            try:
                rs.index.name = rs.index.name.decode('unicode_escape').encode(
                    'ascii', 'ignore')
            except AttributeError:
                #Python 3 string has no decode method.
                rs.index.name = rs.index.name.encode('ascii',
                                                     'ignore').decode()

            return rs

        time.sleep(pause)

    raise IOError("after %d tries, %s did not "
                  "return a 200 for url %r" % (retry_count, name, url))
Example #47
def get_data_famafrench(name):
    # path of zip files
    zip_file_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/'
                    'ken.french/ftp/')
    zip_file_path = '{0}{1}.zip'.format(zip_file_url, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.read(name + '.txt').splitlines()

    line_lengths = np.array(map(len, data))
    file_edges = np.where(line_lengths)[0]

    datasets = {}
    edges = itertools.izip(file_edges[:-1], file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(map(len, dataset))
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = [
                '{0} {1}'.format(*items)
                for items in enumerate(header, start=1)
            ]
            index = np.fromiter((d[0] for d in ds_header), dtype=int)
            dataset = np.fromiter((d[1:] for d in ds_header), dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
Example #48
def get_data_fred(name, start=dt.datetime(2010, 1, 1),
                  end=dt.datetime.today()):
    """
    Get data for the given name from the St. Louis FED (FRED).
    Date format is datetime

    Returns a DataFrame.
    """
    start, end = _sanitize_dates(start, end)

    fred_URL = "http://research.stlouisfed.org/fred2/series/"

    url = fred_URL + '%s' % name + '/downloaddata/%s' % name + '.csv'
    with urlopen(url) as resp:
        data = read_csv(resp, index_col=0, parse_dates=True,
                        header=None, skiprows=1, names=["DATE", name],
                        na_values='.')
    try:
        return data.truncate(start, end)
    except KeyError:
        if data.ix[3].name[7:12] == 'Error':
            raise IOError("Failed to get the data. Check that {0!r} is "
                          "a valid FRED series.".format(name))
        raise
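A usage sketch for the FRED reader above; 'GDP' is a real series name, though whether the fred2 downloaddata CSV endpoint still serves it is not guaranteed.

import datetime as dt

gdp = get_data_fred('GDP', start=dt.datetime(2010, 1, 1),
                    end=dt.datetime(2013, 1, 1))
print(gdp.tail())        # a single column named 'GDP', indexed by DATE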
Example #49
def get_all_results(repo_id=53976):  # travis pydata/pandas id
    """Fetches the VBENCH results for all travis builds, and returns a list of result df

    Unsuccessful individual vbenches are dropped.
    """
    from collections import OrderedDict

    def get_results_from_builds(builds):
        dfs = OrderedDict()
        for build in builds:
            build_id = build['id']
            build_number = build['number']
            print(build_number)
            res = get_build_results(build_id)
            if res is not None:
                dfs[build_number] = res
        return dfs

    base_url = 'https://api.travis-ci.org/builds?url=%2Fbuilds&repository_id={repo_id}'
    url = base_url.format(repo_id=repo_id)
    url_after = url + '&after_number={after}'
    dfs = OrderedDict()

    while True:
        with urlopen(url) as r:
            if not (200 <= r.getcode() < 300):
                break
            builds = json.loads(r.read())
        res = get_results_from_builds(builds)
        if not res:
            break
        last_build_number = min(res.keys())
        dfs.update(res)
        url = url_after.format(after=last_build_number)

    return dfs
Example #50
def dump_as_gist(data, desc="The Commit", njobs=None):
    host, njobs2 = get_travis_data()[:2]

    if njobs:  # be slightly more reliable
        njobs = max(njobs, njobs2)

    content = dict(
        version="0.1.1",
        timings=data,
        datetime=get_utcdatetime(),  # added in 0.1.1
        hostname=host,  # added in 0.1.1
        njobs=njobs  # added in 0.1.1, a measure of load on the travis box
    )

    payload = dict(description=desc,
                   public=True,
                   files={'results.json': dict(content=json.dumps(content))})
    try:
        with closing(
                urlopen("https://api.github.com/gists",
                        json.dumps(payload),
                        timeout=WEB_TIMEOUT)) as r:
            if 200 <= r.getcode() < 300:
                print("\n\n" + "-" * 80)

                gist = json.loads(r.read())
                file_raw_url = list(gist['files'].items())[0][1]['raw_url']
                print("[vbench-gist-raw_url] %s" % file_raw_url)
                print("[vbench-html-url] %s" % gist['html_url'])
                print("[vbench-api-url] %s" % gist['url'])

                print("-" * 80 + "\n\n")
            else:
                print("api.github.com returned status %d" % r.getcode())
    except:
        print("Error occured while dumping to gist")
Example #51
def get_dividends_yahoo(sid, start, end):
    # Taken from get_data_yahoo in the pandas library, with a single parameter adjusted to get dividends
    from pandas.compat import StringIO, bytes_to_str
    from pandas.io.common import urlopen

    start, end = pd.to_datetime(start), pd.to_datetime(end)
    url = (
        "http://ichart.finance.yahoo.com/table.csv?" + "s=%s" % sid + "&a=%s" %
        (start.month - 1) + "&b=%s" % start.day + "&c=%s" % start.year +
        "&d=%s" % (end.month - 1) + "&e=%s" % end.day + "&f=%s" % end.year +
        "&g=v" + "&ignore=.csv"  # THE CHANGE
    )

    with urlopen(url) as resp:
        lines = resp.read()
    rs = pd.read_csv(StringIO(bytes_to_str(lines)),
                     index_col=0,
                     parse_dates=True,
                     na_values="-")[::-1]
    # Yahoo! Finance sometimes does this awesome thing where they
    # return 2 rows for the most recent business day
    if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
        rs = rs[:-1]
    return rs
Example #52
def get_symbol_yahoo_stats_url(symbols):
    """
    Get the symbols' basic statistics from Yahoo Finance.
    Input:
       symbols - a list of symbol strings, e.g. ['AAPL']
    Output: stats in Pandas DataFrame.
    This function is ported from pandas_datareader/yahoo/components.py
    """
    sym_list = str2list(symbols)
    if sym_list is None:
        return DataFrame()

    url_str = 'http://download.finance.yahoo.com/d/quotes.csv?'
    # Form a BUNCH of STOCK SYMBOLS separated by "+",
    # e.g. XOM+BBDb.TO+JNJ+MSFT
    sym_str = '+'.join(sym_list)
    url_str += 's=' + sym_str
    url_str = url_str.strip().replace(' ','') # remove all spaces

    # Yahoo Finance tags, refer to http://www.financialwisdomforum.org/gummy-stuff/Yahoo-data.htm
    tags = {'s':'Symbol', 'x':'Exchange', 'j1':'Market Cap', 'b4':'Book Value', 'r':'P/E', 'p5':'Price/Sales',
            'p6':'Price/Book', 'j4':'EBITDA', 'j':'52-week Low', 'k':'52-week High', 'l1':'Last Trade',
            'd':'Dividend/Share', 'y':'Dividend Yield', 'e':'EPS', 's7':'Short Ratio', 's1':'Shares Owned',
            'f6':'Float Shares'}
    url_str += '&f=' + ''.join(pd.compat.iterkeys(tags))
    with urlopen(url_str) as resp:
        raw = resp.read()
    lines = raw.decode('utf-8').strip().replace('"', '').split('\n')
    lines = [line.strip().split(',') for line in lines]
    if len(lines) < 1 or len(lines[0]) < len(tags) :
        print('Error: failed to download Yahoo stats from %s' %url_str)
        return DataFrame()
    stats = DataFrame(lines, columns=list(tags.values()))
    stats = stats.drop_duplicates()
    stats = stats.set_index('Symbol')
    return stats
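A hedged example call for the stats downloader above; the quotes.csv service is defunct, so this mainly illustrates the returned shape and the empty-frame failure path.

stats = get_symbol_yahoo_stats_url(['AAPL', 'MSFT'])   # returns an empty DataFrame on failure
if not stats.empty:
    print(stats[['Market Cap', 'P/E', 'Dividend Yield']])   # column names come from the tags dict above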
Example #53
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if _is_url(io):
        with urlopen(io) as url:
            raw_text = url.read()
    elif hasattr(io, 'read'):
        raw_text = io.read()
    elif os.path.isfile(io):
        with open(io) as f:
            raw_text = f.read()
    elif isinstance(io, string_types):
        raw_text = io
    else:
        raise TypeError("Cannot read object of type %r" % type(io).__name__)
    return raw_text
Example #54
def query_osm(typ,
              bbox=None,
              recurse=None,
              tags='',
              raw=False,
              meta=False,
              operation='and',
              **kwargs):
    """
    Query the Overpass API to obtain OpenStreetMap data.

    See also:
    http://wiki.openstreetmap.org/wiki/Overpass_API/Language_Guide

    The OSM XML data is parsed into an intermediate set of DataFrames.
    By passing in 'render=False', this will return these DataFrames stored
    as the OSMData namedtuple. If render is True, then the DataFrames
    are built into their corresponding geometries.

    Parameters
    ----------
    typ : {'node', 'way', 'relation'}
        The type of OSM data to query
    bbox : (min lon, min lat, max lon, max lat) bounding box
        Optional bounding box to restrict the query. Unless the query
        is extremely restricted, you usually want to specify this.
        It can be retrieved from GeoPandas objects as 'df.total_bounds' or
        from Shapely objects as 'geom.bounds'
    recurse : {'up', 'down', 'uprel', 'downrel'}
        This is used to get more data than the original query. If 'typ' is
        'way', you'll usually want this set to 'down' which grabs all nodes
        of the matching ways
    tags : string or list of query strings
        See also the OverpassQL (referenced above) for more tag options
        Examples:
            tags='highway'
                Matches objects with a 'highway' tag
            tags='highway=motorway'
                Matches objects where the 'highway' tag is 'motorway'
            tags='name~[Mm]agazine'
                Match if the 'name' tag matches the regular expression

            Specify a list of tag requests to match all of them or any of them
            tags=['highway', 'name~"^Magazine"']
                Match tags that have 'highway' and where 'name' starts
                with 'Magazine'

    raw : boolean, default False
        Return the raw XML data returned by the request
    render : boolean, default True
        Parse the output and return a final GeoDataFrame
    meta : boolean, default False
        Indicates whether to query the metadata with each OSM object. This
        includes the changeset, timestamp, uid, user, and version.
    operation: {'and', 'or'}, default 'and'
        the operation of query conditions
        'and' :  return a list of tag requests to match all of them
        'or' :  return a list of tag requests to match any of them

    way_type:{'Line','Polygon'} (optional)
        when typ equals 'way'
        'Line' : the type of geometry in geodataframe is LineString
        'Polygon' : the type of geometry in geodataframe is Polygon
    Returns
    -------
    df - GeoDataFrame
    Note that there's probably a bit more filtering required to get the
    exact desired data. For example if you only want ways, you may want
    to grab only the linestrings like:
        >>> df = df[df.type == 'LineString']

    """
    url = _build_url(typ, operation, bbox, recurse, tags, meta)

    # TODO: Raise on non-200 (or 400-599)
    with urlopen(url) as response:
        content = response.read()

    if raw:
        return content
    return read_osm(content, **kwargs)
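A sketch that mirrors the docstring above: query ways tagged as highways inside a small bounding box, pull in their nodes, and keep only line geometries. The bounding-box values are placeholders.

# bbox is (min lon, min lat, max lon, max lat), as described in the docstring.
bbox = (-71.11, 42.37, -71.06, 42.40)
df = query_osm('way', bbox=bbox, recurse='down', tags='highway')
df = df[df.type == 'LineString']       # the filtering step suggested in the docstring
print(len(df), "highway segments")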
Example #55
def get_COOPS_json(begin_dt, end_dt, base_url):
    """Function accepts: a base url (API endpoint), a beginning and end datetime string in the form 'YYYYMMDD mm:ss'
    which are <= 1 year apart, passing these to the query_builder function.
    Function returns the hourly prediction data as a PANDAS DataFrame Object where the returned time becomes the
    datetime index."""

    # import dependencies

    import pandas as pd
    import numpy as np
    from pandas.io.common import urlopen
    from pandas.io import json

    # construct the query

    query, query_dict = query_builder(begin_dt, end_dt, base_url)

    # execute query and read response

    with urlopen(query) as response:
        data = response.read()

        # convert json object to python dictionary and extract time and values for predictions

        data = json.loads(data)['predictions']

        # read into PANDAS DataFrame, then manipulate DataFrame object
        data = pd.DataFrame(data)
        data.columns = ['Date_Time', 'Level']
        data.index = data.Date_Time
        data.index = pd.to_datetime(data.index)
        data = data.drop('Date_Time', axis=1)

        # reindex to fill in any missing time values, this needs
        # work to initialize the range on the data/query vs. hardcoding as it
        # currently stands.

        periods, begin, end = dt_periodizer(query_dict)

        begin_string = begin.strftime('%Y-%m-%d %H:%M:%S')

        rng = pd.date_range(begin_string, periods=periods, freq='6min')

        # the actual reindex itself needs to be reworked for a better fill
        # a good start might be the median of the points directly above and
        # below the missing dt index. Since this is very few points typically
        # I am filling them with 100 for easy removal later. I would rather
        # remove the points than fill in a non-measured value.

        # protect against index duplicates
        data = data.reset_index()

        # reindex to datetime
        data = data.reindex(rng, fill_value=100)

        # convert value from string to float
        data.Level = data.Level.astype(float)

        # adjust level to account for distance of Carkeek from NOAA
        # monitoring station (+ 5.5%)
        level_adjust = data.Level.values + (.05 * data.Level.values)
        data.Level = np.round(level_adjust, decimals=2)

        # add date column to dataframe for later use with weather data
        data['Date'] = data.index.date

        # add a column for hourly re-sample

        # data['Hour'] = data.index.hour
        # data['Time'] = data.index.time

        # return DataFrame object

        return data
Example #56
def get_hist_data(code=None, start=None, end=None,
                  ktype='D', retry_count=3,
                  pause=0.001):
    """
        获取个股历史交易记录
    Parameters
    ------
      code:string
                  股票代码 e.g. 600848
      start:string
                  开始日期 format:YYYY-MM-DD 为空时取到API所提供的最早日期数据
      end:string
                  结束日期 format:YYYY-MM-DD 为空时取到最近一个交易日数据
      ktype:string
                  数据类型,D=日k线 W=周 M=月 5=5分钟 15=15分钟 30=30分钟 60=60分钟,默认为D
      retry_count : int, 默认 3
                 如遇网络等问题重复执行的次数 
      pause : int, 默认 0
                重复请求数据过程中暂停的秒数,防止请求间隔时间太短出现的问题
    return
    -------
      DataFrame
          属性:日期 ,开盘价, 最高价, 收盘价, 最低价, 成交量, 价格变动 ,涨跌幅,5日均价,10日均价,20日均价,5日均量,10日均量,20日均量,换手率
    """
    symbol = code_to_symbol(code)
    url = ''
    if ktype.upper() in ct.K_LABELS:
        url = ct.DAY_PRICE_URL%(ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                ct.K_TYPE[ktype.upper()], symbol)
    elif ktype in ct.K_MIN_LABELS:
        url = ct.DAY_PRICE_MIN_URL%(ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                    symbol, ktype)
    else:
        raise TypeError('ktype input error.')
    
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            js = json.loads(lines)
            cols = []
            if (code in ct.INDEX_LABELS) & (ktype.upper() in ct.K_LABELS):
                cols = ct.INX_DAY_PRICE_COLUMNS
            else:
                cols = ct.DAY_PRICE_COLUMNS
            df = pd.DataFrame(js['record'], columns=cols)
            if ktype.upper() in ['D','W','M']:
                df = df.applymap(lambda x: x.replace(u',', u''))
            for col in cols[1:]:
                df[col] = df[col].astype(float)
            if start is not None:
                df = df[df.date >= start]
            if end is not None:
                df = df[df.date <= end]
            if (code in ct.INDEX_LABELS) & (ktype in ct.K_MIN_LABELS):
                df = df.drop('turnover', axis=1)
            df = df.set_index('date')
            return df
    raise IOError("%s获取失败,请检查网络和URL:%s" % (code, url))
Example #57
def get_data_yahoo_actions(symbol,
                           start=None,
                           end=None,
                           retry_count=3,
                           pause=0.001):
    """
  Returns DataFrame of historical corporate actions (dividends and stock
  splits) from symbols, over date range, start to end.

  Parameters
  ----------
    sym : string with a single stock symbol (ticker).
    start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kind of date
        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
    end : string, (defaults to today)
        Ending date, timestamp. Same format as starting date.
    retry_count : int, default 3
        Number of times to retry query request.
    pause : int, default 0
        Time, in seconds, of the pause between retries.
  """

    start, end = _sanitize_dates(start, end)
    url = (_HISTORICAL_YAHOO_ACTIONS_URL + 's=%s' % symbol + '&a=%s' %
           (start.month - 1) + '&b=%s' % start.day + '&c=%s' % start.year +
           '&d=%s' % (end.month - 1) + '&e=%s' % end.day + '&f=%s' % end.year +
           '&g=v')

    for _ in range(retry_count):
        time.sleep(pause)

        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            actions_index = []
            actions_entries = []

            for line in csv.reader(StringIO(bytes_to_str(lines))):
                # Ignore lines that aren't dividends or splits (Yahoo
                # add a bunch of irrelevant fields.)
                if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'):
                    continue

                action, date, value = line
                if action == 'DIVIDEND':
                    actions_index.append(to_datetime(date))
                    actions_entries.append({
                        'action': action,
                        'value': float(value)
                    })
                elif action == 'SPLIT' and ':' in value:
                    # Convert the split ratio to a fraction. For example a
                    # 4:1 split expressed as a fraction is 1/4 = 0.25.
                    denominator, numerator = value.split(':', 1)
                    split_fraction = float(numerator) / float(denominator)

                    actions_index.append(to_datetime(date))
                    actions_entries.append({
                        'action': action,
                        'value': split_fraction
                    })

            return DataFrame(actions_entries, index=actions_index)

    raise IOError("after %d tries, Yahoo! did not "
                  "return a 200 for url %r" % (retry_count, url))