Example #1
    def add_season_to_data(self, data: pd.Series, segment: pd.Series,
                           offset: int, seasonality: int,
                           bound_type: Bound) -> pd.Series:
        # data - smoothed data to which the seasonal segment is added
        # bound_type == Bound.UPPER -> the segment's upper bound is added
        # bound_type == Bound.LOWER -> the segment's lower bound is subtracted
        len_smoothed_data = len(data)
        for idx, _ in enumerate(data):
            if idx - offset < 0:
                #TODO: add seasonality for non empty parts
                continue
            if (idx - offset) % seasonality == 0:
                if bound_type == Bound.UPPER:
                    upper_segment_bound = self.get_segment_bound(
                        segment, Bound.UPPER)
                    data = data.add(pd.Series(upper_segment_bound.values,
                                              index=segment.index + idx),
                                    fill_value=0)
                elif bound_type == Bound.LOWER:
                    lower_segment_bound = self.get_segment_bound(
                        segment, Bound.LOWER)
                    data = data.add(pd.Series(lower_segment_bound.values * -1,
                                              index=segment.index + idx),
                                    fill_value=0)
                else:
                    raise ValueError(f'unknown bound type: {bound_type.value}')

        return data[:len_smoothed_data]
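A standalone sketch of the same pattern (not the original class): a short seasonal segment is folded into a longer series at every multiple of the seasonality, using Series.add with a shifted index and fill_value=0 so untouched positions keep their original values. All names and values here are illustrative.

import pandas as pd

data = pd.Series(0.0, index=range(12))        # smoothed base signal
segment = pd.Series([1.0, 2.0, 1.0])          # seasonal bump at positions 0..2
offset, seasonality = 1, 4

for idx in range(len(data)):
    if idx < offset or (idx - offset) % seasonality != 0:
        continue
    shifted = pd.Series(segment.values, index=segment.index + idx)
    data = data.add(shifted, fill_value=0)    # align on index, missing positions count as 0

data = data.iloc[:12]                         # keep only the original length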
Example #2
 def simulate(self,
              p=None,
              tmin=None,
              tmax=None,
              freq=None,
              dt=1,
              istress=None):
     self.update_stress(tmin=tmin, tmax=tmax, freq=freq)
     h = Series(data=0, index=self.stress[0].series.index, name=self.name)
     stresses = self.get_stress(istress=istress)
     distances = self.get_distances(istress=istress)
     for stress, r in zip(stresses, distances):
         npoints = stress.index.size
         p_with_r = np.concatenate([p, np.asarray([r])])
         b = self.get_block(p_with_r, dt, tmin, tmax)
         c = fftconvolve(stress, b, 'full')[:npoints]
         h = h.add(Series(c, index=stress.index, fastpath=True),
                   fill_value=0.0)
     if istress is not None:
         if self.stress[istress].name is not None:
             h.name = self.stress[istress].name
         else:
             h.name = self.name + "_" + str(istress)
     else:
         h.name = self.name
     return h
Example #3
def rate_of_return(period_ret: pd.Series, base_period: str) -> pd.Series:
    """
    跨期收益转换
    假设 factor_data 对应的收益率列名为 period_30D, period_150D, period_450D, 如果以
    period_30D 作为基准,假设 period_150D 的收益率为 r, 那么 period_150D 在收益率稳定
    的情况下,理论上, period_30 从 period_150D 换算下来的收益率应该为 (1+r)^{30/150} - 1

    参数
    ---
    :param period_ret: 包含远期收益的数据,名称应该包括相应周期
    :param base_period: 转换中使用的基准周期,譬如 ('1 days', '1D', '30m', '3h', '1D1h', etc)
    """
    period_len = get_period(period_ret.name.replace("period_", ""))
    base_period = get_period(base_period.replace("period_", ""))
    pattern = re.compile(r"\d+")
    interval = pattern.findall(period_len)[0]
    base_interval = pattern.findall(base_period)[0]
    if (period_len.replace(interval, "") != "min") and (period_len.replace(
            interval, "") != "d"):
        if period_len.replace(interval, "") == "m":
            period_len = int(interval) * pd.Timedelta(days=DAYS_PER_MONTH)
            base_period = int(base_interval) * pd.Timedelta(
                days=DAYS_PER_MONTH)
        elif period_len.replace(interval, "") == "q":
            period_len = int(interval) * pd.Timedelta(days=DAYS_PER_QUARTER)
            base_period = int(base_interval) * pd.Timedelta(
                days=DAYS_PER_QUARTER)
        elif period_len.replace(interval, "") == "y":
            period_len = int(interval) * pd.Timedelta(days=DAYS_PER_YEAR)
            base_period = int(base_interval) * pd.Timedelta(days=DAYS_PER_YEAR)
    conversion_factor = pd.Timedelta(base_period) / pd.Timedelta(period_len)
    return period_ret.add(1).pow(conversion_factor).sub(1.0)
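The core of the conversion is a ratio of Timedeltas used as an exponent; a minimal sketch of that arithmetic, independent of get_period and the period_ naming convention (values are hypothetical):

import pandas as pd

period_ret = pd.Series([0.10, 0.05], name="period_150D")          # hypothetical 150-day forward returns
conversion_factor = pd.Timedelta("30D") / pd.Timedelta("150D")    # 0.2
base_ret = period_ret.add(1).pow(conversion_factor).sub(1.0)      # (1 + r)**(30/150) - 1 per element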
Example #5
    def test_fill_value_when_combine_const(self):
        # GH12723
        s = Series([0, 1, np.nan, 3, 4, 5])

        exp = s.fillna(0).add(2)
        res = s.add(2, fill_value=0)
        assert_series_equal(res, exp)
Example #6
def chunkRead():
    reader = pd.read_csv('data6.csv', sep=',', chunksize=1000)
    #print(reader.get_chunk(5)['key'].value_counts())
    series = Series([])
    for chunk in reader:
        series = series.add(chunk['key'].value_counts(), fill_value=0)
    print(series.sort_values(ascending=False)[:10])
Example #7
def series_simple_math(ser: pd.Series, function: str,
                       number: int) -> pd.core.series.Series:
    """Write some simple math helper functions for series.
    Take the given series, perfrom the required operation and
        return the new series.
    For example. Give the series:
        0    0
        1    1
        2    2
        dtype: int64
    Function 'add' and 'number' 2 you should return
        0     2
        1     3
        2     4
        dtype: int64
    :param ser: Series to perform operation on
    :param function: The operation to perform
    :param number: The number to apply the operation to
    """
    if function == "add":
        return ser.add(number)
    elif function == "sub":
        return ser.sub(number)
    elif function == "mul":
        return ser.mul(number)
    elif function == "div":
        return ser.div(number)
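A quick usage check of the helper above (assuming it is importable in the current scope; values are made up):

import pandas as pd

ser = pd.Series([0, 1, 2])
print(series_simple_math(ser, "add", 2))   # 2, 3, 4
print(series_simple_math(ser, "div", 2))   # 0.0, 0.5, 1.0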
Example #8
    def test_flex_add_scalar_fill_value(self):
        # GH12723
        ser = Series([0, 1, np.nan, 3, 4, 5])

        exp = ser.fillna(0).add(2)
        res = ser.add(2, fill_value=0)
        tm.assert_series_equal(res, exp)
Example #9
def get_sum(docs, df, length):
    sum = Series([])
    if length > 0:
        for doc in docs:
            sum = sum.add(df.loc[doc + ".xml"], fill_value=0)
        return sum
    else:
        return [0] * length
Example #10
    def _holding_ret(self, ret: pd.Series) -> pd.Series:
        """
        计算持有不同周期的股票收益率
        :param ret: 股票收益率序列
        :return:
        """

        # Holding period return
        ret = ret.add(1)

        ret_label = 1
        for shift_ in range(self.hp):
            ret_label *= ret.groupby(KN.STOCK_ID.value).shift(-shift_)

        ret_label = ret_label.sub(1)

        return ret_label
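The compounding above is the standard holding-period return, the product of (1 + r) over hp consecutive periods minus 1; a minimal single-stock sketch without the groupby on stock id (values are made up):

import pandas as pd

r = pd.Series([0.01, 0.02, -0.005, 0.03])   # hypothetical returns of one stock
hp = 2                                      # holding period in bars
gross = r.add(1)
hpr = pd.Series(1.0, index=r.index)
for shift_ in range(hp):
    hpr = hpr * gross.shift(-shift_)        # (1 + r_t) * (1 + r_{t+1}) * ...
hpr = hpr.sub(1)                            # NaN where the window runs past the end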
Example #11
    def analysize(self):
        """
        分析每个不同因素中,各类群体平均总的消费金额.
        """
        attributeSeries = Series([])
        attributeDict = dict()
        for piece in self.chunks:

            attributeSeries = attributeSeries.add(piece.groupby(self.attribute).amount.sum(), fill_value=0.0)

            # Note how the following statement would be used: after grouping, the student IDs
            # within a chunk are not necessarily unique, so nunique (the count of distinct IDs) is needed.
            #attributeCount = attributeCount.add(piece.groupby(self.attribute).studentID.nunique(), fill_value=0.0)

            for attribute,idArray in piece.groupby(self.attribute).studentID.unique().iteritems():
                attributeDict.setdefault(attribute, np.array([]))
                attributeDict[attribute] = np.union1d(attributeDict[attribute], idArray)
        return (attributeSeries/(Series(attributeDict).apply(lambda x : len(x)))).sort_values()
Example #12
    def _get_view_target_weights(self, view: View, market_weights: pd.Series,
                                 market_covariance: pd.DataFrame,
                                 view_matrix: pd.DataFrame,
                                 view_out_performance: pd.Series) -> pd.Series:
        """
        get target weights based on the view allocation and
        stated confidence in the view
        """

        zero_view_cov = pd.DataFrame([0], index=[view.id], columns=[view.id])
        full_confidence_weights = self._get_weights(market_weights,
                                                    market_covariance,
                                                    view_matrix, zero_view_cov,
                                                    view_out_performance)
        max_weight_difference = full_confidence_weights - market_weights
        target_weights = market_weights.add(view.confidence *
                                            max_weight_difference)

        return target_weights
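The final step is a linear interpolation between the market weights and the full-confidence weights, scaled by the stated view confidence; a toy sketch with made-up numbers:

import pandas as pd

market_weights = pd.Series({"A": 0.6, "B": 0.4})
full_confidence_weights = pd.Series({"A": 0.7, "B": 0.3})   # hypothetical output of _get_weights
confidence = 0.5

max_weight_difference = full_confidence_weights - market_weights
target_weights = market_weights.add(confidence * max_weight_difference)
# A -> 0.65, B -> 0.35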
Example #13
def predictions_better(ts_data, window_size, should_plot=True):
    results_ARIMA = model_combined_no_log(ts_data, window_size, True)

    # Make a series with cumulative fitted values
    predictions_ARIMA_diff = Series(results_ARIMA.fittedvalues, copy=True)
    predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()

    # Make a series with combined original and cumulative fitted values
    predictions_ARIMA = Series(ts_data.ix[0], index=ts_data.index)
    # predictions_ARIMA = predictions_ARIMA.add(predictions_ARIMA_diff, fill_value=0)
    predictions_ARIMA = predictions_ARIMA.add(predictions_ARIMA_diff_cumsum,
                                              fill_value=0)

    if should_plot:
        pyplot.figure(2)
        pyplot.plot(ts_data, color="blue", label="Original")
        pyplot.plot(predictions_ARIMA, color="green", label="Prediction")
        pyplot.legend(loc="best")
        pyplot.title("Predictions")
        pyplot.show(block=False)
Example #14
    def test_timedelta_arithmetic(self):
        data = Series(["nat", "32 days"], dtype="timedelta64[ns]")
        deltas = [timedelta(days=1), Timedelta(1, unit="D")]
        for delta in deltas:
            result_method = data.add(delta)
            result_operator = data + delta
            expected = Series(["nat", "33 days"], dtype="timedelta64[ns]")
            tm.assert_series_equal(result_operator, expected)
            tm.assert_series_equal(result_method, expected)

            result_method = data.sub(delta)
            result_operator = data - delta
            expected = Series(["nat", "31 days"], dtype="timedelta64[ns]")
            tm.assert_series_equal(result_operator, expected)
            tm.assert_series_equal(result_method, expected)
            # GH 9396
            result_method = data.div(delta)
            result_operator = data / delta
            expected = Series([np.nan, 32.0], dtype="float64")
            tm.assert_series_equal(result_operator, expected)
            tm.assert_series_equal(result_method, expected)
Example #15
def main(tweets_filepath, count_filepath, median_filepath):
    ''' Function to read the file and construct word counter '''
    f_read = open(tweets_filepath, 'r')
    wordcount_series = Series()
    stream_of_median = []
    first_half_max_heap = []
    second_half_min_heap = []
    first_element_median_flag = False
    for each in f_read:
        word_counter = Counter(each.lower().rstrip().split(' '))
        word_series = Series(word_counter)
        wordcount_series = wordcount_series.add(word_series, fill_value=0)
        if first_element_median_flag:
            curr_median = median_unique.running_median(first_half_max_heap, second_half_min_heap, float(len(word_counter.keys())))
            stream_of_median.append(curr_median)
        else:
            first_element_median_flag = True
            curr_median = len(word_counter.keys())
            second_half_min_heap.append(curr_median)
            stream_of_median.append(curr_median)
    f_read.close()
    write_count_file(wordcount_series, count_filepath)
    write_median_file(stream_of_median, median_filepath)
Example #16
x = DataFrame(numpy.arange(9.).reshape((3, 3)),
                columns = ['A','B','C'],
                index = ['a', 'b', 'c'])
y = DataFrame(numpy.arange(12).reshape((4, 3)),
                columns = ['A','B','C'],
                index = ['a', 'b', 'c', 'd'])
print x
print y
print x + y
'''
      A     B     C
a   0.0   2.0   4.0
b   6.0   8.0  10.0
c  12.0  14.0  16.0
d   NaN   NaN   NaN
'''
print 'fill the non-overlapping parts of x/y, not the NaN in the result'
print x.add(y, fill_value = 0) # x is unchanged
'''

      A     B     C
a   0.0   2.0   4.0
b   6.0   8.0  10.0
c  12.0  14.0  16.0
d   9.0  10.0  11.0
'''

print 'DataFrame and Series arithmetic: row-wise operation'
frame = DataFrame(numpy.arange(9).reshape((3, 3)),
                  columns = ['A','B','C'],
                  index = ['a', 'b', 'c'])
series = frame.ix[0]
print frame
Example #17
df=pd.DataFrame(data,index=index,columns=columns) # create a DataFrame


df.ix[:,0:2]

df.ix[:,[0,2]]

iris.query('Species == "setosa"')
dt.query('sl >5 & pw >2')
print(dt.query('sl  >5'))


############################   panda
from pandas import Series, DataFrame  
import pandas as pd 
arr=[1,2,3,4]

series_1 = Series(arr)
series_2=Series([1,2,3,4]) 
series_3=Series([1,2,'3',4,'a'])

series_4 =Series([1,2,3])
series_4.index=['a','b','c'] # create the index

temp =Series([5])
type(temp)
series_4.append(temp)       # insert: note that Series.add() is arithmetic addition, not for appending elements.
series_4.add(temp)          # element-wise addition aligned on index
series_4.drop('a')   # delete
series_4['a']=4         # update
series_4['a']       # read
Example #18
def practice_two():
    # Reindexing: reindex
    obj = Series(['b', 'p', 'y'], index=[0, 2, 4])
    obj.reindex(range(6), method='ffill')
    '''
    ffill   fill values forward
    bfill   fill values backward
    pad     carry values forward
    backfill    carry values backward
    '''

    frame = DataFrame(np.arange(9).reshape((3, 3)),
                      index=['a', 'c', 'd'],
                      columns=['Ohio', 'Texas', 'California'])
    # 3x3 array; the row index is index, the column index is columns
    frame2 = frame.reindex(['a', 'b', 'c', 'd'])  # add a row with index b
    states = ['Texas', 'Utah', 'California']
    frame.reindex(columns=states)  # columns can be reindexed with the columns argument
    '''
    reindex parameters
        index       new sequence to use as the index
        method      interpolation (fill) method
        fill_value  substitute value to use when the reindexing introduces missing data
        limit       maximum fill count when filling forward or backward
        level       match a simple index on a level of a MultiIndex, otherwise select a subset
        copy        default True, always copy; if False, do not copy when the old and new indexes are equal
    '''

    # Dropping entries from an axis
    obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
    obj.drop('c')  # drop row c
    obj.drop(['d', 'c'])  # drop rows d and c
    data = DataFrame(np.arange(16).reshape((4, 4)),
                     index=['o', 'c', 'u', 'n'],
                     columns=['one', 'two', 'three', 'four'])
    data.drop(['two', 'four'], axis=1)  # drop the columns two and four

    # Indexing, selection and filtering
    obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
    obj['b']  # equivalent to obj[1]
    obj[2:4]
    obj[['b', 'a', 'd']]
    obj[[1, 3]]
    obj[obj < 2]
    obj['b':'c']
    obj['b':'c'] = 5  # modify values
    '''
    DataFrame indexing options
        obj[val]            select a single column or a group of columns
        obj.ix[val]         a single row or a group of rows
        obj.ix[val1, val2]  select rows and columns at the same time
        reindex method      match one or more axes to a new index
        xs method           select a single row or column by label, returning a Series
        icol, irow methods  select a single column or row by integer position, returning a Series
        get_value, set_value methods   select a single value by row and column label
    '''

    # Arithmetic and data alignment
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1],
                index=['a', 'c', 'e', 'f', 'g'])
    s1 + s2  # introduces NA values at the non-overlapping index labels
    # the same happens with DataFrame
    s1.add(s2, fill_value=0)  # no NA values, the missing side is treated as 0
    s1.reindex(s2.index, fill_value=0)  # fill missing labels with the given value
    '''
    add     +
    sub     -
    div     /
    mul     *
    '''

    frame = DataFrame(np.arange(12.).reshape((4, 3)),
                      columns=list('bde'),
                      index=['U', 'O', 'T', 'R'])
    series = frame.ix[0]
    frame - series
    series2 = Series(range(3), index=['b', 'e', 'f'])
    frame + series2  # NA values appear for the non-matching columns
    series3 = frame['d']
    frame.sub(series3, axis=0)

    # Function application and mapping
    frame = DataFrame(np.random.randn(4, 3),
                      columns=list('bde'),
                      index=['U', 'O', 'T', 'R'])
    np.abs(frame)  # absolute value
    f = lambda x: x.max() - x.min()
    frame.apply(f)
    frame.apply(f, axis=1)
    format = lambda x: '%.2f' % x
    frame.applymap(format)
    frame['e'].map(format)

    # Sorting and ranking
    '''
    .sort_index()       sort rows lexicographically by index
    .sort_index(axis=1)         sort columns
    .sort_index(ascending=False)        descending; default is ascending
    .order()            for a Series (sort by values)
    .sort_index(by='*')         sort by column *
    .rank(ascending=False,method='first',axis=1)
    # 'average' default, mean rank   'min' smallest   'max' largest   'first' ranks in order of appearance
    '''

    # Axis indexes with duplicate values
    obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
    obj.index.is_unique  # whether the labels are unique

    pass
Example #19
'''
data = pd.read_csv('data/ex4.csv', nrows=2, skiprows=[0, 2])

print(data)
#   value1 value2  key1  key2
# 0    one      a     0     1
# 1    one      b     2     3

# split the data into chunks
chunker = pd.read_csv('data/ex3.csv', chunksize=2)
print(
    chunker)  # <pandas.io.parsers.TextFileReader object at 0x000000000B1B9F60>

tot = Series([])
for piece in chunker:
    tot = tot.add(piece['value1'].value_counts(), fill_value=0)

print(tot)
# one    5.0
# two    3.0
# dtype: float64
print(tot[0])
'''
Writing data out in text format with a custom delimiter
'''
data = pd.read_csv('data/ex3.csv')
print(data)

import sys

data.to_csv('data/ex6.csv', sep='|')
Example #20
			tf.ix[fila,word] = 1
		else:
			tf.ix[fila,word] = tf.ix[fila,word] + 1
	tf.ix[fila] = tf.ix[fila] / len(tokens)
	fila = fila + 1
	print "Fila: ", fila
#print tf
print "TF MATRIX LISTO"
idf = Series()
#print idf.index
for term in termslist.keys():
	apariciones = termslist[term]
	totaldoc = data.shape[0]
	argumento = totaldoc / (1 + apariciones)
	#print argumento
	test = Series({term : math.log(argumento)})
	idf = idf.add(test, fill_value=0)
#print idf
print "IDF LISTO"
gc.collect()
for i, row in tf.iterrows():
	print i
	tf.ix[i] = row.multiply(idf)
	#gc.collect()

#print tf
#print idf 
#tfidf = tf.apply(lambda x: x.multiply(idf), axis = 1)
#print tfidf
tf.to_csv('tfidf.csv')
idf.to_csv('idf.csv')
Example #21
def main():
    out_dir = os.path.dirname(__file__)

    ex1_path = study.DATA_DIR + '/ch06/ex1.csv'
    cat(ex1_path)

    df = pd.read_csv(ex1_path)
    p(df)
    p(pd.read_table(ex1_path, sep=','))

    p('header less---------------------')
    ex2_path = study.DATA_DIR + '/ch06/ex2.csv'
    cat(ex2_path)
    names = ['a','b', 'c', 'd', 'message']
    p(pd.read_csv(ex2_path, header=None))
    p(pd.read_csv(ex2_path, names=names))
    p(pd.read_csv(ex2_path, names=names, index_col='message'))

    p('hierarchy index---------------------')
    mindex_path = study.DATA_DIR + '/ch06/csv_mindex.csv'
    cat(mindex_path)
    p(pd.read_csv(mindex_path, index_col=['key1', 'key2']))

    p('separate by regex-------------')
    ex3_path = study.DATA_DIR + '/ch06/ex3.csv'
    cat(ex3_path)
    p(pd.read_csv(ex3_path, sep='\s+'))

    p('skip rows-----------')
    ex4_path = study.DATA_DIR + '/ch06/ex4.csv'
    cat(ex4_path)
    p(pd.read_csv(ex4_path, skiprows=[0,2,3]))

    p('N/A------------------')
    ex5_path = study.DATA_DIR + '/ch06/ex5.csv'
    cat(ex5_path)
    result = pd.read_csv(ex5_path)
    p(result)
    p(pd.isnull(result))
    result = pd.read_csv(ex5_path, na_values=['NULL', '12']) # 12 is NA
    p(result)

    p('N/A dict------------------')
    sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
    p(sentinels)
    p(pd.read_csv(ex5_path, na_values=sentinels))

    p('6.1.1 read data chunk size---------------------')
    ex6_path = study.DATA_DIR + '/ch06/ex6.csv'
    p(pd.read_csv(ex6_path).count())
    p(pd.read_csv(ex6_path, nrows=5))
    chunker = pd.read_csv(ex6_path, chunksize=1000)
    p(chunker)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['key'].value_counts(), fill_value=0)
    tot = tot.order(ascending=False)
    p(tot[:10])

    p('6.1.2 write---------------------')
    data = pd.read_csv(ex5_path)
    p(data)

    ex5_out_path = out_dir + '/ex5_out.csv'
    data.to_csv(ex5_out_path)
    cat(ex5_path)

    data.to_csv(sys.stdout, index=False, header=False)
    print ''
    data.to_csv(sys.stdout, index=False, cols=list('abc'))
    print ''

    p('Series--------------')
    tseries_out_path = out_dir + '/tseries_out.csv'
    dates = pd.date_range('1/1/2000', periods=7)
    ts = Series(np.arange(7), index=dates)
    ts.to_csv(tseries_out_path)
    cat(tseries_out_path)
    p(Series.from_csv(tseries_out_path, parse_dates=True))

    p('6.1.3 csv-------------------------')
    ex7_path = study.DATA_DIR + '/ch06/ex7.csv'
    cat(ex7_path)
    f = open(ex7_path)
    reader = csv.reader(f)
    for line in reader:
        print line
    lines = list(csv.reader(open(ex7_path)))
    header, values = lines[0], lines[1:]
    data_dict = {h: v for h,v in zip(header, zip(*values))}
    p(data_dict)

    my_data_out_path = out_dir + '/mydata.csv'
    with open(my_data_out_path, 'w') as fp:
        writer = csv.writer(fp, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))
    cat(my_data_out_path)

    p('6.1.4 JSON-------------------------')
    obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
             {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
    result = json.loads(obj)
    p(result)
    asjson = json.dumps(result)
    p(asjson)
    siblings = DataFrame(result['siblings'], columns=['name', 'age'])
    p(siblings)

    p('6.1.4 XML/HTML Web Scraping-------------------------')
    url = '' #'http://finance.yahoo.com/q/op?s=AAPL+Options'
    if url != '':
        parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
        doc = parsed.getroot()
        p([lnk.get('href') for lnk in doc.findall('.//a')][-10:])

        tables = doc.findall('.//table')
        p(parse_options_data(tables[9])[:5])
        p(parse_options_data(tables[13])[:5])

    p('6.1.5 Read XML-------------------------')
    xml_path = out_dir + '/Performance_MNR.xml'
    xml_content ="""
<INDICATOR>
    <INDICATOR_SEQ>373889</INDICATOR_SEQ>
    <PARENT_SEQ></PARENT_SEQ>
    <AGENCY_NAME>MEtro-North Railroad</AGENCY_NAME>
    <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
    <DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION>
    <PERIOD_YEAR>2011</PERIOD_YEAR>
    <PERIOD_MONTH>12</PERIOD_MONTH>
    <CATEGORY>Service Indicators</CATEGORY>
    <FREQUENCY>M</FREQUENCY>
    <DESIRED_CHANGE>U</DESIRED_CHANGE>
    <INDICATOR_UNIT>%</INDICATOR_UNIT>
    <DECIMAL_PLACES>1</DECIMAL_PLACES>
    <YTD_TARGET>97.00</YTD_TARGET>
    <YTD_ACTUAL></YTD_ACTUAL>
    <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
    <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
"""
    if not os.path.exists(xml_path):
        with open(xml_path, 'w') as f:
            f.write(xml_content)
    parsed = objectify.parse(open(xml_path))
    root = parsed.getroot()
    data = []
    skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
                   'DESIRED_SEQ', 'DECIMAL_PLACES']
    p(dir(root))
    for elt in root: # .INDICATOR:
        el_data = {}
        for child in elt.getchildren():
            if child.tag in skip_fields:
                continue
            el_data[child.tag] = child.pyval
        data.append(el_data)
    perf = DataFrame(data)
    p(perf)

    tag = '<a href="http://google.com">Google</a>'
    root = objectify.parse(StringIO.StringIO(tag)).getroot()
    p(root)
    p(root.get('href'))
    p(root.text)
Example #22
 n_file = len(idxs_file)
 n_process = 12
 
 arg_lists = list(zip(idxs_file, [path] * n_file, [True] * n_file))
 
 # Parallel run
 start_time = time.time()
 with Pool(processes = n_process) as pool:
     results = pool.map(analyze_file, arg_lists)
     
     # Reduce the results
     count_fare = Series()
     mat_reg1_XX_XY = np.zeros((2, 3))
     mat_reg2_XX_XY = np.zeros((3, 4))
     for result in results:
         count_fare = count_fare.add(result[0], fill_value = 0)
         mat_reg1_XX_XY += result[1]
         mat_reg2_XX_XY += result[2]
     
     # Compute the deciles    
     cdf = np.cumsum(count_fare) / np.sum(count_fare)
     deciles = [cdf[cdf >= p].index[0] for p in np.arange(0, 1.05, 0.1)]
     
     # Solve the regressions
     coeff1 = np.linalg.solve(mat_reg1_XX_XY[:, 0:2], mat_reg1_XX_XY[:, 2])
     coeff2 = np.linalg.solve(mat_reg2_XX_XY[:, 0:3], mat_reg2_XX_XY[:, 3])
     
     print("Deciles of the total amount less toll:")
     print(deciles)
     print("Linear model of the total amount less the tolls versus trip time:")
     print(coeff1)
Example #23
student


■ Arithmetic between Series
obj1 = Series([10,5,3,7],index=['a','b','c','d'])
obj2 = Series([2,4,6,8,10], index=['a','b','c','d','e'])

#Series arithmetic
obj1 *100

#addition
#Series are combined by aligning on index labels.
obj1 + obj2

#when combining Series, labels missing on one side are treated as 0
obj1.add(obj2, fill_value=0)

# subtraction
obj1-obj2
obj1.sub(obj2, fill_value=0)

# multiplication
obj1*obj2
obj1.mul(obj2, fill_value=1)

# division
obj1/obj2
obj1.div(obj2, fill_value=1)

■ DataFrame arithmetic: operations are aligned on the index
df1 = DataFrame(np.arange(6).reshape(2,3), 
Example #24
print(x)
print(y)
print(x + y)
'''
      A     B     C
a   0.0   2.0   4.0
b   6.0   8.0  10.0
c  12.0  14.0  16.0
d   NaN   NaN   NaN
'''
print('fill the non-overlapping parts of x/y, not the NaN in the result')
print(x.add(y, fill_value=0))  # x is unchanged
'''

      A     B     C
a   0.0   2.0   4.0
b   6.0   8.0  10.0
c  12.0  14.0  16.0
d   9.0  10.0  11.0
'''

print('DataFrame and Series arithmetic: row-wise operation')
frame = DataFrame(numpy.arange(9).reshape((3, 3)),
                  columns=['A', 'B', 'C'],
                  index=['a', 'b', 'c'])
series = frame.ix[0]
Example #25
#use dict to create Series
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3

states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4

pd.isnull(obj4)
pd.notnull(obj4)

#auto align
obj3 + obj4
obj3.add(obj4, fill_value = 0)

#DataFrame=====================================================================
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data) #can set column and index
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])

#selecting by columns, return a Series, so you can index the Series again
frame['state']
frame[['year','state']]
frame[[0]]
frame[[0,2,1]]
#selecting by rows or index
Example #26
#   u'New Lilianland' u'Iowa' 76517L 91000L 120000L 35000L]]
print("\n")
print(matrix[:,-3:])
print("\n")
print(matrix[:,-3:].sum(axis=1))

print("\n=======================================================")
print("Sries_operation")
print("=======================================================")
s1 = Series(range(1,6), index=list("abced"))
print(s1)
print("\n")
s2 = Series(range(5,11), index=list("bcedef"))
print(s2)
print("\n")
print(s1.add(s2))
print("\n")
print(s1+s2) # same result as above
# operations are aligned on the index
# labels that do not overlap become NaN

print("\n=======================================================")
print("Dataframe_operation")
print("=======================================================")
df1 = DataFrame(np.arange(9).reshape(3,3), columns=list("abc"))
print(df1)
print("\n")
df2 = DataFrame(np.arange(16).reshape(4,4), columns=list("abcd"))
print(df2)
print("\n")
print(df1+df2)
Example #27
s1 = s + 10  # add 10 to every element of the series
print(s1)

s2 = Series(data=np.random.randint(0, 100, size=5))
print(s2)
print(s)
print(s + s2)  # added element-wise by matching index; labels that do not match return NaN!

# 4.2 Arithmetic when the indexes do not match
s1 = Series(np.random.randint(0, 150, size=4),
            index=["A", "B", "C", "Sara"],
            name="数学")
s2 = Series(data=np.random.randint(0, 150, size=5),
            index=["张三", "李四", "Sara", "Lisa", "Machel"])
print(s1 + s2)
s1.add(s2)
s1.add(s2, fill_value=0)  # fill NaN with 0 automatically before adding

# 4.3 Other add/subtract/multiply/divide methods
s.add(20)
s.subtract(20)
s.multiply(2)
s.divide(2)

# 4.4 To keep all index labels, use the .add() method
np.full((2, 5), fill_value=10)
s.add(s2, fill_value=0)  # the non-matching labels are filled with 0, so no NaN is returned!

# -*- coding: utf-8 -*-
"""
5. Fourier example
Example #28
# print b
# print c
# print d
# print e
# print g
# print h
# print i

# get the number of rows
len(f.index)



# Arithmetic (DataFrame works the same way)
s1 = Series(np.arange(10,20),index=np.arange(0,10))
s2 = Series(np.arange(50,60),index=np.arange(5,15))
s3 = s1 + s2
# fill in values that are missing on one side
s4 = s1.add(s2,fill_value = 0)
# print s3
# print s4

# Arithmetic between a Series and a DataFrame
s1 = f.ix[0,:]
# each row of the DataFrame subtracts the matching values of the series by index
f1 = f - s1
# print f1
# to operate column-wise, specify the axis
s1 = f.ix[:,0]
f1 = f.add(s1,axis = 0)
print f1
Example #29
def get_monthly_return1(date_index):
    this_month = d4[d4.date == dates.loc[date_index]]
    next_month = d4[d4.date == dates.loc[date_index + 1]]
    temp = pd.merge(next_month,
                    this_month,
                    how='inner',
                    left_on='PERMNO',
                    right_on='PERMNO',
                    suffixes=('_n', '_t'))
    tickers = get_tickers1(temp)
    return get_value_weighted_return(temp, tickers)


dates = Series(d4.date.unique())
small_mon = Series(dates.index[:-1]).map(get_monthly_return1)
small = np.cumprod(small_mon.add(1))
"""## 2. Top 35% B/M ratio"""


def get_tickers2(df):
    number = int(round(len(df.permno.drop_duplicates()) * 0.35))
    bm = df.bm.sort_values(ascending=False)[:number]
    tickers = df.permno.loc[bm.index]
    return tickers


def get_monthly_return2(date_index):
    this_month = d4[d4.date == dates.loc[date_index]]
    next_month = d4[d4.date == dates.loc[date_index + 1]]
    bm = d1[d1.public_date == dates.loc[date_index]]
    temp = pd.merge(next_month,
Example #30
#!/usr/bin/env python
# encoding=utf-8

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# Read a file in chunks
# When working with large files, or when extracting a set of parameters from a large file for later processing, you can read a small piece of the file or iterate over it chunk by chunk

result = pd.read_csv('ex6.csv')
# nrows reads only the first few rows
result_part = pd.read_csv('ex6.csv', nrows=5)
print result_part

# to read the file chunk by chunk, set chunksize (number of rows)
chunker = pd.read_csv('ex6.csv', chunksize=1000)
# chunker is a TextFileReader
print chunker
tot = Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
# descending order; order() has been renamed to sort_values()
tot = tot.sort_values(ascending=False)
print tot
print tot[:10]
# print chunker.get_chunk(500)
Example #31
# Arithmetic and data alignment
x = DataFrame(n.arange(9.).reshape((3, 3)),
              index=['a', 'b', 'c'],
              columns=['A', 'B', 'C'])
y = DataFrame(n.arange(12).reshape((4, 3)),
              index=['a', 'b', 'c', 'd'],
              columns=['A', 'B', 'C'])
x + y
# result  non-overlapping labels become NaN, overlapping labels are combined element-wise
#       A     B     C
# a   0.0   2.0   4.0
# b   6.0   8.0  10.0
# c  12.0  14.0  16.0
# d   NaN   NaN   NaN

x.add(y, fill_value=0)
# result  the fill is applied to the non-overlapping parts of x/y, not to NaN in the result
#       A     B     C
# a   0.0   2.0   4.0
# b   6.0   8.0  10.0
# c  12.0  14.0  16.0
# d   9.0  10.0  11.0

frame = DataFrame(n.arange(9).reshape((3, 3)),
                  index=['a', 'b', 'c'],
                  columns=['A', 'B', 'C'])
series = frame.ix[0]

frame - series
# result  broadcasting is row-wise by default
#    A  B  C
Example #32

# 4. Arithmetic and data alignment
if __name__ == '__main__':
  print('DataFrame arithmetic: non-overlapping labels become NaN, overlapping labels are combined element-wise:')
  x = DataFrame(numpy.arange(9.).reshape((3,3)),
                columns = ['A','B','C'],
                index = ['a','b','c'])
  y = DataFrame(numpy.arange(12).reshape((4,3)),
                columns = ['A','B','C'],
                index = ['a','b','c','d'])
  print(x)
  print(y)
  print(x + y)
  print('fill the non-overlapping parts of x/y, not the NaN in the result:')
  print(x.add(y,fill_value = 0))                                                  # x is unchanged

  print('DataFrame and Series arithmetic: broadcast across each row/column:')
  frame = DataFrame(numpy.arange(9).reshape((3,3)),
                    columns = ['A','B','C'],
                    index = ['a','b','c'])
  series = frame.ix[0]                                                            # first row of frame
  print(frame)
  print(series)
  print(frame - series)                                                           # each row of frame minus series

  series2 = Series(range(4),index = ['A','B','C','D'])
  print(frame + series2)                                                          # row-wise broadcast; missing columns become NaN

  series3 = frame.A                                                               # column A of frame
  print(frame.sub(series3,axis = 0))                                              # column-wise broadcast
Example #33
class MySeries:
    def __init__(self, *args, **kwargs):
        self.x = Series(*args, **kwargs)
        self.values = self.x.values
        self.index = self.x.index
    
    def rolling_mean(self, *args, **kwargs):
        return MySeries(pd.rolling_mean(self.x, *args, **kwargs))

    def rolling_count(self, *args, **kwargs):
        return MySeries(pd.rolling_count(self.x, *args, **kwargs))

    def rolling_sum(self, *args, **kwargs):
        return MySeries(pd.rolling_sum(self.x, *args, **kwargs))

    def rolling_median(self, *args, **kwargs):
        return MySeries(pd.rolling_median(self.x, *args, **kwargs))
        
    def rolling_min(self, *args, **kwargs):
        return MySeries(pd.rolling_min(self.x, *args, **kwargs))

    def rolling_max(self, *args, **kwargs):
        return MySeries(pd.rolling_max(self.x, *args, **kwargs))

    def rolling_std(self, *args, **kwargs):
        return MySeries(pd.rolling_std(self.x, *args, **kwargs))

    def rolling_var(self, *args, **kwargs):
        return MySeries(pd.rolling_var(self.x, *args, **kwargs))

    def rolling_skew(self, *args, **kwargs):
        return MySeries(pd.rolling_skew(self.x, *args, **kwargs))

    def rolling_kurtosis(self, *args, **kwargs):
        return MySeries(pd.rolling_kurtosis(self.x, *args, **kwargs))

    def rolling_window(self, *args, **kwargs):
        return MySeries(pd.rolling_window(self.x, *args, **kwargs))

    def cumprod(self, *args, **kwargs):
        return MySeries(self.x.cumprod(*args, **kwargs))

    def cumsum(self, *args, **kwargs):
        return MySeries(self.x.cumsum(*args, **kwargs))

    def diff(self, *args, **kwargs):
        return MySeries(self.x.diff(*args, **kwargs))

    def div(self, *args, **kwargs):
        return MySeries(self.x.div(*args, **kwargs))

    def mul(self, *args, **kwargs):
        return MySeries(self.x.mul(*args, **kwargs))

    def add(self, *args, **kwargs):
        return MySeries(self.x.add(*args, **kwargs))

    def dropna(self, *args, **kwargs):
        return MySeries(self.x.dropna(*args, **kwargs))
    
    def fillna(self, *args, **kwargs):
        return MySeries(self.x.fillna(*args, **kwargs))

    def floordiv(self, *args, **kwargs):
        return MySeries(self.x.floordiv(*args, **kwargs))

    def mod(self, *args, **kwargs):
        return MySeries(self.x.mod(*args, **kwargs))

    def nlargest(self, *args, **kwargs):
        return MySeries(self.x.nlargest(*args, **kwargs))

    def nonzero(self, *args, **kwargs):
        return MySeries(self.x.nonzero(*args, **kwargs))

    def nsmallest(self, *args, **kwargs):
        return MySeries(self.x.nsmallest(*args, **kwargs))

    def pow(self, *args, **kwargs):
        return MySeries(self.x.pow(*args, **kwargs))

    def rank(self, *args, **kwargs):
        return MySeries(self.x.rank(*args, **kwargs))

    def round(self, *args, **kwargs):
        return MySeries(self.x.round(*args, **kwargs))

    def shift(self, *args, **kwargs):
        return MySeries(self.x.shift(*args, **kwargs))

    def sub(self, *args, **kwargs):
        return MySeries(self.x.sub(*args, **kwargs))

    def abs(self, *args, **kwargs):
        return MySeries(self.x.abs(*args, **kwargs))

    def clip(self, *args, **kwargs):
        return MySeries(self.x.clip(*args, **kwargs))

    def clip_lower(self, *args, **kwargs):
        return MySeries(self.x.clip_lower(*args, **kwargs))

    def clip_upper(self, *args, **kwargs):
        return MySeries(self.x.clip_upper(*args, **kwargs))
    
    def interpolate(self, *args, **kwargs):
        return MySeries(self.x.interpolate(*args, **kwargs))

    def resample(self, *args, **kwargs):
        return MySeries(self.x.resample(*args, **kwargs))
        
    def replace(self, *args, **kwargs):
        return MySeries(self.x.replace(*args, **kwargs))
Example #34
# Read in chunks by setting chunksize (number of rows)
# get_chunk can also be used to read a block of arbitrary size
chunker = pd.read_csv('pydata-book-2nd-edition/examples/ex6.csv',
                      chunksize=1000)
chunker

# In[25]:

# this object can be processed iteratively
tot = Series([])

# In[26]:

for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
tot

# In[27]:

# Write the data out in text format
data = pd.read_csv('pydata-book-2nd-edition/examples/ex5.csv')
data

# In[46]:

# DataFrame.to_csv writes the data to a comma-separated file
data.at[0, 'something'] = 'yuki'
data.to_csv('pydata-book-2nd-edition/examples/ex6_out.csv')

# In[29]:
Example #35
print(s1)

# a   -2.1
# c    3.6
# e   -1.5
# f    4.0
# g    3.1
# dtype: float64
print(s2)

# When adding the two, a label present on both sides gets the sum; if it is missing on either side the result is NaN
# a    5.2
# c    1.1
# d    NaN
# e    0.0
# f    NaN
# g    NaN
# dtype: float64
print(s1 + s2)

# To fill the non-overlapping positions, use the fill_value parameter of the add method
# positions present in only one of the series then keep that series' value
# a    5.2
# c    1.1
# d    3.4
# e    0.0
# f    4.0
# g    3.1
# dtype: float64
print(s1.add(s2, fill_value=0))
Example #36
print(df.loc[:'2월', ['서초']])
print(df.loc[:'2월', ['서초', '강남']])

print('\niloc')
print(df.iloc[2])
print(df.iloc[2, :])  # all columns of row 2
print(df.iloc[:3, 2])
print(df.iloc[:3, 1:3])  # columns 1 up to (but not including) 3

print('\n\nArithmetic------------')
s1 = Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = Series([4, 5, 6, 7], index=['a', 'b', 'd', 'c'])
print(s1)
print(s2)
print(s1 + s2)
print(s1.add(s2))  # works where index labels match (the indexes must align)

print()
df1 = DataFrame(np.arange(9.).reshape(3, 3),
                columns=list('kbs'),
                index=['서울', '인천', '수원'])
print(df1)
df2 = DataFrame(np.arange(12.).reshape(4, 3),
                columns=list('kbs'),
                index=['서울', '인천', '일산', '수원'])
print(df2)

print()
print(df1 + df2)  # the operator cannot take options
print(df1.add(df2))  # the method can take options
print(df1.add(df2, fill_value=0))  # the method can take options such as fill_value
Example #37
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('ex5.csv', na_values=sentinels)

### reading text files in pieces

result = pd.read_csv('ex6.csv')
result

pd.read_csv('ex6.csv', nrows=5)

chunker = pd.read_csv('ex6.csv', chunksize=10)
chunker

tot = Series([])
for chunk in chunker:
    tot = tot.add(chunk['key'].value_counts(), fill_value=0)

tot

### Writing out data -- just like read in no examples
### stop for today

### JSON
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
              {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
Example #38
def practice_one():
    obj = Series([4, 7, -5, 3])
    '''
    pandas parsing functions
        read_csv        load delimited data from a file, URL or file-like object; default delimiter is a comma
        read_table      load delimited data from a file, URL or file-like object; default delimiter is a tab
        read_fwf        read data in fixed-width column format (no delimiter)
        read_clipboard  read data from the clipboard; a clipboard version of read_table
    '''
    '''
    read_csv/read_table parameters:
        path            string giving a filesystem location, URL or file-like object
        sep, delimiter  character sequence or regular expression used to split the fields of each row
        header          row number to use as the column names; default 0 (the first row); set to None if there is no header
        index_col       column number(s) or name(s) to use as the row index
        names           list of column names for the result
        skiprows        number of rows to skip (from the start of the file), or a list of row numbers to skip (starting at 0)
        na_values       set of values to replace with NA
        comment         character(s) used to split comments off the end of lines
        parse_dates     parse data as dates, default False; if True, attempt to parse all columns; a set of column numbers or names can also be given
        keep_date_col   if joining multiple columns to parse dates, keep the joined columns; default False
        converters      dict mapping column names to functions
        dayfirst        when parsing ambiguous dates, treat them as international format
        date_parser     function used to parse dates
        nrows           number of rows to read
        iterator        return a TextParser for reading the file piece by piece
        chunksize       size of the file chunks (for iteration)
        skip_footer     number of rows to ignore at the end of the file
        verbose         print various parser output information
        encoding        text encoding for unicode
        squeeze         if the parsed data contains only one column, return a Series
        thousands       thousands separator, e.g. ',' or '.'
    '''

    # Read a text file in chunks
    '''
    folder: ch06    file: ex6.csv
    '''
    # when you only want to read a small piece of the file, or iterate over it
    pd.read_csv('ch06/ex6.csv')
    # to read only a few rows, specify nrows
    pd.read_csv('ch06/ex6.csv', nrows=5)
    # to read the file in chunks, set chunksize (rows); returns a TextParser object
    chunker = pd.read_csv('ch06/ex6.csv', chunksize=10)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['message'].value_counts(), fill_value=0)
        # aggregate the counts of the message column
    tot = tot.order(ascending=False)

    # Write the data out in text format
    data = pd.read_csv('ch06/ex5.csv')
    data.to_csv('ch06/out.csv')  # write the data to a comma-separated file
    data.to_csv(sys.stdout, sep='|')  # use | as the delimiter
    data.to_csv(sys.stdout, na_rep='NULL')  # represent missing values as 'NULL'
    data.to_csv(sys.stdout, index=False, header=False)
    data.to_csv(sys.stdout, index=False, cols=['a', 'b', 'c'])

    # Working with delimited formats by hand
    import csv
    f = open('ch06/ex7.csv')
    reader = csv.reader(f)
    for line in reader:
        print(line)

    lines = list(csv.reader(open('ch06/ex7.csv')))
    header, values = lines[0], lines[1:]  # split header and data rows
    data_dict = {h: v for h, v in zip(header, zip(*values))}

    # Define a subclass of csv.Dialect describing the format
    class my_dialect(csv.Dialect):
        lineterminator = '\n'
        delimiter = ';'
        quotechar = '"'

    reader = csv.reader(f, dialect=my_dialect)
    reader = csv.reader(f, delimiter='|')  # pass the option directly instead of defining a subclass
    '''
    csv.Dialect attributes
        delimiter       one-character string used to separate fields, default ','
        lineterminator  line terminator for writing, default '\r\n'
        quotechar       quote character for fields with special characters, default '"'
        quoting         quoting convention; options include csv.QUOTE_ALL (quote all fields),
                        csv.QUOTE_MINIMAL (only quote fields with special characters such as the delimiter),
                        csv.QUOTE_NONNUMERIC and csv.QUOTE_NONE (no quoting); default QUOTE_MINIMAL
        skipinitialspace    ignore whitespace after each delimiter, default False
        doublequote     how to handle the quote character inside a field; if True, it is doubled
        escapechar      string used to escape the delimiter, disabled by default
    '''
    with open('mydata.csv', 'w') as f:
        writer = csv.writer(f, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))

    # JSON data
    obj = """
    {"name": "Wes",
     "places_lived": ["United States", "Spain", "Germany"],
     "pet": null,
     "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
                    {"name": "Katie", "age": 33, "pet": "Cisco"}]
    }
    """
    import json
    result = json.loads(obj)  # convert the JSON string into Python objects
    json.dumps(result)  # convert a Python object back to JSON
    siblings = DataFrame(result['siblings'],
                         columns=['name', 'age'])  # convert the JSON objects to a DataFrame

    # XML and HTML: web scraping
    from lxml.html import parse
    from urllib2 import urlopen  # the urllib2 module may not be importable
    parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
    doc = parsed.getroot()
    links = doc.findall('.//a')  # query
    links[28].get('href')  # get the url
    links[28].text_content()  # get the text
    urls = [lnk.get('href') for lnk in doc.findall('.//a')]  # get every URL in the document

    tables = doc.findall('.//table')
    calls = tables[9]
    puts = tables[13]
    rows = calls.findall('.//tr')

    def _unpack(row, kind='td'):
        elts = row.findall('.//%s' % kind)
        return [val.text_content() for val in elts]

    _unpack(rows[1], kind='th')
    _unpack(rows[1], kind='td')
    from pandas.io.parsers import TextParser

    def parse_options_data(table):
        rows = table.findall('.//tr')
        header = _unpack(rows[0], kind='th')
        data = [_unpack(r) for r in rows[1:]]
        return TextParser(data, names=header).get_chunk()

    parse_options_data(calls)
    parse_options_data(puts)

    pass
Example #39
    def _series_add(previous_result: pd.Series, new_result: pd.Series):
        """Reducing function for adding up the results across chunks.

        Equivalent to ``lambda a,b: a+b`` except takes advantage of
        ``fill_value`` in pd.Series.add"""
        return previous_result.add(new_result, fill_value=0)
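A hedged usage sketch of the reducer above, assuming _series_add is available as a plain function in scope: fold it over per-chunk value counts with functools.reduce (the file name and column name are hypothetical):

import functools
import pandas as pd

chunks = pd.read_csv("data.csv", chunksize=1000)               # hypothetical input file
per_chunk = (chunk["key"].value_counts() for chunk in chunks)  # hypothetical column name
total_counts = functools.reduce(_series_add, per_chunk)        # missing labels treated as 0 in each step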
Example #40
#print tfidf.head(5)
#print idf.size
tfquery = Series()

linea = "Armed Robbery Suspect Arrested w/ Handgun" # QUERY A EVALUAR
linea = linea.upper()
tokens = linea.split()	
for word in tokens:
	if word in tfidf.columns:
		print word
		if word in tfquery:
			tfquery[word] = tfquery[word] + 1
		else :
			tfidf[word] = 0
			test = Series({word : 1})
			tfquery = tfquery.add(test, fill_value=0)
tfquery = tfquery/len(tokens)
tfquery = tfquery.multiply(idf,fill_value = 0)
print "TERMINO TFIDF"

vectorTFIDF = tfquery.as_matrix()
distancias = []
for i , f in tfidf.iterrows():
	#distancias.append(dist(vectorTFIDF,f.as_matrix()))
	distancias.append(1 - spatial.distance.cosine(f.as_matrix(), vectorTFIDF))
#b = numpy.argsort(distancias)
distancias = sorted(distancias,reverse=True)
print distancias[0:100]