def commentsForStory(objectId, log):
    """Download the comments of one story and write them to a CSV file.

    The item is requested from the Algolia Hacker News API, converted into a
    comment tree with ``commentTree`` and flattened with
    ``preorderTraversalIgnoreRoot``.  The outcome is recorded in ``log``
    under the key ``str(objectId)``.

    Args:
        objectId: Integer id of the story; it is formatted with ``%d``.
        log: Mutable mapping updated in place with either the error message,
            ``NO_COMMENTS`` or ``SUCCESS``.

    Returns:
        None.  On success the comments are written to
        ``comments-by-story/comments-<objectId>.csv``.
    """
    try:
        url = 'https://hn.algolia.com/api/v1/items/%d' % (objectId)

        req = urllib.request.Request(url)
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(req) as response:
            data = json.loads(response.read().decode("utf-8"))
    except (KeyboardInterrupt, SystemExit):
        raise
    except IOError as e:
        # Only HTTPError carries ``code``; a plain URLError or a socket-level
        # error does not, so reading ``e.code`` unconditionally would raise
        # AttributeError from inside this handler.
        code = getattr(e, 'code', None)
        reason = getattr(e, 'reason', e)
        if code is None:
            message = str(reason)
        else:
            message = '%d: %s' % (code, reason)
        log[str(objectId)] = message
        print(message)
        return

    tree = commentTree(data)
    commentRecords = preorderTraversalIgnoreRoot(tree)

    if len(commentRecords) == 0:
        log[str(objectId)] = NO_COMMENTS
        return

    columns = ['id', 'author', 'text', 'points', 'created_at', 'parent_id', 'story_id']
    df = DataFrame(columns=columns, index=numpy.arange(len(commentRecords)))
    for index, comment in enumerate(commentRecords):
        # ``.ix`` was removed from pandas; the index holds integer labels,
        # so label-based ``.loc`` addresses the same row.
        df.loc[index] = comment

    df.to_csv("comments-by-story/comments-%d.csv" % objectId, encoding='utf-8', index=False)
    log[str(objectId)] = SUCCESS
    def chi(self, customattribute):
        """
        Compute a chi-square test of ``customattribute`` against the class attribute.

        For every frame in ``self.chunks`` the unique ``studentID`` values are
        collected per (attribute value, class value) pair and per class value,
        and merged across chunks.  The observed table holds the number of
        unique students per pair; the expected table is each attribute row's
        total multiplied by the overall share of each class.

        Args:
            customattribute: Name of the column to test against
                ``self.classAttribute``.

        Returns:
            A tuple ``(result, attributeObs)`` where ``result`` is the value
            returned by ``chisquare`` for the stacked observed and expected
            tables and ``attributeObs`` is the observed-count ``DataFrame``
            (rows: attribute values, columns: class values).
        """
        attributeDict = dict()
        classAttributeDict = dict()
        for piece in self.chunks:
            # ``Series.iteritems`` was removed from pandas; ``items`` yields
            # the same (label, value) pairs.
            pairGroups = piece.groupby([customattribute, self.classAttribute]).studentID.unique()
            for pairKey, arrays in pairGroups.items():
                seen = attributeDict.get(pairKey, np.array([]))
                attributeDict[pairKey] = np.union1d(seen, arrays)

            classGroups = piece.groupby(self.classAttribute).studentID.unique()
            for classAttribute, arrays in classGroups.items():
                seen = classAttributeDict.get(classAttribute, np.array([]))
                classAttributeDict[classAttribute] = np.union1d(seen, arrays)

        # Share of each class value among all unique students.
        classSeries = Series(classAttributeDict).apply(len)
        classSeries = classSeries / classSeries.sum()

        # Observed counts for every (attribute value, class value) pair.
        attributeObs = Series(attributeDict).apply(len).unstack(fill_value=0)

        # Expected counts: row total times class share.  Built numerically in
        # one step instead of filling an object-dtype frame row by row with
        # the removed ``.ix`` indexer.
        rowTotals = attributeObs.sum(axis=1)
        proportions = classSeries.reindex(attributeObs.columns)
        attributeExp = DataFrame(
            np.outer(rowTotals.values, proportions.values),
            index=attributeObs.index,
            columns=attributeObs.columns,
        ).fillna(0)

        # NOTE(review): chisquare is called with its default degrees of
        # freedom (number of cells - 1) - confirm this is what is wanted for a
        # contingency table.
        return chisquare(attributeObs.stack(), attributeExp.stack()), attributeObs
Example #3
0
    def _parse(cls, body):
        """Parse a wrapped JSON quote response into a ``DataFrame``.

        ``cls._PREFIX`` and ``cls._SUFFIX`` are stripped from the ends of
        ``body`` when present, the remainder is decoded as JSON, and one row
        is produced per entry of the decoded mapping.

        Args:
            body: Response text containing a JSON object of stock records.

        Returns:
            A ``DataFrame`` with columns ``TRADE_DETAIL_COLUMNS``, indexed by
            the lower-cased ``type`` followed by ``symbol`` of each record.
        """
        # Slice the wrapper off the ends only; ``str.replace`` would also
        # remove any occurrence of the same text inside the payload.
        if body.startswith(cls._PREFIX):
            body = body[len(cls._PREFIX):]
        if body.endswith(cls._SUFFIX):
            body = body[:len(body) - len(cls._SUFFIX)]

        stock_map = json.loads(body)

        # Field order of one output row: quote summary, then five bid levels
        # and five ask levels as (volume, price) pairs.
        fields = ["name", "open", "yestclose", "price", "high", "low",
                  "volume", "turnover"]
        for side in ("bid", "ask"):
            for level in range(1, 6):
                fields.append("{}vol{}".format(side, level))
                fields.append("{}{}".format(side, level))

        stocks = DataFrame(columns=TRADE_DETAIL_COLUMNS)
        for stock in stock_map.values():
            s_date, s_time = stock["time"].split(" ")
            data = [stock[field] for field in fields]
            data.append(int_date.to_int_date(s_date))
            data.append(s_time)
            index = "{}{}".format(stock["type"].lower(), stock["symbol"])
            # ``.ix`` was removed from pandas; ``.loc`` appends the labelled row.
            stocks.loc[index] = data
        return stocks
 def _parse(cls, body):
     """Parse ``var hq_str_<code>_i="..."`` statements into a ``DataFrame``.

     ``body`` is split on ``;``; blank statements are skipped.  For each
     statement the first 32 comma-separated fields are kept, and fields
     2-10 and 12 onwards are converted to ``float``.

     Args:
         body: Response text made of ``;``-separated statements.

     Returns:
         A ``DataFrame`` with columns ``SINA_STOCK_INFO_COLUMNS``, indexed
         by the code captured from each statement.

     Raises:
         ValueError: If a non-empty statement does not match the expected
             pattern, or a numeric field cannot be converted to ``float``.
     """
     ret = DataFrame(columns=SINA_STOCK_INFO_COLUMNS)
     for stock in body.split(';'):
         stock = stock.strip()
         if not stock:
             continue
         m = re.match(r'var hq_str_(.*)_i="(.+)"', stock)
         if m is None:
             raise ValueError("response text is not valid: {}"
                              .format(stock))
         index, data = m.group(1, 2)
         data_array = data.split(',')[:32]
         # Field 11 is deliberately left as text, as in the original slices.
         data_array[2:11] = [float(value) for value in data_array[2:11]]
         data_array[12:] = [float(value) for value in data_array[12:]]
         # ``.ix`` was removed from pandas; ``.loc`` appends the labelled row.
         ret.loc[index] = data_array
     return ret
Example #5
0
 def _parse(cls, body):
     """Parse ``var hq_str_<code>="..."`` statements into a ``DataFrame``.

     ``body`` is split on ``;``; blank statements are skipped, and a
     statement whose quoted data is empty is logged and skipped.  Each row
     is built from the first 32 comma-separated fields: field 0 as text,
     fields 1-5 and 8-29 as ``float``, field 30 through
     ``int_date.to_int_date`` and field 31 as text.  Fields 6 and 7 are
     not used.

     Args:
         body: Response text made of ``;``-separated statements.

     Returns:
         A ``DataFrame`` with columns ``TRADE_DETAIL_COLUMNS``, indexed by
         the code captured from each statement.

     Raises:
         ValueError: If a non-empty statement does not match the expected
             pattern, or a numeric field cannot be converted to ``float``.
     """
     ret = DataFrame(columns=TRADE_DETAIL_COLUMNS)
     for stock in body.split(';'):
         stock = stock.strip()
         if not stock:
             continue
         m = re.match(r'var hq_str_(.*)="(.*)"', stock)
         if m is None:
             raise ValueError("response text is not valid: {}"
                              .format(stock))
         index, data = m.group(1, 2)
         if not data:
             log.info("data for stock %s is empty, skip.", index)
             continue
         data_array = data.split(',')[:32]
         result = [data_array[0]]
         result.extend(float(value) for value in data_array[1:6])
         result.extend(float(value) for value in data_array[8:30])
         result.append(int_date.to_int_date(data_array[30]))
         result.append(data_array[31])
         # ``.ix`` was removed from pandas; ``.loc`` appends the labelled row.
         ret.loc[index] = result
     return ret