Ejemplo n.º 1
0
 def __init__(self, region, start_timestamp, end_timestamp, freq="1h"):
     """Record the query window and prepare the posts-collection handle.

     The region, both timestamps and the frequency are stored unchanged on
     the instance; ``days_to_predict`` always starts at 1.
     """
     # Build the database handle first and aim it at the configured
     # database and posts collection before exposing it as ``self._db``.
     mongo = MongoDBInterface()
     mongo.setDB(InstagramConfig.db)
     mongo.setCollection(InstagramConfig.posts_collection)
     self._db = mongo

     self.region = region
     self.freq = freq
     self.start_timestamp, self.end_timestamp = start_timestamp, end_timestamp
     self.days_to_predict = 1
Ejemplo n.º 2
0
class InstagramTimeSeries:
    """Time series of Instagram activity for one region and time window.

    Posts are read from the MongoDB collection configured in
    ``InstagramConfig``, turned into a pandas ``Series`` of per-upload counts
    and resampled to ``freq``.  The smoothing and resampling calls use the
    legacy pandas API (``pandas.ewma``/``ewmstd``/``ewmvar`` and
    ``Series.resample(how=...)``).
    """

    def __init__(self, region, start_timestamp, end_timestamp, freq="1h"):
        """Store the query window and point a MongoDB handle at the posts.

        region          -- value matched against ``region.code``; a falsy
                           value disables the region filter (see rangeQuery)
        start_timestamp -- start of the window, epoch seconds (inclusive)
        end_timestamp   -- end of the window, epoch seconds (exclusive)
        freq            -- resampling rule handed to ``Series.resample``
        """
        self.start_timestamp = start_timestamp
        self.end_timestamp = end_timestamp
        self.region = region
        self._db = MongoDBInterface()
        self._db.setDB(InstagramConfig.db)
        self._db.setCollection(InstagramConfig.posts_collection)
        self.days_to_predict = 1
        self.freq = freq

    def rangeQuery(self, region, startTimestamp, endTimestamp):
        """Return the posts created in [startTimestamp, endTimestamp).

        When ``region`` is truthy the query is additionally restricted to
        documents whose ``region.code`` equals it.  The result of
        ``getAllDocuments`` is sorted by ``created_time`` descending.
        """
        conditions = {"created_time": {"$gte": startTimestamp, "$lt": endTimestamp}}
        if region:
            conditions["region.code"] = region

        return self._db.getAllDocuments(conditions).sort([("created_time", -1)])

    def getRawSeries(self):
        """Return the un-resampled series built by buildTimeSeries().

        The attribute only exists after buildTimeSeries() has been called.
        """
        return self.series

    def buildTimeSeries(self, count_people=True, avoid_flooding=True):
        """Build, store and return the resampled pandas Series.

        count_people = True counts users rather than photos: a further
        upload by the same user is only counted when it comes more than
        5 minutes (300 seconds) after that user's previously counted upload.
        count_people = False counts every photo.

        avoid_flooding is accepted for backward compatibility but is not
        read by this method; the 5-minute window is always applied when
        count_people is True.

        The raw series is kept in ``self.series`` and the resampled one in
        ``self.series2``.  If resampling fails, the error is printed and
        re-raised instead of falling through to a missing or stale
        ``self.series2``.
        """
        window_avoid_flooding = 300  # seconds

        data = []
        photos = self.rangeQuery(self.region, self.start_timestamp, self.end_timestamp)
        for photo_cnt, photo in enumerate(photos, 1):
            data.append({"user": photo["user"], "created_time": photo["created_time"]})
            if photo_cnt % 10000 == 0:
                print(photo_cnt)  # progress output for large result sets
        data.sort(key=lambda x: x["created_time"])
        print(len(data))

        # Sentinel point at the start of the window so the resampled series
        # always covers the whole requested range.
        # NOTE(review): the sentinels carry a count of 1, so the first and
        # last buckets are each one higher than the real data -- confirm
        # that this is intended.
        dates = [datetime.utcfromtimestamp(float(self.start_timestamp))]
        counts = [1]

        user_last_upload = {}  # username -> time of that user's last counted upload
        for photo_json in data:
            created = float(photo_json["created_time"])
            if count_people:
                user = photo_json["user"]["username"]
                last = user_last_upload.get(user)
                if last is not None and created - last <= window_avoid_flooding:
                    # Same user again inside the flooding window: skip it.
                    continue
                user_last_upload[user] = int(photo_json["created_time"])
            dates.append(datetime.utcfromtimestamp(created))
            counts.append(1)

        # Matching sentinel one second before the (exclusive) end of the window.
        dates.append(datetime.utcfromtimestamp(float(self.end_timestamp) - 1))
        counts.append(1)
        self.series = Series(counts, index=dates)

        print(self.series.count())
        try:
            self.series2 = self.series.resample(self.freq, how="sum", label="right")
        except Exception as e:  # e.g. not enough data
            print(e)
            raise
        print(self.series2.count())
        return self.series2

    def smoothSeriesEwma(self, series, span=5.0, adjust=True, halflife=None, min_periods=0, how="mean"):
        """Return ``pandas.ewma`` of ``series`` (exponentially weighted mean).

        The frequency passed to pandas is fixed at "1h"; all other keyword
        arguments are forwarded unchanged.
        """
        return pandas.ewma(
            series,
            com=None,
            span=span,
            halflife=halflife,
            min_periods=min_periods,
            freq="1h",
            adjust=adjust,
            how=how,
            ignore_na=True,
        )

    def smoothSeriesEwmstd(self, series, span=5.0, adjust=True, halflife=None, min_periods=0):
        """Return ``pandas.ewmstd`` of ``series`` (exponentially weighted std)."""
        return pandas.ewmstd(
            series, com=None, span=span, halflife=halflife, min_periods=min_periods, adjust=adjust, ignore_na=True
        )

    def smoothSeriesEwmvar(self, series, span=5.0, adjust=True, halflife=None, min_periods=0):
        """Return ``pandas.ewmvar`` of ``series`` (exponentially weighted variance)."""
        return pandas.ewmvar(
            series, com=None, span=span, halflife=halflife, min_periods=min_periods, adjust=adjust, ignore_na=True
        )

    def dataPrepare(self, serie):
        """Return the training points and the future points to predict.

        e.g. predict for each hour tomorrow how many people will show up at
        Times Square.

        Returns a 4-tuple ``(training, testing, align, converted_align)``:

        training        -- list of (days since the first index entry, value)
                           for every entry of ``serie``
        testing         -- days since the first index entry for each future
                           hourly point after the last index entry
        align           -- the future points as datetimes
        converted_align -- the future points as epoch seconds
                           (``calendar.timegm`` of the UTC time tuple)

        Raises ValueError when ``serie`` has fewer than 3 data points.
        """
        ts = serie
        index = ts.index
        if len(index) < 3:
            raise ValueError("Only %d data points" % (len(index)))
        start_date = index[0]

        # Training is in the format
        # (days from beginning of the timeseries, number of data at that time).
        training = []
        for idx in index:
            elapsed = idx - start_date
            days_diff = elapsed.days + elapsed.seconds / (24 * 3600.0)
            training.append((days_diff, ts[idx]))
        nearest_current_date = index[-1]

        testing = []
        align = []
        converted_align = []
        # NOTE(review): 25 (not 24) hourly points are produced per predicted
        # day -- confirm that the extra point is intended.
        for hour in range(25 * self.days_to_predict):
            next_date = nearest_current_date + timedelta(seconds=3600 * (hour + 1))
            delta = next_date - start_date
            days_from_start = (delta.seconds + delta.days * 86400) / (3600 * 24.0)
            testing.append(days_from_start)
            align.append(next_date)
            converted_align.append(calendar.timegm(next_date.utctimetuple()))

        return training, testing, align, converted_align