Example 1
def update_dictionary(dictionary, indicator_name, indicator_value):
    # lazily create the series the first time an indicator is seen
    try:
        dictionary[indicator_name]
    except KeyError:
        dictionary[indicator_name] = time_series.TimeSeries(
            self.trading_days, [])
    dictionary[indicator_name].data.append(indicator_value)
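The try/except above is the ask-forgiveness idiom for lazily creating a series on first use. A minimal equivalent sketch using dict.setdefault, assuming the same TimeSeries constructor and the enclosing self.trading_days closure:

def update_dictionary(dictionary, indicator_name, indicator_value):
    # setdefault returns the existing series, or inserts the new one first
    series = dictionary.setdefault(
        indicator_name, time_series.TimeSeries(self.trading_days, []))
    series.data.append(indicator_value)

Note that setdefault constructs the default TimeSeries even when the key already exists, so the try/except (or an explicit membership test) is preferable when construction is expensive.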
Example 2
def main():
    path = 'D:\\data\\M3\\M3Other\\N2836.csv'
    data = np.genfromtxt(path)
    print('Data len: {0}'.format(len(data)))
    predict_points = 8

    model = Model()

    ts = tsutils.TimeSeries(data, test_size=predict_points, scaler=processing.StandardScaler())

    x_train, y_train, t_train = ts.train_data(input_window=model.input_window, output_window=model.output_window, expand=True)
    model.train(x_train, y_train, epochs=200)

    #x_test, y_test, t_test = ts.train_data(input_window=model.input_window, output_window=model.output_window)

    ctx = np.expand_dims(ts.get_test_context(model.input_window, expand=True), axis=0)
    y_pred = tsutils.free_run_batch(model.predict, ctx, predict_points, ts, batch_size=1)
    y_true = ts.get_test_data()

    y_pred_flat = ts.inverse_y(np.squeeze(y_pred))
    y_true_flat = ts.inverse_y(np.squeeze(y_true))

    print(metrics.evaluate(y_true_flat, y_pred_flat, metrics=('smape', 'mae', 'umbrae')))

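tsutils.free_run_batch itself is not shown here. A minimal sketch of the idea, with hypothetical names and assuming a one-step-ahead predictor: in a "free run", each prediction is fed back into the input window, so later steps are forecast from earlier forecasts rather than from ground truth.

import numpy as np

def free_run(predict_one_step, context, n_steps):
    # context holds the last input_window observed values
    window = list(context)
    preds = []
    for _ in range(n_steps):
        y = predict_one_step(window)   # scalar one-step forecast
        preds.append(y)
        window = window[1:] + [y]      # slide the window over the forecast
    return np.array(preds)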
Example 3
def do_cast(index_pattern,
            indices,
            years,
            pcthisto,
            outfile,
            time_dim='days',
            time_slice=None):
    """Creates the series objects and iterators, then performs the fore-or-hind cast.
    
     index_pattern : filename pattern which should expect to receive a datetime object
                   : (e.g., "file_pattern_{:%Y}.nc".format(the_date))
     indices       : a list of indices to process in this run
     years         : a slice object with the start and stop years
     pcthisto      : percentile histogram file containing ratios of BA to occurrence
     outfile       : name of output file
     time_dim      : name of the time dimension in the index files
     time_slice    : day-of-year range to include in processing 
     """

    # how to translate a date to the correct file to open.
    idx_series = ts.TimeSeries(index_pattern, time_dim)

    # the years to loop over
    cast_periods = ts.AnnualInterval(years.start, years.stop)

    # subset of days within year to loop over
    if time_slice is None:
        model_periods = ts.IntegerInterval(0, 365)
    else:
        model_periods = ts.IntegerInterval(time_slice.start, time_slice.stop)

    cast(idx_series, indices, cast_periods, model_periods, pcthisto, outfile)
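As a quick illustration of the index_pattern contract from the docstring (pattern and date here are hypothetical, not taken from the surrounding repo):

import datetime

index_pattern = 'indices_{:%Y}.nc'      # assumed: one file per year
the_date = datetime.date(1999, 7, 4)
print(index_pattern.format(the_date))   # -> indices_1999.nc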
Example 4
    def __init__(self, parent_controller):
        super(RPModel, self).__init__()

        self._parent_controller = parent_controller

        self._time_series_loaders = []

        self._active_file = None

        # Empty/non-initialized TimeSeries objects
        self._main_ts = ts.TimeSeries(key_parameters=[])
        self._baseline_ts = ts.TimeSeries(
            key_parameters=['baseline_avg_length'])
        self._pos_thresh_ts = ts.TimeSeries(
            key_parameters=['baseline_avg_length', 'trigger_sigma_threshold'])
        self._neg_thresh_ts = ts.TimeSeries(
            key_parameters=['baseline_avg_length', 'trigger_sigma_threshold'])
        self._filtered_ts = ts.TimeSeries(key_parameters=['filter_frequency'])

        self._event_manager = rp_event_manager.RPEventManager()
Example 5
    def key_pitch_related_nodding(self):
        self.pitch_records += [(self.pitch_records[-1][0] + 0.4, 0)]

        # make time series
        pitch_time_series = time_series.TimeSeries(self.pitch_records)

        # take log of the f0 frequency
        pitch_time_series = pitch_time_series.log()

        # zero-mean for non-zero frequencies
        mean = pitch_time_series.mean(lambda time, value: value > 0)
        pitch_time_series = pitch_time_series.map(
            lambda time, value: (time, (value - mean) if value > 0 else value))

        # moving average
        pitch_time_series = pitch_time_series.moving_average(
            0.01, 0.8, 'center')

        # multiply
        pitch_time_series = pitch_time_series.multiply(30)

        # resample
        pitch_time_series = pitch_time_series.resample(0.3)

        result = (0, (0, 0, 0))  # initial pose
        self.neck_angle_records.append(result)

        for record in pitch_time_series:  # pose based on pitch
            time, value = record
            # degrees to radians, with the pitch sign flipped
            result = (time, (-math.radians(value), 0, 0))
            self.neck_angle_records.append(result)

        result = (self.wav_length, (0, 0, 0))  # final pose
        self.neck_angle_records.append(result)
        return self.neck_angle_records
Example 6
    def __init__(self, symbol, start_date, end_date):
        self.symbol = symbol

        # check if in database or add to database
        self.__add(start_date, end_date)

        # fill lists with data and make a times dictionary
        times, open, high, low, close, volume, adj_close = self.__read_db(start_date, end_date)

        # fill fields with time series information
        self.open      = time_series.TimeSeries(times, open)
        self.high      = time_series.TimeSeries(times, high)
        self.low       = time_series.TimeSeries(times, low)
        self.close     = time_series.TimeSeries(times, close)
        self.volume    = time_series.TimeSeries(times, volume)
        self.adj_close = time_series.TimeSeries(times, adj_close)

        # add extra market information to symbol object
        self.extra = self._read_db_extra()
Example 7
def test2_stdiv():
    ts = TS.TimeSeries()
    ts.series = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5}
    assert ts.stddiv() == np.std([0, 1, 2, 3, 4, 5])
Example 8
def test1_mean():
    ts = TS.TimeSeries()
    ts.series = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5}
    print(ts.mean())
    assert ts.mean() == np.mean([0, 1, 2, 3, 4, 5])
Example 9
    def run_backtest(self):
        print '\n#################################################'
        print 'Backtest strategy:', self.strategy_label
        print '#################################################'
        print
        print 'enter_signal: ', self.enter_signal
        print 'exit_signal : ', self.exit_signal
        print 'risk_cap    : ', self.risk_cap
        print

        # --------------------------------------------------------
        def update_dictionary(dictionary, indicator_name, indicator_value):
            # lazily create the series the first time an indicator is seen
            try:
                dictionary[indicator_name]
            except KeyError:
                dictionary[indicator_name] = time_series.TimeSeries(
                    self.trading_days, [])
            dictionary[indicator_name].data.append(indicator_value)

        p_index_accum = 0
        p_index_hold_accum = 0

        #localtime = time.strftime('%Y%m%d', time.localtime())
        localtime = datetime.date.today()

        self.p_index = {}

        # dictionary to store all securities that have a signal today
        self.has_enter_signal = {}
        self.has_exit_signal = {}
        self.has_risk_cap_signal = {}

        # --------------------------------------------------------
        for my_security in self.trading_symbols.components.keys():
            # shortcut
            security = self.trading_symbols.components[my_security]

            # initialize portfolio for strategy (in backtest mode we only deal with one symbol)
            purchase = 0
            self.portfolio = portfolio.Portfolio(float(self.starting_cash),
                                                 self.trading_start,
                                                 self.trading_commission)
            # initialize portfolio for hold
            self.portfolio_hold = portfolio.Portfolio(
                float(self.starting_cash), self.trading_start,
                self.trading_commission)

            # days when we have data available
            self.trading_days = [
                d for d in self.daterange if d in security.close.times
            ]

            # time series used to store when we are in or out of the market
            self.in_n_out = time_series.TimeSeries(self.trading_days, [])

            # start with empty dictionaries for the current symbol "my_security"
            self.performance_index = {}
            self.charts = {}
            self.stock_chart = {}
            self.indicators = {}

            in_market = False
            enter_market = False
            leave_market = False
            security_had_enter_signal = False

            # parse 'chart_functions' input string to find out what we should plot
            chart_sections = self.chart_functions.split(';')
            chart_panels = {}
            chart_panels['stock_chart'] = chart_sections[0].split(':')
            if len(chart_sections) > 1:
                chart_panels['indicators'] = chart_sections[1].split(':')

            # begin: loop over trading days
            for date in self.trading_days:
                # ---------------------------------------------------------
                def rsi(exp_m_avg_len):
                    # default to 100 until enough closes exist to compute RSI
                    rsi_value = 100
                    if len(security.close.data) > exp_m_avg_len:
                        ts = security.close.rsi(date, date, exp_m_avg_len)
                        if len(ts.data) > 0:
                            rsi_value = ts.data[0]
                    return rsi_value

                # ---------------------------------------------------------
                def mfi(m_avg_len):
                    ts = security.close.mfi(security.high, security.low,
                                            security.close, security.volume,
                                            date, date, m_avg_len)
                    return ts.data[0]

                # ---------------------------------------------------------
                def l_b_band(bb_len, num_stdev=2):
                    upper, m_avg, lower = security.close.b_bands(
                        date, date, bb_len, num_stdev)
                    return lower.data[0]

                # ---------------------------------------------------------
                def u_b_band(bb_len, num_stdev=2):
                    upper, m_avg, lower = security.close.b_bands(
                        date, date, bb_len, num_stdev)
                    return upper.data[0]

                # ---------------------------------------------------------
                def c_b_band(bb_len, num_stdev=2):
                    upper, m_avg, lower = security.close.b_bands(
                        date, date, bb_len, num_stdev)
                    return m_avg.data[0]

                # ---------------------------------------------------------
                def fsto_k(periods):
                    low = security.low.low(date, date, periods)
                    high = security.high.high(date, date, periods)
                    fsto_k = 100
                    if len(low.data) > 0 and len(high.data) > 0:
                        sto = security.close.fsto_k(low, high, date, date,
                                                    periods)
                        fsto_k = sto.data[0]
                    return fsto_k

                # ---------------------------------------------------------
                def fsto_d(periods, m_avg):
                    dstart = datetime.datetime.strptime(
                        date, "%Y%m%d") - datetime.timedelta(days=periods)
                    date_start = dstart.strftime("%Y%m%d")
                    low = security.low.low(date_start, date, periods)
                    high = security.high.high(date_start, date, periods)
                    fsto_k = security.close.fsto_k(low, high, date_start, date,
                                                   periods)
                    fsto_d = fsto_k.m_avg(date, date, m_avg)
                    return fsto_d[0]

                # ---------------------------------------------------------
                def close_monotonous_up(symbol_name, range_hist_len):
                    try:
                        my_security = self.trading_symbols.components[
                            symbol_name]
                    except KeyError:
                        print 'Error: Symbol ', symbol_name, ' is not available'
                        return False
                    return my_security.close.monotonous_up(
                        date, range_hist_len)

                # ---------------------------------------------------------
                def close_monotonous_down(symbol_name, range_hist_len):
                    try:
                        my_security = self.trading_symbols.components[
                            symbol_name]
                    except KeyError:
                        print 'Error: Symbol ', symbol_name, ' is not available'
                        return False
                    return my_security.close.monotonous_down(
                        date, range_hist_len)

                # ---------------------------------------------------------
                def mfi_hist_spread(mfi_periods, hist_len):
                    end_index = self.trading_days.index(date)
                    start_index = end_index - hist_len
                    if start_index >= 0:
                        mfi_hist = security.close.mfi(
                            security.high, security.low, security.close,
                            security.volume, self.trading_days[start_index],
                            date, mfi_periods)
                        return mfi_hist.historic_spread(date, hist_len)

                # ---------------------------------------------------------
                def close_m_avg_up(my_symbol, m_avg, hist_len):
                    m_avg_symbol = symbol.Symbol(my_symbol,
                                                 self.trading_days[0],
                                                 self.trading_days[-1])
                    end_index = self.trading_days.index(date)
                    start_index = end_index - max([m_avg, hist_len])
                    if start_index >= 0:
                        close_m_avg = m_avg_symbol.close.m_avg(
                            self.trading_days[start_index], date, m_avg)
                        return close_m_avg.monotonous_up(date, hist_len)

                # ---------------------------------------------------------
                def roc(hist_len):
                    end_index = self.trading_days.index(date)
                    start_index = end_index - hist_len
                    if start_index >= 0:
                        return security.close.roc(date, hist_len)

                # ---------------------------------------------------------
                def roc_s(my_symbol, hist_len):
                    my_symbol = symbol.Symbol(my_symbol, self.trading_days[0],
                                              self.trading_days[-1])
                    end_index = self.trading_days.index(date)
                    start_index = end_index - hist_len
                    if start_index >= 0:
                        return my_symbol.close.roc(date, hist_len)

                # ---------------------------------------------------------
                def d_m_avg(m_avg_len):
                    ts = security.close.derivative_m_avg(date, date, m_avg_len)
                    return ts.data[0]

                # ---------------------------------------------------------
                def d_m_avg_s(my_symbol, m_avg_len):
                    start_date = datetime.datetime.strptime(
                        self.trading_days[0],
                        "%Y%m%d") - datetime.timedelta(days=300)
                    mysymbol = symbol.Symbol(my_symbol, start_date,
                                             self.trading_days[-1])
                    ts = mysymbol.close.derivative_m_avg(date, date, m_avg_len)
                    return ts.data[0]

                # ---------------------------------------------------------
                open = security.open.data[security.open.get_index(date)]
                high = security.high.data[security.high.get_index(date)]
                low = security.low.data[security.low.get_index(date)]
                close = security.close.data[security.close.get_index(date)]
                volume = security.volume.data[security.volume.get_index(date)]

                if sc.verbose:
                    print date, 'open  : ', open
                    print date, 'high  : ', high
                    print date, 'low   : ', low
                    print date, 'close : ', close
                    print date, 'volume: ', volume

                # ---------------------------------------------------------
                # compute market_cap in billions; set mcap to zero if only N/A is given
                market_cap = security.extra['market_cap']
                if market_cap[-1] == 'B':
                    mcap = float(market_cap[:-1])
                elif market_cap[-1] == 'M':
                    mcap = float(market_cap[:-1]) / 1000.0
                else:
                    mcap = 0

                transaction_date = date
                transaction_price = float(open)

                # ---------------------------------------------------------
                # portfolio with steady hold
                if date == self.trading_days[0]:
                    num_shares_hold = int(
                        (float(self.portfolio_hold.cash) -
                         float(self.trading_commission)) / transaction_price)
                    self.portfolio_hold.add_security(security,
                                                     transaction_price,
                                                     num_shares_hold,
                                                     transaction_date)
                if date == self.trading_days[-1]:
                    self.portfolio_hold.delete_security(
                        security, transaction_price, num_shares_hold,
                        transaction_date)

                # ---------------------------------------------------------
                if enter_market:
                    num_shares = int(
                        (float(self.portfolio.cash) -
                         float(self.trading_commission)) / transaction_price)
                    self.portfolio.add_security(security, transaction_price,
                                                num_shares, transaction_date)
                    if sc.verbose:
                        self.portfolio.print_holdings(date)
                    in_market = True
                    enter_market = False
                    purchase = float(open)

                    if sc.verbose:
                        print date, 'transaction_price: ', transaction_price

                # ---------------------------------------------------------
                if in_market and not leave_market:
                    self.in_n_out.data.append(1)
                else:
                    self.in_n_out.data.append(0)

                # ---------------------------------------------------------
                if leave_market:
                    in_market = False
                    leave_market = False
                    self.portfolio.delete_security(security, transaction_price,
                                                   num_shares,
                                                   transaction_date)
                    if sc.verbose:
                        self.portfolio.print_holdings(date)

                    if sc.verbose:
                        print date, 'transaction_price: ', transaction_price

                # ---------------------------------------------------------
                if eval(self.enter_signal):
                    security_had_enter_signal = True
                    if not in_market:
                        enter_market = True
                        if sc.verbose:
                            print '%s - %-4s -' % (date, security.symbol) \
                                  + '------------------------> enter signal'
                    # for query mode store securities that have an enter_signal today
                    if localtime == date:
                        signal_msg = '%-4s (%-18s) ' % (security.symbol,
                                                        security.extra['name'])
                        signal_msg += '-  price: ' + security.extra['price']  + \
                                      '  change: ' + security.extra['change'] + \
                                      '  volume: ' + security.extra['volume'] + '\n'
                        self.has_enter_signal[security.symbol] = signal_msg

                if eval(self.risk_cap):
                    if in_market:
                        leave_market = True
                        if sc.verbose:
                            print '%s - %-4s -' % (date, security.symbol) \
                                  + '------------------------> risk cap. exiting'
                    # for query mode store securities that have an risk_cap_signal today
                    if localtime == date:
                        signal_msg = '%-4s (%-18s) ' % (security.symbol,
                                                        security.extra['name'])
                        signal_msg += '-  price: ' + security.extra['price']  + \
                                      '  change: ' + security.extra['change'] + \
                                      '  volume: ' + security.extra['volume'] + '\n'
                        self.has_risk_cap_signal[security.symbol] = signal_msg

                if eval(self.exit_signal):
                    if in_market:
                        leave_market = True
                        if sc.verbose:
                            print '%s - %-4s -' % (date, security.symbol) \
                                  + '------------------------> exit signal'
                    # for query mode store securities that have an exit_signal today
                    if localtime == date:
                        signal_msg = '%-4s (%-18s) ' % (security.symbol,
                                                        security.extra['name'])
                        signal_msg += '-  price: ' + security.extra['price']  + \
                                      '  change: ' + security.extra['change'] + \
                                      '  volume: ' + security.extra['volume'] + '\n'
                        self.has_exit_signal[security.symbol] = signal_msg

                # compute and store chart functions
                for chart in chart_panels['stock_chart']:
                    update_dictionary(self.stock_chart, chart, eval(chart))
                if len(chart_sections) > 1:
                    for chart in chart_panels['indicators']:
                        update_dictionary(self.indicators, chart, eval(chart))

                # also store stock_close/stock_open and p_index for all charts
                update_dictionary(self.stock_chart, 'close', close)
                update_dictionary(self.stock_chart, 'open', open)
                performance_index = self.portfolio.evaluate_assets(
                    date) / float(self.starting_cash)
                update_dictionary(self.performance_index, 'p_index',
                                  performance_index)

                # new chart_functions syntax
                #                chart_sections_new = string.split(self.chart_functions_new, '|')
                #                sindex = 0
                #                self.plot_panels = {}
                #                for section in chart_sections_new:
                #                    sindex += 1
                #                    chart_plots = string.split(section, '%')
                #                    for plot in chart_plots:
                #                        plot_s = plot.strip(' ')
                #                        plot_function = plot_s[0:-2]
                #                        update_dictionary(self.plot_panels, str( (sindex, plot_s) ) , eval(plot_function))
                #                update_dictionary(self.plot_panels, str((0,'p_index:b')), performance_index)
                #                update_dictionary(self.plot_panels, str((1,'close:b')), close)
                #                update_dictionary(self.plot_panels, str((1,'open:b')),  open)

                if sc.verbose:
                    print date, 'performance index:', \
                          self.performance_index[ 'p_index' ].data[-1]
                    print date, \
                          'enter_market:', enter_market, \
                          'leave_market:', leave_market, \
                          'in_market:',     in_market

            # end: loop over trading days
            p_index_data = self.performance_index['p_index'].data[-1]
            p_index = "%5.3f" % p_index_data
            p_index_accum += p_index_data
            p_index_hold_data = self.portfolio_hold.evaluate_assets(
                self.trading_days[-1]) / float(self.starting_cash)
            p_index_hold = "%5.3f" % p_index_hold_data
            p_index_hold_accum += p_index_hold_data

            # ---------------------------------------------------------
            if self.portfolio.transactions > 0:
                days_in_market = sum(self.in_n_out.data)
                trading_days = len(self.trading_days)
                tp_index = "%6.3f" % (float(trading_days) / days_in_market *
                                      (p_index_data - 1.0))
                print '%-7s - performance indices: %s / %s / %s - transactions: %s' \
                      % (security.symbol, p_index, p_index_hold, tp_index, self.portfolio.transactions)
                self.p_index[security.symbol] = [
                    self.portfolio.transactions, p_index, p_index_hold,
                    tp_index
                ]

            if (security_had_enter_signal or self.always_plot):
                sym_chart = trading_chart.TradingChart(security, self)
                filename = security.symbol.ljust(4,'_') + '.' \
                           + self.strategy_label + '_' + str(p_index) + '.png'
                sym_chart.savefig(filename)

        # accumulated performance index
        # ---------------------------------------------------------
        print 'accumulated performance index: ', \
              p_index_accum/len(self.trading_symbols.components.keys()), ' / ', \
              p_index_hold_accum/len(self.trading_symbols.components.keys())

        print '\n---------------------[ Top 20 ]---------------------'
        items = self.p_index.items()
        #items.sort(key=lambda x: x[1][0], reverse=True)
        items.sort(key=lambda x: x[1], reverse=True)
        rank = 0
        print 'symbol    transactions strategy hold    normalized'
        for key, value in items:
            rank += 1
            if rank <= 20:
                print '%-7s :      %3s      %s   %s  %s' % (
                    key, value[0], value[1], value[2], value[3])
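enter_signal, exit_signal, and risk_cap are plain Python expression strings evaluated with eval() against the indicator helpers defined inside the trading-day loop (rsi, mfi, l_b_band, ...) and the local price variables. A hypothetical configuration, purely to show the shape of such expressions; the real strings are supplied by the strategy elsewhere:

# assumed strategy configuration, not taken from this snippet
self.enter_signal = 'rsi(14) < 30 and close < l_b_band(20)'
self.exit_signal = 'rsi(14) > 70 or close > u_b_band(20)'
self.risk_cap = 'close < 0.92 * purchase'  # exit after an 8% drawdown

eval() on such strings is only tolerable here because they come from the strategy's own configuration, never from untrusted input.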
Example 10
def find_suspicious_user(df,
                         out_dir,
                         window_size=2*3600,
                         interval=1,
                         activity_th_for_hash=2,
                         shifting_times = 40,
                         modul = 5000,
                         sigma=0.024,
                         good_user_th=5,
                         good_bin_th=5):
    '''
    This corresponds to the indexer part of DeBot (step_2.py). The main
    difference is that random projection is performed only if the author
    count exceeds 150. This threshold was determined by experiment: the
    hashing is time-consuming, and may take longer than computing DTW
    between all the authors without hashing them.
     - df : a dataframe containing 2 columns:
         - author_id : the id of the author of the tweet
         - created_at : time of creation of the tweet (datetime.datetime)
       with the sorted created_at column as index
     - out_dir : path of a directory where to store the suspicious users.
     - window_size : the window size (in seconds) used to compare authors
       with each other. Following DeBot, the default value is 2 hours.
     - interval : the interval (in seconds) between two values in the time
       series.
     - activity_th_for_hash : threshold on the tweet count of a given
       author, under which the author is filtered out. The default value
       is low because we don't have the entire timeline of the users yet.
     - shifting_times : the number of copies kept of each user in our
       buckets (DeBot)
     - modul : number of buckets (DeBot)
     - sigma : standard deviation for the equip_probable function (DeBot)
     - good_user_th : how many copies a user must have to be picked in a
       bin (default 5); strict comparison (x > good_user_th) (DeBot)
     - good_bin_th : how many suspicious users a bin must have to be
       picked (default 5); strict comparison (x > good_bin_th) (DeBot)
    The suspicious users are reported as a list for each time window, in
    a dataframe with two columns:
     - time_window : the start of the time window where the users are
       suspicious
     - suspicious_users : a list of users that random projection flags as
       suspicious. They may not be bots, but they were hashed into the
       same bucket and are therefore suspicious.
    The dataframe is stored on disk at the given location. Intermediate
    csv files are also stored, so that if the computation stops for some
    reason, it can be relaunched from the last date saved on disk.
    '''

    # the first date
    start_date = df.iloc[0].created_at
    # the last date
    final_date = df.iloc[-1].created_at
    # the start date of the current window
    start_window = start_date
    # will be used to save the results on disk regularly
    base_time = time.time()
    # will be used to print the progress
    last_time = start_window - timedelta(days=31)
    # current index of the partial results to save on disk
    idx_out = 0
    # wrapper to compute time series
    TS = time_series.TimeSeries(interval=interval)
    # rows that will contain the suspicious users
    rows = []

    # Iterate over all windows
    while start_window + timedelta(seconds=window_size) <= final_date:
        if start_window > last_time + timedelta(days=30):
            print(start_window)
            last_time = start_window

        end_window = start_window + timedelta(seconds=window_size)

        # Keep the tweets corresponding to the window
        df_window = df.loc[start_window:end_window]

        # For each author_id, get the activity (list of datetime)
        df_window = (df_window.groupby('author_id')['created_at']
                                                    .apply(list)
                                                    .reset_index(name='activity'))
        # Compute how many tweets each author created
        df_window["count"] = df_window['activity'].apply(len)
        # Filter out all authors with low activity
        df_window = df_window[df_window["count"] >= activity_th_for_hash]
        # Convert each activity (list of datetime) into a non-normalized
        # time series (list of int)
        df_window["ts"] = df_window['activity'].apply(lambda x:
                                TS.ts_from_datetime(x, start_window, end_window))

        # Hash users into buckets using Debot hashing technique
        ts_list = list(df_window.ts)
        if len(ts_list) > 1:
            bucket_user = hash_equi_prob(ts_list, modul, shifting_times, sigma)

            author_id_list = list(df_window.author_id)

            to_report = set()
            # From experiment, it is faster and more accurate to compute
            # the warped correlation between all authors in the time
            # window without the hashing step when the author count is
            # less than 150
            if len(author_id_list) > 150:
                # Find the set of suspicious users in each bucket
                good_bin = 0
                bucket_user = sort_list_based_len(bucket_user)
                for i in range(len(bucket_user)):
                    cur_good_user = get_num_good_usr(bucket_user[i], good_user_th)
                    if len(cur_good_user) > good_bin_th:
                        good_bin += 1
                        for j in range(len(cur_good_user)):
                            cur_id = author_id_list[cur_good_user[j]]
                            to_report.add(cur_id)
            else:
                to_report = to_report.union(set(author_id_list))

            if len(to_report) > 1:
                rows.append([start_window, to_report])

            # Save the suspicious users
            if time.time() > base_time + 300:
                if len(rows) > 0:
                    df_suspicious = pd.DataFrame(rows, columns=['time_window', 'suspicious_users'])
                    df_suspicious.to_csv(os.path.join(out_dir, "out_hash_" + str(idx_out) + ".csv"),
                                  index=False,
                                  encoding='utf-8')
                    print("saved until " + str(start_window))
                    rows = []
                    idx_out += 1
                base_time = time.time()

        # Update start_window
        start_window = end_window

    # Save the remaining users
    df_suspicious = pd.DataFrame(rows, columns=['time_window', 'suspicious_users'])
    df_suspicious.to_csv(os.path.join(out_dir, "out_hash_" + str(idx_out) + ".csv"),
                  index=False,
                  encoding='utf-8')

    # Concatenate the partial results
    concat_csv(out_dir, "out_hash_", os.path.join(out_dir, "out_hash.csv"))
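TS.ts_from_datetime is not shown in this snippet. A plausible sketch of what it computes, under the assumption that the series is a per-interval tweet count over the window (name and behavior assumed, not taken from the repo):

import numpy as np

def ts_from_datetime(activity, start_window, end_window, interval=1):
    # bin a list of tweet datetimes into counts per `interval` seconds
    n_bins = int((end_window - start_window).total_seconds() // interval)
    ts = np.zeros(n_bins, dtype=int)
    for t in activity:
        i = int((t - start_window).total_seconds() // interval)
        if 0 <= i < n_bins:
            ts[i] += 1
    return ts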
Example 11
    def setUp(self):
        self.test_symbol = symbol.Symbol('^DJI', '20090103', '20091001')

        self.time_series1 = time_series.TimeSeries(
            ['20090101', '20090102', '20090103', '20090104'],
            [1000, 1001, 1002, 1003])
        self.time_series2 = time_series.TimeSeries(
            ['20090101', '20090102', '20090103', '20090104'],
            [1000, 998, 1002, 1003])
        self.time_series3 = time_series.TimeSeries(
            ['20090101', '20090102', '20090103', '20090104'],
            [1003, 1002, 1001, 1000])

        self.time_series_long = time_series.TimeSeries([
            '20090101',
            '20090102',
            '20090103',
            '20090104',
            '20090105',
            '20090106',
            '20090107',
            '20090108',
            '20090109',
            '20090110',
            '20090111',
            '20090112',
            '20090113',
            '20090114',
            '20090115',
            '20090116',
        ], [
            1000,
            998,
            970,
            1008,
            1001,
            978,
            999,
            1002,
            1005,
            1002,
            998,
            1000,
            999,
            1002,
            1004,
            1007,
        ])

        self.time_series_m_avg = time_series.TimeSeries(
            ['20090102', '20090103', '20090104'], [1000.5, 1001.5, 1002.5])

        self.time_series_hist_changes = time_series.TimeSeries(
            ['20090102', '20090103', '20090104'], [-2, 4, 1])
        self.time_series_hist_adv = time_series.TimeSeries(
            ['20090102', '20090103', '20090104'], [0, 4, 1])
        self.time_series_hist_dec = time_series.TimeSeries(
            ['20090102', '20090103', '20090104'], [2, 0, 0])

        self.time_series_low1 = time_series.TimeSeries(
            ['20090110', '20090111', '20090112'], [1002, 998, 1000])
        self.time_series_low2 = time_series.TimeSeries(
            ['20090110', '20090111', '20090112'], [1002, 998, 998])
        self.time_series_low8 = time_series.TimeSeries(
            ['20090110', '20090111', '20090112'], [970, 978, 978])
        self.time_series_high1 = time_series.TimeSeries(
            ['20090110', '20090111', '20090112'], [1002, 998, 1000])
        self.time_series_high2 = time_series.TimeSeries(
            ['20090110', '20090111', '20090112'], [1005, 1002, 1000])
        self.time_series_high8 = time_series.TimeSeries(
            ['20090110', '20090111', '20090112'], [1008, 1008, 1005])
        self.time_series_rsi = time_series.TimeSeries([
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
            '13', '14', '15', '16', '17', '18', '19'
        ], [
            46.125, 47.125, 46.4375, 46.9375, 44.9375, 44.25, 44.625, 45.75,
            47.8125, 47.5625, 47.0, 44.5625, 46.3125, 47.6875, 46.6875,
            45.6875, 43.0625, 43.5625, 44.875, 43.6875
        ])

        self.time_series_stdev = time_series.TimeSeries([
            '20090901', '20090902', '20090903', '20090904', '20090905',
            '20090906', '20090907', '20090908'
        ], [2., 4., 4., 4., 5., 5., 7., 9.])

        self.time_series_exp_m_avg = time_series.TimeSeries([
            '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
            '13', '14', '15', '16', '17', '18', '19', '20'
        ], [
            64.75, 63.79, 63.73, 63.73, 63.55, 63.19, 63.91, 63.85, 62.95,
            63.37, 61.33, 61.51, 61.87, 60.25, 59.35, 59.95, 58.93, 57.68,
            58.82, 58.87
        ])
Example 12
def find_correlated_activity(df,
                             in_path,
                             out_dir,
                             window_size=2*3600,
                             interval=1,
                             allowable_lag=20,
                             activity_th_for_dtw=10,
                             report_threshold=0.995,
                             use_shift=False,
                             fetch_timelines=False,
                             since_id=None,
                             max_id=None):
    '''
    Iterate over all time windows and compute DTW between authors in each
    time window. Each author must have a minimum number of tweets during
    the time window for the DTW to be computed.
    Report the accounts whose warped correlation is over report_threshold
    and store them in a csv file containing 4 columns:
     - author_id_x
     - author_id_y
     - warp_corr : warped correlation between the two authors
     - window : start of the time window
    No duplicates: if author_id_x = X and author_id_y = Y in a row, there
    will not be author_id_x = Y and author_id_y = X in some other row.
    Parameters:
     - df : a dataframe containing 2 columns:
         - author_id : the id of the author of the tweet
         - created_at : time of creation of the tweet (datetime.datetime)
       with the sorted created_at column as index.
       The dataframe should contain the complete timeline for all users.
     - in_path : the path to the csv file containing the suspicious users.
       The csv is created by the find_suspicious_user function.
       If None, the warped correlation is computed between all authors
       for each time window
     - out_dir : the path to the output directory
     - window_size : the window size (in seconds) used to compare authors
       with each other. Following DeBot, the default value is 2 hours.
     - interval : the interval (in seconds) between two values in the time
       series.
     - allowable_lag : the maximum shift (in seconds) allowed to match two
       tweets from two different authors.
     - activity_th_for_dtw : the minimum number of tweets a user must have
       to be a candidate for DTW computation. We use 10 by default, as
       DeBot does.
     - use_shift : when computing DTW, if set to True, one time series is
       shifted so that it better matches the other. If this is set to
       True, a lower allowable_lag should be used, otherwise there may be
       false positives.
    '''
    # Create the correlation dataframe
    df_corr = pd.DataFrame(columns=['author_id_x', 'author_id_y', 'warp_corr', 'window'])
    # will be used to save the results on disk regularly
    base_time = time.time()
    # the first date
    start_date = df.iloc[0].created_at
    # the last date
    end_date = df.iloc[-1].created_at
    # the start date of the current window
    start_window = start_date
    # wrapper to compute time series
    TS = time_series.TimeSeries(interval=interval)
    # current index of the partial results to save on disk
    idx_out = 0
    # create the suspicious user dataframe
    df_suspicious = None
    if in_path is not None:
        df_suspicious = pd.read_csv(in_path,
                                    encoding='utf-8',
                                    engine='python',
                                    index_col="time_window")

    last_time = start_window - timedelta(days=31)

    # Iterate over all windows
    while start_window + timedelta(seconds=window_size) <= end_date:
        # Print the start window every month to give the progress
        if start_window > last_time + timedelta(days=30):
            print(start_window)
            last_time = start_window
        end_window = start_window + timedelta(seconds=window_size)


        # Keep only tweets in the current time window
        df_window = df.loc[start_window:end_window]

        # Filter out authors that are not suspicious
        if df_suspicious is not None and start_window in df_suspicious.index:
            # The column written by find_suspicious_user is
            # 'suspicious_users'; the set of users is serialized as its
            # repr in the csv, so parse it back first (requires import ast)
            susp_authors = ast.literal_eval(
                df_suspicious.loc[start_window]["suspicious_users"])
            df_window = df_window[df_window.author_id.isin(susp_authors)]

        if df_suspicious is None or start_window in df_suspicious.index:
            # Group tweets by authors
            df_grouped = (df_window.groupby('author_id')['created_at']
                                  .apply(list)
                                  .reset_index(name='activity'))

            # Keep only authors who have enough tweets during this time window
            df_grouped["count"] = df_grouped.activity.apply(len)
            df_filtered = df_grouped[df_grouped["count"] >= activity_th_for_dtw].copy()
            authors_kept = set(df_filtered["author_id"])

            # If there is at least two authors remaining, compute DTW between
            # all of them
            if df_filtered.shape[0] > 1:
                # Compute the z-normalized time series
                df_filtered["ts"] = df_filtered["activity"].apply(lambda x:
                                        TS.znorm(TS.ts_from_datetime(x,
                                                                     start_window,
                                                                     end_window)))

                # Compute DTW between each authors
                out = compute_dtw(df_filtered, allowable_lag, use_shift)
                #print("max corr = " + str(out.warp_corr.max()) + "   at " + str(start_window))

                out["window"] = start_window

                corr = out[out["warp_corr"] >= report_threshold].copy()
                df_corr = pd.concat([df_corr, corr])
                if time.time() > base_time + 300:
                    if df_corr.shape[0] > 0:
                        df_corr.to_csv(os.path.join(out_dir, "out_corr_" + str(idx_out) + ".csv"),
                                      index=False,
                                      encoding='utf-8')
                        print("saved until " + str(start_window))
                        df_corr = pd.DataFrame(columns=['author_id_x',
                                                        'author_id_y',
                                                        'warp_corr',
                                                        'window'])
                        idx_out += 1
                    base_time = time.time()

        # Update start_window
        start_window = end_window
    df_corr.to_csv(os.path.join(out_dir, "out_corr_" + str(idx_out) + ".csv"),
                   index=False,
                   encoding='utf-8')

    # Concatenate the partial results
    concat_csv(out_dir, "out_corr_", os.path.join(out_dir, "out_corr.csv"))
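TS.znorm is likewise not shown. Z-normalizing the series before DTW is standard practice, so the warped correlation compares activity shapes rather than volumes. A minimal sketch of the assumed behavior:

import numpy as np

def znorm(ts):
    # zero-mean, unit-variance scaling; constant series are only centered
    ts = np.asarray(ts, dtype=float)
    std = ts.std()
    return (ts - ts.mean()) / std if std > 0 else ts - ts.mean()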