def main():
    path = 'D:\\data\\M3\\M3Other\\N2836.csv'
    data = np.genfromtxt(path)
    print('Data len: {0}'.format(len(data)))
    predict_points = 8

    model = Model()
    ts = tsutils.TimeSeries(data, test_size=predict_points,
                            scaler=processing.StandardScaler())

    x_train, y_train, t_train = ts.train_data(input_window=model.input_window,
                                              output_window=model.output_window,
                                              expand=True)
    model.train(x_train, y_train, epochs=200)

    #x_test, y_test, t_test = ts.train_data(input_window=model.input_window, output_window=model.output_window)
    ctx = np.expand_dims(ts.get_test_context(model.input_window, expand=True), axis=0)
    y_pred = tsutils.free_run_batch(model.predict, ctx, predict_points, ts, batch_size=1)
    y_true = ts.get_test_data()

    y_pred_flat = ts.inverse_y(np.squeeze(y_pred))
    y_true_flat = ts.inverse_y(np.squeeze(y_true))

    print(metrics.evaluate(y_true_flat, y_pred_flat, metrics=('smape', 'mae', 'umbrae')))
def do_cast(index_pattern, indices, years, pcthisto, outfile,
            time_dim='days', time_slice=None):
    """Creates the series objects and iterators, then performs the fore-or-hind cast.

    index_pattern : filename pattern which should expect to receive a datetime object
                    (e.g., "file_pattern_{:%Y}.nc".format(the_date))
    indices       : a list of indices to process in this run
    years         : a slice object with the start and stop years
    pcthisto      : percentile histogram file containing ratios of BA to occurrence
    outfile       : name of output file
    time_dim      : name of the time dimension in the index files
    time_slice    : day-of-year range to include in processing
    """
    # how to translate a date to the correct file to open
    idx_series = ts.TimeSeries(index_pattern, time_dim)

    # the years to loop over
    cast_periods = ts.AnnualInterval(years.start, years.stop)

    # subset of days within year to loop over
    if time_slice is None:
        model_periods = ts.IntegerInterval(0, 365)
    else:
        model_periods = ts.IntegerInterval(time_slice.start, time_slice.stop)

    cast(idx_series, indices, cast_periods, model_periods, pcthisto, outfile)
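# Hedged usage sketch for do_cast(): the file names, index list, and year range
# below are hypothetical placeholders, not values from the original project.
the_years = slice(2001, 2011)                    # start/stop years as a slice object
do_cast(index_pattern='fire_index_{:%Y}.nc',     # expands to e.g. fire_index_2001.nc
        indices=['kbdi'],                        # hypothetical index name
        years=the_years,
        pcthisto='ba_occurrence_ratios.nc',      # hypothetical percentile histogram file
        outfile='hindcast_2001_2010.nc',
        time_dim='days',
        time_slice=slice(120, 273))              # roughly May through September (day of year)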
def __init__(self, parent_controller):
    super(RPModel, self).__init__()

    self._parent_controller = parent_controller
    self._time_series_loaders = []
    self._active_file = None

    # Empty/non-initialized TimeSeries objects
    self._main_ts = ts.TimeSeries(key_parameters=[])
    self._baseline_ts = ts.TimeSeries(
        key_parameters=['baseline_avg_length'])
    self._pos_thresh_ts = ts.TimeSeries(
        key_parameters=['baseline_avg_length', 'trigger_sigma_threshold'])
    self._neg_thresh_ts = ts.TimeSeries(
        key_parameters=['baseline_avg_length', 'trigger_sigma_threshold'])
    self._filtered_ts = ts.TimeSeries(key_parameters=['filter_frequency'])

    self._event_manager = rp_event_manager.RPEventManager()
def key_pitch_related_nodding(self):
    self.pitch_records += [(self.pitch_records[-1][0] + 0.4, 0)]

    # make time series
    pitch_time_series = time_series.TimeSeries(self.pitch_records)
    # take log of the f0 frequency
    pitch_time_series = pitch_time_series.log()
    # zero-mean for non-zero frequencies
    mean = pitch_time_series.mean(lambda time, value: value > 0)
    pitch_time_series = pitch_time_series.map(
        lambda time, value: (time, (value - mean) if value > 0 else value))
    # moving average
    pitch_time_series = pitch_time_series.moving_average(0.01, 0.8, 'center')
    # multiply
    pitch_time_series = pitch_time_series.multiply(30)
    # resample
    pitch_time_series = pitch_time_series.resample(0.3)

    result = (0, (0, 0, 0))  # initial pose
    self.neck_angle_records.append(result)
    for record in pitch_time_series:
        # pose based on pitch
        time, value = record
        result = (time, ((math.pi / 180) * -1.0 * value, 0, 0))  # degrees to radians (Euler angle)
        self.neck_angle_records.append(result)
    result = (self.wav_length, (0, 0, 0))  # final pose
    self.neck_angle_records.append(result)

    return self.neck_angle_records
def __init__(self, symbol, start_date, end_date):
    self.symbol = symbol

    # check if in database or add to database
    self.__add(start_date, end_date)

    # fill lists with data and make a times dictionary
    times, open, high, low, close, volume, adj_close = self.__read_db(
        start_date, end_date)

    # fill fields with time series information
    self.open = time_series.TimeSeries(times, open)
    self.high = time_series.TimeSeries(times, high)
    self.low = time_series.TimeSeries(times, low)
    self.close = time_series.TimeSeries(times, close)
    self.volume = time_series.TimeSeries(times, volume)
    self.adj_close = time_series.TimeSeries(times, adj_close)

    # add extra market information to symbol object
    self.extra = self._read_db_extra()
def test2_stdiv():
    ts = TS.TimeSeries()
    ts.series = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5}
    assert ts.stddiv() == np.std([0, 1, 2, 3, 4, 5])
def test1_mean():
    ts = TS.TimeSeries()
    ts.series = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5}
    print(ts.mean())
    assert ts.mean() == np.mean([0, 1, 2, 3, 4, 5])
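# A minimal sketch of the behaviour the two tests above assume: values live in
# a dict attribute called `series`, and mean()/stddiv() reduce over its values.
# The real TS.TimeSeries class is not shown in this collection, so everything
# beyond the attribute and method names used by the tests is an assumption.
import numpy as np

class TimeSeriesSketch:
    def __init__(self):
        self.series = {}  # maps time key -> value, filled in by the caller

    def mean(self):
        return np.mean(list(self.series.values()))

    def stddiv(self):
        # population standard deviation, matching np.std's default ddof=0
        return np.std(list(self.series.values()))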
def run_backtest(self):
    print '\n#################################################'
    print 'Backtest strategy:', self.strategy_label
    print '#################################################'
    print
    print 'enter_signal: ', self.enter_signal
    print 'exit_signal : ', self.exit_signal
    print 'risk_cap : ', self.risk_cap
    print

    # --------------------------------------------------------
    def update_dictionary(dictionary, indicator_name, indicator_value):
        try:
            dictionary[indicator_name]
        except:
            dictionary[indicator_name] = time_series.TimeSeries(
                self.trading_days, [])
        dictionary[indicator_name].data.append(indicator_value)

    p_index_accum = 0
    p_index_hold_accum = 0

    #localtime = time.strftime('%Y%m%d', time.localtime())
    localtime = datetime.date.today()

    self.p_index = {}

    # dictionaries to store all securities that have a signal today
    self.has_enter_signal = {}
    self.has_exit_signal = {}
    self.has_risk_cap_signal = {}

    # --------------------------------------------------------
    for my_security in self.trading_symbols.components.keys():
        # shortcut
        security = self.trading_symbols.components[my_security]

        # initialize portfolio for strategy (in backtest mode we only deal with one symbol)
        purchase = 0
        self.portfolio = portfolio.Portfolio(float(self.starting_cash),
                                             self.trading_start,
                                             self.trading_commission)
        # initialize portfolio for hold
        self.portfolio_hold = portfolio.Portfolio(float(self.starting_cash),
                                                  self.trading_start,
                                                  self.trading_commission)

        # days when we have data available
        self.trading_days = [
            d for d in self.daterange if d in security.close.times
        ]

        # time series used to store when we are in or out of the market
        self.in_n_out = time_series.TimeSeries(self.trading_days, [])

        # start with empty dictionaries for the current symbol "my_security"
        self.performance_index = {}
        self.charts = {}
        self.stock_chart = {}
        self.indicators = {}

        in_market = False
        enter_market = False
        leave_market = False
        security_had_enter_signal = False

        # parse 'chart_functions' input string to find out what we should plot
        chart_sections = string.split(self.chart_functions, ';')
        chart_panels = {}
        chart_panels['stock_chart'] = string.split(chart_sections[0], ':')
        if len(chart_sections) > 1:
            chart_panels['indicators'] = string.split(chart_sections[1], ':')

        # begin: loop over trading days
        for date in self.trading_days:
            # ---------------------------------------------------------
            def rsi(exp_m_avg_len):
                rsi_value = 100  # default when not enough data is available
                if len(security.close.data) > exp_m_avg_len:
                    ts = security.close.rsi(date, date, exp_m_avg_len)
                    if len(ts.data) > 0:
                        rsi_value = ts.data[0]
                return rsi_value

            # ---------------------------------------------------------
            def mfi(m_avg_len):
                ts = security.close.mfi(security.high, security.low,
                                        security.close, security.volume,
                                        date, date, m_avg_len)
                return ts.data[0]

            # ---------------------------------------------------------
            def l_b_band(bb_len, num_stdev=2):
                upper, m_avg, lower = security.close.b_bands(
                    date, date, bb_len, num_stdev)
                return lower.data[0]

            # ---------------------------------------------------------
            def u_b_band(bb_len, num_stdev=2):
                upper, m_avg, lower = security.close.b_bands(
                    date, date, bb_len, num_stdev)
                return upper.data[0]

            # ---------------------------------------------------------
            def c_b_band(bb_len, num_stdev=2):
                upper, m_avg, lower = security.close.b_bands(
                    date, date, bb_len, num_stdev)
                return m_avg.data[0]

            # ---------------------------------------------------------
            def fsto_k(periods):
                low = security.low.low(date, date, periods)
                high = security.high.high(date, date, periods)
                fsto_k = 100
                if len(low.data) > 0 and len(high.data) > 0:
                    sto = security.close.fsto_k(low, high, date, date, periods)
                    fsto_k = sto.data[0]
                return fsto_k

            # ---------------------------------------------------------
            def fsto_d(periods, m_avg):
                dstart = datetime.strptime(date, "%Y%m%d") - timedelta(days=periods)
                date_start = dstart.strftime("%Y%m%d")
                print date_start, date, periods
                low = security.low.low(date_start, date, periods)
                high = security.high.high(date_start, date, periods)
                fsto_k = security.close.fsto_k(low, high, date_start, date, periods)
                fsto_d = fsto_k.m_avg(date, date, m_avg)
                return fsto_d[0]

            # ---------------------------------------------------------
            def close_monotonous_up(symbol_name, range_hist_len):
                try:
                    my_security = self.trading_symbols.components[symbol_name]
                except:
                    print 'Error: Symbol ', symbol_name, ' is not available'
                return my_security.close.monotonous_up(date, range_hist_len)

            # ---------------------------------------------------------
            def close_monotonous_down(symbol_name, range_hist_len):
                try:
                    my_security = self.trading_symbols.components[symbol_name]
                except:
                    print 'Error: Symbol ', symbol_name, ' is not available'
                return my_security.close.monotonous_down(date, range_hist_len)

            # ---------------------------------------------------------
            def mfi_hist_spread(mfi_periods, hist_len):
                end_index = self.trading_days.index(date)
                start_index = end_index - hist_len
                if start_index >= 0:
                    mfi_hist = security.close.mfi(
                        security.high, security.low, security.close,
                        security.volume, self.trading_days[start_index],
                        date, mfi_periods)
                    return mfi_hist.historic_spread(date, hist_len)

            # ---------------------------------------------------------
            def close_m_avg_up(my_symbol, m_avg, hist_len):
                m_avg_symbol = symbol.Symbol(my_symbol, self.trading_days[0],
                                             self.trading_days[-1])
                end_index = self.trading_days.index(date)
                start_index = end_index - max([m_avg, hist_len])
                if start_index >= 0:
                    close_m_avg = m_avg_symbol.close.m_avg(
                        self.trading_days[start_index], date, m_avg)
                    return close_m_avg.monotonous_up(date, hist_len)

            # ---------------------------------------------------------
            def roc(hist_len):
                end_index = self.trading_days.index(date)
                start_index = end_index - hist_len
                if start_index >= 0:
                    return security.close.roc(date, hist_len)

            # ---------------------------------------------------------
            def roc_s(my_symbol, hist_len):
                my_symbol = symbol.Symbol(my_symbol, self.trading_days[0],
                                          self.trading_days[-1])
                end_index = self.trading_days.index(date)
                start_index = end_index - hist_len
                if start_index >= 0:
                    return my_symbol.close.roc(date, hist_len)

            # ---------------------------------------------------------
            def d_m_avg(m_avg_len):
                ts = security.close.derivative_m_avg(date, date, m_avg_len)
                return ts.data[0]

            # ---------------------------------------------------------
            def d_m_avg_s(my_symbol, m_avg_len):
                start_date = datetime.strptime(
                    self.trading_days[0], "%Y%m%d") - timedelta(days=300)
                mysymbol = symbol.Symbol(my_symbol, start_date,
                                         self.trading_days[-1])
                ts = mysymbol.close.derivative_m_avg(date, date, m_avg_len)
                return ts.data[0]

            # ---------------------------------------------------------
            open = security.open.data[security.open.get_index(date)]
            high = security.high.data[security.high.get_index(date)]
            low = security.low.data[security.low.get_index(date)]
            close = security.close.data[security.close.get_index(date)]
            volume = security.volume.data[security.volume.get_index(date)]

            if sc.verbose:
                print date, 'open : ', open
                print date, 'high : ', high
                print date, 'low : ', low
                print date, 'close : ', close
                print date, 'volume: ', volume

            # ---------------------------------------------------------
            # compute market_cap in billions, set mcap to zero if only N/A is given
            market_cap = security.extra['market_cap']
            if market_cap[-1] == 'B':
                mcap = float(market_cap.strip(market_cap[-1]))
            elif market_cap[-1] == 'M':
                mcap = float(market_cap.strip(market_cap[-1])) / 1000.0
            else:
                mcap = 0

            transaction_date = date
            transaction_price = float(open)

            # ---------------------------------------------------------
            # portfolio with steady hold
            if date == self.trading_days[0]:
                num_shares_hold = int(
                    (float(self.portfolio_hold.cash) -
                     float(self.trading_commission)) / transaction_price)
                self.portfolio_hold.add_security(security, transaction_price,
                                                 num_shares_hold, transaction_date)
            if date == self.trading_days[-1]:
                self.portfolio_hold.delete_security(security, transaction_price,
                                                    num_shares_hold, transaction_date)

            # ---------------------------------------------------------
            if enter_market:
                num_shares = int(
                    (float(self.portfolio.cash) -
                     float(self.trading_commission)) / transaction_price)
                self.portfolio.add_security(security, transaction_price,
                                            num_shares, transaction_date)
                if sc.verbose:
                    self.portfolio.print_holdings(date)
                in_market = True
                enter_market = False
                purchase = float(open)
                if sc.verbose:
                    print date, 'transaction_price: ', transaction_price

            # ---------------------------------------------------------
            if in_market and not leave_market:
                self.in_n_out.data.append(1)
            else:
                self.in_n_out.data.append(0)

            # ---------------------------------------------------------
            if leave_market:
                in_market = False
                leave_market = False
                self.portfolio.delete_security(security, transaction_price,
                                               num_shares, transaction_date)
                if sc.verbose:
                    self.portfolio.print_holdings(date)
                if sc.verbose:
                    print date, 'transaction_price: ', transaction_price

            # ---------------------------------------------------------
            if eval(self.enter_signal):
                security_had_enter_signal = True
                if not in_market:
                    enter_market = True
                    if sc.verbose:
                        print '%s - %-4s -' % (date, security.symbol) \
                            + '------------------------> enter signal'
                # for query mode store securities that have an enter_signal today
                if localtime == date:
                    signal_msg = '%-4s (%-18s) ' % (security.symbol,
                                                    security.extra['name'])
                    signal_msg += '- price: ' + security.extra['price'] + \
                        ' change: ' + security.extra['change'] + \
                        ' volume: ' + security.extra['volume'] + '\n'
                    self.has_enter_signal[security.symbol] = signal_msg

            if eval(self.risk_cap):
                if in_market:
                    leave_market = True
                    if sc.verbose:
                        print '%s - %-4s -' % (date, security.symbol) \
                            + '------------------------> risk cap. exiting'
                # for query mode store securities that have a risk_cap_signal today
                if localtime == date:
                    signal_msg = '%-4s (%-18s) ' % (security.symbol,
                                                    security.extra['name'])
                    signal_msg += '- price: ' + security.extra['price'] + \
                        ' change: ' + security.extra['change'] + \
                        ' volume: ' + security.extra['volume'] + '\n'
                    self.has_risk_cap_signal[security.symbol] = signal_msg

            if eval(self.exit_signal):
                if in_market:
                    leave_market = True
                    if sc.verbose:
                        print '%s - %-4s -' % (date, security.symbol) \
                            + '------------------------> exit signal'
                # for query mode store securities that have an exit_signal today
                if localtime == date:
                    signal_msg = '%-4s (%-18s) ' % (security.symbol,
                                                    security.extra['name'])
                    signal_msg += '- price: ' + security.extra['price'] + \
                        ' change: ' + security.extra['change'] + \
                        ' volume: ' + security.extra['volume'] + '\n'
                    self.has_exit_signal[security.symbol] = signal_msg

            # compute and store chart functions
            for chart in chart_panels['stock_chart']:
                update_dictionary(self.stock_chart, chart, eval(chart))
            if len(chart_sections) > 1:
                for chart in chart_panels['indicators']:
                    update_dictionary(self.indicators, chart, eval(chart))

            # also store stock_close/stock_open and p_index for all charts
            update_dictionary(self.stock_chart, 'close', close)
            update_dictionary(self.stock_chart, 'open', open)

            performance_index = self.portfolio.evaluate_assets(date) / float(self.starting_cash)
            update_dictionary(self.performance_index, 'p_index', performance_index)

            # new chart_functions syntax
            # chart_sections_new = string.split(self.chart_functions_new, '|')
            # sindex = 0
            # self.plot_panels = {}
            # for section in chart_sections_new:
            #     sindex += 1
            #     chart_plots = string.split(section, '%')
            #     for plot in chart_plots:
            #         plot_s = plot.strip(' ')
            #         plot_function = plot_s[0:-2]
            #         update_dictionary(self.plot_panels, str((sindex, plot_s)), eval(plot_function))
            # update_dictionary(self.plot_panels, str((0, 'p_index:b')), performance_index)
            # update_dictionary(self.plot_panels, str((1, 'close:b')), close)
            # update_dictionary(self.plot_panels, str((1, 'open:b')), open)

            if sc.verbose:
                print date, 'performance index:', \
                    self.performance_index['p_index'].data[-1]
                print date, \
                    'enter_market:', enter_market, \
                    'leave_market:', leave_market, \
                    'in_market:', in_market
        # end: loop over trading days

        p_index_data = self.performance_index['p_index'].data[-1]
        p_index = "%5.3f" % p_index_data
        p_index_accum += p_index_data

        p_index_hold_data = self.portfolio_hold.evaluate_assets(
            self.trading_days[-1]) / float(self.starting_cash)
        p_index_hold = "%5.3f" % p_index_hold_data
        p_index_hold_accum += p_index_hold_data

        # ---------------------------------------------------------
        if self.portfolio.transactions > 0:
            days_in_market = reduce(lambda x, y: x + y, self.in_n_out.data)
            trading_days = len(self.trading_days)
            tp_index = "%6.3f" % (trading_days / days_in_market * (p_index_data - 1.0))
            print '%-7s - performance indices: %s / %s / %s - transactions: %s' \
                % (security.symbol, p_index, p_index_hold, tp_index,
                   self.portfolio.transactions)
            self.p_index[security.symbol] = [
                self.portfolio.transactions, p_index, p_index_hold, tp_index
            ]

        if (security_had_enter_signal or self.always_plot):
            sym_chart = trading_chart.TradingChart(security, self)
            filename = security.symbol.ljust(4, '_') + '.' \
                + self.strategy_label + '_' + str(p_index) + '.png'
            sym_chart.savefig(filename)

    # accumulated performance index
    # ---------------------------------------------------------
    print 'accumulated performance index: ', \
        p_index_accum / len(self.trading_symbols.components.keys()), ' / ', \
        p_index_hold_accum / len(self.trading_symbols.components.keys())

    print '\n---------------------[ Top 20 ]---------------------'
    items = self.p_index.items()
    #items.sort(key=lambda x: x[1][0], reverse=True)
    items.sort(key=lambda x: x[1], reverse=True)
    rank = 0
    print 'symbol transactions strategy hold normalized'
    for key, value in items:
        rank += 1
        if rank <= 20:
            print '%-7s : %3s %s %s %s' % (key, value[0], value[1],
                                           value[2], value[3])
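# Hedged sketch of how a strategy might be configured for run_backtest().
# The enter/exit/risk expressions are eval'd inside the trading-day loop, so
# they can reference the locally defined indicator helpers (rsi, mfi, fsto_k,
# l_b_band, u_b_band, ...) and local values such as close and purchase.
# MyStrategy, the thresholds, and the chart_functions string below are made-up
# illustrations, not values from the original project.
strategy = MyStrategy()                                          # hypothetical strategy object
strategy.enter_signal = 'rsi(14) < 30 and close < l_b_band(20)'  # oversold and below lower band
strategy.exit_signal = 'rsi(14) > 70'                            # overbought
strategy.risk_cap = 'close < 0.92 * purchase'                    # stop-loss at an 8% drawdown
strategy.chart_functions = 'close:l_b_band(20):u_b_band(20);rsi(14):mfi(14)'
strategy.run_backtest()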
def find_suspicious_user(df, out_dir, window_size=2*3600, interval=1,
                         activity_th_for_hash=2, shifting_times=40, modul=5000,
                         sigma=0.024, good_user_th=5, good_bin_th=5):
    '''
    This corresponds to the indexer part of DeBot (step_2.py). The main
    difference is that random projection is performed only if the author count
    exceeds 150. This threshold was determined experimentally: the hashing is
    time-consuming and may take more time than computing DTW between all the
    authors without hashing them.

    - df : a dataframe containing 2 columns:
        - author_id : the id of the author of the tweet
        - created_at : time of creation of the tweet (datetime.datetime)
      with the sorted created_at column as index.
    - out_dir : path of a directory where to store the suspicious users.
    - window_size : the window size (in seconds) used to compare authors with
      each other. Following DeBot, the default value is 2 hours.
    - interval : the interval (in seconds) between two values in the time series.
    - activity_th_for_hash : threshold on the tweet count of a given author,
      under which the author is filtered out. The default value is low because
      we do not have the entire timeline of the users yet.
    - shifting_times : number of shifted copies kept for each user in the
      buckets (DeBot).
    - modul : number of buckets (DeBot).
    - sigma : standard deviation for the equi_probable function (DeBot).
    - good_user_th : how many copies a user should have to be picked in a bin
      (default 5). Strict comparison (x > good_user_th) (DeBot).
    - good_bin_th : how many suspicious users a bin should have to be picked
      (default 5). Strict comparison (x > good_bin_th) (DeBot).

    The suspicious users are reported as a list for each time window, in a
    dataframe with two columns:
    - time_window : the start of the time window where the users are suspicious
    - suspicious_users : a list of suspicious users according to random
      projection. They may not be bots, but they were hashed into the same
      bucket and are therefore suspicious.

    The dataframe is stored on disk in the given location. Intermediate csv
    files are also stored, so that if the computation stops for some reason we
    can relaunch from the last date saved on disk.
    '''
    # the first date
    start_date = df.iloc[0].created_at
    # the last date
    final_date = df.iloc[-1].created_at
    # the start date of the current window
    start_window = start_date
    # will be used to save the results on disk regularly
    base_time = time.time()
    # will be used to print the progress
    last_time = start_window - timedelta(days=31)
    # current index of the partial results to save on disk
    idx_out = 0
    # wrapper to compute time series
    TS = time_series.TimeSeries(interval=interval)
    # rows that will contain the suspicious users
    rows = []

    # Iterate over all windows
    while start_window + timedelta(seconds=window_size) <= final_date:
        if start_window > last_time + timedelta(days=30):
            print(start_window)
            last_time = start_window
        end_window = start_window + timedelta(seconds=window_size)

        # Keep the tweets corresponding to the window
        df_window = df.loc[start_window:end_window]

        # For each author_id, get the activity (list of datetime)
        df_window = (df_window.groupby('author_id')['created_at']
                     .apply(list)
                     .reset_index(name='activity'))

        # Compute how many tweets each author created
        df_window["count"] = df_window['activity'].apply(len)

        # Filter out all authors with low activity
        df_window = df_window[df_window["count"] >= activity_th_for_hash]

        # Convert each activity (list of datetime) into a non-normalized
        # time series (list of int)
        df_window["ts"] = df_window['activity'].apply(
            lambda x: TS.ts_from_datetime(x, start_window, end_window))

        # Hash users into buckets using the DeBot hashing technique
        ts_list = list(df_window.ts)
        if len(ts_list) > 1:
            test_time = time.time()
            bucket_user = hash_equi_prob(ts_list, modul, shifting_times, sigma)
            author_id_list = list(df_window.author_id)
            to_report = set()

            # From experiment, it is faster and more accurate to compute the
            # warped correlation for all authors in the time window without
            # doing the hashing part if the author count is less than 150
            if len(author_id_list) > 150:
                # Find the set of suspicious users in the bucket
                good_bin = 0
                bucket_user = sort_list_based_len(bucket_user)
                for i in range(len(bucket_user)):
                    cur_good_user = get_num_good_usr(bucket_user[i], good_user_th)
                    if len(cur_good_user) > good_bin_th:
                        good_bin = good_bin + 1
                        for j in range(len(cur_good_user)):
                            cur_id = author_id_list[cur_good_user[j]]
                            to_report.add(cur_id)
            else:
                to_report = to_report.union(set(author_id_list))

            if len(to_report) > 1:
                rows.append([start_window, to_report])

        # Save the suspicious users
        if time.time() > base_time + 300:
            if len(rows) > 0:
                df_suspicious = pd.DataFrame(rows, columns=['time_window', 'suspicious_users'])
                df_suspicious.to_csv(os.path.join(out_dir, "out_hash_" + str(idx_out) + ".csv"),
                                     index=False, encoding='utf-8')
                print("saved until " + str(start_window))
                rows = []
                idx_out += 1
                base_time = time.time()

        # Update start_window
        start_window = end_window

    # Save the remaining users
    df_suspicious = pd.DataFrame(rows, columns=['time_window', 'suspicious_users'])
    df_suspicious.to_csv(os.path.join(out_dir, "out_hash_" + str(idx_out) + ".csv"),
                         index=False, encoding='utf-8')

    # Concatenate the partial results
    concat_csv(out_dir, "out_hash_", os.path.join(out_dir, "out_hash.csv"))
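# Hedged usage sketch for find_suspicious_user: the CSV path, the raw file's
# layout, and the output directory are hypothetical placeholders; only the
# required author_id / created_at columns and the sorted created_at index
# follow the docstring above.
import pandas as pd

tweets = pd.read_csv('tweets.csv', parse_dates=['created_at'])   # hypothetical input file
tweets = tweets.sort_values('created_at').set_index('created_at', drop=False)
tweets = tweets[['author_id', 'created_at']]

find_suspicious_user(tweets, out_dir='debot_out',
                     window_size=2 * 3600,       # 2-hour windows, as in DeBot
                     activity_th_for_hash=2)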
def setUp(self):
    self.test_symbol = symbol.Symbol('^DJI', '20090103', '20091001')
    self.time_series1 = time_series.TimeSeries(
        ['20090101', '20090102', '20090103', '20090104'],
        [1000, 1001, 1002, 1003])
    self.time_series2 = time_series.TimeSeries(
        ['20090101', '20090102', '20090103', '20090104'],
        [1000, 998, 1002, 1003])
    self.time_series3 = time_series.TimeSeries(
        ['20090101', '20090102', '20090103', '20090104'],
        [1003, 1002, 1001, 1000])
    self.time_series_long = time_series.TimeSeries([
        '20090101', '20090102', '20090103', '20090104', '20090105',
        '20090106', '20090107', '20090108', '20090109', '20090110',
        '20090111', '20090112', '20090113', '20090114', '20090115',
        '20090116',
    ], [
        1000, 998, 970, 1008, 1001, 978, 999, 1002,
        1005, 1002, 998, 1000, 999, 1002, 1004, 1007,
    ])
    self.time_series_m_avg = time_series.TimeSeries(
        ['20090102', '20090103', '20090104'], [1000.5, 1001.5, 1002.5])
    self.time_series_hist_changes = time_series.TimeSeries(
        ['20090102', '20090103', '20090104'], [-2, 4, 1])
    self.time_series_hist_adv = time_series.TimeSeries(
        ['20090102', '20090103', '20090104'], [0, 4, 1])
    self.time_series_hist_dec = time_series.TimeSeries(
        ['20090102', '20090103', '20090104'], [2, 0, 0])
    self.time_series_low1 = time_series.TimeSeries(
        ['20090110', '20090111', '20090112'], [1002, 998, 1000])
    self.time_series_low2 = time_series.TimeSeries(
        ['20090110', '20090111', '20090112'], [1002, 998, 998])
    self.time_series_low8 = time_series.TimeSeries(
        ['20090110', '20090111', '20090112'], [970, 978, 978])
    self.time_series_high1 = time_series.TimeSeries(
        ['20090110', '20090111', '20090112'], [1002, 998, 1000])
    self.time_series_high2 = time_series.TimeSeries(
        ['20090110', '20090111', '20090112'], [1005, 1002, 1000])
    self.time_series_high8 = time_series.TimeSeries(
        ['20090110', '20090111', '20090112'], [1008, 1008, 1005])
    self.time_series_rsi = time_series.TimeSeries([
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        '10', '11', '12', '13', '14', '15', '16', '17', '18', '19'
    ], [
        46.125, 47.125, 46.4375, 46.9375, 44.9375, 44.25, 44.625, 45.75,
        47.8125, 47.5625, 47.0, 44.5625, 46.3125, 47.6875, 46.6875, 45.6875,
        43.0625, 43.5625, 44.875, 43.6875
    ])
    self.time_series_stdev = time_series.TimeSeries([
        '20090901', '20090902', '20090903', '20090904',
        '20090905', '20090906', '20090907', '20090908'
    ], [2., 4., 4., 4., 5., 5., 7., 9.])
    self.time_series_exp_m_avg = time_series.TimeSeries([
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
        '11', '12', '13', '14', '15', '16', '17', '18', '19', '20'
    ], [
        64.75, 63.79, 63.73, 63.73, 63.55, 63.19, 63.91, 63.85, 62.95, 63.37,
        61.33, 61.51, 61.87, 60.25, 59.35, 59.95, 58.93, 57.68, 58.82, 58.87
    ])
def find_correlated_activity(df, in_path, out_dir, window_size=2*3600, interval=1,
                             allowable_lag=20, activity_th_for_dtw=10,
                             report_threshold=0.995, use_shift=False,
                             fetch_timelines=False, since_id=None, max_id=None):
    '''
    Iterate over all time windows and compute DTW between authors in each time
    window. Each author must have at least a minimum number of tweets during
    the time window for the DTW to be computed.

    Report the account pairs whose warped correlation is over report_threshold
    and store them in a csv file containing 4 columns:
    - author_id_x
    - author_id_y
    - warp_corr : warped correlation between the two authors
    - window : start of the time window

    No duplicates: if author_id_x = X and author_id_y = Y in one row, there
    will not be author_id_x = Y and author_id_y = X in some other row.

    Parameters:
    - df : a dataframe containing 2 columns:
        - author_id : the id of the author of the tweet
        - created_at : time of creation of the tweet (datetime.datetime)
      with the sorted created_at column as index. The dataframe should contain
      the complete timeline for all users.
    - in_path : the path to the csv file containing the suspicious users,
      created by the find_suspicious_user function. If None, the warped
      correlation is computed between every pair of authors for each time window.
    - out_dir : the path to the output directory.
    - window_size : the window size (in seconds) used to compare authors with
      each other. Following DeBot, the default value is 2 hours.
    - interval : the interval (in seconds) between two values in the time series.
    - allowable_lag : the maximum shift (in seconds) allowed to match two tweets
      from two different authors.
    - activity_th_for_dtw : the minimum number of tweets a user must have to be
      a candidate for DTW computation. We use 10 by default, as DeBot does.
    - use_shift : when computing DTW, if set to True, one time series is shifted
      so that it better matches the other. In that case a lower allowable lag
      should be used, otherwise false positives may occur.
    '''
    # Create the correlation dataframe
    df_corr = pd.DataFrame(columns=['author_id_x', 'author_id_y', 'warp_corr', 'window'])
    # will be used to save the results on disk regularly
    base_time = time.time()
    # the first date
    start_date = df.iloc[0].created_at
    # the last date
    end_date = df.iloc[-1].created_at
    # the start date of the current window
    start_window = start_date
    # wrapper to compute time series
    TS = time_series.TimeSeries(interval=interval)
    # current index of the partial results to save on disk
    idx_out = 0

    # create the suspicious user dataframe
    df_suspicious = None
    if in_path is not None:
        df_suspicious = pd.read_csv(in_path, encoding='utf-8', engine='python',
                                    index_col="time_window")

    last_time = start_window - timedelta(days=31)

    # Iterate over all windows
    while start_window + timedelta(seconds=window_size) <= end_date:
        # Print the start window every month to give the progress
        if start_window > last_time + timedelta(days=30):
            print(start_window)
            last_time = start_window
        end_window = start_window + timedelta(seconds=window_size)

        # Keep only tweets in the current time window
        df_window = df.loc[start_window:end_window]

        # Filter out authors that are not suspicious
        if df_suspicious is not None and start_window in df_suspicious.index:
            # column written by find_suspicious_user
            susp_authors = df_suspicious.loc[start_window]["suspicious_users"]
            df_window = df_window[df_window.author_id.isin(susp_authors)]

        if df_suspicious is None or start_window in df_suspicious.index:
            # Group tweets by authors
            df_grouped = (df_window.groupby('author_id')['created_at']
                          .apply(list)
                          .reset_index(name='activity'))

            # Keep only authors who have enough tweets during this time window
            df_grouped["count"] = df_grouped.activity.apply(len)
            df_filtered = df_grouped[df_grouped["count"] >= activity_th_for_dtw].copy()
            authors_kept = set(df_filtered["author_id"])

            # If there are at least two authors remaining, compute DTW between
            # all of them
            if df_filtered.shape[0] > 1:
                # Compute the z-normalized time series
                df_filtered["ts"] = df_filtered["activity"].apply(
                    lambda x: TS.znorm(TS.ts_from_datetime(x, start_window, end_window)))

                # Compute DTW between each pair of authors
                out = compute_dtw(df_filtered, allowable_lag, use_shift)
                #print("max corr = " + str(out.warp_corr.max()) + " at " + str(start_window))
                out["window"] = start_window
                corr = out[out["warp_corr"] >= report_threshold].copy()
                df_corr = pd.concat([df_corr, corr])

        if time.time() > base_time + 300:
            if df_corr.shape[0] > 0:
                df_corr.to_csv(os.path.join(out_dir, "out_corr_" + str(idx_out) + ".csv"),
                               index=False, encoding='utf-8')
                print("saved until " + str(start_window))
                df_corr = pd.DataFrame(columns=['author_id_x', 'author_id_y', 'warp_corr', 'window'])
                idx_out += 1
                base_time = time.time()

        # Update start_window
        start_window = end_window

    df_corr.to_csv(os.path.join(out_dir, "out_corr_" + str(idx_out) + ".csv"),
                   index=False, encoding='utf-8')

    # Concatenate the partial results
    concat_csv(out_dir, "out_corr_", os.path.join(out_dir, "out_corr.csv"))
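# Hedged usage sketch for find_correlated_activity, consuming the out_hash.csv
# produced by find_suspicious_user above. The dataframe `tweets` is assumed to
# be prepared as in the previous sketch (author_id / created_at columns,
# indexed by the sorted created_at); the paths are placeholders.
import os

find_correlated_activity(tweets,
                         in_path=os.path.join('debot_out', 'out_hash.csv'),
                         out_dir='debot_out',
                         allowable_lag=20,          # seconds of allowed shift
                         activity_th_for_dtw=10,
                         report_threshold=0.995)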