def main():
    """Generate baseline simulations by resampling historical event data.

    For each simulation period, the weekly extracted data files are split
    into a history window (same length as the simulation period, ending
    right before it) and a prior window (everything earlier, used only to
    track previously-seen users).  ``sample_from_historical_data`` is then
    run per (platform, informationID) group, and each run's output is
    written to ``baseline_<start>_<end>_<run>.json``.

    Relies on module-level names: ``glob``, ``re``, ``pd``, ``ss``,
    ``time`` and ``sample_from_historical_data``.
    """
    path = './'
    n_runs = 1
    simulation_periods = [['2019-02-01', '2019-02-15'],
                          ['2019-02-08', '2019-02-22'],
                          ['2019-02-15', '2019-03-01'],
                          ['2019-02-22', '2019-03-01']]

    # Files are in weekly subsets, e.g.
    # venezuela_v2_extracted_twitter_2019-02-01_2019-02-08.json
    all_files = glob.glob(path + 'venezuela_v2_extracted*.json')

    # Extract dates and platforms from file names.  Raw string avoids the
    # invalid-escape-sequence warning the original non-raw '\d' produced.
    date_re = r'(20\d\d-\d\d-\d\d)_(20\d\d-\d\d-\d\d)'
    dates = [re.search(date_re, fn) for fn in all_files]
    start_dates = [d.group(1) for d in dates]
    end_dates = [d.group(2) for d in dates]
    platforms = [re.search('twitter|youtube', fn).group(0) for fn in all_files]

    # Data frame with files, dates, and platforms, sorted chronologically.
    fn_df = pd.DataFrame({'fn': all_files,
                          'start': start_dates,
                          'end': end_dates,
                          'platform': platforms})
    fn_df['start'] = pd.to_datetime(fn_df['start'])
    fn_df['end'] = pd.to_datetime(fn_df['end'])
    fn_df = fn_df.sort_values('start')

    # Loop over simulation periods.
    for sim_period in simulation_periods:
        # Start and end time of the simulation.
        start = pd.to_datetime(sim_period[0])
        end = pd.to_datetime(sim_period[1])

        # Select files to sample from based on dates: the history window has
        # the same length as the simulation period and sits just before it;
        # "previous" covers everything earlier still.
        hist_files = fn_df[(fn_df['start'] < start) &
                           (fn_df['start'] >= start - (end - start))]
        previous = fn_df[fn_df['start'] < start - (end - start)]

        print(start, end)
        print('Historical Data to Sample From')
        print(hist_files)
        print('Prior Data to Track Users From')
        print(previous)

        previous_history_data = list(previous['fn'].values)
        history_data = list(hist_files['fn'].values)

        # Load the historical event data to sample from.  (Renamed from the
        # original's reuse of 'hist' for both the file listing and the data.)
        hist = pd.concat([
            ss.load_data(data, ignore_first_line=False, verbose=False)
            for data in history_data
        ])
        hist = hist.sort_values('nodeTime')

        # Load the earlier data; only user/information pairs are needed to
        # recognize returning users.
        previous_hist = pd.concat([
            ss.load_data(data, ignore_first_line=False, verbose=False)
            for data in previous_history_data
        ])
        previous_hist = previous_hist[['nodeUserID',
                                       'informationID']].drop_duplicates()

        # Multiple runs of the baseline sampling.
        for i in range(n_runs):
            dfs = []
            # For each platform and information ID.
            for (plat, info), grp in hist.groupby(['platform',
                                                   'informationID']):
                print(plat, info)
                starting = time.time()
                sampled_df = sample_from_historical_data(
                    grp, info, plat,
                    hist['nodeTime'].min(), hist['nodeTime'].max(),
                    start, end,
                    previous_hist=previous_hist,
                    new_users=True)
                ending = time.time()
                elapsed = (ending - starting) / 60.0
                print(f'Time elapsed: {elapsed} minutes')
                dfs.append(sampled_df)

            baseline = pd.concat(dfs).reset_index(drop=True)

            # Save the generated baseline for this run.
            start_str = start.strftime('%Y-%m-%d')
            end_str = end.strftime('%Y-%m-%d')
            baseline.to_json(f'baseline_{start_str}_{end_str}_{i}.json',
                             orient='records',
                             lines=True)
import socialsim as ss

# Measurement/metric configuration for this challenge.
config = ss.load_config('data/cp4_configuration.json')

# Platform metadata used by the measurements.
metadata = ss.MetaData()

# Ground truth data; the first line of the file is metadata, not an event.
ground_truth_filepath = 'data/test_dataset.json'
ground_truth = ss.load_data(ground_truth_filepath,
                            ignore_first_line=True,
                            verbose=False)

# Evaluation runner bound to the ground truth above.
eval_runner = ss.EvaluationRunner(ground_truth, config, metadata=metadata)

# Evaluate a series of submissions whose first line carries submission
# metadata: run all measurements and metrics on each one.
submission_filepaths = ['data/test_dataset.json']
for simulation_filepath in submission_filepaths:
    results, logs = eval_runner(simulation_filepath,
                                verbose=True,
                                submission_meta=True)
import socialsim as ss

# Load the example dataset and keep only the first 2000 twitter events.
dataset = ss.load_data('data/test_dataset.txt')
dataset = dataset[dataset['platform'] == 'twitter'].head(n=2000)

# Load the configuration and subset it to the twitter social-structure task.
config = ss.load_config('cp1_configuration.json')
config = config['twitter']['social_structure']

# Measurement object for social-structure measurements on the twitter subset.
social_structure_measurements = ss.SocialStructureMeasurements(
    dataset, config, None, 'twitter')

# Run every measurement listed in the configuration.
results = social_structure_measurements.run(verbose=True)
def check_records(submission_filepath, nodelist, simulation_period):
    """Validate a challenge submission file and report problems.

    Checks that the file loads, that its platforms / informationIDs /
    actionTypes match the valid options for the current challenge, that
    required event fields contain no NaN values, that a non-empty
    user-to-user network can be built, and that nodeTime values fall
    inside the simulation window for ``simulation_period``.

    Parameters
    ----------
    submission_filepath : str
        Path to the submission file; its first line is submission metadata.
    nodelist : object or None
        When not None, informationID checks are performed.
    simulation_period : hashable
        Key into ``VALID_OPTIONS[challenge]['simulation_windows']``.

    Returns
    -------
    str
        ``'success'`` when no errors or warnings were found, otherwise a
        formatted report listing ERRORS and WARNINGS.

    Notes
    -----
    Relies on module-level names: ``challenge``, ``VALID_OPTIONS``,
    ``check_all_present``, ``ss`` and ``pd``.
    """
    errors, warnings = [], []

    try:
        # Test that submission file can be loaded.
        subm = ss.load_data(submission_filepath,
                            ignore_first_line=True,
                            verbose=False)
        loaded = True
    except Exception as e:
        errors.append('Submission could not be loaded: ' + str(e))
        loaded = False

    if loaded:
        # Platform tests.
        valid_items = VALID_OPTIONS[challenge]['platforms']
        subm_items = set(subm['platform'].unique())
        platform_errors, platform_warnings = check_all_present(
            valid_items, subm_items, 'platforms')
        errors.extend(platform_errors)
        warnings.extend(platform_warnings)

        if nodelist is not None:
            # informationID tests.
            valid_items = VALID_OPTIONS[challenge]['informationID']
            subm_items = set(subm['informationID'].unique())
            informationID_errors, informationID_warnings = check_all_present(
                valid_items, subm_items, 'informationIDs')
            errors.extend(informationID_errors)
            warnings.extend(informationID_warnings)

        # Test that there are no NaN items in required event details.
        for c in ['informationID', 'nodeTime', 'nodeID', 'parentID',
                  'rootID', 'platform', 'actionType', 'nodeUserID']:
            if len(subm[c]) != len(subm[c].dropna()):
                errors.append(f'{c} can not be NaN values.')
                print(c)
                print(subm[c].astype(str).unique())

        # Check for an empty user-user network: if no parentID ever appears
        # as a nodeID, no user-to-user edges can be constructed.
        parentID_nodeID_overlap = set(subm['parentID']).intersection(
            set(subm['nodeID']))
        if len(parentID_nodeID_overlap) == 0:
            warnings.append(
                'There is no overlap between nodeID values and parentID values -- the user-to-user network created from this submission will be empty.'
            )

        # Check that nodeTimes fall within the simulation window.
        try:
            simulation_window = VALID_OPTIONS[challenge]['simulation_windows'][
                simulation_period]
            minday = f'2019-{simulation_window[0]}'
            maxday = f'2019-{simulation_window[1]} 23:59'
            maxday_str = maxday.split(' ')[0]
            # Normalized ISO timestamp strings compare chronologically.
            subm['nodeTime'] = pd.to_datetime(subm['nodeTime']).astype(str)
            subm_minday = subm['nodeTime'].min()
            subm_maxday = subm['nodeTime'].max()
            if subm_maxday <= minday:
                errors.append(
                    f'There is no data within the simulation period, all nodeTime values occur before the simulation period ({minday} - {maxday_str}).\n\tSubmission nodeTime values -- Min: {subm_minday} Max: {subm_maxday}'
                )
            elif subm_minday > maxday:
                errors.append(
                    f'There is no data within the simulation period, all nodeTime values occur after the simulation period ({minday} - {maxday_str}).\n\tSubmission nodeTime values -- Min: {subm_minday} Max: {subm_maxday}'
                )
            else:
                # BUGFIX: these were 'elif' arms of the chain above, so a
                # submission spilling over BOTH ends of the window reported
                # only the "before" warning.  Check each end independently.
                if subm_minday < minday:
                    warnings.append(
                        f'Some events occur before the simulation period, earliest nodeTime value is {subm_minday}'
                    )
                if subm_maxday > maxday:
                    warnings.append(
                        f'Some events occur after the simulation period, latest nodeTime value is {subm_maxday}'
                    )
        except Exception as e:
            warnings.append(
                'Could not validate nodeTimes occur within the simulation period: '
                + str(e))

        # actionType tests.
        valid_items = VALID_OPTIONS[challenge]['actiontypes']
        subm_items = set(subm['actionType'].unique())
        platform_errors, platform_warnings = check_all_present(
            valid_items, subm_items, 'actionType')
        errors.extend(platform_errors)
        warnings.extend(platform_warnings)

    # Assemble the final report.
    result = ''
    if len(errors) > 0:
        result = result + 'ERRORS:\n\t'
        result = result + '\n\n\t'.join(errors) + '\n\n'
    if len(warnings) > 0:
        result = result + 'WARNINGS:\n\t'
        result = result + '\n\n\t'.join(warnings)
    if result == '':
        result = 'success'
    return result
import socialsim as ss

# Both the simulation and the ground truth point at the example dataset.
simulation = ss.load_data('data/test_dataset.txt')
ground_truth = ss.load_data('data/test_dataset.txt')

# Measurement/metric configuration for this challenge.
config = ss.load_config('data/cp2_configuration.json')

# Metadata, including the community definitions stored on disk.
metadata = ss.MetaData(community_directory='data/communities/')

# Task runner in test mode, bound to the ground truth above.
task_runner = ss.TaskRunner(ground_truth, config, metadata=metadata, test=True)

# Run measurements and metrics on the simulation data.
results = task_runner(simulation, verbose=True)
from pprint import pprint
import socialsim as ss

# Simulation and ground truth both come from the example dataset; the first
# line of each file is metadata rather than an event record.
simulation = ss.load_data('data/test_dataset.json',
                          ignore_first_line=True,
                          verbose=False)
ground_truth = ss.load_data('data/test_dataset.json',
                            ignore_first_line=True,
                            verbose=False)

# Measurement/metric configuration for the CP3 scenario-1 task.
config = ss.load_config('data/cp3_s1_configuration.json')

# Platform metadata used by the measurements.
metadata = ss.MetaData()

# Task runner bound to the ground truth above.
task_runner = ss.TaskRunner(ground_truth, config, metadata=metadata)

# Run measurements and metrics on the simulation data.
results, logs = task_runner(simulation, verbose=True)