def get_priors():
    try:
        load_dotenv('/home/amanda/bigDisk/Twitter/creds/.env')
        username = os.getenv('DATABASE_USER')
        password = os.getenv('DATABASE_PASSWORD')
        conn_string = "dbname='twitter' user=" + "******" + " password=" + "******"
        conn = psycopg2.connect(conn_string)
    except Exception as e:
        print e
        return
    cur = conn.cursor()

    # Collect (directory, filename) pairs for the stored timeline files
    file_names = []
    for file_path, dirs, files in os.walk('/home/amanda/bigDisk/Twitter/random_streams/'):
        for file_name in files:
            file_names.append((file_path, file_name))
    print "got filenames"

    features_list = []
    for file_path, file_name in file_names:
        try:
            f = json.load(open(file_path + '/' + file_name, 'r'))
        except:
            print file_path
            print file_name
            continue
        # Only keep users with a reasonably long timeline
        if len(f) > 150:
            user_id = file_name.split('_')[0]
            cur.execute('SELECT user_info_json FROM followers WHERE user_id = %s', (user_id,))
            record = cur.fetchone()
            if record:
                if record[0]:
                    user_info = ast.literal_eval(record[0])
                else:
                    continue
                gf = GetFeatures(user_id, user_info, f)
                gf.user_features()
                gf.collect_tweets()
                gf.content_features()
                gf.temporal_features()
                # need to incorporate other network features
                # gf.features['num_shared_edges'] = follower_counts[user]
                features_list.append(gf.features)
                print len(features_list)
    pickle.dump(features_list, open('priors_feature_list.p', 'wb'))
def __init__(self, neighborhood, folder, what_to_run, landmark_function_to_run, name):
    """
    :param neighborhood: the neighborhood whose accessibility is examined
    :param folder: folder in which to save the results
    :param what_to_run: during development not all of the steps should run
    :param landmark_function_to_run: there are many steps in this class, so sometimes only part of them are run
    """
    # Build the network based on the neighborhood.
    # Save the absolute path to pedestrian_flow_model before changing the workspace folder.
    pedestrian_flow_folder = os.path.dirname(__file__) + '/pedestrian_flow'
    os.chdir(folder)
    if what_to_run['Network']:
        print('run Network')
        Network(neighborhood, name + '_ntwrk.shp')
    if what_to_run['get_features']:
        print(' get_features')
        gdb = gpd.read_file(name + '_ntwrk.shp')
        GetFeatures(gdb, neighborhood, name + '_features.shp')
    if what_to_run['pedestrian flow']:
        print(' pedestrian flow')
        GetFeatures.calculate_padestrain_flow(pedestrian_flow_folder + '/finalized_model.sav',
                                              name + '_features.shp',
                                              pedestrian_flow_folder,
                                              name + '_ped_flow.shp')
    # Calculate landmark criterion
    if what_to_run['landmark']:
        print('run landmark')
        Landmark(landmark_function_to_run, neighborhood, name)
    # Calculate waytype criterion
    if what_to_run['WayType']:
        print('run WayType')
        WayType(name)
    # Calculate complexity criterion
    if what_to_run['Complexity']:
        print('run Complexity')
        Complexity(name)
    # Calculate final cost
    if what_to_run['Final']:
        print('run Final')
        Final(name)
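# The snippet above is the constructor of a pipeline class whose name is not shown.
# A minimal, hypothetical usage sketch follows: the class name (CostPipeline), the
# boundary shapefile, and the landmark_function_to_run value are assumptions, not
# taken from the original code.
import geopandas as gpd

neighborhood = gpd.read_file('neighborhood_boundary.shp')['geometry'][0]  # assumed polygon input

what_to_run = {
    'Network': True,            # build the street network
    'get_features': True,       # extract per-edge features
    'pedestrian flow': True,    # predict pedestrian flow with the saved model
    'landmark': True,
    'WayType': True,
    'Complexity': True,
    'Final': True,              # combine the criteria into the final cost
}

CostPipeline(neighborhood,
             folder='results/my_neighborhood',
             what_to_run=what_to_run,
             landmark_function_to_run={'all': True},   # format assumed
             name='my_neighborhood')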
def get_priors():
    folders = os.walk('/home/amanda/bigDisk/Twitter/random_streams/')
    try:
        load_dotenv('/home/amanda/bigDisk/Twitter/creds/.env')
        username = os.getenv('DATABASE_USER')
        password = os.getenv('DATABASE_PASSWORD')
        conn_string = "dbname='twitter' user=" + "******" + " password=" + "******"
        conn = psycopg2.connect(conn_string)
    except Exception as e:
        print e
        return
    cur = conn.cursor()

    features_list = []
    for file_path, dirs, files in folders:
        for file_name in files:
            try:
                f = json.load(open(file_path + '/' + file_name, 'r'))
            except:
                print file_path
                print file_name
                continue
            # Skip timelines outside the length range we featurise
            if len(f) <= 150:
                print "too long"
                continue
            user_id = file_name.split('_')[0]
            cur.execute('SELECT user_info_json FROM followers WHERE user_id = %s', (user_id,))
            record = cur.fetchone()
            if not record or not record[0]:
                continue
            user_info = ast.literal_eval(record[0])
            gf = GetFeatures(user_id, user_info, f)
            gf.user_features()
            gf.collect_tweets()
            gf.content_features()
            gf.temporal_features()
            print gf.features
            features_list.append(gf.features)
    pickle.dump(features_list, open('priors_feature_list.p', 'wb'))
def GetFeatureMaps(self):
    """Stores feature maps of the stored faces obtained using a pretrained vgg16 network"""
    # Make feature model
    FeatureModel = MakeFeatureModel(modelName='vgg16')
    # Get list of list of feature maps
    FeatureMaps = []
    for Face in self.Faces:
        Face = Face.unsqueeze(0)
        Face = nn.functional.interpolate(Face, size=(256, 256), mode='bilinear')
        FeatureMaps.append(GetFeatures(Face, FeatureModel))
    assert (len(FeatureMaps) == len(self.Faces)
            ), 'Feature maps obtained not equal to number of stored faces'
    # Concatenate feature maps in batch size dimension
    self.FeatureMaps = []
    for i in range(len(FeatureMaps[0])):
        self.FeatureMaps.append(
            torch.cat([Map[i] for Map in FeatureMaps], dim=0))
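# MakeFeatureModel and GetFeatures are called above but not shown in this snippet.
# A minimal sketch of what they might look like follows, assuming the intent is to
# collect intermediate activations from a pretrained torchvision VGG16; the chosen
# layer indices and the Layers parameter are assumptions, not taken from the
# original code.
import torch
import torchvision

def MakeFeatureModel(modelName='vgg16'):
    # Pretrained VGG16 convolutional trunk, frozen and in eval mode
    assert modelName == 'vgg16'
    model = torchvision.models.vgg16(pretrained=True).features.eval()
    for p in model.parameters():
        p.requires_grad = False
    return model

def GetFeatures(Images, FeatureModel, UseCuda=False, Layers=(3, 8, 15, 22)):
    # Run Images through FeatureModel and collect the activations after each
    # layer index listed in Layers (one tensor per layer).
    if UseCuda and torch.cuda.is_available():
        Images = Images.cuda()
        FeatureModel = FeatureModel.cuda()
    Features, x = [], Images
    for i, layer in enumerate(FeatureModel):
        x = layer(x)
        if i in Layers:
            Features.append(x)
    return Features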
def TrainLatentsEnc(Model, FaceLatents, Epochs_n, LearnRate, LearnRateW, BatchSize=2,
                    Weight_feature=1e2, Weight_pixel=1, Weight_tp=1e4,
                    TrainZ=True, StoredTargetFeatures=True, PrintInterval=20,
                    MaxBatchPrint=10, UseCuda=True):
    """Training function to train a StyleGAN encoder network and optionally an input
    Zspace, mapping the Zspace to an extended Wspace (input to the StyleGAN synthesis
    network) corresponding to the target faces stored in FaceLatents.
    """
    # Initialisation of model
    if UseCuda:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        if device == 'cpu':
            raise Warning('UseCuda was set true but cuda is not available, using cpu')
    else:
        device = 'cpu'
    if device != FaceLatents.device:
        print('Training function and FaceLatents class are using different devices')
    Model = Model.to(device)
    for param in Model.parameters():
        param.requires_grad = False

    # Initialise learnable parameters
    OptimParams = []
    # Input Zspace
    if TrainZ:
        OptimParams = OptimParams + [FaceLatents.Zspace]
    # Parameters of encoder network
    DecompParams = list(Model.g_all.g_decompress.parameters())
    for params in DecompParams:
        params.requires_grad = True
    OptimParams = OptimParams + DecompParams

    # Initialise optimiser
    Optimiser = torch.optim.Adam(OptimParams, lr=LearnRate)
    # Initialise optimiser for W+ for target propagation
    WOptimiser = torch.optim.Adam([FaceLatents.Wspace], lr=LearnRateW)

    # Initialise loss variables
    Loss = torch.tensor(0, device=device).float()
    Loss_feature = torch.tensor(0, device=device).float()
    Loss_pixel = torch.tensor(0, device=device).float()
    Loss_tp = torch.tensor(0, device=device).float()

    # Initialise feature model to get feature maps from
    if Weight_feature > 0:
        FeatureModel = MakeFeatureModel(modelName='vgg16')

    # Calculate batches
    Batch_n = FaceLatents.Size // BatchSize
    if FaceLatents.Size % BatchSize != 0:
        Batch_n += 1

    # Training loop
    TimeStart = time.time()
    for Epoch_i in range(1, Epochs_n + 1):
        # Shuffle batch indices
        RandIdx = list(np.random.permutation(FaceLatents.Size))
        for Batch_i in range(Batch_n):
            # Reset gradients
            Optimiser.zero_grad()
            WOptimiser.zero_grad()

            # Get a batch
            StartIdx = Batch_i * BatchSize
            EndIdx = StartIdx + BatchSize
            BatchIdx = RandIdx[StartIdx:EndIdx]
            Zspace = FaceLatents.Zspace[BatchIdx].to(device)
            Wspace = FaceLatents.Wspace[BatchIdx].to(device)
            Faces = FaceLatents.Faces[BatchIdx].to(device)

            # Get target features for the batch
            if StoredTargetFeatures:
                TargetFeatures = [Maps[BatchIdx].to(device)
                                  for Maps in FaceLatents.FeatureMaps]
            else:
                Faces_ds = nn.functional.interpolate(Faces, size=(256, 256), mode='bilinear')
                TargetFeatures = GetFeatures(Faces_ds, FeatureModel, UseCuda=UseCuda)

            # Learn target W+ and compute target propagation loss
            if Weight_tp > 0:
                # Get synthesis output of W+ target
                ModelOut_tgt = Model.g_all.g_synthesis(Wspace)
                # Standardise pixel values to [0, 1]
                ModelOut_tgt = ModelOut_tgt.clone().clamp_(-1, 1).add_(1).div_(2.0)
                # Target feature loss
                if Weight_feature > 0:
                    # Get feature maps
                    ModelOut_ds_tgt = nn.functional.interpolate(ModelOut_tgt,
                                                                size=(256, 256),
                                                                mode='bilinear')
                    OutFeatures_tgt = GetFeatures(ModelOut_ds_tgt, FeatureModel)
                    Loss_feature_tgt = functools.reduce(
                        lambda x, y: x + y,
                        [nn.functional.mse_loss(Features[0], Features[1])
                         for Features in zip(OutFeatures_tgt, TargetFeatures)])
                # Target pixel loss
                Loss_pixel_tgt = nn.functional.mse_loss(ModelOut_tgt, Faces)
                # Aggregate pixel and feature loss
                Loss_W_ex_tgt = Weight_feature * Loss_feature_tgt + Weight_pixel * Loss_pixel_tgt
                # Learn W+ target
                Loss_W_ex_tgt.backward()
                WOptimiser.step()

                # Calculate target propagation loss
                Wspace_Z = Model.g_all.g_decompress(Zspace)
                Loss_tp = nn.functional.mse_loss(Wspace_Z, Wspace.detach())

            # Get model output from Zspace
            ModelOut = Model(Zspace)
            # Standardise output pixel values to [0, 1]
            ModelOut = ModelOut.clone().clamp_(-1, 1).add_(1).div_(2.0)

            # Compute feature (perceptual) loss
            Loss_feature = torch.tensor(0, device=device).float()
            if Weight_feature > 0:
                ModelOut_ds = nn.functional.interpolate(ModelOut, size=(256, 256), mode='bilinear')
                OutFeatures = GetFeatures(ModelOut_ds, FeatureModel, UseCuda=UseCuda)
                Loss_feature = functools.reduce(
                    lambda x, y: x + y,
                    [nn.functional.mse_loss(Features[0], Features[1])
                     for Features in zip(OutFeatures, TargetFeatures)])

            # Compute pixel loss
            Loss_pixel = nn.functional.mse_loss(ModelOut, Faces)

            # Aggregate losses
            Loss = Weight_feature * Loss_feature + Weight_pixel * Loss_pixel + Weight_tp * Loss_tp

            # Print metrics
            if (Epoch_i == 1 or Epoch_i == Epochs_n or Epoch_i % PrintInterval == 0) \
                    and Batch_i < MaxBatchPrint:
                print('Epoch: ', Epoch_i)
                print('Batch: ', Batch_i + 1)
                print('This batch: ', [FaceLatents.FaceNames[Idx] for Idx in BatchIdx])
                print('Total time elapsed: ', time.time() - TimeStart, ' s')
                print('Feature loss: ', Loss_feature.item(),
                      ' | Pixel loss: ', Loss_pixel.item(),
                      ' | Target prop loss: ', Loss_tp.item())
                print('Weighted - Feature loss: ', Weight_feature * Loss_feature.item(),
                      ' | Pixel loss: ', Weight_pixel * Loss_pixel.item(),
                      ' | Target prop loss: ', Weight_tp * Loss_tp.item())
                print('Total loss: ', Loss.item())
                ShowModelOutput(ModelOut)
                if Weight_tp > 0:
                    print('Target W+ output')
                    print('W+ loss: ', Loss_W_ex_tgt.item())
                    ShowModelOutput(Model.g_all.g_synthesis(Wspace))

            # Perform gradient descent and backprop
            Loss.backward()
            Optimiser.step()
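# Self-contained toy illustration of the target-propagation scheme used above:
# one optimiser refines a per-face W+ target from a reconstruction loss, while a
# second optimiser trains an encoder to map Z to the detached W+ target (plus a
# pixel term). All modules and shapes here are toy stand-ins (nn.Linear instead
# of the real StyleGAN components), chosen only to show the update order.
import torch
import torch.nn as nn

torch.manual_seed(0)
N, Zdim, Wdim = 4, 16, 32
Zspace = torch.randn(N, Zdim, requires_grad=True)   # learnable input latents
Wspace = torch.randn(N, Wdim, requires_grad=True)   # learnable W+ targets
Faces = torch.rand(N, Wdim)                         # toy "target images"
synthesis = nn.Linear(Wdim, Wdim)                   # stand-in for g_synthesis (frozen)
encoder = nn.Linear(Zdim, Wdim)                     # stand-in for g_decompress (trained)
for p in synthesis.parameters():
    p.requires_grad = False

Optimiser = torch.optim.Adam(list(encoder.parameters()) + [Zspace], lr=1e-3)
WOptimiser = torch.optim.Adam([Wspace], lr=1e-2)

for step in range(100):
    Optimiser.zero_grad()
    WOptimiser.zero_grad()
    # 1) refine the W+ target so that synthesis(W+) matches the faces
    Loss_W = nn.functional.mse_loss(synthesis(Wspace), Faces)
    Loss_W.backward()
    WOptimiser.step()
    # 2) train the encoder (and Z) against the detached W+ target and the faces,
    #    mirroring the Weight_pixel / Weight_tp weighting above
    Loss_tp = nn.functional.mse_loss(encoder(Zspace), Wspace.detach())
    Loss_pixel = nn.functional.mse_loss(synthesis(encoder(Zspace)), Faces)
    Loss = Loss_pixel + 1e4 * Loss_tp
    Loss.backward()
    Optimiser.step()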
# special case with Tel Aviv
city = 'tel_aviv'
place = place['geometry'][0]
output = os.path.join('networks', 'tel_aviv.shp')
if 'network' in keys_parameters:
    print(' network')
    Network(place, output, centrality=True, useful_tags_path=['highway'])
if 'get_features' in keys_parameters:
    print(' get_features')
    gdb = gpd.read_file(output)
    GetFeatures(gdb, place, 'output/' + city + '.shp')
# merge labels and features
if 'merge_labels_features' in keys_parameters:
    print(' merge_labels_features')
    if parameters['country'] == 'Germany':
        features_folder = 'output'
        features_file = MergeLabelFeatures.merge_location_to_features(features_folder)
        os.chdir(os.path.dirname(__file__))
        # for Germany the feature file type and encoding are different
        features_file = pd.read_csv('germany/features.csv')
        labels_file = pd.read_csv('germany/labels.csv')
        encoding = 'utf-8-sig'
    else:
def find_bots(self, priors):
    self.users_to_query = set()
    user_features = {}
    followers_set = set(self.followers)
    if self.level > 0:
        print "Number of followers: " + str(len(self.followers))
        follower_counts = Counter(self.followers).most_common()
        # should fix this to be a more precise measure
        size_to_keep = int(.08 * len(self.followers))
        connectedness_threshold = floor(0.3 * self.n)
        print size_to_keep
        print connectedness_threshold
        tmp_followers = [f[0] for f in follower_counts if f[1] > connectedness_threshold]
        print "Number of followers over threshold = " + str(len(tmp_followers))
        if len(tmp_followers) < size_to_keep:
            tmp_followers.extend([f[0] for f in follower_counts[:size_to_keep] if f[1] > 1])
        followers_set = set(tmp_followers)
        print "Number of connected followers: " + str(len(followers_set))

    print "Getting all user info..."
    for follower in followers_set:
        user_info = None
        follower = str(follower)
        if follower not in self.users and follower not in self.ignore_users:
            self.cur.execute('SELECT suspended, deleted, other_error, user_info, user_info_json FROM followers WHERE user_id = %s', (follower,))
            record = self.cur.fetchone()
            if record:
                if record[0] or record[1] or record[2]:
                    self.ignore_users.add(follower)
                    continue
                if record[3] and not record[4]:
                    self.ignore_users.add(follower)
                    continue
                if record[3] and record[4]:
                    try:
                        self.user_info[follower] = ast.literal_eval(record[4])
                        continue
                    except:
                        self.ignore_users.add(follower)
                        continue
            self.users_to_query.add(follower)
    get_user_info(self)

    print "Getting all timeline info and extracting features"
    for follower in followers_set:
        timeline = None
        follower = str(follower)
        if follower not in self.users and follower not in self.ignore_users:
            self.users.add(follower)
            self.cur.execute('SELECT suspended, deleted, other_error, timeline FROM followers WHERE user_id = %s', (follower,))
            record = self.cur.fetchone()
            if record:
                if record[0] or record[1] or record[2]:
                    self.ignore_users.add(follower)
                    # print "User is suspended or deleted"
                    continue
                if record[3]:
                    # print "Already have timeline information for user number " + follower
                    # Have to read in file to get timeline info
                    timeline = get_timeline_from_file(self, follower)
                else:
                    timeline = get_user_timeline(self, follower)
            else:
                timeline = get_user_timeline(self, follower)
            if timeline and self.user_info.get(follower) and len(timeline) > 50:
                gf = GetFeatures(follower, self.user_info[follower], timeline)
                try:
                    gf.user_features()
                    gf.collect_tweets()
                    gf.content_features()
                    gf.temporal_features()
                except Exception as e:
                    print "ERROR GETTING FEATURES"
                    print e
                    print follower
                    print self.user_info[follower]
                # need to incorporate other network features
                # gf.features['num_shared_edges'] = follower_counts[user]
                user_features[follower] = gf.features
                self.current_level_users.append(follower)

    # we can look at the out-degree of the collapsed ego network. We also calculate
    # the average out-degree, which is the average number of followers per follower.
    # need to get the followers for all these
    len_priors = len(priors)
    current_features = priors
    current_features.extend(user_features.values())

    print "Performing anomaly detection"
    # json.dump(priors, open('test.json', 'w'), indent=4, separators=(',', ': '))
    X = self.vec.fit_transform(current_features).toarray()
    current_features = {}
    X_norm = normalize(X)
    # print np.any(np.isnan(X))
    # print np.all(np.isfinite(X))
    outliers = self.perform_outlier_detection(X, len_priors)
    # How do I add back in the outliers to the anomaly detection?
    # Mueen said not to so I will leave for now
    self.level += 1

    # Add highly connected followers to the clique and to_check
    clique_features = {}
    for follower in outliers:
        self.clique.add((follower, self.level))
        self.to_check.add(follower)
        self.clique_features[follower] = user_features[follower]
    user_features = {}
    print self.clique
    self.n = float(len(self.clique))
    print "Current size of cluster: " + str(self.n)
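# perform_outlier_detection is called above but not shown in this snippet. A
# minimal sketch of what it might do follows, based on the LocalOutlierFactor
# logic in the other find_bots variant below; the exact signature, the
# n_neighbors value, and returning follower ids rather than row indices are
# assumptions.
from sklearn.neighbors import LocalOutlierFactor

def perform_outlier_detection(self, X, len_priors):
    # Fit LOF over the priors plus the current-level users, then keep only the
    # current-level rows (those after the priors) that LOF flags as outliers (-1).
    clf = LocalOutlierFactor(n_neighbors=20)
    labels = clf.fit_predict(X)
    labels_new = labels[len_priors:]
    return [user for user, label in zip(self.current_level_users, labels_new)
            if label == -1]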
def get_bot_features(users_file, output):
    folders = os.walk('/home/amanda/bigDisk/Twitter/Debot2/stream/')
    try:
        load_dotenv('/home/amanda/bigDisk/Twitter/creds/.env')
        username = os.getenv('DATABASE_USER')
        password = os.getenv('DATABASE_PASSWORD')
        conn_string = "dbname='twitter' user=" + "******" + " password=" + "******"
        conn = psycopg2.connect(conn_string)
    except Exception as e:
        print e
        return
    cur = conn.cursor()

    # Collect (directory, filename) pairs for the stored timeline files
    file_names = []
    for file_path, dirs, files in folders:
        for file_name in files:
            file_names.append((file_path, file_name))
    print "got filenames"

    features = []
    i = 0
    for file_path, file_name in file_names:
        if i >= 9000:
            break
        try:
            f = json.load(open(file_path + '/' + file_name, 'r'))
        except:
            print file_path
            print file_name
            continue
        if len(f) > 150:
            i += 1
            user_id = file_name.split('_')[0]
            cur.execute('SELECT user_info_json FROM followers WHERE user_id = %s', (user_id,))
            record = cur.fetchone()
            if record:
                if record[0]:
                    user_info = ast.literal_eval(record[0])
                else:
                    continue
                gf = GetFeatures(user_id, user_info, f)
                gf.user_features()
                gf.collect_tweets()
                gf.content_features()
                gf.temporal_features()
                features.append(gf.features)
    pd.DataFrame(features).to_csv(output)
    print "dumped file"
def find_bots(self, priors):
    print "Getting all user info..."
    self.users_to_query = set()
    followers_set = set(self.followers)
    print "Number of followers: " + str(len(self.followers))
    follower_counts = Counter(self.followers).most_common()
    # should fix this to be a more precise measure
    size_to_keep = int(.15 * len(self.followers))
    connectedness_threshold = floor(0.3 * self.n)
    tmp_followers = [f[0] for f in follower_counts if f[1] >= connectedness_threshold]
    if len(tmp_followers) < size_to_keep:
        tmp_followers.extend([f[0] for f in follower_counts[:size_to_keep] if f[1] > 1])
    followers_set = set(tmp_followers)
    print "Number of connected followers: " + str(len(followers_set))

    for follower in followers_set:
        user_info = None
        follower = str(follower)
        if follower not in self.users and follower not in self.ignore_users:
            self.cur.execute('SELECT suspended, deleted, other_error, user_info_json FROM followers WHERE user_id = %s', (follower,))
            record = self.cur.fetchone()
            if record:
                if record[0] or record[1] or record[2]:
                    self.ignore_users.add(follower)
                    # print "User is suspended or deleted"
                    continue
                if record[3]:
                    # print "Already have profile information for user number " + follower
                    self.user_info[follower] = ast.literal_eval(record[3])
                    continue
            self.users_to_query.add(follower)
    get_user_info(self)

    print "Getting all timeline info and extracting features"
    for follower in followers_set:
        timeline = None
        follower = str(follower)
        if follower not in self.users and follower not in self.ignore_users:
            self.users.add(follower)
            self.cur.execute('SELECT suspended, deleted, other_error, timeline FROM followers WHERE user_id = %s', (follower,))
            record = self.cur.fetchone()
            if record:
                if record[0] or record[1] or record[2]:
                    self.ignore_users.add(follower)
                    # print "User is suspended or deleted"
                    continue
                if record[3]:
                    # print "Already have timeline information for user number " + follower
                    # Have to read in file to get timeline info
                    timeline = get_timeline_from_file(self, follower)
                else:
                    timeline = get_user_timeline(self, follower)
            else:
                timeline = get_user_timeline(self, follower)
            if timeline and self.user_info.get(follower) and len(timeline) > 50:
                gf = GetFeatures(follower, self.user_info[follower], timeline)
                try:
                    gf.user_features()
                    gf.collect_tweets()
                    gf.content_features()
                    gf.temporal_features()
                except Exception as e:
                    print "ERROR GETTING FEATURES"
                    print e
                    print follower
                    print self.user_info[follower]
                # need to incorporate other network features
                # gf.features['num_shared_edges'] = follower_counts[user]
                # self.user_features[user] = gf.features
                self.current_level_users.append(follower)
                self.features_list.append(gf.features)

    # Axis=0 should be vertical
    len_priors = len(priors)
    current_features = priors
    current_features.extend(self.features_list)

    print "Performing anomaly detection"
    # json.dump(priors, open('test.json', 'w'), indent=4, separators=(',', ': '))
    X = self.vec.fit_transform(current_features).toarray()
    current_features = {}
    X_norm = normalize(X)
    # print np.any(np.isnan(X))
    # print np.all(np.isfinite(X))
    print X.shape
    # X = np.stack([current_features, priors], axis=0)

    # Every round will find outliers, how do we stop exploring?
    clf = LocalOutlierFactor(n_neighbors=20)
    clf.fit(X)
    check_is_fitted(clf, ["threshold_", "negative_outlier_factor_", "n_neighbors_", "_distances_fit_X_"])
    if X is not None:
        X = check_array(X, accept_sparse='csr')
        y_pred = clf._decision_function(X)
    else:
        y_pred = clf.negative_outlier_factor_
    # y_pred = clf.fit_predict(X)
    y_pred_new = y_pred[len_priors:]

    # Do anomaly detection and set connected followers to certain outliers
    # this line is a stand-in
    users_scores = zip(self.current_level_users, y_pred_new)
    connected_followers = [u[0] for u in users_scores if u[1] <= clf.threshold_]
    # How do I add back in the outliers to the anomaly detection?
    # Mueen said not to so I will leave for now
    self.level += 1

    # Add highly connected followers to the clique and to_check
    for follower in connected_followers:
        self.clique.add((follower, self.level))
        self.to_check.add(follower)
    print self.clique
    self.n = float(len(self.clique))
    print "Current size of cluster: " + str(self.n)