def concatenate_crowdtangle_group_data(suffix):
    if suffix == "fake_news_2021":
        df_list = []
        for file_index in range(5):
            df_list.append(import_data(
                folder="crowdtangle_group",
                file_name="posts_" + suffix + "_group_" + str(file_index + 1) + ".csv"))
        posts_group_df = pd.concat(df_list)
    else:
        posts_group_df = import_data(
            folder="crowdtangle_group",
            file_name="posts_" + suffix + "_group.csv")

    print('\nThere are {} Facebook groups about {}.'.format(
        posts_group_df.account_id.nunique(), suffix))

    posts_page_df = import_data(
        folder="crowdtangle_group",
        file_name="posts_" + suffix + "_page.csv")
    print('There are {} Facebook pages about {}.'.format(
        posts_page_df.account_id.nunique(), suffix))

    posts_df = pd.concat([posts_group_df, posts_page_df])
    posts_df['date'] = pd.to_datetime(posts_df['date'])
    return posts_df
def import_crowdtangle_group_data():
    posts_wi_date_df = import_data(
        folder="crowdtangle_group",
        file_name="posts_self_declared_wi_date.csv")
    print('\nThere are {} Facebook pages with the last strike date visible on the screenshot.'.format(
        posts_wi_date_df.account_id.nunique()))

    posts_wo_date_df = import_data(
        folder="crowdtangle_group",
        file_name="posts_self_declared_wo_date.csv")
    list_wo_name = [
        'Artists For A Free World', 'Terrence K Williams',
        'Ben Garrison Cartoons', 'Wendy Bell Radio', 'New Independence Network',
        'Pruden POD & Post', 'PR Conservative',
        'Org of Conservative Trump Americans', 'Con Ciencia Indigena',
        'Republican Party of Lafayette County', 'The Daily Perspective Podcast',
        'Freedom Memes', 'White Dragon Society', 'Robertson Family Values'
    ]
    posts_wo_date_df = posts_wo_date_df[
        ~posts_wo_date_df['account_name'].isin(list_wo_name)]
    print('There are {} Facebook pages without the last strike date visible on the screenshot.'.format(
        posts_wo_date_df.account_id.nunique()))

    posts_df = pd.concat([posts_wi_date_df, posts_wo_date_df])
    posts_df['date'] = pd.to_datetime(posts_df['date'])
    return posts_df
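# Hedged sketch, not part of the project: the import_data/export_data helpers
# called throughout these snippets are assumed to be thin pandas wrappers over
# a project data directory. DATA_ROOT and both signatures are assumptions
# inferred from the call sites above.
import os
import pandas as pd

DATA_ROOT = "data"  # hypothetical root folder


def import_data(folder, file_name):
    # Read one CSV from DATA_ROOT/<folder>/<file_name> into a DataFrame.
    return pd.read_csv(os.path.join(DATA_ROOT, folder, file_name))


def export_data(df, folder, file_name):
    # Write a DataFrame back under the same layout, without the index column.
    df.to_csv(os.path.join(DATA_ROOT, folder, file_name), index=False)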
def run_test(mode='tfidf', model='linear', regularizer='ridge',
             train_data_path='data/intuit_data',
             test_data_path='data/intuit_test_data', augment=False):
    """Prints out a score report of the model under the given featurization."""
    modes = {'bow': 'Bag of Words', 'tfidf': 'TF-IDF'}
    print("Using featurization " + modes[mode] + "...")
    print("Training " + model.upper() + " model with " + regularizer.upper() +
          " regularization...")
    if augment:
        print("Importing Word2Vec Model...")
        # Note: in gensim >= 1.0 this loader lives on KeyedVectors
        # (KeyedVectors.load_word2vec_format) rather than Word2Vec.
        w2v_model = Word2Vec.load_word2vec_format('w2v.bin', binary=True)
    print("-------------------------------------------------------------------------")

    # Fit the featurizer on the training emails, then transform them.
    emails_train, y_train = import_data(train_data_path)
    transform = generate_featurizer(emails_train, mode=mode)
    X_train = transform(emails_train)
    eff_labels = np.unique(y_train)
    if augment:
        auxiliary_features = [featurize(email, eff_labels, w2v_model)
                              for email in emails_train]
        auxiliary_features = np.vstack(auxiliary_features)
        X_train = hstack((X_train, auxiliary_features))
    clf = generate_model(X_train, y_train, model=model, regularizer=regularizer)

    # Apply the same featurization to the test emails and evaluate.
    emails_test, y_test = import_data(test_data_path)
    X_test = transform(emails_test)
    if augment:
        auxiliary_features = [featurize(email, eff_labels, w2v_model)
                              for email in emails_test]
        auxiliary_features = np.vstack(auxiliary_features)
        X_test = hstack((X_test, auxiliary_features))
    y_pred = clf.predict(X_test)
    labels = np.unique(y_test)  # (unused; eff_labels is used for the confusion matrix)

    print(classification_report(y_test, y_pred))
    accuracy = str(np.around(accuracy_score(y_test, y_pred), decimals=3))
    print("accuracy: " + accuracy)
    if model != 'linear':
        regularizer = 'No'
    generate_confusion_matrix(
        y_test, y_pred, eff_labels,
        model.upper() + " model - " + regularizer.upper() + " regularization - " +
        modes[mode] + " featurization - " + accuracy,
        model + '-' + mode + '-' + regularizer.lower() + '.png', True)
    print("-------------------------------------------------------------------------")
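# Hedged sketch of the assumed generate_featurizer helper: run_test fits it on
# the training emails and reuses the returned transform on the test set, which
# matches a scikit-learn vectorizer closure like the one below. The body is an
# assumption, not the project's code.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def generate_featurizer(texts, mode='tfidf'):
    vectorizer = CountVectorizer() if mode == 'bow' else TfidfVectorizer()
    vectorizer.fit(texts)
    return vectorizer.transform  # callable: list of str -> sparse matrix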
def build_subwords_vocab(target_vocab_size=10000):
    train_neg = import_data('./train/neg')
    train_pos = import_data('./train/pos')
    train_raw = train_neg + train_pos
    train_clean = [clean_data(t) for t in train_raw]
    vocab_encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(
        train_clean, target_vocab_size)
    vocab_encoder.save_to_file('vocab')
    print(vocab_encoder.vocab_size)
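# Hedged usage sketch: reload the vocabulary written by build_subwords_vocab
# and round-trip a string (SubwordTextEncoder is documented as invertible).
# Assumes the same TFDS version as above; in TFDS 4.x SubwordTextEncoder lives
# under tfds.deprecated.text instead.
import tensorflow_datasets as tfds

encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab')
ids = encoder.encode("this movie was great")
assert encoder.decode(ids) == "this movie was great"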
def __init__(self):
    self.iv = bo.random_AES_key()
    self.key = bo.random_AES_key()
    file_name = "data_S3C17.txt"
    self.data = encode(
        random.choice(ut.import_data(file_name).splitlines()))
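# Hedged sketch of the assumed bo.random_AES_key helper: 16 random bytes work
# both as an AES-128 key and, as reused above, as a random IV. The body is an
# assumption based on how the value is used.
import os


def random_AES_key():
    return os.urandom(16)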
def challenge_6():
    print(f"\n-- Challenge 6 - Break repeating-key XOR --")
    print(f"-- Part 1 --")
    data_1 = encode("this is a test")
    data_2 = encode("wokka wokka!!!")
    print(f"String 1 : {decode(data_1)}")
    print(f"String 2 : {decode(data_2)}")
    print(f"Edit distance : {bo.edit_distance(data_1, data_2)}")

    print(f"-- Part 2 --")
    B64_ciphertext = ut.import_data("data_S1C6.txt")
    data = b64decode(B64_ciphertext)
    likely_key_sizes = bo.find_key_size(40, data)

    # Find most likely key.
    def key_comparison():
        for key_size in likely_key_sizes[0:3]:
            key = bo.key_finder(key_size, data)
            secret = bo.repeating_key_xor(data, key)
            score = bo.text_scorer(secret).score()
            yield score, key, secret

    score, key, secret = max(key_comparison())
    print(f"Most likely key sizes : {likely_key_sizes[0:3]}")
    print(f"Highest score : {score}")
    print(f"Corresponding Key : {decode(key)}")
    print(f"Secret : \n{decode(secret[:90])}...")
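# Hedged sketch of bo.edit_distance: per the Cryptopals challenge text this is
# a bitwise Hamming distance (number of differing bits), and the distance
# between the two test strings above should be 37. The implementation below is
# a stand-in assuming equal-length inputs.
def edit_distance(data_1, data_2):
    # XOR each byte pair and count the set bits in the result.
    return sum(bin(b1 ^ b2).count('1') for b1, b2 in zip(data_1, data_2))


assert edit_distance(b"this is a test", b"wokka wokka!!!") == 37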
def challenge_8():
    print(f"\n-- Challenge 8 - Detect AES in ECB mode --")
    print(f"-- Method 1 --")
    hex_ciphertext = ut.import_data("data_S1C8.txt")

    def text_breaker():
        for line_index, line in enumerate(hex_ciphertext.splitlines()):
            data = bytes.fromhex(line)
            unique_char_instances = len(list(Counter(data).items()))
            yield unique_char_instances, line_index

    unique_char_instances, line_index = min(text_breaker())
    print(f"Assume ECB 1:1 mapping has low diversity of characters compared"
          " to random data")
    print(f"Lowest number of unique chars : {unique_char_instances}")
    print(f"Corresponding line : {line_index}")

    print(f"-- Method 2 --")
    # Find if data contains duplicate blocks.
    for line_index2, line in enumerate(hex_ciphertext.splitlines()):
        if bo.ECB_mode_check(bytes.fromhex(line)):
            break
    print(f"Find line with duplicate blocks")
    print(f"Corresponding line : {line_index2}")
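# Hedged sketch of the assumed bo.ECB_mode_check: ECB maps identical 16-byte
# plaintext blocks to identical ciphertext blocks, so any repeated block is a
# strong ECB indicator. Only the name is taken from the call above.
def ECB_mode_check(data, block_size=16):
    blocks = [data[i:i + block_size] for i in range(0, len(data), block_size)]
    return len(blocks) != len(set(blocks))  # True if any block repeats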
def challenge_10():
    print(f"\n-- Challenge 10 - Implement CBC mode --")
    data_p = bo.pad(16, b"This is a secret message! TOP SECRET")
    key = b"PASSWORDPASSWORD"
    iv = b"1122334455667788"
    ECB_1 = ocl.AESECB(key)
    CBC_1 = ocl.AESCBC(iv, key)
    ECB_ciphertext = ECB_1.encrypt(data_p)
    ECB_plaintext = bo.depad(ECB_1.decrypt(ECB_ciphertext))
    CBC_ciphertext = CBC_1.encrypt(data_p)
    CBC_plaintext = bo.depad(CBC_1.decrypt(CBC_ciphertext))
    print(f"Padded Secret Message : {data_p}")
    print(f"Key : {key}")
    print(f"ECB encrypted message : {ECB_ciphertext}")
    print(f"ECB decrypted message : {ECB_plaintext}")
    print(f"iv : {iv}")
    print(f"CBC encrypted message : {CBC_ciphertext}")
    print(f"CBC decrypted message : {CBC_plaintext}")

    print("----- Part 2 ------")
    data = b64decode(ut.import_data("data_S2C10.txt"))
    key = b"YELLOW SUBMARINE"
    iv = bytes([0]) * 16
    CBC_2 = ocl.AESCBC(iv, key)
    decrypted = decode(bo.depad(CBC_2.decrypt(data)))
    print(f"CBC decrypted message : \n{decrypted[0:90]}...")
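# Hedged sketch of what ocl.AESCBC.encrypt likely does internally, since this
# challenge is about implementing CBC by hand: XOR each plaintext block with
# the previous ciphertext block (the IV for the first block), then apply the
# raw AES-ECB primitive. pycryptodome is used here purely as that primitive;
# everything else is an assumption, not the project's code.
from Crypto.Cipher import AES


def xor_bytes(a, b):
    return bytes(x ^ y for x, y in zip(a, b))


def cbc_encrypt(iv, key, padded_plaintext, block_size=16):
    ecb = AES.new(key, AES.MODE_ECB)
    previous, ciphertext = iv, b""
    for i in range(0, len(padded_plaintext), block_size):
        block = ecb.encrypt(xor_bytes(padded_plaintext[i:i + block_size], previous))
        ciphertext += block
        previous = block  # chain the next block to this ciphertext block
    return ciphertext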
def main():
    model = create_model()
    model.summary()

    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)

    # Expand data dimension for kernel to convolve over
    X_train = np.expand_dims(X_train, axis=2)  # (None, 46, 1)
    X_test = np.expand_dims(X_test, axis=2)    # (None, 46, 1)

    # create model
    model = KerasClassifier(build_fn=create_model, verbose=0)

    # Operational Phase
    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_CNN(model, X_train, Y_train, X_test,
                                            Y_test, scorer)
    Y_pred_grid_search = np.squeeze(Y_pred_grid_search)
    print()
    print()
    print(Y_pred_grid_search)
    print()
    print(Y_test)
    print()
    print_scores(Y_test, Y_pred_grid_search)
def challenge_7():
    print(f"\n-- Challenge 7 - AES in ECB mode --")
    key = encode("YELLOW SUBMARINE")
    data = b64decode(ut.import_data("data_S1C7.txt"))
    plaintext = ocl.AESECB(key).decrypt(data)
    print(f"Key : {decode(key)}")
    print(f"Secret : \n{decode(plaintext[:90])}...")
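# Hedged sketch of the assumed ocl.AESECB wrapper, matching the interface used
# in challenges 7, 8 and 10; built here on pycryptodome, which may differ from
# the project's actual backend.
from Crypto.Cipher import AES


class AESECB:
    def __init__(self, key):
        self.cipher = AES.new(key, AES.MODE_ECB)

    def encrypt(self, data):
        return self.cipher.encrypt(data)

    def decrypt(self, data):
        return self.cipher.decrypt(data)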
def print_individual_drops_statistics():
    df = import_data(folder="crowdtangle_list",
                     file_name="account_list_part_1.csv")
    df = df.dropna(subset=['june_drop'])
    # Strip the trailing character (presumably a '%' sign) before casting
    # the drop values to int.
    df['june_drop'] = df['june_drop'].astype(str).apply(
        lambda x: x[:-1]).astype(int)
    print('\nThere are {} accounts for which we can calculate the drop.'.format(
        len(df)))
    print('Among them, {} accounts have a drop (decrease).'.format(
        len(df[df['june_drop'] < 0])))
def test_filter(clf, transform, test_data_path):
    print("Testing event filter...")
    print("---------------------------------------------------------------------")
    email_texts, y_test = import_data(test_data_path)
    # Collapse all event classes into a single 'event' label for the filter.
    y_test[np.where(y_test != 'no event')] = 'event'
    X_test = transform(email_texts)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    print("accuracy: " + str(accuracy))
    print("---------------------------------------------------------------------")
def main():
    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)

    # Operational Phase
    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_mlp(X_train, Y_train, X_test, Y_test,
                                            scorer)
    print()
    print()
    print(Y_pred_grid_search)
    print()
    print(Y_test)
    print()
    print_scores(Y_test, Y_pred_grid_search)
def challenge_4():
    print("\n-- Challenge 4 - Detect single-char XOR --")
    file_name = "data_S1C4.txt"
    hex_ciphertext = ut.import_data(file_name)

    def text_breaker():
        for line_index, line in enumerate(hex_ciphertext.splitlines()):
            data = bytes.fromhex(line)
            score, byte = bo.single_byte_xor_breaker(data)
            yield score, byte, line_index, data

    score, byte, line_index, data = max(text_breaker())
    plaintext = bo.single_byte_xor(byte, data)
    print(f"Hex data file : {file_name}")
    print(f"Highest frequency analysis score : {score}")
    print(f"Corresponding line : {line_index}")
    print(f"Corresponding key : {decode(byte)}")
    print(f"Decrypted plaintext : {decode(plaintext)}")
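# Hedged sketch of the assumed bo.single_byte_xor helper: XOR every byte of
# the data with a single repeated key byte. The breaker above presumably tries
# all 256 key bytes and scores each candidate with English letter-frequency
# analysis; only this primitive is sketched here.
def single_byte_xor(key_byte, data):
    key = key_byte[0] if isinstance(key_byte, (bytes, bytearray)) else key_byte
    return bytes(b ^ key for b in data)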
def main():
    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)
    clf_entropy = train_using_entropy(X_train, Y_train)

    # Operational Phase
    print("\n### SINGLE TRAIN-TEST SPLIT ###\n")
    Y_pred_entropy = prediction(X_test, clf_entropy)
    print_scores(Y_test, Y_pred_entropy)

    print("\n### CROSS VAL USING STRATIFIED K FOLD ###\n")
    fold_scores = cv_with_entropy(X, Y)
    print("Cross Validate: ", fold_scores)
    print("Best F1_score: ", max(fold_scores) * 100)

    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_DT(X_train, Y_train, X_test, Y_test,
                                           scorer)
    print_scores(Y_test, Y_pred_grid_search)
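# Hedged sketch of the shared split_dataset helper used by these main()
# pipelines: it appears to separate the label column from the features and
# produce a stratified train/test split. Column layout and split parameters
# are assumptions, not the project's code.
from sklearn.model_selection import train_test_split


def split_dataset(data, test_size=0.3, seed=42):
    X = data.iloc[:, :-1].values  # assume all columns but the last are features
    Y = data.iloc[:, -1].values   # assume the last column holds the '+'/'-' label
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed, stratify=Y)
    return X, Y, X_train, X_test, Y_train, Y_test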
randgain = args.randgain
if randgain == 1:
    gainmin = 0.1
    gainmax = 0.8  # scaling the input range ~ [-1.25,1.25] in [-1,1]
    print('at training, for every forward, apply random gain to [xref,xper] between ',
          gainmin, gainmax)
    print('test data is loaded with random gains, kept fixed throughout the training')
    rgains = [gainmin, gainmax]
else:
    rgains = False

train_loader, test_loader, train_refloader, test_refloader = import_data(
    data_path, subsets, Lsize, batch_size, train_ratio=0.8, rgains=rgains)

###############################################################################
### BUILD MODEL

nconv = args.nconv
nchan = args.nchan
dist_dp = args.dist_dp
dist_act = args.dist_act
ndim = [args.ndim0, args.ndim1]
classif_dp = args.classif_dp
classif_BN = args.classif_BN
classif_act = args.classif_act
minit = args.minit
print('\nBUILDING with settings nconv,nchan,dist_dp,dist_act,ndim,classif_dp,classif_BN,classif_act,minit')
    train_data.Survived[(train_data.Sex == 'male')
                        & (train_data.Pclass == 3)].value_counts(
                            normalize=True).plot.bar(alpha=0.5)
    plt.xticks(rotation='horizontal')
    plt.title('Poor men survived')

    # Rich women
    plt.subplot2grid((3, 4), (2, 2))
    train_data.Survived[(train_data.Sex == 'female')
                        & (train_data.Pclass == 1)].value_counts(
                            normalize=True).plot.bar(alpha=0.5, color='#FA0000')
    plt.xticks(rotation='horizontal')
    plt.title('Rich women survived')

    # Poor women
    plt.subplot2grid((3, 4), (2, 3))
    train_data.Survived[(train_data.Sex == 'female')
                        & (train_data.Pclass == 3)].value_counts(
                            normalize=True).plot.bar(alpha=0.5, color='#FA0000')
    plt.xticks(rotation='horizontal')
    plt.title('Poor women survived')

    plt.show()


if __name__ == "__main__":
    train_data, test_data = import_data()
    #train_data, test_data = data_wrangling(train_data, test_data)
    plot_basics(train_data)
    plot_gender(train_data)
def main():
    # Import the dataset tables and the graphs
    stop_times, trips, routes, exceptions_service, calendar, stops, trips_with_stop_times = import_data()
    stop_times_load = stop_times.copy().drop(['stop_sequence'], axis=1)
    graph_with_routes, graph_no_multiple_edges = import_graphs(
        'XML files//Complete_TrenordNetwork.xml',
        'XML files//CompleteGraph_NoMultipleEdges.xml')

    # Run the various phases of the analysis
    print('Pre-analysis in progress')
    do_pre_analysis(graph_with_routes)
    print('Load study in progress')
    do_load_analysis('1841', 'monday', exceptions_service, stops,
                     trips_with_stop_times)
    print('Shortest-path study in progress')
    do_min_path_analysis('1581', '1711', '09:00:00', 'monday', 0,
                         trips_with_stop_times, stops, stop_times, trips,
                         graph_no_multiple_edges)
    print('Attack-handling study in progress')
    do_attack_handling_analysis(graph_with_routes, graph_no_multiple_edges)
    print('Done!')
    gainmax = 0.8  # scaling the input range ~ [-1.25,1.25] in [-1,1]
    print('at training, for every forward, apply random gain to [xref,xper] between ',
          gainmin, gainmax)
    print('test data is loaded with random gains, kept fixed throughout the training')
    rgains = [gainmin, gainmax]
else:
    rgains = False

#train_loader,test_loader,train_refloader,test_refloader = import_data(data_path,subsets,Lsize,batch_size,train_ratio=0.8,rgains=rgains)
if args.use_npy == 0:
    train_loader, test_loader, train_refloader, test_refloader = import_data(
        Lsize, batch_size, train_ratio=0.8, dummy_test=args.dummy_test,
        audio_inputs_normalise=args.audio_inputs_normalise)
else:
    train_loader, test_loader, train_refloader, test_refloader = import_data(
        data_path, subsets, Lsize, batch_size, train_ratio=0.8, rgains=rgains,
        dummy_test=args.dummy_test,
        audio_inputs_normalise=args.audio_inputs_normalise)

###############################################################################
### BUILD MODEL
def _convert_crf_output_to_json(crf_output):
    return json.dumps(utils.import_data(crf_output), indent=2, sort_keys=True)
    url_df = url_df.dropna(subset=['scientific_topic'])
    return url_df


def keep_only_topic_data(url_df, TOPIC):
    if TOPIC in ["climate", "health", "covid"]:
        return url_df[url_df["scientific_topic"] == TOPIC]
    else:
        return url_df


if __name__ == "__main__":
    DATE = sys.argv[1]
    TOPIC = sys.argv[2] if len(sys.argv) >= 3 else ""

    url_df = import_data(folder="sciencefeedback",
                         file_name="Appearances-Grid view " + DATE + ".csv")
    url_df = keep_only_the_urls_considered_fake_by_facebook(url_df)
    url_df = clean_url_format(url_df)
    url_df = add_info_from_fact_check_table(url_df)
    url_df = keep_only_topic_data(url_df, TOPIC)
    url_df = url_df[[
        'url', 'url_cleaned', 'domain_name', 'Item reviewed',
        'Date of publication', 'scientific_topic'
    ]]

    print("There are {} fake news urls.".format(len(url_df)))
    export_data(url_df, 'sciencefeedback',
                "appearances_" + DATE + "_" + TOPIC + ".csv")
    clean_df = pd.DataFrame(columns=[
        "account_name", "account_id", "date", "share", "comment", "reaction"
    ])
    clean_df['account_name'] = df['account_name'].astype(str)
    clean_df['account_id'] = df['account_id'].astype(int)
    clean_df['date'] = pd.to_datetime(df['date'])
    clean_df["share"] = df[["actual_share_count"]].astype(int)
    clean_df["comment"] = df[["actual_comment_count"]].astype(int)
    clean_df["reaction"] = df[[
        "actual_like_count", "actual_favorite_count", "actual_love_count",
        "actual_wow_count", "actual_haha_count", "actual_sad_count",
        "actual_angry_count", "actual_thankful_count"
    ]].sum(axis=1).astype(int)
    return clean_df


if __name__ == "__main__":
    DATE = sys.argv[1]
    SUFFIX = sys.argv[2]

    df = import_data(folder="crowdtangle_group",
                     file_name='posts_group_' + DATE + '.csv')
    clean_df = clean_columns(df)
    export_data(clean_df, 'crowdtangle_group', 'posts_' + SUFFIX + '.csv')
pd.options.display.max_colwidth = 300


def create_template_csv_from_serie(serie, list_name):
    df = pd.DataFrame(columns=["Page or Account URL", "List"])
    df["Page or Account URL"] = serie.index
    df["List"] = list_name
    export_data(df, 'crowdtangle_list', list_name + '.csv')
    return df


if __name__ == "__main__":
    df = import_data(folder="crowdtangle_url",
                     file_name="posts_url_2021-01-04_.csv")
    df = df.drop_duplicates(subset=['url', 'account_id'])
    s = df["account_url"].value_counts()

    top1_df = create_template_csv_from_serie(s[s > 45], "heloise_fake_news_groups_1")
    top2_df = create_template_csv_from_serie(s[(s <= 45) & (s > 35)], "heloise_fake_news_groups_2")
    top3_df = create_template_csv_from_serie(s[(s <= 35) & (s > 29)], "heloise_fake_news_groups_3")
    top4_df = create_template_csv_from_serie(s[(s <= 29) & (s > 26)], "heloise_fake_news_groups_4")
    top5_df = create_template_csv_from_serie(s[(s <= 26) & (s > 23)], "heloise_fake_news_groups_5")

    print(len(top1_df))
    print(len(top2_df))
    print(len(top3_df))
    print(len(top4_df))
    print(len(top5_df))
from utils import import_data

dataset_path = "data"
dataset_version = "fake-v1.0"
fake_dataset = import_data(dataset_path, dataset_version)

dataset_path = "data"
dataset_version = "automated-v1.0"
automated_dataset = import_data(dataset_path, dataset_version)
        if (group_index % 10 == 9) | (group_index == posts_df['account_id'].nunique() - 1):
            plt.tight_layout()
            save_figure('z_part_2_all_groups_{}'.format(int(group_index / 10) + 1),
                        folder='ip&m', dpi=100)
        group_index += 1


if __name__ == "__main__":
    posts_df = import_crowdtangle_group_data()
    pages_df = import_data(folder="crowdtangle_list",
                           file_name="page_list_part_2.csv")
    pages_df['date'] = pd.to_datetime(pages_df['reduced_distribution_start_date'])

    save_figure_4(posts_df, pages_df)
    save_supplementary_figure_2(posts_df, pages_df)
    save_figure_5(posts_df, pages_df)
    save_figure_5(posts_df, pages_df, period_length=30)

    screenshot_df = import_data(folder="crowdtangle_post_by_id",
                                file_name='screenshot_posts.csv')
    print_statistics_screenshot_posts(screenshot_df)
    # save_all_groups_figures(posts_df, pages_df)
def __init__(self):
    self.key = bo.random_AES_key()
    self.secret = b64decode(ut.import_data("data_S2C12.txt"))
    def combine_predictions(self, y_filter, y_clf, event_indices):
        y_pred = []
        y_filter = list(y_filter)
        y_clf = list(y_clf)
        for i in range(len(y_filter) + len(y_clf)):
            if i in event_indices:
                y_pred.append(y_clf.pop(0))
            else:
                y_pred.append(y_filter.pop(0))
        return np.array(y_pred)


if __name__ == "__main__":
    from argparse import ArgumentParser

    email_texts, y_train = import_data('data/intuit_data')
    transform = generate_featurizer(email_texts)
    X_train = transform(email_texts)
    model = HierachicalClassifier('rf')
    model.fit(X_train, y_train)

    email_texts, y_test = import_data('data/intuit_test_data')
    X_test = transform(email_texts)
    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))
    print("accuracy: " + str(accuracy_score(y_test, y_pred)))
    generate_confusion_matrix(y_test, y_pred, np.unique(y_train),
                              'Hierarchical Classification', 'hc.png', True)
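# Worked example of combine_predictions' interleaving (illustrative values):
# with y_filter = ['no event', 'no event'], y_clf = ['meeting'] and
# event_indices = {1}, position 1 takes the next second-stage prediction and
# the remaining positions take filter predictions, giving
# ['no event', 'meeting', 'no event'].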
import nltk
import os

from utils import import_data, tag_data, extract_important_words, extract_simple_model

FILE = "model2"


def prepare_tools():
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')


if __name__ == "__main__":
    prepare_tools()
    path = "{}/data/{}.txt".format(os.curdir, FILE)
    data = import_data(path)
    tagged_data = tag_data(data)
    extracted_data = extract_important_words(tagged_data)
    model = extract_simple_model(extracted_data)
    print(model)
###############
# ## SETUP ## #
###############

# load experiment config
with open(CONFIG_FILE) as file:
    config = json.load(file)

# directory for experiment results
exp_dir = config['exp_dir'] + '_' + datetime.datetime.now().strftime('%d-%m-%Y_%I-%M-%S_%p') + '_/'

# setup folders, save code, set seed and get device
setup_exp(exp_dir, config['seed'], ['log'],
          ['bpda_eot_attack.py', 'nets.py', 'utils.py', CONFIG_FILE])

print('Loading data and nets.')

# data loader
data, num_classes = import_data(config['data_type'], False, False)
attack_loader = DataLoader(data, batch_size=config['batch_size'],
                           shuffle=config['subset_shuffle'], num_workers=0)

# get clf and ebm networks and load saved weights
clf = WideResNet(num_classes=num_classes).cuda()
clf.load_state_dict(t.load(config['clf_weight_path'],
                           map_location=lambda storage, loc: storage.cuda()))
clf.eval()
if config['langevin_steps'] > 0:
    ebm = EBM().cuda()
    ebm.load_state_dict(t.load(config['ebm_weight_path'],
                               map_location=lambda storage, loc: storage.cuda()))
    ebm.eval()

# cross-entropy loss function to generate attack gradients
criterion = t.nn.CrossEntropyLoss()

# rescale adversarial parameters for attacks on images with pixel intensities in the range [-1, 1]
config['adv_eps'] *= 2.0 / 255.0
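# Hedged example of the JSON config this setup expects: every key below is
# read by the code above, but the values are illustrative placeholders only.
example_config = {
    "exp_dir": "out/bpda_eot",          # prefix for the experiment folder
    "seed": 0,
    "data_type": "cifar10",             # forwarded to import_data
    "batch_size": 64,
    "subset_shuffle": True,
    "clf_weight_path": "weights/clf.pt",
    "ebm_weight_path": "weights/ebm.pt",
    "langevin_steps": 10,               # > 0 enables the EBM branch
    "adv_eps": 8.0,                     # on the [0, 255] scale; rescaled above
}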
import sys
sys.path.append("..")
import utils as u


# In[4]:

# change this string to match the path on your computer
path_to_root = "/Users/mcapizzi/Github/dynet_tutorial/"


# In[5]:

trainX, trainY, testX, testY = u.import_data(path_to_root)


# In[6]:

trainX.shape, trainY.shape


# In[7]:

testX.shape, testY.shape


# The labels are either `1` or `0` where `1=Spam` and `0=Ham`