def init_train_data(fnames, topics):
    print('[ init_train_data ] =================')
    # amap
    #   key   : aid
    #   value : attr[0] preference, attr[1] aid, attr[2] aname
    train_rank = []
    for QID in range(len(topics)):
        fname = fnames[QID]
        topic = topics[QID]
        amap = filter_data(fname)
        fea = Feature(topic)
        ext_aids = ZC.get_raw_rank(topic, EXT_TRAIN_A_SIZE)
        print('[ init_train_data ] amap_1 size = %d ' % (len(amap)))
        # Add externally ranked answers that are not labelled yet, with preference 0
        for tid in ext_aids:
            if tid not in amap:
                amap[tid] = (0, tid, '')
        print('[ init_train_data ] amap_2 size = %d ' % (len(amap)))
        for tid in amap:
            fv = fea.get_feature_vector(tid)
            # print('[ init_train_data ] %d get feature vector ok.' % (tid))
            train_rank.append((int(amap[tid][0]), reform_vector(fv), QID))
        print('[ init_train_data ] topic : %s ok , train_rank_size = %d' % (topic, len(train_rank)))
        ZC.dump_cache()
    with open('train_rank.dat', 'w') as f:
        pprint.pprint(train_rank, f)
    return train_rank
def process(data: Data) -> Data:
    # Apply the requested transformations in sequence, each stage consuming the previous result.
    result = data
    if result.args.white_black:
        result = make_white_black(result, np.array([0.299, 0.587, 0.114, 1]))
    if result.args.filter:
        result = filter_data(result)
    if result.args.bit:
        result.norm()
    if result.args.sobel:
        result = sobel_filter(result)
    return result
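# The following is a minimal, hypothetical driver for process(), assuming the
# project parses its flags with argparse; the Data wrapper and its constructor
# are only sketched here and are not shown in the snippet above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--white_black', action='store_true')
parser.add_argument('--filter', action='store_true')
parser.add_argument('--bit', action='store_true')
parser.add_argument('--sobel', action='store_true')
args = parser.parse_args()

data = Data(args)      # assumed constructor; the real Data class lives elsewhere in the project
result = process(data)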
def init_test_data(fname, topic):
    print('[ init_test_data ] =================')
    QID = 1
    # amap
    #   key   : aid
    #   value : attr[0] preference, attr[1] aid, attr[2] aname
    amap = filter_data(fname)
    fea = Feature(topic)
    train_rank = []
    for tid in amap:
        aid = int(tid)
        fv = fea.get_feature_vector(aid)
        print('[ init_test_data ] %d get feature vector ok.' % (aid))
        train_rank.append((aid, reform_vector(fv), QID))
    # ZC.dump_cache()
    return train_rank
def preprocess_data(path_model, model_name, k, path_preprocess):
    path = path_model + model_name
    data = load_file(path)
    data = filter_data(data)
    data = add_tia(data)
    data = normalize_data(data)
    data.to_csv(path_preprocess + 'preprocessed_data_' + model_name, index=0)

    [feature_selected, stat_ranksums] = select_feature(data)
    feature_selected.to_csv(path_preprocess + 'selected_features_' + model_name, index=0)

    sn_folds = kfold_data(data, k)
    sn_folds.to_csv(path_preprocess + 'sn_folds_' + model_name, index=0)

    print('[%s]%s done...' % (time.asctime(time.localtime(time.time())),
                              sys._getframe().f_code.co_name))
    return [feature_selected, sn_folds]
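# A sketch of how preprocess_data might be called; the directory names and the
# model file name below are placeholders, not values from the original project.
feature_selected, sn_folds = preprocess_data(
    path_model='data/models/',             # placeholder input directory
    model_name='example_model.csv',        # placeholder file name
    k=5,                                   # number of folds passed to kfold_data
    path_preprocess='data/preprocessed/',  # placeholder output directory
)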
    'failure',
    'smart_1_normalized',
    'smart_3_normalized',
    'smart_5_normalized',
    'smart_7_normalized',
    'smart_9_normalized',
    'smart_187_normalized',
    'smart_189_normalized',
    'smart_194_normalized',
    'smart_195_normalized',
    'smart_197_normalized',
    'smart_5_raw',
    'smart_197_raw',
]

# Preprocess data
data['failure'] = (data['failure'] * (-1) + 1) / 2
data = add_date(data)
data = filter.filter_data(data, save=1, start_date='2010-01-01')
data = filter.add_tia(data)
data = filter.normalize_data(data)
data.to_csv(path_load + 'preprocessed_data_' + model_name, index=0)

# Select features
[feature_selected, stat_ranksums] = filter.select_feature(data)
feature_selected.to_csv(path_load + 'selected_features_' + model_name, index=0)

# Build k-fold tags
sn_folds = filter.kfold_data(data, k)
sn_folds.to_csv(path_load + 'sn_folds_' + model_name, index=0)
def pipeline(directory: str,
             frames: List[Frame],
             parser_name: str,
             extract_features: set,
             filter_: dict,
             prune_test_data=True,
             log_data=False):
    filter = copy.deepcopy(filter_)

    # Prune sentences without an LU from the frames
    # TODO: log no. sentences pruned
    filter_faulty_sentences(frames)

    # Filter frames
    (frames, no_filtered_frames_sentences) = filter_data(frames, filter)
    print(f"Filtered data")

    # Add feature representation of each word to each word node
    (no_data_points_features) = create_feature_representation(frames, extract_features)
    print(f"Feature representation created")

    no_frames = f"Number of frames: {len(frames)}"
    print(no_frames)

    # Split data into training and test sets
    (train_sentences, test_sentences) = split_data_train_test(frames)

    # # For testing purposes: add all sentences to both training and testing
    # sentences = []
    # for frame in frames:
    #     sentences.extend(frame.getSentences())
    # (train_sentences, test_sentences) = (sentences, sentences)

    no_sentences = f"Number of sentences: {len(train_sentences) + len(test_sentences)}"
    print(no_sentences)
    print(no_data_points_features)

    filter = copy.deepcopy(filter_)

    # Train models
    id_clf, label_clf, report_training = train_models_2(train_sentences, filter)
    print(f"{report_training}")

    if log_data:
        model_path = f"{directory}/models"
        if not os.path.isdir(model_path):
            # Create model folder
            try:
                os.mkdir(f"{model_path}")
            except:
                raise OSError(f"Unable to create directory {model_path}")
        # Save models
        save_model(id_clf, f"{parser_name}_identification_model", f"{model_path}")
        save_model(label_clf, f"{parser_name}_labeling_model", f"{model_path}")

    filter = copy.deepcopy(filter_)

    # Test models
    (id_evaluation, label_evaluation, evaluation) = test_models_2(
        id_clf, label_clf, test_sentences, filter, prune_test_data=prune_test_data)
    print(f"Models tested")

    if log_data:
        # Save evaluation
        save_to_file(id_evaluation, f"{directory}/{parser_name}_id_evaluation.txt")
        save_to_file(label_evaluation, f"{directory}/{parser_name}_label_evaluation.txt")
        save_to_file(f"{evaluation}", f"{directory}/{parser_name}_evaluation.txt")
        save_to_file(
            f"{no_frames}\n{no_sentences}\n{no_data_points_features}\n{report_training}",
            f"{directory}/run_description.txt")

    # Release memory using garbage collection
    del id_clf
    del label_clf
    gc.collect()

    return evaluation
def pipeline_2(directory: str,
               frames: List[Frame],
               parser_name: str,
               extract_features: set,
               filter_: dict,
               prune_test_data=True,
               log_data=False):
    filter = copy.deepcopy(filter_)

    # Prune sentences without an LU from the frames
    filter_faulty_sentences(frames)

    # Filter frames
    (frames, no_filtered_frames_sentences) = filter_data(frames, filter)
    print(f"Filtered data")

    # Add feature representation of each word to each word node
    (no_data_points_features) = create_feature_representation(frames, extract_features)
    print(f"Feature representation created")

    no_frames = f"Number of frames: {len(frames)}"
    print(no_frames)

    # Split data into training and test sets
    (train_sentences, test_sentences) = split_data_train_test(frames)

    no_sentences = f"Number of sentences: {len(train_sentences) + len(test_sentences)}"
    print(no_sentences)
    print(no_data_points_features)

    id_clf = open_model(f"{parser_name}_identification_model", f"{directory}/models")
    label_clf = open_model(f"{parser_name}_labeling_model", f"{directory}/models")

    filter = copy.deepcopy(filter_)

    # Test models
    (id_evaluation, label_evaluation, evaluation) = test_models_2(
        id_clf, label_clf, test_sentences, filter, prune_test_data=prune_test_data)
    print(f"Models tested")

    if log_data:
        # Save evaluation
        save_to_file(id_evaluation, f"{directory}/{parser_name}_id_evaluation.txt")
        save_to_file(label_evaluation, f"{directory}/{parser_name}_label_evaluation.txt")
        save_to_file(f"{evaluation}", f"{directory}/{parser_name}_evaluation.txt")
        save_to_file(
            f"\n{no_frames}\n{no_sentences}\n{no_data_points_features}",
            f"{directory}/run_description.txt")

    # Release memory using garbage collection
    del id_clf
    del label_clf
    gc.collect()

    return evaluation
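# A hypothetical call to pipeline(); the frames list, feature names, and filter
# settings are placeholders. pipeline_2() takes the same arguments and re-evaluates
# models previously saved to {directory}/models by pipeline().
evaluation = pipeline(
    directory="runs/example_run",        # placeholder run directory
    frames=frames,                       # List[Frame] built elsewhere in the project
    parser_name="example_parser",
    extract_features={"word", "pos"},    # assumed feature names
    filter_={},                          # neutral/empty filter settings
    prune_test_data=True,
    log_data=True,
)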
def main(run_args):
    if not run_args.skip:
        access_token = run_args.access_token
        if not run_args.file:
            print("\n# 0. Crawling Facebook...")
            if run_args.specific:
                specific = json.load(run_args.specific)
                print("Specific category to get :")
                print(specific)
                fb_crawlfile = crawl_facebook(access_token, run_args.count, run_args.limit, 10000, specific)
            else:
                fb_crawlfile = crawl_facebook(access_token, run_args.count, run_args.limit, 10000, [])
            pages = json.load(open(fb_crawlfile))
        else:
            pages = json.load(run_args.file)

        i = 0
        for page in pages:
            output = dict()
            i += 1

            print("\n# " + str(i) + ".1 Crawling page '" + page['name'] + "'...")
            filename = "data/datasets/" + page['id']
            if not os.path.isfile(filename + ".json") or run_args.erase or os.path.getsize(filename + ".json") < 3:
                crawl.crawl_page(access_token, page['id'], 200, run_args.limit, output_name=page['id'])
            else:
                print("Skipped (-e to not skip)")

            print("# " + str(i) + ".2 Filtering content of the page...")
            if not os.path.isfile(filename + "_filtered.json") or run_args.erase or os.path.getsize(filename + "_filtered.json") < 3:
                filter.filter_data(filename + ".json", False, 1, 1, 1)
            else:
                print("Skipped (-e to not skip)")

            print("# " + str(i) + ".3 Normalizing content of the page...")
            if not os.path.isfile(filename + "_filtered_normalized.json") or run_args.erase or os.path.getsize(filename + "_filtered_normalized.json") < 3:
                normalize.normalize_data(filename + "_filtered.json")
            else:
                print("Skipped (-e to not skip)")

            print("# " + str(i) + ".4 Counting main reactions of the page...")
            if not os.path.isfile(filename + "_filtered_normalized_count.json") or run_args.erase or os.path.getsize(filename + "_filtered_normalized_count.json") < 3:
                posts = None
                with open(filename + "_filtered_normalized.json", 'r') as infile:
                    posts = json.load(infile)
                count = Counter()
                for post in posts:
                    count.update([post['reaction']])
                # Add category & fan_count
                output['id'] = page['id']
                output['name'] = page['name']
                output['category'] = page['category']
                output['fan_count'] = page['fan_count']
                output['reaction'] = count
                with open(filename + "_filtered_normalized_count.json", 'w') as outfile:
                    json.dump(output, outfile)
            else:
                print("Skipped (-e to not skip)")

    cats = []
    reacts = []
    X = []
    Y = []
    for element in os.listdir('data/datasets'):
        if element.endswith('_count.json'):
            page = json.load(open('data/datasets/' + element))
            # Build dictionary of main reactions
            if page['reaction']:
                max_key, max_value = max(page['reaction'].items(), key=lambda x: x[1])
                final_max = {max_key: max_value}
                for k, v in page['reaction'].items():
                    if v >= max_value * (1 - run_args.value):
                        final_max[k] = v
                # Debug print for one sample category
                if page['category'] == "Author":
                    print(final_max)
                reacts.append(final_max)
                cats.append(page['category'])

    if run_args.nojoy:
        i_to_delete = []
        for i in range(0, len(cats)):
            if convert_reaction_to_bitmask(reacts[i]) == REACTIONS_BITMASK['joy']:
                i_to_delete.append(i)
        i_to_delete.sort(reverse=True)
        for i in i_to_delete:
            cats.pop(i)
            reacts.pop(i)

    reactions = build_reaction_vector(reacts)
    categories = build_category_vector(cats)
    print(categories)

    for i in range(0, len(cats)):
        # Add a small random offset so overlapping points stay visually separated
        if convert_reaction_to_bitmask(reacts[i]) == REACTIONS_BITMASK['joy'] or run_args.nojoy:
            offset = random.uniform(-0.1, 0.1)
        else:
            offset = 0
        X.append(categories[cats[i]] + offset)
        if convert_reaction_to_bitmask(reacts[i]) == REACTIONS_BITMASK['joy'] or run_args.nojoy:
            offset = random.uniform(-0.1, 0.1)
        else:
            offset = 0
        Y.append(reactions[convert_reaction_to_bitmask(reacts[i])] + offset)

    plt.figure()
    plt.plot(X, Y, '+')
    plt.title('Reactions by categories\nPages with min. 10000 fans')
    plt.figtext(0.9, 0.9,
                str(len(categories)) + ' categories\n' + str(len(reacts)) + ' data points',
                fontsize=9, ha='right')
    bitmasks = []
    for bitmask in list(reactions.keys()):
        bitmasks.append(convert_bitmask_to_react(bitmask))
    plt.yticks(range(len(list(reactions.keys()))), bitmasks, rotation=0)
    plt.xticks(range(len(list(categories.keys()))), list(categories.keys()), rotation=80)
    plt.show()