def train_model_ui():
    base_model_name = data_files_prompt(
        name='base_model_name',
        message='Select a model to train.',
        dir_='models',
        allow_custom_file=True,
        custom_file_choice_name='SpaCy built-in or new...',
        custom_file_message="What is the name of the model you want to train? "
        "If it doesn't exist it will be created.")

    annotations_loaders = get_annotations_loaders()

    a_training = prompt([
        data_files_question(
            name='training_data_filename',
            message="Select the annotated training data file you want to train this model against.",
            dir_='annotated_training_data',
            allow_custom_file=False),
        {
            'type': 'list',
            'name': 'annotations_loader',
            'message': 'What annotation loader do you want to use to load the file?',
            'choices': map(lambda loader: loader[0], annotations_loaders),
            'filter': lambda loader_name: [
                loader[1] for loader in annotations_loaders
                if loader[0] == loader_name
            ][0]
        },
        {
            'type': 'input',
            'name': 'output_model_name',
            'message': "Where do you want to save the trained model? "
            "If the model already exists it will be overwritten.",
            'default': base_model_name,
            'validate': 'required'
        },
        {
            'type': 'input',
            'name': 'num_iter',
            'message': "How many iterations should this training last?",
            'default': '100',
            'validate': lambda a: a.isdigit() and int(a) > 0,
            'filter': 'to_int'
        }
    ])

    # The filter above already resolved the chosen loader name to its callable.
    annotations_loader = a_training['annotations_loader']
    training_data_filename = path.join(get_data_dir(), 'annotated_training_data',
                                       a_training['training_data_filename'])
    output_model_name = path.join(get_data_dir(), 'models',
                                  a_training['output_model_name'])

    train_ner(model=base_model_name,
              training_data=annotations_loader(training_data_filename),
              output_model=output_model_name,
              num_iter=a_training['num_iter'])

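# A minimal sketch, not part of the original module, of what a loader registered in
# get_annotations_loaders() is assumed to look like. The choices/filter handling above
# implies each entry is a (display_name, callable) pair whose callable maps a file path
# to training examples consumable by train_ner(). The JSON-lines layout and the helper
# name below are assumptions for illustration only.
def _example_jsonl_annotations_loader(filepath):
    """Return a list of spaCy-style (text, {'entities': [(start, end, label), ...]}) tuples."""
    import json  # kept local so the sketch stays self-contained

    examples = []
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)
            examples.append((record['text'], {'entities': record['entities']}))
    return examples
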
def get_browser_history_ui(browser: str):
    # This exits if it's not running on the evidence machine
    show_running_on_same_machine_warning()

    a_dataset = prompt([
        {
            'type': 'input',
            'name': 'dataset_filename',
            'message': f"Where should the {browser.capitalize()} history get saved to? If the file does not exist it "
            "will be created, otherwise it will be overwritten.",
            'validate': 'required',
        },
        yes_no_question(
            'load_history',
            'Do you want to also crawl the content of the visited web pages? This operation will take a while...',
            yes_next=[
                {
                    'type': 'input',
                    'name': 'dataset_content_filename',
                    'message': f"Where should the {browser.capitalize()} history content get saved to? If the "
                    "file doesn't exist it will be created, otherwise it will be overwritten.",
                    'validate': 'required'
                },
                datetime_question(
                    'dataset_content_from',
                    'From what date and time would you like to retrieve the history content (format: ISO 8601, blank '
                    'for no lower filtering)?'),
                datetime_question(
                    name='dataset_content_to',
                    message='To what date and time would you like to retrieve the history content (format: ISO 8601, '
                    'blank for no upper filtering)?')
            ]),
    ])

    dataset_filepath = path.join(get_data_dir(), 'datasets', 'history_data',
                                 a_dataset['dataset_filename'])
    history = getattr(browser_history, f'get_{browser}_history')()
    history.to_csv(dataset_filepath, index=False)
    LOGGER.info(f'Saved dataset to {dataset_filepath}')

    # Load history content
    if a_dataset['load_history']:
        dataset_content_filepath = path.join(
            get_data_dir(), 'datasets', 'history_data',
            a_dataset['load_history']['next']['dataset_content_filename'])
        history_content = load_history(
            history, a_dataset['load_history']['next']['dataset_content_from'],
            a_dataset['load_history']['next']['dataset_content_to'])
        history_content.to_csv(dataset_content_filepath, index=False)
        LOGGER.info(f'Saved dataset to {dataset_content_filepath}')

def _process_load_file(df_a, column_name_a, df_b, column_name_b, column_date):
    while True:
        a_set_a_set_b = prompt(q_set_a_set_b)
        if a_set_a_set_b['option'] == 'exit':
            break

        if a_set_a_set_b['option'] == 'set_A':
            a_append_overwrite = prompt(q_append_overwrite)
            # a_file_loc_generic = prompt(q_file_loc_generic)
            a_file_loc_generic = file_loc_generic()
            filename_a = path.join(get_data_dir(), 'datasets', a_file_loc_generic)
            with Halo(text="Loading file...", spinner='dots'):
                read = read_file(filename_a)
            if a_append_overwrite['integrate'] == 'overwrite' or df_a.empty:
                # Filter out empty titles (stored as NaN, i.e. float values) and
                # keep only the content and date columns
                df_a = read.dropna()[[column_name_a, column_date]]
                df_a = spacy_token(df_a, column_name_a)
                df_a = spacy_label_token_full(df_a, column_name_a)
            else:
                # Important: columns must match df_a before taking the union
                df_temporary = read.dropna()[[column_name_a, column_date]]
                df_temporary = spacy_token(df_temporary, column_name_a)
                df_temporary = spacy_label_token_full(df_temporary, column_name_a)
                df_a = venn.venn_union(df_a, df_temporary)

        if a_set_a_set_b['option'] == 'set_B':
            a_append_overwrite = prompt(q_append_overwrite)
            # a_file_loc_generic = prompt(q_file_loc_generic)
            a_file_loc_generic = file_loc_generic()
            filename_b = path.join(get_data_dir(), 'datasets', a_file_loc_generic)
            with Halo(text="Loading file...", spinner='dots'):
                read = read_file(filename_b)
            if a_append_overwrite['integrate'] == 'overwrite' or df_b.empty:
                # Filter out empty titles (stored as NaN, i.e. float values) and
                # keep only the content and date columns
                df_b = read.dropna()[[column_name_b, column_date]]
                df_b = spacy_token(df_b, column_name_b)
                df_b = spacy_label_token_full(df_b, column_name_b)
            else:
                # Important: columns must match df_b before taking the union
                df_temporary = read.dropna()[[column_name_b, column_date]]
                df_temporary = spacy_token(df_temporary, column_name_b)
                df_temporary = spacy_label_token_full(df_temporary, column_name_b)
                df_b = venn.venn_union(df_b, df_temporary)

    return df_a, df_b

def get_twitter_timeline_ui():
    a_dataset = prompt([{
        'type': 'input',
        'name': 'profiled_screen_name',
        'message': "What's the Twitter screen name of the person you're profiling (string after '@')?"
    }, {
        'type': 'input',
        'name': 'dataset_filename',
        'message': "Where should the Twitter timeline get saved to? If the file doesn't exist it "
        "will be created, otherwise it will be overwritten.",
        'validate': 'required'
    }])

    dataset_filepath = path.join(get_data_dir(), 'datasets', 'social_data',
                                 a_dataset['dataset_filename'])
    twitter_timeline = get_twitter_timeline(a_dataset['profiled_screen_name'])
    twitter_timeline.to_csv(dataset_filepath, index=False)
    LOGGER.info(f'Saved dataset to {dataset_filepath}')

def get_facebook_data_ui():
    print('Refer to https://www.facebook.com/help/212802592074644 on how to download data from a '
          'Facebook account. You will need the login credentials of the person you are profiling.')
    input('Press enter when you are ready to proceed...')

    a_dataset = prompt([{
        'type': 'input',
        'name': 'facebook_data_dir',
        'message': "Where is your Facebook data directory located?",
        'validate': 'required'
    }, {
        'type': 'input',
        'name': 'dataset_filename',
        'message': "Where should the Facebook data get saved to? If the file doesn't exist it "
        "will be created, otherwise it will be overwritten.",
        'validate': 'required'
    }])

    dataset_filepath = path.join(get_data_dir(), 'datasets', 'social_data',
                                 a_dataset['dataset_filename'])
    posts = get_facebook_posts(a_dataset['facebook_data_dir'])
    posts.to_csv(dataset_filepath, index=False)
    LOGGER.info(f'Saved dataset to {dataset_filepath}')

def data_files_question(*,
                        name: str,
                        message: str,
                        dir_: str,
                        allow_custom_file: bool = True,
                        custom_file_choice_name: str = 'Other...',
                        custom_file_message: str = 'Filename:',
                        recursive=False):
    """
    Returns a question dictionary containing a list of files in a directory relative to the data folder.

    :param name: name of the question (key of the answer).
    :param message: message shown to the user.
    :param dir_: directory to list, relative to the data folder.
    :param allow_custom_file: whether to append a choice that lets the user type a custom filename.
    :param custom_file_choice_name: label of the custom-file choice.
    :param custom_file_message: message of the follow-up input question for the custom filename.
    :param recursive: whether to also list files in subdirectories.
    :return: a list-type question dictionary.
    """
    dir_path = path.join(get_data_dir(), dir_)

    # Get all files inside the directory, except the directory itself; if recursive, make the
    # paths relative to that folder (remove the first part)
    files: Iterator[str] = map(
        lambda p: path.relpath(p, dir_path),
        glob(path.join(dir_path, '**'),
             recursive=recursive)[1 if recursive else 0:])
    choices: List[Dict[str, Any]] = list(map(lambda f: {'name': f}, files))

    if allow_custom_file:
        choices.append({
            'name': custom_file_choice_name,
            'next': {
                'type': 'input',
                'name': 'filename',
                'message': custom_file_message,
                'validate': 'required'
            }
        })

    return {
        'type': 'list',
        'name': name,
        'message': message,
        'choices': choices
    }

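# For illustration only (assuming the data dir contains 'models/en_core_web_sm' and
# 'models/custom_ner'), data_files_question(name='base_model_name',
# message='Select a model to train.', dir_='models') returns roughly:
#
#     {
#         'type': 'list',
#         'name': 'base_model_name',
#         'message': 'Select a model to train.',
#         'choices': [
#             {'name': 'en_core_web_sm'},
#             {'name': 'custom_ner'},
#             {'name': 'Other...',
#              'next': {'type': 'input', 'name': 'filename',
#                       'message': 'Filename:', 'validate': 'required'}},
#         ],
#     }
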
def save_training_data(training_data):
    training_data_filename = prompt({
        'type': 'input',
        'name': 'training_data_filename',
        'message': "Where should the training data get saved to? If the file doesn't exist it will be created, "
        "otherwise this training data will be appended.",
        'validate': 'required'
    })

    with open(path.join(get_data_dir(), 'training_data', training_data_filename),
              'a', encoding='utf-8') as f:
        f.writelines(map(lambda l: f'{l}\n', training_data))

    LOGGER.info(f'Successfully saved training data {training_data_filename}.')

def get_File_dir():
    return get_data_dir()

def write_file_csv(df, filename):
    output_datasets_name = path.join(get_data_dir(), 'results', filename)
    df.to_csv(output_datasets_name, index=False)

def write_file_json(content, filename):
    # Write to file
    output_datasets_name = path.join(get_data_dir(), 'results', filename)
    with open(output_datasets_name, 'w+') as fp:
        json.dump(content, fp)