Example #1
# Standard-library and third-party imports used by the snippets below; project-specific
# helpers (e.g. prompt, data_files_prompt, get_data_dir, train_ner, LOGGER) are assumed
# to be provided by the surrounding project and are not shown here.
import json
from glob import glob
from os import path
from typing import Any, Dict, Iterator, List

from halo import Halo

def train_model_ui():
    base_model_name = data_files_prompt(
        name='base_model_name',
        message='Select a model to train.',
        dir_='models',
        allow_custom_file=True,
        custom_file_choice_name='SpaCy built-in or new...',
        custom_file_message="What is the name of the model you want to train? "
        "If it doesn't exist it will be created."
    )

    annotations_loaders = get_annotations_loaders()
    a_training = prompt([
        data_files_question(
            name='training_data_filename',
            message="Select the annotated training data file you want to train this model against.",
            dir_='annotated_training_data',
            allow_custom_file=False),
        {
            'type': 'list',
            'name': 'annotations_loader',
            'message': 'What annotation loader do you want to use to load the file?',
            'choices': [loader[0] for loader in annotations_loaders],
            # Map the selected loader name back to its loader callable
            'filter': lambda loader_name: [
                loader[1] for loader in annotations_loaders
                if loader[0] == loader_name
            ][0]
        },
        {
            'type': 'input',
            'name': 'output_model_name',
            'message': "Where do you want to save the trained model? "
                       "If the model already exists it will be overwritten.",
            'default': base_model_name,
            'validate': 'required'
        },
        {
            'type': 'input',
            'name': 'num_iter',
            'message': "How many iterations should this training last?",
            'default': '100',
            'validate': lambda a: a.isdigit() and int(a) > 0,
            'filter': 'to_int'
        }
    ])

    annotations_loader = a_training['annotations_loader']
    training_data_filename = path.join(get_data_dir(),
                                       'annotated_training_data',
                                       a_training['training_data_filename'])
    output_model_name = path.join(get_data_dir(), 'models',
                                  a_training['output_model_name'])
    train_ner(model=base_model_name,
              training_data=annotations_loader(training_data_filename),
              output_model=output_model_name,
              num_iter=a_training['num_iter'])
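
# A minimal sketch (hypothetical values) of the answers dict prompt() is expected to
# return for the questions above: the 'annotations_loader' answer has already been
# mapped to the loader callable by the question's 'filter', and 'num_iter' is assumed
# to have been converted to an int by the 'to_int' filter.
#
# a_training = {
#     'training_data_filename': 'annotated_articles.jsonl',
#     'annotations_loader': <loader callable>,
#     'output_model_name': 'my_model',
#     'num_iter': 100,
# }
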
def get_browser_history_ui(browser: str):
    # This will exit if we're not running on the evidence machine
    show_running_on_same_machine_warning()

    a_dataset = prompt([
        {
            'type': 'input',
            'name': 'dataset_filename',
            'message': f"Where should the {browser.capitalize()} history get saved to? If the file does not exist it "
                       "will be created, otherwise it will be overwritten.",
            'validate': 'required',
        },
        yes_no_question(
            'load_history',
            'Do you want to also crawl the content of the visited web pages? This operation will take a while...',
            yes_next=[
                {
                    'type': 'input',
                    'name': 'dataset_content_filename',
                    'message': f"Where should the {browser.capitalize()} history content get saved to? If the "
                               "file doesn't exist it will be created, otherwise it will be overwritten.",
                    'validate': 'required'
                },
                datetime_question(
                    'dataset_content_from',
                    'From what date and time would you like to retrieve the history content (format: ISO 8601, '
                    'blank for no lower filtering)?'),
                datetime_question(
                    name='dataset_content_to',
                    message='To what date and time would you like to retrieve the history content (format: ISO 8601, '
                            'blank for no upper filtering)?')
            ]),
    ])

    dataset_filepath = path.join(get_data_dir(), 'datasets', 'history_data',
                                 a_dataset['dataset_filename'])
    history = getattr(browser_history, f'get_{browser}_history')()
    history.to_csv(dataset_filepath, index=False)
    LOGGER.info(f'Saved dataset to {dataset_filepath}')

    # Load history content
    if a_dataset['load_history']:
        dataset_content_filepath = path.join(
            get_data_dir(), 'datasets', 'history_data',
            a_dataset['load_history']['next']['dataset_content_filename'])
        history_content = load_history(
            history, a_dataset['load_history']['next']['dataset_content_from'],
            a_dataset['load_history']['next']['dataset_content_to'])
        history_content.to_csv(dataset_content_filepath, index=False)
        LOGGER.info(f'Saved dataset to {dataset_content_filepath}')
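
# Sketch of the answers structure the code above reads when the user opts in to crawling
# page content (shape inferred from the key accesses above; the filenames are hypothetical
# and the datetime values are assumed to be parsed datetimes, or None when left blank):
#
# a_dataset = {
#     'dataset_filename': 'firefox_history.csv',
#     'load_history': {
#         'next': {
#             'dataset_content_filename': 'firefox_history_content.csv',
#             'dataset_content_from': <datetime or None>,
#             'dataset_content_to': <datetime or None>,
#         }
#     }
# }
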
def _process_load_file(df_a, column_name_a, df_b, column_name_b, column_date):
    while True:
        a_set_a_set_b = prompt(q_set_a_set_b)
        if a_set_a_set_b['option'] == 'exit':
            break

        if a_set_a_set_b['option'] == 'set_A':
            a_append_overwrite = prompt(q_append_overwrite)
            # a_file_loc_generic = prompt(q_file_loc_generic)
            a_file_loc_generic = file_loc_generic()
            filename_a = path.join(get_data_dir(), 'datasets',
                                   a_file_loc_generic)

            with Halo(text=f"Loading load file function...", spinner='dots'):
                read = read_file(filename_a)
                if a_append_overwrite['integrate'] == 'overwrite' or df_a.empty:
                    # Filter out empty titles (stored as nan aka flat values) and
                    # get only content and date column
                    df_a = read.dropna()[[column_name_a, column_date]]
                    df_a = spacy_token(df_a, column_name_a)
                    df_a = spacy_label_token_full(df_a, column_name_a)
                else:
                    df_temporary = read.dropna()[[column_name_a, column_date
                                                  ]]  # impt to match columns
                    df_temporary = spacy_token(df_temporary, column_name_a)
                    df_temporary = spacy_label_token_full(
                        df_temporary, column_name_a)
                    df_a = venn.venn_union(df_a, df_temporary)

        if a_set_a_set_b['option'] == 'set_B':
            a_append_overwrite = prompt(q_append_overwrite)
            # a_file_loc_generic = prompt(q_file_loc_generic)
            a_file_loc_generic = file_loc_generic()
            filename_b = path.join(get_data_dir(), 'datasets',
                                   a_file_loc_generic)

            with Halo(text=f"Loading load file function...", spinner='dots'):
                read = read_file(filename_b)
                if a_append_overwrite['integrate'] == 'overwrite' or df_b.empty:
                    # Filter out empty titles (stored as nan aka flat values) and
                    # get only content and date column
                    df_b = read.dropna()[[column_name_b, column_date]]
                    df_b = spacy_token(df_b, column_name_b)
                    df_b = spacy_label_token_full(df_b, column_name_b)
                else:
                    df_temporary = read.dropna()[[column_name_a, column_date
                                                  ]]  # impt to match columns
                    df_temporary = spacy_token(df_temporary, column_name_b)
                    df_temporary = spacy_label_token_full(
                        df_temporary, column_name_b)
                    df_b = venn.venn_union(df_b, df_temporary)

    return df_a, df_b
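
# A hedged usage sketch for _process_load_file, assuming pandas is available and that
# set A holds browser-history titles while set B holds social-media text (the column
# names below are hypothetical; the real call sites are not shown in this listing):
#
# import pandas as pd
# df_a = pd.DataFrame(columns=['title', 'date'])
# df_b = pd.DataFrame(columns=['text', 'date'])
# df_a, df_b = _process_load_file(df_a, 'title', df_b, 'text', 'date')
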
def get_twitter_timeline_ui():
    a_dataset = prompt([{
        'type': 'input',
        'name': 'profiled_screen_name',
        'message': "What's the Twitter screen name of the person you're profiling (string after '@')?"
    }, {
        'type': 'input',
        'name': 'dataset_filename',
        'message': "Where should the Twitter timeline get saved to? If the file doesn't exist it "
                   "will be created, otherwise it will be overwritten.",
        'validate': 'required'
    }])

    dataset_filepath = path.join(get_data_dir(), 'datasets', 'social_data',
                                 a_dataset['dataset_filename'])
    twitter_timeline = get_twitter_timeline(a_dataset['profiled_screen_name'])
    twitter_timeline.to_csv(dataset_filepath, index=False)
    LOGGER.info(f'Saved dataset to {dataset_filepath}')
def get_facebook_data_ui():
    print(
        'Refer to https://www.facebook.com/help/212802592074644 on how to download data from a '
        'Facebook account. You will need the login credentials of the person you are profiling. '
    )
    input('Press enter when you are ready to proceed...')

    a_dataset = prompt([{
        'type': 'input',
        'name': 'facebook_data_dir',
        'message': "Where is your Facebook data directory located?",
        'validate': 'required'
    }, {
        'type': 'input',
        'name': 'dataset_filename',
        'message': "Where should the Facebook data get saved to? If the file doesn't exist it "
                   "will be created, otherwise it will be overwritten.",
        'validate': 'required'
    }])

    dataset_filepath = path.join(get_data_dir(), 'datasets', 'social_data',
                                 a_dataset['dataset_filename'])
    posts = get_facebook_posts(a_dataset['facebook_data_dir'])
    posts.to_csv(dataset_filepath, index=False)
    LOGGER.info(f'Saved dataset to {dataset_filepath}')
def data_files_question(*,
                        name: str,
                        message: str,
                        dir_: str,
                        allow_custom_file: bool = True,
                        custom_file_choice_name: str = 'Other...',
                        custom_file_message: str = 'Filename:',
                        recursive=False):
    """
    Returns a question dictionary containing a list of files in a directory relative to the data folder.
    :param custom_file_choice_name:
    :param recursive:
    :param custom_file_message:
    :param allow_custom_file:
    :param name:
    :param message:
    :param dir_:
    :return:
    """
    dir_path = path.join(get_data_dir(), dir_)
    # List everything inside the directory and make the paths relative to it. When
    # recursive, glob also returns the directory itself as its first entry, so skip it.
    files: Iterator[str] = map(
        lambda p: path.relpath(p, dir_path),
        glob(path.join(dir_path, '**'),
             recursive=recursive)[1 if recursive else 0:])

    choices: List[Dict[str, Any]] = list(map(lambda f: {'name': f}, files))

    if allow_custom_file:
        choices.append({
            'name': custom_file_choice_name,
            'next': {
                'type': 'input',
                'name': 'filename',
                'message': custom_file_message,
                'validate': 'required'
            }
        })
    return {
        'type': 'list',
        'name': name,
        'message': message,
        'choices': choices
    }
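
# A minimal sketch of what data_files_question(name='base_model_name',
# message='Select a model to train.', dir_='models') would build, using its default
# keyword arguments and assuming a hypothetical 'models' directory that contains
# 'base_model' and 'custom_model':
example_model_question = {
    'type': 'list',
    'name': 'base_model_name',
    'message': 'Select a model to train.',
    'choices': [
        {'name': 'base_model'},
        {'name': 'custom_model'},
        {
            'name': 'Other...',
            'next': {
                'type': 'input',
                'name': 'filename',
                'message': 'Filename:',
                'validate': 'required'
            }
        },
    ],
}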
Example #7
def save_training_data(training_data):
    training_data_filename = prompt({
        'type': 'input',
        'name': 'training_data_filename',
        'message': "Where should the training data get saved to? If the file doesn't exist it will be created, "
                   "otherwise this training data will be appended.",
        'validate': 'required'
    })['training_data_filename']
    with open(path.join(get_data_dir(), 'training_data',
                        training_data_filename),
              'a',
              encoding='utf-8') as f:
        f.writelines(map(lambda l: f'{l}\n', training_data))
    LOGGER.info(f'Successfully saved training data {training_data_filename}.')
def get_File_dir():
    return get_data_dir()
def write_file_csv(df, filename):  # Write a DataFrame as CSV into the results directory.
    output_datasets_name = path.join(get_data_dir(), 'results', filename)
    df.to_csv(output_datasets_name, index=False)
def write_file_json(content, filename):  # Write content as JSON into the results directory.
    output_datasets_name = path.join(get_data_dir(), 'results', filename)
    with open(output_datasets_name, 'w+') as fp:
        json.dump(content, fp)
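
# A short, hedged usage sketch for the two writers above, assuming pandas is available
# and that a 'results' directory exists under the data folder; the DataFrame content and
# the filenames are hypothetical.
if __name__ == '__main__':
    import pandas as pd

    df_example = pd.DataFrame({'token': ['alice', 'london'],
                               'label': ['PERSON', 'GPE']})
    write_file_csv(df_example, 'tokens.csv')         # -> <data dir>/results/tokens.csv
    write_file_json({'matches': 2}, 'summary.json')  # -> <data dir>/results/summary.json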