Example #1
def make_preprocess():
    '''
        Read interim.csv and perform further cleaning:
        1. Parse StartTime as a datetime
        2. Bin source and destination ports
        3. Add an attribute indicating the direction of flow
        4. Write the result to preprocessed.csv
    '''
    config = load_yaml(CONFIG_PATH)
    interim_output_path = config['interim_output_path']
    preprocessed_output_path = config['preprocessed_output_path']
    proto_dict = load_json(config['proto_dict_path'])
    dir_dict = load_json(config['dir_dict_path'])
    state_dict = load_json(config['state_dict_path'])
    # Well-known ports range from 0 through 1023
    # Registered ports are 1024 to 49151
    # Dynamic ports (also called private ports) are 49152 to 65535
    port_bins = [0, 1023, 49151, 65535]
    port_labels = [0, 1, 2]

    interim_df = pd.read_csv(interim_output_path, sep=',', escapechar='\\')
    preprocessed_df = interim_df
    preprocessed_df['StartTime'] = pd.to_datetime(preprocessed_df['StartTime'])

    preprocessed_df['Proto_Int'] = preprocessed_df['Proto'].map(proto_dict)
    preprocessed_df['Proto_Int'] = preprocessed_df['Proto_Int'].fillna(
        proto_dict['Unknown'])
    preprocessed_df['Proto_Int'] = preprocessed_df['Proto_Int'].astype(
        'category')

    preprocessed_df['Sport_Int'] = pd.cut(preprocessed_df['Sport'],
                                          bins=port_bins,
                                          labels=port_labels,
                                          include_lowest=True)
    preprocessed_df['Sport_Int'] = preprocessed_df['Sport_Int'].astype(
        'category')

    preprocessed_df['Dir_Int'] = preprocessed_df['Dir'].map(dir_dict)
    preprocessed_df['Dir_Int'] = preprocessed_df['Dir_Int'].fillna(
        dir_dict['Unknown'])
    preprocessed_df['Dir_Int'] = preprocessed_df['Dir_Int'].astype('category')

    preprocessed_df['Dport_Int'] = pd.cut(preprocessed_df['Dport'],
                                          bins=port_bins,
                                          labels=port_labels,
                                          include_lowest=True)
    preprocessed_df['Dport_Int'] = preprocessed_df['Dport_Int'].astype(
        'category')

    preprocessed_df['State_Int'] = preprocessed_df['State'].map(state_dict)
    preprocessed_df['State_Int'] = preprocessed_df['State_Int'].fillna(
        state_dict['Unknown'])
    preprocessed_df['State_Int'] = preprocessed_df['State_Int'].astype(
        'category')

    # Flag flows whose source port is outside the well-known range (>= 1024)
    preprocessed_df['is_fwd'] = preprocessed_df['Sport']
    preprocessed_df.loc[preprocessed_df['Sport'] >= 1024, 'is_fwd'] = 1
    preprocessed_df.loc[preprocessed_df['Sport'] < 1024, 'is_fwd'] = 0

    makedirs(dirname(preprocessed_output_path), exist_ok=True)
    preprocessed_df.to_csv(preprocessed_output_path, index=False)
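The port binning above relies on pd.cut with three bins covering well-known, registered, and dynamic ports. A minimal standalone sketch of that same call on a few hypothetical port values (only pandas is assumed):

import pandas as pd

# Hypothetical source ports spanning the three ranges described above
ports = pd.Series([22, 80, 8080, 49152, 65535])

port_bins = [0, 1023, 49151, 65535]
port_labels = [0, 1, 2]

# include_lowest=True keeps port 0 inside the first (well-known) bin
binned = pd.cut(ports, bins=port_bins, labels=port_labels, include_lowest=True)
print(binned.tolist())  # [0, 0, 1, 2, 2]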
Example #2
def update_entity_details(folder_name, file_regex, output_path):
    '''Aggregate parent/child links from dumped entity files and export graph and tree summaries.'''
    file_names = file_util.get_file_name_in_dir_regex(folder_name, file_regex)
    link_data = {}
    parent_of_leaf = []
    all_entities_from_mention = {}
    for file_name in file_names:
        print("file_name", file_name)
        entity_dict = file_util.load(file_name)
        # print(entity_dict)
        for entity_id in entity_dict:
            all_entities_from_mention[entity_id] = entity_dict[entity_id]
            linkto_infos = entity_dict[entity_id]["parents"]
            for linkto_info in linkto_infos:
                source_id = linkto_info['id']
                dest_id = linkto_info['link_to']
                if source_id == entity_id:
                    parent_of_leaf.append(dest_id)
                else:
                    parent_of_leaf.append(source_id)
                    parent_of_leaf.append(dest_id)
                link_data[source_id] = link_data.get(source_id, [])
                link_data[dest_id] = link_data.get(dest_id, [])
                if dest_id not in link_data[source_id] and dest_id != '':
                    link_data[source_id].append(dest_id)
    file_util.dump(link_data,
                   output_path + ".pck")  # "iteration3_data_dumped.pck"
    file_util.dump(parent_of_leaf, output_path + "_parent_leaf.pck")
    file_util.dump_json(link_data, output_path + ".json")
    des_short_name_dict = update_entity_description_shortname(
        link_data, all_entities_from_mention)
    file_util.dump_json(des_short_name_dict, output_path + "_brief.json")
    wiki_graph_util.convert_to_tree(link_data, des_short_name_dict)
    file_util.dump_json(all_entities_from_mention,
                        output_path + "_patent_entity_relations.json")
    excel_tree_level_export.demo(file_util.load_json("all_entity_level.json"))
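The loop above accumulates link_data as an adjacency map from each entity id to the ids it links to, and parent_of_leaf as a flat list of parent ids. A self-contained sketch of that accumulation pattern on hypothetical in-memory data (the real entity files and file_util are not needed here):

# Hypothetical entity dict shaped like the files loaded above
entity_dict = {
    'Q1': {'parents': [{'id': 'Q1', 'link_to': 'Q10'}]},
    'Q2': {'parents': [{'id': 'Q5', 'link_to': 'Q10'}]},
}

link_data = {}
parent_of_leaf = []
for entity_id, entity in entity_dict.items():
    for link in entity['parents']:
        source_id, dest_id = link['id'], link['link_to']
        if source_id == entity_id:
            parent_of_leaf.append(dest_id)
        else:
            parent_of_leaf.extend([source_id, dest_id])
        link_data.setdefault(source_id, [])
        link_data.setdefault(dest_id, [])
        if dest_id and dest_id not in link_data[source_id]:
            link_data[source_id].append(dest_id)

print(link_data)       # {'Q1': ['Q10'], 'Q10': [], 'Q5': ['Q10']}
print(parent_of_leaf)  # ['Q10', 'Q5', 'Q10']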
Example #3
def make_raw_data():
    '''Create input.csv in the project/data/raw/ directory.'''
    config = load_yaml(CONFIG_PATH)
    binetflow_path = config['binet_output_path']
    raw_output_path = config['raw_output_path']
    dataset_path = config['dataset_path']
    dataset_json = load_json(dataset_path)
    dict_mal_hosts = dict_infected_hosts(dataset_json)
    file_list = get_file_list(binetflow_path)
    create_input_csv(file_list, binetflow_path, raw_output_path,
                     dict_mal_hosts)
Example #4
    def get_mystic_codes(cls):
        mystic = []

        for mystic_file in glob(pathjoin(cls.MYSTIC_PATH, '*.json')):
            mystic_json = load_json(mystic_file)

            mystic.append({
                'name': mystic_json.get('name'),
                'description': mystic_json.get('description', ''),
            })

        return mystic
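get_mystic_codes globs MYSTIC_PATH for *.json files and collects each file's name and description. A standalone sketch of the same pattern using only the standard library (the path argument and the json.load call are assumptions, not the project's actual load_json helper):

import json
from glob import glob
from os.path import join as pathjoin

def list_mystic_codes(mystic_path):
    '''Collect name/description pairs from every *.json file in mystic_path.'''
    codes = []
    for mystic_file in glob(pathjoin(mystic_path, '*.json')):
        with open(mystic_file, encoding='utf-8') as fh:
            data = json.load(fh)
        codes.append({
            'name': data.get('name'),
            'description': data.get('description', ''),
        })
    return codes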
Example #5
def load(profile, log):
    if log:
        log_util.set_file_logger(log)
    else:
        log_util.set_std_logger()

    input_params = load_json(profile)

    if 'mystic' not in input_params:
        click.echo('Invalid profile')
    else:
        tohsaka = Tohsaka(input_params.pop('mystic'), input_params)
        tohsaka.go()
Example #6
    def test_weather(self):
        FILENAME = 'vancouver'
        tohsaka = Tohsaka(
            'weather', {
                'appid': os.environ['OPENWEATHER_TOKEN'],
                'city': 'vancouver',
                'country': 'ca',
                'output_file': FILENAME,
                'folder': tempfile.gettempdir()
            })

        tohsaka.go()

        result = load_json(
            pathjoin(tohsaka.outputter.output_folder, FILENAME + '.json'))

        assert result
        assert 'city' in result[0]
        assert 'cnt' in result[0]
Example #7
def get_dataset_json(file_path):
    '''Returns the json for downloading the dataset'''
    return load_json(file_path)
Example #8
if __name__ == "__main__":
    # Load data
    numpy_image = process_image(args.input_image_dir, T_RESIZE_CROP)

    # Load checkpoints
    checkpoint = load_checkpoint(args.checkpoint_filepath)

    # Restore model
    model = reconstruct_model(checkpoint)

    # Prediction
    probs, classes = predict(numpy_image, model, args.top_k, args.gpu)

    # Present results
    cat_to_id_map = None
    if args.category_names:
        cat_to_id_map = load_json(args.category_names)

    print("\nResults for image '{}':".format(args.input_image_dir))
    prob_class_id_tuple_list = sorted([(p, c) for p, c in zip(probs, classes)],
                                      key=lambda t: t[0],
                                      reverse=True)
    for i, (probability, class_id) in enumerate(prob_class_id_tuple_list):
        if cat_to_id_map is not None:
            class_label = cat_to_id_map[str(class_id)] + " ({})".format(
                class_id)
        else:
            class_label = "(Class id: {})".format(class_id)
        print("  {}. {} % - {}".format(i, np.round(probability * 100, 2),
                                       class_label))
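The result loop above ranks (probability, class) pairs by descending probability before printing. A small self-contained sketch of that ranking step with hypothetical values:

import numpy as np

probs = [0.05, 0.72, 0.23]
classes = [3, 17, 8]

ranked = sorted(zip(probs, classes), key=lambda t: t[0], reverse=True)
for rank, (probability, class_id) in enumerate(ranked, start=1):
    print("  {}. {} % - (Class id: {})".format(rank, np.round(probability * 100, 2), class_id))
# 1. 72.0 % - (Class id: 17)
# 2. 23.0 % - (Class id: 8)
# 3. 5.0 % - (Class id: 3)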
Example #9
        entities = wiki_util.get_wiki_id_from_text(word, entity_dict, iter_num)
        if singu_word != word:
            entities.extend(
                wiki_util.get_wiki_id_from_text(singu_word, entity_dict,
                                                iter_num))
        if len(entities) == 0:
            not_found_entity.append(word)
        file_util.dump(entity_dict, output_entity_file)
        file_util.dump(not_found_entity, not_wiki_output)
        print(i, '/', total, ')')  #, word, '###', entities, '###'
    # file_util.dump(entity_dict, "entities_dict_wth_lvl.pck")
    file_util.dump(entity_dict, output_entity_file)
    file_util.dump(not_found_entity, not_wiki_output)


if __name__ == "__main__":
    choice = int(sys.argv[1])
    if not choice:  #choice=0 folder_name, start, end, iteration
        # python3 sony_patent_evaluation/test/crawl_wiki_tree.py 0 entity_folder_09122019 0 10 2
        search_wiki_with_threads(sys.argv[2], int(sys.argv[3]),
                                 int(sys.argv[4]), int(sys.argv[5]))
    elif choice == 1:
        #python3 sony_patent_evaluation/test/crawl_wiki_tree.py 1 "entity_folder_09122019" "_dict_iteration.pck" "09_12_2019"
        update_entity_details(sys.argv[2], sys.argv[3], sys.argv[4])
    else:
        #python3 sony_patent_evaluation/test/crawl_wiki_tree.py 2 "entity_folder_03122019" "_dict_iteration.pck" "07_12_2019"
        excel_tree_level_export.demo(
            file_util.load_json("all_entity_level.json"))
# update_entity_details("entity_folder_09122019", "_dict_iteration.pck", "09_12_2019")
Example #10
    def load_mystic_code(cls, mystic_code):
        filepath = pathjoin(cls.MYSTIC_PATH, mystic_code + '.json')

        return load_json(filepath)