Code example #1
from os import makedirs
from os.path import dirname

import pandas as pd

# load_yaml, load_json, and CONFIG_PATH are project-level helpers/constants
# assumed to be in scope.
def make_preprocess():
    '''
        Read interim.csv and clean the data further:
        1. Read StartTime as DateTime
        2. Perform binning on source and destination ports
        3. Add attribute indicating direction of flow
        4. Write to preprocessed.csv
    '''
    config = load_yaml(CONFIG_PATH)
    interim_output_path = config['interim_output_path']
    preprocessed_output_path = config['preprocessed_output_path']
    proto_dict = load_json(config['proto_dict_path'])
    dir_dict = load_json(config['dir_dict_path'])
    state_dict = load_json(config['state_dict_path'])
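    # proto_dict, dir_dict, and state_dict map raw categorical strings to
    # integer codes; each provides an 'Unknown' key used as a fallback below.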
    # Well-known ports range from 0 through 1023
    # Registered ports are 1024 to 49151
    # Dynamic ports (also called private ports) are 49152 to 65535
    port_bins = [0, 1023, 49151, 65535]
    port_labels = [0, 1, 2]

    interim_df = pd.read_csv(interim_output_path, sep=',', escapechar='\\')
    preprocessed_df = interim_df.copy()
    preprocessed_df['StartTime'] = pd.to_datetime(preprocessed_df['StartTime'])

    preprocessed_df['Proto_Int'] = preprocessed_df['Proto'].map(proto_dict)
    # fillna returns a new Series, so assign the result back
    preprocessed_df['Proto_Int'] = preprocessed_df['Proto_Int'].fillna(
        proto_dict['Unknown'])
    preprocessed_df['Proto_Int'] = preprocessed_df['Proto_Int'].astype(
        'category')

    preprocessed_df['Sport_Int'] = pd.cut(preprocessed_df['Sport'],
                                          bins=port_bins,
                                          labels=port_labels,
                                          include_lowest=True)
    preprocessed_df['Sport_Int'] = preprocessed_df['Sport_Int'].astype(
        'category')

    preprocessed_df['Dir_Int'] = preprocessed_df['Dir'].map(dir_dict)
    preprocessed_df['Dir_Int'] = preprocessed_df['Dir_Int'].fillna(
        dir_dict['Unknown'])
    preprocessed_df['Dir_Int'] = preprocessed_df['Dir_Int'].astype('category')

    preprocessed_df['Dport_Int'] = pd.cut(preprocessed_df['Dport'],
                                          bins=port_bins,
                                          labels=port_labels,
                                          include_lowest=True)
    preprocessed_df['Dport_Int'] = preprocessed_df['Dport_Int'].astype(
        'category')

    preprocessed_df['State_Int'] = preprocessed_df['State'].map(state_dict)
    preprocessed_df['State_Int'] = preprocessed_df['State_Int'].fillna(
        state_dict['Unknown'])
    preprocessed_df['State_Int'] = preprocessed_df['State_Int'].astype(
        'category')

    # is_fwd: 1 when the source port is >= 1024, else 0 (direction-of-flow
    # indicator from step 3 of the docstring).
    preprocessed_df['is_fwd'] = (preprocessed_df['Sport'] >= 1024).astype(int)

    makedirs(dirname(preprocessed_output_path), exist_ok=True)
    preprocessed_df.to_csv(preprocessed_output_path, index=False)
Code example #2
def update_entity_details(folder_name, file_regex, output_path):
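    '''Aggregate per-file entity dicts into a global parent-link graph and
    export it as pickle, JSON, and an Excel tree.'''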
    file_names = file_util.get_file_name_in_dir_regex(folder_name, file_regex)
    link_data = {}
    parent_of_leaf = []
    all_entities_from_mention = {}
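    # Walk every matching entity file, collecting each entity and its
    # parent links into the global structures above.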
    for file_name in file_names:
        print("file_name", file_name)
        entity_dict = file_util.load(file_name)
        for entity_id in entity_dict:
            all_entities_from_mention[entity_id] = entity_dict[entity_id]
            linkto_infos = entity_dict[entity_id]["parents"]
            for linkto_info in linkto_infos:
                source_id = linkto_info['id']
                dest_id = linkto_info['link_to']
                if source_id == entity_id:
                    parent_of_leaf.append(dest_id)
                else:
                    parent_of_leaf.append(source_id)
                    parent_of_leaf.append(dest_id)
                link_data[source_id] = link_data.get(source_id, [])
                link_data[dest_id] = link_data.get(dest_id, [])
                if dest_id not in link_data[source_id] and dest_id != '':
                    link_data[source_id].append(dest_id)
    file_util.dump(link_data,
                   output_path + ".pck")  # "iteration3_data_dumped.pck"
    file_util.dump(parent_of_leaf, output_path + "_parent_leaf.pck")
    file_util.dump_json(link_data, output_path + ".json")
    des_short_name_dict = update_entity_description_shortname(
        link_data, all_entities_from_mention)
    file_util.dump_json(des_short_name_dict, output_path + "_brief.json")
    wiki_graph_util.convert_to_tree(link_data, des_short_name_dict)
    file_util.dump_json(all_entities_from_mention,
                        output_path + "_patent_entity_relations.json")
    excel_tree_level_export.demo(file_util.load_json("all_entity_level.json"))
Code example #3
def make_raw_data():
    ''' create input.csv in project/data/raw/ directory '''
    config = load_yaml(CONFIG_PATH)
    binetflow_path = config['binet_output_path']
    raw_output_path = config['raw_output_path']
    dataset_path = config['dataset_path']
    dataset_json = load_json(dataset_path)
    dict_mal_hosts = dict_infected_hosts(dataset_json)
    file_list = get_file_list(binetflow_path)
    create_input_csv(file_list, binetflow_path, raw_output_path,
                     dict_mal_hosts)
Code example #4
File: tohsaka.py Project: ye11ow/tohsaka
    @classmethod
    def get_mystic_codes(cls):
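        '''Collect the name and description from every mystic-code JSON
        under cls.MYSTIC_PATH.'''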
        mystic = []

        for mystic_file in glob(pathjoin(cls.MYSTIC_PATH, '*.json')):
            mystic_json = load_json(mystic_file)

            mystic.append({
                'name': mystic_json.get('name'),
                'description': mystic_json.get('description', ''),
            })

        return mystic
Code example #5
def load(profile, log):
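    '''Load a JSON profile, set up logging, and run the Tohsaka job the
    profile describes.'''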
    if log:
        log_util.set_file_logger(log)
    else:
        log_util.set_std_logger()

    input_params = load_json(profile)

    if 'mystic' not in input_params:
        click.echo('Invalid profile')
    else:
        tohsaka = Tohsaka(input_params.pop('mystic'), input_params)
        tohsaka.go()
Code example #6
File: test_weather.py Project: ye11ow/tohsaka
    def test_weather(self):
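        '''Run the weather mystic end to end and check the resulting JSON
        output.'''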
        FILENAME = 'vancouver'
        tohsaka = Tohsaka(
            'weather', {
                'appid': os.environ['OPENWEATHER_TOKEN'],
                'city': 'vancouver',
                'country': 'ca',
                'output_file': FILENAME,
                'folder': tempfile.gettempdir()
            })

        tohsaka.go()

        result = load_json(
            pathjoin(tohsaka.outputter.output_folder, FILENAME + '.json'))

        assert result
        assert 'city' in result[0]
        assert 'cnt' in result[0]
Code example #7
def get_dataset_json(file_path):
    '''Returns the json for downloading the dataset'''
    return load_json(file_path)
Code example #8
if __name__ == "__main__":
    # Load data
    numpy_image = process_image(args.input_image_dir, T_RESIZE_CROP)

    # Load checkpoints
    checkpoint = load_checkpoint(args.checkpoint_filepath)

    # Restore model
    model = reconstruct_model(checkpoint)

    # Prediction
    probs, classes = predict(numpy_image, model, args.top_k, args.gpu)

    # Present results
    cat_to_id_map = None
    if args.category_names:
        cat_to_id_map = load_json(args.category_names)

    print("\nResults for image '{}':".format(args.input_image_dir))
    prob_class_id_tuple_list = sorted(zip(probs, classes),
                                      key=lambda t: t[0],
                                      reverse=True)
    for rank, (probability, class_id) in enumerate(prob_class_id_tuple_list,
                                                   start=1):
        if cat_to_id_map is not None:
            class_label = cat_to_id_map[str(class_id)] + " ({})".format(
                class_id)
        else:
            class_label = "(Class id: {})".format(class_id)
        print("  {}. {} % - {}".format(rank, np.round(probability * 100, 2),
                                       class_label))
Code example #9
        entities = wiki_util.get_wiki_id_from_text(word, entity_dict, iter_num)
        if singu_word != word:
            entities.extend(
                wiki_util.get_wiki_id_from_text(singu_word, entity_dict,
                                                iter_num))
        if len(entities) == 0:
            not_found_entity.append(word)
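        # Dump both structures every iteration so progress survives an
        # interruption; the dumps after the loop repeat this one final time.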
        file_util.dump(entity_dict, output_entity_file)
        file_util.dump(not_found_entity, not_wiki_output)
        print(i, '/', total)
    file_util.dump(entity_dict, output_entity_file)
    file_util.dump(not_found_entity, not_wiki_output)


if __name__ == "__main__":
    choice = int(sys.argv[1])
    if not choice:  # choice == 0: folder_name, start, end, iteration
        # python3 sony_patent_evaluation/test/crawl_wiki_tree.py 0 entity_folder_09122019 0 10 2
        search_wiki_with_threads(sys.argv[2], int(sys.argv[3]),
                                 int(sys.argv[4]), int(sys.argv[5]))
    elif choice == 1:
        # python3 sony_patent_evaluation/test/crawl_wiki_tree.py 1 "entity_folder_09122019" "_dict_iteration.pck" "09_12_2019"
        update_entity_details(sys.argv[2], sys.argv[3], sys.argv[4])
    else:
        # python3 sony_patent_evaluation/test/crawl_wiki_tree.py 2
        excel_tree_level_export.demo(
            file_util.load_json("all_entity_level.json"))
Code example #10
File: tohsaka.py Project: ye11ow/tohsaka
    @classmethod
    def load_mystic_code(cls, mystic_code):
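        '''Load the JSON definition of a single mystic code by name.'''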
        filepath = pathjoin(cls.MYSTIC_PATH, mystic_code + '.json')

        return load_json(filepath)