Code example #1
File: stitch.py Project: Parmeggiani-Lab/elfin
def main(test_args=None):
    args = parse_args(sys.argv[1:] if test_args is None else test_args)

    input_ext = args.input_file[args.input_file.rfind('.'):].lower()

    if input_ext == '.json':
        spec = utils.read_json(args.input_file)
        xdb = utils.read_json(args.xdb)

        struct = Stitcher(
            spec,
            xdb,
            args.pdb_dir,
            args.cappings_dir,
            args.metadata_dir,
            args.show_fusion,
            args.disable_capping,
            args.skip_unused
        ).run()

        if args.out_file == '':
            args.out_file = args.input_file
        args.out_file = '.'.join(args.out_file.split('.')[:-1] + ['cif'])

        print('Saving to:', args.out_file)
        pdb_utils.save_cif(struct=struct, path=args.out_file)
    else:
        print('Unknown input file type: "{}"'.format(input_ext))
        sys.exit(1)
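Every snippet on this page leans on small JSON helpers (utils.read_json / utilities.read_json and write_json) whose definitions are not shown. A minimal sketch of what they are assumed to look like, inferred only from how they are called on this page (the indent_length keyword appears in the partitioning examples further down); the real project helpers may differ:

import json


def read_json(path):
    # Assumed behaviour: parse a JSON file and return the resulting object.
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


def write_json(path, obj, indent_length=4):
    # Assumed behaviour: dump obj to path; indent_length=0 is taken to mean
    # compact output, anything else is passed through as the indent width.
    with open(path, 'w', encoding='utf-8') as handle:
        json.dump(obj, handle, indent=indent_length or None)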
Code example #2
def update_hash_in_partitioned_files(
        data_directory,
        file_name,
        suffix_1='_partition_',
        suffix_2='_updated',
        mapping_file_with_hash='mapping_with_hash.json'):
    mapping_json = utilities.read_json(
        os.path.join(data_directory, mapping_file_with_hash))

    dict_per_partition = {}
    empty = 0
    non_empty = 0
    pbar = tqdm(total=len(mapping_json))
    for key in mapping_json:
        if mapping_json[key]['partition_index'] not in dict_per_partition:
            dict_per_partition[mapping_json[key]['partition_index']] = {}
        dict_per_partition[mapping_json[key]['partition_index']][
            mapping_json[key]['id_index']] = {}

        if mapping_json[key]['content_hash'].strip() == '':
            empty += 1
        else:
            non_empty += 1

        dict_per_partition[mapping_json[key]['partition_index']][
            mapping_json[key]
            ['id_index']]['content_hash'] = mapping_json[key]['content_hash']

        pbar.update(1)
    print('empty, non-empty:', empty, non_empty)

    pbar = tqdm(total=len(dict_per_partition))
    for key in dict_per_partition:
        current_json = None
        if key == len(dict_per_partition):
            current_json = utilities.read_json(
                os.path.join(data_directory,
                             file_name + suffix_1 + str(key) + '.json'))
        else:
            current_json = utilities.read_json(
                os.path.join(
                    data_directory,
                    file_name + suffix_1 + str(key) + suffix_2 + '.json'))
        for item in dict_per_partition[key]:
            current_json[item]['content_hash'] = dict_per_partition[key][item][
                'content_hash']
        utilities.write_json(
            os.path.join(
                data_directory,
                file_name + '_with_hash_partition_' + str(key) + '.json'),
            current_json)
        pbar.update(1)
    return
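A hedged usage sketch for update_hash_in_partitioned_files, assuming the function above is importable: the partition files are produced by convert_to_json and adjust_greedy (code examples #17 and #8 below), and mapping_with_hash.json is written by update_hash (code example #19 below); the directory name is a placeholder.

# Hypothetical call; 'partitions/' is a placeholder directory.
update_hash_in_partitioned_files(data_directory='partitions/',
                                 file_name='javascript')
# Reads partitions/mapping_with_hash.json plus the javascript_partition_<n>(_updated).json
# files, and writes javascript_with_hash_partition_<n>.json next to them.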
Code example #3
def collect_core_selection_data(username, password, use_json=False):
    data_set = get_core_selection_data(username, password)

    try:
        os.mkdir("DeGiro")
    except FileExistsError:
        print("DeGiro folder already exists")

    if use_json:
        data_set_filled = read_json(r"DeGiro/core_selection_filled.json")
    else:
        print("Filling JSON data_set_filled..")
        data_set_filled = {}
        for symbol in tqdm(data_set):
            data_set_filled[symbol] = fill_data_points_etfs(data_set[symbol])
            data_set_filled[symbol]['ISIN'] = data_set[symbol]['ISIN']

        with open(r"DeGiro/core_selection_filled.json", 'w') as handle:
            sorted_data_set_filled = sort_dictionary(data_set_filled)
            json.dump(sorted_data_set_filled, handle, indent=4)

    # Make legit data set
    print("Filling JSON data_set_filtered..")
    data_set_filtered = {}
    for symbol in data_set_filled:
        if data_set_filled[symbol]['summary'] is not None:
            data_set_filtered[symbol] = data_set_filled[symbol]
            data_set_filtered[symbol]['ISIN'] = data_set[symbol]['ISIN']

    with open(r"DeGiro/core_selection_filtered.json", 'w') as handle:
        sorted_data_set_filtered = sort_dictionary(data_set_filtered)
        json.dump(sorted_data_set_filtered, handle, indent=4)
    print("Done!")

    return data_set_filtered
Code example #4
    def stitch_images(self, whattostitch):
        print(colored("Image stitching started: {}".format(whattostitch), color='blue'))
        for dirName, subdirList, fileList in os.walk(self.path_RESULTS):
            if 'tile_list.json' in fileList:
                tile_list = read_json(dirName + '/tile_list.json')
                self.just_stitch_them(tile_list, dirName, whattostitch)
        print(colored("Image stitching completed", color='green'))
Code example #5
def taste(food):
    '''
    The main public interface of the system.
    This function returns a JSON object that contains
    six taste scores, on a scale of 0 to 10:
    bitter, rich, salt, sour, sweet and umami
    '''
    food = append_parsed(food)
    nutrients = get_nutrients(food)
    tastes = {
        "bitter": round(bitter(food, nutrients), 3),
        "rich": round(rich(nutrients), 3),
        "salt": round(salt(nutrients), 3),
        "sour": round(sour(food, nutrients), 3),
        "sweet": round(sweet(nutrients), 3),
        "umami": round(umami(food, nutrients, 3))
    }
    if os.path.exists("adjustment_factors.json"):
        for taste, adjustment \
                in utilities.read_json("adjustment_factors.json").items():

            tastes[taste] -= adjustment

    if not os.path.exists("../Utilities/Team 2"):
        os.mkdir("../Utilities/Team 2")
    with open("../Utilities/Team 2/tastes.csv", "a") as csvfile:
        csvfile.write(",".join(
            [str(food['dish_id'])] +
            [str(round(tastes[key], 3))
             for key in sorted(tastes.keys())]) + "\n")
    return tastes
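A hedged usage sketch of taste(). Judging from the code, food is expected to be a dict carrying at least a dish_id and an ingredient_str (plus whatever append_parsed and get_nutrients need); the sample dish is invented, and the module name taster is inferred from code example #20 further down.

from taster import taste  # module name inferred from code example #20

# Invented dish; only the fields referenced in the snippets above are shown.
dish = {
    'dish_id': 42,                              # written into tastes.csv
    'ingredient_str': '2 cups dark chocolate',  # matched against descriptor files
}
scores = taste(dish)
print(scores)  # {'bitter': ..., 'rich': ..., 'salt': ..., ...}, each on a 0-10 scale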
Code example #6
def bitter(food,
           nutrition_data,
           LEVEL1_MULTIPLIER=0.80,
           LEVEL2_MULTIPLIER=1.40,
           MULTI_WORD_MULTIPLIER=2.3):
    '''
    This function computes the bitter score for the food item.
    This computation is performed with three approaches:
    - The iron content is used as a fraction of the total active nutrient weight
    - Three groups of descriptors are used, each of which
        has a different weightage towards the final scoring.
    - The bitter words are divided into two levels, each corresponding
        to a particular intensity of bitterness. Each of these levels
        again has a different weightage.

    Finally, a real value from 0-10 is returned as a bitter score
    for the food item.
    '''
    try:
        bitter_descriptors = utilities.read_json("bitter_descriptors.json")
        descriptor_score = match_descriptors(food['ingredient_str'],
                                             bitter_descriptors)
        bitterscore = (nutrition_data["iron"] /
                       total_nutrient_weight(nutrition_data))
        pairings = zip(
            [LEVEL1_MULTIPLIER, LEVEL2_MULTIPLIER, MULTI_WORD_MULTIPLIER],
            ["bitter_l1", "bitter_l2", "multi_words"])
        for multiplier, descriptor_key in pairings:
            if descriptor_key in descriptor_score:
                bitterscore += multiplier * descriptor_score[descriptor_key]
    except Exception:
        bitterscore = 0

    return round(bitterscore / 1.4571, 3)
Code example #7
def sour(food,
         nutrition_data,
         SOURNESS_FACTOR_X=0.5,
         SOURNESS_FACTOR_Y=0.15,
         SOURNESS_FACTOR_Z=0.35):
    '''
    For sourness, an approach similar to how bitter was calculated
    is applied.
    Here, the nutrient directly affecting the sourness of a food item
    is the vitamin C content. Therefore, this is given the maximum weightage
    of 50% that counts towards the final sour score for the dish.

    The other two metrics are calculated from comparing against
    an internally maintained database of ingredients and keywords,
    each of which is given its own weightage. These are divided
    into two levels: sour and too sour. Naturally, too sour keywords have
    a higher weightage than keywords that are tagged just sour.
    '''
    food_words = food['ingredient_str'].upper().split(' ')

    try:
        vitamin_c = nutrition_data['vitamin_c'] * 1000
    except KeyError:
        vitamin_c = 0.0

    sour = utilities.read_json("sour.json")
    too_sour = utilities.read_json("too_sour.json")
    try:
        sour_score_x = vitamin_c / nutrition_data['weight']
    except ZeroDivisionError:
        sour_score_x = 0

    sour_score_y = 0
    sour_score_z = 0

    for word in food_words:
        if not word:
            continue
        if word in sour.get(word[0], []):
            sour_score_y += 1
        if word in too_sour.get(word[0], []):
            sour_score_z += 1
    sour_score = round(((SOURNESS_FACTOR_X * sour_score_x) +
                        (SOURNESS_FACTOR_Y * sour_score_y) +
                        (SOURNESS_FACTOR_Z * sour_score_z)) / 1.43, 3)
    if sour_score > 1:
        sour_score = 1
    return round(sour_score * 10, 3)
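The lookups sour[word[0]] and too_sour[word[0]] suggest that sour.json and too_sour.json are keyed by first letter, each mapping to a list of uppercased keywords. A self-contained sketch of that assumed layout and the membership test (the sample entries are invented):

# Assumed shape of sour.json / too_sour.json, inferred from the indexing above.
sour = {
    'L': ['LEMON', 'LIME'],
    'V': ['VINEGAR'],
}

for word in 'LEMON VINEGAR BREAD'.split(' '):
    if word and word in sour.get(word[0], []):
        print('sour keyword:', word)  # matches LEMON and VINEGAR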
Code example #8
def adjust_greedy(partitioned_data_directory,
                  partitioned_file_name='javascript_partition_',
                  remaining_file_name='remaining.json'):
    remaining_json = utilities.read_json(
        os.path.join(partitioned_data_directory, remaining_file_name))

    pbar_overall = tqdm(total=len(remaining_json))
    for key in remaining_json:
        current_json = utilities.read_json(
            os.path.join(partitioned_data_directory,
                         partitioned_file_name + key + '.json'))

        for str_id_index in remaining_json[key]:
            remaining_entry = remaining_json[key][str_id_index]
            if 'visit_id' not in remaining_entry:
                continue

            for field in ('visit_id', 'top_url', 'script_url',
                          'script_line', 'script_col'):
                current_json[str_id_index][field] = remaining_entry[field]
            current_json[str_id_index]['information'].extend(
                remaining_entry['information'])

        utilities.write_json(os.path.join(
            partitioned_data_directory,
            partitioned_file_name + key + '_updated.json'),
                             current_json,
                             indent_length=0)
        pbar_overall.update(1)

    return
Code example #9
def _loadLayoutFile(path: str) -> pd.DataFrame:
    df: pd.DataFrame = pd.read_csv(get_resource('empty.csv'),
                                   names=KEY_SPEC.keys(),
                                   dtype=KEY_SPEC)
    assert len(df.columns) == len(KEY_SPEC)
    keyboard_json = read_json(path)
    layout = pd.read_json(json.dumps(keyboard_json['keys']))
    # DataFrame.append was removed in pandas 2.x; pd.concat is the equivalent here.
    df = pd.concat([df, layout])
    assert len(df.columns) == len(KEY_SPEC)

    setattr(df, 'keyboard_left', keyboard_json['left'])
    setattr(df, 'keyboard_top', keyboard_json['top'])
    setattr(df, 'keyboard_width', keyboard_json['width'])
    setattr(df, 'keyboard_height', keyboard_json['height'])
    return df
Code example #10
    def request_token(self, credentials_path):
        credentials = read_json(credentials_path)
        authentication = Task(self.loop)
        hdrs = {'Content-Type': 'application/json'}
        req_auth = authentication.do_the_task(self.TOKEN_REQ_URL, hdrs,
                                              json.dumps(credentials))
        self.TOKEN = req_auth[0]
        self.TOKEN['timestamp'] = time.time()
        headers.update(
            {'Authorization': 'Bearer {}'.format(self.TOKEN['id_token'])})
        write_json(self.TOKEN, path_token)
        if 'status_code' in req_auth[0] and req_auth[0]['status_code'] == 200:
            print(
                colored("New token saved in: {}".format(path_token),
                        color='green'))
        else:
            print(
                colored("New token request Failed, status code: {}".format(
                    req_auth[0]['status_code']),
                        color='red'))
Code example #11
    def check_token(self, TOKEN_path):
        valid = False
        try:
            TOKEN = read_json(TOKEN_path)
            authentication = Task(self.loop)

            if token_seems_valid(TOKEN):
                hdrs = copy.deepcopy(headers)
                hdrs.update(
                    {'Authorization': 'Bearer {}'.format(TOKEN['id_token'])})
                auth = authentication.do_the_task(
                    self.AUTHORISE_URL, hdrs,
                    json.dumps({"token": TOKEN['id_token']}))

                if 'status_code' in auth[0] and auth[0]['status_code'] == 200:
                    valid = True
                    print(
                        colored("Token authentication successful ",
                                color='green'))
                    self.TOKEN = TOKEN
                    headers.update({
                        'Authorization':
                        'Bearer {}'.format(self.TOKEN['id_token'])
                    })
                else:
                    print(
                        colored("Token authentication Failed, status code:{}".
                                format(auth[0]['status_code']),
                                color='red'),
                        colored("\nbut, new Token request will be called",
                                color='blue'))

                    valid = False
            else:
                valid = False
        except Exception:
            print(colored("Token Not found!", color='red'),
                  colored("but, new request will be called", color='blue'))
        finally:
            return valid
Code example #12
    def count_cars(self):
        print(colored("Cars counting started", color='blue'))

        cars = []
        nof_maps = 0
        for dirName, subdirList, fileList in os.walk(self.path_RESULTS):
            if 'mapId_0' in subdirList:
                nof_maps = len(subdirList)
            if 'detections.geojson' in fileList:
                cars_in_tile = 0
                detection = read_json(dirName+'/'+'detections.geojson')
                for feature in detection['features']:
                    props = dict(feature["properties"])
                    if props.get('class') and 'cars' in props.values():
                        cars_in_tile += int(props['count'])

                cars.append(cars_in_tile)

        print(colored("Total # of cars counted in the desired period: {}".format(sum(cars)),
                      color='magenta', on_color='on_white', attrs=['bold']))
        print(colored("counted in {} maps, resulting in {} of cars in average.".format(nof_maps, sum(cars)/nof_maps),
                      color='magenta', on_color='on_white', attrs=['bold']))
Code example #13
def umami(food,
          nutrition_data,
          PROTEIN_SUPPLEMENT_MULTIPLIER=0.80,
          VEGETABLES_MULTIPLIER=7,
          MEAT_MULTIPLIER=3,
          STRING_MULTIPLIER=9.45):
    '''
    Calculation of the umami score is similar to that of bitter -
    the protein content, along with the added effects of different
    categories of keywords, such as savoury vegetables,
    the savouriness of different meats as well as
    naturally occurring sources of protein, such as casein,
    all have their own weights that determine their contribution
    to the final score.
    '''
    for key in nutrition_data.keys():
        if nutrition_data[key] is None:
            nutrition_data[key] = 0
    umami_descriptors = utilities.read_json("umami_descriptors.json")
    descriptor_score = match_descriptors(food['ingredient_str'],
                                         umami_descriptors)
    try:
        umamiscore = (nutrition_data["protein"] /
                      total_nutrient_weight(nutrition_data))

        pairings = zip([
            PROTEIN_SUPPLEMENT_MULTIPLIER, VEGETABLES_MULTIPLIER,
            MEAT_MULTIPLIER, STRING_MULTIPLIER
        ], ["protein_supps", "vegetables", "meat", "savory_strings"])
        for multiplier, descriptor_key in pairings:
            if descriptor_key in descriptor_score:
                umamiscore += multiplier * descriptor_score[descriptor_key]
    except Exception:
        umamiscore = 0

    return round(umamiscore, 3) if umamiscore <= 10 else 10
Code example #14
def extract_features(directory_path, result_path, keywords_file, feature_type_to_read):
    all_files = utilities.get_files_in_a_directory(directory_path)
    temp = utilities.read_file(keywords_file)

    keywords_list = []
    for item in temp:
        keywords_list.append(item.strip())

    for f_name in all_files:
        try:
            file_data = utilities.read_json(f_name)
            if feature_type_to_read == ALL:
                raw_features = new_walk(file_data)
            elif feature_type_to_read == NO_NAMES:
                raw_features = new_walk_no_names(file_data)
            elif feature_type_to_read == KEYWORD:
                raw_features = new_walk_reserved_words(file_data, keywords_list)

            utilities.append_list(f_name.replace(directory_path, result_path).replace('json', 'txt'), raw_features)
        except Exception as e:
            print(f_name)
            print(str(e))

    return
Code example #15
File: cuisine_classifier.py Project: pratul29/food
    vector = vectorizer.fit_transform(all_recipes)
    for index in test_indices:
        test_dish = training_set[test_indices[index]]
        neighbors = get_neighbors(test_dish, training_set, vector,
                                  similarity_measure)
        neighbors_cuisines = [(get_cuisine_tags(dish_name[0]), dish_name[1])
                              for dish_name in neighbors][:7]
        cuisines_dict[test_dish['dish_name']] = knn(neighbors_cuisines)
    return cuisines_dict


if __name__ == '__main__':
    if len(sys.argv) == 3 or len(sys.argv) == 4:
        all_recipes = list()
        sample_size = 1300
        if len(sys.argv) == 4:
            sample_size = int(sys.argv[3])
        all_dishes = read_json(sys.argv[1])
        test_dishes = read_json(sys.argv[2])
        for dish, value in classify_cuisine(all_dishes[:sample_size],
                                            test_dishes,
                                            cosine_similarity).items():
            print(dish)
            print("------------")
            print(value)
            print()
    else:
        print(
            "python3 cuisine_classifier.py <path_do_dish_database> <path_to_test_dishes> <sample_size>(OPTIONAL)"
        )
Code example #16
File: concat.py Project: stothe2/muppet
def main(files):
    # We're done if there's only one file.
    if len(files) == 1:
        return

    # Check if files exist.
    for file in files:
        assert os.path.isfile(file)

    data = []
    for i, file in enumerate(files):
        # Load files.
        data.append(read_json(file))
        # Check if necessary fields are present in all files.
        # TODO: Maybe make a function to do this, since it's used often? Also, is this really necessary here?
        required_fields = [
            'experiment_name', 'experiment_paradigm', 'date', 'neuroid',
            'f_sampling', 'f_low', 'f_high', 'ellip_order', 'threshold_sd',
            'chunks_for_threshold', 'start_time', 'stop_time', 'spikes',
            'baseline', 'n_trials', 'n_channels', 'stim_on_time',
            'stim_off_time', 'stim_on_delay', 'inter_trial_interval',
            'stim_size', 'fixation_point_size', 'fixation_window_size',
        ]
        for field in required_fields:
            assert field in data[i], field
        assert 'animal' in data[i]['neuroid']
        for field in ('spikes', 'n_grey', 'n_other'):
            assert field in data[i]['baseline'], field

    # Check if necessary field values match in all files.
    # TODO: Check if neuroid ids match too?
    matching_fields = [
        'experiment_name', 'experiment_paradigm', 'f_sampling', 'f_low',
        'f_high', 'ellip_order', 'threshold_sd', 'chunks_for_threshold',
        'start_time', 'stop_time', 'n_channels', 'stim_on_time',
        'stim_off_time', 'stim_on_delay', 'inter_trial_interval',
        'stim_size', 'fixation_point_size', 'fixation_window_size',
    ]
    for field in matching_fields:
        assert all(session_data[field] == data[0][field] for session_data in data), field
    assert all(session_data['neuroid']['animal'] == data[0]['neuroid']['animal'] for session_data in data)
    for field in ('n_grey', 'n_other'):
        assert all(session_data['baseline'][field] == data[0]['baseline'][field] for session_data in data), field

    # Populate the new dictionary which will contain all the merged data.
    concatenated_data = dict()
    for key, value in data[0].items():
        if key in ['spikes', 'baseline', 'trial_times', 'date', 'n_trials']:
            continue
        concatenated_data[key] = value

    # Merge dates on which the experiment was run.
    dates = list(set([session_data['date'] for session_data in data]))
    concatenated_data['date'] = ', '.join(dates)

    # Merge the number of trials.
    concatenated_data['n_trials'] = sum(session_data['n_trials'] for session_data in data)

    # Create a grouping_idx field so it is easy to identify which trials were run on which days
    # for normalization purposes.
    grouping_dates = []  # Temporary list of dates used to compute groupings
    grouping = []  # This is initially a list of n_trials per date (combines different sessions run on same day)
    for session_data in data:
        if session_data['date'] in grouping_dates:
            grouping[grouping_dates.index(session_data['date'])] += session_data['n_trials']
        else:
            grouping_dates.append(session_data['date'])
            grouping.append(session_data['n_trials'])
    # We update the grouping variable so now it stores a list of indexes that can be used to select
    # appropriate coordinates from the trial dimension of a PSTH XArray.
    for i, group in enumerate(grouping):
        group = list(range(group))
        print(group)
        if i != 0:
            group = [_ + grouping[i-1][-1] + 1 for _ in group]
        grouping[i] = group
    assert len(grouping) == len(concatenated_data['date'].split(','))
    concatenated_data['grouping_idx'] = grouping

    # Merge spikes.
    # TODO: a more efficient way?
    concatenated_data['spikes'] = {}
    for channel in data[0]['spikes']:
        concatenated_data['spikes'][channel] = {}
        for item in data[0]['spikes'][channel]:
            _ = {}  # Initialize an empty dictionary that will contain data for all trials.
            trial_counter = 0  # Initialize a counter for trial number.
            for session_data in data:
                for trial_data in session_data['spikes'][channel][item].values():
                    trial_counter += 1
                    _[trial_counter] = trial_data
            concatenated_data['spikes'][channel][item] = _

    # Merge baseline.
    concatenated_data['baseline'] = {}
    concatenated_data['baseline']['n_grey'] = data[0]['baseline']['n_grey']
    concatenated_data['baseline']['n_other'] = data[0]['baseline']['n_other']

    concatenated_data['baseline']['spikes'] = {}
    for channel in data[0]['baseline']['spikes']:
        concatenated_data['baseline']['spikes'][channel] = {}
        for item in data[0]['baseline']['spikes'][channel]:
            _ = {}  # Initialize an empty dictionary that will contain data for all trials.
            trial_counter = 0  # Initialize a counter for trial number.
            for session_data in data:
                for trial_data in session_data['baseline']['spikes'][channel][item].values():
                    trial_counter += 1
                    _[trial_counter] = trial_data
            concatenated_data['baseline']['spikes'][channel][item] = _

    # Store data.
    with open('data.json', 'w') as f:
        json.dump(concatenated_data, f, indent=4)  # TODO: Store in a braintree directory?

    return
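The grouping_idx step above is easiest to see with a tiny, made-up example: with per-date trial counts of 3 and 2, the indexes partition the concatenated trial dimension into [0, 1, 2] and [3, 4]. A standalone sketch of just that step:

# Standalone illustration of the grouping-index logic above, with made-up counts.
grouping = [3, 2]  # n_trials per date, as produced by the first grouping loop
for i, group in enumerate(grouping):
    group = list(range(group))
    if i != 0:
        group = [idx + grouping[i - 1][-1] + 1 for idx in group]
    grouping[i] = group
print(grouping)  # -> [[0, 1, 2], [3, 4]]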
Code example #17
def convert_to_json(db_addr, data_directory, file_name, partition_size=10):
    con = sqlite3.connect(db_addr)
    con.row_factory = sqlite3.Row
    cur = con.cursor()
    cur.execute("SELECT MAX(id) as max_id FROM javascript")
    total_rows = cur.fetchone()['max_id']
    pbar = tqdm(total=total_rows)
    cur.execute(
        'SELECT visit_id, script_url, top_level_url, symbol, arguments, value, script_line, script_col FROM javascript'
    )

    js_data = {}
    # unique_scripts_keymap = {}

    id_index = 0
    # key_counter = 0
    id_key_map = {}
    # unique_scripts = []
    current_partition_index = 1
    required_partition_index = 1
    next_partition_index = 1
    next_check = True

    for row in cur:
        str_id = str(row[0]) + '|' + row[2] + '|' + row[1]

        if str_id not in id_key_map:
            if next_partition_index != current_partition_index:
                # check if the file exists read from there, otherwise create a new file/object
                utilities.write_json(os.path.join(
                    data_directory, file_name + '_partition_' +
                    str(current_partition_index) + '.json'),
                                     js_data,
                                     indent_length=0)
                if os.path.exists(
                        os.path.join(
                            data_directory, file_name + '_partition_' +
                            str(next_partition_index) + '.json')):
                    js_data = utilities.read_json(
                        os.path.join(
                            data_directory, file_name + '_partition_' +
                            str(next_partition_index) + '.json'))
                    print(
                        '39: Writing:', file_name + '_partition_' +
                        str(current_partition_index), 'Reading:',
                        file_name + '_partition_' + str(next_partition_index),
                        'Next Partition: ', next_partition_index)
                else:
                    print(
                        '39: Writing:', file_name + '_partition_' +
                        str(current_partition_index), 'Not Reading:',
                        file_name + '_partition_' + str(next_partition_index),
                        'Next Partition: ', next_partition_index)
                current_partition_index = next_partition_index
                next_check = True

            id_index += 1
            str_id_index = str(id_index)
            id_key_map[str_id] = {}
            id_key_map[str_id]['id_index'] = str_id_index

            id_key_map[str_id]['partition_index'] = current_partition_index
            js_data[str_id_index] = {}
            js_data[str_id_index]['information'] = []
        else:
            str_id_index = id_key_map[str_id]['id_index']
            required_partition_index = id_key_map[str_id]['partition_index']

            if required_partition_index != current_partition_index:
                utilities.write_json(os.path.join(
                    data_directory, file_name + '_partition_' +
                    str(current_partition_index) + '.json'),
                                     js_data,
                                     indent_length=0)
                js_data = utilities.read_json(
                    os.path.join(
                        data_directory, file_name + '_partition_' +
                        str(required_partition_index) + '.json'))
                print(
                    '58: Writing:',
                    file_name + '_partition_' + str(current_partition_index),
                    'Reading:',
                    file_name + '_partition_' + str(required_partition_index),
                    'Next Partition: ', next_partition_index)
                current_partition_index = required_partition_index
            # make sure we have the right json object for the partition

        js_data[str_id_index]['visit_id'] = row[0]
        js_data[str_id_index]['top_url'] = row[2]
        js_data[str_id_index]['script_url'] = row[1]
        js_data[str_id_index]['script_line'] = row[6]
        js_data[str_id_index]['script_col'] = row[7]
        js_data[str_id_index]['information'].append({
            'symbol': row[3],
            'argument': row[4],
            'value': row[5]
        })

        if id_index % partition_size == 0:
            # Because you may not update id_index and might still read rows before the partition.
            # if not os.path.exists(os.path.join(data_directory, file_name + '_partition_' + str(next_partition_index) + '.json')):
            if next_check:
                utilities.write_json(os.path.join(
                    data_directory, file_name + '_partition_' +
                    str(next_partition_index) + '.json'),
                                     js_data,
                                     indent_length=0)
                next_partition_index += 1
                print('76: Writing:',
                      file_name + '_partition_' + str(current_partition_index),
                      'Next Partition: ', next_partition_index)
                current_partition_index = next_partition_index
                required_partition_index = next_partition_index
                js_data = {}
                next_check = False

        pbar.update(1)
    utilities.write_json(os.path.join(data_directory, 'mapping.json'),
                         id_key_map,
                         indent_length=4)
    return
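A hedged usage sketch for convert_to_json: db_addr should point at a SQLite database with a javascript table containing the columns selected in the query above; the paths and the larger partition size are placeholders.

# Hypothetical call; partition_size controls how many unique script ids go
# into each javascript_partition_<n>.json file.
convert_to_json(db_addr='crawl-data.sqlite',
                data_directory='partitions/',
                file_name='javascript',
                partition_size=10000)
# Also writes partitions/mapping.json, the id-to-partition map consumed by the
# update_hash examples elsewhere on this page.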
Code example #18
File: ingredient_parser.py Project: pratul29/food
        i.strip() for i in ingredient.lower().split(' ') if i not in adjectives
    ]

    measurement = str()
    prev_word = str()
    for word in ingredient_tokens:
        if len(difflib.get_close_matches(word, measurements, cutoff=0.9)) > 0:
            measurement = prev_word + ' ' + word

        prev_word = word
    if len(measurement) == 0:
        measurement = re.match(r'\d+(\.\d+)?', ingredient)
        if measurement is not None:
            measurement = measurement.group(0)
        else:
            measurement = str()

    ingredient_tokens = [i.strip() for i in ingredient_tokens]
    ingredient = ' '.join(ingredient_tokens).strip()
    ingredient = ingredient.replace(measurement, '')
    ingredient = rejector.process(ingredient)
    ingredient_dict['measurement'] = measurement
    ingredient_dict['ingredient'] = rejector.process(ingredient)
    return ingredient_dict


if __name__ == '__main__':
    for file in sys.argv[1:]:
        for item in utilities.read_json(file):
            print(json.dumps(parse_recipe(item), indent="  "))
Code example #19
def update_hash(db_addr,
                partitioned_data_directory,
                mapping_file_name='mapping.json'):
    mapping_json = utilities.read_json(
        os.path.join(partitioned_data_directory, mapping_file_name))

    con = sqlite3.connect(db_addr)
    con.row_factory = sqlite3.Row
    cur = con.cursor()
    cur.execute("SELECT MAX(id) as max_id FROM http_responses")
    total_rows = cur.fetchone()['max_id']
    pbar = tqdm(total=total_rows)
    cur.execute('SELECT visit_id, url, content_hash FROM http_responses')

    updated_mapping_object = {}
    url_updated_mapping_object = {}
    total_keys_matched = 0

    for key in mapping_json:
        new_key = key.split('|', 2)[0].strip() + \
            '|' + key.split('|', 2)[2].strip()
        updated_mapping_object[new_key] = {}
        updated_mapping_object[new_key]['old_key'] = key
        updated_mapping_object[new_key]['id_index'] = mapping_json[key][
            'id_index']
        updated_mapping_object[new_key]['partition_index'] = mapping_json[key][
            'partition_index']
        updated_mapping_object[new_key]['content_hash'] = ''

        mapping_json[key]['new_id_url_key'] = new_key

        new_key = key.split('|', 2)[2].strip()
        url_updated_mapping_object[new_key] = {}
        url_updated_mapping_object[new_key]['old_key'] = key
        url_updated_mapping_object[new_key]['id_index'] = mapping_json[key][
            'id_index']
        url_updated_mapping_object[new_key]['partition_index'] = mapping_json[
            key]['partition_index']
        url_updated_mapping_object[new_key]['content_hash'] = ''

        mapping_json[key]['new_url_key'] = new_key

    for row in cur:
        if row[2] != '' and row[2] is not None:
            db_row_key = str(row[0]) + '|' + row[1].strip()
            if db_row_key in updated_mapping_object:
                updated_mapping_object[db_row_key]['content_hash'] = row[
                    2].strip()
                total_keys_matched += 1
            if row[1].strip() != '' and row[1].strip(
            ) in url_updated_mapping_object:
                url_updated_mapping_object[
                    row[1].strip()]['content_hash'] = row[2].strip()
                total_keys_matched += 1

                # if row[1].strip() != '':
                #     for key in updated_mapping_object:
                #         if key.split('|', 2)[1].strip() == row[1].strip():
                #             updated_mapping_object[key]['content_hash'] = row[2].strip()
                #             total_keys_matched += 1
                #             break
        pbar.update(1)
    print('Total keys matched:', total_keys_matched)
    utilities.write_json(os.path.join(partitioned_data_directory,
                                      'mapping_updated.json'),
                         updated_mapping_object,
                         indent_length=0)
    utilities.write_json(os.path.join(partitioned_data_directory,
                                      'url_only_mapping_updated.json'),
                         url_updated_mapping_object,
                         indent_length=0)

    for key in mapping_json:
        mapping_json[key]['content_hash'] = updated_mapping_object[
            mapping_json[key]['new_id_url_key']]['content_hash']
        if mapping_json[key]['content_hash'] == '':
            mapping_json[key]['content_hash'] = url_updated_mapping_object[
                mapping_json[key]['new_url_key']]['content_hash']

    utilities.write_json(os.path.join(partitioned_data_directory,
                                      'mapping_with_hash.json'),
                         mapping_json,
                         indent_length=0)
    return
Code example #20
            ha, distance_ax = "right", 1

        ax.text(angle_rad,
                10 + distance_ax,
                cat[i],
                size=10,
                horizontalalignment=ha,
                verticalalignment="center")

    # Show polar plot
    plt.show()


if arguments.profile:
    for dishfile in arguments.profile:
        for dish in utilities.read_json(dishfile):
            print("\n" + dish["dish_name"] + "\n" +
                  "=" * len(dish["dish_name"]))
            data = taster.taste(dish)
            print(json.dumps(data, sort_keys=True, indent="  "))
            show_graph(data)

if arguments.validate:
    from validator import Validator
    adjustment = dict()
    if os.path.exists("adjustment_factors.json"):
        adjustment = utilities.read_json("adjustment_factors.json")
    for vjob in arguments.validate:
        with open(vjob[1]) as genfile, open(vjob[2]) as survfile:
            gendata = json.load(genfile)
            survdata = json.load(survfile)
        vobj = Validator(gendata, survdata)