def set_features(collection_name):
    """Compute and store ``f.t_charscore`` for the documents of a collection.

    Iterates every document of ``client.wiki[collection_name]``. A document
    is skipped when its ``revs`` field is missing, null or holds at most one
    entry, or when it has no ``rwords`` field. For every remaining document
    ``calculate(raw)`` is evaluated and ``BHdist`` of that result against the
    module-level ``good_distr`` is written to the document's
    ``f.t_charscore`` field.

    Args:
        collection_name: Name of the collection inside the ``wiki`` database.
    """
    # Collected per processed document; not consumed in the part of the
    # function visible here -- presumably used further down (TODO confirm).
    raw_list = []
    raw_ids = []
    counter = Counter(100)
    collection = client.wiki[collection_name]  # type: collection.Collection
    for raw in collection.find({}):
        # A missing or null "revs" is treated like a too-short revision list
        # instead of aborting the whole run with KeyError/TypeError.
        revs = raw.get("revs")
        if revs is None or len(revs) <= 1:
            continue

        if "rwords" not in raw:
            continue
        distr = calculate(raw)

        collection.update_one(
            {"_id": raw["_id"]},
            {"$set": {"f.t_charscore": BHdist(distr, good_distr)}},
        )

        # Store a shallow copy so later changes to `distr` cannot leak in.
        raw_list.append(dict(distr.items()))
        raw_ids.append(raw["_id"])
        counter.tick()
    '''
Beispiel #2
0
client = MongoClient('localhost', 27017)

items_collection = client.wiki.any_items  # type: collection.Collection
total_count = 0

# Reset every target collection named in TASK: remove its documents, drop its
# indexes, recreate the ascending index on "r", and add up the requested
# counts of all targets.
for collection_name in TASK:
    collection = client.wiki[collection_name]
    collection.delete_many({})
    collection.drop_indexes()
    collection.create_index([("r", pymongo.ASCENDING)])
    total_count += sum(TASK[collection_name].values())

# set random field
seed()
random_counter = Counter(100)
print("Randomizing")
# Only "_id" is read from each document, so nothing else is fetched. A cursor
# opened with no_cursor_timeout=True is not reaped by the server, so it is
# closed explicitly even when an update fails.
cursor = items_collection.find({}, {'_id': 1}, no_cursor_timeout=True)
try:
    for item in cursor:
        items_collection.update_one({'_id': item['_id']},
                                    {'$set': {'r': random()}})
        random_counter.tick()
finally:
    cursor.close()

print("Random elements created")

# setting indices
indices = items_collection.index_information()
if 'random' not in indices:
    print("Creating index")
    items_collection.create_index([("vandal", pymongo.ASCENDING),
client = MongoClient('localhost', 27017)
raw_collection = client.wiki[COLLECTION_NAME]  # type: collection.Collection

# A REQUIRED_FEATURES list consisting of the single entry '*' selects every
# registered feature; otherwise only the listed keys are used.
select_all = len(REQUIRED_FEATURES) == 1 and REQUIRED_FEATURES[0] == '*'

# Build a fresh dict of instantiated extractors. Assigning FEATURES_LIST
# itself and instantiating in place would overwrite the shared registry's
# factories with instances.
features_list = {
    key: factory()
    for key, factory in FEATURES_LIST.items()
    if select_all or key in REQUIRED_FEATURES
}

# NOTE(review): Collection.count() is deprecated since PyMongo 3.7 and removed
# in 4.0; count_documents({}) is the replacement once the minimum driver
# version allows it.
cnt = Counter(100, raw_collection.count())
# Run every selected feature extractor over each document of raw_collection.
# NOTE(review): the cursor is opened with no_cursor_timeout=True and is not
# closed in the code visible here -- confirm it is closed further down.
for raw in raw_collection.find({}, no_cursor_timeout=True):
    # Skip documents whose revision list is null or has at most one entry.
    if raw["revs"] is None or len(raw["revs"]) <= 1:
        continue

    # Skip documents whose last revision has empty text.
    if raw["revs"][-1]["text"] == "":
        continue

    # Start from the stored "f" sub-document; a missing or falsy one is
    # replaced by a new empty dict.
    f = (raw["f"] if "f" in raw else None) or dict()
    for key in features_list:
        res = features_list[key].extract(raw)
        # A result that is exactly a dict (subclasses excluded) is merged key
        # by key; any other result is stored under the feature's own key.
        if type(res) is dict:
            f.update(res)
        else:
            f[key] = res