def set_features(collection_name):
    """Store a forest-model score on every qualifying document.

    Walks all documents of ``client.wiki[collection_name]``, builds one
    feature row per qualifying document, passes the rows to
    ``frst.predict_proba`` in a single batch and writes column 1 of the
    result back to each document as ``f.forest_score``.

    A document qualifies when all of the following hold:

    * ``rwords`` is present and non-empty,
    * ``revs`` has more than one entry,
    * ``f`` is present and has at least 25 entries,
    * every name in ``OK_FEATURES`` is a key of ``f``.

    The ``vandal`` label is not read, so unlabeled documents are scored
    as well.

    :param collection_name: name of the collection inside ``client.wiki``
    :return: None
    """
    counter = Counter(100)
    collection = client.wiki[collection_name]  # type: collection.Collection

    doc_ids = []
    feature_rows = []
    for raw in collection.find({}):
        if not raw.get("rwords"):
            continue
        if len(raw["revs"]) <= 1 or "f" not in raw or len(raw["f"]) < 25:
            continue
        features = raw["f"]
        if any(name not in features for name in OK_FEATURES):
            continue
        # NOTE(review): the column order of a row follows the key order
        # stored in the document, not the order of OK_FEATURES. This is
        # kept as-is because it presumably matches how frst was fitted --
        # confirm that every document stores its features in one order.
        feature_rows.append(
            [value for name, value in features.items() if name in OK_FEATURES]
        )
        doc_ids.append(raw["_id"])
        counter.tick()

    if not feature_rows:
        # Nothing qualified: do not hand an empty batch to the model.
        # NOTE(review): assumes frst rejects empty input the way
        # scikit-learn estimators do -- confirm.
        return

    pred = frst.predict_proba(feature_rows)
    for doc_id, score in zip(doc_ids, pred[:, 1]):
        collection.update_one(
            {"_id": doc_id},
            {"$set": {"f.forest_score": score}},
        )
def insertdb(data):
    """Write ``data`` to the module-level ``collection`` and print the outcome.

    ``data`` may be either of the two shapes the underlying calls accept:

    * a mapping -- upserted with ``update_one(data, {'$set': data},
      upsert=True)``, so an identical document is not inserted twice;
    * anything else (expected: a list of write operations) -- passed to
      ``bulk_write``.

    Previously both calls were issued with the same ``data``. Their
    argument types are mutually exclusive, so one of them always raised
    and the function always printed the failure message.

    On success prints '添加完成' ("insert complete") followed by the
    timestamp. On failure prints '重复添加' ("duplicate insert") followed by
    the timestamp on stdout, and the exception on stderr so that the real
    cause is not lost.

    :param data: a document mapping, or a list of write operations
    :return: None
    """
    import sys
    from collections.abc import Mapping

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        if isinstance(data, Mapping):
            collection.update_one(data, {'$set': data}, upsert=True)
        else:
            collection.bulk_write(data)
    except Exception as exc:
        # Best-effort by design: report and carry on, but do not swallow
        # KeyboardInterrupt/SystemExit the way a bare ``except:`` did.
        print('重复添加' + timestamp)
        print(repr(exc), file=sys.stderr)
    else:
        print('添加完成' + timestamp)
def set_features(collection_name):
    """Store two text-model scores on every document that has text features.

    For each document of ``client.wiki[collection_name]`` that contains
    ``TEXT_FEATURE_KEY``, the mapping stored under that key is split by
    the sign of its values (``sign(value)``):

    * keys with a positive sign go into one dict with value ``1``;
    * keys with a negative sign go into a second dict with the negated
      sign as value.

    Keys for which ``check_rgb(key)`` is true, or which contain a space,
    are dropped from both dicts. Keys whose sign is neither positive nor
    negative are dropped too.

    Both dict lists are transformed with ``fh.transform``; the first is
    scored by ``lr.predict_proba`` and the second by
    ``lr2.predict_proba``. Column 1 of each result is written back as
    ``f.t_biscore`` and ``f.t_biscore_opp`` respectively.

    The ``vandal`` label is not read, so unlabeled documents are scored
    as well.

    :param collection_name: name of the collection inside ``client.wiki``
    :return: None
    """
    counter = Counter(100)
    collection = client.wiki[collection_name]  # type: collection.Collection

    doc_ids = []
    positive_rows = []
    negative_rows = []
    for raw in collection.find({}, {"_id": 1, TEXT_FEATURE_KEY: 1}):
        if TEXT_FEATURE_KEY not in raw:
            continue

        positive = {}
        negative = {}
        for token, weight in raw[TEXT_FEATURE_KEY].items():
            direction = sign(weight)
            if direction > 0:
                target, value = positive, 1
            elif direction < 0:
                target, value = negative, direction * (-1)
            else:
                continue
            if check_rgb(token) or ' ' in token:
                continue
            target[token] = value

        positive_rows.append(positive)
        negative_rows.append(negative)
        doc_ids.append(raw["_id"])
        counter.tick()

    if not doc_ids:
        # Nothing to score: do not hand empty batches to the models.
        # NOTE(review): assumes fh/lr/lr2 reject empty input the way
        # scikit-learn objects do -- confirm.
        return

    pred = lr.predict_proba(fh.transform(positive_rows))
    pred2 = lr2.predict_proba(fh.transform(negative_rows))
    for doc_id, score, score_opp in zip(doc_ids, pred[:, 1], pred2[:, 1]):
        collection.update_one(
            {"_id": doc_id},
            {"$set": {
                "f.t_biscore": score,
                'f.t_biscore_opp': score_opp,
            }},
        )
def set_features(collection_name):
    """Write a character-distribution score onto documents of a collection.

    For every document of ``client.wiki[collection_name]`` that has more
    than one entry in ``revs`` and contains ``rwords``, the result of
    ``calculate(raw)`` is compared against ``good_distr`` with ``BHdist``
    and the value is stored on the document as ``f.t_charscore``.

    NOTE(review): ``BHdist`` is presumably a Bhattacharyya distance and
    ``Counter(100)`` a progress ticker -- confirm against their
    definitions; neither is visible here.

    :param collection_name: name of the collection inside ``client.wiki``
    """
    # Copies of each distribution and the matching ids are collected, but
    # nothing in the visible part of the function reads them.
    raw_list = []
    raw_ids = []
    counter = Counter(100)
    collection = client.wiki[collection_name]  # type: collection.Collection
    for raw in collection.find({}):
        # Skip documents with at most one revision.
        if len(raw["revs"]) <= 1:
            continue
        # Skip documents without the "rwords" field.
        if "rwords" not in raw:
            continue
        distr = calculate(raw)
        # The score is written immediately, one update per document.
        collection.update_one(
            {"_id": raw["_id"]},
            {"$set": {
                "f.t_charscore": BHdist(distr, good_distr)
            }})
        # Shallow copy of the distribution mapping.
        raw_list.append({key: value for key, value in distr.items()})
        raw_ids.append(raw["_id"])
        counter.tick()
    # NOTE(review): the triple quote below opens a string literal whose
    # remainder lies outside the visible source; left untouched.
    '''
def check_collection(collection: pymongo.collection.Collection,
                     db_client: database.client.DatabaseClient):
    """
    Audit every document stored in a collection.

    For each document two things happen:

    1. If its ``_type`` contains no ``.``, the entity registry is asked for
       candidate classes. When exactly one candidate exists and it differs
       from the stored value, an error is logged and the stored ``_type`` is
       replaced by that candidate. Only the database copy is changed; the
       in-memory document keeps the value it was read with.
    2. The document is deserialized through ``db_client``. A failure is
       logged together with its traceback. A successfully built entity that
       offers a ``validate`` attribute is validated, and a falsy result is
       logged.

    :param collection: the collection whose documents are checked
    :param db_client: client used to deserialize each document
    :return: None
    """
    log = logging.getLogger(__name__)

    for s_entity in collection.find():
        # Step 1: repair an unqualified type name when it is unambiguous.
        stored_type = s_entity['_type']
        if '.' not in stored_type:
            candidates = database.entity_registry.find_potential_entity_classes(
                stored_type)
            if len(candidates) == 1:
                qualified = candidates[0]
                if qualified != stored_type:
                    log.error(
                        "Entity {0} had unqualified type {1}".format(
                            s_entity['_id'], stored_type))
                    collection.update_one(
                        {'_id': s_entity['_id']},
                        {'$set': {'_type': qualified}})

        # Step 2: deserialize, then validate whatever came back.
        try:
            entity = db_client.deserialize_entity(s_entity)
        except Exception:
            log.error(
                "Exception occurred deserializing object {0}:\n{1}".format(
                    s_entity['_id'], traceback.format_exc()))
            continue

        if entity is None or not hasattr(entity, 'validate'):
            continue
        if entity.validate():
            continue
        log.error(
            "Entity {0} ({1}) failed validation".format(
                entity.identifier, s_entity['_type']))
pred = frst.predict_proba(raw_list) for k in range(0,len(raw_list)): raw_orig[k]["score"] = pred[k,1] #* other[k,0] # * other_char[k, 0] cnt = 0 collection = client.wiki['new_big_train'] # type: collection.Collection for obj in sorted(raw_orig, key=lambda x: x["score"], reverse=False): if obj["vandal"] == 1: cnt += 1 collection.update_one({ "_id": obj["_id"] }, { "$unset": { "ignore3": 1 } }) if cnt % 3 != 0: collection.update_one({"_id": obj["_id"]}, {"$set": { "ignore3": 1 }}) if cnt > 900: break continue print(obj["url"]) if query_yes_no("^^^ Make vandal?"): collection.update_one({"_id": obj["_id"]}, {"$set": { "vandal": True