コード例 #1
0
def set_features(collection_name):
    """Score the documents of a wiki collection and store the score on each.

    Every document of ``client.wiki[collection_name]`` that passes the
    filters below contributes one feature row. All rows are scored with a
    single ``frst.predict_proba`` call and column 1 of the result is written
    back to the originating document as ``f.forest_score``.

    A document is skipped when:
      * ``rwords`` is missing or empty,
      * ``revs`` has at most one entry,
      * ``f`` is missing or holds fewer than 25 entries,
      * any name listed in ``OK_FEATURES`` is absent from ``f``.

    :param collection_name: name of the collection inside ``client.wiki``.
    :return: None
    """
    feature_rows = []
    doc_ids = []
    # Project helper exposing tick(); presumably progress reporting -- TODO confirm.
    counter = Counter(100)
    collection = client.wiki[collection_name]  # type: collection.Collection
    for raw in collection.find({}):
        if "rwords" not in raw or len(raw["rwords"]) == 0:
            continue

        if len(raw["revs"]) <= 1 or "f" not in raw or len(raw["f"]) < 25:
            continue

        if any(f not in raw["f"] for f in OK_FEATURES):
            continue

        # NOTE(review): the column order follows the key order of the stored
        # "f" sub-document, not the order of OK_FEATURES. Confirm that every
        # document stores its features in the order the model was fitted on.
        feature_rows.append(
            [x for n, x in raw["f"].items() if n in OK_FEATURES])
        doc_ids.append(raw["_id"])
        counter.tick()

    if not feature_rows:
        # Nothing qualified: do not hand the model an empty batch.
        return

    pred = frst.predict_proba(feature_rows)
    # Column 1 of the prediction is what gets persisted as the score.
    for doc_id, score in zip(doc_ids, pred[:, 1]):
        collection.update_one({"_id": doc_id},
                              {"$set": {
                                  "f.forest_score": score
                              }})
コード例 #2
0
def insertdb(data):
    """Write ``data`` to the module-level ``collection`` and print the outcome.

    ``data`` may take one of two shapes, and each is routed to the call that
    accepts it:

      * a ``dict`` -- upserted with ``update_one``, using the document itself
        as both the filter and the ``$set`` payload;
      * anything else (expected: a list of write operations) -- passed to
        ``bulk_write``.

    Sending the same object to both calls can never succeed, because
    ``bulk_write`` requires a list of operations while ``update_one``
    requires a mapping as its filter.

    On success the "added" message is printed. On failure the "duplicate"
    message is printed as before, and the real exception is logged so that
    other errors are not silently mistaken for duplicates.

    :param data: a document (``dict``) or a list of bulk write operations.
    :return: None
    """
    import logging

    download_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        if isinstance(data, dict):
            collection.update_one(data, {'$set': data}, upsert=True)
        else:
            collection.bulk_write(data)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logging.getLogger(__name__).exception("insertdb failed")
        print('重复添加'+download_time)
    else:
        print('添加完成'+download_time)
コード例 #3
0
def set_features(collection_name):
    """Score the text features of a wiki collection and store both scores.

    For every document of ``client.wiki[collection_name]`` that carries
    ``TEXT_FEATURE_KEY``, each feature value is reduced with ``sign`` and the
    features are split into two bags:

      * features with a positive sign, each mapped to ``1``;
      * features with a negative sign, each mapped to the negated sign.

    Feature names for which ``check_rgb`` is truthy, or which contain a
    space, are left out of both bags. The positive bags are scored with
    ``lr`` and the negative bags with ``lr2`` (both after ``fh.transform``).
    Column 1 of each prediction is written back as ``f.t_biscore`` and
    ``f.t_biscore_opp`` respectively.

    :param collection_name: name of the collection inside ``client.wiki``.
    :return: None
    """
    positive_rows = []
    negative_rows = []
    doc_ids = []
    # Project helper exposing tick(); presumably progress reporting -- TODO confirm.
    counter = Counter(100)
    collection = client.wiki[collection_name]  # type: collection.Collection
    # Only the fields that are actually read below are fetched.
    projection = {"_id": 1, TEXT_FEATURE_KEY: 1}
    for raw in collection.find({}, projection):
        if TEXT_FEATURE_KEY not in raw:
            continue

        signs = {x: sign(y) for x, y in raw[TEXT_FEATURE_KEY].items()}

        positive_rows.append({
            x: 1
            for x, y in signs.items()
            if y > 0 and not check_rgb(x) and ' ' not in x
        })
        negative_rows.append({
            x: y * (-1)
            for x, y in signs.items()
            if y < 0 and not check_rgb(x) and ' ' not in x
        })
        doc_ids.append(raw["_id"])
        counter.tick()

    if not doc_ids:
        # Nothing qualified: do not hand the models an empty batch.
        return

    pred = lr.predict_proba(fh.transform(positive_rows))
    pred2 = lr2.predict_proba(fh.transform(negative_rows))
    # The three sequences are index-aligned: one entry per kept document.
    for doc_id, score, score_opp in zip(doc_ids, pred[:, 1], pred2[:, 1]):
        collection.update_one(
            {"_id": doc_id},
            {
                "$set": {
                    "f.t_biscore": score,
                    'f.t_biscore_opp': score_opp
                }
            })
コード例 #4
0
def set_features(collection_name):
    """Compute a per-document distribution and store its distance score.

    For every document of ``client.wiki[collection_name]`` that has more
    than one entry in ``revs`` and contains ``rwords``, ``calculate(raw)``
    is evaluated and ``BHdist`` of that result against the module-level
    ``good_distr`` is written to the document as ``f.t_charscore``. A copy
    of each distribution and the matching ``_id`` are also collected in
    ``raw_list`` / ``raw_ids``.

    NOTE(review): the remainder of this function is not visible here; what
    it does with ``raw_list`` and ``raw_ids`` cannot be told from this part.

    :param collection_name: name of the collection inside ``client.wiki``.
    """
    raw_list = []
    raw_ids = []
    # Project helper exposing tick(); presumably progress reporting -- TODO confirm.
    counter = Counter(100)
    collection = client.wiki[collection_name]  # type: collection.Collection
    for raw in collection.find({}):
        # Skip documents with at most one revision.
        if len(raw["revs"]) <= 1:
            continue

        # Skip documents without the "rwords" field.
        if "rwords" not in raw:
            continue
        distr = calculate(raw)

        # Persist the score right away, one update per document.
        # BHdist is presumably a Bhattacharyya distance -- TODO confirm.
        collection.update_one(
            {"_id": raw["_id"]},
            {"$set": {
                "f.t_charscore": BHdist(distr, good_distr)
            }})

        # Keep a shallow copy of the distribution next to its document id.
        raw_list.append({key: value for key, value in distr.items()})
        raw_ids.append(raw["_id"])
        counter.tick()
    '''
コード例 #5
0
def check_collection(collection: pymongo.collection.Collection,
                     db_client: database.client.DatabaseClient):
    """
    Walk every stored entity of a collection, patching and checking each one.

    Three things happen per stored entity, and every problem is reported at
    error level on this module's logger:

    1. If ``_type`` contains no ``.``, the entity registry is asked for
       candidate classes. When there is exactly one candidate and it differs
       from the stored value, the stored ``_type`` is replaced with it in
       the database.
    2. The entity is deserialized through ``db_client``; any exception is
       logged together with its traceback.
    3. If deserialization succeeded and the entity has a ``validate``
       attribute, it is called and a falsy result is logged.

    :param collection: the collection whose entities are checked
    :param db_client: client used to deserialize the stored entities
    :return: None
    """
    log = logging.getLogger(__name__)

    for stored in collection.find():
        # Step 1: qualify a bare type name when the registry is unambiguous.
        # NOTE(review): only the database copy is patched; ``stored`` keeps
        # the old value for the steps below -- confirm that is intended.
        if '.' not in stored['_type']:
            candidates = database.entity_registry.find_potential_entity_classes(
                stored['_type'])
            single_match = len(candidates) == 1
            if single_match and candidates[0] != stored['_type']:
                log.error(
                    "Entity {0} had unqualified type {1}".format(
                        stored['_id'], stored['_type']))
                collection.update_one(
                    {'_id': stored['_id']},
                    {'$set': {'_type': candidates[0]}})

        # Step 2: deserialize; a failure leaves ``entity`` as None.
        entity = None
        try:
            entity = db_client.deserialize_entity(stored)
        except Exception:
            log.error(
                "Exception occurred deserializing object {0}:\n{1}".format(
                    stored['_id'], traceback.format_exc()))

        # Step 3: validate only what was loaded and supports validation.
        if entity is None or not hasattr(entity, 'validate'):
            continue
        if entity.validate():
            continue
        log.error(
            "Entity {0} ({1}) failed validation".format(
                entity.identifier, stored['_type']))
コード例 #6
0
# Score every row of raw_list in one call; column 1 of the result is used below.
pred = frst.predict_proba(raw_list)

# Attach each row's score to the entry of raw_orig at the same index.
for k in range(0,len(raw_list)):
    raw_orig[k]["score"] = pred[k,1] #* other[k,0] # * other_char[k, 0]


# Number of entries with vandal == 1 seen so far by the loop that follows.
cnt = 0
# Collection updated by the loop that follows.
collection = client.wiki['new_big_train']  # type: collection.Collection
for obj in sorted(raw_orig, key=lambda x: x["score"], reverse=False):
    if obj["vandal"] == 1:
        cnt += 1

        collection.update_one({
            "_id": obj["_id"]
        }, {
            "$unset": {
                "ignore3": 1
            }
        })
        if cnt % 3 != 0:
            collection.update_one({"_id": obj["_id"]}, {"$set": {
                "ignore3": 1
            }})

        if cnt > 900:
            break
        continue
        print(obj["url"])
        if query_yes_no("^^^ Make vandal?"):
            collection.update_one({"_id": obj["_id"]}, {"$set": {
                "vandal": True