Esempio n. 1
0
def fix_olx_model():
    """Count and print how many OLX products have no model set.

    Queries ``db.products`` for documents whose ``source`` is
    ``'olx.sa.com'``, counts those whose ``model`` is ``None`` (a document
    without a ``model`` key is counted as well), prints the count and then
    ``Done``.

    Despite its name, this function only reports; it does not modify any
    document.
    """
    db = dao.get_db()
    products = db.products.find({'source': 'olx.sa.com'})
    # ``.get`` treats a document that lacks the ``model`` key like one whose
    # model is None, instead of aborting the whole scan with a KeyError.
    none_counter = sum(
        1 for product in products if product.get('model') is None)
    print(str(none_counter))
    print("Done")
Esempio n. 2
0
def dump_models():
    """Hand every document of the ``models`` collection to the CSV writer.

    Reads all documents from ``db.models`` into a list and passes that list
    to ``util.save_list_as_csv`` together with the file name ``models.csv``
    and the log directory, then prints ``Done``.
    """
    all_models = list(dao.get_db().models.find({}))
    util.save_list_as_csv(
        all_models,
        'models.csv',
        '/home/bebe/0yasser/shofle/source/shofle_reader/log/',
    )
    print("Done")
Esempio n. 3
0
def update_models():
    """Load models from ``models.csv`` and insert them into ``db.models``.

    Every row returned by ``util.read_csv_as_list_`` is marked active, has
    surrounding whitespace stripped from its ``name`` and ``name_ar``
    values, and is inserted as a new document. Prints ``Done`` when all rows
    have been inserted.
    """
    db = dao.get_db()
    models = util.read_csv_as_list_(
        'models.csv', '/home/bebe/0yasser/shofle/source/shofle_reader/log/')
    for model in models:
        # Previously ``bool('true')``: any non-empty string is truthy, so
        # that spelling only obscured that the value is always True.
        model['active'] = True
        model['name'] = model['name'].strip()
        model['name_ar'] = model['name_ar'].strip()
        # NOTE(review): assumes ``db`` is a PyMongo database - confirm.
        # ``Collection.insert`` is deprecated since PyMongo 3.0 and removed
        # in PyMongo 4; ``insert_one`` is the single-document replacement.
        db.models.insert_one(model)
    print("Done")
Esempio n. 4
0
def _encode_ad(ad, fields_to_project, target_field):
    """Turn one ad document into a feature row with the target value first."""
    row = []
    target_index = None
    for key in ad:
        if key not in fields_to_project:
            continue
        if key == target_field:
            # The target keeps its raw value; remember where it landed.
            target_index = len(row)
            row.append(ad[key])
        else:
            # NOTE: on Python 3, ``str`` hashes are salted per process unless
            # PYTHONHASHSEED is fixed, so these encoded values are only
            # comparable within a single run.
            row.append(hash(ad[key]))
    # Move the target to column 0 by swapping it with whatever is there.
    # Nothing to do when the target is already first or was not projected.
    if target_index is not None and target_index != 0:
        row[0], row[target_index] = row[target_index], row[0]
    return row


def get_ads_dataset(filters, fields_to_project, target_field):
    """Build a numeric-ish dataset from the ads in ``db.products``.

    Each matching ad becomes one row. Fields are taken in the key order of
    the document; every kept field except ``target_field`` is replaced by
    ``hash(value)``, while the target keeps its raw value. The target is then
    swapped with the first column so that it always sits in column 0.

    Args:
        filters: Query document passed as-is to ``db.products.find``.
        fields_to_project: Container of field names to keep (tested with
            ``in``). Only fields that are also part of the fixed projection
            below (brand, model, year, body_type, engine, power, fuel,
            transmission, location, specs) can appear in the result.
        target_field: Name of the field whose raw value goes into column 0.
            If it is not in ``fields_to_project`` no column is swapped.

    Returns:
        A ``numpy`` array with one row per ad, or ``None`` when the query
        yields no usable rows or any error occurs (the error is reported
        through ``util.logerr``).
    """
    db = dao.get_db()
    projection = {
        "brand": 1,
        "model": 1,
        "year": 1,
        "body_type": 1,
        "engine": 1,
        "power": 1,
        "fuel": 1,
        "transmission": 1,
        "location": 1,
        "specs": 1,
        "_id": 0
    }
    try:
        rows = [
            _encode_ad(ad, fields_to_project, target_field)
            for ad in db.products.find(filters, projection)
        ]
        ads = np.array(rows)
        # A 1-D result means there were no rows at all, or (on older NumPy)
        # rows of different lengths; zero columns means nothing was
        # projected. None of these is a usable dataset.
        if ads.ndim < 2 or ads.shape[1] == 0:
            util.logerr('get_ads_dataset: no usable rows '
                        '(empty result or ads with differing fields)')
            return None

        # TODO: ideas noted in an earlier revision - drop implausible engine
        # capacities, label-encode the feature columns, impute by column mode.
        return ads
    except Exception as e:
        # Best-effort contract: report the problem and return None.
        util.logerr(str(e))
    return None