Code Example #1
def build_power_prediction_model():
    util.loginfo(
        "==============================================================")

    ads = get_ads_dataset(
        {
            # one $and holds all four clauses: duplicate "$and" keys in a
            # Python dict literal silently overwrite each other
            "$and": [
                {"engine": {"$ne": "0"}},
                {"engine": {"$ne": 0}},
                {"power": {"$ne": 0}},
                {"power": {"$ne": "0"}},
            ]
        }, ["model", "engine", "power"], "power")

    build_model(ads, "power")
    util.loginfo(
        "======================== Done ===============================")
Code Example #2
File: dao.py Project: YaserMarey/ads_aggregator
def add_list_to_db(item_list):
    db = get_db()
    try:
        db.products.insert_many(item_list)  # insert_many replaces the deprecated insert
        util.loginfo('ad(s) inserted successfully')
    except Exception as e:
        util.logerr(str(e))
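Each dao.py helper assumes a get_db() factory. A plausible sketch using pymongo; the connection string and database name here are assumptions, not values from the project:

from pymongo import MongoClient

def get_db():
    # illustrative connection details; real values would come from settings
    client = MongoClient('mongodb://localhost:27017/')
    return client['ads_aggregator']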
Code Example #3
def build_price_prediction_model():
    util.loginfo(
        "==============================================================")
    ads = get_ads_dataset(
        {
            "body_type": {
                "$ne": ""
            },
            "fuel": {
                "$ne": ""
            },
            "price": {
                "$ne": ""
            },
            "color": {
                "$ne": ""
            },
            "year": {
                "$ne": ""
            },
            "mileage": {
                "$ne": ""
            }
        }, [
            "brand", "model", "year", "body_type", "transmission", "location",
            "specs", "color", "mileage"
        ], "price")
    build_model(ads, "price")
    util.loginfo(
        "======================== Done ===============================")
Code Example #4
    def classify(self, ad, is_raw_ad=False):

        votes = {
            'SAL': 0,
            'PRT': 0,
            'TRN': 0,
            'SVC': 0,
            'EXP': 0,
            'REQ': 0,
            'ACC': 0,
            'EXC': 0,
            'OTH': 0,
            'INV': 0
        }

        if is_raw_ad:
            # build the classification text; the link slug is appended only
            # for haraj.com.sa ads, so only that part sits inside the ternary
            ad = {
                'ad_text':
                ad['description'] + " " + ad['title'] + " " +
                ad['source'].replace('.com', '').replace('.sa', '').replace(
                    '.ksa', '') +
                (" " + ad['ad_page_link'][:-1].rsplit('/', 1)[-1]
                 if ad['source'] == 'haraj.com.sa' else "")
            }
            util.logdebug(
                "-------------------------------------------------------------------------"
            )
            util.logdebug(ad['ad_text'])
            util.logdebug(
                "-------------------------------------------------------------------------"
            )
        else:
            ad_txt = ''
            for w in ad:
                if ad[w]: ad_txt += ' ' + w
            print(ad_txt)

        for classifier_name, classifier in self.trained_classifiers:
            if is_raw_ad:
                vote = classifier.classify(
                    self.generate_feature(ad,
                                          self.words_sets[classifier_name]))
            else:
                vote = classifier.classify(ad)

            if vote == '': continue
            util.logdebug(classifier_name + " voted: " + vote)
            votes[vote] += 1
        util.logdebug("final votes across all classifiers: " + str(votes))

        vote = max(votes, key=votes.get)
        confidence = float(votes[vote]) / float(len(self.trained_classifiers))
        util.loginfo("final vote is: " + vote)
        util.loginfo("confidence is: " + str(confidence))
        if is_raw_ad:
            return vote, confidence
        else:
            return vote
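classify() depends on generate_feature(), which is also not shown. For NLTK-style classifiers this is usually a word-presence dict, matching the else branch above where a prepared ad maps each word to a boolean; a sketch under that assumption:

def generate_feature(self, ad, words_set):
    # bag-of-words presence features over this classifier's vocabulary
    tokens = set(ad['ad_text'].split())
    return {word: (word in tokens) for word in words_set}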
Code Example #5
File: dao.py Project: YaserMarey/ads_aggregator
def add_ad_to_db(item):
    db = get_db()
    try:
        db.products.insert_one(item)
        util.loginfo('ad inserted successfully')
    except Exception as e:
        util.logerr(str(e))
Code Example #6
File: dao.py Project: YaserMarey/ads_aggregator
def deactivate(item):
    db = get_db()
    try:
        db.products.update_one({'source': item.get('source'), 'ad_id': item.get('ad_id')},
                               {'$set': {'active': False}})
        util.loginfo('ad deactivated successfully')
    except Exception as e:
        util.logerr(str(e))
Code Example #7
File: dao.py Project: YaserMarey/ads_aggregator
def delete(item):
    db = get_db()
    try:
        db.products.delete_many({'source': item.get('source'), 'ad_id': item.get('ad_id')})
        util.loginfo('ad deleted successfully')
    except Exception as e:
        util.logerr(str(e))
Code Example #8
File: dao.py Project: YaserMarey/ads_aggregator
def get_models():
    db = get_db()
    models = []  # default so the return below is safe if the query fails
    try:
        models = list(db.models.find({'active': True}))
        util.loginfo('models retrieved successfully')
    except Exception as e:
        util.logerr(str(e))
    return models
Code Example #9
File: dao.py Project: YaserMarey/ads_aggregator
def get_brands():
    db = get_db()
    brands = []  # default so the return below is safe if the query fails
    try:
        brands = list(db.brands.find({'active': True}))
        util.loginfo('brands retrieved successfully')
    except Exception as e:
        util.logerr(str(e))
    return brands
Code Example #10
File: dao.py Project: YaserMarey/ads_aggregator
def get_ad_by_query(q):
    db = get_db()
    ads = []  # default so the return below is safe if the query fails
    try:
        ads = list(db.products.find(q))
        util.loginfo('ads retrieved successfully')
    except Exception as e:
        util.logerr(str(e))
    return ads
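Example use of get_ad_by_query with an arbitrary MongoDB filter, reusing field names and a category code that appear elsewhere in this listing:

sale_ads = get_ad_by_query({'ad_cat': 'SAL', 'active': True})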
Code Example #11
File: dao.py Project: YaserMarey/ads_aggregator
def get_ad():
    db = get_db()
    ad = None  # default so the return below is safe if the query fails
    try:
        ad = db.products.find_one()
        util.loginfo('ad retrieved successfully')
    except Exception as e:
        util.logerr(str(e))
    return ad
Code Example #12
    def feature_selection(self, number_of_features_to_remove, raw_list_of_ads,
                          best_classifier, best_words_set, best_accuracy):
        # read ads samples from training database

        util.loginfo("---------------- feature selection ----------------")
        util.logdebug("accuracy trying to beat is " + str(best_accuracy))
        number_of_features_removed = 0
        counter = 0
        while (counter < len(best_words_set) - 1 and
               number_of_features_removed < number_of_features_to_remove):
            removed = best_words_set.pop(
                random.randint(0,
                               len(best_words_set) - 1))
            util.logdebug("feature trying to test its value is " +
                          u''.join(removed))
            accuracy_ = []
            for i in range(0, 11):
                util.logdebug("shuffling for " + str(i))
                random.shuffle(raw_list_of_ads)
                raw_train_rows = int(math.floor(0.7 * len(raw_list_of_ads)))
                raw_train_set, raw_test_set = raw_list_of_ads[:raw_train_rows], raw_list_of_ads[
                    raw_train_rows:]

                train_list_of_ads = [
                    (self.generate_feature(ad, best_words_set), cat)
                    for (ad, cat) in raw_train_set
                ]
                test_list_of_ads = [(self.generate_feature(ad, best_words_set),
                                     cat) for (ad, cat) in raw_test_set]

                new_classifier = best_classifier.train(train_list_of_ads)

                new_accuracy = nltk.classify.accuracy(new_classifier,
                                                      test_list_of_ads)

                accuracy_.append(new_accuracy)
            if np.median(accuracy_) > best_accuracy:
                util.loginfo("---------- accuracy captured when removing " +
                             removed + " is " + str(accuracy_))
                util.loginfo("---------- new median accuracy" +
                             str(np.median(accuracy_)))
                util.loginfo("---------- feature removed is" +
                             u''.join(removed))
                best_accuracy = np.median(accuracy_)
                best_classifier = new_classifier
                number_of_features_removed += 1
            else:
                best_words_set.append(removed)
            counter += 1
        util.loginfo("accuracy : " + str(best_accuracy))
        return best_classifier, best_words_set
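A hypothetical call, trying to prune up to three low-value words from the current best model; the argument names mirror the parameters above:

best_classifier, best_words_set = self.feature_selection(
    3, raw_list_of_ads, best_classifier, best_words_set, best_accuracy)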
Code Example #13
def build_price_prediction_model():
    util.loginfo(
        "==============================================================")
    ads = get_ads_dataset({
        "year": {
            "$ne": 0
        },
        "mileage": {
            "$ne": ""
        }
    }, ["brand", "model", "year", "mileage"], "price")
    return build_model(ads)
Code Example #14
File: dao.py Project: YaserMarey/ads_aggregator
def update(items):
    db = get_db()
    for item in items:
        try:
            # update_one is the modern replacement for the deprecated Collection.update
            db.products.update_one(
                {
                    'source': item.get('source'),
                    'ad_id': item.get('ad_id')
                },
                {
                    "$set": {
                            'language_override': item.get('language_override'),
                            'ad_page_link': item.get('ad_page_link'),
                            'last_update': item.get('last_update'),
                            'title': item.get('title'),
                            'description': item.get('description'),
                            'brand': item.get('brand'),
                            'model': item.get('model'),
                            'year': item.get('year'),
                            'body_type': item.get('body_type'),
                            'mileage': item.get('mileage'),
                            'engine': item.get('engine'),
                            'power': item.get('power'),
                            'specs': item.get('specs'),
                            'transmission': item.get('transmission'),
                            'fuel': item.get('fuel'),
                            'condition': item.get('condition'),
                            'color': item.get('color'),
                            'price': item.get('price'),
                            'image_link': item.get('image_link'),
                            'location': item.get('location'),
                            'active': item.get('active'),
                            'keyFeatures': item.get('keyFeatures'),
                            'features': item.get('features'),
                            'tags': item.get('tags'),
                            'variants': item.get('variants'),
                            'ad_cat': item.get('ad_cat')
                    }
                }
            )
            util.loginfo("ad(s) updated successfully")
        except Exception as e:
            util.logerr(str(e))
Code Example #15
    def checkIfModelsExist(self):
        import os.path
        util.loginfo("-------------- checking if models exist --------------")
        model_dir = (settings[settings['ENVIRONMENT']]['ML_FOLDER_PATH'] +
                     'classifier/model/')
        for classifier_name, classifier in self.classifiers_list:
            if not os.path.isfile(model_dir + classifier_name + '.pickle'):
                return False
            if not os.path.isfile(model_dir + 'words_set_' +
                                  classifier_name + '.txt'):
                return False
        if not os.path.isfile(model_dir + 'stop_words_set.txt'):
            return False
        return True
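checkIfModelsExist() pairs with persistence helpers such as save_classifier_model, which extract_features() calls later in this listing. A sketch using pickle, with the path layout taken from the checks above:

import pickle

def save_classifier_model(self, classifier, classifier_name):
    path = (settings[settings['ENVIRONMENT']]['ML_FOLDER_PATH'] +
            'classifier/model/' + classifier_name + '.pickle')
    with open(path, 'wb') as f:
        pickle.dump(classifier, f)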
Code Example #16
    def __init__(self, classifier_list=None):
        # build the default ensemble inside the body: a mutable default
        # argument would share the same classifier objects across every
        # instance of this class
        if classifier_list is None:
            classifier_list = [
                ('linear_svc_2', SklearnClassifier(LinearSVC())),
                ('log_reg_1_2', SklearnClassifier(LogisticRegression())),
                ('log_reg_2_2', SklearnClassifier(LogisticRegression())),
                ('log_reg_3_2', SklearnClassifier(LogisticRegression())),
                ('multinom_nb_1_2', SklearnClassifier(MultinomialNB())),
                ('multinom_nb_2_2', SklearnClassifier(MultinomialNB())),
                ('multinom_nb_3_2', SklearnClassifier(MultinomialNB())),
                ('linear_svc_3', SklearnClassifier(LinearSVC())),
                ('log_reg_1_3', SklearnClassifier(LogisticRegression())),
                ('log_reg_2_3', SklearnClassifier(LogisticRegression())),
                ('log_reg_3_3', SklearnClassifier(LogisticRegression())),
                ('multinom_nb_1_3', SklearnClassifier(MultinomialNB())),
                ('multinom_nb_2_3', SklearnClassifier(MultinomialNB())),
                ('multinom_nb_3_3', SklearnClassifier(MultinomialNB()))
            ]

        self.classifiers_list = classifier_list
        self.trained_classifiers = []
        self.words_sets = {}
        self.stop_words = []

        self.ad_cats = [
            'SAL', 'PRT', 'TRN', 'SVC', 'EXP', 'REQ', 'ACC', 'EXC', 'OTH',
            'INV'
        ]

        if (self.checkIfModelsExist()):
            util.loginfo(
                "---------------------- reading models ------------------------"
            )
            self.read_models()
        else:
            util.loginfo(
                "-------------- models not there, start training --------------"
            )
            self.train()
Code Example #17
def build_specs_prediction_model():
    util.loginfo(
        "==============================================================")
    ads = get_ads_dataset(
        {
            "body_type": {
                "$ne": ""
            },
            # one $and holds all four clauses: duplicate "$and" keys in a
            # Python dict literal silently overwrite each other
            "$and": [
                {"engine": {"$ne": "0"}},
                {"engine": {"$ne": 0}},
                {"power": {"$ne": 0}},
                {"power": {"$ne": "0"}},
            ],
            "fuel": {
                "$ne": ""
            },
            "specs": {
                "$ne": ""
            }
        }, [
            "brand", "model", "engine", "power", "body_type", "fuel", "price",
            "specs"
        ], "specs")
    build_model(ads, "specs")
    util.loginfo(
        "======================== Done ===============================")
Code Example #18
def update_lang():
    db = dao.get_db_production()
    products = db.products.find({'language_override': {'$exists': False}})
    brands = dao.get_brands()  # fetch once rather than per product
    for product in products:
        try:
            # langdetect returns ISO 639-1 codes ('ar'), while Mongo's
            # language_override field expects a language name
            if detect(product['description']) == 'ar':
                language_override = 'arabic'
            else:
                language_override = 'english'
        except:
            util.loginfo(
                "Couldn't detect language, falling back to English, ad_id " +
                product['ad_id'])
            language_override = 'english'

        brand_name = ""
        try:
            for brand in brands:
                if brand['_id'] == product['brand']:
                    if language_override == "arabic":
                        brand_name = brand.get("name_ar")
                    else:
                        brand_name = brand.get("name")
                    break  # stop at the first matching brand in either language
        except:
            util.loginfo(
                "Skipping one item, error parsing brand, ad_id " +
                product['ad_id'])
            continue
        if brand_name == "":
            util.loginfo(
                "Skipping one item, brand is not found, ad_id " +
                product['ad_id'])
            continue

        product['language_override'] = language_override
        product['tags'] = brand_name

        db.products.replace_one({'_id': product['_id']}, product)  # replace_one supersedes the deprecated save()
Code Example #19
def read(number_of_pages, debug, brands, models, classifier):
    skipped = 0
    SOURCE = "carmudi.com.sa"
    count_of_ads_added = 0
    count_of_ads_updated = 0
    counter_of_ads = 0
    already_loaded_counter = 0
    for i in range(1, number_of_pages):
        util.loginfo(">>Page #" + str(i))
        page_data = util.download_file(settings[SOURCE]['base_url'] +
                                       "/ar/api/web_listing?appliedFilter=%7B%22count%22:20,%22page%22:" + str(
            i) + "%7D",
                                       settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
        if page_data is None:
            continue
        if len(json.loads(page_data)['products']) == 0:
            continue
        for item in json.loads(page_data)['products']:
            counter_of_ads += 1
            util.loginfo(">Ad # " + str(counter_of_ads))

            # 1- ad_data_link, mandatory
            try:
                ad_data_link = "/ar/api/product_details?lang=ar&product_id=" + str(item['id'])
            except:
                util.loginfo("Skipping one item, error parsing ad_data_link")
                skipped += 1
                continue

            ad_data_content = util.download_file(settings[SOURCE]['base_url'] + ad_data_link,
                                                 settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])

            if ad_data_content is None:
                util.loginfo("Skipping one item, error downloading ad_page, ad_data_link" + ad_data_link)
                skipped += 1
                continue

            ad_json = json.loads(ad_data_content)['products']

            # 2- ad_update_date, mandatory; strip ordinal day suffixes
            # ("1st", "2nd", "3rd", "4th") so strptime can parse "%d %b %Y"
            try:
                for suffix in ('th', 'nd', 'st', 'rd'):
                    ad_json['created_at'] = ad_json['created_at'].replace(suffix, '')
                ad_update_date = datetime.strptime(ad_json['created_at'], "%d %b %Y")
            except:
                util.loginfo("Skipping one item, error parsing ad_update_date, ad_data_link" + ad_data_link)
                skipped += 1
                continue

            # 3- ad_id, mandatory
            try:
                ad_id = ad_json['id']
            except:
                util.loginfo("Skipping one item, error parsing ad_id, ad_data_link" + ad_data_link)
                skipped += 1
                continue

            # 4- title, mandatory
            try:
                title = ad_json['make'] + " " + ad_json['model'] + " " + ad_json['year']
            except:
                util.loginfo("Skipping one item, error parsing title, ad_data_link" + ad_data_link)
                skipped += 1
                continue

            # 5- description, mandatory
            try:
                description = BeautifulSoup(ad_json['description'], 'html.parser').text  # remove all tags
            except:
                util.loginfo("Skipping one item, error parsing body, ad_data_link" + ad_data_link)
                skipped += 1
                continue

            # 6- brand, mandatory because the ad page link depends on it
            try:
                brand = util.find_brand(ad_json['make'], brands)
            except:
                util.loginfo("Skipping one item, error parsing brand, ad_data_link" + ad_data_link)
                skipped += 1
                continue
            if brand is None:
                util.loginfo("Skipping one item, brand is not found, ad_data_link" + ad_data_link)
                skipped += 1
                continue

            # 7- model, mandatory
            try:
                model, found = util.find_model(ad_json['model'].capitalize(), models)
            except:
                util.loginfo("Skipping one item, error parsing model, ad_data_link" + ad_data_link)
                skipped += 1
                continue

            # 8- year, mandatory
            try:
                year = int(util.toArabicNumerals(ad_json['year']))
            except:
                util.loginfo("Skipping one item, error parsing year, ad_data_link" + ad_data_link)
                skipped += 1
                continue

            # 9- body_type, optional; default to "" when parsing fails
            try:
                body_type = ad_json['body_type']
            except:
                body_type = ""

            # 10- mileage, optional; default to 0 when parsing fails
            try:
                mileage = re.findall(r'\d+', ad_json['mileage'])[0]
            except:
                mileage = 0

            # 11- engine, optional; default to 0 when parsing fails
            try:
                engine = re.findall(r'\d+', ad_json['engine'])[0]
            except:
                engine = 0

            # 12- power, optional; default to 0 when parsing fails
            try:
                power = re.findall(r'\d+', ad_json['power'])[0]
            except:
                power = 0

            # 13- specs [GCC, EU, etc.], optional; default to "" when parsing fails
            try:
                specs = ad_json['specs']
            except:
                specs = ""

            # 14- transmission, optional; default to "" when parsing fails
            try:
                transmission = ad_json['transmission']
            except:
                transmission = ""

            # 15- fuel [Gasoline/Diesel], optional; default to "" when parsing fails
            try:
                fuel = ad_json['fuel']
            except:
                fuel = ""

            # 16- condition [Used/New], optional; default to "" when parsing fails
            try:
                condition = ad_json['condition']
            except:
                condition = ""

            # 17- color, optional; default to "" when parsing fails
            try:
                color = ad_json['color']
            except:
                color = ""

            # 18- price, mandatory in practice: the item is skipped when parsing fails
            try:
                price = ad_json['price']
                # keep printable characters only, then strip separators;
                # ''.join(filter(...)) works on both Python 2 and 3
                price_numeric_value = int(''.join(
                    filter(lambda x: x in set(string.printable),
                           price)).strip('.. ').replace(',', ''))
            except:
                util.loginfo("Skipping one item, error parsing price, ad_data_link" + ad_data_link)
                skipped += 1
                continue

            # 19- image_link, mandatory in practice: the item is skipped when parsing fails
            try:
                image_link = "https://s3-us-west-2.amazonaws.com/carmudi-site/products/360/" + ad_json['images'][0]
            except:
                util.loginfo("Skipping one item, error parsing image_link, ad_data_link" + ad_data_link)
                skipped += 1
                continue

            # 20- location, optional; default to "" when parsing fails
            try:
                location = ad_json['location'].capitalize()
            except:
                location = ""

            # 21- ad_page_link
            ad_page_link = "/ar/product-detail/" + str(year) + "-" + brand['name'].lower() + "-" + \
                           model.lower() + "-in-" + location.lower() + "-" + str(price) + "-" + str(ad_id)

            try:
                # langdetect returns ISO 639-1 codes ('ar'), while Mongo's
                # language_override field expects a language name
                if detect(description) == 'ar':
                    language_override = 'arabic'
                else:
                    language_override = 'english'
            except:
                util.loginfo("Couldn't detect language, falling back to English, ad_page_link" + ad_page_link)
                language_override = 'english'

            brand_name = ""

            ad_to_save_or_update = {
                'source': SOURCE,
                'ad_id': ad_id,
                'language_override': language_override,
                'ad_page_link': settings[SOURCE]['base_url'] + ad_page_link,
                'last_update': ad_update_date,
                'title': title,
                'nameLower': title.lower(),
                'description': description,
                'brand': ObjectId(brand.get('_id')),
                'model': model,
                'year': year,
                'body_type': body_type,
                'mileage': mileage,
                'engine': engine,
                'power': power,
                'specs': specs,
                'transmission': transmission,
                'fuel': fuel,
                'condition': condition,
                'color': color,
                'price': price,
                'image_link': image_link,
                'location': location,
                'active': True,
                'keyFeatures': [],
                'features': [],
                'tags': brand_name,
                'variants': [{'image': image_link,
                              'price': price_numeric_value}]}

            # is_raw_ad=True indicates the ad needs processing to extract features;
            # an ad ready for classification has the format ([fet1, fet2, ...], cat)
            vote, confidence = classifier.classify(ad_to_save_or_update, is_raw_ad=True)
            # sleep(10)
            if vote != 'INV':
                if confidence >= 0.3:
                    if vote == 'SAL':
                        ad_to_save_or_update['ad_cat'] = vote
                    else:
                        if brand == "": del ad_to_save_or_update['brand']
                        ad_to_save_or_update['ad_cat'] = vote
                else:
                    util.loginfo("Skipping one item, not confident classification, ad_page_link" + ad_page_link)
                    skipped += 1
                    continue
            else:
                util.loginfo("Skipping one item, classified INVALID, ad_page_link" + ad_page_link)
                skipped += 1
                continue

            ad_to_save_or_update['tags'] = calculate_tags(brand, model, ad_to_save_or_update, models)

            # TODO rename all products to ads
            if dao.product_exists_in_db({'source': SOURCE, 'ad_id': ad_id}):
                if dao.product_exists_in_db(ad_to_save_or_update):
                    util.loginfo("source " + SOURCE + " ad_id " + str(ad_id) + " is already in the database!")
                else:
                    util.loginfo("updating " + SOURCE + ad_data_link)
                    dao.update([ad_to_save_or_update])
                    count_of_ads_updated += 1
            else:
                util.loginfo("adding " + SOURCE + ad_data_link)
                dao.add_list_to_db([ad_to_save_or_update])
                count_of_ads_added += 1

            if settings['ENVIRONMENT'] == "Production":
                wait = randint(5, 20)
                util.loginfo("waiting for " + str(wait) + " seconds before reading next item")
                sleep(wait)

    return count_of_ads_added, count_of_ads_updated, skipped, counter_of_ads
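This reader, like the others below, depends on util.download_file(url, folder), which is not shown. A plausible sketch using requests; returning None on failure matches how every call site skips the item (the folder argument is accepted for caching but left unused in this sketch):

import requests

def download_file(url, folder):
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None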
Code Example #20
def read(number_of_pages, debug, brands, models, classifier):
    skipped = 0
    SOURCE = "olx.sa.com"
    count_of_ads_added = 0
    count_of_ads_updated = 0
    counter_of_ads = 0
    already_loaded_counter = 0
    for i in range(1, number_of_pages):
        util.loginfo(">>Page #" + str(i))
        page_content = util.download_file(
            settings[SOURCE]['base_url'] + "/vehicles/cars/?page=" + str(i),
            settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
        if page_content is None:
            continue

        catalog_listing_items = BeautifulSoup(
            page_content, 'html.parser').find_all('div', class_="ads__item")

        # TODO add ad title, ad link at its source, update date, make, model, year, location, mileage, body type, seller type, options,
        # TODO add transmission, asked price, cash or installment, license validity, number of prev owners, main image, additional images, description
        # TODO decide which are mandatory and which are optional; if optional, decide on default data to fill in
        for item in catalog_listing_items:
            counter_of_ads += 1
            util.loginfo(">Ad # " + str(counter_of_ads))
            try:
                #TODO language should be configurable

                ad_page_link = item.find(
                    'a', class_="ads__item__title").attrs['href']
            except:
                util.loginfo("Skipping one item, error parsing ad_page_link")
                skipped += 1
                continue

            ad_page_content = util.download_file(
                ad_page_link,
                settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])

            if ad_page_content is None:
                skipped += 1
                continue

            try:
                ad_key_details = BeautifulSoup(ad_page_content,
                                               'html.parser').find_all(
                                                   'div',
                                                   class_="clr offerbody")[0]
            except:
                util.loginfo(
                    "Skipping one item, error parsing ad_key_details, ad_page_link"
                    + ad_page_link)
                skipped += 1
                continue
            try:
                ad_update_date = ad_key_details.find_all(
                    'span',
                    class_="pdingleft10 brlefte5")[0].next.strip().split(" ")
                ad_update_date = datetime.strptime(
                    ad_update_date[7][:-1] + "-" +
                    util.find_month(ad_update_date[6]) + "-" +
                    ad_update_date[5], "%Y-%m-%d")
            except:
                util.loginfo(
                    "Skipping one item, error parsing ad_update_date, ad_page_link"
                    + ad_page_link)
                skipped += 1
                continue

            try:
                ad_id = ad_key_details.find_all(
                    'span', class_="rel inlblk")[0].next.strip()
            except:
                util.loginfo(
                    "Skipping one item, error parsing ad_id, ad_page_link" +
                    ad_page_link)
                skipped += 1
                continue

            try:
                title = BeautifulSoup(
                    (item.find_all('a',
                                   class_="ads__item__title")[0].text).strip(),
                    'html.parser').text
            except:
                util.loginfo(
                    "Skipping one item, error parsing title, ad_page_link" +
                    ad_page_link)
                skipped += 1
                continue

            try:
                description = BeautifulSoup(
                    ad_key_details.find_all(
                        'p', class_="pding10 lheight20 large")[0].text.strip(),
                    'html.parser').text  #remove all tags
            except:
                util.loginfo(
                    "Skipping one item, error parsing body, ad_page_link" +
                    ad_page_link)
                skipped += 1
                continue

            try:
                # <p class="ads__item__breadcrumbs">سيارات » هيونداي</p>
                # https: // olx.sa.com / ad / honda - civic - 2004 - ID6NhJW.html
                brand = util.find_brand(
                    item.find_all('p', class_="ads__item__breadcrumbs")
                    [0].text.strip().split(' ')[2], brands)
                # brand = util.find_brand(ad_page_link.strip().split('/')[4].split('-')[0], brands)
            except:
                brand = ""

            if brand == None: brand = ""

            # TODO create a lookup for this
            try:
                model, found = util.find_model(
                    ad_key_details.find_all('td',
                                            class_="value")[0].text.strip(),
                    models)
            except:
                model = ""
                found = False  # avoid NameError in the check below

            if not found: model = ""

            try:
                year = int(
                    util.toArabicNumerals(
                        ad_key_details.find_all(
                            'td', class_="value")[2].text.strip()))
            except:
                year = 0

            body_type = ""  # not provided by this source

            try:
                mileage = ad_key_details.find_all(
                    'td', class_="value")[3].text.strip()
            except:
                mileage = 0

            # TODO some features are not offered by this source, therefore we need ML
            # TODO suggested algo: find similar brand, model, year instances with power values and take the mode
            # TODO returning zero for now; if we show this value on the website we need to indicate that it is
            # TODO a predicted value and does not exist in the original ad
            engine = 0

            # TODO find similar instances; if three instances are found then fetch power for this car, otherwise return zero
            power = 0

            specs = ""

            try:
                transmission = (ad_key_details.find_all(
                    'td', class_="value")[1].text).strip()
            except:
                transmission = ""

            fuel = ""

            condition = ""

            color = ""

            try:
                price = item.find(
                    'p', class_="ads__item__price price ").text.strip()
                # ''.join(filter(...)) works on both Python 2 and 3
                price_numeric_value = int(''.join(
                    filter(lambda x: x in set(string.printable),
                           price)).strip('.. ').replace(',', ''))
            except:
                price = 0
                price_numeric_value = 0

            try:
                image_link = item.find_all(
                    'img', class_="ads__item__photos")[0].attrs['src'].strip()
            except:
                util.loginfo(
                    "Skipping one item, error parsing image_link, ad_page_link"
                    + ad_page_link)
                skipped += 1
                continue

            try:
                location = item.find(
                    'p', class_='ads__item__location').text.strip()
            except:
                location = ""

            try:
                # langdetect returns ISO 639-1 codes ('ar'), while Mongo's
                # language_override field expects a language name
                if detect(description) == 'ar':
                    language_override = 'arabic'
                else:
                    language_override = 'english'
            except:
                util.loginfo(
                    "Couldn't detect language, falling back to English, ad_page_link"
                    + ad_page_link)
                language_override = 'english'

            brand_name = ""

            # TODO add a tags field to the ads; these tags will take priority in search and will contain
            # TODO brand, model, color, engine, etc. all the features that I would want to search with and
            # TODO categorize by as well; possibly I will make them lookups.

            ad_to_save_or_update = {
                'source': SOURCE,
                'ad_id': ad_id,
                'language_override': language_override,
                'ad_page_link': ad_page_link,
                'last_update': ad_update_date,
                'title': title,
                'nameLower': title.lower(),
                'description': description,
                'brand': brand if brand == "" else ObjectId(brand.get('_id')),
                'model': model,
                'year': year,
                'body_type': body_type,
                'mileage': mileage,
                'engine': engine,
                'power': power,
                'specs': specs,
                'transmission': transmission,
                'fuel': fuel,
                'condition': condition,
                'color': color,
                'price': price,
                'image_link': image_link,
                'location': location,
                'active': True,
                'keyFeatures': [],  #TODO remove this, not used
                'features': [],  #TODO remove this, not used
                'tags': brand_name,
                'variants': [{
                    'image': image_link,
                    'price': price_numeric_value
                }]
            }

            # is_raw_ad=True indicates the ad needs processing to extract features;
            # an ad ready for classification has the format ([fet1, fet2, ...], cat)

            vote, confidence = classifier.classify(ad_to_save_or_update, True)
            # sleep(10)
            if vote != 'INV':
                if confidence >= 0.3:
                    if vote == 'SAL':
                        # must have brand
                        if ad_to_save_or_update['brand'] == "":
                            util.loginfo(
                                "Skipping one item, SAL and brand is not found, ad_page_link"
                                + ad_page_link)
                            skipped += 1
                            continue
                        # must have model
                        if ad_to_save_or_update['model'] == "":
                            util.loginfo(
                                "Skipping one item, SAL and model is not found, ad_page_link"
                                + ad_page_link)
                            skipped += 1
                            continue
                        # must have year
                        # if ad_to_save_or_update['year'] == 0:
                        #     util.loginfo("Skipping one item, SAL and year is not found, ad_page_link" + ad_page_link);
                        #     skipped += 1
                        #     continue
                    else:
                        if brand == "": del ad_to_save_or_update['brand']
                    ad_to_save_or_update['ad_cat'] = vote
                else:
                    util.loginfo(
                        "Skipping one item, not confident classification, ad_page_link"
                        + ad_page_link)
                    skipped += 1
                    continue
            else:
                util.loginfo(
                    "Skipping one item, classified INVALID, ad_page_link" +
                    ad_page_link)
                skipped += 1
                continue

            ad_to_save_or_update['tags'] = calculate_tags(
                brand, model, ad_to_save_or_update, models)

            # TODO rename all products to ads
            if dao.product_exists_in_db({'source': SOURCE, 'ad_id': ad_id}):
                if dao.product_exists_in_db(ad_to_save_or_update):
                    util.loginfo("source " + SOURCE + " ad_id " + str(ad_id) +
                                 " is already in the database!")
                else:
                    util.loginfo("updating " + SOURCE + ad_page_link)
                    dao.update([ad_to_save_or_update])
                    count_of_ads_updated += 1

            else:
                util.loginfo("adding " + SOURCE + ad_page_link)
                dao.add_list_to_db([ad_to_save_or_update])
                count_of_ads_added += 1

            if settings['ENVIRONMENT'] == "Production":
                wait = randint(5, 20)
                util.loginfo("waiting for " + str(wait) +
                             " seconds before reading next item")
                sleep(wait)

    return count_of_ads_added, count_of_ads_updated, skipped, counter_of_ads
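calculate_tags is called by every reader but is not included in this listing. A minimal sketch that joins the searchable attributes into one tag string, in the spirit of the TODO notes above; the exact fields chosen are an assumption:

def calculate_tags(brand, model, ad, models):
    # brand is either "" or a brand document from dao.get_brands()
    parts = [model, str(ad.get('year', '')), ad.get('color', ''),
             ad.get('body_type', '')]
    if brand:
        parts.insert(0, brand.get('name', ''))
    return ' '.join(p for p in parts if p)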
Code Example #21
def main(number_of_pages, debug, source, brands, models, classifier):
    log_file_name = settings[settings['ENVIRONMENT']]['LOG_FOLDER_PATH'] + str(
        date.today())
    util.loginfo(
        "==============================================================")
    util.loginfo(" Reading from " + source + " .... # pages: " +
                 str(number_of_pages) + " debug: " + str(debug))
    util.logtofile(
        log_file_name, "Reading from " + source + " .... # pages: " +
        str(number_of_pages) + " debug: " + str(debug))
    util.loginfo(
        "==============================================================")
    # TODO check whether reading basic data without the description is enough to speed up updates

    if source == SOURCE_CARMUDI:
        reader = reader_carmudi
    elif source == SOURCE_HATLA2EE:
        reader = reader_hatla2ee
    elif source == SOURCE_OLX:
        reader = reader_olx
    elif source == SOURCE_HARAJ:
        reader = reader_haraj
    else:
        raise ValueError("unknown source: " + source)

    count_of_ads_added, count_of_ads_updated, skipped, counter_of_ads = reader.read(
        number_of_pages, debug, brands, models, classifier)

    util.loginfo("For " + source + " number of ads to add: " +
                 str(count_of_ads_added))
    util.logtofile(
        log_file_name,
        "For " + source + " number of ads to add: " + str(count_of_ads_added))
    util.loginfo("For " + source + " number of ads to update: " +
                 str(count_of_ads_updated))
    util.logtofile(
        log_file_name, "For " + source + " number of ads to update: " +
        str(count_of_ads_updated))
    util.logtofile(
        log_file_name,
        "For " + source + " number of ads to skipped: " + str(skipped))
    util.logtofile(
        log_file_name,
        "For " + source + " total number of ads : " + str(counter_of_ads))

    util.loginfo(
        "======================== Done ===============================")
Code Example #22
    def extract_features(self):
        # read ads samples from training database
        min_word_length = 2
        min_word_freq = 5
        no_training_rounds = 5
        all_stop_words = self.get_stopwords()

        raw_list_of_ads = self.read_data_from_db()

        # build ad_text for each labeled ad; the link slug is appended only
        # for haraj.com.sa ads, so only that part sits inside the ternary
        raw_list_of_ads = [({
            'ad_text':
            ad['description'] + " " + ad['title'] + " " +
            ad['source'].replace('.com', '').replace('.sa', '').replace(
                '.ksa', '') +
            (" " + ad['ad_page_link'][:-1].rsplit('/', 1)[-1]
             if ad['source'] == 'haraj.com.sa' else "")
        }, ad['ad_cat']) for ad in raw_list_of_ads]

        for (classifier_name, classifier) in self.classifiers_list:
            accuracy_ary = []
            best_accuracy = 0
            util.loginfo("----------------" + classifier_name +
                         " training ----------------")
            all_words_set = []
            for i in range(0, len(self.words_sets[classifier_name])):
                # index instead of pop: popping shrinks the list mid-loop,
                # skipping candidates and eventually raising IndexError
                all_words_set = self.words_sets[classifier_name][i]
                random.shuffle(raw_list_of_ads)
                raw_train_rows = int(math.floor(0.7 * len(raw_list_of_ads)))
                raw_train_set, raw_test_set = raw_list_of_ads[:raw_train_rows], raw_list_of_ads[
                    raw_train_rows:]
                train_list_of_ads = [(self.generate_feature(ad, all_words_set),
                                      cat) for (ad, cat) in raw_train_set]
                test_list_of_ads = [(self.generate_feature(ad,
                                                           all_words_set), cat)
                                    for (ad, cat) in raw_test_set]

                classifier = classifier.train(train_list_of_ads)

                accuracy = nltk.classify.accuracy(classifier, test_list_of_ads)

                if accuracy > best_accuracy:
                    util.logdebug("found new best accuracy " + str(accuracy))
                    best_accuracy = accuracy
                    best_classifier = classifier
                    best_words_set = all_words_set
                    self.words_sets[classifier_name] = all_words_set

            util.loginfo("best accuracy : " + str(best_accuracy))

            self.save_classifier_model(best_classifier, classifier_name)
            self.trained_classifiers.append((classifier_name, best_classifier))
            self.save_word_set(best_words_set, classifier_name)
            self.words_sets[classifier_name] = best_words_set

        # self.save_stop_words_set(all_stop_words)
        # self.stop_words = all_stop_words
        util.loginfo(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
        )
        util.loginfo("voteclassifier accuracy is:" +
                     str(nltk.classify.accuracy(self, test_list_of_ads)))
        util.loginfo(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
        )
        util.logdebug("read models")
        self.read_models()
        util.loginfo(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
        )
        util.loginfo("voteclassifier accuracy is:" +
                     str(nltk.classify.accuracy(self, test_list_of_ads)))
        util.loginfo(
            "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
        )
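extract_features() also calls save_word_set, the counterpart to the words_set_<name>.txt files that checkIfModelsExist() looks for. A sketch that writes one word per line:

def save_word_set(self, words_set, classifier_name):
    path = (settings[settings['ENVIRONMENT']]['ML_FOLDER_PATH'] +
            'classifier/model/words_set_' + classifier_name + '.txt')
    with open(path, 'w') as f:
        for word in words_set:
            f.write(word + '\n')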
Code Example #23
def read(number_of_pages, debug, brands, models, classifier):
    skipped = 0
    SOURCE = "ksa.hatla2ee.com"
    count_of_ads_added = 0
    count_of_ads_updated = 0
    counter_of_ads = 0
    for i in range(1, number_of_pages):
        util.loginfo(">>Page #" + str(i))
        page_content = util.download_file(
            settings[SOURCE]['base_url'] + "/ar/car/page/" + str(i),
            settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
        if page_content is None:
            continue

        catalog_listing_items = BeautifulSoup(
            page_content, 'html.parser').find_all('div', class_="CarListUnit")

        # TODO add ad title, ad link at its source, update date, make, model, year, location, mileage, body type, seller type, options,
        # TODO add transmission, asked price, cash or installment, license validity, number of prev owners, main image, additional images, description
        # TODO decide which are mandatory and which are optional; if optional, decide on default data to fill in
        for item in catalog_listing_items:
            counter_of_ads += 1
            util.loginfo(">Ad # " + str(counter_of_ads))
            try:
                #TODO language should be configurable
                ad_page_link = (item.find_all(
                    'a', class_="NewListTitle")[0].attrs['href']).strip()
            except:
                util.loginfo("Skipping one item, error parsing ad_page_link")
                skipped += 1
                continue

            ad_page_content = util.download_file(
                settings[SOURCE]['base_url'] + ad_page_link,
                settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
            if ad_page_content is None:
                skipped += 1
                continue

            try:
                ad_key_details = BeautifulSoup(
                    ad_page_content,
                    'html.parser').find_all('div',
                                            class_="nUnitKeyDetailsContent")
            except:
                util.loginfo(
                    "Skipping one item, error parsing ad_key_details, ad_page_link"
                    + ad_page_link)
                skipped += 1
                continue

            try:
                ad_update_date = datetime.strptime(
                    item.find_all('span',
                                  class_="NewListDate")[0].next.strip(),
                    "%Y-%m-%d")
            except:
                util.loginfo(
                    "Skipping one item, error parsing ad_update_date, ad_page_link"
                    + ad_page_link)
                skipped += 1
                continue

            try:
                ad_id = (item.find_all(
                    'div', class_="favorit")[0].attrs['data-carid']).strip()
            except:
                util.loginfo(
                    "Skipping one item, error parsing ad_id, ad_page_link" +
                    ad_page_link)
                skipped += 1
                continue

            try:
                title = BeautifulSoup(
                    (item.find_all('a',
                                   class_="NewListTitle")[0].text).strip(),
                    'html.parser').text
            except:
                util.loginfo(
                    "Skipping one item, error parsing title, ad_page_link" +
                    ad_page_link)
                skipped += 1
                continue

            try:
                desc_items = (item.find_all(
                    'ul', class_="NewListSpecifications")[0].find_all('li'))
                description = desc_items[0].find_all('a')[0].text.strip() + ' ' \
                       + desc_items[1].find_all('a')[0].text.strip() + ' ' \
                       + desc_items[2].find_all('span')[0].text.strip() + ' ' \
                       + desc_items[3].find_all('a')[0].text.strip() + ' ' \
                       + desc_items[4].find_all('span')[0].text.strip()
                description = BeautifulSoup(
                    description, 'html.parser').text  #remove all tags
            except:
                util.loginfo(
                    "Skipping one item, error parsing body, ad_page_link" +
                    ad_page_link)
                skipped += 1
                continue

            # TODO configure
            try:
                # <a href="/ar/car/hyundai/elantra/1592640">
                brand = util.find_brand(
                    ((item.find_all('a')[0]['href']).strip()).split('/')[3],
                    brands)
            except:
                brand = ""

            if brand == None: brand = ""

            try:
                model, found = util.find_model(
                    ((item.find_all('a')[0]['href']).strip()
                     ).split('/')[4].capitalize(), models)
            except:
                model = ""
                found = False  # avoid NameError in the check below
            if not found: model = ""

            try:
                year = int(
                    util.toArabicNumerals(
                        (item.find_all('span',
                                       class_="muted")[0].text).strip()))
            except:
                year = 0

            # TODO configure
            try:
                body_type = (ad_key_details[0].find_all(
                    'strong', class_="nUnitItem")[8].next).strip()
            except:
                body_type = ""

            # TODO configure
            try:
                mileage = (ad_key_details[0].find_all(
                    'strong', class_="nUnitItem")[0].next).strip()
            except:
                mileage = 0

            # TODO some features are not offered by hatla2ee, therefore we need ML
            # TODO suggested algo: find similar brand, model, year instances with power values and take the mode
            # TODO returning zero for now; if we show this value on the website we need to indicate that it is
            # TODO a predicted value and does not exist in the original ad

            # TODO zero needs to be shown as -- on shofle_web

            # TODO optional
            engine = 0
            # engine = predicter.query_engin_prediction_model()

            # TODO find similar instances; if three instances are found then fetch power for this car, otherwise return zero
            power = 0

            specs = ""

            try:
                transmission = (ad_key_details[0].find_all(
                    'strong', class_="nUnitItem")[6].next).strip()
            except:
                transmission = ""

            try:
                fuel = (ad_key_details[0].find_all(
                    'strong', class_="nUnitItem")[7].next).strip()
            except:
                fuel = ""

            condition = ""  # not provided by this source

            try:
                color = (ad_key_details[0].find_all(
                    'strong', class_="nUnitItem")[4].next).strip()
            except:
                color = ""

            try:
                price = (item.find_all('a',
                                       class_="NewListPrice")[0].text).strip()
                # ''.join(filter(...)) works on both Python 2 and 3
                price_numeric_value = int(''.join(
                    filter(lambda x: x in set(string.printable),
                           price)).strip('.. ').replace(',', ''))
            except:
                price = 0
                price_numeric_value = 0

            try:
                image_link = (item.find_all(
                    'img', class_="lazy imgfit")[0]['data-original']).strip()
            except:
                util.loginfo(
                    "Skipping one item, error parsing image_link, ad_page_link"
                    + ad_page_link)
                skipped += 1
                continue

            try:
                location = (item.find(
                    'a', href=lambda href: href and "city" in href)
                            )['href'].strip().split('/')[4].capitalize()
            except:
                location = ""

            try:
                # langdetect returns ISO 639-1 codes ('ar'), while Mongo's
                # language_override field expects a language name
                if detect(description) == 'ar':
                    language_override = 'arabic'
                else:
                    language_override = 'english'
            except:
                util.loginfo(
                    "Couldn't detect language, falling back to English, ad_page_link"
                    + ad_page_link)
                language_override = 'english'

            brand_name = ""

            # TODO add a tags field to the ads; these tags will take priority in search and will contain
            # TODO brand, model, color, engine, etc. all the features that I would want to search with and
            # TODO categorize by as well; possibly I will make them lookups.

            ad_to_save_or_update = {
                'source': SOURCE,
                'ad_id': ad_id,
                'language_override': language_override,
                'ad_page_link': settings[SOURCE]['base_url'] + ad_page_link,
                'last_update': ad_update_date,
                'title': title,
                'nameLower': title.lower(),
                'description': description,
                'brand': brand if brand == "" else ObjectId(brand.get('_id')),
                'model': model,
                'year': year,
                'body_type': body_type,
                'mileage': mileage,
                'engine': engine,
                'power': power,
                'specs': specs,
                'transmission': transmission,
                'fuel': fuel,
                'condition': condition,
                'color': color,
                'price': price,
                'image_link': image_link,
                'location': location,
                'active': True,
                'keyFeatures': [],  #TODO remove this, not used
                'features': [],  #TODO remove this, not used
                'tags': brand_name,
                'variants': [{
                    'image': image_link,
                    'price': price_numeric_value
                }]
            }

            # is_raw_ad=True indicates that the ad needs to be processed to extract features;
            # an ad that is ready for classification should be in the format: ([fet1, fet2, ...], cat)
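            # A rough sketch (hypothetical words) of the two input shapes:
            #   raw:      classifier.classify(ad_dict, is_raw_ad=True)          # ad_dict carries title/description/source
            #   prepared: classifier.classify({'toyota': True, 'camry': True})  # word-presence feature dict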

            vote, confidence = classifier.classify(ad_to_save_or_update,
                                                   is_raw_ad=True)
            # sleep(10)
            if vote != 'INV':
                if confidence >= 0.3:
                    if vote == 'SAL':
                        # must have brand
                        if ad_to_save_or_update['brand'] == "":
                            util.loginfo(
                                "Skipping one item, SAL and brand is not found, ad_page_link: "
                                + ad_page_link)
                            skipped += 1
                            continue
                        # must have model
                        if ad_to_save_or_update['model'] == "":
                            util.loginfo(
                                "Skipping one item, SAL and model is not found, ad_page_link: "
                                + ad_page_link)
                            skipped += 1
                            continue
                        # must have year
                        # if ad_to_save_or_update['year'] == 0:
                        #     util.loginfo("Skipping one item, SAL and year is not found, ad_page_link" + ad_page_link);
                        #     skipped += 1
                        #     continue
                        # estimated_price = price_predicter.query_price_prediction_model([ad_to_save_or_update])
                        # print("Predicted price value is " + str(estimated_price))
                        # ad_to_save_or_update['price'] = estimated_price[0]

                    else:
                        if brand == "": del ad_to_save_or_update['brand']
                    ad_to_save_or_update['ad_cat'] = vote
                else:
                    util.loginfo(
                        "Skipping one item, not confident classification, ad_page_link: "
                        + ad_page_link)
                    skipped += 1
                    continue
            else:
                util.loginfo(
                    "Skipping one item, classified INVALID, ad_page_link: " +
                    ad_page_link)
                skipped += 1
                continue

            ad_to_save_or_update['tags'] = calculate_tags(
                brand, model, ad_to_save_or_update, models)
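            # calculate_tags presumably flattens the searchable attributes into
            # one list, e.g. (hypothetical) ['toyota', 'camry', '2015', 'white']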

            # TODO rename all products to ads
            if dao.product_exists_in_db({'source': SOURCE, 'ad_id': ad_id}):
                if dao.product_exists_in_db(ad_to_save_or_update):
                    util.loginfo("source " + SOURCE + " ad_id " + str(ad_id) +
                                 " is already in the database!")
                else:
                    util.loginfo("updating " + SOURCE + ad_page_link)
                    dao.update([ad_to_save_or_update])
                    count_of_ads_updated += 1

            else:
                util.loginfo("adding " + SOURCE + ad_page_link)
                dao.add_list_to_db([ad_to_save_or_update])
                count_of_ads_added += 1

            if settings['ENVIRONMENT'] == "Production":
                wait = randint(5, 20)
                util.loginfo("waiting for " + str(wait) +
                             " seconds before reading next item")
                sleep(wait)

    return count_of_ads_added, count_of_ads_updated, skipped, counter_of_ads
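
# dao.product_exists_in_db and dao.update are called above but are not shown in
# these snippets. A minimal sketch of what the existence check might look like,
# assuming the same pymongo 'products' collection that dao.add_list_to_db
# writes to (a hypothetical reimplementation, not the project's actual code):
def product_exists_in_db_sketch(query):
    from dao import get_db  # assumes dao exposes its get_db() helper
    db = get_db()
    # find_one returns None when nothing matches; passing the full ad document
    # as the query doubles as an "already up to date" comparison
    return db.products.find_one(query) is not None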
コード例 #24
0
def main():
    util.loginfo(
        "==============================================================")
コード例 #25
0
def read(number_of_pages, debug, brands, models, classifier):
    skipped = 0
    SOURCE = "haraj.com.sa"
    count_of_ads_added = 0
    count_of_ads_updated = 0
    counter_of_ads = 0
    already_loaded_counter = 0
    for i in range(1, number_of_pages):
        util.loginfo("number of pages is" + str(number_of_pages))
        util.loginfo(">>Page #" + str(i))
        page_content = util.download_file(settings[SOURCE]['base_url'] +
                                          "/jsonGW/getadsx.php?link=tags/%D8%AD%D8%B1%D8%A7%D8%AC%20%D8%A7%D9%84%D8%B3%D9%8A%D8%A7%D8%B1%D8%A7%D8%AA/" + str(
            i) + "&_=1512689531762",
                                          settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
        if page_content is None:
            continue

        catalog_listing_items = BeautifulSoup(page_content, 'html.parser').find_all('div', class_="adx")

        # for item in catalog_items:
        for item in catalog_listing_items:
            counter_of_ads += 1
            util.loginfo(">Ad # " + str(counter_of_ads))
            try:
                ad_page_link = (item.find_all('a')[0].attrs['href']).strip()
            except:
                util.loginfo("Skipping one item, error parsing ad_page_link")
                skipped += 1
                continue

            ad_page_content = util.download_file(ad_page_link,
                                                 settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])

            if ad_page_content is None:
                skipped += 1
                continue

            try:
                ad_page_content = BeautifulSoup(ad_page_content, 'html.parser').find_all('div', class_="pageContent")[0]
            except:
                util.loginfo("Skipping one item, error parsing ad_page_content, ad_page_link: " + ad_page_link)
                skipped += 1
                continue

            try:
                # the third "extra info" block holds a relative timestamp in
                # Arabic ('ساعه' = hour, 'يوم' = day); map it to a date
                time_text_ary = ad_page_content.find_all('div', class_="adxExtraInfoPart")[2].text.strip().split()
                if len(time_text_ary) == 3:
                    # short form: posted today
                    ad_update_date = datetime(datetime.today().year, datetime.today().month, datetime.today().day)
                elif len(time_text_ary) == 5 or len(time_text_ary) == 6:
                    if time_text_ary[2].find(u'ساعه') != -1 \
                            or time_text_ary[1].find(u'ساعه') != -1 \
                            or time_text_ary[1].find(u'يوم') != -1:
                        # hours ago (or a single day): treat as today
                        ad_update_date = datetime(datetime.today().year, datetime.today().month, datetime.today().day)
                    elif time_text_ary[2].find(u'يوم') != -1:
                        # N days ago: subtract N days, then truncate to midnight
                        ad_update_date = datetime.today() - timedelta(days=int(time_text_ary[1]))
                        ad_update_date = datetime(ad_update_date.year, ad_update_date.month, ad_update_date.day)
                    else:
                        ad_update_date = None
                else:
                    ad_update_date = None
                if ad_update_date is None:
                    util.loginfo("Skipping one item, error parsing ad_update_date, ad_page_link: " + ad_page_link)
                    skipped += 1
                    continue
            except:
                util.loginfo("Skipping one item, error parsing ad_update_date, ad_page_link: " + ad_page_link)
                skipped += 1
                continue
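            # e.g. (hypothetical) an ad posted 3 days before a scrape on
            # 2018-01-10 ends up with last_update = datetime(2018, 1, 7)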

            try:
                ad_id = ad_page_content.find_all('div', class_="adxExtraInfoPart")[3].text.strip().strip('#')
            except:
                util.loginfo("Skipping one item, error parsing ad_id, ad_page_link: " + ad_page_link)
                skipped += 1
                continue

            try:
                title = ad_page_content.find_all('h3')[0].text.strip().split()[1]
            except:
                util.loginfo("Skipping one item, error parsing title, ad_page_link: " + ad_page_link)
                skipped += 1
                continue

            try:
                # desc_items = strip_tags(ad_page_content.find_all('div', class_="adxBody")[0].next.strip("<br>").strip('»'))
                description = BeautifulSoup(ad_page_content.find_all('div', class_="adxBody")[0].text,
                                            'html.parser').text.strip()  # remove all tags
            except:
                util.loginfo("Skipping one item, error parsing body, ad_page_link: " + ad_page_link)
                skipped += 1
                continue

            brand = None
            try:
                title_desc_text_ary = (title + " " + description).split()
                for w in title_desc_text_ary:
                    brand = util.find_brand(w, brands)
                    if brand is not None: break
            except:
                brand = ""

            if brand is None: brand = ""

            # initialise before the loop so a failed lookup on this ad cannot
            # reuse a stale value from the previous iteration
            model, found = "", False
            try:
                for w in title_desc_text_ary:
                    model, found = util.find_model(w, models)
                    if found: break
            except:
                model, found = "", False

            if not found: model = ""

            try:
                year = int(ad_page_content.find_all('meta')[0].attrs['content'])
            except:
                year = 0

            # these attributes are not exposed on Haraj listing pages, so they
            # default to empty values
            body_type = ""
            mileage = 0
            engine = 0
            power = 0
            specs = ""
            transmission = ""
            fuel = ""

            try:
                condition = ad_page_content.find_all('meta')[1].attrs['content']
            except:
                condition = ""

            # color and price are likewise not available from this source
            color = ""
            price = 0
            price_numeric_value = 0

            try:
                # equivalent of $('.adxBody>img')[0]['src']
                image_link = ad_page_content.find('div', class_='adxBody').find_all('img')[0].attrs['src']
            except:
                util.loginfo("Skipping one item, error parsing image_link, ad_page_link: " + ad_page_link)
                skipped += 1
                continue

            try:
                location = ad_page_content.find_all('div', class_="adxExtraInfoPart")[0].text.strip()
            except:
                location = ""  # keep the type consistent with the parsed string value

            try:
                # langdetect returns ISO 639-1 codes, where mongo expects
                # ISO 639-3 codes for Arabic
                if detect(description) == 'ar':
                    language_override = 'arabic'
                else:
                    language_override = 'english'
            except:
                util.loginfo("Couldn't detect language, falling back to English, ad_page_link: " + ad_page_link)
                language_override = 'english'

            brand_name = ""

            # TODO add a tags field to the ads; these tags will take priority in search and will contain
            # TODO brand, model, color, engine ...etc, all the features that I would like to search with and
            # TODO will need to categorize with as well; possibly I will make them lookups.

            ad_to_save_or_update = {
                'source': SOURCE,
                'ad_id': ad_id,
                'language_override': language_override,
                'ad_page_link': ad_page_link,
                'last_update': ad_update_date,
                'title': title,
                'nameLower': title.lower(),
                'description': description,
                'brand': brand if brand == "" else ObjectId(brand.get('_id')),
                'model': model,
                'year': year,
                'body_type': body_type,
                'mileage': mileage,
                'engine': engine,
                'power': power,
                'specs': specs,
                'transmission': transmission,
                'fuel': fuel,
                'condition': condition,
                'color': color,
                'price': price,
                'image_link': image_link,
                'location': location,
                'active': True,
                'keyFeatures': [],  # TODO remove this, not used
                'features': [],  # TODO remove this, not used
                'tags': brand_name,
                'variants': [{'image': image_link,
                              'price': price_numeric_value}]
            }

            # is_raw_ad=True indicates that the ad needs to be processed to extract features;
            # an ad that is ready for classification should be in the format: ([fet1, fet2, ...], cat)
            vote, confidence = classifier.classify(ad_to_save_or_update, is_raw_ad=True)
            # sleep(5)
            # In Haraj, and in all readers, brand is mandatory only for SAL. In Haraj, for SAL, if the model
            # is not found the original text value from the ad is accepted, and "" is also accepted as a model.
            # All of this is due to the large amount of noise in the data. TODO link brands to models, collect
            # the various ways of writing models and brands in both Arabic and English to enhance matching,
            # and then reject "" as a model. TODO add year as a mandatory attribute for SAL in Haraj and in
            # all other readers.
            if vote != 'INV':
                if confidence >= 0.6:  # the threshold is a little higher for Haraj due to the high rate of noise
                    if vote == 'SAL':
                        # must have brand
                        if ad_to_save_or_update['brand'] == "":
                            if ad_to_save_or_update['model'] != "":
                                util.logdebug(
                                    "Brand is empty, and model is not, now trying to resolve brand using model")
                                brand = find_brand_by_model(ad_to_save_or_update['model'], models)
                                if brand != "":
                                    util.logdebug("------------- >>> Found brand by model ")
                                    ad_to_save_or_update['brand'] = brand
                                else:
                                    util.loginfo(
                                        "Skipping one item, SAL and brand is not found, ad_page_link: " + ad_page_link)
                                    skipped += 1
                                    continue
                            else:
                                util.loginfo(
                                    "Skipping one item, SAL and brand is not found, ad_page_link: " + ad_page_link)
                                skipped += 1
                                continue

                        # must have model
                        # if ad_to_save_or_update['model'] == "":
                        #     util.loginfo("Skipping one item, SAL and model is not found, ad_page_link" + ad_page_link);
                        #     skipped += 1
                        #     continue

                        if ad_to_save_or_update['brand'] != "":
                            if ad_to_save_or_update['model'] != "":
                                if not brand_and_model_match(brand, model):
                                    util.loginfo(
                                        "Skipping one item, error model and brand don't match, ad_page_link: " + ad_page_link)
                                    skipped += 1
                                    continue
                        else:
                            util.loginfo(
                                "Skipping one item, error model and brand don't match, ad_page_link: " + ad_page_link)
                            skipped += 1
                            continue

                            # engine = predicter.query_engin_prediction_model([ad_to_save_or_update])
                            # print "Predicted Engine value is ", str(engine)

                    else:
                        if brand == "": del ad_to_save_or_update['brand']
                    ad_to_save_or_update['ad_cat'] = vote
                else:
                    util.loginfo("Skipping one item, not confident classification, ad_data_link" + ad_page_link);
                    skipped += 1
                    continue
            else:
                util.loginfo("Skipping one item, classified INVALID, ad_data_link" + ad_page_link);
                skipped += 1
                continue

            # must have year -- unreachable here (it sits after the 'continue'
            # above), so it is disabled pending the TODO to make year mandatory
            # if ad_to_save_or_update['year'] == 0:
            #     util.loginfo("Skipping one item, SAL and year is not found, ad_page_link: " + ad_page_link)
            #     skipped += 1
            #     continue

            ad_to_save_or_update['tags'] = calculate_tags(brand, model, ad_to_save_or_update, models)

            # TODO rename all products to ads
            if dao.product_exists_in_db({'source': SOURCE, 'ad_id': ad_id}):
                if dao.product_exists_in_db(ad_to_save_or_update):
                    util.loginfo("source " + SOURCE + " ad_id " + str(ad_id) + " is already in the database!");
                else:
                    util.loginfo("updating " + SOURCE + ad_page_link);
                    dao.update([ad_to_save_or_update]);
                    count_of_ads_updated += 1

            else:
                util.loginfo("adding " + SOURCE + ad_page_link);
                dao.add_list_to_db([ad_to_save_or_update])
                count_of_ads_added += 1

            if settings['ENVIRONMENT'] == "Production":
                wait = randint(5, 20)
                util.loginfo("waiting for " + str(wait) + " seconds before reading next item")
                sleep(wait)

    return count_of_ads_added, count_of_ads_updated, skipped, counter_of_ads
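
# A hedged usage sketch of the reader above (brands, models and the voting
# classifier are loaded/trained elsewhere and are not part of these snippets):
#
#     added, updated, skipped_ads, total = read(number_of_pages=5, debug=False,
#                                               brands=brands, models=models,
#                                               classifier=classifier)
#     util.loginfo("haraj.com.sa: added %d, updated %d, skipped %d of %d" %
#                  (added, updated, skipped_ads, total))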