Example #1
0
    def handle(self, *args, **options):
        """
        Working process divided into some separate stages
            1. Getting last date of Advert from DB. 
        """

        FULL_IMPORT = True
        DUPS_LIMIT = 10
        __count = 0
        __dups = 0
        __page_dups = 0
        control = None
        obj_buffer = []
        obj_buffer_len = 10
        obj_buffer_len = 100

        if FULL_IMPORT:
            Advert.objects.all().delete()
            Advert._get_db().mongoengine.counters.remove({})

        it = ads()
        while True:
            try:
                # it.next()
                # print control
                obj = it.send(control)
                control = None

                if obj == {}:
                    print "EMPTY"
                    continue

                if u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438' in obj.keys(
                ):
                    # print obj["url"]
                    obj['kitchen_area'] = obj[
                        u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438']
                    del obj[
                        u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438']

                if obj["images_len"] == 0:
                    continue

                if (not "price" in obj.keys()) or obj["price"] == "":
                    continue

                if Advert.objects(url=obj["url"]):
                    # if Advert.objects(title=obj["title"], description=obj["description"]):
                    __dups += 1
                    __page_dups += 1
                    if __page_dups >= DUPS_LIMIT and not FULL_IMPORT:
                        control = "break_page_search"
                        __page_dups = 0
                else:
                    try:
                        prep_obj = _prepare(obj)
                        print obj['url']

                        _ins = Advert(**prep_obj)

                        if FULL_IMPORT:
                            obj_buffer.append(_ins)
                            if len(obj_buffer) >= obj_buffer_len:
                                Advert.objects.insert(obj_buffer)
                                obj_buffer = []

                        else:
                            _ins.save(write_concern={
                                'w': 0,
                                'j': False,
                                'wtimeout': 0
                            })

                        if prep_obj['images_len'] > 0:
                            for element in prep_obj["images"]:
                                for jit in prep_obj["images"][element]:
                                    dl_image(jit)
                            # import ipdb; ipdb.set_trace()

                    except UnicodeEncodeError:
                        pass
                        # import ipdb; ipdb.set_trace()

                if __count % 100 == 0:
                    print "count: %s\tdups: %s" % (__count, __dups)
                # print "count: %s\tdups: %s\t\r" % (__count, __dups),
                __count += 1

            except StopIteration:
                it.close()
                break

        if len(obj_buffer) > 0:
            Advert.objects.insert(obj_buffer)

        print
Example #2
0
    def handle(self, *args, **options):
        # Advert.objects.all().delete()
        # Advert._get_db().mongoengine.counters.remove({})
        f = open('./csv_files/base_kv_utf.csv')
        r = csv.DictReader(f, delimiter=";")
        # curs = Currency.objects.all()
        # adverts = Advert.objects({})
        # return
        for row in r:
            del_keys = []
            row_obj = {}
            for key in row:
                if key != 'id' and not isinstance(key, unicode):
                    try:
                        key = key.encode('utf-8')
                    except UnicodeDecodeError:
                        continue
                    if key not in ['terms_of_sale', 'auction',
                                   'lift', 'kitchen_area',
                                   'total_area', 'living_area',
                                   'year_built', 'floor',
                                   'number_of_floors', 'price',
                                   'region', 'city', 'microregion'] \
                            and not 'image' in key and 'description' not in key:
                        row_obj[key] = unicode(row[key].decode('utf-8'))
                    elif key == 'auction':
                        if unicode(row[key].decode('utf-8')) == u'Да':
                            row_obj[key] = True
                        else:
                            row_obj[key] = False
                    elif key == 'microregion':
                        value = unicode(row[key].decode('utf-8'))
                        row_obj[key] = value
                        row_obj['region2'] = value
                    elif key =='terms_of_sale':
                        value = unicode(row[key].decode('utf-8'))
                        if u'аренда' in value:
                            row_obj['action_type'] = 'rent'
                        elif u'обмен' in value:
                            row_obj['action_type'] = 'exchange'
                        else:
                            row_obj['action_type'] = 'sale'
                    elif key == 'lift':
                        if row[key] is not None and row[key] != '':
                            row_obj[key] = True
                        else:
                            row_obj[key] = False
                    elif key in ['year_built', 'floor', 'number_of_floors']:
                        if row[key] is not None and row[key] != '':
                            try:
                                row_obj[key] = int(row[key])
                            except ValueError:
                                continue
                    elif key in ['kitchen_area', 'total_area', 'living_area', 'price']:
                        if row[key] is not None and row[key] != '':
                            try:
                                row_obj[key] = float(row[key].replace(',', '.'))
                            except ValueError:
                                continue
                    elif 'image' in key:
                        if row_obj.get('images') is None:
                            row_obj['images'] = {'medium': [], 'thumbs': [], 'original': []}
                            row_obj['images_len'] = 0
                        if isinstance(row[key], list):
                            for image in row[key]:
                                if image != '':
                                    images_arr = image.split(',')
                                    for img in images_arr:
                                        row_obj['images']['medium'].append(img)
                                        row_obj['images']['thumbs'].append(img)
                                        row_obj['images']['original'].append(img)
                                        row_obj['images_len'] += 1
                        else:
                            if row[key] != '':
                                images_arr = row[key].split(',')
                                for img in images_arr:
                                    row_obj['images']['medium'].append(img)
                                    row_obj['images']['thumbs'].append(img)
                                    row_obj['images']['original'].append(img)
                                    row_obj['images_len'] += 1
                    elif 'description' in key:
                        if row_obj.get('description') is None:
                            row_obj['description'] = ''
                        row_obj['description'] += row[key]

                    elif key in ['region', 'city']:
                        if key == 'city':
                            value = unicode(row[key].decode('utf-8'))
                            for city in CITIES:
                                if city in value:
                                    row_obj['region'] = value
                                    break
                            if row_obj.get('region') is None:
                                row_obj['region'] = unicode(row['region'].decode('utf-8')) + u' область'
                            row_obj[key] = u'г. ' + value

                        # elif key == 'house':
                        #     if row_obj.get('address') is not None:
                        #         row_obj['address'] += u' ' + unicode(row[key].decode('utf-8'))
                        #     else:
                        #         row_obj[key] = row[key]
                        # elif key == 'address':
                        #     if row_obj.get(key) is None:
                        #         row_obj[key] = unicode(row[key].decode('utf-8'))
                        #     row_obj[key] += u' ' + unicode(row['house'].decode('utf-8'))

            row_obj['currency'] = 'usd'
            row_obj['current_status'] = 'vip_normal'
            row_obj['group'] = 'living'
            row_obj['cat_tab'] = 'flat'
            row_obj['cat_type'] = 'flat'
            adv = Advert(**row_obj)
            adv.save(write_concern={'w':0, 'j':False, 'wtimeout':0})

        # a = Advert.objects(price__exists=True, images_len__gt=1,
        #                    region__icontains=u'Гомель',
        #                    action_type__contains='sale',
        #                    group='living', cat_type_in='flat')
        a = Advert.objects(price__exists=True, images_len__gt=1,
                           region__icontains=u'Гомель',
                           action_type__contains='sale',
                           group='living',
                           cat_type__in=['flat'])
        adv_im = []
        for adv in a:
            images = adv.get('images')
            if images is not None and len(images)>1:
               adv_im.append(adv)
        print a
Example #3
0
    def handle(self, *args, **options):
        # Advert.objects.all().delete()
        # Advert._get_db().mongoengine.counters.remove({})
        f = open('./csv_files/base_kv_utf.csv')
        r = csv.DictReader(f, delimiter=";")
        # curs = Currency.objects.all()
        # adverts = Advert.objects({})
        # return
        for row in r:
            del_keys = []
            row_obj = {}
            for key in row:
                if key != 'id' and not isinstance(key, unicode):
                    try:
                        key = key.encode('utf-8')
                    except UnicodeDecodeError:
                        continue
                    if key not in ['terms_of_sale', 'auction',
                                   'lift', 'kitchen_area',
                                   'total_area', 'living_area',
                                   'year_built', 'floor',
                                   'number_of_floors', 'price',
                                   'region', 'city', 'microregion'] \
                            and not 'image' in key and 'description' not in key:
                        row_obj[key] = unicode(row[key].decode('utf-8'))
                    elif key == 'auction':
                        if unicode(row[key].decode('utf-8')) == u'Да':
                            row_obj[key] = True
                        else:
                            row_obj[key] = False
                    elif key == 'microregion':
                        value = unicode(row[key].decode('utf-8'))
                        row_obj[key] = value
                        row_obj['region2'] = value
                    elif key == 'terms_of_sale':
                        value = unicode(row[key].decode('utf-8'))
                        if u'аренда' in value:
                            row_obj['action_type'] = 'rent'
                        elif u'обмен' in value:
                            row_obj['action_type'] = 'exchange'
                        else:
                            row_obj['action_type'] = 'sale'
                    elif key == 'lift':
                        if row[key] is not None and row[key] != '':
                            row_obj[key] = True
                        else:
                            row_obj[key] = False
                    elif key in ['year_built', 'floor', 'number_of_floors']:
                        if row[key] is not None and row[key] != '':
                            try:
                                row_obj[key] = int(row[key])
                            except ValueError:
                                continue
                    elif key in [
                            'kitchen_area', 'total_area', 'living_area',
                            'price'
                    ]:
                        if row[key] is not None and row[key] != '':
                            try:
                                row_obj[key] = float(row[key].replace(
                                    ',', '.'))
                            except ValueError:
                                continue
                    elif 'image' in key:
                        if row_obj.get('images') is None:
                            row_obj['images'] = {
                                'medium': [],
                                'thumbs': [],
                                'original': []
                            }
                            row_obj['images_len'] = 0
                        if isinstance(row[key], list):
                            for image in row[key]:
                                if image != '':
                                    images_arr = image.split(',')
                                    for img in images_arr:
                                        row_obj['images']['medium'].append(img)
                                        row_obj['images']['thumbs'].append(img)
                                        row_obj['images']['original'].append(
                                            img)
                                        row_obj['images_len'] += 1
                        else:
                            if row[key] != '':
                                images_arr = row[key].split(',')
                                for img in images_arr:
                                    row_obj['images']['medium'].append(img)
                                    row_obj['images']['thumbs'].append(img)
                                    row_obj['images']['original'].append(img)
                                    row_obj['images_len'] += 1
                    elif 'description' in key:
                        if row_obj.get('description') is None:
                            row_obj['description'] = ''
                        row_obj['description'] += row[key]

                    elif key in ['region', 'city']:
                        if key == 'city':
                            value = unicode(row[key].decode('utf-8'))
                            for city in CITIES:
                                if city in value:
                                    row_obj['region'] = value
                                    break
                            if row_obj.get('region') is None:
                                row_obj['region'] = unicode(
                                    row['region'].decode(
                                        'utf-8')) + u' область'
                            row_obj[key] = u'г. ' + value

                        # elif key == 'house':
                        #     if row_obj.get('address') is not None:
                        #         row_obj['address'] += u' ' + unicode(row[key].decode('utf-8'))
                        #     else:
                        #         row_obj[key] = row[key]
                        # elif key == 'address':
                        #     if row_obj.get(key) is None:
                        #         row_obj[key] = unicode(row[key].decode('utf-8'))
                        #     row_obj[key] += u' ' + unicode(row['house'].decode('utf-8'))

            row_obj['currency'] = 'usd'
            row_obj['current_status'] = 'vip_normal'
            row_obj['group'] = 'living'
            row_obj['cat_tab'] = 'flat'
            row_obj['cat_type'] = 'flat'
            adv = Advert(**row_obj)
            adv.save(write_concern={'w': 0, 'j': False, 'wtimeout': 0})

        # a = Advert.objects(price__exists=True, images_len__gt=1,
        #                    region__icontains=u'Гомель',
        #                    action_type__contains='sale',
        #                    group='living', cat_type_in='flat')
        a = Advert.objects(price__exists=True,
                           images_len__gt=1,
                           region__icontains=u'Гомель',
                           action_type__contains='sale',
                           group='living',
                           cat_type__in=['flat'])
        adv_im = []
        for adv in a:
            images = adv.get('images')
            if images is not None and len(images) > 1:
                adv_im.append(adv)
        print a
Example #4
0
    def handle(self, *args, **options):
        """
        Working process divided into some separate stages
            1. Getting last date of Advert from DB. 
        """

        FULL_IMPORT = True
        DUPS_LIMIT = 10
        __count = 0
        __dups = 0
        __page_dups = 0
        control = None
        obj_buffer = []
        obj_buffer_len = 10
        obj_buffer_len = 100

        if FULL_IMPORT:
            Advert.objects.all().delete()
            Advert._get_db().mongoengine.counters.remove({})


        it = ads()
        while True:
            try:
                # it.next()
                # print control
                obj = it.send(control)
                control = None

                if obj == {}:
                    print "EMPTY"
                    continue

                if u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438' in obj.keys():
                    # print obj["url"]
                    obj['kitchen_area'] = obj[u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438']
                    del obj[u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438']

                if obj["images_len"] == 0:
                    continue

                if (not "price" in obj.keys()) or obj["price"] == "":
                    continue


                if Advert.objects(url=obj["url"]):
                # if Advert.objects(title=obj["title"], description=obj["description"]):
                    __dups += 1
                    __page_dups += 1
                    if __page_dups >= DUPS_LIMIT and not FULL_IMPORT:
                        control = "break_page_search"
                        __page_dups = 0
                else:
                    try:
                        prep_obj = _prepare(obj)
                        print obj['url']
                        






                        _ins = Advert(** prep_obj)

                        if FULL_IMPORT:
                            obj_buffer.append(_ins)
                            if len(obj_buffer) >= obj_buffer_len:
                                Advert.objects.insert(obj_buffer)
                                obj_buffer = []

                        else:
                            _ins.save(write_concern={'w':0, 'j':False, 'wtimeout':0})

                        if prep_obj['images_len'] > 0:
                            for element in prep_obj["images"]:
                                for jit in prep_obj["images"][element]:
                                    dl_image(jit)
                            # import ipdb; ipdb.set_trace()

                        
                    except UnicodeEncodeError:
                        pass
                        # import ipdb; ipdb.set_trace()

                if __count % 100 == 0:
                    print "count: %s\tdups: %s" % (__count, __dups)
                # print "count: %s\tdups: %s\t\r" % (__count, __dups),
                __count += 1
                    


            except StopIteration:
                it.close()
                break

        if len(obj_buffer) >0:
            Advert.objects.insert(obj_buffer)

        print
Example #5
0
    def handle(self, *args, **options):
        import sys
        sys.path.append("/home/bkmz/Dev/realty_parser/src")
        from analytics import insert as insert_irr

        mongo_objects = []

        print "Start Truncating"
        # Ad.objects.all().delete()
        Advert.objects.all().delete()
        Advert._get_db().mongoengine.counters.remove({})
        print "Truncating finished"

        COUNT = 0
        for x in insert_irr():
            # print x['url']

            # try:
            #     current_region = Region.objects.filter(name=x['region']).get()
            # except Region.DoesNotExist:
            #     print "Region not found! Skip ad"
            #     import ipdb; ipdb.set_trace()
            #     continue

            # Advert(floor=2).save()
            if u"Адрес" in x.keys() and not x[u'Адрес'].strip() == "":
                # x['address'] = x[u'Адрес']
                del x[u'Адрес']

            # if x['address'] == "":
            #     import ipdb; ipdb.set_trace()

            all = set(KEYS)
            all2 = set(VALUES)
            adv = set(x.keys())

            old_keys       = list((adv & all2))
            converted_keys = [DICT2[xi] for xi in (adv & all2)]


            nonrel_adv = x

            for key1 in x.keys():
                if key1 in DICT2:
                    nonrel_adv[DICT2[key1]] = x[key1]
                    del nonrel_adv[key1]


            # nonrel_adv['region'] = int(current_region.pk)
            nonrel_adv['region'] = x['region'].strip()

            ad_nonrel_obj = Advert(**nonrel_adv)
            # ad_nonrel_obj.save()

            mongo_objects.append(ad_nonrel_obj)

            # ad_nonrel_obj.save()

            print COUNT
            COUNT += 1

            # if COUNT >= 1000:
                # break
            

            # print x['adding_date']



            # import ipdb; ipdb.set_trace()
            # break
        
        Advert.objects.insert(mongo_objects)
        transaction.commit()