Beispiel #1
0
    def handle(self, *args, **options):
        """
        Import the adverts produced by the ``ads()`` generator into ``Advert``.

        Stages:
            1. With ``FULL_IMPORT`` set, delete every stored ``Advert`` and
               empty the ``mongoengine.counters`` collection.
            2. Pull adverts from ``ads()`` one at a time, skipping empty
               adverts, adverts without images and adverts without a price.
            3. Count adverts whose URL is already stored as duplicates; when
               not doing a full import, every ``DUPS_LIMIT`` duplicates the
               string ``"break_page_search"`` is sent back to the generator.
            4. Store new adverts (bulk inserts of ``obj_buffer_len`` objects
               during a full import, individual saves otherwise) and download
               their images with ``dl_image``.

        ``args`` and ``options`` are accepted but not used.
        """
        import sys

        FULL_IMPORT = True
        DUPS_LIMIT = 10
        # Source key that gets renamed to 'kitchen_area'.
        kitchen_key = u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438'
        count = 0
        dups = 0
        page_dups = 0
        control = None
        obj_buffer = []
        obj_buffer_len = 100

        if FULL_IMPORT:
            Advert.objects.all().delete()
            Advert._get_db().mongoengine.counters.remove({})

        # Single-argument print(...) calls below produce the same output as
        # the Python 2 print statement.
        it = ads()
        try:
            while True:
                # Only exhaustion of the generator itself ends the loop; a
                # StopIteration escaping from the processing code below must
                # not be mistaken for "no more adverts".
                try:
                    obj = it.send(control)
                except StopIteration:
                    break
                control = None

                if obj == {}:
                    print("EMPTY")
                    continue

                if kitchen_key in obj:
                    obj['kitchen_area'] = obj.pop(kitchen_key)

                if obj["images_len"] == 0:
                    continue

                if "price" not in obj or obj["price"] == "":
                    continue

                # NOTE(review): adverts still waiting in obj_buffer are not in
                # the database yet, so this lookup cannot see them.
                if Advert.objects(url=obj["url"]):
                    dups += 1
                    page_dups += 1
                    if page_dups >= DUPS_LIMIT and not FULL_IMPORT:
                        control = "break_page_search"
                        page_dups = 0
                else:
                    try:
                        prep_obj = _prepare(obj)
                        print(obj['url'])

                        _ins = Advert(**prep_obj)

                        if FULL_IMPORT:
                            obj_buffer.append(_ins)
                            if len(obj_buffer) >= obj_buffer_len:
                                Advert.objects.insert(obj_buffer)
                                obj_buffer = []
                        else:
                            _ins.save(write_concern={
                                'w': 0,
                                'j': False,
                                'wtimeout': 0,
                            })

                        if prep_obj['images_len'] > 0:
                            for element in prep_obj["images"]:
                                for jit in prep_obj["images"][element]:
                                    dl_image(jit)
                    except UnicodeEncodeError as exc:
                        # Still best-effort, but no longer silent. The error
                        # can come from any step above, so the advert may or
                        # may not have been stored already. %r keeps the
                        # message ASCII-safe.
                        sys.stderr.write(
                            "UnicodeEncodeError while importing %r: %r\n"
                            % (obj.get("url"), exc))

                # Adverts skipped by the `continue` statements above are not
                # counted.
                if count % 100 == 0:
                    print("count: %s\tdups: %s" % (count, dups))
                count += 1
        finally:
            # Close the generator even when processing fails part-way.
            it.close()

        # Flush whatever is left from the last, partially filled batch.
        if obj_buffer:
            Advert.objects.insert(obj_buffer)

        print("")
Beispiel #2
0
    def handle(self, *args, **options):
        """
        Import the adverts produced by the ``ads()`` generator into ``Advert``.

        Stages:
            1. With ``FULL_IMPORT`` set, delete every stored ``Advert`` and
               empty the ``mongoengine.counters`` collection.
            2. Pull adverts from ``ads()`` one at a time, skipping empty
               adverts, adverts without images and adverts without a price.
            3. Count adverts whose URL is already stored as duplicates; when
               not doing a full import, every ``DUPS_LIMIT`` duplicates the
               string ``"break_page_search"`` is sent back to the generator.
            4. Store new adverts (bulk inserts of ``obj_buffer_len`` objects
               during a full import, individual saves otherwise) and download
               their images with ``dl_image``.

        ``args`` and ``options`` are accepted but not used.
        """
        import sys

        FULL_IMPORT = True
        DUPS_LIMIT = 10
        # Source key that gets renamed to 'kitchen_area'.
        kitchen_key = u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438'
        count = 0
        dups = 0
        page_dups = 0
        control = None
        obj_buffer = []
        obj_buffer_len = 100

        if FULL_IMPORT:
            Advert.objects.all().delete()
            Advert._get_db().mongoengine.counters.remove({})

        # Single-argument print(...) calls below produce the same output as
        # the Python 2 print statement.
        it = ads()
        try:
            while True:
                # Only exhaustion of the generator itself ends the loop; a
                # StopIteration escaping from the processing code below must
                # not be mistaken for "no more adverts".
                try:
                    obj = it.send(control)
                except StopIteration:
                    break
                control = None

                if obj == {}:
                    print("EMPTY")
                    continue

                if kitchen_key in obj:
                    obj['kitchen_area'] = obj.pop(kitchen_key)

                if obj["images_len"] == 0:
                    continue

                if "price" not in obj or obj["price"] == "":
                    continue

                # NOTE(review): adverts still waiting in obj_buffer are not in
                # the database yet, so this lookup cannot see them.
                if Advert.objects(url=obj["url"]):
                    dups += 1
                    page_dups += 1
                    if page_dups >= DUPS_LIMIT and not FULL_IMPORT:
                        control = "break_page_search"
                        page_dups = 0
                else:
                    try:
                        prep_obj = _prepare(obj)
                        print(obj['url'])

                        _ins = Advert(**prep_obj)

                        if FULL_IMPORT:
                            obj_buffer.append(_ins)
                            if len(obj_buffer) >= obj_buffer_len:
                                Advert.objects.insert(obj_buffer)
                                obj_buffer = []
                        else:
                            _ins.save(write_concern={
                                'w': 0,
                                'j': False,
                                'wtimeout': 0,
                            })

                        if prep_obj['images_len'] > 0:
                            for element in prep_obj["images"]:
                                for jit in prep_obj["images"][element]:
                                    dl_image(jit)
                    except UnicodeEncodeError as exc:
                        # Still best-effort, but no longer silent. The error
                        # can come from any step above, so the advert may or
                        # may not have been stored already. %r keeps the
                        # message ASCII-safe.
                        sys.stderr.write(
                            "UnicodeEncodeError while importing %r: %r\n"
                            % (obj.get("url"), exc))

                # Adverts skipped by the `continue` statements above are not
                # counted.
                if count % 100 == 0:
                    print("count: %s\tdups: %s" % (count, dups))
                count += 1
        finally:
            # Close the generator even when processing fails part-way.
            it.close()

        # Flush whatever is left from the last, partially filled batch.
        if obj_buffer:
            Advert.objects.insert(obj_buffer)

        print("")
Beispiel #3
0
    def handle(self, *args, **options):
        """
        Rebuild the ``Advert`` collection from ``analytics.insert``.

        Deletes every stored ``Advert`` and empties the
        ``mongoengine.counters`` collection, then turns each dict yielded by
        ``insert()`` into an ``Advert`` (renaming its keys through ``DICT2``
        and stripping the ``region`` value), bulk-inserts the collected
        objects and commits the transaction.

        ``args`` and ``options`` are accepted but not used.
        """
        import sys

        # NOTE(review): machine-specific location of the `analytics` module.
        src_path = "/home/bkmz/Dev/realty_parser/src"
        if src_path not in sys.path:
            # Avoid stacking a duplicate entry on every invocation.
            sys.path.append(src_path)
        from analytics import insert as insert_irr

        address_key = u"Адрес"
        mongo_objects = []

        # Single-argument print(...) calls produce the same output as the
        # Python 2 print statement.
        print("Start Truncating")
        Advert.objects.all().delete()
        Advert._get_db().mongoengine.counters.remove({})
        print("Truncating finished")

        for count, x in enumerate(insert_irr()):
            # NOTE(review): a non-blank address is deleted while a blank one
            # stays in the dict -- confirm this is the intended behaviour.
            if address_key in x and x[address_key].strip() != "":
                del x[address_key]

            # Rename keys in place through DICT2. Iterate over a snapshot of
            # the keys because the dict is modified inside the loop, and pop
            # before assigning so that a key mapped to itself keeps its value.
            for old_key in list(x.keys()):
                if old_key in DICT2:
                    x[DICT2[old_key]] = x.pop(old_key)

            x['region'] = x['region'].strip()

            mongo_objects.append(Advert(**x))

            print(count)

        # Skip the bulk insert when nothing was produced.
        if mongo_objects:
            Advert.objects.insert(mongo_objects)
        transaction.commit()