Ejemplo n.º 1
0
    def handle(self, *args, **options):
        """Log ad counts per category/sub-category and optionally trim old ads.

        For every sub-category the number of ads with a ``gkey`` and
        ``private=False`` is appended to the log; the same is done for each
        category as a whole. When the ``delete`` option is truthy, the oldest
        ads (by ``pub_date``) are deleted in batches of 200 for every full
        batch above the limit (1000 per sub-category, 2000 per category).

        The accumulated log is passed to ``send_log`` and, if anything was
        deleted, the ``rebuild_index`` command is invoked non-interactively.
        """
        from django.utils import translation
        translation.activate('en')  # with en-us everything crashes
        print(args)
        print(options)
        remove = options.get('delete', False)
        rebuild = False
        log = '------------------'

        def trim_oldest(filters, batches):
            """Delete the 200 oldest ads matching ``filters``, ``batches`` times."""
            for _ in range(batches):
                to_remove = Ad.objects.filter(**filters).order_by('pub_date')[:200].values('id')
                Ad.objects.filter(pk__in=[it['id'] for it in to_remove]).delete()

        for cat in Category.objects.all():
            for sub_cat in cat.subcategory_set.all():
                kws = {'sub_category': sub_cat, 'gkey__isnull': False, 'private': False}
                count = Ad.objects.filter(**kws).count()
                log += cat.name + ' - ' + sub_cat.name + ': ' + str(count)
                # Floor division keeps ``times`` an int under both Python 2
                # and true division, so ``range(times)`` stays valid.
                times = (count - 1000) // 200
                if times > 0 and remove:
                    rebuild = True
                    trim_oldest(kws, times)

                    log += 'after remove'
                    count = Ad.objects.filter(**kws).count()
                    log += cat.name + ' - ' + sub_cat.name + ': ' + str(count)

            kws = {'category': cat, 'gkey__isnull': False, 'private': False}
            all_count = Ad.objects.filter(**kws).count()
            log += cat.name + ": " + str(all_count)
            log += '------------------'

            times = (all_count - 2000) // 200
            if times > 0 and remove:
                rebuild = True
                trim_oldest(kws, times)

                all_count = Ad.objects.filter(**kws).count()
                log += cat.name + ": " + str(all_count)
        send_log(log)
        if rebuild:
            # Deleted ads must also disappear from the search index.
            call_command('rebuild_index', interactive=False)
Ejemplo n.º 2
0
    def get_ads(cls):
        """Import ads dated yesterday..today from the remote real-estate base.

        Each table is requested page by page (200 rows per page) through
        ``cls.get``. Rows whose ``kod`` already exists as an ``Ad.gkey`` are
        skipped; the remaining rows are mapped to ``Ad`` fields, saved, and
        handed to the ``find_similar`` task (plus ``import_attachments`` when
        the ad URL belongs to one of ``Crawler.DOMAINS``).

        A response without an ``items`` key is retried after a growing pause;
        the budget of 5 retries is shared by the whole run. Whatever happens,
        per-table import counters (and the traceback on failure) are passed
        to ``send_log``; exceptions are re-raised after being logged.
        """
        errors = 5  # retry budget shared across all tables and pages
        # Remote field name -> Ad field name.
        match = {'cena': 'price', 'datap': 'pub_date', 'dom_etag': 'floor_max', 'etag': 'floor',
                 'kol_komn': 'rooms_count', 'pl1': 'area_living', 'plk': 'area_kitchen', 'plosh': 'area',
                 'pl_land': 'area_land', 'textob': 'desc'}

        now = datetime.now().strftime("%Y-%m-%d")
        yesterday = (date.today() - timedelta(1)).strftime("%Y-%m-%d")

        data = {
            're_base_name': 'kvart',
            're_base_section': 'green',
            'rem': 0,
            'subq_start': 0,
            'subq_lines': 200,
            'order_by[data]': 'desc',
            'order_by[datap]': 'desc',
            'data_start': yesterday,
            'data_end': now,
        }
        statistic = {}
        log = ""
        try:
            for table in ['arenda', 'kvart', 'komn', 'domm', 'negil', 'client']:  # all tables
                data['re_base_name'] = table
                print('Load from ' + table)
                statistic[table] = {}
                # Only the 'green' base is imported; 'red' is currently disabled.
                for private_type in ['green']:
                    statistic[table][private_type] = 0
                    print('table type ' + private_type)
                    data['re_base_section'] = private_type
                    current = 0
                    total = 200
                    while current < total:   # all pages
                        print('select 200 starts from ' + str(current) + ', total is ' + str(total))
                        data['subq_start'] = current
                        str_data = urllib.urlencode(data)
                        content = cls.get('http://estate-in-kharkov.com/ps/re_base/ajax/real-estate-database.php', str_data)
                        current += 200
                        if 're_base_query_count' in content:
                            total = int(content['re_base_query_count'])

                        if 'items' not in content and errors:
                            print("hasn't content , errors left %d" % errors)
                            errors -= 1
                            print(content)
                            # Back off a little longer after every failure.
                            time.sleep((6-errors)*10)
                            # Rewind so the same page is requested again.
                            current -= 200
                            continue
                        assert 'items' in content, 'unexpected content: ' + str(content)

                        # The visible code treats ``items is False`` as "no more rows".
                        if content['items'] is False:
                            break
                        for item in content['items'].values():
                            aid = item['kod']
                            try:
                                existed = Ad.objects.get(gkey=aid)
                                print(str(existed) + ' already imported!')
                                continue
                            except Ad.DoesNotExist:
                                pass

                            # Copy only non-empty remote fields that have an Ad counterpart.
                            ad_item = {'gkey': aid}
                            for field in item:
                                if item[field] and field in match:
                                    ad_item[match[field]] = item[field]

                            ad = cls.parse_categories(ad_item, item, table)
                            if 'sub_category' in ad:
                                print(ad['sub_category'])
                            if 'rooms_count' in ad:
                                print('rooms: ' + str(ad['rooms_count']))

                            if item['textob']:
                                ad['title'] = truncatesmart(item['textob'], 45)
                                if not ad['title']:
                                    ad['title'] = item['textob'][:45]

                            ad['offering'] = table != 'client'
                            ad['private'] = private_type == 'green'

                            # Join only the non-empty numbers so a blank tel1
                            # no longer produces a leading ", ".
                            phones = []
                            for i in range(1, 5):
                                number = (item.get('tel' + str(i)) or '').strip()
                                if number:
                                    phones.append(number)
                            ad['phone'] = ', '.join(phones)

                            if item['metro']:
                                try:
                                    metro = Metro.objects.get(pk=item['metro'])
                                except Metro.DoesNotExist:
                                    pass
                                else:
                                    # 'desc' is absent when the source text was
                                    # empty, so do not assume the key exists.
                                    desc = ad.get('desc')
                                    ad['desc'] = desc + ' ' + metro.name if desc else metro.name

                            if item['ulica']:
                                try:
                                    ad['address'] = Street.objects.get(pk=item['ulica'])
                                except Street.DoesNotExist:
                                    pass

                            if item['raj']:
                                try:
                                    ad['district'] = District.objects.get(pk=item['raj'])
                                except District.DoesNotExist:
                                    pass

                            if item['nasp']:
                                # Remote id '293' is remapped to local Town pk 1.
                                item['nasp'] = 1 if item['nasp'] == '293' else item['nasp']
                                try:
                                    ad['town'] = Town.objects.get(pk=item['nasp'])
                                except Town.DoesNotExist:
                                    # Ads that reference an unknown town are skipped.
                                    continue

                            # 'pub_date' is only present when the source row had a date.
                            if ad.get('pub_date'):
                                pub_date = dateutil.parser.parse(ad['pub_date'])
                                if str(pub_date.time()) == "00:00:00":
                                    # Date-only value: use the current wall-clock time.
                                    pub_date = datetime.combine(pub_date.date(), datetime.now().time())

                                # NOTE(review): replace(tzinfo=...) can pick a wrong
                                # (LMT) offset with pytz zones; timezone.make_aware may
                                # be the better call - confirm before changing.
                                pub_date_aware = pub_date.replace(tzinfo=timezone.get_current_timezone())
                                ad['pub_date'] = ad['order_date'] = pub_date_aware

                            # NOTE(review): str.find() returns -1 (truthy) when the
                            # marker is absent and 0 (falsy) only when the value
                            # starts with it, so this test is effectively "always".
                            # Left unchanged because switching to ``in`` would stop
                            # storing URLs without the marker - confirm the intent.
                            if item['fotosite'] and item['fotosite'].find('&have_images'):
                                ad['url'] = item['fotosite'].replace('&have_images', '')

                            ad_obj = Ad(**ad)
                            ad_obj.save()
                            statistic[table][private_type] += 1

                            find_similar.delay(ad_obj.id)
                            if ad_obj.url:
                                parsed_uri = urlparse(ad_obj.url)
                                if parsed_uri.netloc in Crawler.DOMAINS:
                                    import_attachments.delay(ad_obj.id)

                            print(ad_obj)
                            time.sleep(1)
        except Exception:
            log += "Error: " + traceback.format_exc() + "\n"
            raise
        finally:
            # Always report "<table>: <green> / <red>" counters, even on failure.
            for name, stats in statistic.items():
                log += name + ': '
                if 'green' in stats:
                    log += str(stats['green'])
                log += " / "
                if 'red' in stats:
                    log += str(stats['red'])
                log += "\n"
            send_log(log)