def handle(self, *args, **options):
        """Import violation reports from an external XML feed.

        The first positional argument selects the feed: ``'gn'`` or
        ``'golos'``.  Every child element of the feed root is parsed into a
        violation and stored through ``Violation.objects.get_or_create``,
        looked up by ``(content_type, object_id, violation_id)``.  Rows that
        already exist get their text, type, location and url overwritten.

        Reports whose ``(region, uik)`` pair matches no ``Location`` are
        reported on stdout and skipped.

        Raises:
            ValueError: if the source argument is missing or unknown.
        """
        # Feed name -> (organization name, export URL).
        sources = {
            'gn': ('nabludatel', 'http://gnhq.info/export/violations.xml'),
            'golos': ('golos', 'http://www.kartanarusheniy.org/export.xml'),
        }
        # Validate before touching the database or the network, so a typo
        # fails with a clear message rather than a NameError further down.
        if not args or args[0] not in sources:
            raise ValueError("expected source argument: 'gn' or 'golos'")
        source = args[0]
        organization_name, feed_url = sources[source]

        from locations.models import Location
        from organizations.models import Organization
        from violations.models import Violation

        content_type = ContentType.objects.get_for_model(Organization)
        organization = Organization.objects.get(name=organization_name)
        xml = fromstring(read_url(feed_url, encoding=None))

        # Region codes used by the feed -> region codes stored on Location.
        region_remap = {75: 92, 41: 91, 59: 90}

        for viol_xml in xml:
            data = {}
            for field in viol_xml:
                if field.tag == 'id':
                    data['id'] = int(field.text)
                elif field.tag == 'updt':
                    # Parsed (and therefore validated) but not stored below.
                    data['time'] = datetime.strptime(field.text, '%y-%m-%d %H:%M')
                elif field.tag == 'obscomment':
                    data['text'] = field.text or ''
                elif field.tag == 'region':
                    region = int(field.text)
                    # NOTE(review): region 99 is passed through unchanged. A
                    # former `continue` for it only advanced this field loop
                    # and had no effect; confirm whether such reports should
                    # be skipped entirely.
                    data['region'] = region_remap.get(region, region)
                elif field.tag == 'uik':
                    data['uik'] = field.text
                elif field.tag == 'type':
                    data['type'] = GN_TO_GRAKON[field.text]
                elif field.tag == 'vtype':
                    data['type'] = GOLOS_TO_GRAKON[field.text]

            # Try to get location
            try:
                location = Location.objects.get(region_code=data['region'], name=data['uik'])
            except Location.DoesNotExist:
                print("Failed to find location of violation " + str(data['id']))
                continue

            fields = {'text': data['text'], 'type': data['type'], 'location': location}
            if source == 'gn':
                fields['url'] = ''
            else:
                fields['url'] = 'http://www.kartanarusheniy.org/'+str(data['id'])

            violation, created = Violation.objects.get_or_create(content_type=content_type, object_id=organization.id,
                    violation_id=data['id'], defaults=fields)

            if not created:
                for field_name in fields:
                    setattr(violation, field_name, fields[field_name])
                # Save once after all attributes are set, not once per field.
                violation.save()
Beispiel #2
0
    def handle(self, *args, **options):
        """Scrape the locations hierarchy and dump it to ``data/locations.json``.

        Phase 1 reads region ids from ``data/regions.txt`` (one
        ``"<id> <name>"`` pair per line), fetches ``URL + id`` for each region
        and then for each first-level option, and collects the second-level
        option values.

        Phase 2 fetches the page of every distinct collected id and records,
        for each row of the ``list`` table after the first, an 8-character
        code (``okato_id``) mapped to the text of the third cell.

        The mapping is written as UTF-8 encoded JSON.
        """
        first_level_xpath = "//table[@width='100%' and @cellspacing='2' and @cellpadding='5']//tr[2]//option"
        second_level_xpath = "//table[@width='100%' and @cellspacing='2' and @cellpadding='5']//tr[3]//option"

        print("Downloading second level ids")
        ids = []
        i = 0
        regions_path = os.path.join(settings.PROJECT_PATH, 'data', 'regions.txt')
        # Context manager so the file handle is released even on failure.
        with open(regions_path) as regions_file:
            for line in regions_file:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                region_id, _name = line.split(' ', 1)

                region_page = HtmlXPathSelector(text=read_url(URL+region_id))
                for option in region_page.select(first_level_xpath):
                    first_level_id = option.select("@value").extract()[0]

                    first_level_page = HtmlXPathSelector(text=read_url(URL+first_level_id))
                    for option1 in first_level_page.select(second_level_xpath):
                        ids.append(option1.select("@value").extract()[0])

                i += 1
                # NOTE(review): 80 is a hard-coded total; presumably the
                # number of lines in regions.txt -- confirm.
                print_progress(i, 80)

        print("Downloading locations hierarchy")
        data = {}
        # Deduplicate once so the progress total matches what is iterated.
        unique_ids = set(ids)
        total = len(unique_ids)
        for i, page_id in enumerate(unique_ids, 1):
            # The first row of the table is skipped.
            for tr in HtmlXPathSelector(text=read_url(URL+page_id)).select("//table[@class='list']/tr")[1:]:
                okato_id = tr.select(".//td[2]//b/text()").extract()[0].replace(' ', '')
                assert len(okato_id)==8

                data[okato_id] = tr.select("./td[3]/text()").extract()[0]

            print_progress(i, total)

        with open(os.path.join(settings.PROJECT_PATH, 'data', 'locations.json'), 'w') as f:
            f.write(json.dumps(data, indent=4, ensure_ascii=False).encode('utf8'))
    def handle(self, *args, **options):
        """Download result protocols for the ``cik`` organization.

        For every ``Location`` with a non-null ``tik`` that has no protocol
        from ``cik`` yet, the page at ``location.results_url()`` is fetched,
        23 integer values ``p1``..``p23`` are read from the results table and
        a verified ``Protocol`` is stored via ``get_or_create``.

        Raises:
            AssertionError: if a results page does not have the expected
                number of table rows.
        """
        from locations.models import Location
        from organizations.models import Organization
        from protocols.models import Protocol

        cik = Organization.objects.get(name='cik')
        content_type = ContentType.objects.get_for_model(Organization)

        locations_processed = Protocol.objects.filter(content_type=content_type, object_id=cik.id) \
                .values_list('location', flat=True)
        uiks_count = Location.objects.exclude(tik=None).count()
        # Progress starts from what earlier runs already imported.
        j = len(locations_processed)
        for location in Location.objects.exclude(tik=None).exclude(id__in=locations_processed):
            trs = HtmlXPathSelector(text=read_url(location.results_url())) \
                    .select("//table[@width='100%' and @cellspacing='1' and @cellpadding='2' and @bgcolor='#ffffff']//tr")

            # Check the row count before deleting, so a malformed page gives
            # the descriptive assertion rather than a bare IndexError.
            assert len(trs) == 24, "incorrect number of rows"
            # Row 18 holds no value -- presumably a separator row; confirm.
            del trs[18]

            data = {}
            for row_number, tr in enumerate(trs, 1):
                data['p'+str(row_number)] = int(tr.select(".//b/text()").extract()[0])

            data.update({'location': location, 'verified': True})

            Protocol.objects.get_or_create(content_type=content_type, object_id=cik.id,
                    protocol_id=location.id, defaults=data)

            # Count the location just processed before reporting, so the
            # progress can actually reach the total.
            j += 1
            print_progress(j, uiks_count)
    def handle(self, *args, **options):
        """Fetch the page at ``LIST_URL`` and select its page-navigation links.

        Looks up the ``liga`` organization and the ``Organization`` content
        type, then runs an XPath selection against the downloaded page.  The
        selection result is not used; the command appears to be unfinished.
        """
        from locations.models import Location
        from organizations.models import Organization
        from protocols.models import Protocol

        organization = Organization.objects.get(name='liga')
        organization_type = ContentType.objects.get_for_model(Organization)

        listing_page = HtmlXPathSelector(text=read_url(LIST_URL))
        # NOTE(review): XPath positions are 1-based, so the `[0]` predicate
        # presumably matches nothing -- confirm the intended index.
        listing_page.select("//div[@class='page_navigation'][0]//a")
    def handle(self, *args, **options):
        """Create ``Location`` rows for foreign polling stations (UIKs).

        ``data/foreign_uiks.csv`` supplies, per line,
        ``uik_no,country_id,country_name,address``.  The option list at
        ``FOREIGN_UIKS_URL`` supplies the ``root`` and ``tvd`` query
        parameters for each station.  Stations listed on the page but absent
        from the CSV are printed and skipped.

        Every run saves new ``Location`` objects; nothing checks for rows
        created by an earlier run.
        """
        from locations.models import FOREIGN_CODE, FOREIGN_NAME, Location

        uiks = {}
        csv_path = os.path.join(settings.PROJECT_PATH, 'data', 'foreign_uiks.csv')
        # Context manager so the file handle is released even on failure.
        with open(csv_path, 'r') as csv_file:
            for line in csv_file:
                # maxsplit=3 keeps commas inside the trailing address field
                # instead of failing to unpack.
                uik_no, country_id, _country_name, address = line.strip().split(',', 3)
                uiks[uik_no] = {'tik': int(country_id), 'address': address}

        countries_by_id = dict((location.id, location) for location in Location.objects.exclude(region=None) \
                .filter(tik=None).filter(region_code=FOREIGN_CODE))

        foreign_countries = Location.objects.get(region=None, region_code=FOREIGN_CODE)

        i = 0
        for uik_option in HtmlXPathSelector(text=read_url(FOREIGN_UIKS_URL)) \
                .select("//select[@name='gs']//option"):
            # The first four characters of the option label are the UIK number.
            uik_no = uik_option.select("text()").extract()[0].strip()[:4]

            if uik_no not in uiks:
                print(uik_no)
                continue

            uik = uiks[uik_no]
            url = uik_option.select("@value").extract()[0]
            for param in url.split('?')[1].split('&'):
                # Split on the first '=' only, so values containing '=' do
                # not break the unpacking.
                param_name, param_value = param.split('=', 1)
                if param_name in ('root', 'tvd'):
                    uik[param_name] = int(param_value)

            location = Location(region=foreign_countries, tik=countries_by_id[uik['tik']],
                    name=uik_no, region_name=FOREIGN_NAME, region_code=FOREIGN_CODE,
                    address=uik['address'], tvd=uik['tvd'],
                    root=uik['root'], data='{}')
            location.save()

            i += 1
            # NOTE(review): 350 is a hard-coded total; presumably the expected
            # number of foreign stations -- confirm.
            print_progress(i, 350)
    def handle(self, *args, **options):
        """Import protocols of the ``nabludatel`` organization from its XML feed.

        Each child element of the feed root is parsed into a protocol:
        ``p1``..``p23`` become integer protocol fields, and ``id``,
        ``ncomp``, ``region``, ``uik``, ``updt`` and ``media`` supply
        metadata.  Protocols whose ``(region, uik)`` pair matches no
        ``Location`` are reported on stdout and skipped; protocols without a
        media url are skipped silently.  Rows are stored through
        ``Protocol.objects.get_or_create``, and existing rows are overwritten
        with the parsed values.

        Raises:
            ValueError: if a ``media`` element does not have exactly one
                child, or if a timestamp does not match ``%y-%m-%d %H:%M``.
        """
        from locations.models import Location
        from organizations.models import Organization
        from protocols.models import Protocol

        organization = Organization.objects.get(name='nabludatel')
        content_type = ContentType.objects.get_for_model(Organization)

        # Region codes used by the feed -> region codes stored on Location.
        region_remap = {75: 92, 41: 91, 59: 90}

        xml = fromstring(read_url('http://gnhq.info/export/protocols.xml', None))
        for protocol_xml in xml:
            data = {}
            fields = {}
            for field in protocol_xml:
                if field.tag == 'id':
                    data['id'] = int(field.text)
                elif field.tag == 'ncomp':
                    data['complaints'] = int(field.text)
                elif field.tag == 'region':
                    region = int(field.text)
                    data['region'] = region_remap.get(region, region)
                elif field.tag == 'uik':
                    data['uik'] = field.text
                elif field.tag == 'updt':
                    stamp = field.text
                    if stamp.startswith('11'): # hack to fix GN bug
                        # Replace the two-digit year '11' with '12' and keep
                        # the rest of the timestamp ([2:], not [:2]).
                        stamp = '12'+stamp[2:]
                    data['sign_time'] = datetime.strptime(stamp, '%y-%m-%d %H:%M')
                elif field.tag == 'media':
                    if len(field) != 1:
                        raise ValueError
                    data['url'] = list(field)[0].text

                # Protocol value fields are tagged p1..p23; any other tag
                # starting with 'p' is ignored.
                if field.tag.startswith('p'):
                    try:
                        p_index = int(field.tag[1:])
                    except ValueError:
                        continue

                    if p_index<1 or p_index>23:
                        continue

                    fields[field.tag] = int(field.text)

            # Try to get location
            try:
                location = Location.objects.get(region_code=data['region'], name=data['uik'])
            except Location.DoesNotExist:
                print("Failed to find location of protocol " + str(data['id']))
                continue

            if 'url' not in data:
                continue # skip protocols without images

            fields.update({'url': data['url'], 'location': location, 'verified': True,
                    'sign_time': data.get('sign_time'), 'complaints': data.get('complaints')})

            protocol, created = Protocol.objects.get_or_create(content_type=content_type, object_id=organization.id,
                    protocol_id=data['id'], defaults=fields)

            if not created:
                for field_name in fields:
                    setattr(protocol, field_name, fields[field_name])
                protocol.save()