def handle(self, *args, **options):
    """Import violation reports from a partner organization's XML export.

    ``args[0]`` selects the source feed:

    * ``'gn'``    -- organization ``nabludatel``, export at gnhq.info;
    * ``'golos'`` -- organization ``golos``, export at kartanarusheniy.org.

    Every child element of the XML root is read as one violation. The
    violation is matched to a ``Location`` by region code and UIK name and is
    then created, or updated if it already exists, as a ``Violation`` keyed
    by the organization and the violation id taken from the feed. Violations
    whose location cannot be found are reported on stdout and skipped.

    Raises:
        ValueError: if no source argument is given or it is not one of
            ``'gn'`` / ``'golos'``.
    """
    from locations.models import Location
    from organizations.models import Organization
    from violations.models import Violation

    # Per-source settings. ``link_prefix`` is prepended to the violation id
    # to build the stored URL; ``None`` means an empty URL is stored.
    sources = {
        'gn': {
            'organization': 'nabludatel',
            'export_url': 'http://gnhq.info/export/violations.xml',
            'link_prefix': None,
        },
        'golos': {
            'organization': 'golos',
            'export_url': 'http://www.kartanarusheniy.org/export.xml',
            'link_prefix': 'http://www.kartanarusheniy.org/',
        },
    }
    # Fail early with a clear message instead of an IndexError or an
    # unbound-variable error further down.
    if not args or args[0] not in sources:
        raise ValueError("Expected a source argument: 'gn' or 'golos'")
    source = sources[args[0]]

    # Feed region codes that are rewritten before the Location lookup. Any
    # other code (including 99) is used unchanged.
    region_remap = {75: 92, 41: 91, 59: 90}

    content_type = ContentType.objects.get_for_model(Organization)
    organization = Organization.objects.get(name=source['organization'])
    xml = fromstring(read_url(source['export_url'], encoding=None))

    for viol_xml in xml:
        data = {}
        for field in viol_xml:
            tag = field.tag
            if tag == 'id':
                data['id'] = int(field.text)
            elif tag == 'updt':
                # Parsed (so a malformed timestamp still raises) but not
                # stored on the violation below.
                data['time'] = datetime.strptime(field.text, '%y-%m-%d %H:%M')
            elif tag == 'obscomment':
                data['text'] = field.text or ''
            elif tag == 'region':
                region = int(field.text)
                data['region'] = region_remap.get(region, region)
            elif tag == 'uik':
                data['uik'] = field.text
            elif tag == 'type':
                data['type'] = GN_TO_GRAKON[field.text]
            elif tag == 'vtype':
                data['type'] = GOLOS_TO_GRAKON[field.text]

        # Try to get location
        try:
            location = Location.objects.get(region_code=data['region'],
                                            name=data['uik'])
        except Location.DoesNotExist:
            print("Failed to find location of violation " + str(data['id']))
            continue

        if source['link_prefix'] is None:
            url = ''
        else:
            url = source['link_prefix'] + str(data['id'])
        fields = {
            'text': data['text'],
            'type': data['type'],
            'location': location,
            'url': url,
        }

        violation, created = Violation.objects.get_or_create(
            content_type=content_type, object_id=organization.id,
            violation_id=data['id'], defaults=fields)
        if not created:
            # Already imported earlier: refresh it with the current feed data.
            for name, value in fields.items():
                setattr(violation, name, value)
            violation.save()
def handle(self, *args, **options):
    """Download the locations hierarchy and store it as ``data/locations.json``.

    Works in two passes:

    1. For every ``"<region_id> <name>"`` line of ``data/regions.txt`` the
       page ``URL + region_id`` is fetched, the option values of the second
       row of its selector table are followed, and the option values found
       in the third row of those pages are collected.
    2. Every distinct collected id is fetched and each row (except the
       first) of its ``table.list`` yields an 8-character code (spaces
       removed) mapped to the text of the row's third cell.

    The resulting mapping is written as UTF-8 encoded, indented JSON.
    """
    table_xpath = ("//table[@width='100%' and @cellspacing='2' "
                   "and @cellpadding='5']")

    print("Downloading second level ids")
    ids = []
    i = 0
    regions_path = os.path.join(settings.PROJECT_PATH, 'data', 'regions.txt')
    # Use a context manager so the file handle is closed deterministically.
    with open(regions_path) as regions_file:
        for line in regions_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            region_id, name = line.split(' ', 1)

            region_page = HtmlXPathSelector(text=read_url(URL + region_id))
            for option in region_page.select(table_xpath + "//tr[2]//option"):
                first_level_id = option.select("@value").extract()[0]
                sub_page = HtmlXPathSelector(text=read_url(URL + first_level_id))
                for sub_option in sub_page.select(table_xpath + "//tr[3]//option"):
                    ids.append(sub_option.select("@value").extract()[0])

            i += 1
            # 80 is the hard-coded total used for the first-pass progress bar.
            print_progress(i, 80)

    print("Downloading locations hierarchy")
    i = 0
    data = {}
    # Duplicates are fetched only once, so progress must be measured against
    # the number of distinct ids, not the raw list length.
    unique_ids = set(ids)
    for location_id in unique_ids:
        page = HtmlXPathSelector(text=read_url(URL + location_id))
        for tr in page.select("//table[@class='list']/tr")[1:]:
            okato_id = tr.select(".//td[2]//b/text()").extract()[0].replace(' ', '')
            assert len(okato_id) == 8, "unexpected code length: %r" % okato_id
            data[okato_id] = tr.select("./td[3]/text()").extract()[0]

        i += 1
        print_progress(i, len(unique_ids))

    output_path = os.path.join(settings.PROJECT_PATH, 'data', 'locations.json')
    with open(output_path, 'w') as f:
        f.write(json.dumps(data, indent=4, ensure_ascii=False).encode('utf8'))
def handle(self, *args, **options):
    """Fetch results protocols of the ``cik`` organization for new locations.

    Every ``Location`` with a non-null ``tik`` that does not yet have a
    ``Protocol`` recorded for the ``cik`` organization is processed: its
    results page is downloaded, 23 numeric values are read from the results
    table and stored as fields ``p1`` .. ``p23`` of a verified ``Protocol``.
    Progress is reported with ``print_progress``.
    """
    from locations.models import Location
    from organizations.models import Organization
    from protocols.models import Protocol

    cik = Organization.objects.get(name='cik')
    content_type = ContentType.objects.get_for_model(Organization)

    # Locations which already have a protocol stored for this organization.
    locations_processed = Protocol.objects.filter(
        content_type=content_type, object_id=cik.id
    ).values_list('location', flat=True)

    uik_locations = Location.objects.exclude(tik=None)
    total = uik_locations.count()
    done = len(locations_processed)

    rows_xpath = ("//table[@width='100%' and @cellspacing='1' and "
                  "@cellpadding='2' and @bgcolor='#ffffff']//tr")
    # An alternative, purely positional XPath is kept here for reference:
    #   //body//table[3]//tr[4]//td//table[6]//tr

    for location in uik_locations.exclude(id__in=locations_processed):
        page = HtmlXPathSelector(text=read_url(location.results_url()))
        rows = page.select(rows_xpath)

        # The row at index 18 is dropped before the values are read.
        del rows[18]
        assert len(rows) == 23, "incorrect number of rows"

        defaults = dict(
            ('p' + str(number),
             int(rows[number - 1].select(".//b/text()").extract()[0]))
            for number in range(1, 24)
        )
        defaults['location'] = location
        defaults['verified'] = True

        Protocol.objects.get_or_create(
            content_type=content_type, object_id=cik.id,
            protocol_id=location.id, defaults=defaults)

        # Progress is reported before the counter is advanced.
        print_progress(done, total)
        done += 1
def handle(self, *args, **options):
    """Look up the ``liga`` organization and select the list page's pager links.

    NOTE(review): this command looks unfinished -- the organization, the
    content type and the selected links are not used for anything yet, and
    ``Location`` / ``Protocol`` are imported without being referenced.
    """
    from locations.models import Location
    from organizations.models import Organization
    from protocols.models import Protocol

    liga = Organization.objects.get(name='liga')
    content_type = ContentType.objects.get_for_model(Organization)

    # XPath positions are 1-based, so a "[0]" predicate can never match.
    # "(...)[1]" picks the first page_navigation block in document order.
    page_links = HtmlXPathSelector(text=read_url(LIST_URL)) \
        .select("(//div[@class='page_navigation'])[1]//a")
def handle(self, *args, **options):
    """Create ``Location`` records for foreign polling stations (UIKs).

    ``data/foreign_uiks.csv`` supplies, per line,
    ``uik_no,country_id,country_name,address``. The page at
    ``FOREIGN_UIKS_URL`` supplies, per ``<option>`` of the ``gs`` select, the
    UIK number (first four characters of the option text) and a URL whose
    ``root`` and ``tvd`` query parameters are stored on the new location.
    Options whose UIK number is not present in the CSV are printed and
    skipped.
    """
    from locations.models import FOREIGN_CODE, FOREIGN_NAME, Location

    uiks = {}
    csv_path = os.path.join(settings.PROJECT_PATH, 'data', 'foreign_uiks.csv')
    # Use a context manager so the file handle is closed deterministically.
    with open(csv_path, 'r') as csv_file:
        for line in csv_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # Split on the first three commas only: the address is the last
            # column and may itself contain commas.
            uik_no, country_id, country_name, address = line.split(',', 3)
            uiks[uik_no] = {'tik': int(country_id), 'address': address}

    countries_by_id = dict(
        (country.id, country)
        for country in Location.objects.exclude(region=None)
                                       .filter(tik=None)
                                       .filter(region_code=FOREIGN_CODE)
    )
    foreign_countries = Location.objects.get(region=None, region_code=FOREIGN_CODE)

    i = 0
    uik_options = HtmlXPathSelector(text=read_url(FOREIGN_UIKS_URL)) \
        .select("//select[@name='gs']//option")
    for uik_option in uik_options:
        uik_no = uik_option.select("text()").extract()[0].strip()[:4]
        if uik_no not in uiks:
            # Report station numbers the CSV knows nothing about.
            print(uik_no)
            continue
        uik = uiks[uik_no]

        url = uik_option.select("@value").extract()[0]
        for param in url.split('?')[1].split('&'):
            param_name, param_value = param.split('=', 1)
            if param_name in ('root', 'tvd'):
                uik[param_name] = int(param_value)

        location = Location(
            region=foreign_countries, tik=countries_by_id[uik['tik']],
            name=uik_no, region_name=FOREIGN_NAME, region_code=FOREIGN_CODE,
            address=uik['address'], tvd=uik['tvd'], root=uik['root'],
            data='{}')
        location.save()

        i += 1
        # 350 is the hard-coded total used for the progress bar.
        print_progress(i, 350)
def handle(self, *args, **options):
    """Import protocols from the ``nabludatel`` organization's XML export.

    Every child element of the XML root (``gnhq.info/export/protocols.xml``)
    is read as one protocol. Numeric fields ``p1`` .. ``p23``, the signing
    time, the number of complaints and the image URL are collected. The
    protocol is matched to a ``Location`` by region code and UIK name and is
    then created, or updated if it already exists, as a verified
    ``Protocol``. Protocols whose location cannot be found are reported on
    stdout and skipped; protocols without a ``media`` URL are skipped
    silently.

    Raises:
        ValueError: if a ``media`` element does not have exactly one child,
            or if a timestamp does not match ``%y-%m-%d %H:%M``.
    """
    from locations.models import Location
    from organizations.models import Organization
    from protocols.models import Protocol

    # Feed region codes that are rewritten before the Location lookup. Any
    # other code is used unchanged.
    region_remap = {75: 92, 41: 91, 59: 90}

    organization = Organization.objects.get(name='nabludatel')
    content_type = ContentType.objects.get_for_model(Organization)
    xml = fromstring(read_url('http://gnhq.info/export/protocols.xml', None))

    for protocol_xml in xml:
        data = {}
        fields = {}
        for field in protocol_xml:
            tag = field.tag
            if tag == 'id':
                data['id'] = int(field.text)
            elif tag == 'ncomp':
                data['complaints'] = int(field.text)
            elif tag == 'region':
                region = int(field.text)
                data['region'] = region_remap.get(region, region)
            elif tag == 'uik':
                data['uik'] = field.text
            elif tag == 'updt':
                stamp = field.text
                if stamp.startswith('11'):
                    # hack to fix GN bug: swap the year prefix '11' for '12'
                    # and keep the rest of the timestamp. Keeping only the
                    # first two characters would yield '1211', which cannot
                    # be parsed with the format below.
                    stamp = '12' + stamp[2:]
                data['sign_time'] = datetime.strptime(stamp, '%y-%m-%d %H:%M')
            elif tag == 'media':
                if len(field) != 1:
                    raise ValueError
                data['url'] = list(field)[0].text

            # Numeric protocol lines arrive as tags 'p1' .. 'p23'; any other
            # tag starting with 'p' is ignored.
            if tag.startswith('p'):
                try:
                    p_index = int(tag[1:])
                except ValueError:
                    continue
                if p_index < 1 or p_index > 23:
                    continue
                fields[tag] = int(field.text)

        # Try to get location
        try:
            location = Location.objects.get(region_code=data['region'],
                                            name=data['uik'])
        except Location.DoesNotExist:
            print("Failed to find location of protocol " + str(data['id']))
            continue

        if 'url' not in data:
            continue  # skip protocols without images

        fields.update({
            'url': data['url'],
            'location': location,
            'verified': True,
            'sign_time': data.get('sign_time'),
            'complaints': data.get('complaints'),
        })

        protocol, created = Protocol.objects.get_or_create(
            content_type=content_type, object_id=organization.id,
            protocol_id=data['id'], defaults=fields)
        if not created:
            # Already imported earlier: refresh it with the current feed data.
            for name, value in fields.items():
                setattr(protocol, name, value)
            protocol.save()