def schools(data, town):
    """Build a mapping of open data id -> ``School`` from the parsed rows.

    :param data: mapping of school type to an iterable of row entries; each
        entry is indexed with the ``constants.*_COL`` keys.
    :param town: township lookup handed to ``_township_for_name``.
    :return: dict keyed by ``OpenDataId`` holding the created ``School`` objects.
    :raises KeyError: if the same open data id occurs more than once.
    """
    found = {}
    for school_type, content in data.items():
        for entry in content:
            township = _township_for_name(town, entry[constants.TOWNSHIP_COL])
            open_data_id = OpenDataId(entry[constants.OPEN_DATA_ID_COL])
            if open_data_id in found:
                # Interpolate the id here: passing it as a second argument to
                # KeyError leaves the placeholder unformatted in the message.
                raise KeyError(
                    "Found duplicate open data id: %s" % (open_data_id,))
            found[open_data_id] = School(
                id=open_data_id,
                township_id=township.id,
                type=school_type,
                name=entry[constants.NAME_COL],
                name_lt=to_latin(entry[constants.NAME_COL]),
                address=entry[constants.ADDRESS_COL],
                address_lt=to_latin(entry[constants.ADDRESS_COL]),
                place=entry[constants.PLACE_COL],
                place_lt=to_latin(entry[constants.PLACE_COL]),
                postcode=entry[constants.POSTCODE_COL],
                website=entry[constants.WEBSITE_COL],
                phone=entry[constants.PHONE_COL],
                email=entry[constants.EMAIL_COL],
            )
    print("Found %d schools" % len(found))
    return found
def data_importer_of_municipality_prijepolje(self):
    """Re-import the expenditure ("rashodi") rows for Prijepolje from CSV.

    Removes the municipality's existing expenditure documents from
    ``db.opstine`` and then inserts one document per qualifying row of
    ``data/rashodi/prijepolje.csv``. Each document is tagged with the program
    and sub-program headings that were last seen in column 2 of the file.
    """
    db.opstine.remove({"opstina.latinica": "Prijepolje", "tipPodataka.slug": "rashodi"})

    # Fetch the category table once instead of rebuilding it for every row.
    program_categories = utils.program_categories_for_prijepolje()
    blank = ["", " "]
    program = ""
    subprogram = ""

    # Read data from the Prijepolje CSV file; the context manager closes the
    # handle even when a row fails to import.
    with open("data/rashodi/prijepolje.csv", "r") as csv_file:
        data_handler = reader(csv_file, delimiter=",")
        for index, row in enumerate(data_handler):
            if index == 0:
                # The first row is skipped (presumably the column header).
                continue

            # A row with an empty code column whose text is a known program
            # name starts a new program section.
            if row[1] in blank and row[2] not in blank and row[2].strip() in program_categories:
                program = row[2].strip()

            # Within a program, a row whose text is one of its sub-programs
            # starts a new sub-program section.
            if program != "" and row[2].strip() in program_categories[program]:
                subprogram = row[2].strip()

            if row[1] not in blank and len(row[1]) > 2 and program not in blank and subprogram not in blank:
                json_doc = self.build_mongo_document_structure_for_prihodi_rashodi(
                    "Пријепоље",
                    row[1],
                    row[2],
                    row[3],
                    row[4],
                    row[5],
                    row[6],
                    None
                )
                json_doc["program"] = {}
                json_doc["program"]["cirilica"] = program.strip()
                json_doc["program"]["latinica"] = cyrtranslit.to_latin(program, "sr")
                json_doc["potProgram"] = {}
                json_doc["potProgram"]["cirilica"] = subprogram.strip()
                json_doc["potProgram"]["latinica"] = cyrtranslit.to_latin(subprogram, "sr")
                db.opstine.insert(json_doc)
                # Parenthesised single-argument print behaves the same on
                # Python 2 and Python 3.
                print("Opstine: %s - Program: %s %s" % ("Пријепоље", program, row[1]))
def build_docs(self, row):
    """Build the document list (a single document) for one input row.

    :param row: sequence of strings; columns 0, 1, 2 and 9 are read as the
        sub-region name, activity id, activity description and cost.
    :return: a one-element list holding the document dict.
    """
    # Strip thousands separators; a blank cost becomes the integer 0.
    # NOTE(review): a non-blank cost is left as a string here - confirm that
    # consumers of 'cost' expect that.
    cost = row[9].replace(',', '')
    if not cost.strip():
        cost = 0

    region = {
        'name': self.get_region(),
        'slug': slugify(self.get_region(), to_lower=True),
    }
    region['subregion'] = {
        'name': cyrtranslit.to_latin(row[0]),
        'slug': cyrtranslit.to_latin(slugify(row[0], to_lower=True)),
    }
    activity = {
        'id': int(row[1]),
        'description': cyrtranslit.to_latin(row[2]),
    }
    dataset = {
        'name': self.get_dataset(),
        'slug': slugify(self.get_dataset(), to_lower=True),
    }
    doc = {
        'region': region,
        'activity': activity,
        'dataset': dataset,
        'cost': cost,
        'year': 2010,
    }

    # Progress feedback for whoever is running the import.
    summary = (
        activity['id'],
        activity['description'],
        doc['cost'],
        region['name'],
        doc['year'],
    )
    print('%s - %s: %s (%s %i)' % summary)

    return [doc]
def gen_email_addr(frm):
    """Generate a pseudo-random sender address for the organisation name *frm*.

    The local part is picked at random from a fixed word list. The domain
    depends on which keyword occurs in *frm*:

    * court ("суд"/"Суд"): lower-cased initials of the transliterated words,
      followed by ``.court.gov.ru``;
    * tax office ("инспекция"/"ИФНС"): ``ifns`` plus the last two characters
      of *frm*, followed by ``.gov.ru``;
    * bank ("банк"/"Банк"): the transliterated name without a leading
      ``bank`` and without spaces, plus a random top-level domain;
    * anything else: the transliterated name without spaces, plus a random
      top-level domain.
    """
    local_parts = [
        'info', 'process', 'warning', 'vzyskanie', 'shtraf', 'dolg', 'alarm',
        'zapros', 'request', 'tax', 'nedoimka', 'uvedomlenie'
    ]
    top_level_domains = ['com', 'net', 'ru', 'org']

    address = random.choice(local_parts).rstrip()
    unquoted = frm.replace('"', '')

    def mentions(*needles):
        # True when any of the keywords occurs in the original name.
        return any(needle in frm for needle in needles)

    if mentions("суд", "Суд"):
        words = cyrtranslit.to_latin(unquoted, 'ru').replace("'", '').split(' ')
        initials = ''.join(word[:1].lower() for word in words)
        return address + "@" + initials.rstrip() + ".court.gov.ru"

    if mentions("инспекция", "ИФНС"):
        return address + "@ifns" + frm.rstrip()[-2:] + ".gov.ru"

    latin = cyrtranslit.to_latin(unquoted, 'ru').lower().replace("'", '')
    if mentions("банк", "Банк") and latin[:4] == "bank":
        # Drop the leading "bank" so it is not repeated in the domain.
        latin = latin[4:]
    host = latin.replace(' ', '').rstrip()
    return address + "@" + host + "." + random.choice(top_level_domains)
def autor_pesme(a, datoteka):
    """Write the Latin transliteration of an author's song lines to a file.

    Walks ``myroot/<author>/<album>/<song>`` for every entry of the
    module-level ``authors`` whose path ends with *a*. Each line of each song
    is transliterated, printed and written to *datoteka*. The last element
    of the newline split is dropped, which is the empty string after a
    trailing newline.

    :param a: author name (path suffix) to select.
    :param datoteka: path of the output file; it is overwritten.
    """
    with open(datoteka, 'w') as out:
        for author in authors:
            curr_path = '{}/{}'.format(myroot, author)
            if not curr_path.endswith('{}'.format(a)):
                continue

            albums = [
                entry for entry in listdir(curr_path)
                if not isfile(join(curr_path, entry))
            ]
            for album in albums:
                album_path = '{}/{}'.format(curr_path, album)
                songs = [
                    f for f in listdir(album_path)
                    if isfile(join(album_path, f))
                ]
                for song in songs:
                    # Skip files starting with '.DS_S' (e.g. .DS_Store).
                    if song.startswith('.DS_S'):
                        continue
                    song_path = '{}/{}'.format(album_path, song)
                    # Close each song file as soon as it has been read
                    # instead of leaking one handle per song.
                    with open(song_path, encoding="utf8",
                              errors='ignore') as song_file:
                        verses = song_file.read().split('\n')[:-1]
                    for stih in verses:
                        latin = cyrtranslit.to_latin('{}'.format(stih))
                        print(latin)
                        out.write(latin + '\n')
def utmnamecreate(country, city, countvvod, utm_campaign, utm_term, utm_content):
    """Assemble a transliterated URL with UTM parameters for *city*.

    The base URL is the transliterated name of the last ``Urlname`` record.
    The five ``utm_*`` values come from the row of the module-level table
    ``k[country]`` whose 'Название' column equals *city*.

    When *countvvod* is 1, 2 or 3, that many "введите...&" placeholders in
    the assembled string are replaced, in order, with *utm_campaign*,
    *utm_term* and *utm_content*. Finally the segments matching
    ``[^&]*нет`` are handed to ``deleteNO`` and the result is transliterated
    with spaces removed.

    :return: the final URL string.
    """
    base_url = cyrtranslit.to_latin(
        str(Urlname.objects.last().name), 'ru').replace(" ", "")

    # First row of k[country] holds the column names, the rest is data.
    table = pd.DataFrame(k[country][1:], columns=k[country][0])
    city_rows = table['Название'] == city
    print(table.loc[city_rows, 'utm_source'])

    pairs = []
    for column in ('utm_source', 'utm_medium', 'utm_campaign', 'utm_term',
                   'utm_content'):
        pairs.append(column + '=' + str(table.loc[city_rows, column].item()))
    # The trailing '&' lets the placeholder pattern below match the last
    # parameter as well; it is cut off again before the final clean-up.
    utmname = base_url + '?' + '&'.join(pairs) + '&'

    if countvvod == 1:
        substitutions = (utm_campaign,)
    elif countvvod == 2:
        substitutions = (utm_campaign, utm_term)
    elif countvvod == 3:
        substitutions = (utm_campaign, utm_term, utm_content)
    else:
        substitutions = ()

    filled = utmname
    if substitutions:
        placeholders = re.findall(r'введите.*?[&]', utmname)
        for position, value in enumerate(substitutions):
            filled = filled.replace(placeholders[position], value + '&')

    trimmed = filled[:-1]
    itogfirst = deleteNO(trimmed, re.findall(r'[^&]*нет', trimmed))
    return cyrtranslit.to_latin(itogfirst, 'ru').replace(" ", "")
def test_alphabet_transliteration_cyrillic_to_latin(self):
    ''' The whole Macedonian cyrillic alphabet maps onto the latin one.
    '''
    actual = cyrtranslit.to_latin(macedonian_alphabet_cyrillic, lang_code='mk')
    expected = macedonian_alphabet_latin

    self.assertEqual(actual, expected)
def cyr2latTranslate(src_dir, dest_dir):
    """Mirror *src_dir* into *dest_dir*, transliterating selected files.

    Directories are processed recursively. Files whose extension (without
    the dot) is in the module-level ``extensionList`` are read as UTF-8,
    transliterated from Serbian Cyrillic to Latin and, for ``rst`` files,
    passed through ``title_fix``. All other files are copied unchanged.

    :param src_dir: directory to read from.
    :param dest_dir: directory to write to; created when missing.
    """
    os.makedirs(dest_dir, exist_ok=True)

    for item in os.listdir(src_dir):
        source = os.path.join(src_dir, item)
        target = os.path.join(dest_dir, item)

        if os.path.isdir(source):
            cyr2latTranslate(source, target)
            continue

        extension = os.path.splitext(item)[1][1:]
        if extension not in extensionList:
            # Plain byte copy: no need to decode files we do not change,
            # which also keeps binary files from raising UnicodeDecodeError.
            shutil.copyfile(source, target)
            continue

        # Context managers make sure neither handle is leaked.
        with open(source, encoding="utf8") as source_file:
            content = source_file.read()

        new_content = cyrtranslit.to_latin(content, 'sr')
        if extension == "rst":
            new_content = title_fix(new_content)

        with open(target, "w", encoding="utf8") as target_file:
            target_file.write(new_content)
def save(self, commit=True):
    """Create the user with a username derived from the full name.

    The full name is transliterated to Latin, '#' and apostrophes are
    removed, and the username is built as the lower-cased initials of the
    second and third words followed by the lower-cased first word. If a
    user with that name already exists (case-insensitive), a random number
    below 100 is appended.

    :param commit: when true, the user and the profile fields
        ``birth_date``, ``full_name`` and ``user_type`` are saved.
    :return: the (possibly unsaved) user instance.
    """
    user = super(CreateUserForm, self).save(commit=False)
    # NOTE(review): every account is created with the same literal
    # password - confirm this is intended.
    user.set_password("password")
    user.is_active = True

    full_name = self.cleaned_data.get('full_name')
    full_name_en = cyrtranslit.to_latin(full_name, 'ru')
    fio = full_name_en.replace('#', '').replace("'", "").split()

    # Slicing instead of indexing: a single-word name used to raise
    # IndexError; it now simply gets no initials in front.
    initials = ''.join(part[0].lower() for part in fio[1:3])
    username = initials + fio[0].lower()

    # NOTE(review): a single random suffix can still collide with an
    # existing username.
    if User.objects.filter(username__iexact=username).exists():
        username += str(randrange(100))
    user.username = username

    if commit:
        user.save()
        user.profile.birth_date = self.cleaned_data.get("birth_date")
        user.profile.full_name = self.cleaned_data.get("full_name")
        user.profile.user_type = self.cleaned_data.get("user_type")
        user.profile.save()
    return user
def save(self, *args, **kwargs):
    """Set ``slug`` from the transliterated ``option_name``, then save."""
    self.slug = slugify(cyrtranslit.to_latin(self.option_name, 'ru'))
    super(Option, self).save(*args, **kwargs)
def clean(txt, tagger, le):
    """Normalise a tweet and return its lemmatised words joined by spaces.

    Mentions, links, emojis, '#', punctuation and the 'RT ' marker are
    removed and underscores become spaces. Each remaining word is
    lower-cased and transliterated to Latin. Words with a truthy entry in
    ``mapping`` are looked up in the ``WORD_MODEL`` file by line number,
    tagged with *tagger* and lemmatised; all other words are dropped.

    :param txt: raw tweet text.
    :param tagger: model whose ``predict`` receives a list with one vector.
    :param le: encoder whose ``inverse_transform`` decodes the prediction.
    :return: the lemmas joined with single spaces ('' when there are none).
    """
    text = re.sub("@[A-Za-z0-9_-]+", "", txt)  # mentions
    text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)  # links
    text = " ".join(text.split())  # collapse whitespace
    text = ''.join(ch for ch in text if ch not in emoji.UNICODE_EMOJI)
    text = text.replace("#", "")  # hashtag markers
    text = re.sub(r'[^\w\s]', '', text)  # punctuation
    text = text.replace("_", " ").replace('RT ', '')  # retweet marker

    # Transliterate to latin script.
    words = [cyrtranslit.to_latin(token.lower()) for token in text.split()]

    lemmas = []
    for word in words:
        line_number = mapping.get(word)
        if not line_number:
            continue
        # The model line holds the word followed by its vector components.
        fields = linecache.getline(WORD_MODEL, line_number).strip().split()
        vector = np.array([float(value) for value in fields[1:]])
        tag = le.inverse_transform(tagger.predict([vector]))[0]
        lemmas.append(lemmatize(word, tag))

    return ' '.join(lemmas)
def test_mix_characters(self):
    ''' In mixed input only the Serbian cyrillic characters are
        transliterated; the expected value is the all-latin fixture.
    '''
    actual = cyrtranslit.to_latin(mix_characters_some_cyrillic)

    self.assertEqual(actual, mix_characters_all_latin)
def main():
    """Regenerate the Latin localization and zip both script variants.

    The Latin directory is recreated from scratch, every file found
    directly in the Cyrillic directory is transliterated into it, and one
    ``<code>-<VERSION>`` zip archive per variant is written next to this
    script.
    """
    script_dir = Path(__file__).resolve().parent
    loc_root = Path(get_loc_dir_path())
    source_dir = loc_root / CYRILLIC_CODE
    target_dir = loc_root / LATIN_CODE

    shutil.rmtree(target_dir, ignore_errors=True)
    os.makedirs(target_dir)

    # Only the top level of the Cyrillic directory is processed.
    file_names = next(os.walk(source_dir))[2]
    for name in file_names:
        with open(source_dir / name, 'r', encoding='utf-8') as src, open(
                target_dir / name, 'w', encoding='utf-8') as dst:
            dst.write(to_latin(src.read()))

    for code, folder in ((CYRILLIC_CODE, source_dir), (LATIN_CODE, target_dir)):
        archive_base = script_dir / f'{code}-{VERSION}'
        os.chdir(folder)
        shutil.make_archive(archive_base, 'zip', folder)
def test_alphabet_transliteration_cyrillic_to_latin(self):
    ''' The whole Bulgarian cyrillic alphabet maps onto the latin one.
    '''
    actual = cyrtranslit.to_latin(bulgarian_alphabet_cyrillic, lang_code='bg')
    expected = bulgarian_alphabet_latin

    self.assertEqual(actual, expected)
def get(self, request, *args, **kwargs):
    """Serve a user's competition badge as a file download.

    Access is granted to an authenticated user who is an admin, is the
    user identified by ``kwargs['pk']``, or whose id equals
    ``competition.created_by``. Everything else, including any failure
    while building the response, ends in ``Http404``.

    :raises Http404: when access is denied or the badge cannot be served.
    """
    activate_language(request.session)
    competition = Competition.objects.get(pk=kwargs['comp'])

    allowed = request.user.is_authenticated and (
        request.user.is_admin
        or request.user.id == int(kwargs['pk'])
        or competition.created_by == request.user.id)

    if allowed:
        try:
            user = User.objects.get(pk=kwargs['pk'])
            badge_path = getBadge(user, competition)

            # guess_type() returns a (type, encoding) tuple; only the type
            # belongs in the Content-Type header.
            content_type = (mimetypes.guess_type(badge_path)[0]
                            or 'application/octet-stream')
            size = os.stat(badge_path).st_size
            owner = cyrtranslit.to_latin(
                remove_ukrainian(user.get_full_name()), 'ru')

            response = HttpResponse(
                FileWrapper(open(badge_path, 'rb')),
                content_type=content_type)
            response['X-Sendfile'] = badge_path
            response['Content-Length'] = size
            response['Content-Disposition'] = 'attachment; filename={}'.format(
                '{}\'s_badge.png'.format(owner))
            return response
        except Exception:
            # Deliberate best effort: any problem is reported to the
            # client as "not found" below.
            pass

    raise Http404
def test_special_diacritic_characters(self):
    ''' Diacritic characters pass through the 'tj' transliteration
        unchanged.
    '''
    actual = cyrtranslit.to_latin(diacritic_chars, lang_code='tj')

    self.assertEqual(actual, diacritic_chars)
def save_key_frames(video):
    """
    Saves the key frames of *video* as JPEG files and returns their names.

    The output prefix is the part of the file name between the first "_"
    and the first ".", transliterated to Latin. Frame numbers come from
    get_key_frames() with its offset added; every key frame is saved as
    "<prefix>_frame<N>.jpg", numbered from 1.

    :param video: path of the video file; the name must contain "_".
    :return: list of the image file names, in frame order.
    """
    outfile = video.split(".")[0]
    outfile = outfile.split("_")[1]
    outfile = cyrtranslit.to_latin(outfile, 'ru')

    all_frames, offset = get_key_frames(video)
    frames = [el + offset for el in all_frames]
    print(frames)

    filenames = []
    # Open the video once and seek for each frame, instead of creating a
    # new capture per frame that is never released.
    capture = cv2.VideoCapture(video)
    try:
        for count, f in enumerate(frames, start=1):
            # Property id 1 is the zero-based index of the next frame.
            capture.set(1, f - 1)
            _ret, frame = capture.read()
            filename = outfile + "_frame" + str(count) + ".jpg"
            cv2.imwrite(filename, frame)
            filenames.append(filename)
    finally:
        capture.release()
    return filenames
def to_eng(line):
    """Transliterate *line* from Russian and strip punctuation and spacing.

    Removed characters: ``! . : ' ? -`` comma, space and newline.
    """
    dropped = {'!', '.', ':', "'", '?', ' ', '-', ',', '\n'}
    latin = cyrtranslit.to_latin(line, 'ru')
    kept = [ch for ch in latin if ch not in dropped]
    return "".join(kept)
def townships(data, reg):
    """Collect the distinct townships found in *data*.

    Township ids are assigned as 1..N following the sorted order of the
    distinct township names.

    :param data: mapping whose values are iterables of row entries indexed
        with ``constants.TOWNSHIP_COL`` and ``constants.REGION_COL``.
    :param reg: region lookup handed to ``_region_for_name``.
    :return: dict of ``TownshipId`` -> ``Township``.
    :raises ValueError: if one township name appears under two regions.
    """
    unique_names = {
        entry[constants.TOWNSHIP_COL]
        for content in data.values()
        for entry in content
    }
    ids_by_name = {
        name: TownshipId(number)
        for number, name in enumerate(sorted(unique_names), start=1)
    }

    found = {}
    for content in data.values():
        for entry in content:
            township_name = entry[constants.TOWNSHIP_COL]
            township_id = ids_by_name[township_name]
            region = _region_for_name(reg, entry[constants.REGION_COL])

            if township_id not in found:
                found[township_id] = Township(
                    id=township_id,
                    name=township_name,
                    name_lt=to_latin(township_name),
                    region_id=region.id)
                continue

            # Seen before: it has to belong to the same region.
            known = found[township_id]
            if known.region_id != region.id:
                raise ValueError(
                    "Found same township for different regions: %s -> (%d, %d)"
                    % (township_name, known.region_id, region.id))

    print("Found %d townships" % len(found))
    return found
def post_process(word, language):
    """
    Apply the post-processing steps configured for *language* to *word*.

    In order, each step runs only if the language is configured for it:
    cyrillic transliteration, generic transliteration, accent removal,
    stopword removal, removal of the "al" part in hyphenated words, and
    whitespace elimination.

    :param word: text to process
    :param language: language code used for the config lookups
    :return: the processed text
    """
    by_property = config.get_languages_by_property
    having = config.get_languages_if_property_exists

    if language in by_property("transliterate", "cyr"):
        word = cyrtranslit.to_latin(word, language).casefold()

    if language in by_property("transliterate", "trans"):
        word = translit(word, language, reversed=True).casefold()

    if language in having("accents"):
        word = unidecode.unidecode(word).casefold()

    if language in having("stopwords"):
        kept = [
            token for token in word.casefold().split()
            if token not in config.get_language(language)["stopwords"]
        ]
        word = " ".join(kept)

    if language in having("stopwords-arabic"):
        parts = [part for part in word.casefold().split("-") if part != "al"]
        word = " ".join(parts)

    if language in having("space-elimination"):
        word = "".join(word.split())

    return word
def read_file(fileName, rangeEnd, columnName, columnYear, collabGraph, sumOfWorks, professorsDict):
    """Accumulate paper counts and co-authorship counts from a workbook.

    Rows 2 .. rangeEnd-1 of the active sheet are read. Only papers from
    2000 to 2016 are counted; a year that cannot be parsed counts as 0.
    The author cell is transliterated, stripped of '{', '}' and '"' and
    split on commas; only authors present in *professorsDict* are kept.

    :param fileName: path of the .xlsx file.
    :param rangeEnd: exclusive upper bound of the row numbers to read.
    :param columnName: column letter of the authors cell.
    :param columnYear: column letter of the year cell.
    :param collabGraph: updated in place; ``collabGraph[m][n]`` is
        incremented for each pair of kept authors, in list order.
    :param sumOfWorks: updated in place; incremented once per kept author.
    :param professorsDict: maps an author name to a dict with an 'id' key.
    """
    workbook = openpyxl.load_workbook(os.path.join(fileName))
    sheet = workbook.active
    print(fileName + ' file reading...')

    for row_number in range(2, rangeEnd):
        cell_suffix = str(row_number)
        authors = cyrtranslit.to_latin(sheet[columnName + cell_suffix].value)

        try:
            year = int(sheet[columnYear + cell_suffix].value)
        except (TypeError, ValueError):
            year = 0
        if not 2000 <= year <= 2016:
            continue

        for unwanted in '{}"':
            authors = authors.replace(unwanted, '')
        authors = authors.split(',')

        if 'Sanja Delčev' in authors:
            print(authors)

        # Keep only the authors we have a record for.
        authors = [name for name in authors if name in professorsDict]

        for position, first in enumerate(authors):
            m = professorsDict[first]['id']
            sumOfWorks[m] += 1
            for second in authors[position + 1:]:
                n = professorsDict[second]['id']
                collabGraph[m][n] = collabGraph[m][n] + 1
def test_numerical_characters(self):
    ''' Numerical characters pass through the 'tj' transliteration
        unchanged.
    '''
    actual = cyrtranslit.to_latin(numerical_chars, lang_code='tj')

    self.assertEqual(actual, numerical_chars)
def delete_badge(user, competition):
    """Delete the user's badge image for *competition*, if possible.

    The file is ``<BASE_DIR>/media/badges/<competition.id>/<name>_badge.png``
    where ``<name>`` is the transliterated full name of *user*. Failures,
    for example a missing file, are ignored on purpose.
    """
    try:
        owner = cyrtranslit.to_latin(
            remove_ukrainian(user.get_full_name()), 'ru')
        # os.path.join with str() works whether BASE_DIR is a str or a
        # Path; plain '+' concatenation fails for a Path.
        badge_path = os.path.join(
            str(settings.BASE_DIR), 'media', 'badges',
            str(competition.id), '{}_badge.png'.format(owner))
        os.remove(badge_path)
    except Exception:
        # Best effort only. Unlike a bare "except:", this does not swallow
        # KeyboardInterrupt or SystemExit.
        pass
def test_alphabet_transliteration(self):
    ''' The whole Serbian cyrillic alphabet maps onto the latin one.
    '''
    actual = cyrtranslit.to_latin(serbian_alphabet_cyrillic)
    expected = serbian_alphabet_latin

    self.assertEqual(actual, expected)
def clean_lineinst(line):
    """Parse an ``id,username,full name`` record and normalise the name.

    The full name is stripped of every character that is not a Latin or
    Cyrillic (а-я, А-Я) letter, digit or whitespace, lower-cased,
    transliterated from Russian and freed of apostrophes. The groups are
    greedy, so the username takes everything up to the last comma.

    :param line: one input record.
    :return: tuple ``(uid, uname, fname)`` of strings.
    :raises AttributeError: if *line* does not match the record pattern.
    """
    # Raw strings: "\d" and "\s" in a normal string literal are invalid
    # escape sequences and trigger a warning on current Python versions.
    record = re.compile(r"(\d+),(.*),(.*)")
    # NOTE(review): the range а-я does not include "ё"/"Ё", so those
    # letters are removed as well - confirm that this is intended.
    unwanted = re.compile(r'[^a-zA-Zа-яА-Я\d\s]+')

    uid, uname, fname = record.match(line).groups()
    fname = unwanted.sub('', fname).strip().lower()
    fname = cyrtranslit.to_latin(fname, 'ru').replace("'", '')
    return (uid, uname, fname)
def get_translated_title(language_abbreviation, title):
    """Return the title together with its translation.

    When *language_abbreviation* is "en", the Latin transliteration of the
    translation is appended too, unless it equals the original title.

    :return: list ``[title, translation]`` or
        ``[title, translation, transliteration]``.
    """
    translation = translate(language_abbreviation, title)["res"]
    titles = [title, translation]

    if language_abbreviation == "en":
        latin = cyrtranslit.to_latin(translation)
        if latin != title:
            titles.append(latin)

    return titles
def data_importer_of_municipality_sombor(self):
    """Re-import the expenditure ("rashodi") rows for Sombor from CSV.

    Removes the municipality's existing expenditure documents from
    ``db.opstine`` and then inserts one document per qualifying row of
    ``data/rashodi/sombor.csv``. Each document is tagged with the program
    and sub-program headings that were last seen in column 2 of the file.
    """
    # Remove previous records in the database, if there are any for this
    # municipality.
    db.opstine.remove({"opstina.latinica": "Sombor", "tipPodataka.slug": "rashodi"})

    # Program categories are used to assign rows to programs/sub-programs.
    program_categories = utils.sombor_programs()
    blank = ["", " "]
    program = ''
    subprogram = ''

    # The context manager closes the CSV file even when a row fails to
    # import; the original handle was never closed.
    with open("data/rashodi/sombor.csv", "r") as csv_file:
        data_handler = reader(csv_file, delimiter=",")

        for index, row in enumerate(data_handler):
            if index <= 4:
                # The first five rows are skipped.
                continue

            # Track the current program and sub-program headings.
            if row[2] not in blank:
                if row[2].strip() in program_categories:
                    program = row[2].strip()

                if program != "" and row[2].strip() in program_categories[program]:
                    subprogram = row[2].strip()

            if row[1] not in blank and program not in blank and subprogram not in blank and len(row[1]) < 4:
                json_doc = self.build_mongo_document_structure_for_prihodi_rashodi(
                    "Сомбор",
                    row[1],
                    row[2].replace("*", ""),
                    row[3],
                    row[4],
                    row[5],
                    row[6],
                    None
                )

                # Add program and sub-program after building the main
                # mongo document.
                json_doc["program"] = {}
                json_doc["program"]["cirilica"] = program.strip()
                json_doc["program"]["latinica"] = cyrtranslit.to_latin(program, "sr")
                json_doc["potProgram"] = {}
                json_doc["potProgram"]["cirilica"] = subprogram.strip()
                json_doc["potProgram"]["latinica"] = cyrtranslit.to_latin(subprogram, "sr")
                db.opstine.insert(json_doc)
                # Parenthesised single-argument print behaves the same on
                # Python 2 and Python 3.
                print("Opstine: %s - Program: %s %s" % ("Сомбор", program, row[1]))
def c2l():
    """Show the Latin transliteration of the entry text in a new window.

    Reads the text from ``Frame.m``, transliterates it from Russian,
    prints it, copies it to the clipboard and displays it in a label.
    """
    source_text = Frame.m.get()

    window = tk.Tk()
    window.title('Result(Cyrillic2Latin)')

    result = cyrtranslit.to_latin(source_text, 'ru')
    print(result)
    pyperclip.copy(result)

    tk.Label(window, text=result, font=16).pack(fill="x")
def get_translated_keywords(language_abbreviation, keywords):
    """
    Merge each keyword with its translation and a Latin transliteration.

    For every keyword three entries are produced: the keyword itself, its
    translation, and a cyrillic-to-latin transliteration. The translation
    is transliterated when the language is 'en', otherwise the keyword.

    :param language_abbreviation: eg. 'en', 'sr'.
    :param keywords: comma-separated string, eg. "epidemic,flu".
    :return: flat list with three entries per keyword.
    """
    merged = []
    for keyword in keywords.split(","):
        translation = translate(language_abbreviation, keyword)["res"]
        if language_abbreviation == "en":
            to_transliterate = translation
        else:
            to_transliterate = keyword
        merged += [keyword, translation, cyrtranslit.to_latin(to_transliterate)]
    return merged
def translate_word(word, lang, src='en'):
    """Translate *word* into *lang*.

    :param word: text to translate.
    :param lang: destination language code.
    :param src: source language code, or 'auto' to leave the source
        language to the translator. Defaults to English.
    :return: the translated text; for ``lang == 'sr'`` a tuple
        ``(translated, translated_latin)`` with the Latin transliteration
        as second element.
    """
    if src == 'auto':
        translated_word = translator.translate(word, dest=lang).text
    else:
        # Pass the caller's source language through; it used to be
        # hard-coded to 'en', which made the parameter ineffective.
        translated_word = translator.translate(word, dest=lang, src=src).text

    if lang == 'sr':
        # Serbian results also get a Latin-script version.
        translated_word_lat = cyrtranslit.to_latin(translated_word)
        return translated_word, translated_word_lat
    return translated_word
def clean_linevk(line):
    """Parse an ``id,username,name1,name2`` record and normalise the name.

    Both name parts are stripped of every character that is not a Latin
    or Cyrillic (а-я, А-Я) letter, digit or whitespace, lower-cased and
    joined with a space. The result is transliterated from Russian and
    freed of apostrophes.

    :param line: one input record.
    :return: tuple ``(uid, uname, fname)`` of strings.
    :raises ValueError: if *line* does not match the record pattern; the
        offending line is printed first.
    """
    # Raw strings: "\d" and "\s" in a normal string literal are invalid
    # escape sequences and trigger a warning on current Python versions.
    record = re.compile(r"(\d+),(.*),(.*),(.*)")
    unwanted = re.compile(r'[^a-zA-Zа-яА-Я\d\s]+')

    match = record.match(line)
    if match is None:
        print(line)
        # Fail with a clear error; the old code carried on after printing
        # and crashed on an unbound local variable instead.
        raise ValueError(
            "line does not match 'id,username,name1,name2': %r" % (line,))

    uid, uname, name1, name2 = match.groups()
    name1 = unwanted.sub('', name1).strip().lower()
    name2 = unwanted.sub('', name2).strip().lower()
    fname = name1 + ' ' + name2
    fname = cyrtranslit.to_latin(fname, 'ru').replace("'", '')
    return (uid, uname, fname)
def create_file_name(fio_ru):
    """Build a ``Surname_I`` / ``Surname_I_P`` file name from a full name.

    The name is transliterated from Russian with 'J'/'j' replaced by
    'Y'/'y' and apostrophes removed. The result is the capitalised first
    word, the initial of the second word and, only when the name has
    exactly three words, the initial of the third word.

    :param fio_ru: full name with at least two words.
    :return: the file name stem.
    """
    latin = cy.to_latin(fio_ru, 'ru')
    latin = latin.replace('J', 'Y').replace("'", "").replace('j', 'y')
    parts = latin.split()

    name_initial = parts[1][0]
    patronymic_part = '_' + parts[2][0] if len(parts) == 3 else ''

    return str(parts[0]).capitalize() + '_' + name_initial + patronymic_part
def translite(file_in, file_out, ru=''):
    """Transliterate *file_in* line by line into *file_out*.

    :param file_in: path of the text file to read.
    :param file_out: path of the text file to write; it is overwritten.
    :param ru: when truthy, convert Russian Cyrillic to Latin; otherwise
        convert to Cyrillic with cyrtranslit's default language.
    """
    # Both files are managed by the "with" statement, so the output is
    # closed (and flushed) even if reading or transliterating fails.
    # NOTE(review): both files use the platform default encoding -
    # confirm that this is what the inputs need.
    with open(file_in) as source, open(file_out, "w") as target:
        for line in source:
            if ru:
                target.write(cyrtranslit.to_latin(line, 'ru'))
            else:
                target.write(cyrtranslit.to_cyrillic(line))
def import_data_parliament_2007(self):
    """Import the results file of the 2007 parliamentary election.

    The first row of the CSV supplies the party/list names (columns 12 and
    up). Every later row yields one document per party column, carrying
    the row's ballot counts, its territory level ('instanca') and that
    party's votes. Documents are inserted into ``db[collection]`` in
    batches.
    """
    election_type = 'parlamentarni'
    year = 2007
    self.prep_import(election_type, year, None, None)
    file_path = self.get_data_file_path(election_type, year, None, None)

    row_count = 0
    docs = []
    candidates_or_parties = {}
    parent_territory = ''

    with open(file_path, 'rb') as f:
        reader = csv.reader(f)

        for row in tqdm(reader):
            doc = {}

            if row_count == 0:
                # Header row: collect all the candidates/parties.
                for i in range(12, len(row)):
                    candidates_or_parties[str(i)] = row[i].replace('\n', '')
            else:
                territory = row[2].strip()
                territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True)
                polling_station_num = int(row[3].strip())
                polling_station_address = row[4].strip()
                ballots_received_count = int(row[5].strip())
                unused_ballots_count = int(row[6].strip())
                number_of_voters_registered = int(row[7].strip())
                voters_who_voted_count = int(row[8].strip())
                ballots_in_ballot_box_count = int(row[9].strip())
                invalid_ballots_count = int(row[10].strip())
                valid_ballots_count = int(row[11].strip())

                doc['brojPrimljeniGlasackiListica'] = ballots_received_count
                doc['brojNeupotrebljenihGlasackiListica'] = unused_ballots_count
                doc['brojUpisanihBiracaUBirackiSpisak'] = number_of_voters_registered
                doc['nevazeciGlasackiListici'] = invalid_ballots_count
                doc['biraciKojiSuGlasali'] = {}
                doc['biraciKojiSuGlasali']['broj'] = voters_who_voted_count
                doc['brojGlasackihListicaUKutiji'] = {}
                doc['brojGlasackihListicaUKutiji']['broj'] = ballots_in_ballot_box_count
                doc['vazeciGlasackiListici'] = {}
                doc['vazeciGlasackiListici']['broj'] = valid_ballots_count
                doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                doc['godina'] = int(year)

                # Some rows consist of territory grouping.
                # We need to track those.
                # Compare with == / != : "is" tests object identity, which
                # is not a reliable way to compare against a literal.
                # NOTE(review): polling_station_num is always an int here
                # (see int(...) above), so the comparisons with '' can
                # never be true - confirm the intended handling of rows
                # without a polling station number.
                if cyrtranslit.to_latin(territory, 'sr').isupper():
                    doc['instanca'] = 1

                elif 'okrug' in territory_slug \
                        or territory_slug in ['grad-beograd', 'inostranstvo'] \
                        or territory_slug == 'zavodi-za-izvrsenje-zavodskih-sankcija' and polling_station_num == '':
                    doc['instanca'] = 2
                    parent_territory = territory

                elif polling_station_num == '':
                    doc['instanca'] = 3
                    doc['parentTeritorija'] = parent_territory
                    doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)

                elif polling_station_num != '':
                    doc['instanca'] = 4
                    doc['parentTeritorija'] = parent_territory
                    doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)
                    doc['brojBirackogMesta'] = polling_station_num
                    doc['adresaBirackogMesta'] = polling_station_address

                total_votes = 0
                udeo = 0
                for j in range(12, len(row)):
                    votes = int(row[j])
                    doc['rezultat'] = {}
                    doc['rezultat']['glasova'] = votes
                    # NOTE(review): the share is computed against the
                    # running total of the columns seen so far, not the
                    # row total - confirm that this is intended.
                    if votes != 0:
                        total_votes += votes
                        udeo = (float(votes) / total_votes) * 100
                    else:
                        udeo = 0.0
                    doc['rezultat']['udeo'] = udeo
                    doc['teritorija'] = territory
                    doc['teritorijaSlug'] = territory_slug
                    doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                    doc['godina'] = int(year)
                    doc['izbornaLista'] = candidates_or_parties[str(j)]
                    doc['izbornaListaSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)

                    # Shallow copy: each party gets its own document while
                    # 'doc' is reused for the next column.
                    docs.append(doc.copy())

                    # Flush in batches of 1000 documents.
                    # NOTE(review): 'collection' is not defined in this
                    # function - presumably a module-level name; verify.
                    if len(docs) % 1000 == 0:
                        db[collection].insert(docs)
                        docs = []

            row_count += 1

    # Insert remaining documents
    if len(docs) > 0:
        db[collection].insert(docs)
def import_data_rest(self, election_type, year, month=None, rnd=None): self.prep_import(election_type, year, month, rnd) file_path = self.get_data_file_path(election_type, year, month, rnd) row_count = 0 docs = [] candidates_or_parties = {} parent_territory = '' with open(file_path, 'rb') as f: reader = csv.reader(f) for row in tqdm(reader): doc = {} # Get all the candidates/parties if row_count == 0: if int(year) == 2004 and election_type == "predsjednicki": for i in xrange(11, len(row)): candidates_or_parties[str(i)] = row[i].replace('\n', '').strip() if int(year) == 2008 and election_type == "predsjednicki": for i in xrange(8, len(row)): candidates_or_parties[str(i)] = row[i].replace('\n', '').strip() if int(year) == 2003 and election_type in ["predsjednicki", "parlamentarni"]: for i in xrange(6, len(row)): candidates_or_parties[str(i)] = row[i].replace('\n', '').strip() elif int(year) == 2002 and election_type == "predsjednicki": for i in xrange(7, len(row)): candidates_or_parties[str(i)] = row[i].replace('\n', '').strip() else: for i in xrange(13, len(row), 2): candidates_or_parties[str(i)] = row[i].replace('\n', '').strip() elif row_count == 1: pass else: if int(year)==2004 and election_type=="predsjednicki": territory = row[1].strip() territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True) polling_station_num = int(row[2].strip()) polling_station_address = row[3].strip() ballots_received_count = int(row[4].strip()) unused_ballots_count = int(row[5].strip()) registered_voters_count = int(row[6].strip()) voters_who_voted_count = int(row[8].strip()) invalid_ballots_count = int(row[9].strip()) valid_ballots_count = int(row[10].strip()) print row_count else: print row_count territory = row[0].strip() territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True) polling_station_num = int(row[1].strip()) if row[1].strip() is not '' else row[1].strip() polling_station_address = row[2].strip() registered_voters_count = 
int(row[3].strip()) if int(year) == 2012 and election_type == "predsjednicki": ballots_received_count = int(row[6].strip()) unused_ballots_count = int(row[7].strip()) voters_who_voted_count = int(row[8].strip()) invalid_ballots_count = int(row[9].strip()) invalid_ballots_percent = float(row[10].strip()) valid_ballots_count = int(row[11].strip()) valid_ballots_percent = float(row[12].strip()) if int(year) == 2012 and election_type == "parlamentarni": voters_who_voted_count = int(row[4].strip()) voters_who_voted_percent = float(row[5].strip()) ballots_received_count = int(row[6].strip()) unused_ballots_count = int(row[7].strip()) ballots_in_ballot_box_count=int(row[8].strip()) invalid_ballots_count = int(row[9].strip()) invalid_ballots_percent = float(row[10].strip()) valid_ballots_count = int(row[11].strip()) valid_ballots_percent = float(row[12].strip()) if int(year)==2008 and election_type=="predsjednicki": voters_who_voted_count = int(row[6].strip()) voters_who_voted_percent=float(row[7].strip()) if int(year) not in [2008, 2012] and election_type != "predsjednicki": voters_who_voted_count = int(row[4].strip()) if int(year) == 2003 and election_type in["predsjednicki","parlamentarni"]: voters_who_voted_count = int(row[4].strip()) total_voter_turn_out = float(row[5].strip()) if int(year) == 2002 and election_type == "predsjednicki": print row_count voters_who_voted_count = int(row[4].strip()) total_voter_turn_out = float(row[5].strip()) if int(year) not in [2002, 2003,2004] and election_type not in ["predsjednicki", "parlamentarni"]: voters_who_voted_percent = float(row[5].strip()) ballots_received_count = int(row[6].strip()) unused_ballots_count = int(row[7].strip()) ballots_in_ballot_box_count = int(row[8].strip()) invalid_ballots_count = int(row[9].strip()) invalid_ballots_percent = float(row[10].strip()) valid_ballots_count = int(row[11].strip()) valid_ballots_percent = float(row[12].strip()) doc['brojUpisanihBiracaUBirackiSpisak'] = registered_voters_count 
doc['biraciKojiSuGlasali'] = {} doc['biraciKojiSuGlasali']['broj'] = voters_who_voted_count if int(year) in [2002, 2003] and election_type in ["predsjednicki", "parlamentarni"]: doc['odzivBiraca']=total_voter_turn_out if int(year) not in [2002, 2003] and election_type not in ["predsjednicki", "parlamentarni"]: doc['biraciKojiSuGlasali']['udeo'] = voters_who_voted_percent doc['brojPrimljenihGlasackihListica'] = ballots_received_count doc['brojNeupoTrebljenihGlasackihListica'] = unused_ballots_count if int(year) not in [2012, 2004] and election_type!="predsjednicki": doc['brojGlasackihListicaUKutiji'] = ballots_in_ballot_box_count doc['brojGlasackihListicaUKutiji'] = {} doc['brojGlasackihListicaUKutiji']['broj'] = invalid_ballots_count if int(year)!=2004 and election_type!="predsjednicki": doc['brojGlasackihListicaUKutiji']['udeo'] = invalid_ballots_percent doc['vazeciGlasackiListici'] = {} doc['vazeciGlasackiListici']['broj'] = valid_ballots_count if int(year) != 2004 and election_type != "predsjednicki": doc['vazeciGlasackiListici']['udeo'] = valid_ballots_percent # Some rows consist of territory grouping. # We need to track those. 
if cyrtranslit.to_latin(territory, 'sr').isupper(): doc['instanca'] = 1 elif 'okrug' in territory_slug\ or territory_slug in ['grad-beograd', 'inostranstvo']\ or territory_slug == 'zavodi-za-izvrsenje-zavodskih-sankcija' and polling_station_num is '': doc['instanca'] = 2 parent_territory = territory elif polling_station_num is '': doc['instanca'] = 3 doc['parentTeritorija'] = parent_territory doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True) elif polling_station_num is not '': doc['instanca'] = 4 doc['parentTeritorija'] = parent_territory doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True) doc['brojBirackogMesta'] = polling_station_num doc['adresaBirackogMesta'] = polling_station_address if int(year)==2003 and election_type in ["parlamentarni"]: total_votes=0 udeo=0 for j in xrange(6, len(row)): doc['teritorija'] = territory doc['teritorijaSlug'] = territory_slug doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr') doc['godina'] = int(year) doc['rezultat'] = {} doc['rezultat']['glasova'] = int(row[j]) if int(row[j]) != 0: total_votes += int(row[j]) udeo = (float(int(row[j])) / total_votes) * 100 else: udeo = 0.0 doc['rezultat']['udeo'] =float(udeo) doc['izbornaLista'] = candidates_or_parties[str(j)] doc['izbornaListaSlug'] = slugify( cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True) ''' if 'parentTerritory' in doc: print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory']) else: print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija']) ''' docs.append(doc.copy()) if len(docs) % 1000 == 0: db[collection].insert(docs) docs = [] elif int(year) == 2002 and election_type == "predsjednicki": total_votes=0 udeo=0 for j in xrange(7, len(row)): doc['teritorija'] = territory doc['teritorijaSlug'] = territory_slug doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr') 
doc['godina'] = int(year) doc['rezultat'] = {} doc['rezultat']['glasova'] = int(row[j]) if int(row[j]) != 0: print int(row[j]) total_votes += int(row[j]) udeo = (float(int(row[j])) / total_votes) * 100 else: udeo = 0.0 doc['rezultat']['udeo'] = udeo # Set remaining values depending on whether is is a presidential or parliamentary election month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr') rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr') doc['mesec'] = month_cyr doc['krug'] = rnd_cyr doc['kandidat'] = candidates_or_parties[str(j)].title() doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True) ''' if 'parentTerritory' in doc: print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory']) else: print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija']) ''' docs.append(doc.copy()) if len(docs) % 1000 == 0: db[collection].insert(docs) docs = [] elif int(year) == 2003 and election_type == "predsjednicki": total_votes=0 udeo=0 for j in xrange(6, len(row)): doc['teritorija'] = territory doc['teritorijaSlug'] = territory_slug doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr') doc['godina'] = int(year) doc['rezultat'] = {} doc['rezultat']['glasova'] = int(row[j]) if int(row[j]) != 0: print int(row[j]) total_votes += int(row[j]) udeo = (float(int(row[j])) / total_votes) * 100 else: udeo = 0.0 doc['rezultat']['udeo'] = udeo # Set remaining values depending on whether is is a presidential or parliamentary election month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr') rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr') doc['mesec'] = month_cyr doc['krug'] = rnd_cyr doc['kandidat'] = candidates_or_parties[str(j)].title() doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True) ''' if 'parentTerritory' in doc: print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], 
doc['parentTerritory']) else: print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija']) ''' docs.append(doc.copy()) if len(docs) % 1000 == 0: db[collection].insert(docs) docs = [] elif int(year) == 2004 and election_type == "predsjednicki": total_votes=0 udeo=0 for j in xrange(11, len(row)): doc['teritorija'] = territory doc['teritorijaSlug'] = territory_slug doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr') doc['godina'] = int(year) doc['rezultat'] = {} doc['rezultat']['glasova'] = int(row[j]) if int(row[j]) != 0: total_votes += int(row[j]) udeo = (float(int(row[j])) / total_votes) * 100 print udeo else: udeo = 0.0 doc['rezultat']['udeo'] = udeo # Set remaining values depending on whether is is a presidential or parliamentary election month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr') rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr') doc['mesec'] = month_cyr doc['krug'] = rnd_cyr doc['kandidat'] = candidates_or_parties[str(j)].title() doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True) ''' if 'parentTerritory' in doc: print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory']) else: print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija']) ''' docs.append(doc.copy()) if len(docs) % 1000 == 0: db[collection].insert(docs) docs = [] else: total_votes=0 udeo=0 for j in xrange(13, len(row), 2): # Set generic values doc['teritorija'] = territory doc['teritorijaSlug'] = territory_slug doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr') doc['godina'] = int(year) doc['rezultat'] = {} doc['rezultat']['glasova'] = int(row[j]) if int(row[j]) != 0: total_votes += int(row[j]) udeo = (float(int(row[j])) / total_votes) * 100 print udeo else: udeo = 0.0 doc['rezultat']['udeo'] = udeo # Set remaining values depending on whether is is a presidential or parliamentary election if election_type == 'predsjednicki': 
month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr') rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr') doc['mesec'] = month_cyr doc['krug'] = rnd_cyr doc['kandidat'] = candidates_or_parties[str(j)].title() doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True) else: doc['izbornaLista'] = candidates_or_parties[str(j)] doc['izbornaListaSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True) ''' if 'parentTerritory' in doc: print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory']) else: print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija']) ''' docs.append(doc.copy()) if len(docs) % 1000 == 0: db[collection].insert(docs) docs = [] row_count += 1 # Insert remaining documents if len(docs) > 0: db[collection].insert(docs)
def test_latin_alphabet_characters(self):
    ''' Latin alphabet characters must pass through to_latin() untouched.
    '''
    self.assertEqual(cyrtranslit.to_latin(alphabet_chars), alphabet_chars)
def test_numerical_characters(self):
    ''' Digits must pass through to_latin() untouched.
    '''
    self.assertEqual(cyrtranslit.to_latin(numerical_chars), numerical_chars)
def test_special_diacritic_characters(self):
    ''' Diacritic characters must pass through to_latin() untouched.
    '''
    self.assertEqual(cyrtranslit.to_latin(diacritic_chars), diacritic_chars)
def test_special_characters(self):
    ''' Special characters must pass through to_latin() untouched.
    '''
    self.assertEqual(cyrtranslit.to_latin(special_chars), special_chars)
def test_alphabet_transliteration(self):
    ''' The whole Serbian cyrillic alphabet must map onto its latin counterpart.
    '''
    expected = serbian_alphabet_latin
    actual = cyrtranslit.to_latin(serbian_alphabet_cyrillic)
    self.assertEqual(actual, expected)
def test_alphabet_transliteration_cyrillic_to_latin(self):
    ''' The whole Russian cyrillic alphabet must map onto its latin counterpart
    when to_latin() is called with lang_code='ru'.
    '''
    expected = russian_alphabet_latin
    actual = cyrtranslit.to_latin(russian_alphabet_cyrillic, lang_code='ru')
    self.assertEqual(actual, expected)
def import_data_parliament_2016(self):
    '''
    Import the 2016 parliamentary election results from a CSV file into MongoDB.

    The first CSV row is treated as the header: columns 14 and onwards hold the
    names of the electoral lists. Every following row with a non-blank column 7
    describes one polling station, and one document is produced per
    (polling station, electoral list) pair. Documents are written to
    ``db[collection]`` in batches of 1000.

    The share (``udeo``) of each list is its vote count divided by the sum of
    all list votes on the same row, expressed as a percentage.
    '''
    election_type = 'parlamentarni'
    year = 2016

    self.prep_import(election_type, year, None, None)
    file_path = self.get_data_file_path(election_type, year, None, None)

    # Index of the first column holding per-list vote counts.
    first_result_col = 14

    docs = []
    candidates_or_parties = {}

    with open(file_path, 'rb') as f:
        reader = csv.reader(f)

        for row_count, row in enumerate(tqdm(reader)):
            # Header row: collect the names of all the candidates/parties.
            if row_count == 0:
                for i in range(first_result_col, len(row)):
                    candidates_or_parties[str(i)] = row[i].replace('\n', '')
                continue

            # FIXME: we do this because row 8,350 is blank.
            # Compare by value: identity checks against a string literal only
            # work by accident of interning.
            if row[7].strip() == '':
                continue

            doc = {}

            parent_territory = row[1].strip()
            parent_territory_slug = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)

            territory = row[3].strip()
            territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True)

            polling_station_num = int(row[4].strip())
            polling_station_address = row[5].strip()
            coordinates = row[6].strip().split(',')

            registered_voters_count = int(row[7].strip())
            ballots_received_count = int(row[8].strip())
            unused_ballots_count = int(row[9].strip())
            voters_who_voted_count = int(row[10].strip())
            ballots_in_ballot_box_count = int(row[11].strip())
            invalid_ballots_count = int(row[12].strip())
            valid_ballots_count = int(row[13].strip())

            # Set election type and year
            doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
            doc['godina'] = int(year)

            # Set generic location values
            doc['teritorija'] = territory
            doc['teritorijaSlug'] = territory_slug
            doc['parentTeritorija'] = parent_territory
            doc['parentTeritorijaSlug'] = parent_territory_slug
            doc['brojBirackogMesta'] = polling_station_num
            doc['adresaBirackogMesta'] = polling_station_address

            # FIXME: at least one coordinate is missing (row 1481)
            if len(coordinates) == 2:
                doc['koordinateBirackomMestu'] = {
                    'latituda': float(coordinates[0].strip()),
                    'longituda': float(coordinates[1].strip()),
                }

            # Set generic ballot values
            doc['brojUpisanihBiracaUBirackiSpisak'] = registered_voters_count
            doc['biraciKojiSuGlasali'] = {'broj': voters_who_voted_count}
            doc['brojPrimljenihGlasackihListica'] = ballots_received_count
            doc['brojNeupoTrebljenihGlasackihListica'] = unused_ballots_count

            # NOTE(review): the ballots-in-box count is immediately replaced by
            # a sub-document carrying the *invalid* ballot count under the same
            # key. This looks like a key mix-up, but it is kept as-is because
            # consumers of the collection may rely on the stored shape - confirm.
            doc['brojGlasackihListicaUKutiji'] = ballots_in_ballot_box_count
            doc['brojGlasackihListicaUKutiji'] = {'broj': invalid_ballots_count}

            doc['vazeciGlasackiListici'] = {'broj': valid_ballots_count}

            # For this year, we don't have grouped territories we are importing.
            # So every document is at the smallest unit of territory
            doc['instanca'] = 4

            # Parse every vote cell once, and total them up front so that each
            # share is measured against the whole row rather than a running sum.
            votes_per_list = [int(row[j]) for j in range(first_result_col, len(row))]
            total_votes = sum(votes_per_list)

            for j, votes in enumerate(votes_per_list, first_result_col):
                if total_votes != 0:
                    udeo = (float(votes) / total_votes) * 100
                else:
                    udeo = 0.0

                doc['rezultat'] = {'glasova': votes, 'udeo': udeo}

                doc['izbornaLista'] = candidates_or_parties[str(j)]
                doc['izbornaListaSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'),
                                                  to_lower=True)

                # Shallow copy is enough: 'rezultat' is rebuilt on every pass.
                docs.append(doc.copy())

                if len(docs) % 1000 == 0:
                    db[collection].insert(docs)
                    docs = []

    # Insert remaining documents
    if len(docs) > 0:
        db[collection].insert(docs)
def build_mongo_document_structure_for_budzets(self, razdeo, glava, program, funkcija, programska_aktivnost_projekat,
                                               ekonomska_klasifikacija, opis, ukupna_sredstva):
    """
    Build the nested budget document for one economic classification entry.

    Every section of the document carries its own number plus the same
    description, given in cyrillic (``opis``) and transliterated to latin.

    :param razdeo: number stored under "razdeo"
    :param glava: number stored under "glava"
    :param program: number stored under "program"
    :param funkcija: number stored under "funkcija"
    :param programska_aktivnost_projekat: number stored under "programskaAktivnostProjekat"
    :param ekonomska_klasifikacija: number stored under "ekonomskaKlasifikacija"
    :param opis: cyrillic description, reused for every section
    :param ukupna_sredstva: the total for economic classification, as a string;
        commas are removed before conversion to float
    :return: the document as a dict
    """
    opis_latinica = cyrtranslit.to_latin(opis, "sr")

    def section(broj):
        # A fresh dict per section so that no two sections share state.
        return {
            "broj": broj,
            "opis": {
                "cirilica": opis,
                "latinica": opis_latinica
            }
        }

    economic_section = section(ekonomska_klasifikacija)
    economic_section["ukupna_sredstva"] = self.convert_to_float(ukupna_sredstva.replace(",", ""))

    return {
        "razdeo": section(razdeo),
        "glava": section(glava),
        "program": section(program),
        "funkcija": section(funkcija),
        "programskaAktivnostProjekat": section(programska_aktivnost_projekat),
        "ekonomskaKlasifikacija": economic_section
    }
def data_importer_of_municipality_vranje(self):
    """
    Re-import the Vranje municipality data from data/rashodi/vranje.csv.

    Existing documents matching the Vranje / "rashodi" filter are removed
    first. The CSV is then read in two sections:

    * rows 1-47 with a class number longer than two characters are stored
      with a parent category looked up by the first two digits of the class
      number (class "541" is filed under parent "51");
    * rows after 48 are stored with the most recently seen program and
      sub-program headings attached.

    Row 0 (header) and row 48 are not imported.
    """
    db.opstine.remove({"opstina.latinica": "Vranje", "tipPodataka.slug": "rashodi"})

    # init parent categories JSON
    parent_categories = utils.parent_categories_for_vranje()
    program_categories = utils.program_categories_for_vranje()

    program = ""
    subprogram = ""
    blank = ["", " "]

    # Read data from vranje csv file; the context manager guarantees the
    # handle is closed even if a row fails to import.
    with open("data/rashodi/vranje.csv", "r") as csv_file:
        data_handler = reader(csv_file, delimiter=",")

        for index, row in enumerate(data_handler):
            # Skip the header row.
            if index == 0:
                continue

            if index < 48 and len(row[1]) > 2:
                parent_number = row[1][0:2]

                if row[1] != "541":
                    parent_handler = parent_categories[parent_number]
                else:
                    parent_handler = parent_categories["51"]

                json_doc = self.build_mongo_document_structure_for_prihodi_rashodi(
                    "Врање",
                    row[1],
                    row[2],
                    row[3],
                    row[4],
                    row[5],
                    row[6],
                    None,
                    parent_handler,
                    parent_number
                )
                db.opstine.insert(json_doc)
                print("Opstine: %s - Kategorija Roditelj: %s - Opis: %s" % ("Врање", parent_handler, row[1]))

            elif index > 48:
                # Track the current program / sub-program headings.
                if row[2] not in blank:
                    heading = row[2].strip()

                    if heading in program_categories:
                        program = heading

                    if program != "" and heading in program_categories[program]:
                        subprogram = heading

                # Only rows carrying a class number under a known program
                # and sub-program are imported.
                if row[1] not in blank and program not in blank and subprogram not in blank:
                    json_doc = self.build_mongo_document_structure_for_prihodi_rashodi(
                        "Врање",
                        row[1],
                        row[2],
                        row[3],
                        row[4],
                        row[5],
                        row[6],
                        None
                    )
                    json_doc["program"] = {}
                    json_doc["program"]["cirilica"] = program.strip()
                    json_doc["program"]["latinica"] = cyrtranslit.to_latin(program, "sr")
                    json_doc["potProgram"] = {}
                    json_doc["potProgram"]["cirilica"] = subprogram.strip()
                    json_doc["potProgram"]["latinica"] = cyrtranslit.to_latin(subprogram, "sr")

                    db.opstine.insert(json_doc)
                    print("Opstine: %s - Program: %s %s" % ("Врање", program, row[1]))
def build_mongo_document_structure_for_prihodi_rashodi(self, municipality, class_number, opis, prihodi_vudzeta, sopstveni_prihodi, ostali, ukupno, kategorija_roditelj=None, roditelj_broj=None):
    """
    Build the MongoDB document for one classification row of a municipality.

    The document is always tagged with data type "Prihodi" and year 2015.

    :param municipality: municipality name in cyrillic
    :param class_number: classification number (string, stripped before storing)
    :param opis: classification description in cyrillic
    :param prihodi_vudzeta: budget amount as a string with "," separators
    :param sopstveni_prihodi: own-revenue amount as a string with "," separators
    :param ostali: other amount as a string with "," separators
    :param ukupno: total as a string; only read for the municipalities whose
        data carries just the total column (see below)
    :param kategorija_roditelj: optional parent category description; when
        omitted the default "Скупштина општине" parent with number 0 is used
    :param roditelj_broj: parent category number, stored only when
        kategorija_roditelj is given
    :return: the document as a dict
    """
    to_float = self.convert_to_float

    # For some municipalities we have values only for the total column, so the
    # total is imported as-is instead of being summed from its parts. Each of
    # them formats the number differently.
    if municipality in ["Сомбор", "Звездара"]:
        total_text = ukupno.replace(',', '')
    elif municipality in ["Краљево"]:
        total_text = ukupno.replace(',', '').replace('.', '')[:-2]
    elif municipality in ["Нови Београд"]:
        total_text = ukupno.replace('.', '')
    else:
        total_text = None

    if total_text is None:
        prihodi_vudzeta = to_float(prihodi_vudzeta.replace(',', ''))
        sopstveni_prihodi = to_float(sopstveni_prihodi.replace(',', ''))
        ostali = to_float(ostali.replace(',', ''))
        ukupno = prihodi_vudzeta + sopstveni_prihodi + ostali
    else:
        prihodi_vudzeta = 0
        sopstveni_prihodi = 0
        ostali = 0
        ukupno = to_float(total_text)

    # Assemble the parts of the document.
    opstina = {
        "cirilica": municipality,
        "latinica": cyrtranslit.to_latin(municipality, "sr"),
        "slug": slugify(municipality, to_lower=True)
    }

    description = opis.strip()
    klasifikacija = {
        "opis": {
            "cirilica": description,
            "latinica": cyrtranslit.to_latin(description, "sr")
        }
    }

    if kategorija_roditelj is None:
        parent_category = {
            "opis": {
                "cirilica": "Скупштина општине",
                "latinica": "Skupština Opštine",
            },
            "broj": 0
        }
    else:
        parent_category = {
            "opis": {
                "cirilica": kategorija_roditelj.strip(),
                "latinica": cyrtranslit.to_latin(kategorija_roditelj, "sr"),
            },
            "broj": roditelj_broj
        }

    klasifikacija["broj"] = class_number.strip()

    return {
        "tipPodataka": {
            "vrednost": "Prihodi",
            "slug": "prihodi",
        },
        "godina": 2015,
        "kategorijaRoditelj": parent_category,
        "opstina": opstina,
        "klasifikacija": klasifikacija,
        "prihodiBudzeta": prihodi_vudzeta,
        "sopstveniPrihodi": sopstveni_prihodi,
        "ostali": ostali,
        "ukupno": ukupno
    }
def import_data(self, election_type, year, month=None, rnd=None):
    '''
    Import election results from an XML file into the 'izbori' collection.

    Each <Result> element carries either the share of votes (its data-type
    attribute contains '%') or the number of votes for one candidate/list in
    one territory. The two values are merged into a single document, which is
    queued for insertion as soon as both are known.

    :param election_type: 'predsjednicki' selects the candidate attribute and
        adds month/round fields; any other value is treated as a list-based
        election
    :param year: election year, stored as int
    :param month: month name; read only for 'predsjednicki'
    :param rnd: round name; read only for 'predsjednicki'
    '''
    self.prep_import(election_type, year, month, rnd)

    file_path = self.get_data_file_path(election_type, year, month, rnd)
    e = xml.etree.ElementTree.parse(file_path).getroot()

    is_presidential = election_type == 'predsjednicki'

    results = {}
    docs = []

    for result in e.findall('Result'):
        territory = result.attrib[u'Територија'].strip()
        data_type = result.attrib[u'Врста_податка'].strip()
        candidate = result.attrib[u'Кандидат'].strip() if is_presidential else result.attrib[u'Изборна_листа'].strip()

        # We have two entries per territory. One for share of votes (in percentage) and one for number of votes.
        # We want to save both numbers in the same document
        # To achieve this, we keep track of created documents per territory
        territory_results = results.setdefault(territory, {})

        if candidate not in territory_results:
            territory_latin = cyrtranslit.to_latin(territory.encode('utf-8'), 'sr')

            entry = {
                'teritorija': territory,
                'teritorijaSlug': slugify(territory_latin, to_lower=True),
                'izbori': cyrtranslit.to_cyrillic(election_type.title(), 'sr'),
                'godina': int(year),
                'rezultat': {
                    'udeo': None,
                    'glasova': None
                }
            }

            # All values with capital letters are grouped regions
            # we need to mark them so that we don't count votes more than once.
            # The case-preserving slug is used for the upper-case test.
            territory_slug = slugify(territory_latin)

            if territory_slug.isupper() and ('okrug' in territory_slug.lower() or territory_slug.lower() == 'grad-beograd'):
                entry['instanca'] = 2
            elif territory_slug.isupper():
                entry['instanca'] = 1
            else:
                entry['instanca'] = 3

            # Set remaining values depending on whether it is a presidential or parliamentary election
            candidate_slug = slugify(cyrtranslit.to_latin(candidate.encode('utf-8'), 'sr'), to_lower=True)

            if is_presidential:
                entry['mesec'] = cyrtranslit.to_cyrillic(month.title(), 'sr')
                entry['krug'] = cyrtranslit.to_cyrillic(rnd.title(), 'sr')
                entry['kandidat'] = candidate.title()
                entry['kandidatSlug'] = candidate_slug
            else:
                entry['izbornaLista'] = candidate
                entry['izbornaListaSlug'] = candidate_slug

            territory_results[candidate] = entry

        entry = territory_results[candidate]
        rezultat = entry['rezultat']

        if '%' in data_type:
            # Share of the votes won by the list in the total number of votes, %
            rezultat['udeo'] = float(result.text.replace(',', '.'))
        else:
            # Number of votes won by the list
            rezultat['glasova'] = int(result.text)

        if rezultat['udeo'] is not None and rezultat['glasova'] is not None:
            docs.append(entry)

    # Insert documents; skip the call when nothing was collected, as the
    # sibling importers do, instead of issuing an empty bulk insert.
    if len(docs) > 0:
        db['izbori'].insert(docs)