def prep_import(self, election_type, year, month=None, rnd=None):
    """Clear out documents left by an earlier import of the same election.

    Matching documents are removed from the ``izbori`` collection, and a
    progress message is printed before and after the removal.

    Args:
        election_type: Latin election-type slug. ``'predsjednicki'`` selects
            the presidential branch, which additionally filters on month
            and round.
        year: Election year; converted with ``int()`` for the query.
        month: Latin month name; only dereferenced for presidential imports.
        rnd: Latin round name; only dereferenced for presidential imports.
    """
    presidential = election_type == 'predsjednicki'
    if presidential:
        described = '%s %s %s %s' % (election_type, year, month, rnd)
    else:
        described = '%s %s' % (election_type, year)

    # A parenthesised single-argument print produces the same output as the
    # bare print statement on Python 2.
    print('\nRemoving previously imported data for %s...' % described)

    stale_filter = {
        'izbori': cyrtranslit.to_cyrillic(election_type.title(), 'sr'),
        'godina': int(year),
    }
    if presidential:
        stale_filter['mesec'] = cyrtranslit.to_cyrillic(month.title(), 'sr')
        stale_filter['krug'] = cyrtranslit.to_cyrillic(rnd.title(), 'sr')
    db['izbori'].remove(stale_filter)

    print('Importing data for %s...' % described)
def get_top_indicators_by_type(self, data_source, election_type_slug, godina, instanca, round_slug=None):
    """Return per-candidate (or per-list) vote totals with their vote share.

    Two aggregations run against the same ``$match`` filter: one groups
    votes by candidate / electoral list, the other sums all votes so that
    each entry's ``udeo`` can be recomputed as a percentage of the total.

    Args:
        data_source: ``1`` selects the ``izbori`` collection, anything else
            selects ``izbori2``.
        election_type_slug: Latin election-type slug; ``'predsjednicki'``
            groups by candidate, any other value groups by electoral list.
        godina: Value matched against the ``godina`` field.
        instanca: Value matched against the ``instanca`` field.
        round_slug: Optional Latin round name; only applied to presidential
            elections.

    Returns:
        The list of grouped documents with ``udeo`` overwritten by the
        percentage of all matched votes. When nothing matches the list is
        empty; when the vote total is zero every ``udeo`` is ``0.0``.
    """
    collection = 'izbori' if data_source == 1 else 'izbori2'

    match = {
        'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
        'godina': godina,
        'instanca': instanca
    }

    if election_type_slug == 'predsjednicki':
        if round_slug is not None:
            match['krug'] = cyrtranslit.to_cyrillic(round_slug.title(), 'sr')
        group_id = {
            'kandidat': '$kandidat',
            'kandidatSlug': '$kandidatSlug'
        }
    else:
        group_id = {
            'izbornaLista': '$izbornaLista',
            'izbornaListaSlug': '$izbornaListaSlug'
        }

    group = {
        '_id': group_id,
        'glasova': {"$sum": "$rezultat.glasova"},
        'udeo': {"$sum": "$rezultat.udeo"},
    }
    group_total = {
        "_id": None,
        "total": {"$sum": "$rezultat.glasova"}
    }
    sort = {"glasova": -1}

    pipeline = [
        {'$match': match},
        {'$group': group},
        {'$sort': sort},
        {'$project': self.get_push_pipeline_operation_for_top_indicators(election_type_slug)}
    ]
    # One $match stage is enough; the original repeated the identical stage.
    pipeline_total = [
        {"$match": match},
        {"$group": group_total}
    ]

    rsp_total = mongo.db[collection].aggregate(pipeline_total, allowDiskUse=True)
    rsp = mongo.db[collection].aggregate(pipeline)

    results = rsp['result']
    totals = rsp_total['result']
    # An empty match yields no totals row at all; treat that as zero votes
    # instead of failing on totals[0].
    total_votes = totals[0]["total"] if totals else 0

    for candidate in results:
        if total_votes:
            candidate["udeo"] = (float(candidate["glasova"]) / total_votes) * 100
        else:
            # Avoid dividing by zero when no votes were recorded.
            candidate["udeo"] = 0.0

    return results
def get_results_by_territory_by_candidate(self, data_source, election_type_slug, year, territory_slug, candidate_slug, instanca):
    """Fetch the first aggregated result for one candidate/list in one territory.

    Args:
        data_source: ``1`` selects the ``izbori`` collection, anything else
            selects ``izbori2``.
        election_type_slug: Latin election-type slug; ``'predsjednicki'``
            filters on ``kandidatSlug``, any other value on
            ``izbornaListaSlug``.
        year: Value matched against ``godina``.
        territory_slug: Value matched against ``teritorijaSlug``.
        candidate_slug: Candidate or electoral-list slug to filter on.
        instanca: Value matched against ``instanca``.

    Returns:
        The first document of the aggregation result.

    Raises:
        IndexError: If the aggregation returns no documents.
    """
    collection = 'izbori2' if data_source != 1 else 'izbori'
    presidential = election_type_slug == 'predsjednicki'

    match = {
        'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
        'godina': year,
        'teritorijaSlug': territory_slug,
        'instanca': instanca,
    }
    group_id = {
        'teritorija': '$teritorija',
        'teritorijaSlug': '$teritorijaSlug',
        'izbornaLista': '$izbornaLista',
        'izbornaListaSlug': '$izbornaListaSlug',
    }

    if presidential:
        match['kandidatSlug'] = candidate_slug
    else:
        match['izbornaListaSlug'] = candidate_slug
        # Only the non-presidential branch carries turnout and result
        # fields through the group key.
        group_id['brojUpisanihBiracaUBirackiSpisak'] = '$brojUpisanihBiracaUBirackiSpisak'
        group_id['biraciKojiSuGlasali'] = '$biraciKojiSuGlasali'
        group_id['rezultat'] = '$rezultat'

    projection = {
        '_id': 0,
        'teritorija': '$_id.teritorija',
        'teritorijaSlug': '$_id.teritorijaSlug',
        'izbornaListaSlug': '$_id.izbornaListaSlug',
        'izbornaLista': '$_id.izbornaLista',
        'brojUpisanihBiracaUBirackiSpisak': '$_id.brojUpisanihBiracaUBirackiSpisak',
        'biraciKojiSuGlasali': '$_id.biraciKojiSuGlasali',
        'rezultat': '$_id.rezultat',
    }

    stages = [
        {'$match': match},
        {'$group': {'_id': group_id}},
        {'$project': projection},
    ]
    response = mongo.db[collection].aggregate(stages)
    return response['result'][0]
def get_votes_grouped_by_party_or_candidate(self, data_source, election_type_slug, year, party_or_candidate_slug=None, round_slug=None):
    """Aggregate per-territory results grouped by party or candidate.

    Args:
        data_source: ``1`` selects the ``izbori`` collection, anything else
            selects ``izbori2``. For ``2`` the group key is extended with
            polling-station level fields.
        election_type_slug: Latin election-type slug.
        year: Value matched against ``godina``.
        party_or_candidate_slug: Optional slug; restricts the match to one
            candidate (presidential) or one electoral list (otherwise).
        round_slug: Optional Latin round name matched against ``krug``.

    Returns:
        The first result document when ``party_or_candidate_slug`` is
        given, otherwise the full list of result documents.
    """
    collection = 'izbori2' if data_source != 1 else 'izbori'

    match = {
        'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
        'godina': year
    }
    if round_slug is not None:
        match['krug'] = cyrtranslit.to_cyrillic(round_slug.title(), 'sr')

    single = party_or_candidate_slug is not None
    if single:
        slug_field = 'kandidatSlug' if election_type_slug == 'predsjednicki' else 'izbornaListaSlug'
        match[slug_field] = party_or_candidate_slug

    group = {
        '_id': self.get_id_pipeline_operation_for_votes_grouped_by_party_or_candidate(election_type_slug),
        'rezultat': {
            '$push': {
                'teritorija': '$teritorija',
                'teritorijaSlug': '$teritorijaSlug',
                'rezultat': '$rezultat'
            }
        }
    }

    if data_source == 2:
        # Each extra key is grouped on the document field of the same name.
        for field in (
                'parentTeritorija',
                'parentTeritorijaSlug',
                'adresaBirackogMesta',
                'koordinateBirackomMestu',
                'brojUpisanihBiracaUBirackiSpisak',
                'biraciKojiSuGlasali',
                'brojPrimljenihGlasackihListica',
                'brojNeupoTrebljenihGlasackihListica',
                'brojGlasackihListicaUKutiji',
                'vazeciGlasackiListici'):
            group['_id'][field] = '$' + field

    stages = [
        {'$match': match},
        {'$group': group},
        {'$project': self.get_poject_pipeline_operation_for_votes_grouped_by_party_or_candidate(election_type_slug)}
    ]
    response = mongo.db[collection].aggregate(stages)

    documents = response['result']
    if single:
        return documents[0]
    return documents
def test_special_diacritic_characters(self):
    ''' Diacritic characters must pass through to_cyrillic unchanged.
    '''
    actual = cyrtranslit.to_cyrillic(diacritic_chars)
    expected = diacritic_chars

    self.assertEqual(actual, expected)
def test_numerical_characters(self):
    ''' Digits must pass through to_cyrillic unchanged.
    '''
    actual = cyrtranslit.to_cyrillic(numerical_chars)
    expected = numerical_chars

    self.assertEqual(actual, expected)
def test_alphabet_transliteration_latin_to_cyrillic(self):
    ''' The whole Bulgarian Latin alphabet must map onto its Cyrillic form.
    '''
    actual = cyrtranslit.to_cyrillic(bulgarian_alphabet_latin, lang_code='bg')
    expected = bulgarian_alphabet_cyrillic

    self.assertEqual(actual, expected)
def proceed_token(t, translit=False, replace_i=True):
    """Normalise one token into a number, a size, a value/unit pair or words.

    Steps, in order: fold ``ё`` to ``е`` and ``,`` to ``.``; optionally fold
    ``й`` to ``и``; return early if ``isnum`` recognises the token; strip a
    trailing ``ъ``; optionally transliterate all-ASCII tokens to Russian
    Cyrillic; join ``dot_prog``-separated numeric parts with ``x``; return
    ``"<value> <unit>"`` when ``split_unit`` yields a tuple; otherwise map
    the words through ``unit_lookup`` and drop words of length one or less.

    Args:
        t: The token text.
        translit: When true, transliterate tokens made only of ASCII
            characters with ``cyrtranslit.to_cyrillic(..., 'ru')``.
        replace_i: When true, replace ``й`` with ``и``.

    Returns:
        The truthy result of ``isnum`` if there is one, otherwise the
        normalised string.
    """
    token = t.replace('ё', 'е').replace(',', '.')
    if replace_i:
        token = token.replace('й', 'и')

    as_number = isnum(token)
    if as_number:
        return as_number

    token = token.rstrip('ъ')

    only_ascii = not any(ord(ch) >= 128 for ch in token)
    if translit and only_ascii:
        token = cyrtranslit.to_cyrillic(token, 'ru')

    pieces = dot_prog.split(token)
    if len(pieces) > 1:
        numbers = [isnum(piece) for piece in pieces]
        if all(numbers):
            # The separator is a Latin "x".
            return 'x'.join(numbers)

    unit_split = split_unit(token)
    if type(unit_split) is tuple:
        return unit_split[0] + ' ' + unit_split[1]

    words = token.replace('.', ' ').split(' ')
    looked_up = [unit_lookup.get(word, word) for word in words]
    return ' '.join(item for item in looked_up if len(item) > 1)
def test_mix_characters(self):
    ''' A mixed all-Latin input must come back as the expected mix in which
    only some characters have been converted to Cyrillic.
    '''
    actual = cyrtranslit.to_cyrillic(mix_characters_all_latin_no_alpha)
    expected = mix_characters_some_cyrillic_no_alpha

    self.assertEqual(actual, expected)
def test_alphabet_transliteration_latin_to_cyrillic(self):
    ''' The whole Montenegrin Latin alphabet must map onto its Cyrillic form.
    '''
    actual = cyrtranslit.to_cyrillic(montenegrin_alphabet_latin, lang_code='me')
    expected = montenegrin_alphabet_cyrillic

    self.assertEqual(actual, expected)
def test_alphabet_transliteration(self):
    ''' The whole Serbian Latin alphabet must map onto its Cyrillic form.
    '''
    actual = cyrtranslit.to_cyrillic(serbian_alphabet_latin)
    expected = serbian_alphabet_cyrillic

    self.assertEqual(actual, expected)
def branching(text, plant, depth=0):
    """Recursively grow a tree of sampled continuations under ``plant``.

    At each level up to ten options are requested from the model,
    punctuation options are discarded and at most the first two survivors
    become child nodes. Every child is appended to ``plant`` as
    ``{"name": <cyrillic option>, "children": [...]}`` and then expanded
    recursively.

    Relies on ``max_depth``, ``maxlen``, ``vocab``, ``temperature``,
    ``model``, ``interactive``, ``indices_char`` and ``puncts`` from the
    enclosing scope.

    Args:
        text: List of tokens generated so far.
        plant: List that receives the child nodes (mutated in place).
        depth: Depth of the caller; incremented before use.

    Returns:
        ``plant``, or ``None`` when the incremented depth equals
        ``max_depth``.
    """
    level = depth + 1
    if level == max_depth:
        return

    encoded = textgenrnn_encode_sequence(text[-maxlen:], vocab, maxlen)
    step_temperature = temperature[(len(text) - 1) % len(temperature)]

    sampled = textgenrnn_sample(
        model.predict(encoded, batch_size=1)[0],
        step_temperature,
        interactive=interactive,
        top_n=10)
    candidates = [indices_char[position] for position in sampled]
    # Drop punctuation, then keep at most two branches.
    candidates = [choice for choice in candidates if choice not in puncts]
    candidates = candidates[:2]

    print("start")
    for position, choice in enumerate(candidates):
        print(position)
        extended = text + [choice]
        child = {"name": cyrtranslit.to_cyrillic(choice), "children": []}
        plant.append(child)
        branching(extended, plant=child["children"], depth=level)

    return plant
def test_alphabet_transliteration_latin_to_cyrillic(self):
    ''' The whole Russian Latin alphabet must map onto its Cyrillic form,
    with the hard and soft signs compared in lower case.
    '''
    actual = cyrtranslit.to_cyrillic(russian_alphabet_latin, lang_code='ru')

    # The expected alphabet is adjusted so that Ъ and Ь appear in lower case.
    expected = russian_alphabet_cyrillic.replace('Ъ', 'ъ')
    expected = expected.replace('Ь', 'ь')

    self.assertEqual(actual, expected)
def l2c():
    """Show the Latin-to-Cyrillic (Russian) transliteration of the input.

    Reads the text from ``Frame.m``, opens a new Tk window, prints the
    transliterated text, copies it to the clipboard and displays it in a
    label inside the new window.
    """
    source_text = Frame.m.get()

    window = tk.Tk()
    window.title('Result(Latin2Cyrillic)')

    converted = cyrtranslit.to_cyrillic(source_text, 'ru')
    print(converted)
    pyperclip.copy(converted)

    tk.Label(window, text=converted, font=16).pack(fill="x")
def prep_import(self, election_type, year, month=None, rnd=None):
    """Remove previously imported ``izbori`` documents for one election.

    For ``'predsjednicki'`` the removal filter includes the month and the
    round in addition to the election type and year. Progress messages are
    printed before and after the removal.

    Args:
        election_type: Latin election-type slug.
        year: Election year; converted with ``int()`` for the query.
        month: Latin month name (used for presidential elections only).
        rnd: Latin round name (used for presidential elections only).
    """
    label_parts = [election_type, year]
    if election_type == 'predsjednicki':
        label_parts.extend([month, rnd])
    label = ' '.join(['%s'] * len(label_parts)) % tuple(label_parts)

    # A parenthesised single-argument print produces the same output as the
    # bare print statement on Python 2.
    print('\nRemoving previously imported data for %s...' % label)

    if election_type == 'predsjednicki':
        criteria = {
            'izbori': cyrtranslit.to_cyrillic(election_type.title(), 'sr'),
            'godina': int(year),
            'mesec': cyrtranslit.to_cyrillic(month.title(), 'sr'),
            'krug': cyrtranslit.to_cyrillic(rnd.title(), 'sr'),
        }
    else:
        criteria = {
            'izbori': cyrtranslit.to_cyrillic(election_type.title(), 'sr'),
            'godina': int(year),
        }
    db['izbori'].remove(criteria)

    print('Importing data for %s...' % label)
def translite(file_in, file_out, ru=''):
    """Transliterate a text file line by line into another file.

    Args:
        file_in: Path of the file to read.
        file_out: Path of the file to write; opened in ``"w"`` mode, so any
            existing content is replaced.
        ru: When truthy, each line is converted with
            ``cyrtranslit.to_latin(line, 'ru')``; otherwise with
            ``cyrtranslit.to_cyrillic(line)`` using the library default
            language.
    """
    # Both handles are context-managed so the output file is closed even if
    # reading or transliteration raises. The output is still opened first,
    # as in the original.
    with open(file_out, "w") as target, open(file_in) as source:
        for line in source:
            if ru:
                converted = cyrtranslit.to_latin(line, 'ru')
            else:
                converted = cyrtranslit.to_cyrillic(line)
            target.write(converted)
def search(name):
    """Look a title up under several spellings and merge the results.

    Four lookups are made through ``KinoPoisk.__get_list``: the name as
    given; the name with ``-`` and ``.`` turned into spaces and ``1``/``0``
    turned into ``i``/``o``; that variant transliterated to Russian
    Cyrillic (after mapping ``’`` to ``ь`` and ``ya`` to ``я``); and the
    Cyrillic variant with ``ы`` replaced by ``й``.

    Args:
        name: The title to search for.

    Returns:
        A list of the distinct results from all lookups, in no particular
        order.
    """
    # NOTE(review): the double-underscore name is presumably mangled by an
    # enclosing KinoPoisk class - confirm before moving this function.
    found = KinoPoisk.__get_list(name)

    cleaned = name.replace('-', ' ')
    cleaned = cleaned.replace('1', 'i')
    cleaned = cleaned.replace('0', 'o')
    cleaned = cleaned.replace('.', ' ')
    found += KinoPoisk.__get_list(cleaned)

    prepared = cleaned.replace('’', 'ь').replace('ya', 'я')
    cyrillic = cyrtranslit.to_cyrillic(prepared, 'ru')
    found += KinoPoisk.__get_list(cyrillic)

    alternate = cyrillic.replace('ы', 'й')
    found += KinoPoisk.__get_list(alternate)

    unique = set(found)
    return list(unique)
def get_total_voters_turnout(self, data_source, election_type_slug, godina):
    """Compute the overall voter turnout for one election.

    Documents are grouped on territory plus the polling-station level
    fields, and the registered-voter and voted counts of each group are
    summed.

    Args:
        data_source: ``1`` selects the ``izbori`` collection, anything else
            selects ``izbori2``.
        election_type_slug: Latin election-type slug.
        godina: Value matched against the ``godina`` field.

    Returns:
        A dict with ``percentage`` (voters who voted as a percentage of
        registered voters; ``0`` when no voters are registered) and
        ``total_voters``.
    """
    collection = 'izbori' if data_source == 1 else 'izbori2'

    match = {
        'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
        'godina': godina,
    }

    group_id = {'teritorija': '$teritorija'}
    # Each extra key is grouped on the document field of the same name.
    for field in (
            'parentTeritorija',
            'parentTeritorijaSlug',
            'adresaBirackogMesta',
            'koordinateBirackomMestu',
            'brojUpisanihBiracaUBirackiSpisak',
            'biraciKojiSuGlasali',
            'brojPrimljenihGlasackihListica',
            'brojNeupoTrebljenihGlasackihListica',
            'brojGlasackihListicaUKutiji',
            'vazeciGlasackiListici'):
        group_id[field] = '$' + field

    project = {
        '_id': 0,
        'teritorija': '$_id.teritorija',
        'brojUpisanihBiracaUBirackiSpisak': '$_id.brojUpisanihBiracaUBirackiSpisak',
        'biraciKojiSuGlasali': '$_id.biraciKojiSuGlasali.broj',
    }

    pipeline = [
        {'$match': match},
        {'$group': {'_id': group_id}},
        {'$project': project}
    ]
    rsp = mongo.db[collection].aggregate(pipeline)

    total_voters = 0
    total_registered = 0
    for rezultat in rsp['result']:
        total_voters += rezultat['biraciKojiSuGlasali']
        if rezultat['brojUpisanihBiracaUBirackiSpisak'] != 0:
            total_registered += rezultat['brojUpisanihBiracaUBirackiSpisak']

    # Compute the ratio once, after all rows are summed, and guard against
    # an empty result or zero registered voters.
    percentage = 0
    if total_registered:
        percentage = (float(total_voters) / total_registered) * 100

    return {'percentage': percentage, 'total_voters': total_voters}
def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
    """Fill ``after`` with a character-level transliteration of English words.

    Every value of ``X['before']`` that matches ``re_eng`` is transliterated
    with the algorithm named by ``self.algo``, lower-cased, and rendered as
    its characters each suffixed with ``_trans`` and joined by spaces. Other
    values produce ``None``.

    Args:
        X: Frame with a ``before`` column and optionally an ``after`` column.
        y: Unused.

    Returns:
        A copy of ``X`` whose ``after`` column holds the transliterations;
        existing non-null ``after`` values take precedence.

    Raises:
        ValueError: If a word needs transliteration and ``self.algo`` is not
            one of ``'translit'``, ``'cyrtranslit'`` or ``'pytils'``.
    """
    data = []
    for w in tqdm(X['before'],
                  f'{self.__class__.__name__} transform',
                  total=len(X)):
        if re_eng.match(w):
            if self.algo == 'translit':
                rus_w = translit(w, language_code='ru').lower()
            elif self.algo == 'cyrtranslit':
                rus_w = cyrtranslit.to_cyrillic(w, lang_code='ru').lower()
            elif self.algo == 'pytils':
                rus_w = pytils.translit.detranslify(w).lower()
            else:
                # Previously rus_w stayed unbound, or silently kept the
                # previous word's value, for an unknown algorithm.
                raise ValueError(
                    f'unsupported transliteration algo: {self.algo!r}')
            data.append(' '.join([c + u'_trans' for c in rus_w]))
        else:
            data.append(None)

    if 'after' in X.columns:
        return X.assign(
            after=X['after'].combine_first(pd.Series(data, index=X.index)))
    else:
        return X.assign(after=data)
def tokenize(self, content, author_id, my_id=0):
    """Turn a chat message into a flat list of tokens.

    The message is scanned with one alternation regex whose capture groups
    are, in order: URL, mention of ``my_id``, user mention, channel mention,
    role mention, custom emoji, ``a``-prefixed emoji, ``@everyone``/``@here``,
    a word, and any other single character.

    Args:
        content: The message text.
        author_id: Id of the message author.
        my_id: Id compared with ``author_id`` and embedded in the "my
            mention" pattern.

    Returns:
        A list that starts with ``_MY_MESSAGE_BEGIN_`` or
        ``_NOT_MY_MESSAGE_BEGIN_`` and ends with ``_MESSAGE_END_``. Entity
        matches (groups 0-6) are replaced through ``self.entity_to_word``;
        words are emitted as an optional ``_CAPS_``/``_SHIFT_`` marker
        followed by the trigrams of the lower-cased, Russian-Cyrillic
        transliterated word; everything else is emitted verbatim.
    """
    pattern = (
        r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)|"
        r"(<@!?" + str(my_id) + r">)|"
        r"(<@!?\d{16,20}>)|"
        r"(<#\d{16,20}>)|"
        r"(<@&\d{16,20}>)|"
        r"(<:\w{1,32}:\d{16,20}>)|"
        r"(<[a]:\w{1,32}:\d{16,20}>)|"
        r"(@everyone|@here)|"
        r"([^\d\W]+)|"
        r"(.)"
    )
    matches = re.findall(pattern, content, re.UNICODE)

    opener = "_MY_MESSAGE_BEGIN_" if author_id == my_id else "_NOT_MY_MESSAGE_BEGIN_"
    tokens = [opener]

    for groups in matches:
        for position, piece in enumerate(groups):
            if not piece:
                continue

            if position <= 6:
                tokens.append(self.entity_to_word[position])
            elif position == 7:
                tokens.append(piece)
            elif position == 8:
                if piece.isupper():
                    tokens.append("_CAPS_")
                elif piece[0].isupper():
                    tokens.append("_SHIFT_")
                cyrillic = cyrtranslit.to_cyrillic(piece.lower(), "ru")
                tokens.extend(self.trigramize(cyrillic))
            else:
                tokens.append(piece)

    tokens.append("_MESSAGE_END_")
    return tokens
def get_winners_for_each_territory(self, data_source, election_type_slug, year, instanca):
    """Return, for every territory, its pushed list of results.

    Matching documents are sorted by ``rezultat.glasova`` in descending
    order before being grouped by territory.

    Args:
        data_source: ``1`` selects the ``izbori`` collection, anything else
            selects ``izbori2``.
        election_type_slug: Latin election-type slug.
        year: Value matched against ``godina``.
        instanca: Value matched against ``instanca``.

    Returns:
        The list of grouped documents, each with ``teritorija``,
        ``teritorijaSlug`` and ``rezultat``.
    """
    collection = 'izbori2' if data_source != 1 else 'izbori'

    match_stage = {
        '$match': {
            'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
            'godina': year,
            'instanca': instanca
        }
    }
    group_stage = {
        '$group': {
            '_id': {
                'teritorija': '$teritorija',
                'teritorijaSlug': '$teritorijaSlug',
            },
            'rezultat': {
                '$push': self.get_push_pipeline_operation_for_votes_grouped_by_territory_group_by_result(
                    election_type_slug)
            },
        }
    }
    sort_stage = {'$sort': {"rezultat.glasova": -1}}
    project_stage = {
        '$project': {
            '_id': 0,
            'teritorija': '$_id.teritorija',
            'teritorijaSlug': '$_id.teritorijaSlug',
            'rezultat': 1,
        }
    }

    response = mongo.db[collection].aggregate(
        [match_stage, sort_stage, group_stage, project_stage],
        allowDiskUse=True)
    return response['result']
def test_mix_characters(self):
    ''' Converting the all-Latin mixed sample must yield the expected sample
    in which only some characters are Cyrillic.
    '''
    expected = mix_characters_some_cyrillic_no_alpha
    actual = cyrtranslit.to_cyrillic(mix_characters_all_latin_no_alpha)

    self.assertEqual(actual, expected)
def get_votes_grouped_by_territory(self, data_source, election_type_slug, year, instanca, territory_slug=None, round_slug=None, range_of_documents=None):
    """Aggregate results grouped by territory.

    The pipeline matches, sorts by ``rezultat.glasova`` descending, keeps
    the first 100 documents, groups them by territory and projects the
    grouped fields back to the top level.

    Args:
        data_source: ``1`` selects the ``izbori`` collection, anything else
            selects ``izbori2``. For ``2`` polling-station level fields are
            added to both the group key and the projection.
        election_type_slug: Latin election-type slug.
        year: Value matched against ``godina``.
        instanca: Matched against ``instanca`` for non-presidential
            elections when not ``None``.
        territory_slug: Optional value matched against ``teritorijaSlug``.
        round_slug: Optional Latin round name matched against ``krug``.
        range_of_documents: Accepted but has no effect on the query.

    Returns:
        The list of grouped result documents.
    """
    collection = 'izbori2' if data_source != 1 else 'izbori'

    match = {
        'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
        'godina': year
    }

    # For now, we only support territorial levels for parliament elections
    if instanca is not None and election_type_slug != 'predsjednicki':
        match['instanca'] = instanca

    if round_slug is not None:
        match['krug'] = cyrtranslit.to_cyrillic(round_slug.title(), 'sr')

    if territory_slug is not None:
        match['teritorijaSlug'] = territory_slug

    group_id = {
        'teritorija': '$teritorija',
        'teritorijaSlug': '$teritorijaSlug',
    }
    group = {
        '_id': group_id,
        'rezultat': {
            '$push': self.get_push_pipeline_operation_for_votes_grouped_by_territory_group_by_result(
                election_type_slug),
        },
    }
    project = {
        '_id': 0,
        'teritorija': '$_id.teritorija',
        'teritorijaSlug': '$_id.teritorijaSlug',
        'rezultat': 1,
    }

    if data_source == 2:
        # Each field is grouped on under its own name and projected back
        # out of the group key.
        for field in (
                'parentTeritorija',
                'parentTeritorijaSlug',
                'adresaBirackogMesta',
                'koordinateBirackomMestu',
                'brojUpisanihBiracaUBirackiSpisak',
                'biraciKojiSuGlasali',
                'brojPrimljenihGlasackihListica',
                'brojNeupoTrebljenihGlasackihListica',
                'brojGlasackihListicaUKutiji',
                'vazeciGlasackiListici'):
            group_id[field] = '$' + field
            project[field] = '$_id.' + field

    stages = [
        {'$match': match},
        {'$sort': {"rezultat.glasova": -1}},
        {'$limit': 100},
        {'$group': group},
        {'$project': project},
    ]

    # The same call is issued whether or not range_of_documents is given.
    response = mongo.db[collection].aggregate(stages, allowDiskUse=True)
    return response['result']
def textgenrnn_generate_special(model, vocab, indices_char, temperature=0.5, maxlen=40, meta_token='<s>', word_level=False, single_text=False, max_gen_length=300, interactive=False, top_n=3, prefix=None, synthesize=False, input_text=None, input_depth=6):
    """Build a tree of sampled continuations rooted at the last input token.

    Starting from ``input_text`` the model is sampled recursively: at every
    level up to ten options are requested, punctuation is filtered out and
    at most two options are kept as children. Node names are transliterated
    with ``cyrtranslit.to_cyrillic``.

    Args:
        model: Model used for prediction; when it has more than one input
            it is wrapped in a ``Model`` using its first input and second
            output.
        vocab: Vocabulary passed to ``textgenrnn_encode_sequence``.
        indices_char: Mapping from sampled index to token.
        temperature: A temperature or a list of temperatures cycled by
            text length.
        maxlen: Number of trailing tokens encoded for each prediction.
        interactive: Forwarded to ``textgenrnn_sample``.
        input_text: List of seed tokens; defaults to ``['']``.
        input_depth: Recursion stops when the level counter reaches this
            value.
        meta_token, word_level, single_text, max_gen_length, top_n, prefix,
        synthesize: Accepted; they do not influence the returned tree.

    Returns:
        A tuple ``(tree, end)`` where ``tree`` is
        ``{"name": ..., "children": ...}`` and ``end`` is always ``False``.
    """
    if input_text is None:
        input_text = ['']

    # Kept from the original although neither value is read afterwards.
    collapse_char = ' ' if word_level else ''
    max_gen_length += maxlen

    end = False
    seed = input_text
    puncts = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

    temperatures = temperature if isinstance(temperature, list) else [temperature]

    predictor = model
    if len(model.inputs) > 1:
        predictor = Model(inputs=model.inputs[0], outputs=model.outputs[1])

    max_depth = input_depth

    def branching(text, plant, depth=0):
        # Grow `plant` in place; returns None once the depth limit is hit.
        level = depth + 1
        if level == max_depth:
            return

        encoded = textgenrnn_encode_sequence(text[-maxlen:], vocab, maxlen)
        step_temperature = temperatures[(len(text) - 1) % len(temperatures)]

        sampled = textgenrnn_sample(
            predictor.predict(encoded, batch_size=1)[0],
            step_temperature,
            interactive=interactive,
            top_n=10)
        candidates = [indices_char[position] for position in sampled]
        # Drop punctuation, then keep at most two branches.
        candidates = [choice for choice in candidates if choice not in puncts]
        candidates = candidates[:2]

        print("start")
        for position, choice in enumerate(candidates):
            print(position)
            extended = text + [choice]
            child = {"name": cyrtranslit.to_cyrillic(choice), "children": []}
            plant.append(child)
            branching(extended, plant=child["children"], depth=level)

        return plant

    children = branching(text=seed, plant=[])

    tree = {"name": cyrtranslit.to_cyrillic(seed[-1]), "children": children}
    return tree, end
try: reader = csv.reader(source_bom_file, delimiter=';') writer = csv.writer(rus_bom_file, delimiter=';') row_num = 0 for row in reader: csv_str = row col_num = 0 row_num = row_num + 1 while col_num < len(csv_str): if col_num in translate_colums: if (col_num == pe3name_col): if (csv_str[col_num] != "-"): csv_str[col_num] = cyrtranslit.to_cyrillic( csv_str[col_num].encode("utf-8"), 'ru') else: csv_str[col_num] = csv_str[partnum_col] else: csv_str[col_num] = cyrtranslit.to_cyrillic( csv_str[col_num].encode("utf-8"), 'ru') col_num = col_num + 1 if (len(csv_str[prefix_col]) == 1): csv_str[prefix_col] = '' if (len(csv_str[partnum_col]) == 1): csv_str[partnum_col] = '' if (len(csv_str[desc_col]) == 1): csv_str[desc_col] = '' prefix_value = csv_str[prefix_col] partnumber_value = csv_str[partnum_col] pe3name_value = csv_str[pe3name_col] desc_value = csv_str[desc_col]
def cyr(to_translate):
    """Return ``to_translate`` converted by ``cyrtranslit.to_cyrillic``.

    No language code is passed, so the library default applies.
    """
    converted = cyrtranslit.to_cyrillic(to_translate)
    return converted
def import_data_rest(self, election_type, year, month=None, rnd=None):
    """Import one election's CSV results into the database.

    Previously imported data is removed through ``prep_import``; the CSV
    returned by ``get_data_file_path`` is then read row by row. Row 0 holds
    the candidate / party names, row 1 is skipped, and every later row
    produces one document per candidate / party column. Documents are
    inserted in batches of 1000.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; spots where the nesting could not be
    determined unambiguously are marked - verify against the original file.
    NOTE(review): ``collection`` is not defined in this function; presumably
    a module-level name - confirm.

    Args:
        election_type: ``"predsjednicki"`` or ``"parlamentarni"``.
        year: Election year (anything ``int()`` accepts).
        month: Latin month name, used for presidential elections.
        rnd: Latin round name, used for presidential elections.
    """
    self.prep_import(election_type, year, month, rnd)
    file_path = self.get_data_file_path(election_type, year, month, rnd)
    row_count = 0
    docs = []
    # Maps the stringified column index to the candidate / party name.
    candidates_or_parties = {}
    # Last territory classified as instanca 2; parent of later rows.
    parent_territory = ''
    with open(file_path, 'rb') as f:
        reader = csv.reader(f)
        for row in tqdm(reader):
            doc = {}
            # Get all the candidates/parties
            if row_count == 0:
                # The first candidate column depends on the year and type.
                if int(year) == 2004 and election_type == "predsjednicki":
                    for i in xrange(11, len(row)):
                        candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()
                if int(year) == 2008 and election_type == "predsjednicki":
                    for i in xrange(8, len(row)):
                        candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()
                if int(year) == 2003 and election_type in ["predsjednicki", "parlamentarni"]:
                    for i in xrange(6, len(row)):
                        candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()
                elif int(year) == 2002 and election_type == "predsjednicki":
                    for i in xrange(7, len(row)):
                        candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()
                else:
                    # Default layout: every second column starting at 13.
                    for i in xrange(13, len(row), 2):
                        candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()
            elif row_count == 1:
                # The second row is skipped.
                pass
            else:
                if int(year)==2004 and election_type=="predsjednicki":
                    # The 2004 presidential file starts at column 1.
                    territory = row[1].strip()
                    territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True)
                    polling_station_num = int(row[2].strip())
                    polling_station_address = row[3].strip()
                    ballots_received_count = int(row[4].strip())
                    unused_ballots_count = int(row[5].strip())
                    registered_voters_count = int(row[6].strip())
                    voters_who_voted_count = int(row[8].strip())
                    invalid_ballots_count = int(row[9].strip())
                    valid_ballots_count = int(row[10].strip())
                    print row_count
                else:
                    print row_count
                    territory = row[0].strip()
                    territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True)
                    # An empty polling-station cell is kept as '' (grouping rows).
                    polling_station_num = int(row[1].strip()) if row[1].strip() is not '' else row[1].strip()
                    polling_station_address = row[2].strip()
                    registered_voters_count = int(row[3].strip())
                    # NOTE(review): the year/type checks below are placed in
                    # this else branch; none of them matches the 2004
                    # presidential case, so the placement does not change
                    # behavior.
                    if int(year) == 2012 and election_type == "predsjednicki":
                        ballots_received_count = int(row[6].strip())
                        unused_ballots_count = int(row[7].strip())
                        voters_who_voted_count = int(row[8].strip())
                        invalid_ballots_count = int(row[9].strip())
                        invalid_ballots_percent = float(row[10].strip())
                        valid_ballots_count = int(row[11].strip())
                        valid_ballots_percent = float(row[12].strip())
                    if int(year) == 2012 and election_type == "parlamentarni":
                        voters_who_voted_count = int(row[4].strip())
                        voters_who_voted_percent = float(row[5].strip())
                        ballots_received_count = int(row[6].strip())
                        unused_ballots_count = int(row[7].strip())
                        ballots_in_ballot_box_count=int(row[8].strip())
                        invalid_ballots_count = int(row[9].strip())
                        invalid_ballots_percent = float(row[10].strip())
                        valid_ballots_count = int(row[11].strip())
                        valid_ballots_percent = float(row[12].strip())
                    if int(year)==2008 and election_type=="predsjednicki":
                        voters_who_voted_count = int(row[6].strip())
                        voters_who_voted_percent=float(row[7].strip())
                    if int(year) not in [2008, 2012] and election_type != "predsjednicki":
                        voters_who_voted_count = int(row[4].strip())
                    if int(year) == 2003 and election_type in["predsjednicki","parlamentarni"]:
                        voters_who_voted_count = int(row[4].strip())
                        total_voter_turn_out = float(row[5].strip())
                    if int(year) == 2002 and election_type == "predsjednicki":
                        print row_count
                        voters_who_voted_count = int(row[4].strip())
                        total_voter_turn_out = float(row[5].strip())
                    # NOTE(review): this condition is false for both
                    # "predsjednicki" and "parlamentarni"; the extent of its
                    # body is a best guess.
                    if int(year) not in [2002, 2003,2004] and election_type not in ["predsjednicki", "parlamentarni"]:
                        voters_who_voted_percent = float(row[5].strip())
                        ballots_received_count = int(row[6].strip())
                        unused_ballots_count = int(row[7].strip())
                        ballots_in_ballot_box_count = int(row[8].strip())
                        invalid_ballots_count = int(row[9].strip())
                        invalid_ballots_percent = float(row[10].strip())
                        valid_ballots_count = int(row[11].strip())
                        valid_ballots_percent = float(row[12].strip())
                doc['brojUpisanihBiracaUBirackiSpisak'] = registered_voters_count
                doc['biraciKojiSuGlasali'] = {}
                doc['biraciKojiSuGlasali']['broj'] = voters_who_voted_count
                if int(year) in [2002, 2003] and election_type in ["predsjednicki", "parlamentarni"]:
                    doc['odzivBiraca']=total_voter_turn_out
                # NOTE(review): everything down to the vazeciGlasackiListici
                # share is nested under this check because several of the
                # names it reads are not assigned for every year/type; the
                # true nesting could not be determined - confirm.
                if int(year) not in [2002, 2003] and election_type not in ["predsjednicki", "parlamentarni"]:
                    doc['biraciKojiSuGlasali']['udeo'] = voters_who_voted_percent
                    doc['brojPrimljenihGlasackihListica'] = ballots_received_count
                    doc['brojNeupoTrebljenihGlasackihListica'] = unused_ballots_count
                    if int(year) not in [2012, 2004] and election_type!="predsjednicki":
                        doc['brojGlasackihListicaUKutiji'] = ballots_in_ballot_box_count
                    # NOTE(review): this overwrites the key assigned just
                    # above with the invalid-ballot figures - possibly a
                    # different key was intended; confirm.
                    doc['brojGlasackihListicaUKutiji'] = {}
                    doc['brojGlasackihListicaUKutiji']['broj'] = invalid_ballots_count
                    if int(year)!=2004 and election_type!="predsjednicki":
                        doc['brojGlasackihListicaUKutiji']['udeo'] = invalid_ballots_percent
                    doc['vazeciGlasackiListici'] = {}
                    doc['vazeciGlasackiListici']['broj'] = valid_ballots_count
                    if int(year) != 2004 and election_type != "predsjednicki":
                        doc['vazeciGlasackiListici']['udeo'] = valid_ballots_percent
                # Some rows consist of territory grouping.
                # We need to track those.
                if cyrtranslit.to_latin(territory, 'sr').isupper():
                    # All-uppercase territory name.
                    doc['instanca'] = 1
                elif 'okrug' in territory_slug\
                        or territory_slug in ['grad-beograd', 'inostranstvo']\
                        or territory_slug == 'zavodi-za-izvrsenje-zavodskih-sankcija' and polling_station_num is '':
                    doc['instanca'] = 2
                    # Remembered as the parent of the rows that follow.
                    parent_territory = territory
                elif polling_station_num is '':
                    doc['instanca'] = 3
                    doc['parentTeritorija'] = parent_territory
                    doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)
                elif polling_station_num is not '':
                    # Row with a polling-station number.
                    doc['instanca'] = 4
                    doc['parentTeritorija'] = parent_territory
                    doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)
                    doc['brojBirackogMesta'] = polling_station_num
                    doc['adresaBirackogMesta'] = polling_station_address
                # One document is emitted per candidate / party column.
                # NOTE(review): in every loop below `udeo` is computed
                # against the running total of the columns seen so far in
                # this row, not the row total - confirm this is intended.
                if int(year)==2003 and election_type in ["parlamentarni"]:
                    total_votes=0
                    udeo=0
                    for j in xrange(6, len(row)):
                        doc['teritorija'] = territory
                        doc['teritorijaSlug'] = territory_slug
                        doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                        doc['godina'] = int(year)
                        doc['rezultat'] = {}
                        doc['rezultat']['glasova'] = int(row[j])
                        if int(row[j]) != 0:
                            total_votes += int(row[j])
                            udeo = (float(int(row[j])) / total_votes) * 100
                        else:
                            udeo = 0.0
                        doc['rezultat']['udeo'] =float(udeo)
                        doc['izbornaLista'] = candidates_or_parties[str(j)]
                        doc['izbornaListaSlug'] = slugify( cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)
                        ''' if 'parentTerritory' in doc: print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory']) else: print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija']) '''
                        # Copy, because `doc` is reused for the next column.
                        docs.append(doc.copy())
                        # Flush in batches of 1000 documents.
                        if len(docs) % 1000 == 0:
                            db[collection].insert(docs)
                            docs = []
                elif int(year) == 2002 and election_type == "predsjednicki":
                    total_votes=0
                    udeo=0
                    for j in xrange(7, len(row)):
                        doc['teritorija'] = territory
                        doc['teritorijaSlug'] = territory_slug
                        doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                        doc['godina'] = int(year)
                        doc['rezultat'] = {}
                        doc['rezultat']['glasova'] = int(row[j])
                        if int(row[j]) != 0:
                            print int(row[j])
                            total_votes += int(row[j])
                            udeo = (float(int(row[j])) / total_votes) * 100
                        else:
                            udeo = 0.0
                        doc['rezultat']['udeo'] = udeo
                        # Set remaining values depending on whether is is a presidential or parliamentary election
                        month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                        rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')
                        doc['mesec'] = month_cyr
                        doc['krug'] = rnd_cyr
                        doc['kandidat'] = candidates_or_parties[str(j)].title()
                        doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)
                        ''' if 'parentTerritory' in doc: print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory']) else: print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija']) '''
                        docs.append(doc.copy())
                        if len(docs) % 1000 == 0:
                            db[collection].insert(docs)
                            docs = []
                elif int(year) == 2003 and election_type == "predsjednicki":
                    total_votes=0
                    udeo=0
                    for j in xrange(6, len(row)):
                        doc['teritorija'] = territory
                        doc['teritorijaSlug'] = territory_slug
                        doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                        doc['godina'] = int(year)
                        doc['rezultat'] = {}
                        doc['rezultat']['glasova'] = int(row[j])
                        if int(row[j]) != 0:
                            print int(row[j])
                            total_votes += int(row[j])
                            udeo = (float(int(row[j])) / total_votes) * 100
                        else:
                            udeo = 0.0
                        doc['rezultat']['udeo'] = udeo
                        # Set remaining values depending on whether is is a presidential or parliamentary election
                        month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                        rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')
                        doc['mesec'] = month_cyr
                        doc['krug'] = rnd_cyr
                        doc['kandidat'] = candidates_or_parties[str(j)].title()
                        doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)
                        ''' if 'parentTerritory' in doc: print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory']) else: print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija']) '''
                        docs.append(doc.copy())
                        if len(docs) % 1000 == 0:
                            db[collection].insert(docs)
                            docs = []
                elif int(year) == 2004 and election_type == "predsjednicki":
                    total_votes=0
                    udeo=0
                    for j in xrange(11, len(row)):
                        doc['teritorija'] = territory
                        doc['teritorijaSlug'] = territory_slug
                        doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                        doc['godina'] = int(year)
                        doc['rezultat'] = {}
                        doc['rezultat']['glasova'] = int(row[j])
                        if int(row[j]) != 0:
                            total_votes += int(row[j])
                            udeo = (float(int(row[j])) / total_votes) * 100
                            print udeo
                        else:
                            udeo = 0.0
                        doc['rezultat']['udeo'] = udeo
                        # Set remaining values depending on whether is is a presidential or parliamentary election
                        month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                        rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')
                        doc['mesec'] = month_cyr
                        doc['krug'] = rnd_cyr
                        doc['kandidat'] = candidates_or_parties[str(j)].title()
                        doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)
                        ''' if 'parentTerritory' in doc: print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory']) else: print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija']) '''
                        docs.append(doc.copy())
                        if len(docs) % 1000 == 0:
                            db[collection].insert(docs)
                            docs = []
                else:
                    total_votes=0
                    udeo=0
                    # Default layout: every second column starting at 13.
                    for j in xrange(13, len(row), 2):
                        # Set generic values
                        doc['teritorija'] = territory
                        doc['teritorijaSlug'] = territory_slug
                        doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                        doc['godina'] = int(year)
                        doc['rezultat'] = {}
                        doc['rezultat']['glasova'] = int(row[j])
                        if int(row[j]) != 0:
                            total_votes += int(row[j])
                            udeo = (float(int(row[j])) / total_votes) * 100
                            print udeo
                        else:
                            udeo = 0.0
                        doc['rezultat']['udeo'] = udeo
                        # Set remaining values depending on whether is is a presidential or parliamentary election
                        if election_type == 'predsjednicki':
                            month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                            rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')
                            doc['mesec'] = month_cyr
                            doc['krug'] = rnd_cyr
                            doc['kandidat'] = candidates_or_parties[str(j)].title()
                            doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)
                        else:
                            doc['izbornaLista'] = candidates_or_parties[str(j)]
                            doc['izbornaListaSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)
                        ''' if 'parentTerritory' in doc: print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory']) else: print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija']) '''
                        docs.append(doc.copy())
                        if len(docs) % 1000 == 0:
                            db[collection].insert(docs)
                            docs = []
            row_count += 1
    # Insert remaining documents
    if len(docs) > 0:
        db[collection].insert(docs)
def test_special_characters(self):
    """Special characters must come out of to_cyrillic unchanged."""
    result = cyrtranslit.to_cyrillic(special_chars)

    self.assertEqual(result, special_chars)
def latin_to_cyrillic(text):
    """Return `text` with its Latin letters converted to Serbian-Cyrillic letters.

    Delegates to ``cyrtranslit.to_cyrillic`` without passing a language code.
    """
    cyrillic_text = cyrtranslit.to_cyrillic(text)
    return cyrillic_text
def import_data(self, election_type, year, month=None, rnd=None):
    """Import election results from the XML data file into the 'izbori' collection.

    The XML file carries two ``Result`` elements per territory/candidate pair:
    one with the share of votes (a percentage) and one with the number of
    votes. Both values are merged into a single document, and the document is
    queued for insertion as soon as both values are known.

    :param election_type: election type slug. 'predsjednicki' selects the
        presidential layout (candidate, month and round); any other value
        selects the election-list layout.
    :param year: election year, converted with ``int``.
    :param month: election month; only read for presidential elections.
    :param rnd: election round; only read for presidential elections.
    """
    self.prep_import(election_type, year, month, rnd)

    file_path = self.get_data_file_path(election_type, year, month, rnd)
    root = xml.etree.ElementTree.parse(file_path).getroot()

    is_presidential = election_type == 'predsjednicki'
    election_cyr = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
    candidate_attr = u'Кандидат' if is_presidential else u'Изборна_листа'
    if is_presidential:
        month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
        rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')

    # territory -> candidate/list -> document under construction
    results = {}
    docs = []

    for result in root.findall('Result'):
        territory = result.attrib[u'Територија'].strip()
        data_type = result.attrib[u'Врста_податка'].strip()
        candidate = result.attrib[candidate_attr].strip()

        by_candidate = results.setdefault(territory, {})
        doc = by_candidate.get(candidate)

        if doc is None:
            # First element for this pair: build the document once instead
            # of recomputing slugs and grouping level on every element.
            territory_latin = cyrtranslit.to_latin(territory.encode('utf-8'), 'sr')
            candidate_slug = slugify(
                cyrtranslit.to_latin(candidate.encode('utf-8'), 'sr'), to_lower=True)

            doc = {
                'teritorija': territory,
                'teritorijaSlug': slugify(territory_latin, to_lower=True),
                'izbori': election_cyr,
                'godina': int(year),
                'rezultat': {
                    'udeo': None,
                    'glasova': None
                }
            }

            # All values with capital letters are grouped regions.
            # We mark them so that votes are not counted more than once.
            case_slug = slugify(territory_latin)
            lowered = case_slug.lower()
            if not case_slug.isupper():
                doc['instanca'] = 3
            elif 'okrug' in lowered or lowered == 'grad-beograd':
                doc['instanca'] = 2
            else:
                doc['instanca'] = 1

            if is_presidential:
                doc['mesec'] = month_cyr
                doc['krug'] = rnd_cyr
                doc['kandidat'] = candidate.title()
                doc['kandidatSlug'] = candidate_slug
            else:
                doc['izbornaLista'] = candidate
                doc['izbornaListaSlug'] = candidate_slug

            by_candidate[candidate] = doc

        rezultat = doc['rezultat']
        if '%' in data_type:
            # Share of the total number of votes, in percent (decimal comma).
            rezultat['udeo'] = float(result.text.replace(',', '.'))
        else:
            # Number of votes received.
            rezultat['glasova'] = int(result.text)

        if rezultat['udeo'] is not None and rezultat['glasova'] is not None:
            docs.append(doc)

    # Only insert when there is something to insert: an empty batch is
    # rejected by the driver, and the sibling importers guard the same way.
    if docs:
        db['izbori'].insert(docs)
def test_numerical_characters(self):
    """Numerical characters must come out of to_cyrillic unchanged."""
    result = cyrtranslit.to_cyrillic(numerical_chars)

    self.assertEqual(result, numerical_chars)
def test_special_diacritic_characters(self):
    """Diacritic characters must come out of to_cyrillic unchanged."""
    result = cyrtranslit.to_cyrillic(diacritic_chars)

    self.assertEqual(result, diacritic_chars)
def import_data_parliament_2016(self):
    """Import the 2016 parliamentary election results from the CSV data file.

    The first CSV row is treated as the header: columns 14 and up hold the
    election-list names. Every following non-blank row describes one polling
    station and produces one document per election list, carrying the
    station's metadata plus that list's vote count and share of the row's
    total votes. Documents are inserted in batches of 1000.
    """
    election_type = 'parlamentarni'
    year = 2016

    self.prep_import(election_type, year, None, None)

    file_path = self.get_data_file_path(election_type, year, None, None)

    row_count = 0
    docs = []
    candidates_or_parties = {}

    # NOTE(review): `collection` is not defined in this method; presumably a
    # module-level name -- confirm it exists where this method lives.
    with open(file_path, 'rb') as f:
        reader = csv.reader(f)

        for row in tqdm(reader):
            doc = {}

            if row_count == 0:
                # Header row: remember the list name of every result column.
                for i in range(14, len(row)):
                    candidates_or_parties[str(i)] = row[i].replace('\n', '')

            # FIXME: blank rows are skipped because row 8,350 is blank.
            # The length check also covers rows the csv reader returns empty.
            elif len(row) > 7 and row[7].strip() != '':
                parent_territory = row[1].strip()
                parent_territory_slug = slugify(
                    cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)

                territory = row[3].strip()
                territory_slug = slugify(
                    cyrtranslit.to_latin(territory, 'sr'), to_lower=True)

                polling_station_num = int(row[4].strip())
                polling_station_address = row[5].strip()
                coordinates = row[6].strip().split(',')

                registered_voters_count = int(row[7].strip())
                ballots_received_count = int(row[8].strip())
                unused_ballots_count = int(row[9].strip())
                voters_who_voted_count = int(row[10].strip())
                ballots_in_ballot_box_count = int(row[11].strip())
                invalid_ballots_count = int(row[12].strip())
                valid_ballots_count = int(row[13].strip())

                # Set election type and year
                doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                doc['godina'] = int(year)

                # Set generic location values
                doc['teritorija'] = territory
                doc['teritorijaSlug'] = territory_slug
                doc['parentTeritorija'] = parent_territory
                doc['parentTeritorijaSlug'] = parent_territory_slug
                doc['brojBirackogMesta'] = polling_station_num
                doc['adresaBirackogMesta'] = polling_station_address

                # FIXME: at least one coordinate is missing (row 1481)
                if len(coordinates) == 2:
                    doc['koordinateBirackomMestu'] = {
                        'latituda': float(coordinates[0].strip()),
                        'longituda': float(coordinates[1].strip())
                    }

                # Set generic ballot values
                doc['brojUpisanihBiracaUBirackiSpisak'] = registered_voters_count
                doc['biraciKojiSuGlasali'] = {'broj': voters_who_voted_count}
                doc['brojPrimljenihGlasackihListica'] = ballots_received_count
                doc['brojNeupoTrebljenihGlasackihListica'] = unused_ballots_count

                # The ballots-in-box count used to be overwritten by the
                # invalid-ballot count under the same key. Each value now has
                # its own key, matching the 2007 importer.
                doc['brojGlasackihListicaUKutiji'] = {
                    'broj': ballots_in_ballot_box_count}
                doc['nevazeciGlasackiListici'] = invalid_ballots_count

                doc['vazeciGlasackiListici'] = {'broj': valid_ballots_count}

                # For this year, we don't have grouped territories we are importing.
                # So every document is at the smallest unit of territory
                doc['instanca'] = 4

                # Total the row first, so each share is relative to all votes
                # in the row and not to a running subtotal.
                votes_by_column = [(j, int(row[j])) for j in range(14, len(row))]
                total_votes = sum(votes for _, votes in votes_by_column)

                for j, votes in votes_by_column:
                    if votes != 0 and total_votes != 0:
                        udeo = (float(votes) / total_votes) * 100
                    else:
                        udeo = 0.0

                    # A fresh dict per list: doc.copy() below is shallow.
                    doc['rezultat'] = {'glasova': votes, 'udeo': udeo}

                    doc['izbornaLista'] = candidates_or_parties[str(j)]
                    doc['izbornaListaSlug'] = slugify(
                        cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'),
                        to_lower=True)

                    docs.append(doc.copy())

                    # Flush in batches of 1000 documents.
                    if len(docs) % 1000 == 0:
                        db[collection].insert(docs)
                        docs = []

            row_count += 1

    # Insert remaining documents
    if docs:
        db[collection].insert(docs)
def import_data(self, election_type, year, month=None, rnd=None):
    """Import election results from the XML data file into the 'izbori' collection.

    The XML file carries two ``Result`` elements per territory/candidate pair:
    one with the share of votes (a percentage) and one with the number of
    votes. Both values are merged into a single document, and the document is
    queued for insertion as soon as both values are known.

    :param election_type: election type slug. 'predsjednicki' selects the
        presidential layout (candidate, month and round); any other value
        selects the election-list layout.
    :param year: election year, converted with ``int``.
    :param month: election month; only read for presidential elections.
    :param rnd: election round; only read for presidential elections.
    """
    self.prep_import(election_type, year, month, rnd)

    file_path = self.get_data_file_path(election_type, year, month, rnd)
    root = xml.etree.ElementTree.parse(file_path).getroot()

    is_presidential = election_type == 'predsjednicki'
    election_cyr = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
    candidate_attr = u'Кандидат' if is_presidential else u'Изборна_листа'
    if is_presidential:
        month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
        rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')

    # territory -> candidate/list -> document under construction
    results = {}
    docs = []

    for result in root.findall('Result'):
        territory = result.attrib[u'Територија'].strip()
        data_type = result.attrib[u'Врста_податка'].strip()
        candidate = result.attrib[candidate_attr].strip()

        by_candidate = results.setdefault(territory, {})
        doc = by_candidate.get(candidate)

        if doc is None:
            # First element for this pair: build the document once instead
            # of recomputing slugs and grouping level on every element.
            territory_latin = cyrtranslit.to_latin(territory.encode('utf-8'), 'sr')
            candidate_slug = slugify(
                cyrtranslit.to_latin(candidate.encode('utf-8'), 'sr'), to_lower=True)

            doc = {
                'teritorija': territory,
                'teritorijaSlug': slugify(territory_latin, to_lower=True),
                'izbori': election_cyr,
                'godina': int(year),
                'rezultat': {
                    'udeo': None,
                    'glasova': None
                }
            }

            # All values with capital letters are grouped regions.
            # We mark them so that votes are not counted more than once.
            case_slug = slugify(territory_latin)
            lowered = case_slug.lower()
            if not case_slug.isupper():
                doc['instanca'] = 3
            elif 'okrug' in lowered or lowered == 'grad-beograd':
                doc['instanca'] = 2
            else:
                doc['instanca'] = 1

            if is_presidential:
                doc['mesec'] = month_cyr
                doc['krug'] = rnd_cyr
                doc['kandidat'] = candidate.title()
                doc['kandidatSlug'] = candidate_slug
            else:
                doc['izbornaLista'] = candidate
                doc['izbornaListaSlug'] = candidate_slug

            by_candidate[candidate] = doc

        rezultat = doc['rezultat']
        if '%' in data_type:
            # Share of the total number of votes, in percent (decimal comma).
            rezultat['udeo'] = float(result.text.replace(',', '.'))
        else:
            # Number of votes received.
            rezultat['glasova'] = int(result.text)

        if rezultat['udeo'] is not None and rezultat['glasova'] is not None:
            docs.append(doc)

    # Only insert when there is something to insert: an empty batch is
    # rejected by the driver, and the sibling importers guard the same way.
    if docs:
        db['izbori'].insert(docs)
def test_alphabet_transliteration_latin_to_cyrillic(self):
    """The whole Macedonian Latin alphabet transliterates to the Cyrillic alphabet."""
    result = cyrtranslit.to_cyrillic(macedonian_alphabet_latin, lang_code='mk')

    self.assertEqual(result, macedonian_alphabet_cyrillic)
def test_alphabet_transliteration_latin_to_cyrillic(self):
    """The whole Ukrainian Latin alphabet transliterates to the Cyrillic alphabet."""
    result = cyrtranslit.to_cyrillic(ukrainian_alphabet_latin, lang_code='ua')

    self.assertEqual(result, ukrainian_alphabet_cyrillic)
def test_alphabet_transliteration_latin_to_cyrillic(self):
    """The whole Russian Latin alphabet transliterates to the Cyrillic alphabet.

    The expected value is the Cyrillic alphabet with the upper-case hard and
    soft signs replaced by their lower-case forms.
    """
    result = cyrtranslit.to_cyrillic(russian_alphabet_latin, lang_code='ru')
    expected = russian_alphabet_cyrillic.replace('Ъ', 'ъ').replace('Ь', 'ь')

    self.assertEqual(result, expected)
def import_data_parliament_2007(self):
    """Import the 2007 parliamentary election results from the CSV data file.

    The first CSV row is treated as the header: columns 12 and up hold the
    election-list names. Every following row produces one document per
    election list, carrying the row's ballot counts, its territory grouping
    level (``instanca``) and that list's vote count and share of the row's
    total votes. Documents are inserted in batches of 1000.
    """
    election_type = 'parlamentarni'
    year = 2007

    self.prep_import(election_type, year, None, None)

    file_path = self.get_data_file_path(election_type, year, None, None)

    row_count = 0
    docs = []
    candidates_or_parties = {}
    # Most recent instanca-2 territory; used as parent for the rows after it.
    parent_territory = ''

    # NOTE(review): `collection` is not defined in this method; presumably a
    # module-level name -- confirm it exists where this method lives.
    with open(file_path, 'rb') as f:
        reader = csv.reader(f)

        for row in tqdm(reader):
            doc = {}

            if row_count == 0:
                # Header row: remember the list name of every result column.
                for i in range(12, len(row)):
                    candidates_or_parties[str(i)] = row[i].replace('\n', '')

            else:
                territory = row[2].strip()
                territory_latin = cyrtranslit.to_latin(territory, 'sr')
                territory_slug = slugify(territory_latin, to_lower=True)

                # The station number may be blank on grouping rows. Keep the
                # "is it present" test apart from the integer value: the old
                # code compared an int to '' by identity, which never matched.
                polling_station_raw = row[3].strip()
                has_polling_station = polling_station_raw != ''
                polling_station_num = (
                    int(polling_station_raw) if has_polling_station else None)
                polling_station_address = row[4].strip()

                ballots_received_count = int(row[5].strip())
                unused_ballots_count = int(row[6].strip())
                number_of_voters_registered = int(row[7].strip())
                voters_who_voted_count = int(row[8].strip())
                ballots_in_ballot_box_count = int(row[9].strip())
                invalid_ballots_count = int(row[10].strip())
                valid_ballots_count = int(row[11].strip())

                doc['brojPrimljeniGlasackiListica'] = ballots_received_count
                doc['brojNeupotrebljenihGlasackiListica'] = unused_ballots_count
                doc['brojUpisanihBiracaUBirackiSpisak'] = number_of_voters_registered
                doc['nevazeciGlasackiListici'] = invalid_ballots_count
                doc['biraciKojiSuGlasali'] = {'broj': voters_who_voted_count}
                doc['brojGlasackihListicaUKutiji'] = {
                    'broj': ballots_in_ballot_box_count}
                doc['vazeciGlasackiListici'] = {'broj': valid_ballots_count}

                # These values are the same for every list in the row.
                doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                doc['godina'] = int(year)
                doc['teritorija'] = territory
                doc['teritorijaSlug'] = territory_slug

                # Some rows consist of territory grouping.
                # We need to track those.
                if territory_latin.isupper():
                    doc['instanca'] = 1

                elif ('okrug' in territory_slug
                      or territory_slug in ['grad-beograd', 'inostranstvo']
                      or (territory_slug == 'zavodi-za-izvrsenje-zavodskih-sankcija'
                          and not has_polling_station)):
                    doc['instanca'] = 2
                    parent_territory = territory

                else:
                    doc['parentTeritorija'] = parent_territory
                    doc['parentTeritorijaSlug'] = slugify(
                        cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)

                    if has_polling_station:
                        doc['instanca'] = 4
                        doc['brojBirackogMesta'] = polling_station_num
                        doc['adresaBirackogMesta'] = polling_station_address
                    else:
                        doc['instanca'] = 3

                # Total the row first, so each share is relative to all votes
                # in the row and not to a running subtotal.
                votes_by_column = [(j, int(row[j])) for j in range(12, len(row))]
                total_votes = sum(votes for _, votes in votes_by_column)

                for j, votes in votes_by_column:
                    if votes != 0 and total_votes != 0:
                        udeo = (float(votes) / total_votes) * 100
                    else:
                        udeo = 0.0

                    # A fresh dict per list: doc.copy() below is shallow.
                    doc['rezultat'] = {'glasova': votes, 'udeo': udeo}

                    doc['izbornaLista'] = candidates_or_parties[str(j)]
                    doc['izbornaListaSlug'] = slugify(
                        cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'),
                        to_lower=True)

                    docs.append(doc.copy())

                    # Flush in batches of 1000 documents.
                    if len(docs) % 1000 == 0:
                        db[collection].insert(docs)
                        docs = []

            row_count += 1

    # Insert remaining documents
    if docs:
        db[collection].insert(docs)
def test_alphabet_transliteration(self):
    """The whole Serbian Latin alphabet transliterates to the Cyrillic alphabet."""
    result = cyrtranslit.to_cyrillic(serbian_alphabet_latin)

    self.assertEqual(result, serbian_alphabet_cyrillic)
def cyr(to_translate):
    """Return `to_translate` transliterated from Latin into Cyrillic.

    Delegates to ``cyrtranslit.to_cyrillic`` without passing a language code.
    """
    transliterated = cyrtranslit.to_cyrillic(to_translate)
    return transliterated