Esempio n. 1
0
    def prep_import(self, election_type, year, month=None, rnd=None):
        """Remove previously imported documents for an election from ``db['izbori']``.

        For ``election_type == 'predsjednicki'`` the removal is narrowed by
        month and round as well, so ``month`` and ``rnd`` must be strings in
        that case (``.title()`` is called on both). Every other election type
        is removed by type and year only.

        Progress messages are printed before and after the removal.
        """
        by_round = election_type == 'predsjednicki'

        # Human readable description reused by both progress messages.
        if by_round:
            described = '%s %s %s %s' % (election_type, year, month, rnd)
        else:
            described = '%s %s' % (election_type, year)

        print('\nRemoving previously imported data for %s...' % described)

        # Stored values are the title-cased Serbian Cyrillic form of the slugs.
        criteria = {
            'izbori': cyrtranslit.to_cyrillic(election_type.title(), 'sr'),
            'godina': int(year),
        }
        if by_round:
            criteria['mesec'] = cyrtranslit.to_cyrillic(month.title(), 'sr')
            criteria['krug'] = cyrtranslit.to_cyrillic(rnd.title(), 'sr')
        db['izbori'].remove(criteria)

        print('Importing data for %s...' % described)
    def get_top_indicators_by_type(self, data_source,election_type_slug, godina, instanca,round_slug=None):
        """Return candidates/lists ranked by total votes, with their vote share.

        Args:
            data_source: ``1`` selects the ``izbori`` collection, anything else
                selects ``izbori2``.
            election_type_slug: Latin slug of the election type; it is
                title-cased and transliterated to Serbian Cyrillic for the query.
            godina: Election year to match.
            instanca: Value matched against the ``instanca`` field.
            round_slug: Optional round slug; only applied when
                ``election_type_slug == 'predsjednicki'``.

        Returns:
            The aggregated documents sorted by ``glasova`` descending. Each
            document's ``udeo`` is overwritten with its percentage of the total
            votes matched by the same filter (``0.0`` when that total is zero).
            An empty result is returned as-is.
        """
        collection = 'izbori' if data_source == 1 else 'izbori2'
        is_presidential = election_type_slug == 'predsjednicki'

        match = {
            'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
            'godina': godina,
            'instanca': instanca,
        }
        if is_presidential and round_slug is not None:
            match['krug'] = cyrtranslit.to_cyrillic(round_slug.title(), 'sr')

        # Presidential results are grouped per candidate, all others per list.
        if is_presidential:
            group_id = {
                'kandidat': '$kandidat',
                'kandidatSlug': '$kandidatSlug',
            }
        else:
            group_id = {
                'izbornaLista': '$izbornaLista',
                'izbornaListaSlug': '$izbornaListaSlug',
            }
        group = {
            '_id': group_id,
            'glasova': {"$sum": "$rezultat.glasova"},
            'udeo': {"$sum": "$rezultat.udeo"},
        }
        group_total = {
            "_id": None,
            "total": {"$sum": "$rezultat.glasova"},
        }

        pipeline = [
            {'$match': match},
            {'$group': group},
            {'$sort': {"glasova": -1}},
            {'$project': self.get_push_pipeline_operation_for_top_indicators(election_type_slug)}
        ]
        # A single $match is enough; the stage used to be listed twice.
        pipeline_total = [
            {"$match": match},
            {"$group": group_total}
        ]
        rsp_total = mongo.db[collection].aggregate(pipeline_total,allowDiskUse=True)
        rsp = mongo.db[collection].aggregate(pipeline)

        # No matching documents means no total row at all; treat it as zero
        # instead of failing on an empty list.
        totals = rsp_total['result']
        total_votes = totals[0]["total"] if totals else 0

        for candidate in rsp['result']:
            if total_votes:
                candidate["udeo"] = (float(candidate["glasova"]) / total_votes) * 100
            else:
                # Avoid dividing by zero when nobody received any vote.
                candidate["udeo"] = 0.0
        return rsp['result']
    def get_results_by_territory_by_candidate(self,data_source,election_type_slug,year,territory_slug,candidate_slug,instanca):
        """Return the first aggregated result of one candidate/list in one territory.

        Args:
            data_source: ``1`` selects the ``izbori`` collection, anything else
                selects ``izbori2``.
            election_type_slug: Latin slug of the election type; title-cased and
                transliterated to Serbian Cyrillic for the query.
            year: Election year to match.
            territory_slug: Value matched against ``teritorijaSlug``.
            candidate_slug: Matched against ``kandidatSlug`` for
                ``'predsjednicki'`` elections, otherwise against
                ``izbornaListaSlug``.
            instanca: Value matched against the ``instanca`` field.

        Returns:
            The first document of the aggregation result. For
            ``'predsjednicki'`` the turnout and ``rezultat`` fields are not part
            of the group key, so the projection has nothing to copy for them.

        Raises:
            IndexError: if ``rsp['result']`` is an empty list, i.e. nothing matched.
        """
        collection = 'izbori' if data_source == 1 else 'izbori2'

        # Filter shared by both election kinds.
        match = {
            'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
            'godina': year,
            'teritorijaSlug': territory_slug,
            'instanca': instanca,
        }
        # The 'teritorija' key used to be written twice in each literal; the
        # duplicate had no effect and is dropped.
        group_id = {
            'teritorija': '$teritorija',
            'teritorijaSlug': '$teritorijaSlug',
            'izbornaLista': '$izbornaLista',
            'izbornaListaSlug': '$izbornaListaSlug',
        }

        if election_type_slug == 'predsjednicki':
            match['kandidatSlug'] = candidate_slug
        else:
            match['izbornaListaSlug'] = candidate_slug
            group_id['brojUpisanihBiracaUBirackiSpisak'] = '$brojUpisanihBiracaUBirackiSpisak'
            group_id['biraciKojiSuGlasali'] = '$biraciKojiSuGlasali'
            group_id['rezultat'] = '$rezultat'

        group = {'_id': group_id}

        project = {
            '_id': 0,
            'teritorija': '$_id.teritorija',
            'teritorijaSlug': '$_id.teritorijaSlug',
            'izbornaListaSlug': '$_id.izbornaListaSlug',
            'izbornaLista': '$_id.izbornaLista',
            'brojUpisanihBiracaUBirackiSpisak': '$_id.brojUpisanihBiracaUBirackiSpisak',
            'biraciKojiSuGlasali': '$_id.biraciKojiSuGlasali',
            'rezultat': '$_id.rezultat',
        }
        pipeline = [
            {'$match': match},
            {'$group': group},
            {'$project': project}
        ]

        rsp = mongo.db[collection].aggregate(pipeline)
        return rsp['result'][0]
    def get_votes_grouped_by_party_or_candidate(self, data_source, election_type_slug, year, party_or_candidate_slug=None, round_slug=None):
        """Aggregate per-territory results grouped by party or candidate.

        ``data_source == 1`` reads the ``izbori`` collection, anything else
        reads ``izbori2``; for ``data_source == 2`` the group key is extended
        with the parent-territory, polling-station and ballot-count fields.

        When ``party_or_candidate_slug`` is given, the query is restricted to
        that candidate (``'predsjednicki'``) or list (other types) and only the
        first result document is returned; otherwise the full result list is
        returned.
        """
        single_entity = party_or_candidate_slug is not None
        collection = 'izbori' if data_source == 1 else 'izbori2'

        match = {
            'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
            'godina': year
        }
        if round_slug is not None:
            match['krug'] = cyrtranslit.to_cyrillic(round_slug.title(), 'sr')
        if single_entity:
            if election_type_slug == 'predsjednicki':
                slug_field = 'kandidatSlug'
            else:
                slug_field = 'izbornaListaSlug'
            match[slug_field] = party_or_candidate_slug

        group_id = self.get_id_pipeline_operation_for_votes_grouped_by_party_or_candidate(election_type_slug)
        if data_source == 2:
            # Each extra key maps to the document field of the same name.
            for field in (
                    'parentTeritorija',
                    'parentTeritorijaSlug',
                    'adresaBirackogMesta',
                    'koordinateBirackomMestu',
                    'brojUpisanihBiracaUBirackiSpisak',
                    'biraciKojiSuGlasali',
                    'brojPrimljenihGlasackihListica',
                    'brojNeupoTrebljenihGlasackihListica',
                    'brojGlasackihListicaUKutiji',
                    'vazeciGlasackiListici'):
                group_id[field] = '$' + field

        group = {
            '_id': group_id,
            'rezultat': {
                '$push': {
                    'teritorija': '$teritorija',
                    'teritorijaSlug': '$teritorijaSlug',
                    'rezultat': '$rezultat'
                }
            }
        }

        stages = [
            {'$match': match},
            {'$group': group},
            {'$project': self.get_poject_pipeline_operation_for_votes_grouped_by_party_or_candidate(election_type_slug)}
        ]
        documents = mongo.db[collection].aggregate(stages)['result']

        return documents[0] if single_entity else documents
    def test_special_diacritic_characters(self):
        ''' The diacritic sample must come back from to_cyrillic unchanged.
        '''
        self.assertEqual(
            cyrtranslit.to_cyrillic(diacritic_chars), diacritic_chars)
    def test_numerical_characters(self):
        ''' The numerical sample must come back from to_cyrillic unchanged.
        '''
        self.assertEqual(
            cyrtranslit.to_cyrillic(numerical_chars), numerical_chars)
    def test_alphabet_transliteration_latin_to_cyrillic(self):
        ''' The Bulgarian latin alphabet maps onto the Bulgarian cyrillic one.
        '''
        result = cyrtranslit.to_cyrillic(
            bulgarian_alphabet_latin, lang_code='bg')
        expected = bulgarian_alphabet_cyrillic

        self.assertEqual(result, expected)
Esempio n. 8
0
def proceed_token(t, translit=False, replace_i=True):
    """Normalise a single token.

    Steps, in order:
      1. Replace 'ё' with 'е' and ',' with '.'; optionally 'й' with 'и'.
      2. If ``isnum`` recognises the token, return its (truthy) result.
      3. Strip trailing 'ъ'; when ``translit`` is set and the token is pure
         ASCII, transliterate it to Russian Cyrillic.
      4. If ``dot_prog`` splits the token into several parts that are all
         numeric according to ``isnum``, return them joined with a Latin 'x'.
      5. If ``split_unit`` returns a tuple, return its first two items joined
         by a space.
      6. Otherwise turn dots into spaces, map each word through
         ``unit_lookup`` and keep only the words longer than one character.
    """
    token = t.replace('ё', 'е').replace(',', '.')
    if replace_i:
        token = token.replace('й', 'и')

    numeric = isnum(token)
    if numeric:
        return numeric

    token = token.rstrip('ъ')

    # Only pure-ASCII tokens are candidates for transliteration.
    if translit and all(ord(symbol) < 128 for symbol in token):
        token = cyrtranslit.to_cyrillic(token, 'ru')

    pieces = dot_prog.split(token)
    if len(pieces) > 1:
        numbers = [isnum(piece) for piece in pieces]
        if all(numbers):
            return 'x'.join(numbers)  # english x

    unit_parts = split_unit(token)
    if type(unit_parts) == tuple:
        return unit_parts[0] + ' ' + unit_parts[1]

    words = token.replace('.', ' ').split(' ')
    mapped = [unit_lookup.get(word, word) for word in words]
    return ' '.join(item for item in mapped if len(item) > 1)
    def test_mix_characters(self):
        ''' Converting the all-latin mixed sample with the default language
        must give the partly cyrillic expected sample.
        '''
        expected = mix_characters_some_cyrillic_no_alpha
        actual = cyrtranslit.to_cyrillic(mix_characters_all_latin_no_alpha)

        self.assertEqual(actual, expected)
    def test_alphabet_transliteration_latin_to_cyrillic(self):
        ''' The Montenegrin latin alphabet maps onto the Montenegrin cyrillic one.
        '''
        result = cyrtranslit.to_cyrillic(
            montenegrin_alphabet_latin, lang_code='me')
        expected = montenegrin_alphabet_cyrillic

        self.assertEqual(result, expected)
    def test_alphabet_transliteration(self):
        ''' The Serbian latin alphabet, converted with the default language,
        must equal the Serbian cyrillic alphabet.
        '''
        result = cyrtranslit.to_cyrillic(serbian_alphabet_latin)
        expected = serbian_alphabet_cyrillic

        self.assertEqual(result, expected)
Esempio n. 12
0
    def branching(text, plant, depth=0):
        """Recursively grow a tree of predicted continuations into ``plant``.

        Every call samples up to ten candidates from the model, drops the ones
        found in ``puncts`` and keeps at most the first two. Each kept option
        becomes a ``{"name": ..., "children": [...]}`` node appended to
        ``plant``, and the function recurses into that node's children with the
        option appended to ``text``.

        Returns ``plant``, or ``None`` once ``depth`` reaches ``max_depth``.
        Relies on names from the enclosing scope (``model``, ``vocab``,
        ``maxlen``, ``temperature``, ``indices_char``, ``puncts``,
        ``interactive``, ``max_depth``).
        """
        depth += 1
        if depth == max_depth:
            return

        encoded_text = textgenrnn_encode_sequence(
            text[-maxlen:], vocab, maxlen)
        # Temperatures are cycled according to the current text length.
        next_temperature = temperature[(len(text) - 1) % len(temperature)]

        def get_options(n_branches=2):
            sampled = textgenrnn_sample(
                model.predict(encoded_text, batch_size=1)[0],
                next_temperature,
                interactive=interactive,
                top_n=n_branches)
            decoded = [indices_char[idx] for idx in sampled]
            # Punctuation never becomes a branch.
            return [choice for choice in decoded if choice not in puncts]

        candidates = get_options(10)[:2]

        print("start")
        for position, option in enumerate(candidates):
            print(position)

            extended_text = text + [option]
            node = {"name": cyrtranslit.to_cyrillic(option), "children": []}
            plant.append(node)
            branching(extended_text, plant=node["children"], depth=depth)

        return plant
    def test_alphabet_transliteration_latin_to_cyrillic(self):
        ''' The Russian latin alphabet maps onto the Russian cyrillic one,
        compared with the upper-case hard and soft signs lower-cased.
        '''
        # The expected alphabet is adjusted so that 'Ъ' and 'Ь' are lower case.
        expected = russian_alphabet_cyrillic.replace('Ъ', 'ъ').replace('Ь', 'ь')
        result = cyrtranslit.to_cyrillic(
            russian_alphabet_latin, lang_code='ru')

        self.assertEqual(result, expected)
Esempio n. 14
0
 def l2c():
     """Show the Russian Cyrillic transliteration of ``Frame.m``'s text.

     Reads the text via ``Frame.m.get()``, opens a new ``tk.Tk`` window titled
     'Result(Latin2Cyrillic)', prints the transliterated text, copies it to the
     clipboard with ``pyperclip`` and displays it in a label.
     """
     source_text = Frame.m.get()

     window = tk.Tk()
     window.title('Result(Latin2Cyrillic)')

     converted = cyrtranslit.to_cyrillic(source_text, 'ru')
     print(converted)
     pyperclip.copy(converted)

     tk.Label(window, text=converted, font=16).pack(fill="x")
    def prep_import(self, election_type, year, month=None, rnd=None):
        """Clear earlier imports of an election from ``db['izbori']``.

        ``'predsjednicki'`` elections are removed by type, year, month and
        round (``month`` and ``rnd`` must then be strings, since ``.title()``
        is called on them); all other election types by type and year only.
        A progress message is printed before and after the removal.
        """
        presidential = election_type == 'predsjednicki'

        if presidential:
            summary = '%s %s %s %s' % (election_type, year, month, rnd)
        else:
            summary = '%s %s' % (election_type, year)

        print('\nRemoving previously imported data for %s...' % summary)

        # Query values are the title-cased slugs in Serbian Cyrillic.
        selector = {
            'izbori': cyrtranslit.to_cyrillic(election_type.title(), 'sr'),
            'godina': int(year),
        }
        if presidential:
            selector['mesec'] = cyrtranslit.to_cyrillic(month.title(), 'sr')
            selector['krug'] = cyrtranslit.to_cyrillic(rnd.title(), 'sr')
        db['izbori'].remove(selector)

        print('Importing data for %s...' % summary)
Esempio n. 16
0
def translite(file_in, file_out, ru=''):
    """Transliterate a text file line by line into another file.

    Args:
        file_in: Path of the file to read.
        file_out: Path of the file to write; it is opened in ``"w"`` mode, so
            existing content is replaced.
        ru: When truthy, each line is converted with
            ``cyrtranslit.to_latin(line, 'ru')``; otherwise with
            ``cyrtranslit.to_cyrillic(line)`` using the library default
            language.

    Both files are managed by ``with`` so the output handle is closed (and its
    buffer flushed) even if reading or transliteration raises.
    """
    # Pick the converter once instead of duplicating the copy loop.
    if ru:
        def convert(line):
            return cyrtranslit.to_latin(line, 'ru')
    else:
        convert = cyrtranslit.to_cyrillic

    # The output is opened first, as before, so the order of file-system side
    # effects is unchanged.
    with open(file_out, "w") as text_file, open(file_in) as f:
        for line in f:
            text_file.write(convert(line))
Esempio n. 17
0
 def search(name):
     """Collect de-duplicated film results for several spellings of ``name``.

     ``KinoPoisk.__get_list`` is queried four times: with the raw name, with
     separators and look-alike digits replaced, with the Russian Cyrillic
     transliteration, and with 'ы' swapped for 'й' in that transliteration.
     The combined results are passed through ``set`` so the order of the
     returned list is not defined.
     """
     films = KinoPoisk.__get_list(name)

     # Separators become spaces; the digits 1/0 become the letters i/o.
     for old, new in (('-', ' '), ('1', 'i'), ('0', 'o'), ('.', ' ')):
         name = name.replace(old, new)
     films += KinoPoisk.__get_list(name)

     prepared = name.replace('’', 'ь').replace('ya', 'я')
     name = cyrtranslit.to_cyrillic(prepared, 'ru')
     films += KinoPoisk.__get_list(name)

     name = name.replace('ы', 'й')
     films += KinoPoisk.__get_list(name)

     return list(set(films))
    def get_total_voters_turnout(self,data_source, election_type_slug, godina):
        """Compute overall turnout for one election type and year.

        Args:
            data_source: ``1`` selects the ``izbori`` collection, anything else
                selects ``izbori2``.
            election_type_slug: Latin slug of the election type; title-cased and
                transliterated to Serbian Cyrillic for the query.
            godina: Election year to match.

        Returns:
            ``{'percentage': ..., 'total_voters': ...}`` where ``total_voters``
            is the sum of ``biraciKojiSuGlasali.broj`` over the distinct group
            keys and ``percentage`` is that sum relative to the summed
            ``brojUpisanihBiracaUBirackiSpisak``. ``percentage`` is ``0`` when
            no registered voters were found.
        """
        collection = 'izbori' if data_source == 1 else 'izbori2'
        match = {
            'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
            'godina': godina,
        }

        # Grouping on these fields collapses documents that share the same
        # territory / polling-station / ballot-count values into one row.
        group_id = {'teritorija': '$teritorija'}
        for field in (
                'parentTeritorija',
                'parentTeritorijaSlug',
                'adresaBirackogMesta',
                'koordinateBirackomMestu',
                'brojUpisanihBiracaUBirackiSpisak',
                'biraciKojiSuGlasali',
                'brojPrimljenihGlasackihListica',
                'brojNeupoTrebljenihGlasackihListica',
                'brojGlasackihListicaUKutiji',
                'vazeciGlasackiListici'):
            group_id[field] = '$' + field
        group = {'_id': group_id}

        project = {
            '_id': 0,
            'teritorija': '$_id.teritorija',
            'brojUpisanihBiracaUBirackiSpisak': '$_id.brojUpisanihBiracaUBirackiSpisak',
            'biraciKojiSuGlasali': '$_id.biraciKojiSuGlasali.broj',
        }

        pipeline = [
            {'$match': match},
            {'$group': group},
            {'$project': project}
        ]
        rsp = mongo.db[collection].aggregate(pipeline)

        total_voters = 0
        total_registered = 0
        for rezultat in rsp['result']:
            total_voters += rezultat['biraciKojiSuGlasali']
            total_registered += rezultat['brojUpisanihBiracaUBirackiSpisak']

        # The percentage used to be recomputed on every iteration and raised
        # ZeroDivisionError while no registered voters had been accumulated.
        # Only the final value is returned, so compute it once, guarded.
        if total_registered:
            percentage = (float(total_voters) / total_registered) * 100
        else:
            percentage = 0
        return {'percentage': percentage, 'total_voters': total_voters}
 def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
     """Fill the ``after`` column with letter-wise transliterations of Latin words.

     Every value of ``X['before']`` that matches ``re_eng`` is transliterated
     to Russian with the backend named by ``self.algo`` (``'translit'``,
     ``'cyrtranslit'`` or ``'pytils'``), lower-cased, and encoded as its
     characters suffixed with ``_trans`` and joined by spaces. Values that do
     not match produce ``None``.

     Args:
         X: Frame with a ``before`` column; an existing ``after`` column is
             kept and only its missing entries are filled.
         y: Ignored.

     Returns:
         A copy of ``X`` (via ``assign``) with the ``after`` column set.

     Raises:
         ValueError: if a word needs transliteration and ``self.algo`` is not
             one of the supported names. Previously this either failed with an
             unbound local or silently reused the previous word's result.
     """
     data = []
     for w in tqdm(X['before'],
                   f'{self.__class__.__name__} transform',
                   total=len(X)):
         if not re_eng.match(w):
             data.append(None)
             continue

         if self.algo == 'translit':
             rus_w = translit(w, language_code='ru').lower()
         elif self.algo == 'cyrtranslit':
             rus_w = cyrtranslit.to_cyrillic(w, lang_code='ru').lower()
         elif self.algo == 'pytils':
             rus_w = pytils.translit.detranslify(w).lower()
         else:
             raise ValueError(
                 f'unsupported transliteration algo: {self.algo!r}')
         data.append(' '.join([c + u'_trans' for c in rus_w]))

     if 'after' in X.columns:
         # Keep existing values; only fill the gaps with the new ones.
         return X.assign(
             after=X['after'].combine_first(pd.Series(data, index=X.index)))
     else:
         return X.assign(after=data)
Esempio n. 20
0
 def tokenize(self, content, author_id, my_id=0):
     """Split a chat message into marker, entity, trigram and character tokens.

     The result starts with ``_MY_MESSAGE_BEGIN_`` when ``author_id`` equals
     ``my_id`` (otherwise ``_NOT_MY_MESSAGE_BEGIN_``) and ends with
     ``_MESSAGE_END_``. In between, for every regex match:

     * capture groups 0-6 (URL, mention of ``my_id``, other ``<@..>`` mention,
       ``<#..>``, ``<@&..>``, ``<:name:id>``, ``<a:name:id>``) are replaced by
       ``self.entity_to_word[group_index]``;
     * ``@everyone`` / ``@here`` are kept verbatim;
     * a run of non-digit word characters is lower-cased, transliterated to
       Russian Cyrillic and expanded with ``self.trigramize``, preceded by
       ``_CAPS_`` (all upper case) or ``_SHIFT_`` (leading upper case);
     * any other single character is kept verbatim.
     """
     pattern = (
         r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)|"
         r"(<@!?" + str(my_id) + r">)|"
         r"(<@!?\d{16,20}>)|"
         r"(<#\d{16,20}>)|"
         r"(<@&\d{16,20}>)|"
         r"(<:\w{1,32}:\d{16,20}>)|"
         r"(<[a]:\w{1,32}:\d{16,20}>)|"
         r"(@everyone|@here)|"
         r"([^\d\W]+)|"
         r"(.)"
     )
     matches = re.findall(pattern, content, re.UNICODE)

     opening = "_MY_MESSAGE_BEGIN_" if author_id == my_id else "_NOT_MY_MESSAGE_BEGIN_"
     tokens = [opening]

     for groups in matches:
         for position, piece in enumerate(groups):
             if not piece:
                 continue

             if position <= 6:
                 # Entities collapse to one placeholder word each.
                 tokens.append(self.entity_to_word[position])
                 continue

             if position == 8:
                 if piece.isupper():
                     tokens.append("_CAPS_")
                 elif piece[0].isupper():
                     tokens.append("_SHIFT_")
                 cyrillic = cyrtranslit.to_cyrillic(piece.lower(), "ru")
                 tokens.extend(self.trigramize(cyrillic))
                 continue

             # Group 7 (@everyone/@here) and group 9 (any other character).
             tokens.append(piece)

     tokens.append("_MESSAGE_END_")
     return tokens
    def get_winners_for_each_territory(self, data_source,election_type_slug,year,instanca):
        """Return, per territory, its results pushed in descending vote order.

        Documents matching election type, year and ``instanca`` are sorted by
        ``rezultat.glasova`` (highest first) and then grouped by territory, each
        group collecting the entries produced by
        ``get_push_pipeline_operation_for_votes_grouped_by_territory_group_by_result``.

        ``data_source == 1`` reads ``izbori``, anything else ``izbori2``.
        """
        collection = 'izbori' if data_source == 1 else 'izbori2'

        match_stage = {
            'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
            'godina': year,
            'instanca': instanca
        }
        pushed_entry = self.get_push_pipeline_operation_for_votes_grouped_by_territory_group_by_result(
            election_type_slug)
        group_stage = {
            '_id': {
                'teritorija': '$teritorija',
                'teritorijaSlug': '$teritorijaSlug',
            },
            'rezultat': {'$push': pushed_entry},
        }
        project_stage = {
            '_id': 0,
            'teritorija': '$_id.teritorija',
            'teritorijaSlug': '$_id.teritorijaSlug',
            'rezultat': 1,
        }

        # Sorting happens before grouping so the sort sees raw documents.
        stages = [
            {'$match': match_stage},
            {'$sort': {"rezultat.glasova": -1}},
            {'$group': group_stage},
            {'$project': project_stage}
        ]
        response = mongo.db[collection].aggregate(stages, allowDiskUse=True)
        return response['result']
    def test_mix_characters(self):
        ''' Converting the all-latin mixed sample with the default language
        must give the partly cyrillic expected sample.
        '''
        expected = mix_characters_some_cyrillic_no_alpha
        actual = cyrtranslit.to_cyrillic(mix_characters_all_latin_no_alpha)

        self.assertEqual(actual, expected)
    def get_votes_grouped_by_territory(self, data_source, election_type_slug, year, instanca, territory_slug=None, round_slug=None,range_of_documents=None):
        """Aggregate results grouped by territory.

        The filter always includes election type and year; ``instanca`` is only
        applied for non-``'predsjednicki'`` elections, ``round_slug`` and
        ``territory_slug`` only when given. Matching documents are sorted by
        ``rezultat.glasova`` descending, cut to the first 100, then grouped.

        ``data_source == 1`` reads ``izbori``, anything else ``izbori2``; for
        ``data_source == 2`` the parent-territory, polling-station and
        ballot-count fields are added to both the group key and the projection.

        ``range_of_documents`` is accepted but does not influence the query.
        """
        collection = 'izbori' if data_source == 1 else 'izbori2'

        # Fields only carried along for the second data source.
        station_fields = (
            'parentTeritorija',
            'parentTeritorijaSlug',
            'adresaBirackogMesta',
            'koordinateBirackomMestu',
            'brojUpisanihBiracaUBirackiSpisak',
            'biraciKojiSuGlasali',
            'brojPrimljenihGlasackihListica',
            'brojNeupoTrebljenihGlasackihListica',
            'brojGlasackihListicaUKutiji',
            'vazeciGlasackiListici',
        )

        match = {
            'izbori': cyrtranslit.to_cyrillic(election_type_slug.title(), 'sr'),
            'godina': year
        }
        # For now, we only support territorial levels for parliament elections
        if election_type_slug != 'predsjednicki' and instanca is not None:
            match['instanca'] = instanca
        if round_slug is not None:
            match['krug'] = cyrtranslit.to_cyrillic(round_slug.title(), 'sr')
        if territory_slug is not None:
            match['teritorijaSlug'] = territory_slug

        group_id = {
            'teritorija': '$teritorija',
            'teritorijaSlug': '$teritorijaSlug',
        }
        pushed_entry = self.get_push_pipeline_operation_for_votes_grouped_by_territory_group_by_result(
            election_type_slug)
        group = {
            '_id': group_id,
            'rezultat': {'$push': pushed_entry},
        }
        project = {
            '_id': 0,
            'teritorija': '$_id.teritorija',
            'teritorijaSlug': '$_id.teritorijaSlug',
            'rezultat': 1,
        }

        if data_source == 2:
            for field in station_fields:
                group_id[field] = '$' + field
                project[field] = '$_id.' + field

        stages = [
            {'$match': match},
            {'$sort': {"rezultat.glasova": -1}},
            {'$limit': 100},
            {'$group': group},
            {'$project': project},
        ]
        # Both former branches on range_of_documents issued this same call.
        response = mongo.db[collection].aggregate(stages, allowDiskUse=True)

        return response['result']
Esempio n. 24
0
def textgenrnn_generate_special(model,
                                vocab,
                                indices_char,
                                temperature=0.5,
                                maxlen=40,
                                meta_token='<s>',
                                word_level=False,
                                single_text=False,
                                max_gen_length=300,
                                interactive=False,
                                top_n=3,
                                prefix=None,
                                synthesize=False,
                                input_text=None,
                                input_depth=6):
    """Build a tree of predicted continuations for ``input_text``.

    Starting from ``input_text``, the model is sampled recursively: each level
    keeps at most two non-punctuation candidates and every candidate becomes a
    node ``{"name": <cyrillic text>, "children": [...]}``.

    Args:
        model: Model exposing ``inputs``, ``outputs`` and ``predict``. When it
            has more than one input it is re-wrapped to use its first input and
            second output.
        vocab: Passed through to ``textgenrnn_encode_sequence``.
        indices_char: Mapping from sampled index to token.
        temperature: A single value or a list; lists are cycled by text length.
        maxlen: Number of trailing tokens encoded for each prediction.
        interactive: Forwarded to ``textgenrnn_sample``.
        input_text: List of tokens to continue; defaults to ``['']``. It must
            not be empty because its last item names the root node.
        input_depth: Recursion limit. Branching stops as soon as the depth
            counter reaches it, so ``1`` or less yields no children.
        meta_token, word_level, single_text, max_gen_length, top_n, prefix,
        synthesize: Accepted for signature compatibility; not used here.

    Returns:
        ``(tree, end)`` where ``tree`` is the root node dict and ``end`` is
        always ``False``.
    """
    if input_text is None:
        input_text = ['']

    end = False
    text = input_text

    # Raw string: same characters as before, without relying on the invalid
    # "\," escape sequence.
    puncts = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

    if not isinstance(temperature, list):
        temperature = [temperature]
    if len(model.inputs) > 1:
        model = Model(inputs=model.inputs[0], outputs=model.outputs[1])

    max_depth = input_depth

    def branching(text, plant, depth=0):
        """Append up to two child nodes to ``plant`` and recurse into each."""
        depth += 1
        # ">=" (instead of "==") also terminates for input_depth <= 0, and
        # returning ``plant`` keeps "children" a list at the depth limit.
        if depth >= max_depth:
            return plant

        encoded_text = textgenrnn_encode_sequence(text[-maxlen:], vocab,
                                                  maxlen)
        next_temperature = temperature[(len(text) - 1) % len(temperature)]

        def get_options(n_branches=2):
            options_index = textgenrnn_sample(
                model.predict(encoded_text, batch_size=1)[0],
                next_temperature,
                interactive=interactive,
                top_n=n_branches)
            options = [indices_char[idx] for idx in options_index]
            # Punctuation never becomes a branch.
            return [o for o in options if o not in puncts]

        # Sample generously, then keep the first two usable candidates.
        options = get_options(10)[:2]

        print("start")
        for index, option in enumerate(options):
            print(index)

            item_text = text + [option]

            cyr_option = cyrtranslit.to_cyrillic(option)
            node_object = {"name": cyr_option, "children": []}
            plant.append(node_object)
            branching(item_text, plant=node_object["children"], depth=depth)

        return plant

    plt = branching(text=text, plant=[])
    return_value = {"name": cyrtranslit.to_cyrillic(text[-1]), "children": plt}

    return return_value, end
Esempio n. 25
0
# Fragment: rewrites selected columns of a ';'-separated BOM file into Russian
# Cyrillic. The handlers of this ``try`` and the use of ``writer`` are not part
# of the visible snippet.
try:
    reader = csv.reader(source_bom_file, delimiter=';')
    writer = csv.writer(rus_bom_file, delimiter=';')
    row_num = 0
    for row in reader:

        # csv_str is the same list object as row; cells are edited in place.
        csv_str = row
        col_num = 0
        row_num = row_num + 1

        # Transliterate every column listed in translate_colums.
        while col_num < len(csv_str):
            if col_num in translate_colums:
                if (col_num == pe3name_col):
                    # A "-" placeholder in the pe3name column is replaced by
                    # the current value of the part number column instead of
                    # being transliterated.
                    if (csv_str[col_num] != "-"):
                        csv_str[col_num] = cyrtranslit.to_cyrillic(
                            csv_str[col_num].encode("utf-8"), 'ru')
                    else:
                        csv_str[col_num] = csv_str[partnum_col]
                else:
                    # NOTE(review): the cell is UTF-8 encoded before being
                    # passed on, which presumably targets Python 2 strings;
                    # confirm before running under Python 3.
                    csv_str[col_num] = cyrtranslit.to_cyrillic(
                        csv_str[col_num].encode("utf-8"), 'ru')
            col_num = col_num + 1

        # Single-character values in these columns are blanked out.
        if (len(csv_str[prefix_col]) == 1): csv_str[prefix_col] = ''
        if (len(csv_str[partnum_col]) == 1): csv_str[partnum_col] = ''
        if (len(csv_str[desc_col]) == 1): csv_str[desc_col] = ''

        # Named copies of the processed cells for the code that follows.
        prefix_value = csv_str[prefix_col]
        partnumber_value = csv_str[partnum_col]
        pe3name_value = csv_str[pe3name_col]
        desc_value = csv_str[desc_col]
Esempio n. 26
0
def cyr(to_translate):
    """Return ``to_translate`` as transliterated by ``cyrtranslit.to_cyrillic``.

    No language code is passed, so the library's default language is used.
    """
    transliterated = cyrtranslit.to_cyrillic(to_translate)
    return transliterated
    def import_data_rest(self, election_type, year, month=None, rnd=None):
        """Import election results from a CSV file into the database.

        Calls ``self.prep_import`` first, then reads the CSV file located by
        ``self.get_data_file_path``. Row 0 is treated as the header holding the
        candidate/party names, row 1 is skipped, and every following row is
        turned into one document per candidate/party column. Documents are
        buffered in ``docs`` and written with ``db[collection].insert`` in
        batches of 1000, with a final insert for the remainder.

        The column layout differs per year and election type, which is why
        most of the body is a series of year/type-specific branches.

        :param election_type: election type slug; the code branches on
            'predsjednicki' and 'parlamentarni'.
        :param year: election year; converted with ``int(year)`` everywhere.
        :param month: month name, used (via ``month.title()``) for the
            'predsjednicki' documents.
        :param rnd: round name, used (via ``rnd.title()``) for the
            'predsjednicki' documents.

        NOTE(review): ``collection`` is not assigned anywhere in this method;
        presumably it is a module-level name - confirm.
        """

        self.prep_import(election_type, year, month, rnd)

        file_path = self.get_data_file_path(election_type, year, month, rnd)

        row_count = 0
        docs = []
        # Maps str(column index) -> candidate/party name taken from the header.
        candidates_or_parties = {}
        # Last seen level-2 territory; attached as parent to level 3/4 rows.
        parent_territory = ''

        with open(file_path, 'rb') as f:
            reader = csv.reader(f)

            for row in tqdm(reader):
                doc = {}

                # Get all the candidates/parties
                # The first result column depends on the year/election type.
                # NOTE(review): the 2004 and 2008 checks are standalone `if`s,
                # so for those elections the trailing `else` below runs as well
                # and re-adds columns 13, 15, ... (same header values).
                if row_count == 0:
                    if int(year) == 2004 and election_type == "predsjednicki":
                        for i in xrange(11, len(row)):
                            candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()

                    if int(year) == 2008 and election_type == "predsjednicki":
                        for i in xrange(8, len(row)):
                            candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()

                    if int(year) == 2003 and election_type in ["predsjednicki", "parlamentarni"]:
                        for i in xrange(6, len(row)):
                            candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()
                    elif int(year) == 2002 and election_type == "predsjednicki":
                        for i in xrange(7, len(row)):
                            candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()
                    else:
                        # Default layout: names sit in every second column from 13 on.
                        for i in xrange(13, len(row), 2):
                            candidates_or_parties[str(i)] = row[i].replace('\n', '').strip()

                # The second row of the file is ignored.
                elif row_count == 1:
                    pass

                else:

                    # --- Parse the territory / polling-station columns ---
                    if int(year)==2004 and election_type=="predsjednicki":
                        territory = row[1].strip()
                        territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True)
                        polling_station_num = int(row[2].strip())
                        polling_station_address = row[3].strip()
                        ballots_received_count = int(row[4].strip())
                        unused_ballots_count = int(row[5].strip())
                        registered_voters_count = int(row[6].strip())
                        voters_who_voted_count = int(row[8].strip())
                        invalid_ballots_count = int(row[9].strip())
                        valid_ballots_count = int(row[10].strip())
                        print row_count

                    else:
                        print row_count
                        territory = row[0].strip()
                        territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True)
                        # polling_station_num ends up as an int, or as '' when the cell is empty.
                        # NOTE(review): `is not ''` compares identity, not equality.
                        polling_station_num = int(row[1].strip()) if row[1].strip() is not '' else row[1].strip()
                        polling_station_address = row[2].strip()

                        registered_voters_count = int(row[3].strip())

                    # --- Parse the ballot statistics; columns vary per election ---
                    if int(year) == 2012 and election_type == "predsjednicki":
                        ballots_received_count = int(row[6].strip())
                        unused_ballots_count = int(row[7].strip())
                        voters_who_voted_count = int(row[8].strip())
                        invalid_ballots_count = int(row[9].strip())
                        invalid_ballots_percent = float(row[10].strip())
                        valid_ballots_count = int(row[11].strip())
                        valid_ballots_percent = float(row[12].strip())

                    if int(year) == 2012 and election_type == "parlamentarni":
                        voters_who_voted_count = int(row[4].strip())
                        voters_who_voted_percent = float(row[5].strip())
                        ballots_received_count = int(row[6].strip())
                        unused_ballots_count = int(row[7].strip())
                        ballots_in_ballot_box_count=int(row[8].strip())
                        invalid_ballots_count = int(row[9].strip())
                        invalid_ballots_percent = float(row[10].strip())
                        valid_ballots_count = int(row[11].strip())
                        valid_ballots_percent = float(row[12].strip())


                    if int(year)==2008 and election_type=="predsjednicki":
                        voters_who_voted_count = int(row[6].strip())
                        voters_who_voted_percent=float(row[7].strip())
                    if int(year) not in [2008, 2012]  and election_type != "predsjednicki":
                        voters_who_voted_count = int(row[4].strip())

                    if int(year) == 2003 and election_type in["predsjednicki","parlamentarni"]:
                        voters_who_voted_count = int(row[4].strip())
                        total_voter_turn_out = float(row[5].strip())



                    if int(year) == 2002 and election_type == "predsjednicki":
                        print row_count
                        voters_who_voted_count = int(row[4].strip())
                        total_voter_turn_out = float(row[5].strip())


                    # NOTE(review): this condition only holds when election_type is
                    # neither 'predsjednicki' nor 'parlamentarni' - confirm that is intended.
                    if int(year) not in [2002, 2003,2004] and election_type not in ["predsjednicki", "parlamentarni"]:
                        voters_who_voted_percent = float(row[5].strip())
                        ballots_received_count = int(row[6].strip())
                        unused_ballots_count = int(row[7].strip())
                        ballots_in_ballot_box_count = int(row[8].strip())
                        invalid_ballots_count = int(row[9].strip())
                        invalid_ballots_percent = float(row[10].strip())
                        valid_ballots_count = int(row[11].strip())
                        valid_ballots_percent = float(row[12].strip())


                    # --- Copy the parsed statistics into the document ---
                    doc['brojUpisanihBiracaUBirackiSpisak'] = registered_voters_count
                    doc['biraciKojiSuGlasali'] = {}

                    doc['biraciKojiSuGlasali']['broj'] = voters_who_voted_count

                    if int(year) in [2002, 2003] and election_type in ["predsjednicki", "parlamentarni"]:
                        doc['odzivBiraca']=total_voter_turn_out

                    # NOTE(review): as above, this branch is only taken for election
                    # types other than 'predsjednicki' / 'parlamentarni'.
                    if int(year) not in [2002, 2003] and election_type not in ["predsjednicki", "parlamentarni"]:
                        doc['biraciKojiSuGlasali']['udeo'] = voters_who_voted_percent
                        doc['brojPrimljenihGlasackihListica'] = ballots_received_count
                        doc['brojNeupoTrebljenihGlasackihListica'] = unused_ballots_count
                        if int(year) not in [2012, 2004] and election_type!="predsjednicki":
                            doc['brojGlasackihListicaUKutiji'] = ballots_in_ballot_box_count
                        # NOTE(review): the same key is overwritten right here and then
                        # filled with the invalid-ballot figures - confirm the intended key.
                        doc['brojGlasackihListicaUKutiji'] = {}
                        doc['brojGlasackihListicaUKutiji']['broj'] = invalid_ballots_count
                        if int(year)!=2004 and election_type!="predsjednicki":
                            doc['brojGlasackihListicaUKutiji']['udeo'] = invalid_ballots_percent
                        doc['vazeciGlasackiListici'] = {}
                        doc['vazeciGlasackiListici']['broj'] = valid_ballots_count
                        if int(year) != 2004 and election_type != "predsjednicki":
                            doc['vazeciGlasackiListici']['udeo'] = valid_ballots_percent
                    # Some rows consist of territory grouping.
                    # We need to track those.
                    # 'instanca' levels assigned below:
                    #   1 - territory name is all upper case (after transliteration)
                    #   2 - slug contains 'okrug', or is one of the listed special slugs;
                    #       this territory becomes the parent of the following rows
                    #   3 - row without a polling station number
                    #   4 - row with a polling station number
                    # In the level-2 test `and` binds tighter than `or`, so the
                    # polling_station_num check applies only to the 'zavodi-...' slug.
                    # NOTE(review): `is ''` / `is not ''` compare identity, not equality.
                    if cyrtranslit.to_latin(territory, 'sr').isupper():
                        doc['instanca'] = 1

                    elif 'okrug' in territory_slug\
                            or territory_slug in ['grad-beograd', 'inostranstvo']\
                            or territory_slug == 'zavodi-za-izvrsenje-zavodskih-sankcija' and polling_station_num is '':
                        doc['instanca'] = 2
                        parent_territory = territory

                    elif polling_station_num is '':
                        doc['instanca'] = 3
                        doc['parentTeritorija'] = parent_territory
                        doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)

                    elif polling_station_num is not '':
                        doc['instanca'] = 4
                        doc['parentTeritorija'] = parent_territory
                        doc['parentTeritorijaSlug'] = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)
                        doc['brojBirackogMesta'] = polling_station_num
                        doc['adresaBirackogMesta'] = polling_station_address

                    # --- Emit one document per candidate/party column ---
                    # Each branch reuses `doc`, overwrites the per-result fields and
                    # appends a shallow copy. The bare triple-quoted strings inside
                    # the loops are no-op expression statements (disabled debug prints).
                    # NOTE(review): in every branch total_votes is a running sum, so
                    # 'udeo' is the share of the votes counted so far in the row, not
                    # of the row total - confirm whether that is intended.
                    if int(year)==2003 and election_type in ["parlamentarni"]:
                        total_votes=0
                        udeo=0
                        for j in xrange(6, len(row)):
                            doc['teritorija'] = territory
                            doc['teritorijaSlug'] = territory_slug
                            doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                            doc['godina'] = int(year)

                            doc['rezultat'] = {}
                            doc['rezultat']['glasova'] = int(row[j])


                            if int(row[j]) != 0:
                                total_votes += int(row[j])
                                udeo = (float(int(row[j])) / total_votes) * 100

                            else:
                                udeo = 0.0

                            doc['rezultat']['udeo'] =float(udeo)


                            doc['izbornaLista'] = candidates_or_parties[str(j)]
                            doc['izbornaListaSlug'] = slugify(
                            cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)

                            '''
                            if 'parentTerritory' in doc:
                                print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory'])
                            else:
                                print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija'])
                            '''

                            docs.append(doc.copy())

                            # Flush the buffer every 1000 documents.
                            if len(docs) % 1000 == 0:
                                db[collection].insert(docs)
                                docs = []
                    elif int(year) == 2002 and election_type == "predsjednicki":
                        total_votes=0
                        udeo=0
                        for j in xrange(7, len(row)):
                            doc['teritorija'] = territory
                            doc['teritorijaSlug'] = territory_slug
                            doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                            doc['godina'] = int(year)

                            doc['rezultat'] = {}


                            doc['rezultat']['glasova'] = int(row[j])
                            if int(row[j]) != 0:
                                print int(row[j])
                                total_votes += int(row[j])
                                udeo = (float(int(row[j])) / total_votes) * 100

                            else:
                                udeo = 0.0
                            doc['rezultat']['udeo'] = udeo
                            # Set remaining values depending on whether is is a presidential or parliamentary election

                            month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                            rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')

                            doc['mesec'] = month_cyr
                            doc['krug'] = rnd_cyr
                            doc['kandidat'] = candidates_or_parties[str(j)].title()
                            doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'),
                                                              to_lower=True)

                            '''
                            if 'parentTerritory' in doc:
                                print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory'])
                            else:
                                print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija'])
                            '''

                            docs.append(doc.copy())

                            if len(docs) % 1000 == 0:
                                db[collection].insert(docs)
                                docs = []
                    elif int(year) == 2003 and election_type == "predsjednicki":
                        total_votes=0
                        udeo=0
                        for j in xrange(6, len(row)):
                            doc['teritorija'] = territory
                            doc['teritorijaSlug'] = territory_slug
                            doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                            doc['godina'] = int(year)

                            doc['rezultat'] = {}


                            doc['rezultat']['glasova'] = int(row[j])
                            if int(row[j]) != 0:
                                print int(row[j])
                                total_votes += int(row[j])
                                udeo = (float(int(row[j])) / total_votes) * 100

                            else:
                                udeo = 0.0
                            doc['rezultat']['udeo'] = udeo
                            # Set remaining values depending on whether is is a presidential or parliamentary election

                            month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                            rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')

                            doc['mesec'] = month_cyr
                            doc['krug'] = rnd_cyr
                            doc['kandidat'] = candidates_or_parties[str(j)].title()
                            doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'),
                                                              to_lower=True)

                            '''
                            if 'parentTerritory' in doc:
                                print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory'])
                            else:
                                print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija'])
                            '''

                            docs.append(doc.copy())

                            if len(docs) % 1000 == 0:
                                db[collection].insert(docs)
                                docs = []

                    elif int(year) == 2004 and election_type == "predsjednicki":
                        total_votes=0
                        udeo=0
                        for j in xrange(11, len(row)):
                            doc['teritorija'] = territory
                            doc['teritorijaSlug'] = territory_slug
                            doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                            doc['godina'] = int(year)

                            doc['rezultat'] = {}

                            doc['rezultat']['glasova'] = int(row[j])
                            if int(row[j]) != 0:
                                total_votes += int(row[j])
                                udeo = (float(int(row[j])) / total_votes) * 100
                                print udeo
                            else:
                                udeo = 0.0
                            doc['rezultat']['udeo'] = udeo
                            # Set remaining values depending on whether is is a presidential or parliamentary election

                            month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                            rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')

                            doc['mesec'] = month_cyr
                            doc['krug'] = rnd_cyr
                            doc['kandidat'] = candidates_or_parties[str(j)].title()
                            doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'),
                                                          to_lower=True)

                            '''
                            if 'parentTerritory' in doc:
                                print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory'])
                            else:
                                print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija'])
                            '''

                            docs.append(doc.copy())

                            if len(docs) % 1000 == 0:
                                db[collection].insert(docs)
                                docs = []

                    else:
                        # Default layout: vote counts are read from every second
                        # column starting at 13 (matching the header parsing above).
                        total_votes=0
                        udeo=0
                        for j in xrange(13, len(row), 2):
                            # Set generic values
                            doc['teritorija'] = territory
                            doc['teritorijaSlug'] = territory_slug
                            doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                            doc['godina'] = int(year)

                            doc['rezultat'] = {}
                            doc['rezultat']['glasova'] = int(row[j])
                            if int(row[j]) != 0:
                                total_votes += int(row[j])
                                udeo = (float(int(row[j])) / total_votes) * 100
                                print udeo
                            else:
                                udeo = 0.0
                            doc['rezultat']['udeo'] = udeo
                            # Set remaining values depending on whether is is a presidential or parliamentary election
                            if election_type == 'predsjednicki':
                                month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
                                rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')

                                doc['mesec'] = month_cyr
                                doc['krug'] = rnd_cyr
                                doc['kandidat'] = candidates_or_parties[str(j)].title()
                                doc['kandidatSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)

                            else:
                                doc['izbornaLista'] = candidates_or_parties[str(j)]
                                doc['izbornaListaSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'), to_lower=True)

                            '''
                            if 'parentTerritory' in doc:
                                print '%s - %s - %s - %s' % (row_count+1, doc['instanca'], doc['teritorija'], doc['parentTerritory'])
                            else:
                                print '%s - %s - %s' % (row_count + 1, doc['instanca'], doc['teritorija'])
                            '''

                            docs.append(doc.copy())

                            if len(docs) % 1000 == 0:
                                db[collection].insert(docs)
                                docs = []

                row_count += 1

        # Insert remaining documents
        if len(docs) > 0:
            db[collection].insert(docs)
    def test_special_characters(self):
        ''' Transliterating special characters must return them unchanged.
        '''
        expected = special_chars
        actual = cyrtranslit.to_cyrillic(special_chars)

        self.assertEqual(actual, expected)
Esempio n. 29
0
 def latin_to_cyrillic(text):
     """
     Transliterate Latin ``text`` to Cyrillic with ``cyrtranslit.to_cyrillic``.

     No language code is passed, so the library default applies (described
     by the original author as Serbian Cyrillic).
     """
     converted = cyrtranslit.to_cyrillic(text)
     return converted
    def import_data(self, election_type, year, month=None, rnd=None):
        """Import election results from an XML file into ``db['izbori']``.

        Calls ``self.prep_import`` first, then parses the XML file located by
        ``self.get_data_file_path`` and walks its ``Result`` elements. The
        file carries two elements per territory/candidate pair: one with the
        share of votes (its data-type attribute contains '%') and one with the
        number of votes. Both are merged into a single document, which is
        queued for insertion once both values are present.

        :param election_type: election type slug; 'predsjednicki' selects the
            candidate attribute and the presidential fields, anything else
            selects the electoral-list attribute.
        :param year: election year; stored as ``int(year)``.
        :param month: month name, used only for 'predsjednicki'.
        :param rnd: round name, used only for 'predsjednicki'.
        """

        self.prep_import(election_type, year, month, rnd)

        file_path = self.get_data_file_path(election_type, year, month, rnd)

        root = xml.etree.ElementTree.parse(file_path).getroot()

        is_presidential = election_type == 'predsjednicki'
        name_attr = u'Кандидат' if is_presidential else u'Изборна_листа'

        # results[territory][candidate] -> document being assembled
        results = {}
        docs = []
        for result in root.findall('Result'):
            territory = result.attrib[u'Територија'].strip()
            data_type = result.attrib[u'Врста_податка'].strip()
            candidate = result.attrib[name_attr].strip()

            # We have two entries per territory. One for share of votes (in percentage) and one for number of votes.
            # We want to save both numbers in the same document
            # To achieve this, we keep track of created documents per territory
            by_candidate = results.setdefault(territory, {})

            entry = by_candidate.get(candidate)
            if entry is None:
                territory_latin = cyrtranslit.to_latin(territory.encode('utf-8'), 'sr')

                entry = {
                    'teritorija': territory,
                    'teritorijaSlug': slugify(territory_latin, to_lower=True),
                    'izbori': cyrtranslit.to_cyrillic(election_type.title(), 'sr'),
                    'godina': int(year),
                    'rezultat': {
                        'udeo': None,
                        'glasova': None
                    }
                }
                by_candidate[candidate] = entry

                # All values with capital letters are grouped regions
                # we need to mark them so that we don't count votes more than once
                # (this slug is deliberately built without to_lower so that
                # isupper() can detect the capitalised names)
                territory_slug = slugify(territory_latin)
                slug_lower = territory_slug.lower()

                if territory_slug.isupper() and ('okrug' in slug_lower or slug_lower == 'grad-beograd'):
                    entry['instanca'] = 2

                elif territory_slug.isupper():
                    entry['instanca'] = 1

                else:
                    entry['instanca'] = 3

                # Set remaining values depending on whether it is a presidential or parliamentary election
                candidate_slug = slugify(cyrtranslit.to_latin(candidate.encode('utf-8'), 'sr'), to_lower=True)
                if is_presidential:
                    entry['mesec'] = cyrtranslit.to_cyrillic(month.title(), 'sr')
                    entry['krug'] = cyrtranslit.to_cyrillic(rnd.title(), 'sr')
                    entry['kandidat'] = candidate.title()
                    entry['kandidatSlug'] = candidate_slug

                else:
                    entry['izbornaLista'] = candidate
                    entry['izbornaListaSlug'] = candidate_slug

            outcome = entry['rezultat']

            # Share of the total number of votes won by the list, %
            # (the source uses a decimal comma)
            if '%' in data_type:
                outcome['udeo'] = float(result.text.replace(',', '.'))

            # Number of votes won by the list
            else:
                outcome['glasova'] = int(result.text)

            # Queue the document once both halves have been seen.
            if outcome['udeo'] is not None and outcome['glasova'] is not None:
                docs.append(entry)

        # Insert documents; skip the call when nothing was collected, because
        # inserting an empty list is rejected by the driver (the CSV importers
        # in this file guard the same way).
        if docs:
            db['izbori'].insert(docs)
    def test_numerical_characters(self):
        ''' Transliterating numerical characters must return them unchanged.
        '''
        expected = numerical_chars
        actual = cyrtranslit.to_cyrillic(numerical_chars)

        self.assertEqual(actual, expected)
    def test_special_diacritic_characters(self):
        ''' Transliterating diacritic characters must return them unchanged.
        '''
        expected = diacritic_chars
        actual = cyrtranslit.to_cyrillic(diacritic_chars)

        self.assertEqual(actual, expected)
    def import_data_parliament_2016(self):
        """Import the 2016 parliamentary election results from CSV.

        Calls ``self.prep_import`` for ('parlamentarni', 2016), then reads the
        CSV file located by ``self.get_data_file_path``. Row 0 is the header:
        columns 14 onwards hold the electoral-list names. Every other row
        whose column 7 is non-empty describes one polling station and yields
        one document per electoral list. Documents are buffered and written
        with ``db[collection].insert`` in batches of 1000, plus a final insert
        for the remainder.

        NOTE(review): ``collection`` is not assigned in this method;
        presumably it is a module-level name - confirm.
        """
        election_type = 'parlamentarni'
        year = 2016
        self.prep_import(election_type, year, None, None)
        file_path = self.get_data_file_path(election_type, year, None, None)
        row_count = 0
        docs = []
        # Maps str(column index) -> electoral-list name taken from the header.
        candidates_or_parties = {}
        with open(file_path, 'rb') as f:
            reader = csv.reader(f)

            for row in tqdm(reader):
                doc = {}

                # Get all the candidates/parties
                if row_count == 0:
                    for i in range(14, len(row)):
                        candidates_or_parties[str(i)] = row[i].replace('\n', '')

                # FIXME: we do this because row 8,350 is blank.
                # Compared by value: `is not ''` only worked through string
                # interning, which is an implementation detail.
                elif row[7].strip() != '':
                    parent_territory = row[1].strip()
                    parent_territory_slug = slugify(cyrtranslit.to_latin(parent_territory, 'sr'), to_lower=True)

                    territory = row[3].strip()
                    territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'), to_lower=True)

                    polling_station_num = int(row[4].strip())
                    polling_station_address = row[5].strip()
                    coordinates = row[6].strip().split(',')

                    registered_voters_count = int(row[7].strip())
                    ballots_received_count = int(row[8].strip())
                    unused_ballots_count = int(row[9].strip())

                    voters_who_voted_count = int(row[10].strip())

                    ballots_in_ballot_box_count = int(row[11].strip())

                    invalid_ballots_count = int(row[12].strip())

                    valid_ballots_count = int(row[13].strip())

                    # Set election type and year
                    doc['izbori'] = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
                    doc['godina'] = int(year)

                    # Set generic location values
                    doc['teritorija'] = territory
                    doc['teritorijaSlug'] = territory_slug

                    doc['parentTeritorija'] = parent_territory
                    doc['parentTeritorijaSlug'] = parent_territory_slug

                    doc['brojBirackogMesta'] = polling_station_num
                    doc['adresaBirackogMesta'] = polling_station_address

                    # FIXME: at least one coordinate is missing (row 1481)
                    if len(coordinates) == 2:
                        doc['koordinateBirackomMestu'] = {}
                        doc['koordinateBirackomMestu']['latituda'] = float(coordinates[0].strip())
                        doc['koordinateBirackomMestu']['longituda'] = float(coordinates[1].strip())

                    # Set generic ballot values (no percentages are available
                    # in this file, so only the 'broj' counts are stored)
                    doc['brojUpisanihBiracaUBirackiSpisak'] = registered_voters_count

                    doc['biraciKojiSuGlasali'] = {}
                    doc['biraciKojiSuGlasali']['broj'] = voters_who_voted_count

                    doc['brojPrimljenihGlasackihListica'] = ballots_received_count
                    doc['brojNeupoTrebljenihGlasackihListica'] = unused_ballots_count
                    doc['brojGlasackihListicaUKutiji'] = ballots_in_ballot_box_count

                    # NOTE(review): the same key is overwritten here and then
                    # filled with the invalid-ballot count. Kept as-is because
                    # readers of the collection may rely on the stored shape -
                    # confirm the intended key.
                    doc['brojGlasackihListicaUKutiji'] = {}
                    doc['brojGlasackihListicaUKutiji']['broj'] = invalid_ballots_count

                    doc['vazeciGlasackiListici'] = {}
                    doc['vazeciGlasackiListici']['broj'] = valid_ballots_count

                    # For this year, we don't have grouped territories we are importing.
                    # So every document is at the smallest unit of territory
                    doc['instanca'] = 4

                    # Sum the row first so that every list's share is taken
                    # against the full row total rather than a running sum
                    # (which made the first non-zero list always 100%).
                    votes = [int(row[j]) for j in range(14, len(row))]
                    total_votes = sum(votes)

                    for j, vote_count in enumerate(votes, 14):
                        # total_votes is non-zero whenever vote_count is.
                        if vote_count != 0:
                            udeo = (float(vote_count) / total_votes) * 100
                        else:
                            udeo = 0.0

                        doc['rezultat'] = {}
                        doc['rezultat']['glasova'] = vote_count
                        doc['rezultat']['udeo'] = udeo

                        doc['izbornaLista'] = candidates_or_parties[str(j)]
                        doc['izbornaListaSlug'] = slugify(cyrtranslit.to_latin(candidates_or_parties[str(j)], 'sr'),
                                                          to_lower=True)

                        # doc is reused for every list, so store a copy.
                        docs.append(doc.copy())

                        # Flush the buffer every 1000 documents.
                        if len(docs) % 1000 == 0:
                            db[collection].insert(docs)
                            docs = []

                row_count += 1

        # Insert remaining documents
        if len(docs) > 0:
            db[collection].insert(docs)
Esempio n. 34
0
    def import_data(self, election_type, year, month=None, rnd=None):
        ''' Import territory-level election results from an XML file into MongoDB.

        Previously imported data is cleared through ``prep_import``; the XML
        file located by ``get_data_file_path`` is then parsed and one document
        per (territory, candidate/list) pair is inserted into ``db['izbori']``.

        The file is expected to hold two ``Result`` elements per pair: one with
        the share of votes (its data type contains ``%``) and one with the
        number of votes. Both values are merged into the same document, which
        is queued for insertion as soon as both are known.

        :param election_type: election type slug; ``'predsjednicki'`` selects
            the presidential layout (candidate, month, round), any other value
            the electoral-list layout.
        :param year: election year, stored as ``int(year)``.
        :param month: month name; only used for presidential elections.
        :param rnd: round name; only used for presidential elections.
        '''
        self.prep_import(election_type, year, month, rnd)

        file_path = self.get_data_file_path(election_type, year, month, rnd)
        root = xml.etree.ElementTree.parse(file_path).getroot()

        is_presidential = election_type == 'predsjednicki'

        # Loop-invariant transliterations: compute once, not once per document.
        election_cyr = cyrtranslit.to_cyrillic(election_type.title(), 'sr')
        month_cyr = None
        rnd_cyr = None
        if is_presidential:
            month_cyr = cyrtranslit.to_cyrillic(month.title(), 'sr')
            rnd_cyr = cyrtranslit.to_cyrillic(rnd.title(), 'sr')

        # results[territory][candidate] -> document under construction
        results = {}
        docs = []
        for result in root.findall('Result'):
            territory = result.attrib[u'Територија'].strip()
            data_type = result.attrib[u'Врста_податка'].strip()
            if is_presidential:
                candidate = result.attrib[u'Кандидат'].strip()
            else:
                candidate = result.attrib[u'Изборна_листа'].strip()

            # We have two entries per territory: one for the share of votes
            # (in percent) and one for the number of votes. Both go into the
            # same document, so created documents are tracked per territory.
            territory_results = results.setdefault(territory, {})

            if candidate not in territory_results:
                territory_latin = cyrtranslit.to_latin(
                    territory.encode('utf-8'), 'sr')
                candidate_slug = slugify(
                    cyrtranslit.to_latin(candidate.encode('utf-8'), 'sr'),
                    to_lower=True)

                new_doc = {
                    'teritorija': territory,
                    'teritorijaSlug': slugify(territory_latin, to_lower=True),
                    'izbori': election_cyr,
                    'godina': int(year),
                    'rezultat': {
                        'udeo': None,
                        'glasova': None
                    }
                }

                # All values with capital letters are grouped regions; they
                # are marked so that votes are not counted more than once.
                # The slug is built without lower-casing on purpose, so the
                # case of the territory name can still be inspected.
                territory_slug = slugify(territory_latin)
                if territory_slug.isupper():
                    lowered = territory_slug.lower()
                    if 'okrug' in lowered or lowered == 'grad-beograd':
                        new_doc['instanca'] = 2
                    else:
                        new_doc['instanca'] = 1
                else:
                    new_doc['instanca'] = 3

                # Remaining values depend on whether this is a presidential
                # or a parliamentary election.
                if is_presidential:
                    new_doc['mesec'] = month_cyr
                    new_doc['krug'] = rnd_cyr
                    new_doc['kandidat'] = candidate.title()
                    new_doc['kandidatSlug'] = candidate_slug
                else:
                    new_doc['izbornaLista'] = candidate
                    new_doc['izbornaListaSlug'] = candidate_slug

                territory_results[candidate] = new_doc

            doc = territory_results[candidate]
            rezultat = doc['rezultat']
            was_complete = (rezultat['udeo'] is not None
                            and rezultat['glasova'] is not None)

            if '%' in data_type:
                # Share of the total number of votes, in percent
                # (decimal comma in the source data).
                rezultat['udeo'] = float(result.text.replace(',', '.'))
            else:
                # Number of votes received.
                rezultat['glasova'] = int(result.text)

            # Queue the document exactly once, at the moment it becomes
            # complete; a repeated entry must not queue the same dict twice.
            if (not was_complete and rezultat['udeo'] is not None
                    and rezultat['glasova'] is not None):
                docs.append(doc)

        # Insert documents. An empty batch is skipped because PyMongo rejects
        # an empty bulk insert.
        if docs:
            db['izbori'].insert(docs)
    def test_alphabet_transliteration_latin_to_cyrillic(self):
        ''' The whole Macedonian latin alphabet must transliterate to the
        Macedonian cyrillic alphabet.
        '''
        actual = cyrtranslit.to_cyrillic(macedonian_alphabet_latin,
                                         lang_code='mk')
        expected = macedonian_alphabet_cyrillic

        self.assertEqual(actual, expected)
    def test_alphabet_transliteration_latin_to_cyrillic(self):
        ''' The whole Ukrainian latin alphabet must transliterate to the
        Ukrainian cyrillic alphabet.
        '''
        actual = cyrtranslit.to_cyrillic(ukrainian_alphabet_latin,
                                         lang_code='ua')
        expected = ukrainian_alphabet_cyrillic

        self.assertEqual(actual, expected)
    def test_alphabet_transliteration_latin_to_cyrillic(self):
        ''' The whole Russian latin alphabet must transliterate to cyrillic.

        The upper-case hard and soft signs of the reference alphabet are
        replaced by their lower-case forms before the comparison.
        '''
        actual = cyrtranslit.to_cyrillic(russian_alphabet_latin,
                                         lang_code='ru')
        expected = russian_alphabet_cyrillic.replace('Ъ', 'ъ').replace('Ь', 'ь')

        self.assertEqual(actual, expected)
    def import_data_parliament_2007(self):
        ''' Import the 2007 parliamentary election results from a CSV file.

        Previously imported data is cleared through ``prep_import`` and the
        CSV file located by ``get_data_file_path`` is read row by row. The
        first row is the header: columns 12 and up hold the names of the
        electoral lists. Every following row yields one document per
        electoral list, combining the row's ballot statistics with the number
        and share of votes of that list. Documents are inserted in batches of
        1000.

        Each row is classified through ``instanca``:

        * 1 - the latin form of the territory name is all upper-case;
        * 2 - the territory slug contains ``okrug``, is ``grad-beograd`` or
          ``inostranstvo``, or is the penal-institutions entry without a
          polling station number; such a row becomes the parent territory of
          the rows that follow;
        * 3 - any other row without a polling station number;
        * 4 - a row with a polling station number.
        '''
        election_type = 'parlamentarni'
        year = 2007
        self.prep_import(election_type, year, None, None)
        file_path = self.get_data_file_path(election_type, year, None, None)

        # Same for every document, so transliterate only once.
        election_cyr = cyrtranslit.to_cyrillic(election_type.title(), 'sr')

        docs = []
        list_names = {}  # column index -> electoral list name
        list_slugs = {}  # column index -> slug of that name
        parent_territory = ''

        with open(file_path, 'rb') as f:
            reader = csv.reader(f)

            for row_count, row in enumerate(tqdm(reader)):
                # Header row: collect all the candidates/parties.
                if row_count == 0:
                    for i in range(12, len(row)):
                        list_names[i] = row[i].replace('\n', '')
                        list_slugs[i] = slugify(
                            cyrtranslit.to_latin(list_names[i], 'sr'),
                            to_lower=True)
                    continue

                territory = row[2].strip()
                territory_slug = slugify(cyrtranslit.to_latin(territory, 'sr'),
                                         to_lower=True)

                # The polling station cell is compared as text: converting it
                # to int up front would make the "no polling station" checks
                # below impossible to satisfy and fail on an empty cell.
                polling_station_raw = row[3].strip()
                has_polling_station = polling_station_raw != ''
                polling_station_address = row[4].strip()

                doc = {}
                doc['brojPrimljeniGlasackiListica'] = int(row[5].strip())
                doc['brojNeupotrebljenihGlasackiListica'] = int(row[6].strip())
                doc['brojUpisanihBiracaUBirackiSpisak'] = int(row[7].strip())
                doc['nevazeciGlasackiListici'] = int(row[10].strip())
                doc['biraciKojiSuGlasali'] = {'broj': int(row[8].strip())}
                doc['brojGlasackihListicaUKutiji'] = {
                    'broj': int(row[9].strip())
                }
                doc['vazeciGlasackiListici'] = {'broj': int(row[11].strip())}

                doc['izbori'] = election_cyr
                doc['godina'] = int(year)
                doc['teritorija'] = territory
                doc['teritorijaSlug'] = territory_slug

                # Some rows consist of territory grouping.
                # We need to track those.
                if cyrtranslit.to_latin(territory, 'sr').isupper():
                    doc['instanca'] = 1

                elif ('okrug' in territory_slug
                      or territory_slug in ['grad-beograd', 'inostranstvo']
                      or (territory_slug == 'zavodi-za-izvrsenje-zavodskih-sankcija'
                          and not has_polling_station)):
                    doc['instanca'] = 2
                    parent_territory = territory

                else:
                    doc['parentTeritorija'] = parent_territory
                    doc['parentTeritorijaSlug'] = slugify(
                        cyrtranslit.to_latin(parent_territory, 'sr'),
                        to_lower=True)

                    if has_polling_station:
                        doc['instanca'] = 4
                        doc['brojBirackogMesta'] = int(polling_station_raw)
                        doc['adresaBirackogMesta'] = polling_station_address
                    else:
                        doc['instanca'] = 3

                # Parse all votes first: the share of each list must be taken
                # against the total of the whole row, not a running total.
                votes_by_column = [(j, int(row[j]))
                                   for j in range(12, len(row))]
                total_votes = sum(votes for _, votes in votes_by_column)

                for j, votes in votes_by_column:
                    if votes != 0 and total_votes != 0:
                        udeo = (float(votes) / total_votes) * 100
                    else:
                        udeo = 0.0

                    list_doc = doc.copy()
                    list_doc['rezultat'] = {'glasova': votes, 'udeo': udeo}
                    list_doc['izbornaLista'] = list_names[j]
                    list_doc['izbornaListaSlug'] = list_slugs[j]
                    docs.append(list_doc)

                    # Flush in batches of 1000 documents.
                    # NOTE(review): `collection` is not defined inside this
                    # method; presumably a module-level name - confirm which
                    # collection is intended.
                    if len(docs) >= 1000:
                        db[collection].insert(docs)
                        docs = []

        # Insert remaining documents
        if docs:
            db[collection].insert(docs)
    def test_alphabet_transliteration(self):
        ''' The whole Serbian latin alphabet must transliterate to the
        Serbian cyrillic alphabet.
        '''
        actual = cyrtranslit.to_cyrillic(serbian_alphabet_latin)
        expected = serbian_alphabet_cyrillic

        self.assertEqual(actual, expected)
Esempio n. 40
0
def cyr(to_translate):
    """
    Return `to_translate` transliterated from latin script into cyrillic.

    Thin wrapper around `cyrtranslit.to_cyrillic`, called without an explicit
    language code.
    """
    cyrillic_text = cyrtranslit.to_cyrillic(to_translate)
    return cyrillic_text
    def test_special_characters(self):
        ''' Transliterating to cyrillic must leave special characters
        unchanged.
        '''
        actual = cyrtranslit.to_cyrillic(special_chars)

        self.assertEqual(actual, special_chars)