def analyse(self, vi_ch, smart=False):
        '''Logs the score of every candidate English letter for one Vietnamese mark.

        Each letter listed in ``vi_ch_alphabet[vi_ch]`` is passed to
        ``self.apply_ch`` together with the mark. From the returned table two
        sums are taken: ``fer`` (``en_chosen * vi_prob``) and ``fvr``
        (``vi_chosen * en_prob``). Their total is the score. The three numbers
        are only logged; nothing is returned.

        Parameters
        ----------
        vi_ch : char
            Vietnamese mark
        smart : bool
            forwarded to ``self.apply_ch``; whether or not the method is smart
            enough to always choose the option with the higher probability
        '''
        for candidate in vi_ch_alphabet[vi_ch]:
            outcome = self.apply_ch(vi_ch, candidate, smart=smart)
            # Element-wise products of the paired columns, summed over all rows.
            fer = (outcome['en_chosen'] * outcome['vi_prob']).sum()
            fvr = (outcome['vi_chosen'] * outcome['en_prob']).sum()
            logger.info("{}: {}={}+{}".format(candidate, fer + fvr, fer, fvr))
 def search_all(self, attempt_count=100, smart=False):
     '''Runs ``self.search`` from several random starting points and keeps the best result.

     For every attempt a random input method is built by picking, for each
     entry of ``vi_code``, one random letter out of ``vi_ch_alphabet`` for
     that entry. The local search is started from it, and the outcome
     replaces the current best when its score is strictly lower.

     Parameters
     ----------
     attempt_count : int
         number of random restarts
     smart : bool
         forwarded to ``self.search``

     Returns
     -------
     tuple
         ``(best_input_method, best_df, best_fer, best_fvr, best_score)``.
         When no attempt produced a result (e.g. ``attempt_count`` is 0) the
         first four items are ``None`` and the score is ``inf``.
     '''
     # Bind every result name up front so that logging and the final return
     # never hit an unbound local; ``inf`` lets the first real score win.
     best_input_method = best_df = best_fer = best_fvr = None
     best_score = float('inf')
     for _ in range(attempt_count):
         letters = []
         for vi_ch in vi_code:
             options = vi_ch_alphabet[vi_ch]
             letters.append(options[randrange(len(options))])
         input_method = ''.join(letters)

         input_method, df, fer, fvr, score = self.search(input_method,
                                                         smart=smart)
         if score < best_score:
             best_df = df
             best_fer = fer
             best_fvr = fvr
             best_score = score
             best_input_method = input_method
         logger.info(
             "------ Current best score {}={}+{} from {} -----".format(
                 best_score, best_fer, best_fvr, best_input_method))
     return best_input_method, best_df, best_fer, best_fvr, best_score
    def search(self, initial_input_method, smart=False):
        '''Searches for the locally best input method. Not too optimal at the moment

        Starting from ``initial_input_method``, every position (one per entry
        of ``vi_code``) is tried with every character of ``vi_alphabet``.
        Candidates rejected by ``valid`` are skipped. A candidate becomes the
        new best as soon as its score (``fer + fvr`` as returned by
        ``self.apply``) is strictly lower, and the remaining candidates of the
        sweep are derived from that new best. Sweeps repeat until one of them
        brings no improvement.

        Returns
        -------
        tuple
            ``(best_input_method, best_df, best_fer, best_fvr, best_score)``
        '''
        current = initial_input_method
        current_df, current_fer, current_fvr = self.apply(current, smart=smart)
        current_score = current_fer + current_fvr
        logger.info("Initial score {}={}+{} from {}".format(
            current_score, current_fer, current_fvr, current))

        improved = True
        while improved:
            improved = False
            for pos in range(len(vi_code)):
                for letter in vi_alphabet:
                    # Swap in a single character at ``pos``.
                    candidate = current[:pos] + letter + current[pos + 1:]
                    if not valid(candidate):
                        continue
                    trial_df, trial_fer, trial_fvr = self.apply(candidate,
                                                                smart=smart)
                    trial_score = trial_fer + trial_fvr
                    if trial_score < current_score:
                        # Adopt immediately; later candidates build on it.
                        current = candidate
                        current_df = trial_df
                        current_fer = trial_fer
                        current_fvr = trial_fvr
                        current_score = trial_score
                        logger.info("Better score {}={}+{} from {}".format(
                            current_score, current_fer, current_fvr, current))
                        improved = True

        logger.info("Best input method: {}".format(current))
        return current, current_df, current_fer, current_fvr, current_score
Example #4
0
def make_corpus(in_f, trie_f, out_f):
    """Expand a viwiki split csv with one trie-probability column per letter.

    The trie is loaded from ``trie_f`` and the csv from ``in_f``; the csv's
    columns are renamed to ``word``, ``count`` and ``vi_prob``. For every
    letter of the candidate alphabet a column named after that letter is
    added, holding ``trie.prob`` of the word with its last character replaced
    by the letter. The expanded table is written to ``out_f``.
    """
    logger.info("Loading the enwiki trie '{}'...".format(trie_f))
    trie = Trie.from_file(trie_f)

    logger.info("Loading the viwiki split csv '{}'...".format(in_f))
    df = pc.read_csv(in_f)
    df.columns = ['word', 'count', 'vi_prob']

    # The literal lists 'd' twice; dict.fromkeys drops the repeat while
    # keeping first-seen order, so the expensive per-letter pass over the
    # whole column is not run twice for the same output column.
    alphabet = dict.fromkeys('bdfjklqrsvwxzadeou')
    # Every word without its final character.
    s = df['word'].str.slice(stop=-1)
    for ch in alphabet:
        logger.info(ch)
        df[ch] = s.apply(lambda x: trie.prob(x + ch))

    logger.info("Saving the expanded viwiki split csv '{}'...".format(out_f))
    pc.to_csv(df, out_f, index=False)
Example #5
0
def make_corpus(in_f, out_f):
    """Convert a Wikipedia xml dump file to a word-count csv.

    Every article yielded by ``WikiCorpus(in_f, token_min_len=1).get_texts()``
    is folded into a token counter. Progress is logged every 10000 articles.
    The counts are written to ``out_f`` as a table with the columns ``word``
    and ``count``, sorted by descending count. A dump that yields no tokens
    produces an empty table with the same two columns.
    """

    logger.info("Opening the Wikipedia dump '{}'...".format(in_f))
    wiki = WikiCorpus(in_f, token_min_len=1)

    i = 0
    w = 0
    counter = Counter()
    for text in wiki.get_texts():
        if i % 10000 == 0:
            logger.info('Processed {} articles with {} words'.format(i, w))
        w += len(text)
        counter.update(text)
        i += 1
    logger.info('Processing {} articles with {} words complete!'.format(i, w))

    # Name the columns explicitly so that an empty counter still yields a
    # 'count' column; going through from_dict/rename leaves it missing and
    # sort_values would raise KeyError.
    df = pd.DataFrame(list(counter.items()), columns=['word', 'count'])
    df.sort_values('count', axis=0, ascending=False, inplace=True)
    pc.to_csv(df, out_f, index=False)
Example #6
0
def make_corpus(in_f, trie_f, out_f):
    """Expand a viwiki split csv with one trie-probability column per letter.

    The trie is loaded from ``trie_f`` and the csv from ``in_f``; the csv's
    columns are renamed to ``word``, ``count`` and ``vi_prob``. For every
    letter of the candidate alphabet a column named after that letter is
    added, holding ``trie.prob`` of the word with its last character replaced
    by the letter. The expanded table is written to ``out_f``.
    """
    logger.info("Loading the enwiki trie '{}'...".format(trie_f))
    trie = Trie.from_file(trie_f)

    logger.info("Loading the viwiki split csv '{}'...".format(in_f))
    df = pc.read_csv(in_f)
    df.columns = ['word', 'count', 'vi_prob']

    # The literal lists 'd' twice; dict.fromkeys drops the repeat while
    # keeping first-seen order, so the expensive per-letter pass over the
    # whole column is not run twice for the same output column.
    alphabet = dict.fromkeys('bdfjklqrsvwxzadeou')
    # Every word without its final character.
    s = df['word'].str.slice(stop=-1)
    for ch in alphabet:
        logger.info(ch)
        df[ch] = s.apply(lambda x: trie.prob(x + ch))

    logger.info("Saving the expanded viwiki split csv '{}'...".format(out_f))
    pc.to_csv(df, out_f, index=False)


if __name__ == '__main__':
    # Script entry point: exactly three arguments are expected after the
    # script name (input csv, trie file, output csv); otherwise log the usage
    # line and exit with status 1.
    if len(sys.argv) != 4:
        logger.info(
            'Usage: python expand_split_corpus.py <viwiki_prefix_splitX-....csv> <enwiki_restricted_....trie> <viwiki_prefix_splitX_expanded....csv>'
        )
        sys.exit(1)

    in_f, trie_f, out_f = sys.argv[1:]
    make_corpus(in_f, trie_f, out_f)
                s = vi_ch_alphabet[vi_ch]
                input_method += s[randrange(len(s))]
            input_method, df, fer, fvr, score = self.search(input_method,
                                                            smart=smart)
            if score < best_score:
                best_df = df
                best_fer = fer
                best_fvr = fvr
                best_score = score
                best_input_method = input_method
            logger.info(
                "------ Current best score {}={}+{} from {} -----".format(
                    best_score, best_fer, best_fvr, best_input_method))
        return best_input_method, best_df, best_fer, best_fvr, best_score


if __name__ == '__main__':
    # Script entry point: exactly two arguments are expected after the script
    # name (the dump file and the output csv); otherwise log the usage line
    # and exit with status 1.
    if len(sys.argv) != 3:
        logger.info(
            'Usage: python process_wiki_dump.py <wikipedia_dump_file_that_ends_with_articles.xml.bz2> <processed_csv_file>'
        )
        sys.exit(1)

    in_f, out_f = sys.argv[1:]
    make_corpus(in_f, out_f)

    # currently, the best input method is "qzxdzuzjfkaw" with fer = 0.0009225985366192045 and fvr = 0.002716891565571517
    # itelex: "fesdowajrwwx" fer = 0.010456098466438603 and fvr = 0.014519511048841487