Ejemplo n.º 1
0
    def load(self, device: str):
        """
        Build the word-to-word model for this task's language pair.

        A ``Word2word`` lexicon is created from ``self.config.lang`` and
        ``self._tgt`` and handed, together with ``self.config``, to
        ``PororoWord2Word``.

        Args:
            device (str): device information (not referenced in this method)

        Returns:
            object: the ``PororoWord2Word`` wrapper around the lexicon

        """
        # Imported lazily so the dependency is only needed when this is called.
        from word2word import Word2word

        lexicon = Word2word(self.config.lang, self._tgt)
        wrapped = PororoWord2Word(lexicon, self.config)
        return wrapped
Ejemplo n.º 2
0
def main(folder_path, lang):
    """
    Export parallel text from the CSV files in a folder and build a lexicon.

    Every entry of ``folder_path`` whose name ends with ``.csv`` and contains
    ``lang`` is read as a tab-separated table. For each row whose
    ``source_text`` and ``target_text`` are strings and whose ``item_type``
    is not ``'RESPONSE'``, the normalized texts are appended to three files
    inside ``folder_path``: ``<lang>`` (``id|source|target``), ``<lang>.en``
    (``id|source``) and ``<lang>.<code>`` (``id|target``), where ``<code>``
    is ``convert_iso_code(lang)``. Finally ``Word2word.make`` is called with
    the exported prefix and ``folder_path`` as the save directory.

    Args:
        folder_path (str): directory holding the CSV files; the output files
            are written there as well. The process working directory is
            changed to it.
        lang (str): tag matched against the CSV file names and used as the
            prefix of the output files.

    Returns:
        None
    """
    # Resolve once up front: after chdir() a relative path would no longer
    # refer to the same directory.
    folder_path = os.path.abspath(folder_path)
    os.chdir(folder_path)

    # Needed by Word2word.make below even when no CSV file matches.
    target_code = convert_iso_code(lang)

    for file in os.listdir(folder_path):
        if not (file.endswith(".csv") and lang in file):
            continue
        print(file)
        df = pd.read_csv(file, dtype=str, sep='\t')
        # Append mode: rows from several matching CSVs accumulate in the
        # same three output files. The with-block closes all of them.
        with open(lang, 'a') as combined, \
                open(lang + '.en', 'a') as source_out, \
                open(lang + '.' + target_code, 'a') as target_out:
            for row_id, row in df.iterrows():
                if not isinstance(row['source_text'], str):
                    continue
                if not isinstance(row['target_text'], str):
                    continue
                if row['item_type'] == 'RESPONSE':
                    continue
                source = remove_punctuation_and_lower_case(row['source_text'])
                target = remove_punctuation_and_lower_case(row['target_text'])
                combined.write(str(row_id) + '|' + source + '|' + target)
                combined.write("\n")
                # NOTE(review): the split files carry row_id + 1 while the
                # combined file carries row_id; kept as-is to preserve the
                # existing output format - confirm whether this is intended.
                split_id = row_id + 1
                source_out.write(str(split_id) + '|' + source)
                source_out.write("\n")
                target_out.write(str(split_id) + '|' + target)
                target_out.write("\n")

    print(folder_path + "/" + lang + "_dict")
    Word2word.make("en",
                   target_code,
                   folder_path + "/" + lang,
                   savedir=folder_path)
Ejemplo n.º 3
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from word2word import Word2word

dict_path = '/Volumes/Data/dataset/word2word'

# The listing file holds one "<source>-<target>" language pair per line.
# A Word2word object is instantiated for each pair with dict_path as its
# dictionary location; only the first two dash-separated fields are used.
with open('word2word/supporting_languages.txt') as pairs_file:
    for dict_pair in pairs_file.readlines():
        codes = dict_pair.strip('\n').split('-')
        src, tgt = codes[0], codes[1]
        print("getting dictionary %s-%s" % (src, tgt))
        w2w = Word2word(src, tgt, dict_path=dict_path)
Ejemplo n.º 4
0
def main(argv=None):
    """
    Parse the command-line options and build a bilingual lexicon.

    All parsed options are forwarded as keyword arguments to
    ``Word2word.make``.

    Args:
        argv (list[str] | None): argument strings to parse. When ``None``
            (the default) argparse reads them from ``sys.argv[1:]``.

    Raises:
        SystemExit: raised by argparse on ``--help`` or on invalid/missing
            options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--lang1',
                        type=str,
                        required=True,
                        help="ISO 639-1 code of language. "
                        "See `http://opus.nlpl.eu/OpenSubtitles2018.php`")
    parser.add_argument('--lang2',
                        type=str,
                        required=True,
                        help="ISO 639-1 code of language. "
                        "See `http://opus.nlpl.eu/OpenSubtitles2018.php`")
    parser.add_argument('--datapref',
                        type=str,
                        default=None,
                        help="data prefix to a custom parallel corpus. "
                        "builds a bilingual lexicon using OpenSubtitles2018 "
                        "unless this option is provided.")
    parser.add_argument('--n_lines',
                        type=int,
                        default=100000000,
                        help="number of parallel sentences used")
    parser.add_argument(
        '--cutoff',
        type=int,
        default=5000,
        help=
        "number of words that are used in calculating collocates within each language"
    )
    parser.add_argument(
        '--rerank_width',
        default=100,
        type=int,
        help="maximum number of target-side collocates considered for reranking"
    )
    parser.add_argument(
        '--rerank_impl',
        default="multiprocessing",
        type=str,
        help=
        "choice of reranking implementation: simple, multiprocessing (default)"
    )
    parser.add_argument('--cased',
                        dest="cased",
                        action="store_true",
                        help="Keep the case.")
    parser.add_argument('--n_translations',
                        type=int,
                        default=10,
                        help="number of final word2word translations kept")
    parser.add_argument('--save_cooccurrence',
                        dest="save_cooccurrence",
                        action="store_true",
                        help="Save the cooccurrence results")
    parser.add_argument('--save_pmi',
                        dest="save_pmi",
                        action="store_true",
                        help="Save the pmi results")
    # Adjacent string literals are concatenated verbatim, so each fragment
    # must carry its own trailing space.
    parser.add_argument('--savedir',
                        type=str,
                        default=None,
                        help="location to store bilingual lexicons. "
                        "make sure to use this input when loading from "
                        "a custom-built lexicon.")
    parser.add_argument('--num_workers',
                        default=16,
                        type=int,
                        help="number of workers used for multiprocessing")
    args = parser.parse_args(argv)

    # Option names double as the keyword names expected by Word2word.make.
    Word2word.make(**vars(args))
Ejemplo n.º 5
0
from word2word import Word2word

# English -> target lexicons, built in this order (ISO 639-1 codes):
# fr French, es Spanish, it Italian, de German, ta Tamil, te Telugu.
en2fr, en2es, en2it, en2de, en2ta, en2te = [
    Word2word("en", target) for target in ("fr", "es", "it", "de", "ta", "te")
]


def translate(english_word,caseTranslate):
    print("Yaha Translate.py ka ilaka start hota hai")
    list=english_word.split(" ")
    print(list)
    try:
        if(caseTranslate==1):
            for word in list:
                trans=en2fr(word)[0]
                english_word=english_word.replace(word,trans)
            return english_word

        elif(caseTranslate==2):
            for word in list:
                trans=en2es(word)[0]
                english_word=english_word.replace(word,trans)
            return english_word

        elif(caseTranslate==3):
            for word in list:
                trans=en2it(word)[0]
                english_word=english_word.replace(word,trans)
Ejemplo n.º 6
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from word2word import Word2word

dict_path = '/Volumes/Data/dataset/word2word'

# Each lookup result is printed as a single comma-separated line.
# print is called as a function with exactly one argument so the script
# runs unchanged under both Python 2 and Python 3.
en2fr = Word2word("en", "fr", dict_path=dict_path)
# out: ['pomme', 'pommes', 'pommier', 'tartes', 'fleurs']
print(','.join(map(str, en2fr("apple"))))

fr2en = Word2word("fr", "en", dict_path=dict_path)
print(','.join(map(str, fr2en("pomme", n_best=2))))

# out: ['travaillé', 'travaillait']
print(','.join(map(str, en2fr("worked", n_best=2))))

en2zh = Word2word("en", "zh_cn", dict_path=dict_path)
# out: ['老师', '教师', '学生', '导师', '墨盒']
print(','.join(map(str, en2zh("teacher"))))

zh2en = Word2word("zh_cn", "en", dict_path=dict_path)
print(','.join(map(str, zh2en("老师"))))

hi2en = Word2word("hi", "en", dict_path=dict_path)
print(','.join(map(str, hi2en("मिलने"))))