def translate_vocab(vocab, lg_from, lg_to):
    """Pickle dictionary of translated vocab"""
    # Load the dictionary if it exists
    try:
        d = pickle_rw((lg_from + '_' + lg_to, 0), write=False)
    except Exception:  # no existing dictionary; start fresh
        d = {}

    counter = 0
    # For each word in vocab
    for v in vocab:
        # If the word isn't already in the dictionary
        if v not in d:
            t = translate_text(v, lg_from, lg_to)
            d[v] = t

        counter += 1
        if counter % 100 == 0:
            print(counter)
        if counter % 10000 == 0:
            # Pickle dictionary
            pickle_rw((lg_from + '_' + lg_to, d))
    pickle_rw((lg_from + '_' + lg_to, d))
    print("Complete")
    return
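

# translate_text is imported from elsewhere in this project and isn't shown
# in these examples. A minimal stand-in sketch, assuming the third-party
# googletrans package (the project's real helper may wrap a different API):
from googletrans import Translator

_translator = Translator()  # hypothetical stand-in, not in the original code


def translate_text_sketch(text, lg_from, lg_to):
    """Sketch of translate_text: return text translated from lg_from to lg_to."""
    return _translator.translate(text, src=lg_from, dest=lg_to).text
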
def get_polyglot_links():
    """Get polyglot links from web-site and pickle"""
    address = 'https://sites.google.com/site/rmyeid/projects/polyglot'
    with url.urlopen(address) as f:
        f_str = str(f.read())

    pattern = r'<a href="(http://bit\.ly/\w{7})" rel="nofollow">'
    pattern += r'polyglot-(\w{2}\w?)\.pkl</a>'
    polyglot_links = re.findall(pattern, f_str)
    polyglot_links.append(('http://bit.ly/19bTJYC', 'zhc'))

    pickle_rw(('polyglot_links', polyglot_links))
    return polyglot_links
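

# pickle_rw is imported from gensim_download in the later examples; its
# implementation isn't shown. A plausible sketch consistent with how it is
# called above, assuming pickles live under a hypothetical ../data/ directory:
import pickle


def pickle_rw_sketch(*tuples, write=True):
    """Write each (name, obj) tuple to ../data/<name>.pkl, or load and return."""
    results = []
    for name, obj in tuples:
        path = '../data/' + name + '.pkl'  # assumed location
        if write:
            with open(path, 'wb') as f:
                pickle.dump(obj, f)
        else:
            with open(path, 'rb') as f:
                results.append(pickle.load(f))
    if not write:
        return results[0] if len(results) == 1 else tuple(results)
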
Example #3
def embedding_languages_lgs(embedding):
    """Load languages and lgs lists for embedding"""
    if embedding == 'gensim':
        languages, lgs = pickle_rw(('gensim_languages', 0),
                                   ('gensim_lgs', 0), write=False)
    elif embedding == 'polyglot':
        # polyglot doesn't have languages, so use lgs for both
        languages, lgs = pickle_rw(('polyglot_lgs', 0),
                                   ('polyglot_lgs', 0), write=False)
    elif embedding == 'fasttext':
        languages, lgs = pickle_rw(('fasttext_languages', 0),
                                   ('fasttext_lgs', 0), write=False)
    elif embedding == 'zeroshot':
        languages, lgs = pickle_rw(('zeroshot_languages', 0),
                                   ('zeroshot_lgs', 0), write=False)
    else:
        raise ValueError('Unknown embedding: ' + embedding)
    return languages, lgs


if __name__ == '__main__':
    # Use fasttext embedding for translation vocabularies
    embedding = 'fasttext'

    # Load languages and lgs lists for embedding
    languages, lgs = embedding_languages_lgs(embedding)

    # List of all (lg_from, lg_to) combinations
    translations = [(a, b) for a in lgs for b in lgs if a != b]
    translations = [('de', 'en')]  # Temporary override

    # For each combination of lgs
    for translation in translations:
        lg_from, lg_to = translation

        # Load vocab for lg_from
        vocab = pickle_rw((lg_from + '_' + embedding + '_vocab', 0),
                          write=False)

        # Create/Update and Pickle Translation Dictionary
        translate_vocab(vocab, lg_from, lg_to)


from gensim_download import pickle_rw

if __name__ == "__main__":
    # Pickle lgs and languages lists
    zeroshot_lgs = ['en', 'it']
    zeroshot_languages = ['English', 'Italian']
    pickle_rw(('zeroshot_lgs', zeroshot_lgs))
    pickle_rw(('zeroshot_languages', zeroshot_languages))
Example #6
    #embeddings = ['gensim', 'polyglot', 'fasttext', 'zeroshot']
    embeddings = ['gensim', 'polyglot']

    # For each embedding
    for embedding in embeddings:
        # Load languages and lgs lists for embedding
        languages, lgs = embedding_languages_lgs(embedding)

        # Results lists
        norm_EDA_results = []
        pca_EDA_results = []

        # For each language
        for lg in lgs:
            # Load vocab and vectors for lg/embedding
            vocab, vectors = pickle_rw((lg + '_' + embedding + '_vocab', 0),
                                       (lg + '_' + embedding + '_vectors', 0),
                                       write=False)

            # EDA on the norm of the embedding vectors
            norm_EDA_results.append(norm_EDA(vectors, lg, embedding))

            # PCA and isotropy of the embedding vectors
            pca_EDA_results.append(pca_EDA(vectors, lg, embedding))

        # Save norm and pca EDA results
        csv_EDA(lgs, embedding)

        # Create markdown report
        report_EDA(lgs, languages, embedding)
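
# norm_EDA, pca_EDA, csv_EDA and report_EDA are project helpers that aren't
# shown here. Rough sketches of the first two, assuming vectors is a numpy
# array of shape (n_words, dim) and using scikit-learn for the PCA step:
import numpy as np
from sklearn.decomposition import PCA


def norm_EDA_sketch(vectors, lg, embedding):
    """Summary statistics of the L2 norms of the embedding vectors."""
    norms = np.linalg.norm(vectors, axis=1)
    return (lg, embedding, norms.mean(), norms.std())


def pca_EDA_sketch(vectors, lg, embedding):
    """Explained-variance spectrum as a rough isotropy check: the flatter
    the spectrum, the more isotropic the embedding space."""
    pca = PCA(n_components=min(50, vectors.shape[1]))
    pca.fit(vectors)
    return (lg, embedding, pca.explained_variance_ratio_)
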
Example #7
from gensim_download import pickle_rw


if __name__ == "__main__":
    # Pickle lgs and languages lists
    fasttext_lgs = ['de', 'en', 'ru', 'zh_yue', 'es', 'fr', 'it', 'ja']
    #fasttext_lgs = ['fr', 'it', 'ja']
    #fasttext_languages = ['French', 'Italian', 'Japanese']
    fasttext_languages = ['German', 'English', 'Russian', 'Chinese',
                          'Spanish', 'French', 'Italian', 'Japanese']
    pickle_rw(('fasttext_lgs', fasttext_lgs))
    pickle_rw(('fasttext_languages', fasttext_languages))


        # If the url file was too big, we retrieved an error file.
        # If we obtained the correct file, it is bytes and the text-mode
        # read below will fail.
        try:
            with open('../data/polyglot/' + lg + '.pkl') as f:
                f_str = f.read()
            pattern = r'<a href="https://docs\.google\.com/open\?id=(\w{28})">'
            g_id = re.findall(pattern, f_str)[0]
        except Exception:
            g_id = None
            g_id = None

        # If we retrieved the google id from the error file
        if g_id:
            f = drive.CreateFile({'id': g_id})
            f.GetContentFile('../data/polyglot/' + lg + '.pkl')


if __name__ == "__main__":
    # Get polyglot file links from the website
    polyglot_links = get_polyglot_links()

    # Save polyglot lgs
    polyglot_lgs = [lg for _, lg in polyglot_links]
    pickle_rw(('polyglot_lgs', polyglot_lgs))

    # Set google Auth and instantiate drive
    gauth = googauth()
    drive = GoogleDrive(gauth)

    # Retrieve the files
    polyglot_retrieve(polyglot_links)
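
# googauth is a project helper. The GoogleDrive / CreateFile / GetContentFile
# calls above match the PyDrive API, so a plausible sketch:
from pydrive.auth import GoogleAuth


def googauth_sketch():
    """Authenticate against Google Drive and return the auth object."""
    gauth = GoogleAuth()
    gauth.LocalWebserverAuth()  # opens a browser window for OAuth consent
    return gauth
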
Example #9
def pick_vocab_vectors(embedding, lg):
    """Load vocab and vectors for embedding/lg"""
    if embedding == 'gensim':
        vocab, vectors = gensim_vocab_vectors(lg)
    elif embedding == 'polyglot':
        vocab, vectors = polyglot_vocab_vectors(lg)
    elif embedding == 'fasttext':
        vocab, vectors = fasttext_vocab_vectors(lg)
    elif embedding == 'zeroshot':
        vocab, vectors = zeroshot_vocab_vectors(lg)
    else:
        raise ValueError('Unknown embedding: ' + embedding)
    return vocab, vectors


if __name__ == "__main__":
    # List embeddings
    #embeddings = ['gensim', 'polyglot', 'fasttext', 'zeroshot']
    embeddings = ['fasttext']

    # For each embedding
    for embedding in embeddings:
        # Load languages and lgs lists for embedding
        languages, lgs = embedding_languages_lgs(embedding)

        # For each language
        for lg in lgs:
            # Load vocab and vectors for embedding/lg
            vocab, vectors = pick_vocab_vectors(embedding, lg)

            # Pickle the vocab and vector objects
            pickle_rw((lg + '_' + embedding + '_vocab', vocab),
                      (lg + '_' + embedding + '_vectors', vectors))
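
# The gensim_/polyglot_/fasttext_/zeroshot_vocab_vectors loaders are defined
# elsewhere. A sketch of the fastText case, assuming the standard .vec text
# format and a hypothetical ../data/fasttext/wiki.<lg>.vec path:
import numpy as np


def fasttext_vocab_vectors_sketch(lg):
    """Parse a fastText .vec file into a vocab list and a vector matrix."""
    vocab, rows = [], []
    with open('../data/fasttext/wiki.' + lg + '.vec', encoding='utf-8') as f:
        next(f)  # the first line holds "<word_count> <dimension>"
        for line in f:
            parts = line.rstrip().split(' ')
            vocab.append(parts[0])
            rows.append([float(x) for x in parts[1:]])
    return vocab, np.array(rows)
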
Example #10
from pyspark import SparkContext


def translate_word_spark(word, lg_from, lg_to):
    """Translate a single word; returns (word, translation) for a Spark map"""
    t = translate_text(word, lg_from, lg_to)
    return (word, t)


if __name__ == '__main__':
    # Instantiate Spark Context
    sc = SparkContext()
    sc.setLogLevel("ERROR")

    # Set embedding and translation languages
    embedding = 'fasttext'
    translation = ('ru', 'en')

    # Load vocab and dictionary
    lg_from, lg_to = translation
    vocab = pickle_rw((lg_from + '_' + embedding + '_vocab', 0), write=False)
    d = pickle_rw((lg_from + '_' + lg_to, 0), write=False)
    vocab_new = list(set(vocab).difference(set(d.keys())))

    counter = 0
    while vocab_new:
        # Parallelize and translate
        vocabRDD = sc.parallelize(vocab_new[:10000])
        translateRDD = vocabRDD.map(
            lambda x: translate_word_spark(x, lg_from, lg_to))
        translated = translateRDD.collect()

        # Add Translated to dictionary
        for k, v in translated:
            d[k] = v