def translate_vocab(vocab, lg_from, lg_to):
    """Create/update and pickle a dictionary of translated vocab"""
    # Load the dictionary if it exists; otherwise start fresh
    try:
        d = pickle_rw((lg_from + '_' + lg_to, 0), write=False)
    except (FileNotFoundError, EOFError):
        d = {}
    counter = 0
    # For each word in vocab
    for v in vocab:
        # If the word isn't already in the dictionary, translate it
        if v not in d:
            t = translate_text(v, lg_from, lg_to)
            d[v] = t
            counter += 1
            if counter % 100 == 0:
                print(counter)
            if counter % 10000 == 0:
                # Checkpoint the dictionary periodically
                pickle_rw((lg_from + '_' + lg_to, d))
    # Final pickle once all words are translated
    pickle_rw((lg_from + '_' + lg_to, d))
    print("Complete")
    return
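# Note: pickle_rw is imported from gensim_download and used throughout these
# scripts, but isn't defined in this excerpt. Below is a minimal sketch
# consistent with the call patterns above; the (name, object) tuple interface
# and the ../data/ pickle directory are assumptions, not the repo's actual code.
import pickle


def pickle_rw(*tuples, write=True, path='../data/'):
    """Sketch (assumed behavior): with write=True, dump each (name, obj)
    tuple to path/name.pkl; with write=False, load and return each
    name.pkl (a single object, or a tuple of objects)."""
    results = []
    for name, obj in tuples:
        if write:
            with open(path + name + '.pkl', 'wb') as f:
                pickle.dump(obj, f)
        else:
            with open(path + name + '.pkl', 'rb') as f:
                results.append(pickle.load(f))
    if not write:
        return results[0] if len(results) == 1 else tuple(results)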
def get_polyglot_links():
    """Get polyglot download links from the project web-site and pickle them"""
    address = 'https://sites.google.com/site/rmyeid/projects/polyglot'
    with url.urlopen(address) as f:
        f_str = str(f.read())
    pattern = r'<a href="(http://bit\.ly/\w{7})" rel="nofollow">'
    pattern += r'polyglot-(\w{2}\w?)\.pkl</a>'
    polyglot_links = re.findall(pattern, f_str)
    # The Chinese (zhc) link isn't captured by the pattern; add it by hand
    polyglot_links.append(('http://bit.ly/19bTJYC', 'zhc'))
    pickle_rw(('polyglot_links', polyglot_links))
    return polyglot_links
def embedding_languages_lgs(embedding):
    """Load languages and lgs lists for embedding"""
    if embedding == 'gensim':
        languages, lgs = pickle_rw(('gensim_languages', 0),
                                   ('gensim_lgs', 0), write=False)
    elif embedding == 'polyglot':
        # polyglot doesn't have languages, so use lgs for both
        languages, lgs = pickle_rw(('polyglot_lgs', 0),
                                   ('polyglot_lgs', 0), write=False)
    elif embedding == 'fasttext':
        languages, lgs = pickle_rw(('fasttext_languages', 0),
                                   ('fasttext_lgs', 0), write=False)
    elif embedding == 'zeroshot':
        languages, lgs = pickle_rw(('zeroshot_languages', 0),
                                   ('zeroshot_lgs', 0), write=False)
    else:
        raise ValueError('Unknown embedding: ' + embedding)
    return languages, lgs
if __name__ == '__main__':
    # Use fasttext embedding for translation vocabularies
    embedding = 'fasttext'
    # Load languages and lgs lists for embedding
    languages, lgs = embedding_languages_lgs(embedding)
    # List of all (lg_from, lg_to) combinations
    translations = [(a, b) for a in lgs for b in lgs if a != b]
    translations = [('de', 'en')]  # Temporary override
    # For each combination of lgs
    for translation in translations:
        lg_from, lg_to = translation
        # Load vocab for lg_from
        vocab = pickle_rw((lg_from + '_' + embedding + '_vocab', 0),
                          write=False)
        # Create/Update and Pickle Translation Dictionary
        translate_vocab(vocab, lg_from, lg_to)
from gensim_download import pickle_rw

if __name__ == "__main__":
    # Pickle lgs and languages lists
    zeroshot_lgs = ['en', 'it']
    zeroshot_languages = ['English', 'Italian']
    pickle_rw(('zeroshot_lgs', zeroshot_lgs))
    pickle_rw(('zeroshot_languages', zeroshot_languages))
# embeddings = ['gensim', 'polyglot', 'fasttext', 'zeroshot']
embeddings = ['gensim', 'polyglot']

# For each embedding
for embedding in embeddings:
    # Load languages and lgs lists for embedding
    languages, lgs = embedding_languages_lgs(embedding)
    # Results lists
    norm_EDA_results = []
    pca_EDA_results = []
    # For each language
    for lg in lgs:
        # Load vocab and vectors for lg/embedding
        vocab, vectors = pickle_rw((lg + '_' + embedding + '_vocab', 0),
                                   (lg + '_' + embedding + '_vectors', 0),
                                   write=False)
        # EDA on the norm of the embedding vectors
        norm_EDA_results.append(norm_EDA(vectors, lg, embedding))
        # PCA and isotropy of the embedding vectors
        pca_EDA_results.append(pca_EDA(vectors, lg, embedding))
    # Save norm and pca EDA results
    csv_EDA(lgs, embedding)
    # Create markdown report
    report_EDA(lgs, languages, embedding)
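# The EDA helpers (norm_EDA, pca_EDA, csv_EDA, report_EDA) are defined
# elsewhere in the repo. As an illustration of the kind of check pca_EDA
# performs, here is a hypothetical sketch that measures how isotropic the
# embedding vectors are via PCA explained variance; the function name and
# the exact isotropy measure are assumptions.
import numpy as np
from sklearn.decomposition import PCA


def pca_isotropy_sketch(vectors, n_components=10):
    """Fit PCA on the vectors and compare variance across the top
    components. A min/max ratio near 1 suggests a roughly isotropic
    embedding; a ratio near 0 suggests strong anisotropy."""
    pca = PCA(n_components=n_components)
    pca.fit(np.asarray(vectors))
    ratios = pca.explained_variance_ratio_
    return ratios, ratios.min() / ratios.max()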
from gensim_download import pickle_rw

if __name__ == "__main__":
    # Pickle lgs and languages lists
    fasttext_lgs = ['de', 'en', 'ru', 'zh_yue', 'es', 'fr', 'it', 'ja']
    # fasttext_lgs = ['fr', 'it', 'ja']
    # fasttext_languages = ['French', 'Italian', 'Japanese']
    fasttext_languages = ['German', 'English', 'Russian', 'Chinese',
                          'Spanish', 'French', 'Italian', 'Japanese']
    pickle_rw(('fasttext_lgs', fasttext_lgs))
    pickle_rw(('fasttext_languages', fasttext_languages))
        # If the url file was too big, we retrieved an error file.
        # If we obtained the correct file, it is bytes and the try will fail.
        try:
            with open('../data/polyglot/' + lg + '.pkl') as f:
                f_str = f.read()
            pattern = r'<a href="https://docs\.google\.com/open\?id=(\w{28})">'
            g_id = re.findall(pattern, f_str)[0]
        except (UnicodeDecodeError, IndexError):
            g_id = None
        # If we retrieved the google id from the error file
        if g_id:
            f = drive.CreateFile({'id': g_id})
            f.GetContentFile('../data/polyglot/' + lg + '.pkl')


if __name__ == "__main__":
    # Get polyglot file links from web-site
    polyglot_links = get_polyglot_links()
    # Save polyglot lgs
    polyglot_lgs = [_[1] for _ in polyglot_links]
    pickle_rw(('polyglot_lgs', polyglot_lgs))
    # Set google Auth and instantiate drive
    gauth = googauth()
    drive = GoogleDrive(gauth)
    # Retrieve the files
    polyglot_retrieve(polyglot_links)
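# googauth isn't defined in this excerpt. With PyDrive, which also supplies
# the GoogleDrive class used above, a minimal sketch might look like this
# (the client_secrets.json location is an assumption):
from pydrive.auth import GoogleAuth


def googauth():
    """Sketch: authenticate to Google Drive via PyDrive's OAuth flow.
    Expects client_secrets.json in the working directory."""
    gauth = GoogleAuth()
    gauth.LocalWebserverAuth()  # opens a browser window for consent
    return gauth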
def pick_vocab_vectors(embedding, lg):
    """Load vocab and vectors for the given embedding and language"""
    if embedding == 'gensim':
        vocab, vectors = gensim_vocab_vectors(lg)
    elif embedding == 'polyglot':
        vocab, vectors = polyglot_vocab_vectors(lg)
    elif embedding == 'fasttext':
        vocab, vectors = fasttext_vocab_vectors(lg)
    elif embedding == 'zeroshot':
        vocab, vectors = zeroshot_vocab_vectors(lg)
    else:
        raise ValueError('Unknown embedding: ' + embedding)
    return vocab, vectors


if __name__ == "__main__":
    # List embeddings
    # embeddings = ['gensim', 'polyglot', 'fasttext', 'zeroshot']
    embeddings = ['fasttext']
    # For each embedding
    for embedding in embeddings:
        # Load languages and lgs lists for embedding
        languages, lgs = embedding_languages_lgs(embedding)
        # For each language
        for lg in lgs:
            # Load vocab and vectors for embedding/lg
            vocab, vectors = pick_vocab_vectors(embedding, lg)
            # Pickle the vocab and vector objects
            pickle_rw((lg + '_' + embedding + '_vocab', vocab),
                      (lg + '_' + embedding + '_vectors', vectors))
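# The per-embedding loaders (gensim_vocab_vectors, fasttext_vocab_vectors,
# etc.) live elsewhere in the repo. For the fastText case, a hypothetical
# sketch that parses the standard .vec text format follows; the file path
# and function name are assumptions.
import numpy as np


def fasttext_vocab_vectors_sketch(lg):
    """Sketch: read a fastText .vec file, whose first line holds the word
    count and dimension, into a vocab list and a (count, dim) array."""
    path = '../data/fasttext/wiki.' + lg + '.vec'  # assumed location
    with open(path, encoding='utf-8') as f:
        n, dim = map(int, f.readline().split())
        vocab = []
        vectors = np.zeros((n, dim))
        for i, line in enumerate(f):
            parts = line.rstrip().split(' ')
            vocab.append(parts[0])
            vectors[i] = np.array(parts[1:dim + 1], dtype=float)
    return vocab, vectors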
def translate_word_spark(word, lg_from, lg_to):
    """Translate a single word; called from a Spark map"""
    t = translate_text(word, lg_from, lg_to)
    return (word, t)


if __name__ == '__main__':
    # Instantiate Spark Context
    sc = SparkContext()
    sc.setLogLevel("ERROR")
    # Set embedding and translation languages
    embedding = 'fasttext'
    translation = ('ru', 'en')
    # Load vocab and dictionary
    lg_from, lg_to = translation
    vocab = pickle_rw((lg_from + '_' + embedding + '_vocab', 0), write=False)
    d = pickle_rw((lg_from + '_' + lg_to, 0), write=False)
    # Words not yet translated
    vocab_new = list(set(vocab).difference(set(d.keys())))
    counter = 0
    while len(vocab_new) != 0:
        # Parallelize and translate the next batch of 10,000 words
        vocabRDD = sc.parallelize(vocab_new[:10000])
        translateRDD = vocabRDD.map(
            lambda x: translate_word_spark(x, lg_from, lg_to))
        translated = translateRDD.collect()
        # Add translated words to dictionary
        for k, v in translated:
            d[k] = v
        # Checkpoint progress and recompute the remaining words
        pickle_rw((lg_from + '_' + lg_to, d))
        vocab_new = list(set(vocab).difference(set(d.keys())))
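# translate_text is shared with the sequential script above and isn't defined
# in this excerpt. One plausible implementation uses the Google Cloud
# Translation v2 client; treat this as a sketch, since the repo's actual
# helper may differ. Instantiating the client inside the function, rather
# than capturing it in the Spark closure, avoids trying to serialize a
# non-picklable client object to the workers.
from google.cloud import translate_v2 as translate


def translate_text(text, lg_from, lg_to):
    """Sketch: translate text via the Google Cloud Translation API.
    Requires GOOGLE_APPLICATION_CREDENTIALS to be configured."""
    client = translate.Client()
    result = client.translate(text,
                              source_language=lg_from,
                              target_language=lg_to)
    return result['translatedText']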