def load_from_file(chakin_index, nb_dims, pre_fix, sub_folder, root_folder, save_dir):
    zip_file = os.path.join(root_folder, "{}.zip".format(sub_folder))
    zip_file_alt = pre_fix + zip_file[5:]  # sometimes the archive name is lowercase only
    unzip_folder = os.path.join(root_folder, sub_folder)
    if sub_folder[-1] == "d":
        glove_fname = os.path.join(unzip_folder, "{}.txt".format(sub_folder))
    else:
        glove_fname = os.path.join(unzip_folder, "{}.{}d.txt".format(sub_folder, nb_dims))

    if not os.path.exists(zip_file) and not os.path.exists(unzip_folder):
        print("Downloading embeddings to '{}'".format(zip_file))
        chakin.download(number=chakin_index, save_dir=save_dir)
    else:
        print("Embeddings have already been downloaded.")

    if not os.path.exists(unzip_folder):
        import zipfile
        if not os.path.exists(zip_file) and os.path.exists(zip_file_alt):
            zip_file = zip_file_alt
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            print("Extracting embeddings to '{}'".format(unzip_folder))
            zip_ref.extractall(unzip_folder)
    else:
        print("Embeddings have already been extracted.")

    # Load the word-to-index mapping and the embedding vectors from disk.
    word_to_embedding_dict = dict()
    index_to_embedding = []
    num_rep = 0
    j = 0
    with open(glove_fname, "r") as glove_file:
        for i, line in enumerate(glove_file):
            split = line.split(" ")
            word = split[0]
            representation = np.array([float(val) for val in split[1:]])
            word_to_embedding_dict[word] = i
            index_to_embedding.append(representation)
            if num_rep == 0:
                num_rep = len(representation)
            j = i

    # Reserve one extra all-zeros row for out-of-vocabulary words.
    _word_not_found = np.array([0.0] * num_rep)
    j += 1
    word_to_embedding_dict["UNKNOWN"] = j
    index_to_embedding = np.array(index_to_embedding + [_word_not_found])
    return word_to_embedding_dict, index_to_embedding
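# A minimal usage sketch for load_from_file (not part of the original
# source). It assumes `import os`, `import numpy as np` and `import chakin`
# at module level; the chakin index, prefix, and folder names below are
# illustrative (17 is GloVe.Twitter.25d in chakin's dataset table, and the
# Twitter archive bundles all four dimensions in one zip).
word_to_index, index_to_embedding = load_from_file(
    chakin_index=17,
    nb_dims=25,
    pre_fix="glove",
    sub_folder="glove.twitter.27B",
    root_folder="data",
    save_dir="data",
)
print(index_to_embedding.shape)  # (vocab_size + 1, 25)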
def _prepare_embeddings(self, embedding_name: str, embed_dim: int,
                        embed_zip_folder: str = None, **read_csv_kwargs):
    tablename = self._tablename()
    with TemporaryDirectory() as d:
        if not self.embedding_exists():
            if not embed_zip_folder:
                logging.info(
                    f"Can't find {embedding_name} locally. Started download.")
                # chakin.download() takes a row number, not a name, so
                # resolve the dataset name against chakin's table first.
                datasets = chakin.downloader.load_datasets()
                number = datasets.index[datasets["Name"] == embedding_name].tolist()[0]
                chakin.download(number=number, save_dir=d)
            else:
                d = embed_zip_folder  # use the already-downloaded folder
            reader = self._parse_embeddings(d, embed_dim, **read_csv_kwargs)
            for chunk_df in reader:
                # Split each "word vec..." line into a word column and a
                # vector-string column in memory before writing to SQL.
                df = chunk_df.word_vec.str.split(" ", n=1, expand=True).rename(
                    {0: "word", 1: "vector_str"}, axis=1)
                df.to_sql(tablename, self.db, if_exists="append")
            # Index the word column so lookups by word are fast.
            self.cur.execute(
                f"CREATE UNIQUE INDEX IF NOT EXISTS {tablename}_word_idx "
                f"ON {tablename}(word)")
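# Hypothetical usage sketch: the owning class is not shown in this snippet,
# so EmbeddingStore, self.db (assumed to be a sqlite3/SQLAlchemy connection),
# self.cur, _tablename(), embedding_exists() and _parse_embeddings() are all
# assumptions, as are TemporaryDirectory and logging imports at module level.
#
# store = EmbeddingStore("embeddings.db")            # hypothetical class
# store._prepare_embeddings("GloVe.6B.50d", embed_dim=50, chunksize=100_000)
# row = store.cur.execute(
#     f"SELECT vector_str FROM {store._tablename()} WHERE word = ?", ("king",)
# ).fetchone()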
def embedding_weights_load(words_map, embedding_weights_path):
    pre_trained_embedding = None
    try:
        # Prefer a local fastText binary (.bin) if one is available.
        model = FastText.load_fasttext_format(embedding_weights_path)
        pre_trained_embedding = "bin"
    except Exception:
        print("fastText binary file (.bin) is not found!")
        if os.path.exists("./Word_embedding/wiki.en.vec"):
            print("Using wikipedia(en) pre-trained word vectors.")
        else:
            print("Downloading wikipedia(en) pre-trained word vectors.")
            chakin.download(number=2, save_dir="./Word_embedding")

        print("Loading vectors...")
        if os.path.exists("./Word_embedding_model.pkl"):
            # Reuse the pickled model to skip the slow text-format parse.
            with open("./Word_embedding_model.pkl", mode="rb") as f:
                model = pickle.load(f)
        else:
            model = KeyedVectors.load_word2vec_format(
                './Word_embedding/wiki.en.vec')
            with open("Word_embedding_model.pkl", mode="wb") as f:
                pickle.dump(model, f)
        pre_trained_embedding = "txt"

    vocab_size = len(words_map)
    word_dimension = model['a'].shape[0]
    w = np.zeros((vocab_size, word_dimension), dtype=np.float32)
    for word, word_number in words_map.items():
        try:
            w[word_number][:] = model[word]
        except KeyError:
            # Out-of-vocabulary words get a deterministic random vector.
            if pre_trained_embedding == "bin":
                w[word_number][:] = model.seeded_vector(word)
            else:
                np.random.seed(word_number)
                w[word_number][:] = np.random.uniform(-0.25, 0.25, word_dimension)
    return w
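# Illustrative call (not from the original source). It assumes an older
# gensim (3.x) where FastText.load_fasttext_format and seeded_vector exist;
# words_map is assumed to map each word to a contiguous integer ID from 0,
# and the .bin path is a placeholder, so this may fall back to the
# wikipedia(en) vectors.
import os
import pickle
import numpy as np
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText
import chakin

words_map = {"the": 0, "cat": 1, "sat": 2}
weights = embedding_weights_load(words_map, "./Word_embedding/wiki.en.bin")
print(weights.shape)  # (len(words_map), embedding dimension)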
def download_glove():
    global ZIP_FILE
    if not os.path.exists(ZIP_FILE) and not os.path.exists(UNZIP_FOLDER):
        # GloVe by Stanford is licensed Apache 2.0:
        # https://github.com/stanfordnlp/GloVe/blob/master/LICENSE
        # http://nlp.stanford.edu/data/glove.twitter.27B.zip
        # Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
        print("Downloading embeddings to '{}'".format(ZIP_FILE))
        chakin.download(number=CHAKIN_INDEX, save_dir='./{}'.format(DATA_FOLDER))
    else:
        print("Embeddings already downloaded.")

    if not os.path.exists(UNZIP_FOLDER):
        import zipfile
        if not os.path.exists(ZIP_FILE) and os.path.exists(ZIP_FILE_ALT):
            ZIP_FILE = ZIP_FILE_ALT
        with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
            print("Extracting embeddings to '{}'".format(UNZIP_FOLDER))
            zip_ref.extractall(UNZIP_FOLDER)
    else:
        print("Embeddings already extracted.")
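# download_glove() reads its configuration from module-level globals. A
# minimal sketch of those globals, mirroring the constants used by the other
# snippets in this file; the concrete values here are assumptions.
import os
import chakin

CHAKIN_INDEX = 17                      # GloVe.Twitter.25d in chakin's table
DATA_FOLDER = "data"
SUBFOLDER_NAME = "glove.twitter.27B"
ZIP_FILE = os.path.join(DATA_FOLDER, "{}.zip".format(SUBFOLDER_NAME))
ZIP_FILE_ALT = "glove" + ZIP_FILE[5:]  # sometimes it's lowercase only...
UNZIP_FOLDER = os.path.join(DATA_FOLDER, SUBFOLDER_NAME)

download_glove()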
def embedding_weights_load(words_map, embeddingWeights_path):
    pre_trained_embedding = None
    try:
        # Load the fastText binary (.bin) file if one is available.
        model = FastText.load_fasttext_format(embeddingWeights_path)
        pre_trained_embedding = "bin"
    except Exception:
        # Otherwise fall back to the Wikipedia (en) embeddings.
        print("fastText binary file (.bin) is not found!")
        if os.path.exists("./Word_embedding/wiki.en.vec"):
            print("Using wikipedia(en) pre-trained word vectors.")
        else:
            print("Downloading wikipedia(en) pre-trained word vectors.")
            chakin.download(number=2, save_dir="./Word_embedding")
        print("Loading vectors...")
        model = KeyedVectors.load_word2vec_format(
            './Word_embedding/wiki.en.vec')
        pre_trained_embedding = "txt"

    vocab_size = len(words_map)
    word_dimension = model['a'].shape[0]  # embedding dimensionality
    W = np.zeros((vocab_size, word_dimension), dtype=np.float32)  # matrix holding the embeddings

    # k is the word, v is its word ID; words missing from the model get a
    # random vector instead.
    for k, v in words_map.items():
        word = k
        word_number = v
        try:
            W[word_number][:] = model[word]
        except KeyError:
            if pre_trained_embedding == "bin":
                W[word_number][:] = model.seeded_vector(word)
            else:
                np.random.seed(word_number)
                W[word_number][:] = np.random.uniform(-0.25, 0.25, word_dimension)
    return W
def chakin(self, lang="", number=-1, name=""):
    import chakin
    if lang:
        chakin.search(lang)
    elif number > -1 or name:
        path = self.data_path("external")
        if not os.path.exists(path):
            os.mkdir(path)
        table = chakin.downloader.load_datasets()
        index = number
        if number < 0:
            # Resolve the dataset name to its row number in chakin's table.
            index = table.index[table["Name"] == name].tolist()
            index = index[0]
        _name = table.iloc[index]["Name"].lower()

        # Return a cached copy if the vectors were already downloaded.
        for ext in [".txt", ".vec"]:
            check_path = os.path.join(path, _name) + ext
            if os.path.exists(check_path):
                return check_path

        vec_path = chakin.download(index, path)
        base, ext = os.path.splitext(vec_path)
        _dir = os.path.dirname(vec_path)
        if ext == ".vec":
            # os.rename returns None, so build the new path explicitly.
            renamed_path = os.path.join(_dir, _name + ext)
            os.rename(vec_path, renamed_path)
            vec_path = renamed_path
        elif ext in [".zip", ".gz"]:
            _path = self.expand(vec_path, ext)
            os.remove(vec_path)
            vec_path = _path
        return vec_path
    else:
        raise Exception("You have to specify lang to search or "
                        "number/name to download")
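# Hypothetical usage sketch: the method above belongs to a class providing
# data_path() and expand(), neither of which is shown in this snippet, so
# the owner class name here is an assumption.
#
# tool = DataTool()                             # hypothetical owner class
# tool.chakin(lang="English")                   # print the searchable table
# vec_path = tool.chakin(name="GloVe.6B.50d")   # download by table name
# vec_path = tool.chakin(number=16)             # or by row number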
def downloadGlove(gloveFolderPath, gloveDim=100):
    # The Twitter archive bundles every dimension, so the index only selects
    # which zip to fetch.
    print("Download Glove")
    gloveTwitterIDX = {"25": 17, "50": 18, "100": 19, "200": 20}  # key = dim, value = chakin index
    chakinIDX = gloveTwitterIDX[str(gloveDim)]
    zipFile = chakin.download(number=chakinIDX, save_dir='/tmp/glove')

    print("Unzip Glove")
    unzipedPath = "/tmp/glove_unzipped"
    with zipfile.ZipFile(zipFile, 'r') as zip_ref:
        zip_ref.extractall(unzipedPath)

    # Move the extracted files into the destination folder.
    dest = Path(gloveFolderPath)
    dest.mkdir(parents=True, exist_ok=True)
    destAbsolute = dest.resolve()
    for f in [join(unzipedPath, f) for f in listdir(unzipedPath)]:
        shutil.move(f, destAbsolute)

    # Clean up the temporary download and extraction folders.
    # (Path.unlink only removes files, so use shutil.rmtree instead.)
    shutil.rmtree("/tmp/glove", ignore_errors=True)
    shutil.rmtree(unzipedPath, ignore_errors=True)
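# Usage sketch for downloadGlove (not from the original source); the target
# folder is an arbitrary example, and the imports match what the function
# body relies on.
import shutil
import zipfile
from os import listdir
from os.path import join
from pathlib import Path

import chakin

downloadGlove("./data/glove", gloveDim=100)
# ./data/glove now holds glove.twitter.27B.{25,50,100,200}d.txt, since the
# Twitter archive contains all four dimensions.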
ZIP_FILE_ALT = "glove" + ZIP_FILE[5:]  # sometimes it's lowercase only...
UNZIP_FOLDER = os.path.join(DATA_FOLDER, SUBFOLDER_NAME)
if SUBFOLDER_NAME[-1] == "d":
    GLOVE_FILENAME = os.path.join(UNZIP_FOLDER, "{}.txt".format(SUBFOLDER_NAME))
else:
    GLOVE_FILENAME = os.path.join(
        UNZIP_FOLDER, "{}.{}d.txt".format(SUBFOLDER_NAME, NUMBER_OF_DIMENSIONS))

if not os.path.exists(ZIP_FILE) and not os.path.exists(UNZIP_FOLDER):
    # GloVe by Stanford is licensed Apache 2.0:
    # https://github.com/stanfordnlp/GloVe/blob/master/LICENSE
    # http://nlp.stanford.edu/data/glove.twitter.27B.zip
    # Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
    print("Downloading embeddings to '{}'".format(ZIP_FILE))
    chakin.download(number=CHAKIN_INDEX, save_dir='./{}'.format(DATA_FOLDER))
else:
    print("Embeddings already downloaded.")

if not os.path.exists(UNZIP_FOLDER):
    import zipfile
    if not os.path.exists(ZIP_FILE) and os.path.exists(ZIP_FILE_ALT):
        ZIP_FILE = ZIP_FILE_ALT
    with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
        print("Extracting embeddings to '{}'".format(UNZIP_FOLDER))
        zip_ref.extractall(UNZIP_FOLDER)
else:
    print("Embeddings already extracted.")

print('\nRun complete')
import chakin

# Index 21 is word2vec.GoogleNews in chakin's dataset table.
chakin.search(lang='English')
chakin.download(number=21, save_dir="./")

if __name__ == '__main__':
    pass
from typing import List
import os
from collections import defaultdict

import pandas as pd
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
import chakin

# In[20]:

print("Searching for available packages.")
chakin.search(lang="English")

DOWNLOAD = input("Download embedding? >>> ").upper() == "Y"
if DOWNLOAD:
    emb_idx = int(input("Index of embedding to download >>> "))
    save_dir = input("Directory to save embedding >>> ")
    chakin.download(number=emb_idx, save_dir=save_dir)

# In[21]:

from data_import import load_embedding_from_disks

# In[22]:

# Parameter
# GLOVE_FILENAME = "../data/glove.840B.300d.txt"
GLOVE_FILENAME = "../data/glove.6B.50d.txt"

# In[23]:

df = pd.read_csv("./text_emotion.csv")
df.head()
ZIP_FILE = os.path.join(pre_trained_dir, "{}.zip".format(SUBFOLDER_NAME))
ZIP_FILE_ALT = "glove" + ZIP_FILE[5:]  # sometimes it's lowercase only...
UNZIP_FOLDER = os.path.join(pre_trained_dir, SUBFOLDER_NAME)
if SUBFOLDER_NAME[-1] == "d":
    GLOVE_FILENAME = os.path.join(UNZIP_FOLDER, "{}.txt".format(SUBFOLDER_NAME))
else:
    GLOVE_FILENAME = os.path.join(
        UNZIP_FOLDER, "{}.{}d.txt".format(SUBFOLDER_NAME, NUMBER_OF_DIMENSIONS))

if not os.path.exists(ZIP_FILE) and not os.path.exists(UNZIP_FOLDER):
    # GloVe by Stanford is licensed Apache 2.0:
    # https://github.com/stanfordnlp/GloVe/blob/master/LICENSE
    # http://nlp.stanford.edu/data/glove.twitter.27B.zip
    # Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
    print("Downloading embeddings to '{}'".format(ZIP_FILE))
    chakin.download(number=CHAKIN_INDEX, save_dir='./{}'.format(pre_trained_dir))
else:
    print("Embeddings already downloaded.")

if not os.path.exists(UNZIP_FOLDER):
    import zipfile
    if not os.path.exists(ZIP_FILE) and os.path.exists(ZIP_FILE_ALT):
        ZIP_FILE = ZIP_FILE_ALT
    with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
        print("Extracting embeddings to '{}'".format(UNZIP_FOLDER))
        zip_ref.extractall(UNZIP_FOLDER)
else:
    print("Embeddings already extracted.")

end = datetime.datetime.now()
print('ELMo embeddings from GloVe were generated in {} minutes'.format(
    (end - start).seconds / 60))