def __init__(self, input_embs_dir, graph_file, ignore_langs=None):
    """Constructor.

    :param input_embs_dir: the folder with pre-computed embeddings; one
        ``<lang>/wavg.csv`` file is read for every language in
        ``utils.langs``
    :param graph_file: the graph file of type graphml
    :param ignore_langs: used when learning embedding from scratch, the
        embeddings for these languages are considered 0. ``None`` (the
        default) or an empty collection means no language is ignored.
    """
    # A ``None`` sentinel replaces the former ``[]`` default so that no
    # list object is shared between calls, and so that an explicit
    # ``ignore_langs=None`` is accepted instead of raising.
    if ignore_langs is None:
        ignore_langs = []

    # Read pre-computed embeddings; keys are stored as "<lang>:<key>".
    self.embeddings = {}
    for lang in utils.langs:
        emb_path = os.path.join(input_embs_dir, lang, 'wavg.csv')
        embs, _ = utils.read_embeddings(emb_path, sep=',', binary=False)
        if lang in ignore_langs:
            # Print the language whose vectors are being replaced by zeros.
            print(lang)
            self.embeddings.update(
                {lang + ':' + k: np.zeros(embs[k].shape) for k in embs})
        else:
            self.embeddings.update({lang + ':' + k: embs[k] for k in embs})

    # The instance name is the graph file's base name up to the first dot,
    # extended with the ignored languages when there are any.
    graph_name = os.path.basename(graph_file).split('.')[0]
    if ignore_langs:
        # Truthiness (rather than ``!= []``) keeps empty tuples/sets from
        # yielding a dangling "_unknown_" suffix.
        self.name = ''.join(
            [graph_name, '_unknown_', '_'.join(ignore_langs)])
    else:
        self.name = graph_name

    # Read graph
    self._read_graph(graph_file)
def _read_embs(self, emb_dir):
    """Collect the embeddings of every language into a single dict.

    Sets ``self.name`` from ``self.get_name()``, then for each language in
    ``utils.langs`` reads ``<emb_dir>/<lang>/<self.emb_type>.csv`` through
    ``utils.read_embeddings`` (``sep=','``, ``binary=True``).

    :param emb_dir: the folder holding one sub-folder per language
    :return: dict mapping ``"<lang>:<key>"`` to the corresponding embedding
    """
    self.name = self.get_name()
    merged = {}
    for lang in utils.langs:
        csv_path = os.path.join(emb_dir, lang, self.emb_type + '.csv')
        lang_embs, _ = utils.read_embeddings(csv_path, sep=',', binary=True)
        # Namespace each key with its language so entries from different
        # languages cannot overwrite each other.
        prefix = lang + ':'
        for key in lang_embs:
            merged[prefix + key] = lang_embs[key]
    return merged
def __init__(self, path):
    """Constructor.

    Loads the embeddings and their dimension with
    ``utils.read_embeddings(path)``, stores them on the instance, then
    calls ``self.estimate_word_freqs()``.

    :param path: fastText embeddings file in text format
    """
    loaded_embs, dim = utils.read_embeddings(path)
    self.embeddings = loaded_embs
    self.emb_dim = dim
    self.estimate_word_freqs()
def _read_embs(self, emb_file):
    """Read the embeddings stored in a single file.

    Also sets ``self.name`` to ``"<self.get_name()>_<stem>"``, where the
    stem is the file's base name up to its first dot.

    :param emb_file: path of the embeddings file, read through
        ``utils.read_embeddings`` with ``sep=','`` and ``binary=True``
    :return: the embeddings returned by ``utils.read_embeddings``
    """
    # Everything before the first '.' of the base name.
    stem = os.path.basename(emb_file).split('.', 1)[0]
    self.name = '_'.join((self.get_name(), stem))
    embeddings, _unused = utils.read_embeddings(
        emb_file, sep=',', binary=True)
    return embeddings