def __init__(self, input_embs_dir, graph_file, ignore_langs=None):
     """Constructor.

     Loads ``<input_embs_dir>/<lang>/wavg.csv`` for every language in
     ``utils.langs`` into ``self.embeddings`` (keys are ``"<lang>:<key>"``),
     derives ``self.name`` from the graph file name, then reads the graph.

     :param input_embs_dir: the folder with pre-computed embeddings
     :param graph_file: the graph file of type graphml
     :param ignore_langs: used when learning embedding from scratch, the
     embeddings for these languages are considered 0; ``None`` or any empty
     collection means no language is ignored
     """
     # A list literal as default would be shared between calls; use None as
     # the "nothing to ignore" marker and normalise it here.
     if ignore_langs is None:
         ignore_langs = []

     # Read pre-computed embeddings
     self.embeddings = {}
     for lang in utils.langs:
         emb_path = os.path.join(input_embs_dir, lang, 'wavg.csv')
         embs, _ = utils.read_embeddings(emb_path, sep=',', binary=False)
         prefix = lang + ':'
         if lang in ignore_langs:
             # Report which language is being blanked out.
             print(lang)
             # Keep every key of the ignored language, but replace its vector
             # with zeros of the same shape.
             self.embeddings.update(
                 {prefix + k: np.zeros(embs[k].shape) for k in embs})
         else:
             self.embeddings.update({prefix + k: embs[k] for k in embs})

     # Name = graph file name up to its first dot, plus the ignored languages
     # (if any). Truthiness instead of `!= []` so that an empty tuple/set is
     # also treated as "nothing ignored".
     graph_stem = os.path.basename(graph_file).split('.')[0]
     if ignore_langs:
         self.name = ''.join(
             [graph_stem, '_unknown_', '_'.join(ignore_langs)])
     else:
         self.name = graph_stem

     # Read graph
     self._read_graph(graph_file)
Example #2
0
 def _read_embs(self, emb_dir):
     """Load the embeddings of every language and merge them into one dict.

     Sets ``self.name`` from ``self.get_name()``, then for each language in
     ``utils.langs`` reads ``<emb_dir>/<lang>/<self.emb_type>.csv`` through
     ``utils.read_embeddings`` (``sep=','``, ``binary=True``).

     :param emb_dir: the folder holding one sub-folder per language
     :return: dict mapping ``"<lang>:<key>"`` to the embedding read for it
     """
     self.name = self.get_name()
     csv_name = self.emb_type + '.csv'
     merged = {}
     for lang in utils.langs:
         csv_path = os.path.join(emb_dir, lang, csv_name)
         lang_embs, _ = utils.read_embeddings(csv_path, sep=',', binary=True)
         # Prefix each key with its language so entries from different
         # languages cannot collide.
         prefix = lang + ':'
         for key in lang_embs:
             merged[prefix + key] = lang_embs[key]
     return merged
 def __init__(self, path):
     """Constructor

     Stores the two values returned by ``utils.read_embeddings(path)`` as
     ``self.embeddings`` and ``self.emb_dim``, then calls
     ``self.estimate_word_freqs()``.

     :param path: fastText embeddings file in text format
     """
     loaded_embs, loaded_dim = utils.read_embeddings(path)
     self.embeddings = loaded_embs
     self.emb_dim = loaded_dim
     self.estimate_word_freqs()
Example #4
0
 def _read_embs(self, emb_file):
     """Read the embeddings stored in a single file.

     Also sets ``self.name`` to ``<self.get_name()>_<stem>``, where the stem
     is the base name of ``emb_file`` up to its first dot.

     :param emb_file: path of the embeddings file, read with ``sep=','`` and
     ``binary=True``
     :return: the embeddings returned by ``utils.read_embeddings``
     """
     # Everything before the first '.' of the base name.
     stem = os.path.basename(emb_file).partition('.')[0]
     name_parts = (self.get_name(), '_', stem)
     self.name = ''.join(name_parts)
     embeddings, _ = utils.read_embeddings(emb_file, sep=',', binary=True)
     return embeddings