Code example #1
    def _emb(self):
        if self._embeddings is None:
            self._embeddings = load_facebook_vectors(self._path)
        return self._embeddings
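
Example #1 lazily loads the vectors on first access. A minimal sketch of the same pattern using `functools.cached_property` (Python 3.8+); the class and attribute names here are illustrative, not from the original project:

from functools import cached_property
from gensim.models.fasttext import load_facebook_vectors

class EmbeddingHolder:
    def __init__(self, path):
        self._path = path

    @cached_property
    def embeddings(self):
        # Loaded once on first access, then cached on the instance.
        return load_facebook_vectors(self._path)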
Code example #2
    def __init__(self,
                 filepath,
                 text_field,
                 label_field,
                 embeddings=None,
                 max_text_len=cfg.max_text_len,
                 alphabet=None,
                 noise_level=0,
                 elmo=False):
        assert not elmo, 'ELMo support is deprecated'
        if isinstance(embeddings, str):
            self.embeddings = load_facebook_vectors(embeddings)
        elif isinstance(embeddings, (FastTextKeyedVectors, KeyedVectors)):
            self.embeddings = embeddings
        else:
            raise ValueError('embeddings should be a path to a fastText .bin file '
                             'or a gensim FastTextKeyedVectors object, '
                             f'got {type(embeddings)} instead')

        self._noise_level = noise_level
        self.alphabet = alphabet or cfg.alphabet
        self.text_field = text_field
        self.label_field = label_field
        self.data = pd.read_csv(filepath)
        self.max_text_len = max_text_len
        if self.embeddings is not None:
            self.unk_vec = np.random.rand(self.embeddings.vector_size)
        self.label2int = {
            l: i
            for i, l in enumerate(sorted(self.data[self.label_field].unique()))
        }
        self._data = self._preprocess_df(self.data)
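
A hedged usage sketch for the constructor above. The class name `TextDataset` and the file/column names are placeholders (the snippet does not show the class statement); it illustrates the two accepted inputs: a path triggers a load, while an already-loaded vectors object is reused, avoiding reading the same .bin twice:

from gensim.models.fasttext import load_facebook_vectors

vectors = load_facebook_vectors('embeddings/model.bin')  # placeholder path
train_ds = TextDataset('train.csv', text_field='text', label_field='label',
                       embeddings=vectors)
valid_ds = TextDataset('valid.csv', text_field='text', label_field='label',
                       embeddings=vectors)  # reuses the loaded vectors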
Code example #3
File: __init__.py  Project: 90217/rakun
    def __init__(self, hyperparameters, verbose=True):
        self.distance_method = hyperparameters["distance_method"]
        self.hyperparameters = hyperparameters
        self.verbose = verbose
        self.keyword_graph = None
        self.inverse_lemmatizer_mapping = {}

        if self.distance_method == "fasttext":
            from gensim.models import fasttext
            self.pretrained_embedding_path = hyperparameters[
                'pretrained_embedding_path']
            self.model = fasttext.load_facebook_vectors(
                self.pretrained_embedding_path)

        if self.verbose:
            logging.info("Initiated a keyword detector instance.")

        self.default_visualization_parameters = {
            "top_n": 10,
            "max_node_size": 8,
            "min_node_size": 2,
            "label_font_size": 10,
            "text_color": "red",
            "num_layout_iterations": 50,
            "edge_width": 0.08,
            "alpha_channel": 0.5
        }
Code example #4
    def __init__(self, TH, query):
        self.query = query
        self.TH = TH
        self.cap_path = datapath(
            "/home/ubuntu/seungho/fastText/build/run11_chat_mecab_190824.bin")
        self.model = load_facebook_vectors(self.cap_path)
        self.example = self.model['안녕']
Code example #5
    def __init__(self, embeddings_path: str, verbose: bool = True) -> None:
        super().__init__(verbose=verbose)

        if platform.system() == "Windows":
            self.model = load_facebook_vectors(embeddings_path)
        else:
            self.model = load_fasttext_embeddings(embeddings_path)
Code example #6
    def __init__(self, vectors_path='data/fasttext/wiki.en.bin', cuda=True):
        super().__init__()
        print('Init FastText embedder')
        self.wv = load_facebook_vectors(vectors_path)
        self.word_vec_dim = 300  # wiki.en fastText vectors are 300-dimensional

        self.cuda = cuda
Code example #7
def load_fasttext_model(fasttext_path):
    """
    Load word vectors from a fastText model in Facebook's native .bin format.

    Args:
        fasttext_path: Path to fastText binary file

    Returns:
        FastTextKeyedVectors holding the loaded word vectors.
    """
    import gensim.models.fasttext as ft
    cap_path = datapath(fasttext_path)
    wv = ft.load_facebook_vectors(cap_path)
    return wv
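
Usage sketch for the loader above (the path is a placeholder). Because fastText composes vectors from character n-grams, the returned FastTextKeyedVectors can embed tokens that never occurred in the training corpus:

wv = load_fasttext_model("/path/to/model.bin")
vec = wv["misspelledd"]  # out-of-vocabulary token: the vector is assembled from its n-grams
print(vec.shape)         # (vector_size,)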
Code example #8
    def train_model(self, corpus):
        if self.model is None:
            logging.info(f"Start loading model {self.pretrained_model_path}")
            if self.pretrained_model_path.endswith(".bin"):
                self.model = load_facebook_vectors(self.pretrained_model_path)
            else:
                self.model = FastTextKeyedVectors.load(self.pretrained_model_path)
            self.model.init_sims(True)
            logging.info(f"Finished loading model {self.pretrained_model_path}")
        return self.model
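
Note that `init_sims` is deprecated in gensim 4. A hedged sketch of the modern equivalent, assuming gensim >= 4.0 and a placeholder path:

wv = load_facebook_vectors("/path/to/model.bin")
wv.fill_norms()                    # precompute the L2 norms used by similarity queries
normed = wv.get_normed_vectors()   # matrix of unit-length vectors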
Code example #9
def load(path: str, name: str):
    if name.startswith('cc'):
        # Case: Native fastText embeddings.
        return load_facebook_vectors(path, encoding='latin1')

    if name.endswith('bin'):
        # Case: Models trained specifically for this project.
        model = FastText.load(path)
        # Pre-compute L2-normalized vectors.
        model.init_sims(replace=True)
        return model.wv

    if name.endswith('zip'):
        return load_gensim_model(filepath=path)
Code example #10
    def test_get_fasttext_model(self):
        data = pandas.read_csv(str(TEST_DATA_DIR / "prepared_data.csv.xz"),
                               index_col=0,
                               keep_default_na=False)
        with tempfile.TemporaryDirectory(
                prefix="lookout_typos_fasttext_") as temp_dir:
            config = {
                "size": 100,
                "path": os.path.join(temp_dir, "ft.bin"),
                "dim": 5,
            }
            train_fasttext(data, config)
            wv = load_facebook_vectors(config["path"])
            self.assertTupleEqual(wv["get"].shape, (5,))
Code example #11
def get_embeddings(
    embeddings: str,
    embeddings_format: str = 'glove',
    embeddings_binary: bool = False,
) -> KeyedVectors:
    """
    Get the embeddings model and matrix used in the setup function

    Parameters
    ----------
    embeddings : str
        Path to the pretrained embeddings, or an alias understood by
        gensim-data when `embeddings_format` is 'gensim'
    embeddings_format : str, optional
        The format of the input embeddings, should be one of:
        'glove', 'word2vec', 'fasttext' or 'gensim'. The latter can
        be used to download embeddings hosted on gensim on the fly.
        See https://github.com/RaRe-Technologies/gensim-data
        for the list of available embedding aliases.
    embeddings_binary : bool, optional
        Whether the input embeddings are provided in binary format,
        by default False

    Returns
    -------
    KeyedVectors
        The embeddings object specified by the parameters.
    """
    model = None

    if embeddings_format == 'glove':
        with temporary_file('temp.txt') as temp:
            glove2word2vec(embeddings, temp)
            model = KeyedVectors.load_word2vec_format(temp,
                                                      binary=embeddings_binary)
    elif embeddings_format == 'word2vec':
        model = KeyedVectors.load_word2vec_format(embeddings,
                                                  binary=embeddings_binary)
    elif embeddings_format == 'fasttext':
        model = fasttext.load_facebook_vectors(embeddings)
    elif embeddings_format == 'gensim':
        try:
            model = KeyedVectors.load(embeddings)
        except FileNotFoundError:
            model = api.load(embeddings)
    else:
        raise ValueError(
            "Only formats supported are glove, word2vec, fasttext and gensim")

    return model
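
A usage sketch for `get_embeddings` with the 'gensim' format, which falls back to downloading a gensim-data alias on the fly ('glove-wiki-gigaword-100' is one of the published aliases):

wv = get_embeddings('glove-wiki-gigaword-100', embeddings_format='gensim')
print(wv.vector_size)                       # 100
print(wv.most_similar('computer', topn=3))  # nearest neighbours by cosine similarity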
Code example #12
def load_words_embeddings(filepath, base_file, vocab_file=""):
    # cap_path = datapath(filepath)
    model = load_facebook_vectors(filepath)
    stop_words = stopwords.words('english')

    words = []
    embeddings = []

    if base_file:
        vocab = model.vocab
        if vocab_file != "":
            vocab = list(load_words(vocab_file))
        else:
            print("Initial vocab length: " + str(len(vocab)))
            new_vocab = []
            for word in vocab:
                word = re.sub(r'[^a-z-_]+', '', word.lower()).strip()
                if (word not in new_vocab and word not in stop_words
                        and word != "" and len(word) > 2):
                    new_vocab.append(word)
            vocab = new_vocab
        print("Processed vocab length: " + str(len(vocab)))
        with open('words.txt', 'w') as f:
            for word in vocab:
                words.append(word)
                f.write(word + "\n")
                embeddings.append(model[word].tolist())
    else:
        vocab = [word.strip("\n") for word in load_words('words.txt')]
        for word in vocab:
            words.append(word)
            embeddings.append(model[word].tolist())
    return words, np.array(embeddings)
Code example #13
    def construct(self, vocabulary_file: str, frequencies_file: str, embeddings_file: str,
                  config: Optional[Mapping[str, Any]] = None) -> None:
        """
        Construct correction candidates generator.

        :param vocabulary_file: Text file used to generate vocabulary of correction \
                                candidates. First token in every line split is added \
                                to the vocabulary.
        :param frequencies_file: Path to the text file with frequencies. Each line must \
                                 be two values separated with a whitespace: "token count".
        :param embeddings_file: Path to the dump of FastText model.
        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of context and typo embeddings \
                                         to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens among tokens on \
                                         equal edit distance from the typo to consider as \
                                         candidates (int).
                       max_distance: Maximum edit distance for symspell lookup for candidates \
                                    (int).
                       radius: Maximum edit distance from typo allowed for candidates (int).
                       max_corrected_length: Maximum length of prefix in which symspell lookup \
                                             for typos is conducted (int).
                       start_pool_size: Length of data, starting from which multiprocessing is \
                                        desired (int).
                       chunksize: Max size of a chunk for one process during multiprocessing (int).
                       set_min_freq: True to set the frequency of the unknown tokens to the \
                                     minimum frequency in the vocabulary. It is set to zero \
                                     otherwise.
        """
        self.set_config(config)
        self.checker = SymSpell(max_dictionary_edit_distance=self.config["max_distance"],
                                prefix_length=self.config["max_corrected_length"])
        self.checker.load_dictionary(vocabulary_file)
        self.wv = load_facebook_vectors(embeddings_file)
        self.tokens = set(read_vocabulary(vocabulary_file))
        self.frequencies = read_frequencies(frequencies_file)
        if self.config["set_min_freq"]:
            self.min_freq = min(self.frequencies.values())
Code example #14
def cossim_compare(infile, outfile, visim):
    fields = ['Word1', 'Word2', 'Sim2']
    df = pandas.read_csv(visim,
                         sep='\t',
                         skipinitialspace=True,
                         usecols=fields)
    print('loading model')
    model = load_facebook_vectors(infile)
    print('calculating cosine similarity')
    score_list = []
    with open(outfile, 'w') as result:
        for i in range(df.shape[0]):
            word1 = model[df.iloc[i, 0]]
            word2 = model[df.iloc[i, 1]]
            similarity = df.iloc[i, 2] / 10
            cossim = 1 - distance.cosine(word1, word2)
            result.write(f'{cossim}\t{similarity}\n')
            score_list.append(abs(cossim - similarity))
        # write the summed absolute error as the final line
        result.write(str(sum(score_list)))
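
The per-pair cosine similarity above can also be computed with gensim's built-in helper instead of going through scipy; a minimal sketch with placeholder inputs, assuming both tokens are representable by the model:

model = load_facebook_vectors("/path/to/model.bin")
# Equivalent to 1 - distance.cosine(model['word1'], model['word2']):
sim = model.similarity('word1', 'word2')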
Code example #15
File: iksimilarity.py  Project: mgoldenisc/iknow
    def load_vectors(self, pmodel_name):
        """ Loads the VECTORS of an already trained model. It is much quicker and 
        less cumbersome to use just vectors than to use the model itself, but
        still comes with the various important syntactic/semantic tools.

        If the vectors of the specified model are not found but another model's vectors
        are already loaded, this instance will continue to use the already loaded vectors.

        Parameters
        -----------
        pmodel_name (str) - Name of the model to load vectors from

        Throws
        -----------
        FileNotFoundError - If specified model is not found.
        """
        try:
            if not pmodel_name.endswith('.bin'):
                pmodel_name = pmodel_name + '.bin'
            self.wordvectors = ft.load_facebook_vectors(
                os.path.join(self.__PATH_PREFIX__, pmodel_name))
        except FileNotFoundError as err:
            raise FileNotFoundError("Model with name {} not found.".format(
                pmodel_name[:-4])) from err
Code example #16
def load_wv_with_gensim(pretrained_embedding: str,
                        cache_dir=DEFAULT_CACHE_DIR,
                        verbose: bool = False):
    """
    Loads word embeddings with Gensim.

    :param str pretrained_embedding: name of the pretrained word embeddings to load
    :param cache_dir: the directory for storing cached data
    :param bool verbose: `True` to increase verbosity
    :return: KeyedVectors or FastTextKeyedVectors
    """
    _word_embeddings_available(pretrained_embedding, can_use_subword=True)
    download_model(pretrained_embedding,
                   cache_dir,
                   _process_downloaded_embeddings,
                   verbose=verbose)
    wv_path = os.path.join(cache_dir, pretrained_embedding + ".bin")

    if pretrained_embedding.split(".")[-1] == 'wv':
        return KeyedVectors.load_word2vec_format(wv_path, binary=True)

    elif pretrained_embedding.split(".")[-1] == 'swv':
        from gensim.models.fasttext import load_facebook_vectors
        return load_facebook_vectors(wv_path)
Code example #17
REPEATED = []
W2VCANDIDAT = []
#LM_EM = []

TRAIN_DICTIONARY = {
    'tok_len': TOKEN_LEN,
    'blacklist': BLACKLISTS,
    'repeats': REPEATED,
    'word2vec': W2VCANDIDAT,
    'context': CONTEXT
}
TARGETS = []

ALL_RESULTS = []

MODEL = load_facebook_vectors("./cc.ru.300.bin")


class Word:
    def __init__(self, word):
        self.word = word
        self.token_length = len(word)

    def check_blacklist(self):
        """
        Check whether the word is blacklisted (a number, Latin text or a single symbol).
        :return: 0 if not blacklisted, 1 if blacklisted
        """
        pattern = '[0-9\\.\\:\\-\\/a-z]+'
        if len(self.word) == 1:
            return 1
        # Hypothetical completion: treat tokens made up entirely of digits,
        # punctuation or Latin letters as blacklisted.
        if re.fullmatch(pattern, self.word.lower()):
            return 1
        return 0
Code example #18
    df = pd.read_csv(
        args.input_data,
        delimiter=';',
        names=['idf', 'labels', 'sentences', 'pivot_words', 'src', 'alea'])

    print(df.head(5))

    y_test = df['labels']

    print('\n ** Transform sentences to ' + str(args.ngram_size) +
          ' ngrams... \n')
    ngrams_list = sentences_to_ngrams(df['sentences'], args.ngram_size,
                                      args.fr_nouns_file)
    print(ngrams_list)

    print("\n ** Loading fastText model...\n")
    fasttext_model = fasttext.load_facebook_vectors(args.model_fasttext)

    print('\n ** Vectorisation of inputs... \n')
    x_test = vectorization(args.ngram_size, ngrams_list, args.we_vector_size,
                           fasttext_model)

    np.random.seed(1)

    print('\n ** Loading model ' + args.model_path + ' \n')
    keras_models = ['GRU', 'MLP_PCA', 'MLP_AE']

    if args.algorithm in keras_models:
        clf = load_model(args.model_path)
    else:
        clf = load(args.model_path)
Code example #19
def main():

    parser = argparse.ArgumentParser(description='Generative Evaluation for Visual Dialogue')
    parser.add_argument('--generations', dest='generations', default='./generations.json', help='Path to file with answer generations.')
    parser.add_argument('--references', dest='references', default='densevisdial/refs_S_val.json', help='Path to file with answer reference sets.')

    # overlap (CIDER, METEOR) parameters
    parser.add_argument('--n', dest='n', type=int, default=4, help='Cider n-gram (computes 1 to n).')
    parser.add_argument('--no_overlap', dest='no_overlap', action='store_true', help='Do not compute overlap metrics.')
   
    # embedding distance FastText parameters
    parser.add_argument('--fast_text_model', dest='fast_text_model', required=True, help='Path to FastText .bin model.')
    parser.add_argument('--no_embedding', dest='no_embedding', action='store_true', help='Do not compute embedding metrics.')
    args = parser.parse_args()

    # load answer generations and reference sets
    print('loading generations and references from .json files...')
    with open(args.generations) as f:
        gens = json.load(f)
    with open(args.references) as f:
        refs = json.load(f)

    print('preparing data...')
    generations, references = prepare_data(gens, refs)

    print('# question-answer pairs: ' + str(len(refs)))
   
    # load models
    print('loading models and word embeddings (may take a few minutes)...')
    if not args.no_overlap:
        cider_model = CiderScorer(references, n=args.n)
        meteor_model = Meteor()
    if not args.no_embedding:
        bert_client = BertClient(check_length=False)
        fasttext_wordvectors = FastText.load_facebook_vectors(args.fast_text_model)
        numconverter = inflect.engine()
    print('models loaded!')

    scores = initialise_score_dicts(args)
    print('evaluating generations...')
    for i, (gs, rs) in enumerate(zip(generations, references)):
        sys.stdout.write('\r{}/{} --> {:3.1f}%'.format(str(i+1), str(len(references)), (i+1)/float(len(references))*100))
        sys.stdout.flush()
        
        cider_list, meteor_list = [], []
        bert_list, fasttext_list = [], []
    
        # get bert embeddings of references
        if not args.no_embedding:
            bert_refs = get_bert_features(rs, bert_client)
            fasttext_refs = get_fasttext_features(rs, fasttext_wordvectors, numconverter)
        
        for ii, g in enumerate(gs): # loops through answer generations, if multiple
            
            if g == "": # ignore empty string
                scores['empty'] += 1
            else:

                if not args.no_overlap:
                    cider_list.append(compute_cider(g, rs, cider_model))
                    meteor_list.append(compute_meteor(g, rs, meteor_model))
                if not args.no_embedding:
                    bert_list.append(compute_bert(g, bert_refs, bert_client))
                    fasttext_list.append(compute_fasttext(g, fasttext_refs, fasttext_wordvectors, numconverter))
    
        # average over multiple generations
        if not args.no_overlap:
            n_grams_cider = np.mean(cider_list, axis=0)
            for n, n_gram_cider in enumerate(n_grams_cider):
                scores['cider_{:d}'.format(n+1)].append(n_gram_cider)
            scores['meteor'].append(np.mean(meteor_list))
        if not args.no_embedding:
            bert_scores = np.mean(bert_list, axis=0)
            scores['bert_l2'].append(bert_scores[0])
            scores['bert_cs'].append(bert_scores[1])
            fasttext_scores = np.mean(fasttext_list, axis=0)
            scores['fasttext_l2'].append(fasttext_scores[0])
            scores['fasttext_cs'].append(fasttext_scores[1])
    
    sys.stdout.write('\n')
    print_scores(scores)
    if 'meteor' in scores:
        meteor_model.close()
Code example #20
    def load_fasttext(self, fasttext_path):
        path = datapath(fasttext_path)
        self.word_embedding = load_facebook_vectors(path)
Code example #21
File: train.py  Project: Ds-Kang/KynG
parser.add_argument('--save-history', action='store_true', help='save history')

# parse arguments
args = parser.parse_args()

if args.save_epoch is not None:
    os.makedirs(args.save_epoch, exist_ok=True)

# device setting
on_gpu = args.gpu and torch.cuda.is_available()
device = torch.device('cuda' if on_gpu else 'cpu')

# load embedding
# loading gensim embedding
print(f'Loading embedding from {args.embedding}')
gensim_emb = fasttext.load_facebook_vectors(args.embedding)
emb_dim = gensim_emb.vector_size
n_vocabs = len(gensim_emb.vocab)

# make torch embedding for generator
torch_emb = nn.Embedding(n_vocabs, emb_dim)
torch_emb.weight.data.copy_(torch.tensor(gensim_emb.vectors))
torch_emb.weight.requires_grad = False  # disable update
torch_emb = torch_emb.to(device)

# make torch linear embedding for discriminator
linear_emb = nn.Linear(n_vocabs, emb_dim, bias=False)
linear_emb.weight.data.copy_(torch.tensor(gensim_emb.vectors).t())
# linear_emb.weight.requires_grad = False  # disable update
linear_emb = linear_emb.to(device)
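
A hedged usage sketch for the frozen embedding built above, using the same gensim 3.x `.vocab` access as the snippet; the tokens are placeholders:

token_ids = torch.tensor(
    [gensim_emb.vocab[w].index for w in ['hello', 'world']
     if w in gensim_emb.vocab],
    device=device)
vectors = torch_emb(token_ids)  # shape (n_tokens, emb_dim); no gradients reach the weights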
Code example #22
File: embeddings.py  Project: kenanfa3/ebert
    def _load_bin(self, path):
        from gensim.models.fasttext import load_facebook_vectors
        self.vectors = load_facebook_vectors(path)
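
A minimal end-to-end sketch tying the examples together (the path is a placeholder): load native fastText vectors once, then query them:

from gensim.models.fasttext import load_facebook_vectors

wv = load_facebook_vectors("/path/to/cc.en.300.bin")
print(wv.vector_size)                       # embedding dimensionality, 300 here
print(wv.most_similar('language', topn=5))  # nearest neighbours by cosine similarity
vec = wv['outofvocabularyword']             # OOV tokens handled via character n-grams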