Example #1
    def __init__(self,
                 sentences=None,
                 corpus_file_path: str = None,
                 dim: int = 100,
                 saved_model_path: str = None,
                 save_path=None) -> None:
        """Constructor for FastTextEmbedding classes (RAII)"""
        super().__init__()

        if sentences is not None:
            self._impl = FastText(size=dim, sentences=sentences)
        elif corpus_file_path is not None:
            self._impl = FastText(size=dim, corpus_file=corpus_file_path)
        elif saved_model_path is not None:
            # load from saved FastText embedding file
            self._impl = load_facebook_model(saved_model_path)
        else:
            raise AssertionError(
                'one of sentences, corpus_file_path, or saved_model_path must be provided')

        self.vocab = self._impl.wv.index2word
        self.vocab_size = len(self.vocab)
        self.embedding_matrix = []
        for word in self.vocab:
            self.embedding_matrix.append(self._impl.wv[word])
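A minimal usage sketch of the wrapper above (assuming the class is named FastTextEmbedding, as its docstring suggests). The toy corpus repeats each sentence so every token clears gensim's default min_count of 5.

# Hypothetical usage; the corpus and dimension are illustrative only.
toy_sentences = [["hello", "world", "embeddings"]] * 10  # each token appears 10 times
emb = FastTextEmbedding(sentences=toy_sentences, dim=50)
print(emb.vocab_size)                # 3 words in the trained vocabulary
print(len(emb.embedding_matrix[0]))  # 50, one vector of dim 50 per vocabulary word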
Example #2
 def loadWikiModel(self):
     """
     위키 한국어 데이터를 기반으로 FastText 단어 임베딩 모델 학습
     : 기존 학습된 모델이 있는 경우 해당 모델 반환
     : 위키 한국어 데이터(./data/cc.ko.300.bin.gz)가 없는 경우 다운로드
     : 기존 학습된 모델이 없는 경우 학습
     : 학습된 결과를 ./we_model에 저장
     
     - export
     : CONST.wiki_model_path
     """
     model = None
     if not os.path.isfile(CONST.wiki_model_path):
         print('🐈  No trained word-embedding model found.')
         
         if not os.path.isfile(CONST.wiki_data_path):
             print('🐈  Starting download of the data needed to train the word-embedding model.')
             downloadByURL(CONST.wiki_data_url, CONST.wiki_data_path)
         
         print('🐈  Starting word-embedding model training.')
         model = fasttext.load_facebook_model(CONST.wiki_data_path)
         
         print('🐈  Saving the word-embedding model.')
         model.save(CONST.wiki_model_path)
     else:
         model = FastText.load(CONST.wiki_model_path)
     
     # print(f'vocab size : {len(model.wv.vocab)}') # 2,000,000
     return model
    def __init__(
        self,
        data_path='/data2/mrony/submission_soccer/data/soccer/',
        vec_dim=300,
        # fasttext_model='/home/debanjan/acl_submissions/soccerbot_acl/vocab/wiki.simple.bin'):
        fasttext_model='/data2/mrony/submission_soccer/data/wiki.simple.bin'):
        self.data_path = data_path
        # self.max_similarity = 85
        self.vec_dim = vec_dim

        cap_path = datapath(fasttext_model)
        self.word_emb = load_facebook_model(cap_path)
        # print (self.max_er_vec)
        self.stop = set(stopwords.words('english'))
        self.punc = string.punctuation
        self.er_dict, self.global_ent, self.eo_dict, self.objlist = self.get_kg(
            self.data_path + 'KG/')

        self.args = get_args()
        self.train_dataset = self.get_data('train')
        self.val_dataset = self.get_data('val')
        self.test_dataset = self.get_data('test')

        self.entitites = [d['e'] for d in self.train_dataset]
        self.entitites = list(set(self.entitites))
        print("self.entities:   ", self.entitites)
        self.entities = [tmp.lower() for tmp in self.entitites]
        self.global_ent = [unidecode(tmp.lower()) for tmp in self.global_ent]
        print("self.globalent:   ", self.global_ent)
        self.allent = list(set(self.global_ent + self.entitites +
                               self.objlist))
        #  Create the vocab
        self.vocab = defaultdict(float)

        # self.vocab[pos]
        self.get_vocab(self.train_dataset)
        self.get_vocab(self.test_dataset)
        self.get_vocab(self.val_dataset)

        # Add additional tokens to vocab
        self.vocab[self.args.unk_tok] += 1.0
        self.vocab[self.args.sos_tok] += 1.0
        # self.vocab[self.args.ent_tok] += 1.0
        self.vocab[self.args.eou_tok] += 1.0
        self.vocab[self.args.eos_tok] += 1.0
        self.vocab[self.args.no_ent_tok] += 1.0

        self.stoi = dict(zip(self.vocab.keys(), range(1, len(self.vocab) + 1)))
        self.stoi[self.args.pad_tok] = 0

        self.itos = {v: k for k, v in self.stoi.items()}
        print(len(self.stoi))
        self.n_words = len(self.stoi)

        self.vectors = np.zeros((len(self.stoi), vec_dim))
        for w, w2i in self.stoi.items():
            # if w2i < self.stoi[self.args.eos_tok]:
            self.vectors[w2i] = self.get_w2v(w)
        self.ent_dict = dict(zip(self.allent, range(0, len(self.allent))))
        self.ent_dict["<no_ent>"] = len(list(self.ent_dict.keys()))
Example #4
def load_fasttext_model(model_name: str) -> FastText:
    if normpath(model_name).lower().endswith('.bin'):
        fasttext_model = load_facebook_model(datapath(normpath(model_name)))
    else:
        fasttext_model = FastText.load(datapath(normpath(model_name)))
    fasttext_model.init_sims(replace=True)
    return fasttext_model
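A hedged usage sketch of the helper above. Note that gensim's datapath() joins relative names onto gensim's bundled test-data directory, so absolute paths (which os.path.join passes through unchanged) are the safer input; the file names below are placeholders, not models shipped with gensim.

# Facebook-format .bin files go through load_facebook_model, anything else
# through FastText.load, exactly as the branch above decides.
ft_facebook = load_fasttext_model('/models/cc.en.300.bin')    # Facebook .bin dump
ft_native = load_fasttext_model('/models/my_fasttext.model')  # gensim model.save() output
print(ft_facebook.wv.most_similar('king', topn=3))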
Example #5
    def get_embeddings(self,
                       corpus_path: str,
                       model_path: str = 'models',
                       transfer: bool = False,
                       overwrite: bool = False):
        """
        Build FastText embeddings for a given corpus if no embeddings exist yet or existing embeddings are to be
        overwritten. Loads and returns existing embeddings if they can be detected. TODO: implement that last bit.

        :param corpus_path: The path to the text corpus used to generate embeddings.
        :param model_path: The path where the word embeddings are to be stored.
        :param transfer: Whether the new embeddings should be trained "on top" of existing embeddings.
        :param overwrite: If True, train a new model from scratch even when a trained model already exists.
        """
        if overwrite or len(os.listdir("models/transfer_learned")) == 0:
            print("Loading pretrained model...")
            model = load_facebook_model("models\wiki.en.bin")
            model.build_vocab(sentences=self.corpus, update=True)
            print(
                "Successfully loaded pretrained model!\nStart transfer-learning..."
            )
            model.train(sentences=self.corpus,
                        total_examples=len(self.corpus),
                        epochs=5)
            print("Successfully finished transfer learning!")
            model.save("models/transfer_learned/big_model.model")
        else:
            print("Loading word embeddings...")
            model = FastText.load("models/transfer_learned/big_model.model")
            print("Word embeddings loaded!")

        return model
Example #6
 def __init__(self):
     print(
         f"{datetime.now():%Y-%m-%d %H:%M:%S} FastText model loading started"
     )
     self._model = load_facebook_model(_MODEL_PATH)
     print(
         f"{datetime.now():%Y-%m-%d %H:%M:%S} FastText model loading ended")
Example #7
def load_data(base_path, language, drop_columns, unreliable_sampling):
    datasets = {}
    for ds in tqdm(["train_reliable", "train_unreliable", "dev", "test"],
                   file=sys.stdout):
        if ds == "train_unreliable" and unreliable_sampling == 0:
            continue

        df = pd.read_parquet(
            path.join(base_path, f"{language}",
                      f"{ds}.parquet")).drop(drop_columns,
                                             axis=1,
                                             errors="ignore")

        if ds == "train_unreliable" and 0 < unreliable_sampling < 1:
            df = df.groupby(["category"]).apply(lambda cat: cat.sample(
                frac=unreliable_sampling)).reset_index(drop=True)
        elif ds == "train_unreliable" and unreliable_sampling > 1:
            df = df.groupby(["category"]).apply(lambda cat: cat.sample(n=int(
                unreliable_sampling))).reset_index(drop=True)

        if ds == "train_reliable":
            datasets["train"] = df
        elif ds == "train_unreliable":
            datasets["train"] = pd.concat([datasets["train"], df],
                                          ignore_index=True)
        else:
            datasets[ds] = df

    w2v = load_facebook_model(
        path.join(base_path, f"{language}", "fasttext.bin"))

    return datasets, w2v
Example #8
    def __init__(
        self,
        data_path='data/soccer/',
        vec_dim=300,
        # fasttext_model='/home/debanjan/acl_submissions/soccerbot_acl/vocab/wiki.simple.bin'):
        fasttext_model='/data/dchaudhu/soccerbot_acl/vocab/wiki.en.bin'):
        self.data_path = data_path
        self.max_similarity = 85
        self.vec_dim = vec_dim
        cap_path = datapath(fasttext_model)
        self.word_emb = load_facebook_model(cap_path)
        # print (self.max_er_vec)
        self.stop = set(stopwords.words('english'))
        self.punc = string.punctuation
        self.train_dataset = self.get_data('train')
        self.val_dataset = self.get_data('val')
        self.test_dataset = self.get_data('test')
        self.max_er_vec = []  # max er vector combination size
        for dat in self.train_dataset:
            self.max_er_vec.append(sum(len(v) for k, v in dat['kgER'].items()))
        self.max_out_reln = np.max(self.max_er_vec)
        self.inp_graph_max_size = np.max(
            [len(getER_vec(kg['kgER'])) for kg in self.train_dataset])
        print('input graph size:' + str(self.inp_graph_max_size))
        print(self.max_out_reln)
        self.objects = ['o' + str(j) for j in range(self.max_out_reln)]
        self.args = get_args()
        # Create vocabulary and word2id
        self.vocab = defaultdict(float)
        self.get_vocab(self.train_dataset)
        self.get_vocab(self.test_dataset)
        self.get_vocab(self.val_dataset)
        self.vocab[self.args.unk_tok] += 1.0
        self.vocab[self.args.sos_tok] += 1.0
        self.vocab[self.args.eos_tok] += 1.0
        for o in self.objects:
            self.vocab[o] += 1.0

        self.stoi = dict(zip(self.vocab.keys(), range(0, len(self.vocab))))
        # add additional tokens
        # self.stoi[self.args.unk_tok] = len(self.stoi)
        # self.stoi[self.args.sos_tok] = len(self.stoi)
        # self.stoi[self.args.eos_tok] = len(self.stoi)
        # print(len(self.stoi))
        # self.itos = {v: k for k, v in self.stoi.items()}

        # for j in range(self.max_out_reln):
        #     self.stoi['o'+str(j)] = len(self.stoi)+1
        # del self.stoi

        self.itos = {v: k for k, v in self.stoi.items()}
        print(len(self.stoi))
        self.n_words = len(self.stoi)

        self.vectors = np.zeros((len(self.stoi), vec_dim))
        for w, w2i in self.stoi.items():
            if w2i < self.stoi[self.args.eos_tok]:
                self.vectors[w2i] = self.word_emb.wv[w]
Example #9
 def load(self, file_name="SO_fasttext_vectors_200.bin"):
     start = time.time()
     file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                              file_name)
     logging.info(f"Loading model from {file_path}")
     self.model = load_facebook_model(file_path)
     end = time.time()
     logging.info(">> model loaded")
     logging.info(">> %s" % (end - start))
Example #10
def load_bin_model(modelPath):
    print("Loading the model...")
    #path=os.getcwd()+modelPath
    model = load_facebook_model(modelPath)
    #model = FastText.load_fasttext_format(path,full_model=True)
    print("Loading done.")
    print_model_info(modelPath, model)
    
    return model
Example #11
def get_new_model():
    model_names = [
        'word2vec-skipgram', 'word2vec-cbow', 'glove', 'fasttext-skipgram',
        'fasttext-cbow', 'en-glove', 'en-fasttext-skipgram', 'en-fasttext-cbow'
    ]
    model = None
    while model is None:
        chosen_model = input('choose one of ' + str(model_names) + ': ')
        if chosen_model in model_names:
            print("loading model...")
        if chosen_model == 'word2vec-skipgram':
            model = Word2Vec.load('models/word2vec-skipgram/word2vec.model',
                                  mmap='r').wv
        elif chosen_model == 'word2vec-cbow':
            model = Word2Vec.load('models/word2vec-cbow/word2vec.model',
                                  mmap='r').wv
        elif chosen_model == 'glove':
            vectors_file = 'models/simplewiki_glove.txt'
            tmp_file = "models/gensim_glove_vectors.txt"
            glove2word2vec(glove_input_file=vectors_file,
                           word2vec_output_file=tmp_file)
            model = KeyedVectors.load_word2vec_format(tmp_file, binary=False)
        elif chosen_model == 'en-glove':
            vectors_file = 'models/enwiki_glove.txt'
            tmp_file = "models/gensim_glove_vectors.txt"
            glove2word2vec(glove_input_file=vectors_file,
                           word2vec_output_file=tmp_file)
            model = KeyedVectors.load_word2vec_format(tmp_file, binary=False)
        elif chosen_model == 'fasttext-skipgram':
            model = fasttext.load_facebook_model(
                'models/simplewiki_fasttext_skipgram.bin').wv
        elif chosen_model == 'fasttext-cbow':
            model = fasttext.load_facebook_model(
                'models/simplewiki_fasttext_cbow.bin').wv
        elif chosen_model == 'en-fasttext-skipgram':
            model = fasttext.load_facebook_model(
                'models/enwiki_fasttext_skipgram.bin').wv
        elif chosen_model == 'en-fasttext-cbow':
            model = fasttext.load_facebook_model(
                'models/enwiki_fasttext_cbow.bin').wv
        else:
            print('unrecognized model name. choose again')
    return model
Example #12
    def __init__(
        self,
        data_path='data/incar/',
        vec_dim=300,
        # fasttext_model='/home/debanjan/acl_submissions/soccerbot_acl/vocab/wiki.simple.bin'):
        fasttext_model='/home/deep/Emphatic_VW/emotion_classifer-cnn/vectors/wiki.en.bin'
    ):
        self.data_path = data_path
        # self.max_similarity = 85
        self.vec_dim = vec_dim

        cap_path = datapath(fasttext_model)
        self.word_emb = load_facebook_model(cap_path)
        # print (self.max_er_vec)
        self.stop = set(stopwords.words('english'))
        self.punc = string.punctuation
        self.er_dict, self.global_ent, self.eo_dict = self.get_kg(
            self.data_path + 'KG/')

        self.args = get_args()
        self.train_dataset = self.get_data('train')
        self.val_dataset = self.get_data('val')
        self.test_dataset = self.get_data('test')

        self.entitites = [d['e'] for d in self.train_dataset]
        self.entitites = list(set(self.entitites))
        #  Create the vocab
        self.vocab = defaultdict(float)

        # self.vocab[pos]
        self.get_vocab(self.train_dataset)
        self.get_vocab(self.test_dataset)
        self.get_vocab(self.val_dataset)

        # Add additional tokens to vocab
        self.vocab[self.args.unk_tok] += 1.0
        self.vocab[self.args.sos_tok] += 1.0
        # self.vocab[self.args.ent_tok] += 1.0
        if not self.args.use_bert:
            self.vocab[self.args.eou_tok] += 1.0
        self.vocab[self.args.eos_tok] += 1.0

        self.stoi = dict(zip(self.vocab.keys(), range(1, len(self.vocab) + 1)))
        self.stoi[self.args.pad_tok] = 0

        self.itos = {v: k for k, v in self.stoi.items()}
        print(len(self.stoi))
        self.n_words = len(self.stoi)

        self.vectors = np.zeros((len(self.stoi), vec_dim))
        for w, w2i in self.stoi.items():
            # if w2i < self.stoi[self.args.eos_tok]:
            self.vectors[w2i] = self.get_w2v(w)
        self.ent_dict = dict(
            zip(list(self.entitites), range(0, len(self.entitites))))
Example #13
    def __init__(self, index_name, es_text_field, save_model_pth, num_training_epochs, use_analyzed_field=False,
                 init_from_pretrained=False, pretrained_path=None, bsize=128, emb_dim=300, window=5,
                 min_count=1, workers=12, max_vocab_size=None, must_have_fields=None, must_not_have_fields=None):
        assert (not init_from_pretrained and not pretrained_path) or (init_from_pretrained and pretrained_path), \
            "If init_from_pretrained=True, pretrained_path must be provided"
        if must_have_fields is None: must_have_fields = []
        if must_not_have_fields is None: must_not_have_fields = []
        # General parameters
        self.index_name = index_name
        self.es_text_field = es_text_field
        self.bsize = bsize
        self.use_analyzed_field = use_analyzed_field
        self.save_model_pth = save_model_pth
        self.num_training_epochs = num_training_epochs
        self.must_have_fields = must_have_fields + [es_text_field]
        self.must_not_have_fields = must_not_have_fields
        # Training parameters
        self.emb_dim = emb_dim  # May be updated below if using a pretrained model
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.max_vocab_size = max_vocab_size

        # Initialize model
        if init_from_pretrained and pretrained_path:
            try:
                self.model = load_facebook_model(pretrained_path)
            except NotImplementedError:
                self.model = FastText.load(pretrained_path)
            except Exception as e:
                raise RuntimeError("Fasttext model is neither facebook nor gensim model: {}".format(e))

            self.emb_dim = self.model.vector_size
            print('Ignoring arg `emb_dim` since loading a pretrained model. Emb_dim is set to {}'.format(self.emb_dim))
            # Update parameters
            self.model.workers = self.workers
            # Start iterating corpus and building vocab
            print('Updating vocabulary with first pass over corpus')
            self.model.build_vocab(
                sentences=TextFieldIterator(
                    index_name, es_text_field, self.must_have_fields,
                    self.must_not_have_fields, bsize=self.bsize, use_analyzed_field=self.use_analyzed_field
                ), update=True)
        else:
            # More parameters can be exposed when creating a new model;
            # https://radimrehurek.com/gensim/models/fasttext.html#gensim.models.fasttext.FastText
            # Instantiate model
            self.model = FastText(size=emb_dim, window=window, min_count=min_count, workers=workers, max_vocab_size=max_vocab_size)
            # Start iterating corpus and building vocab
            self.model.build_vocab(
                sentences=TextFieldIterator(
                    index_name, es_text_field, self.must_have_fields,
                    self.must_not_have_fields, bsize=self.bsize, use_analyzed_field=self.use_analyzed_field,
                )
            )
Example #14
def prepare_wiki_fasttext():
    tokenized_corpus = [doc for _, doc in text_from_file()]
    t = time.time()
    ft_model = fasttext.load_facebook_model(fasttext_wiki_base_model)
    t2 = time.time()
    print(f"Took {t2-t} to load the model")
    t = time.time()
    ft_model.build_vocab(tokenized_corpus, update=True)
    t2 = time.time()
    print(f"Took {t2-t} to update the corpus the model")
    ft_model.save(fasttext_wiki_our_prepared_model)
Example #15
 def __init__(self, dataset, conv2kg, kgs, fasttext_emb_path):
     logger.info("Initializing Memory Generator ....")
     self.conv2kg = conv2kg
     self.kgs = kgs
     self.mapping = json.load(open("data/" + dataset + "/ERmapping.json"))
     self.maxEntity, self.maxRel = self.read_dataset(dataset)
     logger.info("MaxENT: " + str(self.maxEntity) + " maxREL: " +
                 str(self.maxRel))
     self.matrix_dim = self.maxEntity + self.maxRel
     self.word_emb = load_facebook_model(
         datapath(os.getcwd() + "/" + fasttext_emb_path))
     logger.info("READY: Memory Generator")
Example #16
    def _setup_model(self):

        if self.verbose:
            print('Loading model')

        loaded_model = load_facebook_model(self.default_model)
        self.model = loaded_model.wv

        if self.verbose:
            print('Model loaded')

        self.similarity_index = WordEmbeddingSimilarityIndex(self.model)
        self.model_ready = True
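A hedged sketch of what the WordEmbeddingSimilarityIndex built in _setup_model() is typically used for: a soft-cosine retrieval pipeline via gensim's Dictionary, SparseTermSimilarityMatrix, and SoftCosineSimilarity. The model path and toy documents are placeholders.

from gensim.corpora import Dictionary
from gensim.models.fasttext import load_facebook_model
from gensim.similarities import (SoftCosineSimilarity,
                                 SparseTermSimilarityMatrix,
                                 WordEmbeddingSimilarityIndex)

docs = [["fast", "text", "vectors"], ["word", "embeddings"], ["soft", "cosine"]]
dictionary = Dictionary(docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

wv = load_facebook_model("cc.en.300.bin").wv          # placeholder model path
similarity_index = WordEmbeddingSimilarityIndex(wv)   # as built in _setup_model()
termsim_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)
soft_cosine = SoftCosineSimilarity(bow_corpus, termsim_matrix)
scores = soft_cosine[dictionary.doc2bow(["word", "vectors"])]  # score against each doc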
Example #17
def load_pretrained_vectors(dest_path, file_name="wiki.simple.bin"):
    """ Method that loads fastText vectors. Downloads if it doesn't exist.

    Args:
        file_name(str): Name of the fastText file.
        dest_path(str): Path to the directory where fastText vectors exist or will be
        downloaded.

    Returns:
        gensim.models.fasttext.load_facebook_model: Loaded word2vectors

    """

    file_path = _maybe_download_and_extract(dest_path, file_name)
    model = load_facebook_model(file_path)
    return model
Example #18
 def train_model(self, corpus):
     if self.model is not None:
         return self.model
     logging.info(f"Start fine tuning model {self.pretrained_model_path}")
     self.model = load_facebook_model(self.pretrained_model_path)
     self.model.min_count = 1
     self.model.build_vocab(sentences=corpus.data, update=True)
     self.model.train(
         sentences=corpus.data,
         total_examples=len(corpus.data),
         callbacks=[EpochLogger("Finetuned")],
         epochs=self.epochs)
     self.model = self.model.wv
     self.model.init_sims(True)
     logging.info(f"Finished fine tuning model {self.pretrained_model_path}")
     return self.model
Example #19
def findSimilar2():
    # zh_model = FastText2.load_fasttext_format("wiki.zh.bin")
    zh_model = FastText3.load_facebook_model("wiki.zh.bin")
    # print('程序员' in zh_model.wv.vocab)  # check whether "programmer" is in the vocab
    # words = []
    # for word in zh_model.words:
    # 	words.append(word)

    # print("预训练模型包含单词总数: {}".format(len(words)))

    # print (words[:10])

    find_similar_to = "程序员"
    for similar_word in zh_model.wv.similar_by_word(find_similar_to, topn=5):
        print("Word: {0}, Similarity: {1:.2f}".format(similar_word[0],
                                                      similar_word[1]))
Example #20
def get_embedding_matrix_from_fasttext_model(path,
                                             word2idx,
                                             embed_dim=300,
                                             MAX_NB_WORDS=200000):
    model = load_facebook_model(path)
    nb_words = min(MAX_NB_WORDS, len(word2idx))

    embedding_matrix = np.zeros((nb_words, embed_dim))

    for word, i in word2idx.items():
        if i >= nb_words:
            continue
        embedding_vector = model.wv[word.strip('_')]
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
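A minimal usage sketch with a hypothetical three-word vocabulary and a placeholder model path; the helper returns an index-aligned matrix suitable for initializing an embedding layer.

# word2idx and the model path are illustrative only.
word2idx = {"hello": 0, "world": 1, "fasttext": 2}
matrix = get_embedding_matrix_from_fasttext_model("cc.en.300.bin", word2idx,
                                                  embed_dim=300)
print(matrix.shape)  # (3, 300)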
Example #21
    def update_model(cls,
                     corpus_path,
                     pmodel_name,
                     use_iknow_entities=True,
                     tokenize_concepts=True,
                     installdir=''):
        """ Updates an already existing model by continuing its training
        on a new corpus.

        Parameters
        -----------
        corpus_path (str) - path to the corpus being used to update the model
        
        pmodel_name (str, optional) - The name of the model to be updated, defaults to the
        model currently in use

        Return
        -----------
        True if model was updated, else False

        Throws
        -----------
        FileNotFoundError - if corpus or model not found
        """

        model_path = installdir + IKFastTextModeling.__PATH_PREFIX__

        try:
            if pmodel_name[-4:] != '.bin':
                pmodel_name = pmodel_name + '.bin'
            path = os.path.join(model_path, pmodel_name)
            model = ft.load_facebook_model(path)

            super().update_model(corpus_path, model, use_iknow_entities,
                                 tokenize_concepts)

            # Clear current contents of folders storing model and KeyedVectors files as gensim doesn't do it
            os.remove(path)

            ft.save_facebook_model(model, path=path)

        except FileNotFoundError as err:
            raise FileNotFoundError(
                "Model could not be updated, check specified corpus and model names"
            ) from err
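A hedged usage sketch of the method above, assuming update_model is a classmethod of IKFastTextModeling (as the cls parameter and __PATH_PREFIX__ reference suggest); the corpus and model names are placeholders.

# Continue training an existing .bin model on a new corpus; the '.bin' suffix is
# appended automatically when missing.
IKFastTextModeling.update_model('data/new_corpus.txt', 'my_domain_model',
                                use_iknow_entities=True, tokenize_concepts=True)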
Example #22
def make_fasttext_pretrained(target_corpus, pretrained_model):

    target_corpus_name = target_corpus + '.txt'
    corpus_path = os.path.join(CONFIG.DATA_PATH, "corpus", target_corpus_name)
    fb_path = os.path.join(CONFIG.EMBEDDING_PATH, "facebook", pretrained_model)
    sentences = word2vec.LineSentence(corpus_path)
    embedding_model = load_facebook_model(fb_path)  # load pretrained model
    print("embedding started")
    embedding_model.build_vocab(sentences=sentences, update=True)
    print(embedding_model.epochs)
    print("train started")
    embedding_model.train(sentences=sentences,
                          total_examples=embedding_model.corpus_count,
                          epochs=10)
    print("train completed")
    model_name = "FASTTEXT_" + target_corpus + "_pretrained.model"
    embedding_model.wv.save(os.path.join(CONFIG.EMBEDDING_PATH, model_name))
    print("embedding completed")
Example #23
    def __init__(self):
        """
        When constructed, this class will automatically download pretrained English word embeddings
        from Facebook's FastText and save them for later use.

        WARNING: The pretrained vector file is 6GB, so make sure you download it on reliable WiFi.
        """
        if not path.exists(self.ZIP_FILE) and not path.exists(self.MODEL_FILE):
            print("Downloading model...", file=sys.stderr)
            request.urlretrieve(self.DOWNLOAD_LINK, self.MODEL_FILE)

        if not path.exists(self.MODEL_FILE):
            print("Unzipping model...", file=sys.stderr)
            with zipfile.ZipFile(self.ZIP_FILE, "r") as zip:
                zip.extract(self.MODEL_FILE)

        print("Reading file...", file=sys.stderr)
        self.fb_model = load_facebook_model(
            path.join(os.getcwd(), self.MODEL_FILE))
        self.model = self.fb_model.wv
Example #24
def main():
    random.seed(142)
    np.random.seed(142)

    parser = ArgumentParser()
    parser.add_argument('-s', '--src', dest='source_fasttext_name', type=str, required=True,
                        help='A binary file with a source Facebook-like FastText model (*.bin).')
    parser.add_argument('-d', '--dst', dest='destination_fasttext_name', type=str, required=True,
                        help='A binary file with a destination Facebook-like FastText model (*.bin) after '
                             'its fine-tuning.')
    parser.add_argument('-c', '--cache_dir', dest='cache_dir', type=str, required=True,
                        help='A directory with cached data for training.')
    parser.add_argument('--epochs', dest='max_epochs', type=int, required=False, default=5,
                        help='A number of epochs to train the FastText model.')
    args = parser.parse_args()

    source_fasttext_name = os.path.normpath(args.source_fasttext_name)
    assert os.path.isfile(source_fasttext_name), 'File `{0}` does not exist!'.format(source_fasttext_name)
    destination_fasttext_name = os.path.normpath(args.destination_fasttext_name)
    destination_fasttext_dir = os.path.dirname(destination_fasttext_name)
    assert os.path.isdir(destination_fasttext_dir), 'Directory `{0}` does not exist!'.format(destination_fasttext_dir)
    cache_data_dir = os.path.normpath(args.cache_dir)
    assert os.path.isdir(cache_data_dir), 'Directory `{0}` does not exist!'.format(cache_data_dir)
    ruwordnet_occurrences_file = os.path.join(cache_data_dir, 'ruwordnet_occurrences_in_texts.json')
    submission_occurrences_file = os.path.join(cache_data_dir, 'submission_occurrences_in_texts.json')
    assert os.path.isfile(ruwordnet_occurrences_file), 'File `{0}` does not exist!'.format(ruwordnet_occurrences_file)
    assert os.path.isfile(submission_occurrences_file), 'File `{0}` does not exist!'.format(submission_occurrences_file)

    texts = load_text_corpus(ruwordnet_occurrences_file) + load_text_corpus(submission_occurrences_file)
    print('{0} texts have been loaded...'.format(len(texts)))
    fasttext_model = load_facebook_model(datapath(source_fasttext_name))
    print('The FastText model has been loaded...')
    fasttext_model.workers = max(os.cpu_count(), 1)
    fasttext_model.min_count = 1
    fasttext_model.callbacks = [EpochLogger()] + list(fasttext_model.callbacks)
    fasttext_model.build_vocab(texts, update=True)
    fasttext_model.train(texts, total_examples=len(texts), epochs=args.max_epochs)
    fasttext_model.callbacks = ()
    fasttext_model.save(destination_fasttext_name)
Example #25
 def init__usif(self):
     """
     Train and save uSIF embedding model.
     :return:
     """
     embeddings = []
     i = 0
     # load dump of review analysis sentences
     with open(self.embedding_path, 'r') as f:
         for line in f:
             line = line[:-1]
             line = line.lower()
             line = re.sub(r'\d+', '', line)  # numbers
             line = re.sub(r'\p{P}+', '', line)  # punc
             embeddings.append((line.split(), i))
             i += 1
     print('Total lines {}'.format(i))
     # get fasttext and train usif model on them
     fasttext = load_facebook_model(self.fasttext_path)
     self.usif_model = uSIF(fasttext, workers=16)
     self.usif_model.train(embeddings)
     self.usif_model.save(self.fse_path)
Example #26
users = list(set(ratings[['user']].values.flatten()))

all_items = set(pd.read_csv('../datasets/goodbooks-10k-master/books.csv').index.values.flatten())


items_tags = pd.read_csv('../datasets/books-tags.csv', encoding='latin-1')


if MODE == Technique.WORD_2_VEC:
    wv = api.load('word2vec-google-news-300')
elif MODE == Technique.DOC_2_VEC:
    wv = Doc2Vec.load('../datasets/goodbooks-10k-master/doc2vec/doc2vec.bin')
elif MODE == Technique.TFIDFMODEL:
    wv = api.load("glove-wiki-gigaword-50")
else:
    wv = load_facebook_model('../datasets/goodbooks-10k-master/fasttext/wiki.simple.bin')

items_content, items_vectors = build_items_content()

output_path = OUTPUT_FOLDER + OUTPUT_FILE_NAME + pick_tag() + '.csv'

# write csv doc header
f = open(output_path, 'w')
f.write('user,item,score\n')
f.close()

for user in users:
    print(user)

    user_ratings = ratings.query('user == @user')
    rated_items = set(user_ratings[['item']].values.flatten())
Example #27
 def read_embedding(switch, data_addr):
     if switch == 'FastText':
         fb_model = fasttext.load_facebook_model(path=data_addr)
         return fb_model
Example #28
 def load_fasttext_model(self):
     fasttext = load_facebook_model(
         settings.PATH_TO_FASTTEXT_PT_EMBEDDINGS, encoding="utf-8")
     return fasttext
Example #29
    def __init__(self,
                 data_path='data/incar/',
                 pretrained_weights='bert-base-uncased',
                 use_bert=True,
                 fasttext_model='data/wiki.simple.bin',
                 batch_size=32,
                 max_sent_len=20,
                 vec_dim=300,
                 max_resp_len=15,
                 gpu=False,
                 domain='incar'):
        # fasttext_model='/home/debanjan/acl_submissions/soccerbot_acl/vocab/wiki.simple.bin',
        self.args = get_args()
        self.data_path = data_path
        self.batch_size = batch_size
        self.max_sent_len = max_sent_len
        self.max_out_len = max_resp_len
        self.vec_dim = vec_dim
        self.gpu = gpu
        self.n_graph_features = 1
        cap_path = datapath(fasttext_model)
        self.word_emb = load_facebook_model(cap_path)
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
        # SRC and TRG vocabularies
        self.src_vocab = defaultdict(float)
        self.trg_vocab = defaultdict(float)
        self.trg_vocab[self.args.sos_tok] = 1.0
        self.trg_vocab[self.args.eos_tok] = 1.0
        self.trg_vocab[self.args.unk_tok] = 1.0
        self.src_vocab[self.args.unk_tok] = 1.0
        # Load Datasets and preprocess files
        self.train_dataset = np.load(self.data_path + 'preproc_files_kg/' +
                                     'train.npy',
                                     allow_pickle=True)
        # random.shuffle(self.train_dataset)
        self.val_dataset = np.load(self.data_path + 'preproc_files_kg/' +
                                   'val.npy',
                                   allow_pickle=True)
        self.test_dataset = np.load(self.data_path + 'preproc_files_kg/' +
                                    'test.npy',
                                    allow_pickle=True)
        # Create vocabularies
        self.create_vocab(self.train_dataset)
        self.create_vocab(self.val_dataset)
        self.create_vocab(self.test_dataset)
        self.src_stoi = dict(
            zip(self.src_vocab.keys(), range(0, len(self.src_vocab.keys()))))
        self.src_itos = {v: k for k, v in self.src_stoi.items()}
        self.trg_stoi = dict(
            zip(self.trg_vocab.keys(), range(0, len(self.trg_vocab.keys()))))
        self.trg_itos = {v: k for k, v in self.trg_stoi.items()}

        # self.stoi = np.load(self.data_path+'preproc_files_kg/'+'stoi.npy', allow_pickle=True).item()
        self.etoi = np.load(self.data_path + 'preproc_files_kg/' + 'etoi.npy',
                            allow_pickle=True).item()
        # self.vectors = np.load(self.data_path+'preproc_files_kg/'+'wemb.npy', allow_pickle=True)
        # Remove vectors which are not present in source stoi
        self.src_vectors = np.zeros((len(self.src_stoi), self.vec_dim))
        self.trg_vectors = np.zeros((len(self.trg_stoi), self.vec_dim))
        # for w, i in self.src_stoi.items():
        #     self.src_vectors[i] = self.get_w2v(w)
        # for w, i in self.trg_stoi.items():
        #     self.trg_vectors[i] = self.get_w2v(w)
        # self.itos = {v: k for k, v in self.stoi.items()}
        self.itoe = {v: k for k, v in self.etoi.items()}

        self.er_dict, self.global_ent, self.eo_dict, self.e_o_1hop, self.e_r_l = self.get_kg(
            data_path + 'KG/', dat=domain)

        # Maximum graph input feature
        # self.max_er_vec = []  # max er vector combination size
        # for dat in self.train_dataset:
        #     self.max_er_vec.append(sum(len(v) for k, v in dat['kgER'].items()))
        # self.max_out_reln = np.max(self.max_er_vec)
        # Data Statistics

        # self.n_words = len(self.stoi)
        self.n_train = len(self.train_dataset)
        self.n_val = len(self.val_dataset)
        self.n_test = len(self.test_dataset)
Example #30
existing_f = open("existing_full.txt", "r")
emerging_f = open("emerging_full.txt", "r")

sentences = []
for l in existing_f.readlines():
    sentences.append(l.split(" "))

for l in emerging_f.readlines():
    sentences.append(l.split(" "))
"""
ko_txt = open("ko.txt", "r")
sentences = []
for l in ko_txt.readlines():
    sentences.append(l.split(" "))

model = load_facebook_model("./ko.bin")
print(model.wv.most_similar("립스틱"))
sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'semi-groups']]
model.build_vocab(sentences, update=True)
model.train(sentences=sentences, total_examples=len(sentences), epochs=100)

print(model.wv.most_similar("립스틱"))
model.save("./ko_aihub_100.bin")
"""

#ko_model = gensim.models.Word2Vec.load('./ko.vec')
#model = KeyedVectors.load_word2vec_format(
#            "./ko.bin", binary=True)
#print(model.wv.most_similar("강아지"))

ko_model = Word2Vec(size=200, min_count=54)