class WordToVector:
    mode_dict = {0: "50d", 1: "100d", 2: "200d", 3: "300d"}

    def __init__(self,
                 train_new_model=False,
                 filepath=None,
                 mode=0,
                 tokenized_dataset=None,
                 vector_size=100,
                 train_epochs=30):

        self.word_weight_vec = None

        if not train_new_model:
            if filepath is not None:
                self.no_train = True
                with open(
                        filepath + "glove.6B." + self.mode_dict[mode] + ".txt",
                        "r", encoding="utf-8") as model_file:
                    # list() is needed so np.array receives the floats, not a lazy map object
                    self.word_vec_dict = {
                        word_vec_pair.split()[0]:
                        np.array(list(map(float,
                                          word_vec_pair.split()[1:])))
                        for word_vec_pair in model_file
                    }

        else:
            self.no_train = False
            if tokenized_dataset is not None:
                self.to_train_model = FastText(size=vector_size,
                                               window=4,
                                               min_count=2)
                self.to_train_model.build_vocab(sentences=tokenized_dataset)
                self.to_train_model.train(
                    sentences=tokenized_dataset,
                    total_examples=len(tokenized_dataset),
                    epochs=train_epochs)
                self.word_vec_dict = dict(
                    zip(self.to_train_model.wv.index2word,
                        self.to_train_model.wv.syn0))
            else:
                print(
                    "No tokenized_dataset was given. Please ensure it is a "
                    "list of lists of tokens and that the parameter you "
                    "passed is not None.")
        # embedding dimensionality = length of any stored vector
        self.dim = len(next(iter(self.word_vec_dict.values())))

    def trainWithAdditionalData(self, tokenized_dataset_update):
        if not self.no_train:
            self.to_train_model.build_vocab(tokenized_dataset_update,
                                            update=True)
            self.to_train_model.train(
                tokenized_dataset_update,
                total_examples=len(tokenized_dataset_update),
                epochs=self.to_train_model.epochs)
            self.word_vec_dict = dict(
                zip(self.to_train_model.wv.index2word,
                    self.to_train_model.wv.syn0))
            self.dim = len(next(iter(self.word_vec_dict.values())))

    def convertSentenceToVector(self, sentence):
        if self.no_train:
            return np.array([
                np.mean(
                    [
                        self.word_vec_dict[
                            word]  # * self.word_weight_vec[word]
                        for word in sentence if word in self.word_vec_dict
                    ] or [np.zeros(self.dim)],
                    axis=0)
            ])
        else:
            return np.array([
                np.mean([
                    self.to_train_model.wv.get_vector(word)
                    for word in sentence
                ],
                        axis=0)
            ])

    def getWordVector(self, word):
        if self.no_train:
            return self.word_vec_dict[word]
        else:
            return self.to_train_model.wv.get_vector(word)
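
# A minimal usage sketch for the class above, assuming a local GloVe directory and
# the gensim 3.x API (size/index2word/syn0) the snippet relies on; the paths and
# toy corpus below are illustrative, not part of the original code.
glove_vectorizer = WordToVector(filepath="./embeddings/", mode=1)
print(glove_vectorizer.convertSentenceToVector(["hello", "world"]).shape)

toy_corpus = [["the", "cat", "sat", "on", "the", "mat"],
              ["the", "dog", "sat", "on", "the", "rug"]]
trained_vectorizer = WordToVector(train_new_model=True,
                                  tokenized_dataset=toy_corpus,
                                  vector_size=50,
                                  train_epochs=5)
print(trained_vectorizer.getWordVector("the"))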
Example No. 2
class Model:
    def __init__(self, name=None, modelType=None):
        if name and modelType:
            self.name = name
            self.modelType = modelType
            self.model_path = self.getModelPath(name, modelType)

    def create(self,
               data_path,
               modelName='wordEmbedding',
               modelType='word2vec',
               model_path=None):
        ''' Uses gensim to train a word embedding model; either fasttext or word2vec is possible.
        data_path points to a CSV file of newspaper articles whose text is in a column called body.
        '''
        self.name = modelName
        self.modelType = modelType
        if model_path is None:
            model_path = self.getModelPath(self.name, self.modelType)
        self.model_path = model_path
        if self.modelType == 'word2vec':
            self.word_embedding = Word2Vec(min_count=8,
                                           window=5,
                                           workers=4,
                                           size=300,
                                           alpha=0.05,
                                           negative=10,
                                           sg=1)
        if self.modelType == 'fasttext':
            self.word_embedding = FastText(size=300)

        self.collectionInfo = CollectionInfo(data_path)
        collection = Collection(data_path)

        self.word_embedding.build_vocab(collection)
        self.word_embedding.train(
            collection,
            total_examples=self.word_embedding.corpus_count,
            epochs=self.word_embedding.iter)

        self.modelInfo = ModelInfo(self.modelType, self.word_embedding)

    def getModelPath(self, modelName, modelType):
        return './models/' + modelName + '_' + modelType

    def evaluate(self):
        ''' evaluates the semantic concepts a Word2Vec model has learned based on analogies, e.g. sister:brother :: daughter:son, in specific categories (e.g. currencies, verb forms, family, country capitals, etc.) '''
        with open('newsAnalysis/questions-words.txt', 'r') as evaluationFile:
            self.accuracy = self.word_embedding.wv.accuracy(evaluationFile)
        correctAnalogies = [len(result['correct']) for result in self.accuracy]
        totalAnalogies = [
            len(result['correct'] + result['incorrect'])
            for result in self.accuracy
        ]
        for ind in range(len(self.accuracy)):
            self.accuracy[ind]['nr_correct'] = correctAnalogies[ind]
            self.accuracy[ind]['nr_total'] = totalAnalogies[ind]

    def vectors2Bytes(self):
        vectors = self.word_embedding.wv.vectors
        vectors.tofile(self.model_path + '.bytes')

    def to_tsv(self):
        self.vectors2tsv()
        self.vocab2tsv()

    def vectors2tsv(self):
        with open(self.model_path + '.tsv', 'w') as f:
            writer = csv.writer(f, delimiter='\t', lineterminator='\n')
            writer.writerows(self.word_embedding.wv.vectors)

    def vocab2tsv(self):
        with open(self.model_path + '_metadata.tsv', 'w') as f:
            vocab = self.word_embedding.wv.vocab.keys()
            #vocabWithLineSeparator = [word + '\n' for ind,word in enumerate(vocab) if ind<len(vocab)-1]
            vocabWithLineSeparator = [word + '\n' for word in vocab]
            f.writelines(vocabWithLineSeparator)

    def exists(self, model_path=None):
        if hasattr(self, 'model_path'):
            return os.path.exists(self.model_path)
        elif model_path:
            return os.path.exists(model_path)
        else:
            return False

    def load(self, modelName=None, modelType=None, model_path=None):
        if hasattr(self, 'model_path'):
            model_path = self.model_path
        elif modelName and modelType:
            model_path = self.getModelPath(modelName, modelType)
        with open(model_path + '.pkl', 'rb') as input_file:
            self = pickle.load(input_file)
        self.word_embedding = KeyedVectors.load_word2vec_format(
            model_path)  #, mmap='r')
        return self

    def __getstate__(self):
        return (self.modelType, self.name, self.collectionInfo, self.modelInfo,
                self.accuracy)

    def __setstate__(self, state):
        self.modelType, self.name, self.collectionInfo, self.modelInfo, self.accuracy = state

    def save(self):
        with open(self.model_path + '.pkl', 'wb') as output:
            pickle.dump(self, output)
        self.word_embedding.wv.save_word2vec_format(self.model_path)

    def hasWord(self, word):
        return self.word_embedding.wv.vocab.get(word) is not None

    def filterNonVocabWords(self, word_list):
        valid_words = []
        oov = []
        for word in word_list:
            if self.hasWord(word):
                valid_words.append(word)
            else:
                oov.append(word)
        return valid_words, oov

    def getWordCount(self, word):
        if self.hasWord(word):
            return self.word_embedding.wv.vocab.get(word).count
        else:
            raise KeyError('ERROR: WORD not in Model')

    def wordListSimilarity(self, w, listOfWords):
        ''' return the mean cosine similarity of a word and all words in a list '''
        similarities = [
            self.word_embedding.wv.similarity(w, word) for word in listOfWords
        ]
        return np.mean(similarities)

    def mapWordOnAxis(self, word, attributes1, attributes2):
        ''' subtract the word's mean cosine similarity with all attributes in attributes2 from its mean cosine similarity with all attributes in attributes1:
            s(w, A1, A2) = mean[for a1 in A1: cos(w, a1)] - mean[for a2 in A2: cos(w, a2)] '''
        return self.wordListSimilarity(
            word, attributes1) - self.wordListSimilarity(word, attributes2)

    def keywordMapping(self, listOfWords, attributes1, attributes2):
        wordAttributeSimTarget1 = [
            self.wordListSimilarity(word, attributes1) for word in listOfWords
        ]
        wordAttributeSimTarget2 = [
            self.wordListSimilarity(word, attributes2) for word in listOfWords
        ]
        return np.array(wordAttributeSimTarget1) - np.array(
            wordAttributeSimTarget2)

    def plotKeywordMapping(self, values, labels, title='test'):
        plotter = ImagePlotter(True)
        plotter.horizontalBarPlot(values,
                                  labels,
                                  title='Word-Axis Mapping',
                                  x_label='attribute association',
                                  path=title + '.png')

    def WEAT(self, targets1, targets2, attributes1, attributes2):
        wordAttributeSimTarget1 = [
            self.mapWordOnAxis(target, attributes1, attributes2)
            for target in targets1
        ]
        wordAttributeSimTarget2 = [
            self.mapWordOnAxis(target, attributes1, attributes2)
            for target in targets2
        ]
        return np.sum(wordAttributeSimTarget1) - np.sum(
            wordAttributeSimTarget2)

    def generate_analogies(self, w1, w2, restrict_vocab=3500):
        biasObject = BiasWordEmbedding(self.word_embedding)
        biasObject._identify_direction(w1, w2, [w1, w2], method='single')
        return biasObject.generate_analogies(restrict_vocab=restrict_vocab,
                                             unrestricted=True,
                                             n_analogies=10)

    def visualise(self):
        self.vocab2tsv()
        self.vectors2Bytes()

        projector = Projector()
        modelName = '_'.join([self.name, self.modelType])

        shutil.copy(self.model_path + '.bytes',
                    projector.data_path + '/' + modelName + '.bytes')
        shutil.copy(self.model_path + '_metadata.tsv',
                    projector.data_path + '/' + modelName + '_metadata.tsv')

        path = os.path.join(projector.data_path.split('/')[-1], modelName)
        projector.addModelToConfig(self.name, path + '.bytes',
                                   path + '_metadata.tsv',
                                   len(self.word_embedding.wv.vocab),
                                   self.word_embedding.vector_size)
        projector.writeConfigFile()
        projector.run()
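
# A minimal usage sketch for the Model class above, assuming the project's
# Collection/CollectionInfo/ModelInfo helpers are importable and that
# 'articles.csv' has a body column as the create() docstring describes; the
# file name and the WEAT word lists are illustrative only.
news_model = Model()
news_model.create('articles.csv', modelName='newsEmbedding', modelType='word2vec')
news_model.save()
print(news_model.WEAT(['doctor', 'engineer'], ['nurse', 'teacher'],
                      ['he', 'man'], ['she', 'woman']))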
def Updates():
    try:
        print("updating Doc2Vec")
        print(updating)
        a = stem.snowball.ArabicStemmer()
        stopwords_list = stopwords.words('arabic')
        df = pd.read_csv('textc-Copy1.csv', encoding='utf-8')
        df["contenu"].fillna("محتوى فارغ", inplace=True)
        df["article"].fillna("محتوى فارغ", inplace=True)
        y = df['ToF']
        df = df.drop('ToF', axis=1)
        text = []
        for i in range(df.shape[0]):
            x = nltk.tokenize.wordpunct_tokenize(df.contenu[i])
            text1 = [a.stem(word) for word in x]
            text.append(text1)  # append inside the loop so every article is kept
        titre = [
            a.stem(word) for word in df.article if word not in stopwords_list
        ]
        #doc2vec
        docs = []
        analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
        for i, te in enumerate(text):
            tags = [i]
            docs.append(analyzedDocument(te, tags))
        model = doc2vec.Doc2Vec(docs,
                                vector_size=300,
                                window=8,
                                min_count=1,
                                workers=4,
                                dm=1)
        from gensim.test.utils import get_tmpfile
        fname = get_tmpfile("doc2vec.model")
        model.save(fname)
        model = doc2vec.Doc2Vec.load(fname)
        print("updating fastext")

        class MyItera(object):
            def __iter__(self):
                for line in Corpus.article:
                    filtered_sentence = []
                    for w in tokenize(line):
                        if w not in stop_words:
                            filtered_sentence.append(w)
                    yield filtered_sentence

        class MyIter(object):
            def __iter__(self):
                for line in Corpus.contenu:
                    filtered_sentence = []
                    for w in tokenize(line):
                        if w not in stop_words:
                            filtered_sentence.append(w)
                    yield filtered_sentence

        model = FastText(size=150, window=3, min_count=1)
        model.build_vocab(sentences=MyIter())
        total_examples = model.corpus_count
        model.train(sentences=MyIter(),
                    total_examples=total_examples,
                    epochs=5)

    except Exception:
        Updates()
def word2vec(inputFile,
             outputFile,
             size=60,
             window=5,
             min_count=5,
             epoch=5,
             down_sampling=1e-4):
    '''
        Trains a FastText model on the given corpus.
        Parameters:
                 1. inputFile => corpus data input file (one document per line)
                 2. outputFile => output file for the word vectors
                 3. size => embedding dimension
                 4. window => context window size (skip-gram)
                 5. min_count => minimum word count for a word to be considered
                 6. epoch => number of training epochs
                 7. down_sampling => threshold for down-sampling frequent words
    '''
    print('inputFile:' + inputFile)
    print('outputFile:' + outputFile)
    corpus = []
    with open(inputFile, 'r') as fin:
        for blog in fin:
            corpus.append(blog.strip('\n'))

    word_tokenized_corpus = [review.split() for review in corpus]

    try:
        model = FastText(size=size,
                         window=window,
                         min_count=min_count,
                         sg=1,
                         sample=down_sampling,
                         seed=0,
                         workers=1)  # instantiate the fasttext model (skip-gram with down-sampling)
        model.build_vocab(
            sentences=word_tokenized_corpus)  # build the vocabulary
        model.train(sentences=word_tokenized_corpus,
                    total_examples=len(word_tokenized_corpus),
                    epochs=epoch)

        word_vectors = []
        for w in model.wv.vocab:
            try:
                word_vectors.append(model.wv[w])
            except Exception as err:
                print(str(err) + ": " + w)
                continue

        no_of_words = len(model.wv.vocab)
        dimension = size

        with open(outputFile, 'w') as fout:
            fout.write(str(no_of_words) + ' ' + str(dimension) + '\n')
            for i, w in enumerate(model.wv.vocab):
                fout.write(w)
                for feature in word_vectors[i]:
                    fout.write(' ' + str(feature))
                fout.write('\n')

    except Exception as err:
        print(err)
    return None
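
# A minimal usage sketch for the function above; the file names are placeholders
# and 'corpus.txt' is assumed to hold one whitespace-tokenizable document per line.
word2vec('corpus.txt', 'fasttext_vectors.txt',
         size=60, window=5, min_count=5, epoch=5)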
Example No. 5
    return [sentence.split(" ") for sentence in sentences][:-1]


lumea_tokens = load_tokens("tokens_lumea.txt")
lumea_token_count = sum([1 for sentence in lumea_tokens for token in sentence])
blog_tokens = load_tokens("tokens_blog.txt")
blog_token_count = sum([1 for sentence in blog_tokens for token in sentence])

print("Start training FT model wtih blogs.")
model = FastText(blog_tokens, size=300, window=5, min_count=1, workers=4)
model.wv.save("model_ft_blog")

print("Start updating FT model wuth Lumea corpus.")
model.build_vocab(lumea_tokens, update=True)
model.train(lumea_tokens,
            total_examples=model.corpus_count,
            epochs=model.epochs)
model.wv.save("model_ft_expanded")

print("Start training FT model wtih Lumea corpus only.")
model = FastText(lumea_tokens, size=300, window=5, min_count=1, workers=4)
model.wv.save("model_ft_lumea")

print("Start training W2V model wtih blogs.")
model = Word2Vec(blog_tokens, size=300, window=5, min_count=1, workers=4)
model.wv.save("model_w2v_blog")

print("Start updating W2V model wuth Lumea corpus.")
model.build_vocab(lumea_tokens, update=True)
model.train(lumea_tokens,
            total_examples=model.corpus_count,
            epochs=model.epochs)
Example No. 6
    if count != 0:
        vec /= count
    return vec
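
# The averaging helper above is cut off by the snippet; a minimal reconstruction,
# assuming it averages the FastText vectors of a sentence's tokens (the signature
# is taken from the calls further below):
import numpy as np

def get_sentence_vector(text, embedding_size, tokenizer, model):
    vec = np.zeros(embedding_size)
    count = 0
    for token in tokenizer.tokenize(text):
        try:
            vec += model.wv[token]  # FastText can also compose OOV tokens from n-grams
            count += 1
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec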


tokenizer = MosesTokenizer()

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

corpus = [
    tokenizer.tokenize(text) for df in (train, test) for text in df['text']
]
ft_model = FastText(size=embedding_size, window=5, min_count=2, seed=57)
ft_model.build_vocab(sentences=corpus)
ft_model.train(sentences=corpus, total_examples=len(corpus), epochs=10)

labels = list({label for df in (train, test) for label in df['artist']})

X_train = np.array([
    get_sentence_vector(text, embedding_size, tokenizer, ft_model)
    for text in train['text']
])
y_train = np.array([labels.index(label) for label in train['artist']])
X_test = np.array([
    get_sentence_vector(text, embedding_size, tokenizer, ft_model)
    for text in test['text']
])
y_test = np.array([labels.index(label) for label in test['artist']])

X_train, X_valid, y_train, y_valid = train_test_split(X_train,
Example No. 7
class WordEmbedding():
    def __init__(self,
                 embedding_type="w2v",
                 embedding_size=100,
                 ngram=(3, 6),
                 window_size=5,
                 architecture="sg"):
        self.embedding_type = embedding_type
        self.window = window_size
        self.size = embedding_size
        self.model = None
        if architecture == "sg":
            self.skip_gram = True
        else:
            self.skip_gram = False
        if ngram is None:
            ngram = (3, 6)
        self.min_gram = ngram[0]
        self.max_gram = ngram[1]

    def train_embedding(self,
                        sentences,
                        n_iter=100,
                        workers=1,
                        min_count=3,
                        negative_sample=1):
        if self.embedding_type == "w2v":
            train_corpus = sentences
            if self.model is None:
                self.model = Word2Vec(size=self.size,
                                      window=self.window,
                                      min_count=min_count,
                                      negative=negative_sample,
                                      workers=workers,
                                      sg=int(self.skip_gram))
                self.model.build_vocab(train_corpus)
            else:
                self.model.build_vocab(train_corpus, update=True)
        elif self.embedding_type == "ft":
            train_corpus = sentences
            if self.model is None:
                self.model = FastText(sg=int(self.skip_gram),
                                      size=self.size,
                                      window=self.window,
                                      min_count=min_count,
                                      min_n=self.min_gram,
                                      max_n=self.max_gram,
                                      workers=workers,
                                      negative=negative_sample)
                self.model.build_vocab(train_corpus)
            else:
                self.model.build_vocab(train_corpus, update=True)
        elif self.embedding_type == "glove":
            raise ValueError("GloVe training not supported use official repo")
        else:
            raise ValueError("Invalid Embedding Type")
        train_corpus = sentences
        self.model.train(train_corpus,
                         epochs=n_iter,
                         total_examples=self.model.corpus_count)

    def retrieve_vector(self, word):
        try:
            return self.model.wv[word]
        except KeyError:
            return np.random.random(self.size)

    def find_similar_word(self, word, n=10):
        try:
            return self.model.most_similar(positive=[word], topn=n)
        except KeyError:
            return []

    def save_model(self, file_name):
        self.model.save("{}.model".format(file_name))
        we_model_files = glob("{}.model*".format(file_name))
        with ZipFile(file_name, "w") as zipf:
            for we_file in we_model_files:
                zipf.write(we_file)
                os.remove(we_file)

    def load_model(self, file_name):
        try:
            with ZipFile(file_name, "r") as zipf:
                zipf.extractall("/tmp/")
                nl = zipf.namelist()
            fn = [name for name in nl if name.endswith(".model")][0]
            path = "/tmp/" + fn
        except BadZipFile:
            path = file_name

        if self.embedding_type == "w2v":
            self.model = KeyedVectors.load_word2vec_format(path)
        elif self.embedding_type == "ft":
            self.model = FastText.load_fasttext_format(path)
        elif self.embedding_type == "glove":
            """path name: .txt file"""
            try:
                glove_file = datapath(os.path.abspath(path))
                tmp_file = get_tmpfile("/tmp/g2w2v.txt")
                glove2word2vec(glove_file, tmp_file)
                self.model = KeyedVectors.load_word2vec_format(tmp_file)
            except UnicodeDecodeError:
                self.model = KeyedVectors.load(os.path.abspath(path))
        self.size = self.model.wv.vector_size

    def remove_from_vocab(self, word_list):
        new_vectors = []
        new_vocab = {}
        new_index2entity = []
        new_vectors_norm = []
        if self.embedding_type == "ft":
            self.model.wv.init_sims()
            for i in range(len(self.model.wv.vocab)):
                word = self.model.wv.index2entity[i]
                vec = self.model.wv.vectors[i]
                vocab = self.model.wv.vocab[word]
                vec_norm = self.model.wv.vectors_norm[i]
                if word not in word_list:
                    vocab.index = len(new_index2entity)
                    new_index2entity.append(word)
                    new_vocab[word] = vocab
                    new_vectors.append(vec)
                    new_vectors_norm.append(vec_norm)
            self.model.wv.vocab = new_vocab
            self.model.wv.vectors = np.array(new_vectors)
            self.model.wv.index2entity = new_index2entity
            self.model.wv.index2word = new_index2entity
            self.model.wv.vectors_norm = new_vectors_norm
        else:
            self.model.init_sims()
            for i in range(len(self.model.vocab)):
                word = self.model.index2entity[i]
                vec = self.model.vectors[i]
                vocab = self.model.vocab[word]
                vec_norm = self.model.vectors_norm[i]
                if word not in word_list:
                    vocab.index = len(new_index2entity)
                    new_index2entity.append(word)
                    new_vocab[word] = vocab
                    new_vectors.append(vec)
                    new_vectors_norm.append(vec_norm)
            self.model.vocab = new_vocab
            self.model.vectors = np.array(new_vectors)
            self.model.index2entity = new_index2entity
            self.model.index2word = new_index2entity
            self.model.vectors_norm = new_vectors_norm
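
# A minimal usage sketch for the WordEmbedding wrapper above, assuming gensim 3.x
# (the wrapper uses the old size= argument) and a tiny in-memory corpus; the
# sentences and archive name are illustrative only.
embedder = WordEmbedding(embedding_type="ft", embedding_size=50, architecture="sg")
embedder.train_embedding([["good", "morning"], ["good", "night"], ["good", "evening"]],
                         n_iter=10, min_count=1)
print(embedder.retrieve_vector("good"))
embedder.save_model("ft_demo.zip")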
    # ---------------------------------- Word2Vect -----------------------------------------------
    model_w2v = Word2Vec(reviews,
                         size=150,
                         window=10,
                         min_count=2,
                         workers=10,
                         sg=0)  #sg=0 cbow
    model_w2v.train(reviews, total_examples=len(reviews), epochs=12)

    # ---------------------------------- Fasttext -----------------------------------------------
    model_fasttext = FastText(size=170,
                              window=10,
                              min_count=2,
                              workers=10,
                              sg=0)  # instantiate
    model_fasttext.build_vocab(sentences=reviews)
    model_fasttext.train(sentences=reviews,
                         total_examples=len(reviews),
                         epochs=12)  # train

    word_vectors = model_fasttext.wv
    word_vectors_w2v = model_w2v.wv

    outF = open("Dataset/txt files/myOutFile1.txt", "w+")

    # Each attribute is sent to the word2vectfonc function to apply Word2Vec and FastText.
    for i in range(len(attributes)):
        word2vectfonc(attMatrix[i][0], i)

    outF.close()
def main():
    # global encode_length, vector_size

    ## 1. Load the intent dataset
    config = Configs()
    okt = Okt()

    question = preprocess_data(True)
    joinStr = ' '.join(question)

    morphs = okt.morphs(joinStr)
    joinString = ' '.join(morphs)
    pos1 = okt.pos(joinString)
    pos2 = ' '.join(list(map(lambda x: '\n' if x[1] in ['Punctuation'] else x[0], pos1))).split('\n')
    morphs = list(map(lambda x: okt.morphs(x), pos2))


    ## 2. Word embedding
    print("\n### Fasttext build model ###", end="\n")
    word2vec_model = FastText(size=config.vector_size, window=3, workers=8, min_count= 1)
    # word2vec_model = FastText(size=config.vector_size, window=2, workers=8, min_count= 1)
    word2vec_model.build_vocab(morphs)
    print('\n### Fasttext build complete ###', end="\n")

    print('\n### Fasttext train start ###', end="\n")
    word2vec_model.train(morphs, total_examples= word2vec_model.corpus_count, epochs= word2vec_model.epochs, compute_loss=True, verbose=1)
    print('\n### Fasttext train complete ###', end="\n")

    word2vec_model.save(config.fasttext_model_path+"intent_fasttextmodel")
    print('\n### Fasttext model save ###', end="\n")
    
    w2c_index = word2vec_model.wv.index2word # list of words in the trained FastText vocabulary
    print("[DEBUG1-1]############ FastText representation ############", end="\n\n")
    print(w2c_index, end="\n\n\n")
    print('\n\n[DEBUG1-1] number of words in word_index >> ', len(w2c_index)) # <class 'list'>

    ### save intentIndex
    with open(config.fasttext_model_path+'/intentIndex.pickle', 'wb') as f:
        pickle.dump(w2c_index, f, pickle.HIGHEST_PROTOCOL)

    print("_________________________________________________________________________________________________________________\n")



    # # create y_data
    y_data = config.df['intent']
    y_data = y_data.map(config.intent_mapping)
    y_data = to_categorical(y_data)

    
    # create x_data
    # encode_length = 15
    x_data = []
    for q_raw in question:
        q_raw = okt.morphs(q_raw) # split the sentence into morphemes (word-level split). str > list
        q_raw = list(map(lambda x: q_raw[x] if x < len(q_raw) else '#', range(config.encode_length)))
        q_raw = list(map(lambda x: word2vec_model[x] if x in w2c_index else np.zeros(config.vector_size, dtype=float), q_raw))
        q_raw = np.array(q_raw)
        x_data.append(q_raw)
        
    x_data = np.array(x_data)   # (None, 15, 300)
    x_data = x_data.reshape(len(config.df), config.encode_length, config.vector_size, 1)
    print(x_data.shape)

    ## vector numpy array save
    # np.save("fasttext_vector.npy", x_data)
    print("_________________________________________________________________________________________________________________\n")



    ## 3. Build and train the model
    print("shape >>", x_data.shape, y_data.shape)   # (None, 15 ,300, 1) / (None, 5)

    model = Sequential()
    model.add(Conv2D(12, kernel_size=(2,2), input_shape=(config.encode_length, config.vector_size, 1), strides=(1,1), padding="valid", activation="relu"))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
    model.add(Conv2D(12, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu"))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
    model.add(Conv2D(12, kernel_size=(4,4), strides=(1,1), padding="valid", activation="relu"))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
    
    model.add(Conv2D(12, kernel_size=(2,2), strides=(1,1), padding="valid", activation="relu"))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
    model.add(Conv2D(12, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu"))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
    model.add(Conv2D(12, kernel_size=(4,4), strides=(1,1), padding="valid", activation="relu"))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))

    model.add(Conv2D(12, kernel_size=(2,2), strides=(1,1), padding="valid", activation="relu"))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
    model.add(Conv2D(12, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu", data_format='channels_first'))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
    model.add(Conv2D(12, kernel_size=(4,4), strides=(1,1), padding="valid", activation="relu"))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))

    model.add(Conv2D(12, kernel_size=(2,2), strides=(1,1), padding="valid", activation="relu"))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
    model.add(Conv2D(12, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu", data_format='channels_first'))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))
    model.add(Conv2D(12, kernel_size=(4,4), strides=(1,1), padding="valid", activation="relu"))
    model.add(MaxPooling2D(pool_size=(1,1), strides=(1,1)))

    model.add(Flatten())
    model.add(BatchNormalization())
    # model.add(Dropout(1.0))
    model.add(Dense(128, activation="relu"))
    # model.add(Dropout(0.1))
    model.add(Dense(5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # stop = EarlyStopping(monitor="loss", patience=20, mode="auto")


    model.summary()
    
    model.fit(x_data, y_data, batch_size=64, epochs=500)
    # model.fit(x_data, y_data, batch_size=64, epochs=500, callbacks=[stop])
    print("_________________________________________________________________________________________________________________")
    loss, acc = model.evaluate(x_data, y_data)
    print("loss >> ", loss)
    print("acc >>", acc, end="\n")



    ## 4. Save the model
    path = config.intent_model_path
    file_list = os.listdir(path)

    new_num = 0
    if os.path.exists(path):    # if saved model files already exist
        for i in file_list:
            num = int(i.split(".")[0].split("-")[-1])

            if new_num <= num:
                new_num = num + 100
            else:
                pass

        
        model_name = "intent_model-"+str(new_num)+".h5"
        weights_name = "intent_weights-"+str(new_num)+".h5"
        print("\n\nFile name >>",model_name)
        model.save(path+model_name)
        model.save_weights(path+weights_name)
            
    else:
        model.save(path+"intent_model-100.h5")
        model.save_weights(path+"intent_weights-100.h5")

    print("\n#### MODEL SAVE ####", end='\n')
Example No. 10
        pbar.update(1)
del review_unclean

# FastText Vector
vector_size = 256
window = 5

fasttext_model = 'fasttext.model'
print('Generating FastText Vectors ..')

start = time.time()
model = FastText(size=vector_size, window=window, min_count=1, workers=4)
model.build_vocab(review)
model.train(review,
            total_examples=model.corpus_count,
            epochs=model.epochs)
print('FastText Created in {} seconds.'.format(time.time() - start))
model.save(fasttext_model)
print('FastText Model saved at {}'.format(fasttext_model))
del model

model = FastText.load(fasttext_model)
x_vectors = model.wv
del model

# Dataset Partition

# Splitting review1 and the labels into (x_train, y_train) and (x_test, y_test), with 90% of all the tweets for training and 10% for testing.
# Maximum number of tokens allowed for each review is set to be 15.
Example No. 11
  def train(self, epochs=30):
    """
    Train with your own data.
    Supports a single corpus, multiple corpora, or a dataframe column.
    Parameters:
    -----------
    model_name (optional): preferred model name
    epochs : int : total epochs for training

    Example
    --------
    >>> from ekushey.feature_extraction import BN_FastText

    #Training Against Sentences
    >>> ft = BN_FastText(sentences=[['আমার', 'প্রিয়', 'জন্মভূমি'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'] ])
    >>> ft.train()

    #Training Against one Text Corpus
    >>> ft = BN_FastText(corpus_file="path_to_corpus.txt")
    >>> ft.train()

    #Training Against Multiple Corpuses
    path
      ->corpus
        ->1.txt
        ->2.txt
        ->3.txt

    >>> ft = BN_FastText(corpus_path="path/corpus")
    >>> ft.train(epochs=25)

    #Training Against a Dataframe Column

    >>> ft = BN_FastText(df= news_data['text_content'])
    >>> ft.train(epochs=25)

"""
    if not(self.sentences) and  not(self.corpus_file) and not(self.corpus_path) and self.df is None:
      raise Exception('Data is not given')
    elif self.sentences:
      data = self.sentences
      #print("got sentence")
    elif self.corpus_file:
      #print("got sentence")
      data = PathLineSentences(self.corpus_file)
    elif self.corpus_path:
      #print("got sentence")
      data = PathLineSentences(self.corpus_path)
    elif self.df is not None:
      #print("Dataframe got")
      data = '\n'.join(self.df)
      data = data.split('\n')
      data = [sent.split() for sent in data]
    else:
      print("Unexpected error occured: Please check your data file again.")
    
    
    cpu_cores = multiprocessing.cpu_count()
    ft_model = FastText(
                        size=self.size,
                        alpha=self.alpha,
                        window=self.window,
                        min_count=self.min_count,
                        max_vocab_size=self.max_vocab_size,
                        sample=self.sample,
                        workers=self.workers,
                        min_alpha=self.min_alpha,
                        sg=self.sg,
                        negative=self.negative
                       )

 
    print("Working with "+str(self.workers)+" worker threads")
    ft_model.build_vocab(data,  progress_per=10000)
    print("Vocabulary build Successfully")
    t=time()
    ft_model.train(data, total_examples=ft_model.corpus_count, epochs=epochs, report_delay=1)
    print('Training took : {} mins'.format(round((time() - t) / 60, 2)))
    ft_model.save(self.model_name)
    print(ft_model)
Example No. 12
            if i % 10000 == 0:
                print(str(i) + " samples")
            yield words


for ep in range(epoch):
    step = 0
    for i in range(0, data_size, batch_size):
        step += 1
        if i == 0 and ep == 0:
            ## Initialize and train a FastText model ###
            fast_model = FastText(size=feature_size,
                                  window=window_context,
                                  min_count=min_word_count,
                                  workers=multiprocessing.cpu_count())
            tokenized_corpus = list(
                norm_doc_tokenizer("../data/full_dataset.txt", i, batch_size))
            fast_model.build_vocab(tokenized_corpus)
            fast_model.train(tokenized_corpus,
                             total_examples=batch_size,
                             epochs=fast_model.epochs)
        else:
            tokenized_corpus = list(
                norm_doc_tokenizer("../data/full_dataset.txt", i, batch_size))
            fast_model.build_vocab(tokenized_corpus, update=True)
            fast_model.train(tokenized_corpus,
                             total_examples=batch_size,
                             epochs=fast_model.epochs)
        print("Epoch", str(ep + 1), ",Step", str(step))

fast_model.save("./output/fasttext")
class TweetModelRunner:
    def __init__(self,
                 startdate=None,
                 enddate=None,
                 tweettype=None,
                 search_terms=None,
                 remove_search_terms=True,
                 size=None,
                 aws_credentials=None):
        self.creds = aws_credentials
        self.e = ESSearch(aws_credentials)
        self.startdate = startdate
        self.enddate = enddate
        self.tweettype = tweettype
        self.processed_count = 0
        self.total_count = 0
        self.fasttextModel = None
        self.d2vmodel = None
        self.search_terms = search_terms
        self.size = size
        self.stopwords = set(stopwords.words('english'))
        additional_stops = [
            'rt', 'de', 'que', 'en', 'la', 'por', 'un', 'se', 'el', '...',
            'amp', "coronavirus", "covid", "19", '&amp'
        ]
        for stop in additional_stops:
            self.stopwords.add(stop)
        if remove_search_terms is True and search_terms is not None:
            for term in self.search_terms.lower().translate(
                    str.maketrans('', '', string.punctuation)).split():
                self.stopwords.add(term)
        p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.MENTION)

    def _removeNonAscii(self, s):
        return "".join(i for i in s if ord(i) < 128)

    def _remove_stops(self, word_text):
        filtered_text = [w for w in word_text if not w in self.stopwords]
        return filtered_text

    def _clean_text(self, the_tweet_text):
        cleaned_text = p.clean(the_tweet_text).lower().replace("’", "'")
        words = cleaned_text.split()
        reformed = [
            CONTRACTIONS[word] if word in CONTRACTIONS else word
            for word in words
        ]
        cleaned_text = " ".join(reformed)
        cleaned_text = cleaned_text.translate(
            str.maketrans('', '', string.punctuation))
        cleaned_text = self._removeNonAscii(cleaned_text)
        tokenized_text = list(tokenize(cleaned_text))
        tokenized_text = self._remove_stops(tokenized_text)

        return tokenized_text

    def tweetsIter(self, d2v=False):
        for tw in self.e.query(self.search_terms,
                               tweettype=self.tweettype,
                               startDateString=self.startdate,
                               endDateString=self.enddate,
                               size=self.size):
            tokenized_text = self._clean_text(tw["_source"]["text"])
            if len(tokenized_text) > 3:
                if d2v:
                    yield TaggedDocument(tokenized_text, [
                        str(tw["_source"]["tweet_id"]) + "&" +
                        tw["_source"]["date"] + "&" +
                        str(tw["_source"]["retweets"])
                    ])
                else:
                    yield tokenized_text
            else:
                continue

    def fastText(self):
        self.total_count = self.e.count(self.search_terms,
                                        tweettype=self.tweettype,
                                        startDateString=self.startdate,
                                        endDateString=self.enddate)
        print("TOTAL TWEETS MATCHING:" + str(self.total_count))
        self.fasttextModel = FastText(size=4, window=3, min_count=1)
        self.fasttextModel.build_vocab(sentences=self.tweetsIter())
        total_examples = self.fasttextModel.corpus_count
        self.fasttextModel.train(
            sentences=self.tweetsIter(),
            total_examples=total_examples,
            epochs=5)
        sstring = self.search_terms + self.startdate.replace('/', '-')
        fname = open('twitter_models/' + sstring + "fasttext.model", "wb")
        self.fasttextModel.save(fname)

    def loadFTModel(self, fileName):
        self.fasttextModel = FastText.load(fileName)

    def most_sims_FT(self, word):
        print(self.fasttextModel.wv.similar_by_word(word))

    def doc2vec(self, search_terms=None, save_model=True):
        from gensim.test.utils import common_texts
        print(self.tweettype)
        self.total_count = self.e.count(self.search_terms,
                                        tweettype=self.tweettype,
                                        startDateString=self.startdate,
                                        endDateString=self.enddate)
        print("TOTAL TWEETS MATCHING:" + str(self.total_count))
        self.d2vmodel = Doc2Vec(vector_size=100,
                                window=10,
                                min_count=1,
                                workers=4,
                                epochs=20)
        self.d2vmodel.build_vocab(self.tweetsIter(d2v=True))
        self.d2vmodel.train(self.tweetsIter(d2v=True),
                            total_examples=self.d2vmodel.corpus_count,
                            epochs=self.d2vmodel.epochs)
        sstring = self.search_terms + self.startdate.replace(
            '/', '-') if self.startdate is not None else self.search_terms
        sstring = sstring.replace('"', '*')
        fname = open(
            'twitter_network/twitter_created_models/' + sstring + "d2v.model",
            "wb")
        print(self.d2vmodel)
        if save_model:
            self.d2vmodel.save(fname)

    def loadd2vModel(self):
        lstring = self.search_terms + self.startdate.replace(
            '/', '-') if self.startdate is not None else self.search_terms
        lstring = lstring.replace('"', '*')
        fstring = 'twitter_network/twitter_created_models/' + lstring + "d2v.model"
        self.d2vmodel = Doc2Vec.load(fstring)
        print(self.d2vmodel.corpus_count)

    def jsonclusterd2vModel(self,
                            wfile=None,
                            write_s3=True,
                            write_local=False):
        from sklearn.cluster import AffinityPropagation
        from sklearn.cluster import KMeans
        from sklearn.cluster import MiniBatchKMeans
        from sklearn.preprocessing import StandardScaler
        from sklearn.decomposition import PCA
        import numpy
        import json
        import umap
        from collections import Counter
        import random
        if self.d2vmodel is None:
            raise ValueError("Please Initialize d2vmodel!")
        num_clusters = 1
        kmeans_model = KMeans(n_clusters=num_clusters,
                              init='k-means++',
                              max_iter=250)
        self.d2vmodel.init_sims(replace=True)
        X = kmeans_model.fit(self.d2vmodel.docvecs.doctag_syn0)
        labels = kmeans_model.labels_.tolist()
        l = kmeans_model.fit_predict(self.d2vmodel.docvecs.doctag_syn0)

        pca = PCA(n_components=2).fit(self.d2vmodel.docvecs.doctag_syn0)
        datapoint = pca.transform(self.d2vmodel.docvecs.doctag_syn0)

        if wfile:
            json_d = {
                "data": [],
                "centroids": [],
                "timeline": [],
                "search_terms": self.search_terms
            }
            centroid_labels = []
            centroids = kmeans_model.cluster_centers_

            for x in range(datapoint.shape[0]):
                json_d["data"].append({
                    "c":
                    labels[x],
                    "id":
                    self.d2vmodel.docvecs.index_to_doctag(x).split('&')[0],
                    "l":
                    datapoint[x].tolist(),
                    "d":
                    self.d2vmodel.docvecs.index_to_doctag(x).split('&')[1],
                    "p":
                    self.d2vmodel.docvecs.index_to_doctag(x).split('&')[2],
                })

            for cluster in range(num_clusters):
                thing = list(
                    filter(lambda x: x["c"] == cluster, json_d["data"]))
                centroide = (sum(map(lambda x: x["l"][0], thing)) / len(thing),
                             sum(map(lambda x: x["l"][1], thing)) / len(thing))
                wcounter = Counter()
                sample_n = 100 if len(thing) > 100 else len(thing)

                for choice in range(sample_n):
                    if sample_n < 100:
                        tweetobj = thing[choice]
                    else:
                        tweetobj = (random.choice(thing))
                    cleaned_text = self._clean_text((self.e.get_doc(
                        tweetobj["id"].split('&')[0])["_source"]["text"]))
                    wcounter.update(cleaned_text)
                centroid_labels.append(",".join(
                    map(lambda x: x[0], wcounter.most_common(5))))
                json_d["centroids"].append(
                    [centroide, centroid_labels[cluster]])
            print(centroid_labels)
            if write_local:
                pre = "twitter_network/static/twitter_network/data/"
                json_f = open(pre + wfile, "w")
                json.dump(json_d, json_f)
            if write_s3:
                S3_BUCKET = "socialmedia-models"
                s3 = S3Client(self.creds, S3_BUCKET)
                s3.upload_str(json.dumps(json_d), wfile)
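
# A minimal usage sketch for the TweetModelRunner class above; the dates, query,
# and credentials are placeholders (ESSearch and the Elasticsearch index are
# project-specific, so this only illustrates the call pattern).
runner = TweetModelRunner(startdate="2020/03/01",
                          enddate="2020/04/01",
                          search_terms='"stay home"',
                          size=10000,
                          aws_credentials=None)
runner.fastText()
runner.most_sims_FT("lockdown")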
Example No. 14
from gensim.models import FastText

from configuration import ROOT_PATH, WORD_DiMENSION
from word_embeddings.skipgram_model import read_sentences

if __name__ == '__main__':
    sentences = read_sentences(ROOT_PATH + '/data/cornell_movie_dialogs_corpus/movie_lines.txt')

    sg_model = FastText(size=WORD_DiMENSION, window=5, min_count=10, workers=4, sg=1)
    sg_model.build_vocab(sentences)
    sg_model.train(sentences, total_examples=sg_model.corpus_count, epochs=1)
    sg_model.save(ROOT_PATH + '/models/embeddings/gensim_fasttext.models')
Example No. 15

# Tokenizes and reads a corpus formatted as a CSV line by line.
class csvIterator(object):
    def __iter__(self):
        path = ('processed messages.csv')
        with open(path) as fin:
            for line in fin:
                yield list(tokenize(line))


# Basic Hyperparameters for training an embedding.
model = FT(size=350, window=5, min_count=5)

# Builds a list of all words encountered while reading the corpus.
model.build_vocab(sentences=csvIterator())

# Total number of training examples (sentences) counted while building the vocabulary.
total_examples = model.corpus_count

# Trains the model. "Epochs" is the ML term for the number of passes made over the training data.
model.train(sentences=csvIterator(), total_examples=total_examples, epochs=5)

# Normalizes the vector length. Helpful for similarity comparisons later on.
model.init_sims(replace=True)

print('Time elapsed during training: {:.2f} minutes'.format(
    (time.time() - training_time) / 60))

model.save('Embedding.model')
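
# A minimal follow-up sketch: reloading the saved embedding (FT is the FastText
# alias used above) and running a similarity query; the probe word is illustrative.
loaded = FT.load('Embedding.model')
print(loaded.wv.most_similar('payment', topn=5))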
Example No. 16
#import library
from nltk.tokenize import word_tokenize
from gensim.models import FastText
import pandas as pd

#load data

file = pd.read_csv("tmc_data.csv")
sentences = file["Text"]
corpus = []

#make corpus
for sent in sentences:
    corpus.append(word_tokenize(sent))

model = FastText(corpus, vector_size=100, workers=4, sg=1, window=3)
model.train(corpus, total_examples=len(corpus), epochs=10)

model.save("tmc2007_fasttext")  #binary file

#get info
print("Embeding size : ", 100)
#print("vocab size : ",len(model.vocab))

#test
print(model.wv.most_similar("airpor", topn=5))
print(model.wv.most_similar("airpo", topn=5))
print(model.wv.most_similar("airtraffic", topn=5))
print(model.wv.most_similar("craft", topn=5))
print(model.wv.most_similar("acce", topn=5))
Example No. 17
class Embed_Vocab(object):
    def __init__(self,
                 corpus='Avocado',
                 corpus_size=-1,
                 embed_type='word2vec',
                 embed_dim=50,
                 window_size=5,
                 max_iter=10,
                 path_to_corpus='',
                 save_flag=True):

        self.corpus = corpus
        self.corpus_size = corpus_size
        self.embed_type = embed_type
        self.embed_dim = embed_dim
        self.window_size = window_size
        self.max_iter = max_iter
        self.path_to_corpus = path_to_corpus
        self.save_flag = save_flag

        self.model = None

    def train(self):

        self.gen = Tokenize_Sent(self.path_to_corpus,
                                 self.corpus_size)  # iterator that reads and tokenizes the corpus files
        if self.embed_type == 'fasttext':
            print('Training fasttext model ...')
            # self.model = FastText(sentences = self.gen, size = self.embed_dim, iter =self.max_iter, window = self.window_size, min_count = 5, workers = 1, sg = 1)
            self.model = FastText(size=self.embed_dim,
                                  window=self.window_size,
                                  min_count=5,
                                  workers=4,
                                  sg=1)
            self.model.build_vocab(sentences=self.gen)
            self.model.train(sentences=self.gen,
                             total_examples=self.gen.size,
                             epochs=self.max_iter)

            wv = self.model.wv

            print('Words most similar to \'manager\':')
            print(wv.most_similar('manager'))

        else:
            raise NotImplementedError

        if self.save_flag:
            self.save(self.model)

    def save(self, model):

        checkpoint_dir = '../../logs/checkpoint_wordEmbed/{}'.format(
            self.corpus)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        model.save(
            os.path.join(
                checkpoint_dir, '{}.{}d.model'.format(self.embed_type,
                                                      self.embed_dim)))

    def load(self):

        checkpoint_dir = '../../logs/checkpoint_wordEmbed/{}/corpus800k'.format(
            self.corpus)
        if not os.path.exists(checkpoint_dir):
            print('Checkpoint Dir Does not Exist !')
        else:

            if self.embed_type == 'fasttext':
                self.model = FastText.load(
                    os.path.join(checkpoint_dir,
                                 'fasttext.{}d.model'.format(self.embed_dim)))
            else:
                raise NotImplementedError

    def _infer(self):

        self.load()
        wv = self.model.wv
        print('hwty' in wv)
        print('##ed' in wv)
        print('##y' in wv)
        print('##mi' in wv)
        print('##ne' in wv)
        print('Vector embedding for \'hello\':')
        print(wv['hello'])

        word_list = [
            'thanks', 'dear', 'happy', 'sad', 'cost', 'will', 'engine', 'call',
            'mail', 'server', 'bug', 'posted', 'inform', 'done', 'send',
            'forward', 'talk', 'update', 'regards', 'best', 'worst', 'http'
        ]

        for word in word_list:
            print('Words most similar to \'{}\':'.format(word))
            print(wv.most_similar(word))
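
# A minimal usage sketch for the Embed_Vocab class above, assuming the
# project-specific Tokenize_Sent iterator is importable and path_to_corpus
# points at the prepared corpus; all values are placeholders.
embedder = Embed_Vocab(corpus='Avocado',
                       corpus_size=1000,
                       embed_type='fasttext',
                       embed_dim=50,
                       max_iter=5,
                       path_to_corpus='../data/avocado_corpus/',
                       save_flag=False)
embedder.train()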
from gensim.models import FastText
from gensim.models import KeyedVectors
import csv
import os

recipe_sentences = [] # documents read from the csv files
direction = 'recipe_data/'
recipe_folder = os.listdir(direction) # list of files (folders) inside the recipe_data folder
for i, folder in enumerate(recipe_folder):
    csv_filepath = os.listdir(direction + folder) # list of csv files inside that folder
    for j, csv_file in enumerate(csv_filepath):
        fi = open(direction+folder+'/'+csv_file, 'rt', encoding='UTF8')
        rdr = csv.reader(fi)
        for k, row in enumerate(rdr):
            if k == 0:
                continue
            elif k % 2 == 0:
                recipe_sentences.append(row)
        fi.close()

model_ingredient = FastText(sg=1, window = 10 * 1000000, vector_size=100, min_count=3) # very large window so the model behaves like item2vec
model_ingredient.build_vocab(recipe_sentences)
model_ingredient.train(recipe_sentences, epochs = 10, total_examples=model_ingredient.corpus_count)

model_ingredient.save("./_model_ingredient") # save the trained model
model_ingredient.wv.save("./_model_ingredient_wv") # save the trained model's word vectors

similarity = model_ingredient.wv.most_similar(positive=['소세지'])
print(similarity)

Example No. 19
File: intent.py  Project: qubit56/AI
def main():
    print(
        '■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ main() ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■')
    df = pd.read_csv('./project/data/train_intent.csv')
    print(df.shape)  # (3918, 2)
    print(df.isnull().sum())  # check missing values: question 0, intent 0

    # Morpheme extraction and Word2Vec
    vector_size = 15
    okt = Okt()
    word2vec_model = FastText(size=vector_size, window=3, min_count=1)

    question = df['question']
    joinStr = ' '.join(question)  # convert list -> str
    morphs = okt.morphs(joinStr)  # extract morphemes -> list
    morphs = np.array(list(set(morphs)))  # set: remove duplicate words
    morphs = morphs.reshape(
        1, len(morphs))  # reshape so FastText is applied word by word. (1, n)
    # print(morphs)   # [['규모' '포시' '하하' ... '음악' '성시경' '공주']]
    # print(morphs.shape) # (1, 1605)

    print('FastText build compile')
    word2vec_model.build_vocab(sentences=morphs)
    print('FastText train')
    word2vec_model.train(sentences=morphs,
                         total_examples=word2vec_model.corpus_count,
                         epochs=10)
    print('FastText complete')
    w2c_index = word2vec_model.wv.index2word  # list of words in the trained FastText vocabulary

    # classify intent values
    intent = df['intent']  # intent values
    intent = list(set(intent))  # remove duplicate values
    print(
        intent
    )  # ['명언', '번역', '날씨', '시간', '맛집', '먼지', '달력', '위키', '인물', '뉴스', '음악', '이슈']

    # build intent_mapping
    idx = 0
    intent_mapping = {}
    for i in intent:
        intent_mapping[i] = idx
        idx += 1
    print(
        intent_mapping
    )  # {'달력': 0, '번역': 1, '맛집': 2, '날씨': 3, '음악': 4, '이슈': 5, '뉴스': 6, '인물': 7, '시간': 8, '위키': 9, '먼지': 10, '명언': 11}

    # create y_data
    y_data = df['intent']  # intent values
    y_data = y_data.map(intent_mapping)
    y_data = to_categorical(y_data)  # OneHot encoding
    print(y_data.shape)  # (3918, 12)

    # create x_data
    encode_length = 10
    x_data = []
    for q_raw in question:
        q_raw = okt.morphs(q_raw)  # split the sentence into morphemes (word-level split). str > list
        q_raw = list(
            map(lambda x: q_raw[x] if x < len(q_raw) else '@',
                range(encode_length)))
        # if x is less than the number of words, insert the word (q_raw[x]) as-is; otherwise insert '@'.

        q_raw = list(
            map(
                lambda x: word2vec_model[x]
                if x in w2c_index else np.zeros(vector_size, dtype=float),
                q_raw))
        q_raw = np.array(q_raw)
        x_data.append(q_raw)
    x_data = np.array(x_data)
    print(x_data.shape)  # (3918, 10, 15)

    x_data = x_data.reshape(len(x_data), encode_length * vector_size)
    print('Keras Start', x_data.shape, y_data.shape)

    model = Sequential()
    model.add(Dense(256, input_dim=150, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(12, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_data, y_data, batch_size=128, epochs=100)

    # remove stopwords from the input data
    del_josa = [
        '이구나', '이네', '이야', '은', '는', '이', '가', '을', '를', '로', '으로', '이야', '야',
        '냐', '니'
    ]

    def tokenize(sentence):
        word_bag = []
        pos = okt.pos(sentence)  # attach a POS tag to each morpheme

        for word, tag in pos:  # word and POS tag
            if (tag == 'Josa' and word in del_josa) or tag == 'Punctuation':
                # drop unnecessary particles and punctuation
                continue
            else:
                word_bag.append(word)  # 단어를 리스트에 추가한다.
        result = ' '.join(word_bag)

        return result

    # Vectorize the input sentence (data preprocessing)
    def pred(text):
        q_raw = okt.morphs(text)
        q_raw = list(
            map(lambda x: q_raw[x] if x < len(q_raw) else '@',
                range(encode_length)))
        q_raw = list(
            map(
                lambda x: word2vec_model[x]
                if x in w2c_index else np.zeros(vector_size, dtype=float),
                q_raw))
        q_raw = np.array(q_raw)
        print(q_raw)
        q_raw = q_raw.reshape(1, encode_length * vector_size)  # (1, 150)
        return q_raw

    # Run the interactive loop.
    while True:
        print('User : ', end='')
        speech = tokenize(input())
        print('tokenize : ', speech)
        speech = pred(speech)

        # Result
        y_intent = model.predict(speech)
        y_intent = np.argmax(y_intent)

        for result, num in intent_mapping.items():
            if y_intent == num:
                print('Intent : ', result, y_intent)
                break
    print(
        '■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ main() end ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■'
    )
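
# (Not part of the original listing.) A minimal, assumed entry point so the script can
# be run directly; main() as defined above takes no arguments.
if __name__ == '__main__':
    main()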
Ejemplo n.º 20
0
class Embedding:
    """
    Wrapper class for gensim FastText
    """

    # Constants
    DEFAULT_COS_DISTANCE = 0

    def __init__(self,
                 *args,
                 model_base=None,
                 model_filename=None,
                 fast_text=True,
                 **kwargs):
        """
        Initialize object.
        Set model_filename (path to saved base embedding model) OR model_base(initialized embedding model).
        If both are specified, model_base is preferred. If none, new model will be generated based on args and kwargs.
        """
        if model_base is not None:
            self.model = model_base
        elif model_filename is not None:
            self.load(path=model_filename)
        elif fast_text:
            self.model = FastText(*args, **kwargs)
        else:
            self.model = Word2Vec(*args, **kwargs)

    def save(self, path):
        """
        Save model to path
        """
        self.model.save(path)

    def load(self, path):
        """
        Load model from path
        """
        self.model = FastText.load(path)

    def build_vocab(self, *args, **kwargs):
        """
        Gensim build_vocab wrapper
        """
        self.model.build_vocab(*args, **kwargs)

    def train(self,
              *args,
              total_examples=None,
              epochs=None,
              verbose=False,
              **kwargs):
        """
        Gensim train wrapper
        """
        if total_examples is None:
            total_examples = self.model.corpus_count
        if epochs is None:
            epochs = self.model.epochs
        if verbose:
            kwargs['callbacks'] = [EmbeddingEpochCallback()]
        self.model.train(*args,
                         total_examples=total_examples,
                         epochs=epochs,
                         **kwargs)

    def get_vector_word(self, word, use_norm=False, handle_oov=True):
        """
        Get vector representation of a word
        """
        if handle_oov:
            try:
                result = self.model.wv.word_vec(word=word, use_norm=use_norm)
            except KeyError:
                result = np.zeros(self.model.wv.vector_size, np.float32)
            return result
        return self.model.wv.word_vec(word=word, use_norm=use_norm)

    def get_vector_sentence(self,
                            sentence,
                            min_n=3,
                            max_n=3,
                            use_norm_ngram_word=False,
                            use_norm_word=False,
                            use_norm_ngram_char=False,
                            handle_oov=True):
        """
        Get vector representation of a sentence
        """
        if min_n > max_n or min_n > len(sentence):
            min_n = len(sentence)
            max_n = len(sentence)
        elif max_n > len(sentence):
            max_n = len(sentence)
        ngrams = []
        for n in range(min_n, max_n + 1):
            ngrams += self._get_ngram_word(sentence=sentence, n=n)
        ngrams_found = 0
        result = np.zeros(self.model.wv.vector_size, np.float32)
        for ngram in ngrams:
            try:
                ngram_vector = self._get_vector_ngram_words(
                    ngram_words=ngram,
                    use_norm_word=use_norm_word,
                    use_norm_ngram_char=use_norm_ngram_char,
                    handle_oov=False)
                if use_norm_ngram_word:
                    ngram_vector = self.normalize_vector(ngram_vector)
                result += ngram_vector
                ngrams_found += 1
            except KeyError:
                pass
        if not handle_oov and ngrams_found == 0:
            raise KeyError('all word level n-grams are absent from model')
        else:
            return result / max(1, ngrams_found)

    @staticmethod
    def normalize_vector(vec):
        vec_length = np.linalg.norm(vec)
        if vec_length == 0.0:
            return vec
        return vec / vec_length

    @staticmethod
    def cosine_distance(vec1, vec2):
        # Note: despite the name, this returns the cosine *similarity* of the two vectors.
        return cosine_similarity([vec1], [vec2])[0][0]

    @staticmethod
    def _get_ngram_word(sentence, n):
        if n > len(sentence):
            n = len(sentence)
        elif n <= 0:
            return []
        result = []
        for idx_start in range(0, len(sentence) - n + 1):
            result.append(sentence[idx_start:idx_start + n])
        return result

    def _get_vector_ngram_words(self,
                                ngram_words,
                                use_norm_word=False,
                                use_norm_ngram_char=False,
                                handle_oov=True):
        words_found = 0
        result = np.zeros(self.model.wv.vector_size, np.float32)
        for word in ngram_words:
            try:
                word_vector = self.get_vector_word(
                    word=word, use_norm=use_norm_ngram_char, handle_oov=False)
                if use_norm_word:
                    word_vector = self.normalize_vector(word_vector)
                result += word_vector
                words_found += 1
            except KeyError:
                pass
        if not handle_oov and words_found == 0:
            raise KeyError('all words are absent from model')
        else:
            return result / max(1, words_found)
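
# (Illustrative usage sketch, not in the original listing.) The corpus and file name are
# placeholders; constructor kwargs are forwarded to gensim's FastText (gensim 3.x assumed).
corpus = [['hello', 'world'], ['embedding', 'wrapper', 'demo']]
emb = Embedding(size=50, window=3, min_count=1)   # builds a fresh FastText model
emb.build_vocab(sentences=corpus)
emb.train(sentences=corpus, verbose=False)        # total_examples/epochs default from the model
vec_word = emb.get_vector_word('hello')           # returns a zero vector on OOV when handle_oov=True
vec_sent = emb.get_vector_sentence(['hello', 'world'])
print(Embedding.cosine_distance(vec_word, vec_sent))
emb.save('embedding.demo.model')                  # placeholder path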
Ejemplo n.º 21
0
for com in data.Comment:
    com = re.sub("[^a-zA-Z0-9ğüşıöçİĞÜŞÖÇ]", " ", com)  # keep only letters (incl. Turkish) and digits
    com = com.lower()
    com = nlp.word_tokenize(com)
    comment_list.append(com)
 #%%
vector_size=250
window=5
#%% Build the FastText model and save it to disk

fasttext_model = 'fasstext.model'
print("Generating Fasttext Vectors...")
start = time.time()
model = FastText(size=vector_size, window=window, min_count=1, workers=4)
model.build_vocab(comment_list)
model.train(comment_list, total_examples=model.corpus_count, epochs=model.epochs)

print("Model created in {} seconds",format(time.time() -start))

model.save(fasttext_model)

del model  

#%%
fasttext_model = 'fasstext.model'
model = FastText.load(fasttext_model)

#%% Compute the average word vector for each comment
main_mean_array=[]
mean_vektor = np.zeros((1,250))
with tqdm(total=len(comment_list)) as pbar:
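    # (The loop body is truncated in this listing. A minimal sketch, assuming the goal
    #  stated above: average the FastText vectors of the words in each comment.)
    for com in comment_list:
        vecs = [model.wv[word] for word in com if word in model.wv]
        mean_vektor = np.mean(vecs, axis=0) if vecs else np.zeros(vector_size)
        main_mean_array.append(mean_vektor)
        pbar.update(1)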
Ejemplo n.º 22
0
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
'''
Created on 2019-05-21 15:23:11
@author: wind
'''

from gensim.models import FastText

# Training
sentences = [["你", "是", "谁"], ["我", "是", "中国人"]]
# Method 1 (not recommended by the gensim docs)
# model = FastText(sentences,  size=4, window=3, min_count=1, iter=10,min_n = 3 , max_n = 6,word_ngrams = 1)
# Method 2: build the vocabulary and train explicitly
model = FastText(size=4, window=3, min_count=1, word_ngrams=1)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=len(sentences), epochs=10)  

# Get word vectors
print(model.wv['你'])  # one way to obtain a word vector
print(model.wv.word_vec('你'))

# Save the model
model.save('./model.bin')
# Load the model
model = FastText.load("./model.bin")

# Save the word vectors in word2vec text format
model.wv.save_word2vec_format("./wv.txt")
Ejemplo n.º 23
0
sentenceList, entityList = train_data_load(traindataset_path)  # training data

print('start', len(sentenceList), path + folder)
if not os.path.exists(path + folder): os.makedirs(path + folder)  # create the output folder
#────────────────────────────────────────────────────────────────────

# → Word embedding
print("\n### Fasttext build model ###", end="\n")
w2vModel = FastText(size=vector_size, window=3, workers=8, min_count=1)
w2vModel.build_vocab(sentenceList)
print('\n### Fasttext build complete ###', end="\n")

print('\n### Fasttext train start ###', end="\n")
w2vModel.train(sentenceList,
               total_examples=w2vModel.corpus_count,
               epochs=w2vModel.epochs,
               compute_loss=True)
print('\n### Fasttext train complete ###', end="\n")

w2vModel.save(path + folder + 'fasttext')  # save the model
print('\n### Fasttext model save ###', end="\n")

# w2vModel = FastText.load('model/entity/통합_1031/fasttext')

# → list of words covered by fastText
w2vIndex = w2vModel.wv.index2word
print('* number of words:', len(w2vIndex))
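# (Hypothetical helper, not in the original listing.) index2word is a plain list; inverting
# it gives O(1) word -> row lookups when building label or embedding matrices later on.
word2idx = {word: idx for idx, word in enumerate(w2vIndex)}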
#────────────────────────────────────────────────────────────────────

# → Assign named-entity indices
Ejemplo n.º 24
0
print(pubmed_wv.most_similar(positive=['brain']))
print('----------------------------')
'''

### Create word2vec model w/ merged vocab
t = time()
new_wv = FastText(size=30, window=5, min_count=1, workers=3, sg=0, hs=1, negative = 10, sample=0.001, alpha=0.1)
new_wv.build_vocab(sentences)
'''
total_examples = new_wv.corpus_count
new_wv.build_vocab([list(pubmed_wv.vocab.keys())], update=True)
new_wv.intersect_word2vec_format(preTrainedPath, binary=True, lockf=1.0)
'''

### Train for 2 epochs
new_wv.train(sentences, total_examples=new_wv.corpus_count, epochs=2)
print('Time to train the model 2 epochs: {} mins'.format(round((time() - t) / 60, 2)))
print('----------------------------')
print(new_wv.wv.most_similar(positive=['treatment']))
print(new_wv.wv.most_similar(positive=['female']))
print(new_wv.wv.most_similar(positive=['history']))
print(new_wv.wv.most_similar(positive=['disease']))
print(new_wv.wv.most_similar(positive=['brain']))
new_wv.wv.save_word2vec_format('mimic-pubmed_2.bin', binary=True)
print('----------------------------')


# Train for 8 more epochs (10 total)
new_wv.train(sentences, total_examples=new_wv.corpus_count, epochs=8)
print('Time to train the model 10 epochs: {} mins'.format(round((time() - t) / 60, 2)))
print('----------------------------')
Ejemplo n.º 25
0
class Text(Dataset):
    def __init__(self, file=None, df=None, feature_col='Text', label_col=''):
        super().__init__(file, df, feature_col, label_col)
        self.text = self.X
        self.weights = None
        self.split_text()
        # Split into text train and test

    def split_text(self):
        self.refresh()
        self.sentence_train, self.sentence_test, self.y_train, self.y_test = train_test_split(
            self.text, self.y)

    def bag_of_words(self, **kwargs):
        """Transform text corpus into bag of words
        i.e ['Hi you, how are you', 'I am doing well, thank you!'] -> [[1, 1, 1, 2, 0, 0, 0, 0, 0],  [0, 0, 0, 1, 1, 1, 1, 1, 1]]
        """
        self.vectorizer = CountVectorizer(**kwargs)
        self.vectorizer.fit(self.sentence_train)

        self.BoW_train = self.vectorizer.transform(
            self.sentence_train).toarray()
        self.BoW_test = self.vectorizer.transform(self.sentence_test).toarray()
        self.X_train = self.BoW_train
        self.X_test = self.BoW_test

        self.feature_names = self.vectorizer.get_feature_names()

    def vectorize(self, num_words=10000):
        """Transform text corpus to integers in a tokenizer
        i.e. ["Hi how are you?", "I'm well, how about you"] becomes [[10, 3, 4, 7, 0], [5, 12, 3, 15, 7]]
        """
        self.vectorizer = Tokenizer(num_words)
        self.vectorizer.fit_on_texts(self.sentence_train)

        self.tokenized_train = self.vectorizer.texts_to_sequences(
            self.sentence_train)
        self.tokenized_test = self.vectorizer.texts_to_sequences(
            self.sentence_test)

        self.wtoi = self.vectorizer.word_index
        self.itow = self.vectorizer.index_word
        self.pad_and_refresh()

    def pad_and_refresh(self, max_len=None):
        if max_len is None:
            self.tokenized_train = pad_sequences(self.tokenized_train,
                                                 padding='post')
            self.tokenized_test = pad_sequences(self.tokenized_test,
                                                padding='post')
        else:
            self.tokenized_train = pad_sequences(self.tokenized_train,
                                                 padding='post',
                                                 maxlen=max_len)
            self.tokenized_test = pad_sequences(self.tokenized_test,
                                                padding='post',
                                                maxlen=max_len)

        self.X_train = self.tokenized_train
        self.X_test = self.tokenized_test

        self.vocab_size = len(self.wtoi) + 1

    def create_pretrained_embedding_matrix(self, path, embedding_dim=300):
        # works after vectorize
        self.weights = np.zeros((self.vocab_size, embedding_dim))

        with open(path) as f:
            for line in f:
                word, *vector = line.split()  # first token is the word, the rest are vector components
                if word in self.vectorizer.word_index:
                    idx = self.wtoi[word]
                    self.weights[idx] = np.array(
                        vector, dtype=np.float32)[:embedding_dim]

    def word_to_index(self, word):
        #word to index
        return self.wtoi[word]

    def index_to_word(self, idx):
        #index to word
        return self.itow[idx]

    def train_fasttext(self,
                       path,
                       sg=1,
                       embedding_dim=300,
                       min_count=2,
                       max_vocab_size=30000,
                       seed=42,
                       epochs=10,
                       workers=4,
                       lowercase=False,
                       full=False):

        sentences = self.sentence_train.values

        self.fasttext_model = FastText(sg=sg,
                                       size=embedding_dim,
                                       min_count=min_count,
                                       max_vocab_size=max_vocab_size,
                                       seed=seed,
                                       workers=workers)

        tokenized = list(self._gen_sentences(sentences, lowercase=lowercase))

        print('Building vocabulary for fasttext model...')
        self.fasttext_model.build_vocab(sentences=tokenized)

        print('Training fasttext model...')
        self.fasttext_model.train(sentences=tokenized,
                                  total_examples=len(tokenized),
                                  epochs=epochs)
        self.word_vectors = self.fasttext_model.wv

        counts = Counter({
            word: vocab.count
            for (word, vocab) in self.word_vectors.vocab.items()
        })

        self.wtoi = {
            t[0]: i + 1
            for i, t in enumerate(counts.most_common(max_vocab_size))
        }
        self.itow = {v: k for k, v in self.wtoi.items()}

        self.tokenized_train = [[self.wtoi.get(word, 0) for word in sentence]
                                for sentence in tokenized]

        tok_test = list(self._gen_sentences(self.sentence_test.values, lowercase=lowercase))
        self.tokenized_test = [[self.wtoi.get(word, 0) for word in sentence]
                               for sentence in tok_test]

        self.pad_and_refresh()

        self.save_fasttext(path)
        self.create_embedding_matrix(embedding_dim)

    def create_embedding_matrix(self, embedding_dim):
        self.weights = np.zeros((self.vocab_size, embedding_dim))

        for word, i in self.wtoi.items():
            if i >= 10000:
                continue
            try:
                embedding_vector = self.word_vectors[word]
                # words not found in the embedding index stay all-zeros.
                self.weights[i] = embedding_vector
            except KeyError:
                pass

    def save_fasttext(self, path):
        model_path = os.path.join(path, 'fasttext.model')
        self.fasttext_model.save(model_path)

    def _gen_sentences(self, sentences, lowercase=False):
        for s in sentences:
            yield (list(tokenize(s, lowercase=lowercase)))
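
# (Illustrative usage sketch, not in the original listing.) The CSV path and column names
# are placeholders; Text() defers to the Dataset base class defined elsewhere in the project.
text_ds = Text(file='reviews.csv', feature_col='Text', label_col='Label')
text_ds.train_fasttext(path='.', embedding_dim=100, min_count=1, epochs=5)
print(text_ds.X_train.shape, text_ds.weights.shape)  # padded token ids and the embedding matrix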
Ejemplo n.º 26
0
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for the training file."

        with open(self.raw_data_path, 'r') as file:
            text = file.read()
            sentences = sent_tokenize(text)

        occ_register = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<exc>', '<pad>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        texts = []
        unq_words = []
        unk_words = []

        for i, line in enumerate(sentences):
            words = word_tokenize(line)
            occ_register.update(words)
            texts.append(words)

        if self.pre_emb:
            model = KeyedVectors.load_word2vec_format(self.model_path)
        else:
            if os.path.exists(self.model_path):
                model = FastText.load(self.model_path)
            else:
                model = FastText(size=self.ft_size,
                                 window=self.ft_w,
                                 min_count=self.min_occ)
                model.build_vocab(sentences=texts)
                model.train(sentences=texts,
                            total_examples=len(texts),
                            epochs=0)
                model.save(self.model_path)

        base = np.ones((300, ), dtype=np.float32)
        emb = [base * (i - 1) for i in range(len(special_tokens))]

        for w, occ in occ_register.items():
            if occ > self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)
                if self.pre_emb:
                    if w in model.vocab:
                        emb.append(model[w])
                    else:
                        emb.append(emb[0])
                        unk_words.append(w)
                else:
                    emb.append(model[w])
            else:
                unq_words.append(w)

        assert len(w2i) == len(i2w) == len(emb)
        emb = np.array(emb)
        # print(emb.min())
        # print(emb.max())
        emb = (emb - emb.min()) / (emb.max() - emb.min())

        print("Vocablurary of {} keys created, {} words are excluded, {} "
              "words not in embedding dictionary.".format(
                  len(w2i), len(unq_words), len(unk_words)))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.gen_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        with open(os.path.join(self.gen_dir, self.emb_file), 'wb') as emb_file:
            pickle.dump(np.array(emb), emb_file)

        with open(os.path.join(self.gen_dir, 'cub.unique'), 'wb') as unq_file:
            pickle.dump(np.array(unq_words), unq_file)

        with open(os.path.join(self.gen_dir, 'cub.unknown'),
                  'wb') as unknown_file:
            pickle.dump(np.array(unk_words), unknown_file)

        with open(os.path.join(self.gen_dir, 'cub.all'), 'wb') as a_file:
            pickle.dump(occ_register, a_file)

        self._load_vocab()
Ejemplo n.º 27
0
class OneVsRestSGDClassifier(LabelClassifier):
    def __init__(self,
                 f_dim=100,
                 ft_iters=20,
                 update_iters=100,
                 label_dict_path='data/labels.txt'):

        LabelClassifier.__init__(self, label_dict_path)
        self.f_dim = f_dim  # dimension of word feature vector
        self.ft_iters = ft_iters
        self.update_iters = update_iters

        self.ft_model = FastText(min_count=1, size=self.f_dim)
        self.clf = OneVsRestClassifier(
            SGDClassifier(loss='modified_huber',
                          class_weight={
                              0: 0.4,
                              1: 0.6
                          },
                          penalty='l2',
                          warm_start=False,
                          random_state=1))

    def init_fasttext(self, model_path=None, train_data=None):
        """
        if train_data is provided, train a new fasttext model;
        otherwise, load it from the given path

        --------
        Parameter:

            model_path: fasttext model prefix

            train_data: a list of tokenized sentences. if not provided,
                will try to load existing model from model_path

        """

        if not train_data and model_path and os.path.isfile(model_path):
            #=== load existing model ====
            print('loading fasttext model from', model_path)
            self.ft_model = FastText.load(model_path)

        elif train_data:
            #=== train fast text model ====
            # if train_data is not a list of list, split each sentence
            # into list of words
            print('training fasttext model from scratch...')
            train_data = [re.split(',| ', r) if not isinstance(r, list) else r
                          for r in train_data]

            self.ft_model.build_vocab(train_data)
            self.ft_model.train(train_data,
                                total_examples=len(train_data),
                                epochs=self.ft_iters)
            if model_path:
                self.ft_model.save(model_path, separately=[])
        else:
            #=== no train data and no model path provided
            raise TrainDataException(
                'Error building fasttext model. No data/model provided.')

    def div_norm(self, x):
        norm_value = np.sqrt(np.sum(x**2))  #l2norm
        if norm_value > 0:
            return x * (1.0 / norm_value)
        else:
            return x

    def sentence_to_vec(self, words):
        """ generating embedding by summing up normalized
        word embeddings

        --------
        Parameter:
            words: a list of words or a string representation of a sentence
            (seperated by space or ',' )

        Return:
            sentence embedding matrix of size len(words) x f_dim

        """
        if not isinstance(words, list):
            words = re.split(',| ', words)

        vecs = np.zeros((len(words), self.f_dim))
        for i, word in enumerate(words):
            v = self.ft_model.wv.get_vector(word)
            vecs[i] = self.div_norm(v)
        return np.mean(vecs, axis=0)

    def to_vec(self, data):
        """ batch computation of sentence embeddings """
        vec = np.zeros((len(data), self.f_dim))
        for i, sentence in enumerate(data):
            vec[i] = self.sentence_to_vec(sentence)

        return vec

    def train(self, train_data, train_label):
        """
        offline training of the SGD classifier

        --------
        Parameters:

            train_data: a list of tokenized sentences. Each sentence is either
                a string delimited by comma or space, or a list of words.

            train_label: a list of labels. Each label is a string delimited
                by comma or space.
        Return:

            X: sentence embedding matrix of size len(train_data) x f_dim
            Y: binary label matrix of size len(train_data) x #_classes
        """
        print('training multilabel classifier on %d samples...' %
              len(train_data))
        Y = np.zeros((len(train_label), len(self.labeldict)))
        for i, labels in enumerate(train_label):
            label_list = re.split(',| ', labels)

            for l in label_list:
                if l:
                    Y[i, self.labeldictR[l]] = 1

        # add dummy sample to classes that do not have samples
        indices = np.where(np.sum(Y, axis=0) == 0)[0]
        Y_new = np.zeros((len(indices), Y.shape[1]))
        for i, id in enumerate(indices):
            train_data.append([self.labeldict[id]])
            Y_new[i, id] = 1
        Y = np.vstack((Y, Y_new))

        X = self.to_vec(train_data)
        self.clf.fit(X, Y)
        return X, Y

    def train_update(self, train_data, train_label):
        """
        online training of the SGD classifier

        --------
        Parameters: see train()

        """
        Y = np.zeros((len(train_label), len(self.labeldict)))
        X = self.to_vec(train_data)
        for i, labels in enumerate(train_label):
            label_list = re.split(',| ', labels)
            for l in label_list:
                if l:
                    Y[i, self.labeldictR[l]] = 1
        for i in range(self.update_iters):
            self.clf.partial_fit(X, Y)
        return X, Y

    def classify(self, string):
        """
        predict the labels of a tokenized sentence

        --------
        Parameters:
            string: string delimited by comma or space, or a list of words

        Return:
            labels: a list of predicted labels

        """
        X = self.to_vec([string])
        Y = self.clf.predict(X)
        #print('class probability',self.clf.predict_proba(X) )
        labels = [self.labeldict[id] for id in np.nonzero(Y[0])[0]]

        return labels

    def save_clf(self, filename):
        print('writing classification model to', filename, '...')
        with open(filename, 'wb') as f:
            pickle.dump(self.clf, f)

    def load_clf(self, filename):
        print('loading classification model from', filename, '...')
        with open(filename, 'rb') as f:
            self.clf = pickle.load(f)
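
# (Illustrative usage sketch, not in the original listing.) The label dictionary path,
# sentences, and labels below are placeholders; LabelClassifier and TrainDataException
# are defined elsewhere in the project.
clf = OneVsRestSGDClassifier(f_dim=100, ft_iters=20, label_dict_path='data/labels.txt')
train_data = ['red round fruit', 'yellow long fruit']  # strings or token lists
train_label = ['apple', 'banana']                      # comma/space-delimited label strings
clf.init_fasttext(model_path='ft.model', train_data=[s.split() for s in train_data])
clf.train(train_data, train_label)
print(clf.classify('small red fruit'))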
Ejemplo n.º 28
0
                        # Save the particular model
                        model_dbow.save("/share/pi/rubin/jiaming/models/{}.model".format(model_name))
                        model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

                    elif args.model == 'fasttext':
                        model_name = "fasttext_v{}_a{}_e{}_t{}_w{}_s{}_ns{}".format(args.size, args.alpha,
                                                                                    args.epochs, args.alg, 
                                                                                    args.window, args.sample, 
                                                                                    args.ns_exponent)
                        if model_name + ".model" in trained_models:
                            print(model_name + " already trained. Skipping.")
                            continue
                        
                        print("Training model: " + model_name)
                        model = FastText(min_count=10, negative=5, 
                                         size=args.size, sg = args.alg,
                                         alpha=args.alpha, min_alpha=args.alpha,
                                         window=args.window, sample=args.sample,
                                         ns_exponent=args.ns_exponent, workers=10)
                        model.build_vocab(sentences=[word_tokenize(line.strip()) for line in note_sentences])
                        for epoch in range(args.epochs):
                            # shuffle and tokenize the sentences the same way build_vocab saw them
                            model.train(sentences=utils.shuffle([word_tokenize(line.strip()) for line in tqdm(note_sentences)]),
                                        total_examples=len(note_sentences), epochs=1)
                            model.alpha -= 0.002
                            model.min_alpha = model.alpha

                        # Save the particular model
                        model.save("/share/pi/rubin/jiaming/models/{}.model".format(model_name))

Ejemplo n.º 29
0
def train_fasttext(corpus):
    model = FastText(size=9, window=2, min_count=1)
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus), epochs=10)

    model.save(r'models\fasttext.model')
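

# (Illustrative usage, not in the original snippet.) The toy corpus is a placeholder and the
# models\ directory is assumed to exist before saving.
corpus = [['hello', 'world'], ['fast', 'text', 'demo']]
train_fasttext(corpus)
model = FastText.load(r'models\fasttext.model')
print(model.wv.most_similar('hello'))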
Ejemplo n.º 30
0
    food = ["food"]
    rest = ["restaurant"]

    #-------------------------- Preprocessing ----------------------------------
    reviews = []
    for i in reviews_df_com["Review Text"]:
        reviews.append(clean_text(i))     #top.append(text)  # That is very IMPORTANT !!! :)

    # --------------------------- Word2Vec ----------------------------------
    # model = Word2Vec(reviews, size=150, window=10, min_count=2, workers=10)
    # model.train(reviews, total_examples=len(reviews), epochs=10)

    # ---------------------------- Fasttext ----------------------------------
    model = FastText(size=170, window=10, min_count=2, workers=10)  # instantiate
    model.build_vocab(sentences=reviews)
    model.train(sentences=reviews, total_examples=len(reviews), epochs=10)  # train

    word_vectors = model.wv

    word2vectfonc(hotel, 1)
    word2vectfonc(staff, 1)
    word2vectfonc(loc, 2)
    word2vectfonc(room, 3)
    word2vectfonc(breakfast, 4)
    word2vectfonc(bed, 5)
    word2vectfonc(service, 6)
    word2vectfonc(bath, 7)
    word2vectfonc(view, 8)
    word2vectfonc(food, 9)
    word2vectfonc(rest, 10)
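
    # (word2vectfonc is defined elsewhere in the original project. One plausible, purely
    #  illustrative shape, based on how it is called above: print the words most similar
    #  to each aspect's seed terms, tagged with the aspect id.)
    # def word2vectfonc(seed_words, aspect_id):
    #     for seed in seed_words:
    #         print(aspect_id, word_vectors.most_similar(positive=[seed], topn=10))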