Example No. 1
    def testFastText(self):
        class LeeReader(object):
            def __init__(self, fn):
                self.fn = fn

            def __iter__(self):
                with smart_open(self.fn, 'r', encoding="latin_1") as infile:
                    for line in infile:
                        yield line.lower().strip().split()

        model = FastText(LeeReader(datapath('lee.cor')))
        model.init_sims()
        index = self.indexer(model, 10)

        self.assertVectorIsSimilarToItself(model.wv, index)
        self.assertApproxNeighborsMatchExact(model, model.wv, index)
        self.assertIndexSaved(index)
        self.assertLoadedIndexEqual(index, model)
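
The indexer used above is left abstract by the test harness. As a hedged illustration only (not part of the original test), gensim's Annoy wrapper can fill that role; the gensim 3.x import path, the toy common_texts corpus, and an installed annoy package are assumptions here.

from gensim.models import FastText
from gensim.similarities.index import AnnoyIndexer  # gensim 3.x location; gensim 4.x moved it to gensim.similarities.annoy
from gensim.test.utils import common_texts

model = FastText(common_texts, size=10, min_count=1)  # old-style `size` kwarg, matching these examples
model.init_sims()
index = AnnoyIndexer(model, 10)  # build an approximate index with 10 Annoy trees
# approximate neighbours of an in-vocabulary word vector
print(index.most_similar(model.wv['human'], 5))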
Example No. 2
def get_fasttext_embedding_matrix(word_index, max_nb_words):
    model = fText.load_fasttext_format(FASTTEXT_FILE)
    nb_words = max_nb_words
    word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > max_nb_words:
            continue
        embedding_vector = model.wv[word]
        if embedding_vector is not None:
            word_embedding_matrix[i] = embedding_vector

    print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))
    return word_embedding_matrix, nb_words
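
A hedged note on the lookup above: for a FastText model, model.wv[word] never returns None. Out-of-vocabulary words are composed from character n-grams, and in some gensim versions a word with no known n-grams raises KeyError instead, so a try/except is the safer guard. The helper below is an illustrative sketch, not part of the original snippet.

import numpy as np

def safe_vector(model, word, dim):
    # FastText builds vectors for OOV words from their character n-grams;
    # the failure mode to guard against is KeyError, not a None result.
    try:
        return model.wv[word]
    except KeyError:
        return np.zeros(dim)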
Example No. 3
    def _train(self):
        """
        Train the np2vec model.
        """
        if self.word_embedding_type == 'word2vec':
            self.model = Word2Vec(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter)

        elif self.word_embedding_type == 'fasttext':
            self.model = FastText(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter,
                min_n=self.min_n,
                max_n=self.max_n,
                word_ngrams=self.word_ngrams)
        else:
            logger.error(
                'invalid word embedding type: ' +
                self.word_embedding_type)
            sys.exit(0)
Example No. 4
    def load(cls, np2vec_model_file, binary=False, word_ngrams=0):
        """
        Load the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword (
            ngrams) information.

        Returns:
            np2vec model to load
        """
        if word_ngrams == 0:
            return KeyedVectors.load_word2vec_format(
                np2vec_model_file, binary=binary)
        elif word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        else:
            logger.error('invalid value for \'word_ngrams\'')
Example No. 5
 def word_embedding(self):
     return FastText(self.tokens, size=100, min_count=5, workers=multiprocessing.cpu_count(), sg=1)
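
For context, a self-contained sketch of the same call on gensim's bundled toy corpus; min_count is lowered to 1 so the tiny corpus survives the frequency cut, and the query word is simply an in-vocabulary token.

import multiprocessing
from gensim.models import FastText
from gensim.test.utils import common_texts

model = FastText(common_texts, size=100, min_count=1,
                 workers=multiprocessing.cpu_count(), sg=1)  # skip-gram, as above
print(model.wv.most_similar('computer', topn=3))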
Example No. 6
def main():
    class MyIter(object):
        def __iter__(self):
            # absolute_path = r'C:\Users\NA\Desktop\Workspace\GJAI_WarmingUpProject\AIJOA_Project\wiki.ko'
            file_name = 'voice.txt'
            path = datapath(absolute_path + '/' + file_name)  # absolute path of the training data
            with utils.open(path, 'r', encoding='UTF-8') as fin:
                for line in fin:
                    yield list(tokenize(line))

    # load the data
    absolute_path = r'C:\Users\NA\Desktop\Workspace\GJAI_WarmingUpProject\AIJOA_Project\wiki.ko'
    file_name = 'wiki.ko.bin'
    # file_name = 'cc.ko.300.bin' # wiki.ko.bin
    corpus_file = datapath(absolute_path + '/' +
                           file_name)  # absolute path of the data (a relative path does not work)

    # build the model
    # instantiate the model; see the parameter reference: https://radimrehurek.com/gensim/models/fasttext.html
    print("모델 객체 생성")
    # model = FastText(size=1000, window=3, min_count=3, workers=4, sg=1)
    model = models.fasttext.load_facebook_model(corpus_file)

    # continue training the loaded model
    # collect the data loaded via MyIter() into the new_sentences list
    new_sentences = []
    for i in MyIter():
        new_sentences.append(i)
    print(new_sentences)

    # increase the amount of data loaded via MyIter()
    new_sentences = new_sentences * 10000
    print(len(new_sentences))

    # this also sets the corpus_total_words (and corpus_count) model attributes
    # when continuing training, update=True must be set so the new vocabulary is added to what was already learned
    print("추가 어휘구성")
    model.build_vocab(new_sentences, update=True)  # Update the vocabulary
    print("추가 어휘구성 끝")
    # additional training
    print("추가 학습")
    model.train(new_sentences,
                total_examples=len(new_sentences),
                epochs=model.epochs)
    print("추가 학습 끝")

    # save the model: use an absolute path for the save location, otherwise it ends up somewhere unexpected
    save_name = "wiki_ko_v3.model"
    fname = get_tmpfile(absolute_path + '/' + save_name)
    print("model save start")
    model.save(fname)
    print("model save end")

    # load the previously saved model (absolute path)
    # absolute_path = r'C:\Users\NA\Desktop\Workspace\GJAI_WarmingUpProject\AIJOA_Project\wiki.ko'
    file_name = "wiki_ko_v3.model"
    fname = get_tmpfile(absolute_path + '/' + file_name)
    print("model load start")
    model = FastText.load(fname)
    print("model load end")

    menu_list = {
        '폴더버거 핫치킨': [
            '골드버거 치킨', '오늘도 봐봐 치킨', '오늘도 보고 와 치킨', '불도 먹었어 치킨', '골드버거 핫치킨',
            '골드버거 치킨', '월드 보고 아침에', '오늘도 보고 와 치킨', '폴더 버거 킹', '홀더 버거 치킨',
            '뭘 더 먹어 치킨', '너 먹어 치킨', '뭐 먹어 치킨'
        ],
        '폴더버거 비프': [
            '골드버그 비프', '올더 버거 비프', '폴더 버거 비프', '골드버그 비프 세트', '올더 버거 비프 세트',
            '어디서 먹어 핑크색', '물 더 먹어 비트 세트', '골드버그 비프 세트', '올 더 버거 비틀 세트',
            '홀더 버거 비프', '뭘 더 먹어 비프', '너 먹어 피프 세트', '뭐 먹어 비프'
        ],
        '리아미라클버거':
        ['리아미라클버거', '미아 미라클버거', '리아미라클버거 세트', '미라클버거 세트', '리아 미라클 버거 세트'],
        '와규 에디션 투': [
            '외규에디션 2', '마귀 에디션 2', '와규에디션 2', '와 귀신 전화', '월요일 좀 주세요 전화',
            '와규에디션 2 세트', '와규에디션 2 세트', '목요일 샘플 세트'
        ],
        '더블엑스투': [
            '브렉시트', '더블엑스 2', '더블 X2 세트', '저번에 지출 세트', '버그 렉스필드 전화',
            '노래 치킨 세트', '더블 X2 세트', '더블엑스 풀세트', '더블엑스 두 세트'
        ],
        '티렉스': [
            '티렉스 버거', '티렉스 버거 세트', '티렉스버거세트', '티렉스버거 찾아', '티렉스 버거 세트 두 개',
            '티렉스버거세트 두 개'
        ],
        '클래식 치즈버거': [
            '클래식 치즈버거', '클래식 치즈버거 세트 하나', '클래식 치즈버거 틀어', '클래식 치즈버거 세트',
            '클래식 치즈버거 세트 두 개'
        ],
        '한우불고기': [
            '한우 불고기', '한우불고기 세트 하나', '한우 불고기 집 전화', '한우 불고기 제주 TV',
            '한우불고기 두 개', '한우불고기 세트 두 개'
        ],
        '모짜렐라 인 더 버거 베이컨': [
            '모짜렐라인 더 버거', '모짜렐라인 더 버거 세트 하나', '모짜렐라 인더버거 베트남',
            '모짜렐라인 더 버거 세트 두 개', '모짜렐라인 더 버거 세트'
        ],
        # '모짜렐라 in the 버거':['모짜렐라인 더 버거', '모짜렐라인 더 버거 세트 하나',
        #                    '모짜렐라 인더버거 베트남', '모짜렐라인 더 버거 세트 두 개',
        #                    '모짜렐라인 더 버거 세트'],
        # '모짜렐라 인 더 버거':['모짜렐라인 더 버거', '모짜렐라인 더 버거 세트 하나',
        #                    '모짜렐라 인더버거 베트남', '모짜렐라인 더 버거 세트 두 개',
        #                    '모짜렐라인 더 버거 세트'],
        '에이지버거': [
            'az버거', 'az버거 세트', '에이지버거', '에이지버거 세트', '아재버거', '아재버거 세트 하나',
            '거제도 가서 찾아', '아재 동생 두 개', '아재버거세트 두 개', '아재버거 세트 두 개'
        ],
        'az버거': [
            'az버거', 'az버거 세트', '에이지버거', '에이지버거 세트', '아재버거', '아재버거 세트 하나',
            '거제도 가서 찾아', '아재 동생 두 개', '아재버거세트 두 개', '아재버거 세트 두 개'
        ],
        '에이제트버거': [
            'az버거', 'az버거 세트', '에이제트버거', '에이제트버거 세트', '아재버거', '아재버거 세트 하나',
            '거제도 가서 찾아', '아재 동생 두 개', '아재버거세트 두 개', '아재버거 세트 두 개'
        ],
        '원조 빅불': [
            '원조빅불', '원조빅불 세트', '원조빅불세트', '언제 도착하나', '물제 를 풀 세트', '원조 빅불 세트',
            '오늘 이불세트'
        ],
        '핫크리스피버거': [
            '핫크리스피버거', '하트스피가방', '핫크리스피버거 세트', '크리스피 버거 세트', '핫 크리스피버거 세트',
            '하트 스트커 세트'
        ],
        '불고기버거': ['불고기 버거', '불고기 버거 세트 하나', '불고기 버거 세트', '불고기버거 세트 두 개'],
        '데리버거': [
            '데리버거', '데리버거 세트 하나', '데리버거 찾아', '데리버거 두 개', '데리버거 세트 두 개',
            '데리버거세트 두 개'
        ],
        '치킨버거':
        ['치킨버거', '치킨 먹어', '치킨 버거 세트', '치킨 먹었다', '치킨 먹어서 두 개', '치킨 버거 세트 두 개'],
        '새우버거': ['새우버거', '재봉 설탕', '새우버거 세트', '일본어 태어나', '여보 가서 켜', '새우버거 속']
    }

    # similarity between each menu's mis-transcriptions and all menu names
    print("메뉴오탈과 메들 사이의 유사도 비교 파일 생성 시작")
    for key1 in menu_list.keys():
        file_name = f'{key1}메뉴의 오탈자와 전체메뉴들과의 유사도.txt'
        for value in menu_list[key1]:
            for key2 in menu_list.keys():
                # print(f"메뉴'{key1}'의 오탈자 '{value}'와 메뉴명'{key2}'와의 유사도: {model.wv.similarity(value, key2)}")
                if file_name not in os.listdir(absolute_path):
                    with open(absolute_path + '/' + file_name,
                              'w',
                              encoding='utf-8') as file_data:
                        file_data.write(
                            f"'{key1}'의 오탈자 '{value}'와 메뉴명'{key2}'와의 유사도: {model.wv.similarity(value, key2)}\n"
                        )
                else:
                    with open(absolute_path + '/' + file_name,
                              'a',
                              encoding='utf-8') as file_data:
                        file_data.write(
                            f"'{key1}'의 오탈자 '{value}'와 메뉴명'{key2}'와의 유사도: {model.wv.similarity(value, key2)}\n"
                        )
    print("메뉴오탈과 메들 사이의 유사도 비교 파일 생성 종료")
Example No. 7
def getResult(text):
    print("START READ AND PREPROCESSING", text)
    # kitab = np.load('dataset/numpy/kitabsave.npy', allow_pickle=True)
    kitab = np.load(resource_path('./data/dataset/kitabsave.npy'), allow_pickle=True)
    kitab = kitab.tolist()
    # sentence_clear = np.load('dataset/numpy/sentence_clearsave.npy', allow_pickle=True)
    sentence_clear = np.load(resource_path('./data/dataset/sentence_clearsave.npy'), allow_pickle=True)
    sentence_clear = sentence_clear.tolist()
    # kategori = np.load('dataset/numpy/kategori.npy', allow_pickle=True)
    kategori = np.load(resource_path('./data/dataset/kategori.npy'), allow_pickle=True)
    kategori = kategori.tolist()
    # namakitab = np.load('dataset/numpy/namakitab.npy', allow_pickle=True)
    namakitab = np.load(resource_path('./data/dataset/namakitab.npy'), allow_pickle=True)
    namakitab = namakitab.tolist()

    # kitab[0][1]
    # modelFT = ft.load('Model/modelFT.model')
    modelFT = ft.load(resource_path('./data/model/modelFT.model'))

    #TF-IDF
    tfidf_vectorizer = TfidfVectorizer()

    norm_tf=[]
    for isikitab in kitab:
        for ktb in isikitab:
            norm_tfidf = normalizeArabic(ktb)
            norm_tf.append(norm_tfidf) 

    tfidf_doc = tfidf_vectorizer.fit_transform(norm_tf)

    tfidf_word=tfidf_vectorizer.get_feature_names()

    PIFQvectorizer = CountVectorizer()
    vectoreTF = PIFQvectorizer.fit_transform(norm_tf)
    featureTf = PIFQvectorizer.get_feature_names()

    cosimhasil = []
    cosimhasilnilai = []
    hasilqenilaidicosim = []
    namakitabcosim = []
    halamankitabcosim = []
    isikitabcosim = []
    inputandicosim = []

    pifqhasil = []
    pifqhasilnilai = []
    hasilqenilaidipifq = []
    namakitabpifq = []
    halamankitabpifq = []
    isikitabpifq = []
    inputandipifq = []

    gabunganhasil = []
    gabunganhasilnilai = []
    hasilqenilaidigabungan = []
    namakitabgabungan= []
    halamankitabgabungan = []
    isikitabgabungan = []
    inputandigabungan = [] 
    
    # MOST SIMILAR WE
    hasilQE = modelFT.wv.most_similar(text)
    hasilQE = [(strip_tashkeel(''.join(c for c in hasilQE[i][0] if not ud.category(c).startswith('P'))), hasilQE[i][1]) for i in range(len(hasilQE))]
    # print(hasilQE)

    cosim = []
    hasilpifq = []
    hasilgabungan = []
    nilaihasilgabungan = []
    nilaihasilpifq = []
    nilaicosim = []
    QEpakai = hasilQE[0:3]
    for i in QEpakai:
        tes=i[0] 
        tfidf_query = tfidf_vectorizer.transform([tes])
        cos=0.0
        # compute the similarity of the query to each document
        cos=cosine_similarity(tfidf_doc,tfidf_query)
        # print(type(cos))
        cosim.append(max(cos))
        nilaicosim.append(cos)

        # print('tfidf')
        # ================
        countTF = []
        s = ''.join(c for c in tes if not ud.category(c).startswith('P'))
        s = strip_tashkeel(s)
        for k in range(len(featureTf)):
            if featureTf[k] == s:
            # print(k)
                for j in range(vectoreTF.shape[0]):
                    countTF.append(vectoreTF[j,k]) 

        #PIFQ
        nilaipifq = []
        for k in countTF:
            if sum(countTF) == 0:
                nilaipifq.append(0)
            else:
                nilaipifq.append(1 + np.log10(1 + (k / sum(countTF))) + 0.5)
        nilaihasilpifq.append(nilaipifq)
        hasilpifq.append(max(nilaipifq))
        # print('pifq')

        # gabungan (combined score)
        nilaigabungan = []
        for k in range(vectoreTF.shape[0]):
            nilaigabungan.append(nilaipifq[k] * cos[k][0])
        nilaihasilgabungan.append(nilaigabungan)
        hasilgabungan.append(max(nilaigabungan)) 

#     print("======= hasil Cosim ===========")
    angka = 0
    for i in nilaicosim:
#         print(hasilQE[angka][0])
        for j in range(len(i)):
            if i[j] == cosim[angka]:
                panjangkitab= 0;
                for iterkitab in range(len(kitab)):
                    panjangkitab = panjangkitab + len(kitab[iterkitab])
                    if j <= panjangkitab:
                        tessplit = kitab[iterkitab][0].split(',')
#                         print('Nama Kitab {} halaman ke {}'.format(namakitab[iterkitab],tessplit[4]))
#                         print('isi kitab : ', tessplit[5])
                        cosimhasil.append(hasilQE[angka][0])
                        hasilqenilaidicosim.append(hasilQE[angka][1])
                        cosimhasilnilai.append(cosim[angka])
                        namakitabcosim.append(namakitab[iterkitab])
                        halamankitabcosim.append(tessplit[4])
                        isikitabcosim.append(tessplit[5])
                        # inputandicosim.append(kata)
                        break;
        angka += 1

#     print("====== hasil pifq ==========")
    angka = 0
    for i in nilaihasilpifq:
#         print(hasilQE[angka][0])
        for j in range(len(i)):
            if i[j] == hasilpifq[angka]:
                panjangkitab= 0;
                for iterkitab in range(len(kitab)):
                    panjangkitab = panjangkitab + len(kitab[iterkitab])
                    if j <= panjangkitab:
                        tessplit = kitab[iterkitab][0].split(',')
#                         print('Nama Kitab {} halaman ke {}'.format(namakitab[iterkitab],tessplit[4]))
#                         print('isi kitab : ', tessplit[5])
                        pifqhasil.append(hasilQE[angka][0])
                        pifqhasilnilai.append(hasilpifq[angka])
                        hasilqenilaidipifq.append(hasilQE[angka][1])
                        namakitabpifq.append(namakitab[iterkitab])
                        halamankitabpifq.append(tessplit[4])
                        isikitabpifq.append(tessplit[5])
                        # inputandipifq.append(kata)
                        break;
        angka += 1

#     print("============== Hasil Gabungan ===============")
    print("============== selesai ===============")
    angka = 0
    for i in nilaihasilgabungan:
#         print(hasilQE[angka][0])
        for j in range(len(i)):
            if i[j] == hasilgabungan[angka]:
                panjangkitab= 0;
                for iterkitab in range(len(kitab)):
                    panjangkitab = panjangkitab + len(kitab[iterkitab])
                    if j <= panjangkitab:
                        tessplit = kitab[iterkitab][0].split(',')
#                         print('Nama Kitab {} halaman ke {}'.format(namakitab[iterkitab],tessplit[4]))
#                         print('isi kitab : ', tessplit[5])
                        gabunganhasil.append(hasilQE[angka][0])
                        gabunganhasilnilai.append(hasilgabungan[angka])
                        hasilqenilaidigabungan.append(hasilQE[angka][1])
                        namakitabgabungan.append(namakitab[iterkitab])
                        halamankitabgabungan.append(tessplit[4])
                        isikitabgabungan.append(tessplit[5])
                        # inputandigabungan.append(kata)
                        break;
        angka += 1

    nilaihasilcosim = []
    for i in cosimhasilnilai:
        nilaihasilcosim.append(i[0])
Example No. 8
def train_model(clxn):
    sents = chain(*(tokenize(item.text) for item in clxn))

    model = FastText(sents, size=300)

    return model
Example No. 9
    x.append(ls[0])
    temp = []
    #print(ls[0])
    for j in tokenizer.tokenize(ls[0].decode('utf-8')):
        #print(j)
        temp.append(j)
    data.append(temp)
    lent.append(len(temp))
    y_test.append(int(ls[1]))
f.close()

pad_len = max(lent)

#model_FT = FastText(data, size=10, window=5, min_count=1, workers=5, sg=1,max_vocab_size=10000)
#model_FT.save("SG_fasttext.model")
model_FT = FastText.load("SG_fasttext.model")

print "SG FT model_done!"

voc = list(model_FT.wv.vocab)
print(len(voc))
XVAL = fit_transform(data)

print("Transformed!!")

x_train = []
x_train = XVAL[:m]
print(np.array(x_train).shape)
x_test = []
x_test = XVAL[m:]
print(np.array(x_test).shape)
Example No. 10
                                                     test_size=0.3,
                                                     random_state=2018)
x_pos_train, x_pos_test, __, ___ = train_test_split(x_pos,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=2018)
print('==== training BiLSTM-CRF ====')

# %%
# from gensim.models import Word2Vec
# model_ted = Word2Vec(sentences=vocab, size=50, window=3, min_count=1, workers=4, sg=1)
# model_ted.save('word2vec.model')
from gensim.models import FastText, Word2Vec
model_ted = FastText(sentences=vocab,
                     size=EMBED_DIM // 4,
                     window=3,
                     min_count=1,
                     workers=4)
# model_ted = Word2Vec(sent, size=2 * (EMBED_DIM // 5), window=3, min_count=1, workers=4, sg=0)
# model_ted = Word2Vec(sent, size=2 * (EMBED_DIM // 5), window=3, min_count=1, workers=4, sg=1)


def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab)
    # define weight matrix dimensions with all 0
    weight_matrix = numpy.zeros((vocab_size, 50))
    # step vocab, store vectors using the Tokenizer's integer mapping
    #wordvectors = embedding.wv
    for i, word in enumerate(vocab):
        #weight_matrix[i] = wordvectors.word_vec(word)
Example No. 11
sims = sorted(enumerate(sims), key=lambda item: -item[1])
df2 = pd.DataFrame(sims, columns=['attack_id', 'sim_score'])
for i in range(0, len(df2)):
    df2.text[i] = df.Description[df2.attack_id[i]]

from gensim.models import Word2Vec
model_hack = Word2Vec(sentences=texts,
                      size=100,
                      window=5,
                      min_count=2,
                      workers=4,
                      sg=0)
model_hack.wv.most_similar('researchers')

from gensim.models import FastText
model_hack = FastText(texts, size=100, window=5, min_count=5, workers=4, sg=1)
model_hack.wv.most_similar('researchers')

from elasticsearch import Elasticsearch
es = Elasticsearch(['http://cloudweb01.isi.edu/es/'],
                   http_auth=('effect', 'c@use!23'),
                   port=80)
print(es.info())

#search for blogs/news since certain timeframe.
results = es.search(index="effect/socialmedia",
                    scroll='1d',
                    size=20000,
                    body={
                        "query": {
                            "range": {
Example No. 12
    def error_correction_fasttext_with_retrain_wiki(self, model_type,
                                                    datasets_type, dataparam_1,
                                                    dataparam_2):
        total_error = 0
        total_error_to_repaired = 0
        total_repaired = 0
        if model_type == "Fasttext_All_Domain":  #every time it will load the pretrained model to test new wiki table
            error_correction = self.prepare_testing_datasets_wiki(
                dataparam_1, dataparam_2
            )  #dataparam_1 : json_list, dataparam_2: path of json_filelist
            model_fasttext = FastText.load("model/Fasttext_All_Domain.w2v")
        if model_type == "Fasttext_CV_Fold":
            model_fasttext = FastText.load("model/Fasttext_CV_Fold.w2v")
        if model_type == "Fasttext_Domain_Location":
            model_fasttext = FastText.load(
                "model/Fasttext_Location_Domain.w2v")
            error_correction = self.prepare_domain_testing_datasets_wiki(
                dataparam_1, dataparam_2, "location")
            total_error = self.calculate_total_error_wiki(
                dataparam_1, dataparam_2)
        if datasets_type == "wiki":
            train_data_rows = []
            for rf in dataparam_1:
                if rf.endswith(".json"):
                    try:
                        revision_list = json.load(
                            io.open(os.path.join(dataparam_2, rf),
                                    encoding="utf-8"))
                        one_item = revision_list[-1]
                        old_value = str(one_item[0]['old_value'].strip())
                        new_value = str(one_item[0]['new_value'].strip())
                        vicinity = one_item[0]['vicinity']
                        vicinity = remove_markup(str(vicinity))
                        vicinity = ast.literal_eval(vicinity)
                        #print('Before: ',vicinity)
                        train_vicinity_index = vicinity.index(old_value)
                        del vicinity[train_vicinity_index]
                        vicinity.append(new_value)
                        vicinity = [
                            x for x in vicinity
                            if not any(x1.isdigit() for x1 in x)
                        ]
                        vicinity = [x for x in vicinity if len(x) != 0
                                    ]  #remove empty item from list
                        #vicinity=[re.sub('[^a-zA-Z0-9.-]+', ' ', _) for _ in vicinity]
                        #print('After: ', vicinity)
                        #row=list(filter(None, row))
                        dirty_table = one_item[0]['dirty_table']
                        for index, row in enumerate(dirty_table):
                            if index == 0:
                                continue
                            shape = len(row)
                            row = remove_markup(str(row))
                            row = ast.literal_eval(row)
                            row = list(filter(None, row))
                            #remove all digit
                            row = [
                                x for x in row
                                if not any(x1.isdigit() for x1 in x)
                            ]
                            row = [x for x in row if len(x) != 0
                                   ]  #remove empty item from list
                            if row:
                                row = [
                                    re.sub('[^a-zA-Z0-9.-]+', ' ', _)
                                    for _ in row
                                ]
                                train_data_rows.append(row)
                    except Exception as e:
                        print('Exception: ', str(e))
            if train_data_rows:
                model_fasttext.build_vocab(train_data_rows, update=True)
                model_fasttext.train(sentences=train_data_rows,
                                     total_examples=len(train_data_rows),
                                     epochs=5)
            for error_value, actual_value in zip(error_correction['error'],
                                                 error_correction['actual']):
                try:
                    if model_type == "Fasttext_Domain_Location":
                        pass
                    else:
                        total_error = total_error + 1

                    if not any(x1.isdigit() for x1 in error_value):
                        total_error_to_repaired = total_error_to_repaired + 1
                        similar_value = model_fasttext.most_similar(
                            error_value)
                        #print('Actual value: ',  actual_value,'Most similar value of : ',error_value, ' ' , similar_value)
                        first, b = similar_value[0]
                        #print('Error : ', error_value, ' Fixed: ', first, ' Actual: ', actual_value)
                        first = first.strip()
                        actual_value = actual_value.strip()
                        if first == actual_value:
                            print('Error : ', error_value, ' Fixed: ', first,
                                  ' Actual: ', actual_value)
                            total_repaired = total_repaired + 1
                except:
                    continue
        print(total_error, total_error_to_repaired, total_repaired)
        model_type = model_type + ' retrain wiki '
        self.evaluate_model(model_type, total_error, total_error_to_repaired,
                            total_repaired)
Example No. 13
def loadEmbedding(filename):
    print("Loading Word Embedding...")
    it_model = FastText.load_fasttext_format(filename, full_model=False)
    print("...Done!")
    print("Building Sqlite DB....")
    return it_model
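
If only the vectors are needed (which is what full_model=False suggests), gensim 3.8+ also offers a dedicated loader that returns just the keyed vectors; the sketch below is an alternative, not the original code.

from gensim.models.fasttext import load_facebook_vectors

def loadEmbeddingVectors(filename):
    print("Loading Word Embedding...")
    kv = load_facebook_vectors(filename)  # FastTextKeyedVectors, no training state
    print("...Done!")
    return kv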
Example No. 14
def main(argv):
    topic = argv[0]
    filelang = argv[1]
    mainlang = argv[2]

    path = "/home/oyku/embeddings/fasttext/wiki." + filelang + ".align.vec"
    dictionary = load_vec(path)

    mono_path = "/home/oyku/monolingual_fasttext/cc." + filelang + ".300"
    mono_wv = fText.load_fasttext_format(mono_path)

    file = "/home/oyku/myversion/oov_words/" + mainlang + "/" + topic + "_" + filelang + ".txt"
    f = open(file, 'r', encoding='utf8')
    content = f.readlines()

    cont = set()

    for el in content:
        if not el.strip().isdigit():
            cont.add(el.strip())

    print("The number of OOVs: " + str(len(content)))
    print("The number of word OOVs: " + str(len(cont)))

    ## Morphologic
    morphs = {}
    for blob in cont:
        if not blob.isdigit():
            text = Text(blob)
            text.language = filelang
            morphemes = []
            for morp in text.morphemes:

                if len(morp) > 3 and morp in dictionary:
                    morphemes.append(morp)

            if len(morphemes) != 0:
                morphs[blob] = morphemes

    print("Morphologic check is over")

    left = cont.difference(morphs)

    ## Spelling
    spellex = {}
    for oov in left:
        if len(oov) > 2:
            possibles = []
            for inv in dictionary:
                if stringdist.rdlevenshtein(oov, inv) == 1:
                    possibles.append(inv)
            if len(possibles) == 1:
                spellex[oov] = possibles

    print("Spelling check is over")

    next_left = left.difference(spellex)

    fasttext_bin = {}
    for oov in next_left:
        try:
            similars = mono_wv.wv.most_similar(oov.strip())

            most_sim = ""
            for sim in similars:
                if sim[0] in dictionary and sim[1] > 0.5:
                    most_sim = sim[0]
                    break

            if most_sim != "":
                fasttext_bin[oov.strip()] = [most_sim]
        except:
            continue

    print("Fasttext check is over")

    print("-----------------------------------------------")

    print("Identified with morphologic analysis: " + str(len(morphs)))
    print("Identified with spell analysis: " + str(len(spellex)))
    print("Identified with Fasttext: " + str(len(fasttext_bin)))

    union = union3(morphs, spellex, fasttext_bin)
    print("Total: " + str(len(union)))

    saved_path = "/home/oyku/myversion/oov_matches/" + mainlang + "/" + topic + "_" + filelang + ".p"
    pickle.dump(union, open(saved_path, "wb"))
Example No. 15
def Updates():
    try:
        print("updating Doc2Vec")
        print(updating)
        a = stem.snowball.ArabicStemmer()
        stopwords_list = stopwords.words('arabic')
        df = pd.read_csv('textc-Copy1.csv', encoding='utf-8')
        df["contenu"].fillna("محتوى فارغ", inplace=True)
        df["article"].fillna("محتوى فارغ", inplace=True)
        y = df['ToF']
        df = df.drop('ToF', axis=1)
        text = []
        for i in range(df.shape[0]):
            x = nltk.tokenize.wordpunct_tokenize(df.contenu[i])
            text1 = [a.stem(word) for word in x]
        text.append(text1)
        titre = [
            a.stem(word) for word in df.article if word not in stopwords_list
        ]
        #doc2vec
        docs = []
        analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
        for i, te in enumerate(text):
            tags = [i]
            docs.append(analyzedDocument(te, tags))
        model = doc2vec.Doc2Vec(docs,
                                vector_size=300,
                                non_negative=True,
                                window=8,
                                min_count=1,
                                workers=4,
                                dm=1)
        from gensim.test.utils import get_tmpfile
        fname = get_tmpfile("doc2vec.model")
        model.save(fname)
        model = doc2vec.Doc2Vec.load(fname)
        print("updating fastext")

        class MyItera(object):
            def __iter__(self):
                for line in Corpus.article:
                    filtered_sentence = []
                    for w in tokenize(line):
                        if w not in stop_words:
                            filtered_sentence.append(w)
                    yield filtered_sentence

        class MyIter(object):
            def __iter__(self):
                for line in Corpus.contenu:
                    filtered_sentence = []
                    for w in tokenize(line):
                        if w not in stop_words:
                            filtered_sentence.append(w)
                    yield filtered_sentence

        model = FastText(size=150, window=3, min_count=1)
        model.build_vocab(sentences=MyIter())
        total_examples = model.corpus_count
        model.train(sentences=MyIter(),
                    total_examples=total_examples,
                    epochs=5)

    except:
        Update()
Example No. 16
def loadfasttext():
    fname = get_tmpfile("fasttext.model")
    model_fasttext = FastText.load(fname)
    return model_fasttext
Example No. 17
def make_model_fasttext(dataset, setting):
    pert_sent = list(perturbed_iterator(dataset, setting))
    model = FastText(pert_sent, workers=effective_n_jobs(-1))
    model.save(f'fasttext_models/{setting}.bin')
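
One caveat worth noting: despite the .bin extension, model.save() above writes gensim's native format, so the matching loader is FastText.load(), not load_fasttext_format(). A minimal counterpart sketch:

from gensim.models import FastText

def load_model_fasttext(setting):
    # gensim-native format, even though the file name ends in .bin
    return FastText.load(f'fasttext_models/{setting}.bin')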
Example No. 18
from gensim.models import FastText

print('Starting to load fasttext embeddings...')
path_to_fasttext_emb = '/tmp/wiki.ru.bin'

ft_model = FastText.load_fasttext_format(path_to_fasttext_emb)
print('Done!')

print(ft_model.wv['снег'])
Example No. 19
from fse import IndexedList
from fse.models.average import FAST_VERSION, MAX_WORDS_IN_BATCH
from fse.models import SIF
from gensim.models import FastText
import logging
logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)

w2v_model = "H:/Vietnamese word representations/Word_vector_data/VnNewsWord2Vec/VnNewsWord2Vec.bin"

lookup = FastText.load_fasttext_format(w2v_model, encoding='utf-8')

sentences = []
s = IndexedList(sentences)
print(len(s))

title_file = 'H:/Vietnamese word representations/News-titles-embedding/Data/tokenized_titles_cleaned'

with open(title_file, 'r', encoding='utf-8') as file:
    for line in file:
        sentences.append(line.split())

s = IndexedList(sentences)

model = SIF(lookup, workers=2)
model.train(s)

model.save('sent2vec')
Example No. 20
def main():
    # --- argument parsing ---
    (
        model_name, epochs, min_count, cores, checkpoint_every,
        cache_in_memory, lowercase, fasttext, args
    ) = parse_args(default_model_name='w2v_default', default_epochs=100)

    # --- init logging ---
    logger = init_logging(name=model_name, basic=True, to_file=True, to_stdout=False)
    log_args(logger, args)

    input_dir = join(SMPL_PATH, 'dewiki')
    model_dir = join(EMB_PATH, model_name)
    if not exists(model_dir):
        makedirs(model_dir)
    logger.info('model dir: ' + model_dir)

    t0 = time()
    if cache_in_memory:
        # needs approx. 25GB of RAM
        logger.info('cache data in memory')
        sentences = [s for s in Sentences(input_dir, logger, lowercase=lowercase)]
    else:
        sentences = Sentences(input_dir, logger, use_file_cache=True, lowercase=lowercase)
    gc.collect()

    # Model initialization
    logger.info('Initializing new model')
    if fasttext:
        model = FastText(
            size=300,
            window=5,
            min_count=min_count,
            sample=1e-5,
            negative=5,
            sg=1,
            seed=42,
            iter=epochs,
            workers=cores,
            min_n=3,
            max_n=6,
        )
    else:
        model = Word2Vec(
            size=300,
            window=5,
            min_count=min_count,
            sample=1e-5,
            negative=5,
            sg=1,
            seed=42,
            iter=epochs,
            workers=cores,
        )
    logger.info('Building vocab')
    model.build_vocab(sentences, progress_per=100_000)

    # Model Training
    epoch_saver = EpochSaver(model_name, model_dir, checkpoint_every)
    epoch_logger = EpochLogger(logger)

    logger.info('Training {:d} epochs'.format(epochs))
    model.train(
        sentences,
        total_examples=model.corpus_count,
        epochs=model.epochs,
        report_delay=60,
        callbacks=[epoch_logger, epoch_saver],
    )

    # saving model
    file_path = join(model_dir, model_name)
    logger.info('Writing model to ' + file_path)
    model.callbacks = ()
    model.save(file_path)

    t1 = int(time() - t0)
    logger.info("all done in {:02d}:{:02d}:{:02d}".format(t1//3600, (t1//60) % 60, t1 % 60))
Example No. 21
 def error_correction_fasttext_with_retrain_realworld(
         self, datasets_type, dataset_1, dataset_2, model_type):
     total_error = 0
     total_error_to_repaired = 0
     total_repaired = 0
     try:
         if model_type == "Fasttext_All_Domain":
             if datasets_type == "real_world":
                 error_correction = self.prepare_testing_datasets_real_world(
                     dataset_1,
                     dataset_2)  #dataset_1 clean data for real world
             model_fasttext = FastText.load("model/Fasttext_All_Domain.w2v")
         if model_type == "Fasttext_Domain_Location":
             error_correction = self.prepare_domain_testing_datasets_realworld(
                 dataset_1, dataset_2)  #dataset_1 clean data for real world
             total_error = self.calculate_total_error_realworld(
                 dataset_1, dataset_2)
             model_fasttext = FastText.load(
                 "model/Fasttext_Location_Domain.w2v")
     except Exception as e:
         print('Model Error: ', str(e))
     data_for_retrain = self.prepare_dataset_for_retrain_realworld(
         dataset_1, dataset_2)
     train_data_rows = []
     try:
         data_for_retrain = data_for_retrain.values.tolist()
         for row in data_for_retrain:
             row = list(map(str, row))
             row = list(filter(None, row))
             train_data_rows.append(row)
         if train_data_rows:
             if train_data_rows:
                 model_fasttext.build_vocab(train_data_rows, update=True)
                 model_fasttext.train(sentences=train_data_rows,
                                      total_examples=len(train_data_rows),
                                      epochs=5)
     except Exception as e:
         print("Exception from spell model : ", str(e))
     for error_value, actual_value in zip(error_correction['error'],
                                          error_correction['actual']):
         if model_type == "Fasttext_Domain_Location" and datasets_type == "real_world":
             pass
         else:
             total_error = total_error + 1
         try:
             if not any(x1.isdigit() for x1 in error_value):
                 total_error_to_repaired = total_error_to_repaired + 1
                 similar_value = model_fasttext.most_similar(error_value)
                 first, b = similar_value[0]
                 first = first.lower()
                 actual_value = actual_value.lower()
                 #print('Error : ', error_value, ' Fixed: ', first, ' Actual: ', actual_value)
                 first = first.strip()
                 actual_value = actual_value.strip()
                 if first == actual_value:
                     print('Error : ', error_value, ' Fixed: ', first,
                           ' Actual: ', actual_value)
                     total_repaired = total_repaired + 1
         except Exception as e:
             print('Error correction model: ', str(e))
             continue
     self.evaluate_model(model_type, total_error, total_error_to_repaired,
                         total_repaired)
Example No. 22
for ix in range(len(word_index)):
    word = word_index[ix]
    #     word_encode = word_index[ix].encode(('utf-8'))
    vec = word_vectors[word]
    for j in range(int(dim)):
        E[ix][j] = vec[j]

window_size = 10000
embedding = Embedding(100,
                      32,
                      weights=[E],
                      input_length=window_size,
                      trainable=False)
# https://gist.github.com/brandonrobertz/49424db4164edb0d8ab34f16a3b742d5

trained_model = FastText.load_fasttext_format('data_noun_token')

input_train = []

fin = open('data_noun_token.txt', 'r')
lines = fin.readlines()
print(len(lines))
for line in lines:
    word_vector = []
    for word in line.split():
        try:
            word_vector.append(trained_model[word])
        except:
            pass
    input_train.append(word_vector)
fin.close()
Example No. 23
        for word in tokens:
            if word not in diccionario:
                diccionario[word] = 1
            else:
                diccionario[word] += 1

print("numero de oraciones presentes en el corpus " + str(len(sentences)))
print("numero de palabras unicas " + str(len(diccionario)))

num_features = [20, 50, 100]  #Dimensionality of the resulting word vectors
min_word_count = 1  #Minimum word count threshold
num_workers = multiprocessing.cpu_count()  # Number of threads to run in parallel
context_size = 5  #Context window length
seed = 1  #Seed for the RNG, to make the result reproducible

for p in num_features:
    fasttext_model = FastText(
        sentences=sentences,
        size=p,
        window=context_size,
        min_count=min_word_count,
        workers=num_workers,
        sg=1  #skip-gram
    )

    fasttext_model.wv.save_word2vec_format(
        'model/fasttext_skip-gram_model_bioinfer_' + str(p) + '.txt',
        binary=False)

    del fasttext_model
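
The text files written above contain plain word vectors (save_word2vec_format does not keep subword information), so they are re-loaded as KeyedVectors; the query token below is only a placeholder.

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format(
    'model/fasttext_skip-gram_model_bioinfer_20.txt', binary=False)
print(wv.most_similar('protein', topn=3))  # placeholder query word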
Example No. 24
import numpy as np
import pickle
import utils

# Setting
max_size, max_seq_len = 50, 30
(modelno_to_goodsnm, modelno_to_goodsnms) = utils.model_basic_dict()
#print("✱ the # of classes(catalogs): ", len(modelno_to_goodsnm)) # Not-Toy set

# Loading..
# ├ ⑴Toy dict.(pre-stored @toyData.py)
# ├ ⑵Word Embedding Model (pre-trained @wordEmbedding.py)
# └ ⑶LSTM model (pre-stored @lstm.py)
with open('dictionary/toyDict.pickle', 'rb') as handle:
    toy_dict = pickle.load(handle)
fastText = FastText.load('model/FastText.bin')
preLSTM = load_model('model/lstm.h5')

LSTM = MyLSTM(toy_dict=toy_dict, embedding_model=fastText)
(X_train, Y_train, X_val, Y_val, X_test, Y_test,
 toy_train_dict) = LSTM.split_train_test()
X_test = sequence.pad_sequences(np.array(X_test), maxlen=max_seq_len)
index_dict = LSTM.create_index_dict()

#---------------------Prediction---------------------#
X_new = X_test[:50]
Y_new = Y_test[:50]
Y_hat = list(preLSTM.predict_classes(X_new))
print("\nY_hat: {},\nY_new: {}\n".format(Y_hat, Y_new))

for i in range(len(Y_hat)):
Example No. 25
def train_fasttext(corpus):
    model = FastText(size=9, window=2, min_count=1)
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus), epochs=10)

    model.save(r'models\fasttext.model')
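
A matching load-and-query sketch for the model saved above (the query word is a placeholder and is assumed to be covered by the training corpus or its n-grams):

from gensim.models import FastText

model = FastText.load(r'models\fasttext.model')
print(model.wv.most_similar('example', topn=5))  # placeholder query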
Example No. 26
def main():
    try:
        with open("token_counts.json") as infile:
            token_counts = json.load(infile)
    except:
        print(
            "Failed to load token counts - did you `python train.py build-corpus $INFILE` yet?"
        )
        sys.exit(1)

    if os.path.exists("vectors_out.sqlite"):
        os.unlink("vectors_out.sqlite")

    conn = sqlite3.connect("vectors_out.sqlite")
    conn.execute("""
        CREATE TABLE vector_meta (
            vector_float_bytes integer, 
            embedding_dimensions integer,
            vocab_size integer,
            oov_token text,
            build_parameters text
        )
    """)
    conn.execute(
        "INSERT INTO vector_meta VALUES (?, ?, ?, ?, ?)",
        ("float32", EMBED_DIM, VOCAB_SIZE, OOV_TOKEN, jsonify_build_params()),
    )
    conn.execute(
        "CREATE TABLE vectors (token text primary key, vector_bytes blob);")
    conn.execute(
        "CREATE TABLE frequencies (token text primary key, count integer);")

    print(f"Loaded {sum(token_counts.values())} words.  Training model...")

    ft_model = FastText(
        corpus_file="corpus.txt",
        size=EMBED_DIM,
        window=WINDOW_LEN,
        min_count=MIN_COUNT,
        max_vocab_size=VOCAB_SIZE,
        alpha=LEARNING_RATE,
        sg=SKIPGRAM,
        sorted_vocab=1,
        iter=EPOCHS,
        workers=NUM_WORKERS,
        negative=NEGATIVE_SAMPLES,
        ns_exponent=NEG_SAMP_DIST,
    )

    vectors = {}
    vocab = {}
    sorted_counts = sorted(token_counts.items(), key=lambda p: -p[1])
    idx = 2
    progress = tqdm(desc="Writing Vectors", total=VOCAB_SIZE)
    oov_vector = ft_model.wv[OOV_TOKEN].astype("float32").tobytes()  # serialize like the vectors below
    conn.execute("INSERT INTO vectors VALUES (?, ?)", (OOV_TOKEN, oov_vector))
    conn.execute("INSERT INTO frequencies VALUES (?, ?)", (OOV_TOKEN, 0))
    for token, count in sorted_counts:
        try:
            vector = ft_model.wv.get_vector(token)
        except:
            continue
        vectors[token] = vector
        vocab[token] = idx
        vector_bytes = vector.astype("float32").tobytes()
        conn.execute("INSERT INTO vectors VALUES (?, ?)",
                     (token, vector_bytes))
        conn.execute("INSERT INTO frequencies VALUES (?, ?)", (token, count))
        progress.update(1)
        idx += 1
        if idx >= VOCAB_SIZE:
            break

    print("Writing vectors_out.sqlite...")
    conn.commit()
    progress.close()
    print("Writing vocab.json...")
    with open("vocab.json", "w") as outfile:
        json.dump(vocab, outfile)
    print("Done!")
Example No. 27
                    if embedding == 'w2v':
                        model = Word2Vec(sents,
                                         size=dimension,
                                         window=word_window,
                                         workers=3,
                                         sg=skip_grams,
                                         hs=softmax,
                                         negative=negative_sample,
                                         iter=EPOCHS)
                        model.wv.save_word2vec_format(model_filename,
                                                      binary=False)
                        model.vocabulary.save(vocab_filename)
                    else:
                        model = FastText(sg=skip_grams,
                                         hs=softmax,
                                         size=dimension,
                                         window=word_window,
                                         workers=3,
                                         negative=negative_sample)
                        model.build_vocab(sentences=sents)
                        model.train(sentences=sents,
                                    total_examples=len(sents),
                                    epochs=EPOCHS)
                        model.save(model_filename)
                        # model.vocabulary.save(vocab_filename)

                    print("Model saved at " + model_filename)

                embedding_model = []
                if embedding == 'w2v':
                    embedding_model = Embedding.from_word2vec(model_filename)
                else:
Example No. 28
def generate_fasttext(messages: list):
    model = FastText(min_count=3, workers=8, window=3, min_n=1)
    model.build_vocab(sentences=messages)
    model.train(sentences=messages, total_examples=len(messages), epochs=10)
    return model
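
An illustrative call with made-up token lists; with min_count=3, only tokens occurring at least three times end up in the vocabulary, while the misspelled query still gets a vector through its character n-grams.

messages = [["hello", "world", "fasttext"],
            ["hello", "fasttext", "subwords"],
            ["world", "hello", "fasttext", "subwords"]]
model = generate_fasttext(messages)
print(model.wv.most_similar("helo", topn=2))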
Example No. 29
                    help='corpus')

parser.add_argument('--base',
                    '-b',
                    dest='base',
                    action='store',
                    required=True,
                    help='base')
args = parser.parse_args()

corpus_file = args.base + "/" + args.corpus
base = args.base

#modelSG = Word2Vec(sentencia, sg=1,window=1,min_count=1)
output = base + "/gSG_" + base + ".vec"
modelSG = Word2Vec(corpus_file=corpus_file, sg=1, window=1, min_count=1)
modelSG.wv.save_word2vec_format(output, binary=False)

#modelCB = Word2Vec(sentencia, sg=0,window=1,min_count=1)
output = base + "/gCB_" + base + ".vec"
modelCB = Word2Vec(corpus_file=corpus_file, sg=0, window=1, min_count=1)
modelCB.wv.save_word2vec_format(output, binary=False)

output = base + "/gFT_" + base + ".vec"
modelFT = FastText(window=1, min_count=1, sg=1)
modelFT.build_vocab(
    corpus_file=corpus_file)  # scan over corpus to build the vocabulary
total_words = modelFT.corpus_total_words  # number of words in the corpus
modelFT.train(corpus_file=corpus_file, total_words=total_words, epochs=5)
modelFT.wv.save_word2vec_format(output, binary=False)
Example No. 30
#characters
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print("Number of Labels: ", n_chars)

tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}

# Char Key:char -> Value:token_index
char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0

words_fast = FastText.load('model_fast30/model_fast.model')

#load pretrained word embedding
embedding_matrix = np.ones((len(word2idx), 100), dtype='float32')
embedding_matrix[0] = np.zeros(100, dtype='float32')
# with open('wiki-news-300d-1M.vec') as f:
for i in range(2, len(idx2word) - 2):
    embedding_matrix[i] = words_fast[idx2word[i]]
#         ordered_words_ft.append(s[0])
print('Found %s word vectors.' % len(embedding_matrix))

# for word, i in word2idx.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector
Example No. 31
"""
`gensim.models.FastText` 使用示例
"""
# gensim 示例
import gensim
import numpy as np
from gensim.test.utils import common_texts
from gensim.models.keyedvectors import FastTextKeyedVectors
from gensim.models._utils_any2vec import compute_ngrams, ft_hash
from gensim.models import FastText

# build a FastText model
sentences = [["Hello", "World", "!"], ["I", "am", "huay", "."]]
min_ngrams, max_ngrams = 2, 4  # n-gram range
model = FastText(sentences, size=5, min_count=1, min_n=min_ngrams, max_n=max_ngrams)

# vectors for each word and for any single n-gram can be retrieved in the same way
print(model.wv['hello'])
print(model.wv['<h'])
"""
[-0.03481839  0.00606661  0.02581969  0.00188777  0.0325358 ]
[ 0.04481247 -0.1784363  -0.03192253  0.07162753  0.16744071]
"""
print()

# word vectors and n-gram vectors are stored separately
print(len(model.wv.vectors))  # 7
print(len(model.wv.vectors_ngrams))  # 57
# gensim does not seem to provide a direct way to get all n-gram tokens

print(model.wv.vocab.keys())
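
Two further probes, sketched under the same gensim 3.x assumptions (compute_ngrams is internal API, so names are version-dependent):

# the character n-grams that back a word's vector
print(compute_ngrams('hello', min_ngrams, max_ngrams))
# an OOV token composed of known n-grams still yields a vector, but is not a vocabulary entry
print(model.wv['helloworld'])
print('helloworld' in model.wv.vocab)  # False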
Example No. 32
    word2id[i[0]] = int(i[1])
read_file.close()

# parameter configuration
epcho = 1
batch_size = 256
num_to_ev = 400  # evaluate locally after this many training batches
vocab_size = len(word2id)  # vocabulary size
embedding_dim = 256  # word embedding dimension
t_max_len = 22  # maximum title length
q_max_len = 11  # maximum query length
lr = 0.0001  # learning rate

# load the validation set
val_data = load_esim_data_and_labels(
    "/home/kesci/work/data/eval_data/19_eval.csv",
    word2id,
    q_max_len=q_max_len,
    t_max_len=t_max_len)

# each word is represented by concatenating a 128-dim word2vec vector and a 128-dim fastText vector into a 256-dim vector
ce = np.random.uniform(-1, 1, [vocab_size + 1, embedding_dim])
word2vec_model = Word2Vec.load("/home/kesci/word2vec.model")
fast_model = FastText.load("./data/fast_w2v.model")
ce[0] = np.zeros(embedding_dim)
for i in word2id:
    try:
        ce[word2id[i]] = np.concatenate((word2vec_model[i], fast_model[i]))
    except:
        print(i)
Example No. 33
    set_b = set(b["PaperTitle"].to_list() + b["abstract"].to_list())
    list_ab = list(set_a) + list(set_b)
    cores = multiprocessing.cpu_count()
    ps = PorterStemmer()
    with Pool(cores) as p:
        sentences = p.map(stem_lines, list_ab)
    # sentences = [stem_lines(i) for i in list_ab]
    # for s in tokenizer.pipe(list_ab):
    #     sentences.append([t.text for t in s if not t.check_flag(IS_PUNCT)])

    print("Train the FastText model")
    # word2vec_model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=cores, sg=0)
    # word2vec_model.wv.save_word2vec_format("word2vec_model", binary=True)  # binary format
    print(sentences[0])
    fastText_model = FastText(size=100,
                              window=5,
                              min_count=1,
                              sentences=sentences,
                              iter=5,
                              workers=cores)
    fastText_model.wv.save("fastText_model")
    print("finish training")
    word_vectors = fastText_model.wv  # KeyedVectors.load_word2vec_format("fastText_model")

    b['embedding'] = b.apply(
        lambda x: wv_papers(x['abstract'], x['PaperTitle'], word_vectors),
        axis=1)
    print(b.head())
    print(len(b))
    b.to_pickle("fasttext_embedding.pickle")
Example No. 34
import os
from gensim.models import FastText

EXP_HOME = "F:/MyWorks/Thesis Works/Crowdsource_Knowledge_Base/DeepGenQR/experiment"
# EXP_HOME = "C:/My MSc/ThesisWorks/BigData_Code_Search/DeepGenQR/experiment"
model_file = EXP_HOME + '/pymodel/github-deepcs'
model = FastText.load(model_file)

# print(len(model.wv))
# quit()

word_file = EXP_HOME + '/w2vec-data/words.txt'
vec_file = EXP_HOME + '/w2vec-data/github-vector.txt'
vec_lines = list()
words = open(word_file, 'r')
for word in words:
    try:
        if model.wv.__contains__(word.strip()):
            vector = model.wv[word.strip()]
            line = word.strip() + " " + ' '.join(str(x) for x in vector)
            vec_lines.append(line)
    except IOError:
        print("Could not found " + word)
        pass

output_file = open(vec_file, 'w')
for content in vec_lines:
    output_file.write("%s\n" % content)
output_file.close()
Example No. 35
class NP2vec:
    """
    Initialize the np2vec model, train it, save it and load it.
    """

    def is_marked(self, s):
        """
        Check if a string is marked.

        Args:
            s (str): string to check
        """
        return len(s) > 0 and s[-1] == self.mark_char

    def __init__(
            self,
            corpus,
            corpus_format='txt',
            mark_char='_',
            word_embedding_type='word2vec',
            sg=0,
            size=100,
            window=10,
            alpha=0.025,
            min_alpha=0.0001,
            min_count=5,
            sample=1e-5,
            workers=20,
            hs=0,
            negative=25,
            cbow_mean=1,
            iter=15,
            min_n=3,
            max_n=6,
            word_ngrams=1):
        """
        Initialize np2vec model and train it.

        Args:
          corpus (str): path to the corpus.
          corpus_format (str {json,txt,conll2000}): format of the input marked corpus; txt and json
          formats are supported. For json format, the file should contain an iterable of
          sentences. Each sentence is a list of terms (unicode strings) that will be used for
          training.
          mark_char (char): special character that marks NP's suffix.
          word_embedding_type (str {word2vec,fasttext}): word embedding model type; word2vec and
          fasttext are supported.
          np2vec_model_file (str): path to the file where the trained np2vec model has to be
          stored.
          binary (bool): boolean indicating whether the model is stored in binary format; if
          word_embedding_type is fasttext and word_ngrams is 1, binary should be set to True.
          sg (int {0,1}): model training hyperparameter, skip-gram. Defines the training
          algorithm. If 1, skip-gram is used; otherwise, CBOW is employed.
          size (int): model training hyperparameter, size of the feature vectors.
          window (int): model training hyperparameter, maximum distance between the current and
          predicted word within a sentence.
          alpha (float): model training hyperparameter. The initial learning rate.
          min_alpha (float): model training hyperparameter. Learning rate will linearly drop to
          `min_alpha` as training progresses.
          min_count (int): model training hyperparameter, ignore all words with total frequency
          lower than this.
          sample (float): model training hyperparameter, threshold for configuring which
          higher-frequency words are randomly downsampled, useful range is (0, 1e-5)
          workers (int): model training hyperparameter, number of worker threads.
          hs (int {0,1}): model training hyperparameter, hierarchical softmax. If set to 1,
          hierarchical softmax will be used for model training. If set to 0, and `negative` is non-
                        zero, negative sampling will be used.
          negative (int): model training hyperparameter, negative sampling. If > 0, negative
          sampling will be used, the int for negative specifies how many "noise words" should be
          drawn (usually between 5-20). If set to 0, no negative sampling is used.
          cbow_mean (int {0,1}): model training hyperparameter. If 0, use the sum of the context
          word vectors. If 1, use the mean, only applies when cbow is used.
          iter (int): model training hyperparameter, number of iterations.
          min_n (int): fasttext training hyperparameter. Min length of char ngrams to be used
          for training word representations.
          max_n (int): fasttext training hyperparameter. Max length of char ngrams to be used for
          training word representations. Set `max_n` to be less than `min_n` to avoid char
          ngrams being used.
          word_ngrams (int {0,1}): fasttext training hyperparameter. If 1, enriches word
          vectors with subword (ngram) information. If 0, this is equivalent to word2vec training.

        """

        self.mark_char = mark_char
        self.word_embedding_type = word_embedding_type
        self.sg = sg
        self.size = size
        self.window = window
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.min_count = min_count
        self.sample = sample
        self.workers = workers
        self.hs = hs
        self.negative = negative
        self.cbow_mean = cbow_mean
        self.iter = iter
        self.min_n = min_n
        self.max_n = max_n
        self.word_ngrams = word_ngrams

        if corpus_format == 'txt':
            self._sentences = LineSentence(corpus)
        elif corpus_format == 'json':
            with open(corpus) as json_data:
                self._sentences = json.load(json_data)
        elif corpus_format == 'conll2000':
            try:
                self._sentences = list()
                for chunked_sent in conll2000.chunked_sents(corpus):
                    tokens = list()
                    for chunk in chunked_sent:
                        if hasattr(chunk, '_label') and chunk._label == 'NP':
                            s = ''
                            for w in chunk:
                                s += w[0] + self.mark_char
                            tokens.append(s)
                        else:
                            if isinstance(chunk, nltk.Tree):
                                for w in chunk:
                                    tokens.append(w[0])
                            else:
                                tokens.append(chunk[0])
                    self._sentences.append(tokens)
            except Exception:
                print('Conll2000 dataset is missing from NLTK. See downloading details in the '
                      'README file')
        else:
            logger.error('invalid corpus format: ' + corpus_format)
            sys.exit(0)

        if word_embedding_type == 'fasttext' and word_ngrams == 1:
            # remove the marking character at the end for subword fasttext model training
            for i, sentence in enumerate(self._sentences):
                self._sentences[i] = [
                    w[:-1] if self.is_marked(w) else w for w in sentence]

        logger.info('training np2vec model')
        self._train()

    def _train(self):
        """
        Train the np2vec model.
        """
        if self.word_embedding_type == 'word2vec':
            self.model = Word2Vec(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter)

        elif self.word_embedding_type == 'fasttext':
            self.model = FastText(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter,
                min_n=self.min_n,
                max_n=self.max_n,
                word_ngrams=self.word_ngrams)
        else:
            logger.error(
                'invalid word embedding type: ' +
                self.word_embedding_type)
            sys.exit(0)

    def save(self, np2vec_model_file='np2vec.model', binary=False):
        """
        Save the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
        """
        if self.word_embedding_type == 'fasttext' and self.word_ngrams == 1:
            if not binary:
                logger.error(
                    "if word_embedding_type is fasttext and word_ngrams is 1, "
                    "binary should be set to True.")
                sys.exit(0)
            # not relevant to prune fasttext subword model
            self.model.save(np2vec_model_file)
        else:
            # prune non NP terms
            logger.info('pruning np2vec model')
            total_vec = 0
            vector_size = self.model.vector_size
            for word in self.model.wv.vocab.keys():
                if self.is_marked(word):
                    total_vec += 1
            logger.info(
                "storing %sx%s projection weights for NP's into %s" %
                (total_vec, vector_size, np2vec_model_file))
            with utils.smart_open(np2vec_model_file, 'wb') as fout:
                fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
                # store NP vectors in sorted order: most frequent NP's at the top
                for word, vocab in sorted(
                        iteritems(
                            self.model.wv.vocab), key=lambda item: -item[1].count):
                    if self.is_marked(word):
                        embedding_vec = self.model.wv.syn0[vocab.index]
                        if binary:
                            fout.write(
                                utils.to_utf8(word) + b" " + embedding_vec.tostring())
                        else:
                            fout.write(
                                utils.to_utf8(
                                    "%s %s\n" %
                                    (word, ' '.join(
                                        "%f" %
                                        val for val in embedding_vec))))

    @classmethod
    def load(cls, np2vec_model_file, binary=False, word_ngrams=0):
        """
        Load the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword (
            ngrams) information.

        Returns:
            np2vec model to load
        """
        if word_ngrams == 0:
            return KeyedVectors.load_word2vec_format(
                np2vec_model_file, binary=binary)
        elif word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        else:
            logger.error('invalid value for \'word_ngrams\'')
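
Finally, an end-to-end usage sketch of the NP2vec class above (file names are placeholders; per the save() logic, binary=True is required for the subword fasttext case):

np2vec = NP2vec('marked_corpus.txt', corpus_format='txt',
                word_embedding_type='fasttext', word_ngrams=1)
np2vec.save('np2vec_fasttext.model', binary=True)
model = NP2vec.load('np2vec_fasttext.model', binary=True, word_ngrams=1)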