Ejemplos de Word2VecEmbedding en Python, ejemplos de nlp4kor_pytorch.word2vec.word2vec_embedding.Word2VecEmbedding en Python

Ejemplo n.º 1

0

Mostrar archivo

def test_root(embedding: Word2VecEmbedding, top_n=1, metric=METRIC):
    """Print, for every word of a built-in sample corpus, what ``embedding.roots`` returns.

    Each non-blank line of the sample text is echoed after an empty line, then
    every space-separated token is passed to ``embedding.roots``; tokens with a
    non-empty result are printed as ``token -> result``.

    :param embedding: object providing ``roots(word, top_n=..., metric=...)``
    :param top_n: forwarded unchanged to ``embedding.roots``
    :param metric: forwarded unchanged to ``embedding.roots``
    :return: None; output goes to stdout
    """
    text = """서태지가 직접 활동한 것은 전무하며 음반 발매 이후 삼성전자에서 서태지 기념앨범 음원 14곡과 뮤직비디오, 미공개 동영상 등 스페셜 에디션에 내장된 MP3 ‘옙 P2 서태지 스페셜 에디션’을 판매했다.
이 사고의 여파로 첫 프로리그 경기인 삼성전자 칸과 KTF 매직엔스의 대결은 무관중으로 경기를 해야 했다.
하지만 2011년 피처 폰은 삼성전자의 천지인으로 통일하였다.
삼성전자 선수였던 이왕돈과의 사이에 둔 1남 1녀인 이광재(현 부산 KT 소닉붐)와 이유진(현 부천 하나외환이 모두 선수로 뛰고 있는 농구 가족이다.
2009년 상반기 드래프트에서 삼성전자 칸의 2차 지명으로 입단하였다.
삼성물산 경공업본부 뱅커스트러스트 인터내셔널 동경지점 국제자본시장부 부사장 삼성 회장 비서실 국제금융담당 이사 삼성전자 자금팀장 삼성생명 전무 삼성투자신탁운용 대표이사 사장 삼성증권 사장 우리금융 회장 겸 우리은행장 법무법인 세종 고문 KB금융지주 회장 차병원그룹 부회장 차바이오앤디오스텍 대표이사 회장 한국금융투자협회 회장 첫 사회생활은 삼성물산에서 시작했다.
이후 삼성전자와 삼성생명, 삼성투신, 삼성증권 등 다양한 계열사를 거치며 금융과 실물경제 모두를 고루 섭렵했다.
황의 법칙()은 한국의 삼성전자의 기술총괄 사장이었던 황창규가 제시한 이론이다.
현재 김앤장 법률사무소의 상임고문이며, 삼성전자 이사회의 사외이사이다.
에이스침대 1985년 삼성전자 삼성 세탁기 "점보 크리스탈" (Feat. 김민자) 1986년 ~ 1990년 동서식품 동서 차 시리즈 1986년 삼성전자 삼성 세탁기 "센서 크리스탈" (Feat. 김민자, 조용원, 이순재)
2013년 ~ 현재 : KT 대표이사 회장 2013년 ~ 현재 : 부산 KT 소닉붐 구단주 2013년 ~ 현재 : KT 위즈 구단주 2010년 지식경제R&D 전략기획단 단장 2008년 삼성전자 기술총괄 사장 2004년 ~ 2008년 삼성전자 반도체총괄 겸 메모리사업부 사장 1994년 삼성전자 반도체연구소 상무 1987년 미국 인텔사 자문 1985년 미국 스탠퍼드대 전기공학과 책임연구원
2006년 대한민국 최고과학기술인상 2005년 홍콩 아시아머니 선정 아시아 최고경영자 1994년 삼성전자 특별개발포상
2010년 삼성전자 YEPP CF 모델 (With 장근석) 2012년 KBS2 《드림하이 2》 - 나나 역 2014년 Mnet 《No Mercy》 2015년 MBC 《나는 가수다 3》
훈민정음은 삼성전자에서 개발한 윈도용 한글 워드 프로세서로, 자사 컴퓨터에 번들 소프트웨어로 훈민정음을 제공하여 사용자들을 얻었다.
2014년 3분기 기준으로 세계에서 휴대 전화 시장 점유율이 가장 높은 업체는 삼성전자이며 24.7%의 점유율로 작년보단 많이 떨어졌다.
2012년 1분기 기준으로 삼성전자가 노키아를 제치고 세계 휴대 전화 시장 점유율 1위에 올랐다.
미국 시장조사업체 스트래티지어낼리틱스(SA)는 26일(현지시간) “삼성전자가 1분기에 휴대 전화 9350만 대(점유율 25%)를 판매해 14년간 1위를 지킨 노키아를 눌렀다”고 발표했다.
하지만 2011년 피처 폰은 삼성전자의 천지인으로 통일하였다.
"""

    # Strip every line lazily; filter(None, ...) then drops the ones that
    # became empty strings.
    stripped_lines = (raw_line.strip() for raw_line in text.split('\n'))
    for sentence in filter(None, stripped_lines):
        print()
        print(sentence)
        for token in sentence.split(' '):
            found = embedding.roots(token, top_n=top_n, metric=metric)
            if len(found) == 0:
                continue  # nothing to report for this token
            print(token, '->', found)

Ejemplo n.º 2

0

Mostrar archivo

def test_relation(embedding: Word2VecEmbedding, top_n=1, metric=METRIC):
    """
    Semantic & syntactic analogy task: for each (a, b, x) triple, print what
    ``embedding.relation_ab2xy`` answers for "a vs b = x vs ?".

    :param embedding: object providing ``relation_ab2xy(a, b, x, top_n=..., metric=...)``
    :param top_n: forwarded to ``relation_ab2xy``, default: 1
    :param metric: forwarded to ``relation_ab2xy``, default: METRIC
                   (the original notes list 'cosine' or 'euclidean' as choices)
    :return: None; results are printed to stdout
    """
    analogies = [
        ('한국', '서울', '일본'),
        ('한국', '서울', '미국'),  # yields Sweden's capital rather than the US capital.
        ('한국', '서울', '중국'),  # original note: yields the UK's capital rather than the US capital (presumably China's was meant).
        ('서울', '서울시', '부산'),
        ('서울', '서울시', '인천'),

        ('왕', '여왕', '남자'),
        ('소년', '남자', '소녀'),

        ('유치원', '초등학교', '중학교'),

        ('태어난', '태어났다.', '발매된'),
        ('태어난', '태어났다.', '공개된'),
        # Disabled case ('시작', '시작했다.', '사과'): fails because of a homonym.

        ('공개', '공개했다.', '발매'),
        ('공개', '공개하고', '발매'),

        ('연구', '연구를', '공부'),
        ('연구', '연구를', '축구'),
        ('스포츠', '스포츠를', '공부'),  # gives better results than the 연구-연구를 pair.
        ('스포츠', '스포츠를', '축구'),  # gives better results than the 연구-연구를 pair.
    ]

    for a, b, x in analogies:
        answer = embedding.relation_ab2xy(a, b, x, top_n=top_n, metric=metric)
        print(f"{a} vs {b} = {x} vs {answer}")

Ejemplo n.º 3

0

Mostrar archivo

def test_suffix(embedding: Word2VecEmbedding, top_n=3, metric=METRIC):
    """Print what ``embedding.add_suffix`` returns for every root/suffix pair.

    All combinations of a fixed set of roots and suffixes are tried, root by
    root; only combinations with a non-empty result are printed.

    :param embedding: object providing ``add_suffix(root, suffix, top_n=..., metric=...)``
    :param top_n: forwarded unchanged to ``embedding.add_suffix``
    :param metric: forwarded unchanged to ``embedding.add_suffix``
    :return: None; output goes to stdout
    """
    roots = ('출시', '공부', '연구', '시작', '종료', '사랑', '살인', '소통')
    suffixes = ('했다.', '하고', '하여', '했지만', '하는', '하자')

    # Same visiting order as two nested loops: all suffixes for the first
    # root, then all suffixes for the second root, and so on.
    combinations = ((root, suffix) for root in roots for suffix in suffixes)
    for root, suffix in combinations:
        candidates = embedding.add_suffix(root, suffix, top_n=top_n, metric=metric)
        if len(candidates) == 0:
            continue
        print(root, f'+ {suffix} ->', candidates)

Ejemplo n.º 4

0

Mostrar archivo

def test_doesnt_match(embedding: Word2VecEmbedding, metric=METRIC):
    """Print what ``embedding.doesnt_match`` returns for several fixed word groups.

    Each group is passed as a tuple together with ``top_n=1`` and the given
    ``metric``; the group and the returned value are printed on one line.

    :param embedding: object providing ``doesnt_match(words, top_n=..., metric=...)``
    :param metric: forwarded unchanged to ``embedding.doesnt_match``
    :return: None; output goes to stdout
    """
    word_groups = (
        ('한국', '미국', '중국', '서울'),
        ('초등학교', '중학교', '고등학교', '학원'),
        ('김영삼', '노태우', '김대중', '노무현', '이건희'),
        ('아이폰', '아이패드', '안드로이드', '맥북'),
        ('삼성전자', 'LG전자', '애플', '네이버'),
        ('기독교', '천주교', '불교', '학교'),
        ('코카콜라', '펩시', '포도주'),
        ('프린터', '마우스', '키보드', '모니터', '선풍기'),
        ('냉장고', '선풍기', '세탁기', '컴퓨터'),
    )

    for words in word_groups:
        outlier = embedding.doesnt_match(words, top_n=1, metric=metric)
        print(words, '->', outlier)

Ejemplo n.º 5

0

Mostrar archivo

        #     print(w, i)

        sentiments = []
        for word, weight in zip(words, weights):
            if weight > 0.01:
                pos_sim = embedding.similarity_vec(embedding[word], pos_vec, metric=metric)
                neg_sim = embedding.similarity_vec(embedding[word], neg_vec, metric=metric)

                if pos_sim > 0 and neg_sim > 0 and abs(pos_sim - neg_sim) > 0.01:
                    # print(word, pos_sim > neg_sim, pos_sim, neg_sim)
                    print(f'{pos_sim - neg_sim:.3f} (weight: {weight:.2f}) {word}')  # , f'{pos_sim:.2f}', f'{neg_sim:.2f}')
                    sentiments.append(weight * numpy.sign(pos_sim - neg_sim))

        print(f'sentiment: {numpy.mean(sentiments):.3f}')


if __name__ == '__main__':
    # Script entry point: load one specific trained embedding and run the
    # print-based checks against it.
    # Alternative source: embedding_file = Word2VecEmbedding.DEFAULT_FILE
    embedding_file = os.path.join(WORD2VEC_DATA_DIR, 'ko.wikipedia.org.sentences.token_word.vocab_1e+05.vocab.window_1.side_both.corpus.embed_300.batch_500.neg_100.subsample_1e-05.lr_1e-04.decay_0.0.epoch_20.embedding')

    print()
    print(embedding_file)

    embedding = Word2VecEmbedding.load(embedding_file)

    # Each check is preceded by an empty line, in this fixed order.
    for run_check in (test_doesnt_match, test_relation, test_root):
        print()
        run_check(embedding)

Ejemplo n.º 6

0

Mostrar archivo

    def train(self, iterations: int, batch: int, embedding: Word2VecEmbedding,
              args: argparse.Namespace) -> str:
        """Train for up to ``iterations`` more epochs, saving the embedding after each good epoch.

        Every epoch iterates ``self.dataloader``, computes the loss with
        ``self.sgns`` and takes one ``self.optim`` step per batch. After the
        epoch, the mean per-batch loss is compared with the best mean seen so
        far in this call:

        * if it is worse than the best, or is not finite (inf, -inf, NaN),
          training stops without saving;
        * otherwise the embedding is saved via ``embedding.save`` and the best
          loss is updated.

        A ``RuntimeError`` raised by ``self.sgns`` is logged and treated as a
        diverged epoch (training stops).

        :param iterations: maximum number of epochs to run in this call
        :param batch: batch size; used here only to compute the number of
                      batches per epoch for progress logging
        :param embedding: object whose ``save(idx2vec=..., filepath=...)`` is
                          called after each good epoch
        :param args: namespace passed to ``Word2VecEmbedding.get_filenpath``;
                     its ``epoch`` attribute is overwritten before each save
        :return: the value returned by the last ``embedding.save`` call
                 (presumably the saved file path), or None if nothing was saved
        """
        batches_in_epoch = int(numpy.ceil(
            len(self.dataloader.dataset) / batch))
        total_batches = batches_in_epoch * iterations
        nth_total_batch = 0
        log.info(f'batches_in_epoch: {batches_in_epoch}')
        log.info(f'total_batches: {total_batches}')

        watch = WatchUtil(auto_stop=False)
        watch.start()
        best_loss = float("inf")
        first_epoch, last_epoch = self.epoch + 1, self.epoch + iterations + 1
        last_embedding_file = None

        log.info(Word2VecEmbedding.get_filenpath(args))
        for self.epoch in range(first_epoch, last_epoch):
            log.info(f"[e{self.epoch:2d}] {self}")
            loss_list = []
            # Read the rate before the batch loop so the epoch summary below
            # can always format it, even if the very first batch fails or the
            # dataloader yields nothing.
            lr = PytorchUtil.get_learning_rate(self.optim)
            for nth, (iword, owords) in enumerate(self.dataloader, 1):
                try:
                    loss = self.sgns(iword, owords)
                except RuntimeError as error:
                    # Do not swallow the failure silently; record why the
                    # epoch is being abandoned, then mark it as diverged.
                    log.info(
                        f"[e{self.epoch:2d}][b{nth:5d}/{batches_in_epoch:5d}] RuntimeError: {error}"
                    )
                    loss_list = [float('-inf')]
                    break

                self.optim.zero_grad()
                loss.backward()
                self.optim.step()
                # if nth_batch == 1 and self.scheduler is not None and self.epoch >= self.decay_start_epoch:  # TODO: TEST
                #     self.scheduler.step()

                if self.learning_decay != 0:
                    PytorchUtil.set_learning_rate(self.optim,
                                                  self.epoch,
                                                  gamma=self.learning_decay,
                                                  base_lr=self.init_lr,
                                                  min_lr=1e-10,
                                                  decay_start=2,
                                                  decay_interval=3)

                lr = PytorchUtil.get_learning_rate(self.optim)

                # The second dimension of owords is what the original code
                # calls `negatives`; the loss is normalised by it.
                _, negatives = owords.size()
                # NOTE(review): `loss.data[0]` is the pre-0.4 PyTorch way of
                # reading a scalar loss; newer versions need `loss.item()`.
                # Left unchanged because the targeted version is not visible.
                real_loss = loss.data[0] / float(negatives)

                loss_list.append(real_loss)

                nth_total_batch += 1
                progressed = nth_total_batch / total_batches
                seconds_per_batch = float(
                    watch.elapsed()) / float(nth_total_batch)
                remain_batches = total_batches - nth_total_batch
                remain_secs = int(seconds_per_batch * remain_batches)

                # Log the first and last batch of an epoch and every 1000th.
                if nth == 1 or nth == batches_in_epoch or nth % 1000 == 0:
                    log.info(
                        f"[e{self.epoch:2d}][b{nth:5d}/{batches_in_epoch:5d}][{progressed*100:.1f}% remain: {DateUtil.secs_to_string(remain_secs)}][window: {self.window}][lr: {lr:.0e}] loss: {real_loss:.7f}"
                    )

            total_loss = numpy.mean(loss_list)
            log.info(
                f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss: {total_loss:.7f}, best_loss: {best_loss:.7f}"
            )
            # Stop when the epoch is worse than the best one or has diverged.
            # isfinite() covers inf, -inf and NaN (NaN also arises from the
            # mean of an empty loss list).
            if total_loss > best_loss or not numpy.isfinite(total_loss):
                log.info('')
                log.info(
                    f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss > best_loss BREAK"
                )
                log.info('')
                break
            else:
                # Track the improvement so the early-stop test above compares
                # against the best epoch so far. (The comparison used to be
                # inverted, so best_loss never left its initial infinity.)
                if total_loss < best_loss:
                    best_loss = total_loss
                log.info(
                    f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save()..."
                )
                args.epoch = self.epoch
                # Save this trainer's own vectors rather than reading them
                # from a module-level `trainer` global.
                last_embedding_file = embedding.save(
                    idx2vec=self.embedding,
                    filepath=Word2VecEmbedding.get_filenpath(args))
                log.info(
                    f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save() OK. {os.path.basename(embedding.filepath)}"
                )
        return last_embedding_file

Ejemplo n.º 7

0

Mostrar archivo

    try:
        log.info(f'load {args.corpus_file} ...')
        watch.start()
        corpus = Word2VecCorpus.load(filepath=args.corpus_file)
        log.info(
            f'load {args.corpus_file} OK. (elapsed: {watch.elapsed_string()})')
        log.info(corpus.vocab)

        if len(corpus.vocab) > 1e5:  # out of memory (11GB GPU memory)
            args.device_no = None

        log.info('')
        log.info(args)
        log.info('')

        embedding_file = Word2VecEmbedding.get_filenpath(args)
        if os.path.exists(embedding_file):
            log.info(f'embedding_file: {embedding_file} exists. skipped')
            if is_server():
                SlackUtil.send_message(
                    f'embedding_file: {embedding_file} exists. skipped')
                exit()

        log.info('')

        log.info(f'Word2VecTrainer() ...')
        watch.start()
        trainer = Word2VecTrainer(vocab=corpus.vocab,
                                  corpus=corpus,
                                  batch=args.batch,
                                  device_no=args.device_no,

Ejemplo n.º 8

0

Mostrar archivo

def word2vec_tensorboard(embedding_file_list,
                         top_n=1e5,
                         output_dir=TENSORBOARD_LOG_DIR):
    """Export saved embeddings as TensorBoard projector data under ``output_dir``.

    All existing files in ``output_dir`` are deleted first. For every
    embedding file that exists, the function loads it with
    ``Word2VecEmbedding.load``, registers its vectors as a ``tf.Variable`` in
    a projector config and writes a ``<name>.tsv`` metadata file with one
    ``spell<TAB>freq`` row per word. Finally it saves one checkpoint (named
    after the last loaded embedding) and rewrites the ``checkpoint`` and
    ``projector_config.pbtxt`` files so they refer to ``.`` instead of
    ``output_dir``.

    :param embedding_file_list: iterable of embedding file paths; missing
                                files are logged and skipped
    :param top_n: if greater than 0, keep only the first ``top_n`` rows of
                  each embedding; converted to ``int`` (the default is
                  written as the float ``1e5``)
    :param output_dir: directory receiving the TensorBoard files
    :return: None
    """
    # Slice bounds must be integers, and the value also ends up in file and
    # variable names, so normalise the float default once.
    top_n = int(top_n)

    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(output_dir):
        os.remove(os.path.join(output_dir,
                               filename))  # remove old tensorboard files

    config = projector.ProjectorConfig()
    embedding_list = []
    name = None
    for embedding_file in embedding_file_list:
        if not os.path.exists(embedding_file):
            log.info(f'{embedding_file} not exists. skipped.')
            continue

        embedding = Word2VecEmbedding.load(embedding_file)

        # Derive the variable/file name from the file name: drop '+' signs
        # and any leading underscores.
        name = os.path.basename(embedding_file.replace('+', '')).lstrip('_')

        idx2vec = embedding.idx2vec
        idx2word, idx2freq = embedding.idx2word, embedding.idx2freq
        if top_n > 0:
            name += f'.top_n_{top_n}'
            idx2vec = idx2vec[:top_n]
            idx2word = idx2word[:top_n]
            idx2freq = idx2freq[:top_n]

        embedding_var = tf.Variable(idx2vec, name=name)
        embedding_list.append(embedding_var)
        # Separate name for the projector entry so it does not shadow the
        # loaded Word2VecEmbedding above.
        projector_entry = config.embeddings.add()
        projector_entry.tensor_name = embedding_var.name
        projector_entry.metadata_path = os.path.join(output_dir, f'{name}.tsv')

        log.info('')
        log.info(f'{embedding_file} loaded.')
        log.info(
            f'embedding_var.name: {embedding_var.name} shape: {embedding_var.shape}'
        )
        log.info(f'embedding.metadata_path: {projector_entry.metadata_path}')
        # Words may be non-ASCII, so do not depend on the platform's default
        # encoding for the metadata file.
        with open(projector_entry.metadata_path, 'wt', encoding='utf-8') as out_f:
            out_f.write('spell\tfreq\n')
            for spell, freq in zip(idx2word, idx2freq):
                out_f.write(f'{spell}\t{freq:.7f}\n')

    if not embedding_list:
        # Nothing was loaded: there is no variable to save and no name for
        # the checkpoint, so stop here instead of failing further down.
        log.info('no embedding file loaded. nothing to export.')
        return

    summary_writer = tf.summary.FileWriter(output_dir)
    projector.visualize_embeddings(summary_writer, config)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(var_list=embedding_list)
        checkpoint_file = os.path.join(output_dir, f'{name}.ckpt')
        saver.save(sess, checkpoint_file, global_step=None)
        log.info(f'checkpoint_file: {checkpoint_file}')

    # change absolute path -> relative path
    for filename in ['checkpoint', 'projector_config.pbtxt']:
        filepath = os.path.join(output_dir, filename)

        with open(filepath, 'rt') as f:
            content = f.read()
        # Opening with 'wt' truncates the file, so no explicit remove is needed.
        with open(filepath, 'wt') as f:
            f.write(content.replace(output_dir, '.'))