Example #1
def train(sentences):
    print("starting to train!")
    # train model
    if args.train_pairs and args.relevant_selects:
        min_count = args.min_count * 5
    elif args.train_pairs:
        min_count = args.min_count * 10
    else:
        min_count = args.min_count

    if "word2vec" in args.gensim_model_name:
        model = Word2Vec(sentences,
                         size=args.embedding_size,
                         window=20,
                         sg=args.skipgram,
                         workers=16,
                         min_count=min_count)
    elif "fast" in args.gensim_model_name:
        model = FastText(sentences,
                         size=args.embedding_size,
                         window=20,
                         sg=args.skipgram,
                         workers=16,
                         min_count=min_count)
    else:
        raise ValueError("unsupported gensim_model_name: " + args.gensim_model_name)

    # summarize the trained model
    print(model)
    # trim unneeded model memory (L2-normalize vectors in place) = use (much) less RAM
    model.init_sims(replace=True)
    # save model
    model.save(args.data_dir + args.model_name)
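Because init_sims(replace=True) discards the un-normalized vectors, the saved model can only be queried afterwards, not trained further. A minimal sketch of loading it back (hypothetical path and probe word):

from gensim.models import Word2Vec

model = Word2Vec.load("data/word2vec.model")  # i.e. args.data_dir + args.model_name
print(model.wv.most_similar("example", topn=5))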
Example #2
def test_fasttext_similar_ir():
    model = FastText([DEFAULT_ANALYZER(doc) for doc in DOCUMENTS], min_count=1)
    model.init_sims(replace=True)
    model.save('model_ft')

    model = FastText.load('model_ft')
    match_op = Matching()
    wcr = Word2VecRetrieval(model.wv, analyzer=DEFAULT_ANALYZER)
    retrieval = Retrieval(wcr, matching=match_op)  #, labels=['1번', '2번', '3번', '4번', '5번', '6번', '7번', '8번'])
    retrieval.fit(DOCUMENTS)

    start = time.time()  # record the start time
    result, score = retrieval.query("안냥")
    print("time :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간
    print(result)
    print(score)
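A short aside on why FastText suits a fuzzy query like "안냥" (a playful misspelling of 안녕, "hello"): gensim's FastText composes vectors for out-of-vocabulary tokens from character n-grams. A sketch, assuming gensim 3.x:

print('안냥' in model.wv.vocab)  # False if the token never occurred in training
print(model.wv['안냥'][:5])      # a vector is still returned, built from n-grams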
Example #3
def train_fasttext(tokens):
    ft_model = FastText(min_count=10,
                        window=5,
                        size=150,
                        negative=10,
                        alpha=0.03,
                        min_alpha=0.0007,
                        sample=6e-5,
                        sg=0)
    ft_model.build_vocab(tokens)
    print(ft_model.corpus_count)
    ft_model.train(tokens,
                   total_examples=ft_model.corpus_count,
                   epochs=300,
                   report_delay=1)
    ft_model.init_sims(replace=True)
    write_pickle(ft_model, 'ft_model2')
    return ft_model
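write_pickle is a project helper; a matching loader might look like this (read_pickle is assumed, not part of the original code):

import pickle

def read_pickle(name):
    # assumed counterpart to write_pickle above
    with open(name, 'rb') as f:
        return pickle.load(f)

ft_model = read_pickle('ft_model2')
print(ft_model.wv.most_similar('example', topn=3))  # hypothetical probe word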
Example #4
    def testFastText(self):
        class LeeReader(object):
            def __init__(self, fn):
                self.fn = fn

            def __iter__(self):
                with smart_open(self.fn, 'r', encoding="latin_1") as infile:
                    for line in infile:
                        yield line.lower().strip().split()

        model = FastText(LeeReader(datapath('lee.cor')))
        model.init_sims()
        index = self.indexer(model, 10)

        self.assertVectorIsSimilarToItself(model.wv, index)
        self.assertApproxNeighborsMatchExact(model, model.wv, index)
        self.assertIndexSaved(index)
        self.assertLoadedIndexEqual(index, model)
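In gensim's similarity test suite, self.indexer is typically the AnnoyIndexer class; a standalone sketch of the same flow under that assumption (gensim 3.x import path):

from gensim.similarities.index import AnnoyIndexer

model = FastText(LeeReader(datapath('lee.cor')))  # LeeReader as defined above
model.init_sims()
index = AnnoyIndexer(model, 10)  # 10 trees
word = model.wv.index2word[0]
print(model.wv.most_similar(word, topn=5, indexer=index))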
Example #5
def main():
    parser = argparse.ArgumentParser(description='Trains word embeddings')
    parser.add_argument('--config_file',
                        type=str,
                        default='configs/echoes_local.config',
                        help='location of the configuration file')
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config_file)

    print(config['word']['model_dir'])

    sentences = Sentences(input_file=config['general']['corpus_file'])
    try:
        shutil.rmtree(config['word']['model_dir'])
    except FileNotFoundError:
        pass
    os.mkdir(config['word']['model_dir'])

    logging.info('Building fasttext model...')
    model = FastText(sentences,
                     size=int(config['word']['size']),
                     window=int(config['word']['window']),
                     min_count=int(config['word']['min_count']),
                     iter=int(config['word']['epochs']),
                     workers=int(config['word']['workers']))
    model.init_sims()
    model.save(f"{config['word']['model_dir']}/ft_model")
    logging.info(f"Saved fasttext model under {config['word']['model_dir']}")

    logging.info('Building word2vec model...')
    model = Word2Vec(sentences,
                     size=int(config['word']['size']),
                     window=int(config['word']['window']),
                     min_count=int(config['word']['min_count']),
                     iter=int(config['word']['epochs']),
                     workers=int(config['word']['workers']))
    model.init_sims()
    annoy_index = AnnoyIndexer(model, 100)
    annoy_index.save(f"{config['word']['model_dir']}/annoy_model")
    model.save(f"{config['word']['model_dir']}/w2v_model")
    logging.info(f"Saved word2vec model under {config['word']['model_dir']}")
Example #6
            # Split a review into parsed sentences.
            sentences += KaggleWord2VecUtility.review_to_sentences(
                review, tokenizer, remove_stopwords=True)
        except Exception:
            continue

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', \
                        level=logging.INFO)

    num_features = int(sys.argv[1])  # Word vector dimensionality
    min_word_count = 20  # Minimum word count
    num_workers = 40  # Number of threads to run in parallel
    context = 10  # Context window size
    downsampling = 1e-3  # Downsample setting for frequent words

    print("Training FastText model...")
    # Train FastText model.
    model = FastText(sentences, workers=num_workers, hs=0, sg=1, negative=10, iter=25, \
                     size=num_features, min_count=min_word_count, \
                     window=context, sample=downsampling, seed=1)

    model_name = str(num_features) + "features_" + str(
        min_word_count) + "minwords_" + str(context) + "context_len2alldata"
    model.init_sims(replace=True)
    # Save FastText model.
    print("Saving FastText model...")
    model.save(model_name)
    endmodeltime = time.time()

    print("time : ", endmodeltime - start)
Example #7
class Char2VecFeatureExtractor(VectorFeatureExtractor):
    def __init__(self):
        super().__init__()
        self.key = KEY
        self.config = Char2VecModelConfiguration()

    def fit(self, X, y=None, size=100, min_count=5, workers=1, window=5, sample=1e-3, skipgram=False, min_n=3, max_n=6):
        """ Trains a Word2vec model on given documents.
            Each document should represent a sentence.
        Args:
            X: list(Document | AnnotatedDocument | list(str))
            y: optional labels
            size: Size of embeddings to be learnt (Default 100), i.e. word vector dimensionality
            min_count: Minimum word count. Ignore words with number of occurrences below this (Default 5).
            workers: Number of threads to run in parallel
            window: Context window size
            sample: Threshold for downsampling higher-frequency words (Default 0.001)
            skipgram: Use skip-gram if True and CBOW otherwise
            min_n: min length of char ngrams (Default 3)
            max_n: max length of char ngrams (Default 6)
        """
        log.info("Checking parameters...")
        self.config.set_parameters({
            "size": size,
            "min_count": min_count,
            "workers": workers,
            "window": window,
            "sample": sample,
            "min_n": min_n,
            "max_n": max_n
        })
        self.config.validate()
        # Get sentences as lists of tokens
        log.info("Tokenizing {} documents...".format(len(X)))
        sentences = []
        for idx, doc in enumerate(X):
            sentences.append(document_to_tokens(doc))
            log_progress(log, idx, len(X))
        # Initialize and train the model (this will take some time)
        log.info("Training FastText on {} sentences...".format(len(X)))
        self.model = FastText(
            sentences,
            workers=self.config.get_parameter("workers"),
            size=self.config.get_parameter("size"),
            min_count=self.config.get_parameter("min_count"),
            window=self.config.get_parameter("window"),
            sample=self.config.get_parameter("sample"),
            sg=1 if skipgram else 0,
            min_n=self.config.get_parameter("min_n"),
            max_n=self.config.get_parameter("max_n"))

        # If you don't plan to train the model any further, calling
        # init_sims() will make the model much more memory-efficient.
        self.model.init_sims(replace=True)
        return self

    def transform(self, X, y=None):
        """ Transforms the list of documents and returns tokens with their features.
            Each document should represent a sentence.
        """
        log.info("Generating features for {} documents...".format(len(X)))
        features = []
        for doc in X:
            doc_features = []
            for token in document_to_tokens(doc):
                if token in self.model.wv:
                    doc_features.append((token, self.model.wv[token]))
            features.append(doc_features)
        return features

    def save(self, file_path):
        save_path = Path(file_path)
        mkdir(save_path)
        model_save_path = save_path.joinpath("char2vec.model")
        config_save_path = save_path.joinpath("char2vec.config")
        self.model.save(str(model_save_path))
        self.config.save(config_save_path)

    def load(self, file_path):
        load_path = Path(file_path)
        model_load_path = load_path.joinpath("char2vec.model")
        config_load_path = load_path.joinpath("char2vec.config")
        self.model = FastText.load(str(model_load_path))
        self.config.load(config_load_path)
        return self
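Hypothetical end-to-end usage of the extractor above (train_docs and the save path are illustrative):

extractor = Char2VecFeatureExtractor()
extractor.fit(train_docs, size=100, skipgram=True)  # train_docs: list of token lists
features = extractor.transform(train_docs)          # [(token, vector), ...] per document
extractor.save("models/char2vec")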
Example #8
class WordEmbedding():
    def __init__(self,
                 embedding_type="w2v",
                 embedding_size=100,
                 ngram=(3, 6),
                 window_size=5,
                 architecture="sg"):
        self.embedding_type = embedding_type
        self.window = window_size
        self.size = embedding_size
        self.model = None
        if architecture == "sg":
            self.skip_gram = True
        else:
            self.skip_gram = False
        if ngram is None:
            ngram = (3, 6)
        self.min_gram = ngram[0]
        self.max_gram = ngram[1]

    def train_embedding(self,
                        sentences,
                        n_iter=100,
                        workers=1,
                        min_count=3,
                        negative_sample=1):
        if self.embedding_type == "w2v":
            train_corpus = sentences
            if self.model is None:
                self.model = Word2Vec(size=self.size,
                                      window=self.window,
                                      min_count=min_count,
                                      negative=negative_sample,
                                      workers=workers,
                                      sg=int(self.skip_gram))
                self.model.build_vocab(train_corpus)
            else:
                self.model.build_vocab(train_corpus, update=True)
        elif self.embedding_type == "ft":
            train_corpus = sentences
            if self.model is None:
                self.model = FastText(sg=int(self.skip_gram),
                                      size=self.size,
                                      window=self.window,
                                      min_count=min_count,
                                      min_n=self.min_gram,
                                      max_n=self.max_gram,
                                      workers=workers,
                                      negative=negative_sample)
                self.model.build_vocab(train_corpus)
            else:
                self.model.build_vocab(train_corpus, update=True)
        elif self.embedding_type == "glove":
            raise ValueError("GloVe training not supported use official repo")
        else:
            raise ValueError("Invalid Embedding Type")
        self.model.train(train_corpus,
                         epochs=n_iter,
                         total_examples=self.model.corpus_count)

    def retrieve_vector(self, word):
        try:
            return self.model.wv[word]
        except KeyError:
            return np.random.random(self.size)

    def find_similar_word(self, word, n=10):
        try:
            return self.model.wv.most_similar(positive=[word], topn=n)
        except KeyError:
            return []

    def save_model(self, file_name):
        self.model.save("{}.model".format(file_name))
        we_model_files = glob("{}.model*".format(file_name))
        with ZipFile(file_name, "w") as zipf:
            for we_file in we_model_files:
                zipf.write(we_file)
                os.remove(we_file)

    def load_model(self, file_name):
        try:
            with ZipFile(file_name, "r") as zipf:
                zipf.extractall("/tmp/")
                nl = zipf.namelist()
            fn = [name for name in nl if name.endswith(".model")][0]
            path = "/tmp/" + fn
        except BadZipFile:
            path = file_name

        if self.embedding_type == "w2v":
            self.model = KeyedVectors.load_word2vec_format(path)
        elif self.embedding_type == "ft":
            self.model = FastText.load_fasttext_format(path)
        elif self.embedding_type == "glove":
            """path name: .txt file"""
            try:
                glove_file = datapath(os.path.abspath(path))
                tmp_file = get_tmpfile("/tmp/g2w2v.txt")
                glove2word2vec(glove_file, tmp_file)
                self.model = KeyedVectors.load_word2vec_format(tmp_file)
            except UnicodeDecodeError:
                self.model = KeyedVectors.load(os.path.abspath(path))
        self.size = self.model.wv.vector_size

    def remove_from_vocab(self, word_list):
        new_vectors = []
        new_vocab = {}
        new_index2entity = []
        new_vectors_norm = []
        if self.embedding_type == "ft":
            self.model.wv.init_sims()
            for i in range(len(self.model.wv.vocab)):
                word = self.model.wv.index2entity[i]
                vec = self.model.wv.vectors[i]
                vocab = self.model.wv.vocab[word]
                vec_norm = self.model.wv.vectors_norm[i]
                if word not in word_list:
                    vocab.index = len(new_index2entity)
                    new_index2entity.append(word)
                    new_vocab[word] = vocab
                    new_vectors.append(vec)
                    new_vectors_norm.append(vec_norm)
            self.model.wv.vocab = new_vocab
            self.model.wv.vectors = np.array(new_vectors)
            self.model.wv.index2entity = new_index2entity
            self.model.wv.index2word = new_index2entity
            self.model.wv.vectors_norm = new_vectors_norm
        else:
            self.model.init_sims()
            for i in range(len(self.model.vocab)):
                word = self.model.index2entity[i]
                vec = self.model.vectors[i]
                vocab = self.model.vocab[word]
                vec_norm = self.model.vectors_norm[i]
                if word not in word_list:
                    vocab.index = len(new_index2entity)
                    new_index2entity.append(word)
                    new_vocab[word] = vocab
                    new_vectors.append(vec)
                    new_vectors_norm.append(vec_norm)
            self.model.vocab = new_vocab
            self.model.vectors = np.array(new_vectors)
            self.model.index2entity = new_index2entity
            self.model.index2word = new_index2entity
            self.model.vectors_norm = new_vectors_norm
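A sketch of the incremental-training path this class enables (first_batch and second_batch are hypothetical lists of token lists):

we = WordEmbedding(embedding_type="ft", embedding_size=100, architecture="sg")
we.train_embedding(first_batch, n_iter=10)   # builds the vocab, then trains
we.train_embedding(second_batch, n_iter=10)  # build_vocab(update=True), then trains again
vec = we.retrieve_vector("example")          # falls back to a random vector on a miss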
Example #9
def _fasttext(table, input_col, sg=1, size=100, window=5, min_count=1,
              max_vocab_size=None, train_epoch=100, workers=1, alpha=0.025,
              min_alpha=0.025, seed=None, hs=1, negative=5, ns_exponent=0.75,
              topn=30, hashfxn=hash, min_n=3, max_n=6, bucket=2000000):
    if isinstance(sg, str):
        sg = int(sg)
    algo = {1: 'Skip-gram', 0: 'CBOW'}[sg]

    tagged_sents = table[input_col].apply(list).tolist()
    ft = FastText(sentences=tagged_sents,
                  sg=sg,
                  size=size,
                  window=window,
                  alpha=alpha,
                  min_alpha=min_alpha,
                  seed=seed,
                  min_count=min_count,
                  max_vocab_size=max_vocab_size,
                  workers=workers,
                  iter=train_epoch,
                  hs=hs,
                  negative=negative,
                  ns_exponent=ns_exponent,
                  hashfxn=hashfxn,
                  min_n=min_n,
                  max_n=max_n,
                  bucket=bucket)

    ft.init_sims(replace=True)
    vocab = ft.wv.vocab

    analogies_score, sections = ft.wv.evaluate_word_analogies(
        'brightics/function/textanalytics/data/word2vec_questions_words.txt')
    pearson_1, spearman_1, oov_ratio_1 = ft.wv.evaluate_word_pairs(
        'brightics/function/textanalytics/data/word2vec_wordsim353.tsv')
    pearson_2, spearman_2, oov_ratio_2 = ft.wv.evaluate_word_pairs(
        'brightics/function/textanalytics/data/word2vec_simlex999.tsv')

    params = {'Input column': input_col,
              'Training algorithm': algo,
              'Word vector dimensionality': size,
              'Window': window,
              'Minimum word count': min_count,
              'Max vocabulary size': max_vocab_size,
              'Train epoch': train_epoch,
              'Number of workers': workers,
              'Alpha': alpha,
              'Minimum alpha': min_alpha,
              'Seed': seed,
              'Hierarchical softmax': hs,
              'Negative': negative,
              'Negative sampling exponent': ns_exponent}

    # tsne visualization
    length = len(vocab)
    if length < topn:
        topn = length
    topn_words = sorted(vocab, key=vocab.get, reverse=True)[:topn]

    X = ft.wv[topn_words]
    tsne = TSNE(n_components=min(2, topn), random_state=seed)
    X_tsne = tsne.fit_transform(X)
    df = pd.DataFrame(X_tsne, index=topn_words, columns=['x', 'y'])

    fig = plt.figure()
    fig.set_size_inches(50, 40)
    ax = fig.add_subplot(1, 1, 1)

    ax.scatter(df['x'], df['y'], s=1000)
    ax.tick_params(axis='both', which='major', labelsize=50)

    for word, pos in df.iterrows():
        ax.annotate(word, pos, fontsize=80)
    plt.show()
    fig = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## FastText Result
    |
    | ### Total Number of words
    | {length}
    |
    | ### Top {topn} Words
    | {topn_words}
    | {fig}
    |
    | ### Word analogy score
    | {analogies_score}
    |
    | ### Word correlation scores
    | #### Pearson correlation coefficient with 2-tailed p-value (WordSim353)
    | {pearson_1_1}, {pearson_1_2}
    | #### Spearman rank-order correlation coefficient with 2-tailed p-value (WordSim353)
    | {spearman_1_1}, {spearman_1_2}
    | #### The ratio of pairs with unknown words (WordSim353) 
    | {oov_ratio_1}
    | #### Pearson correlation coefficient with 2-tailed p-value (SimLex999)
    | {pearson_2_1}, {pearson_2_2}
    | #### Spearman rank-order correlation coefficient with 2-tailed p-value (SimLex999)
    | {spearman_2_1}, {spearman_2_2}
    | #### The ratio of pairs with unknown words (SimLex999)
    | {oov_ratio_2}
    |
    | ### Parameters
    | {params}
    """.format(length=length, analogies_score=analogies_score,
               pearson_1_1=pearson_1[0], pearson_1_2=pearson_1[1], spearman_1_1=spearman_1[0],
               spearman_1_2=spearman_1[1], oov_ratio_1=oov_ratio_1,
               pearson_2_1=pearson_2[0], pearson_2_2=pearson_2[1], spearman_2_1=spearman_2[0],
               spearman_2_2=spearman_2[1], oov_ratio_2=oov_ratio_2,
               topn=topn, topn_words=topn_words, params=dict2MD(params), fig=fig)))

    vocab = list(ft.wv.vocab)

    model = _model_dict('fasttext_model')
    model['params'] = params
    model['vocab'] = vocab
    model['ft'] = ft.wv
    model['_repr_brtc_'] = rb.get()

    out_table = pd.DataFrame({'words': vocab, 'word_vectors': ft.wv[vocab].tolist()})
    return {'model': model, 'out_table': out_table}
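A hypothetical invocation; the evaluation-file paths are hard-coded, so this only runs inside the Brightics source tree:

import pandas as pd

table = pd.DataFrame({'tokens': [['hello', 'world'], ['hello', 'there']]})
res = _fasttext(table, input_col='tokens', size=50, train_epoch=5, topn=2)
print(res['out_table'].head())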
Example #10
# ## preprocessing
topics = [[] for _ in range(len(df_combined))]
para = [[] for _ in range(len(df_combined))]
topics1 = [[] for _ in range(len(df_combined))]
para1 = [[] for _ in range(len(df_combined))]
for i in range(len(df_combined)):
    text = df_combined.iloc[i][0]
    text = str(text)
    topics[i] = preprocess_text(text)
    text = df_combined.iloc[i][1]
    text = str(text)
    para[i] = preprocess_text(text)

model_para = FastText(para, min_count=1)
model_para.init_sims(replace=True)
model_topic = FastText(topics, min_count=1)
model_topic.init_sims(replace=True)


# ## applying model
def get_answers(df_combined, query1):
    query = Answer_Pre_Processing(query1)
    q = preprocess_text(query)

    count = 0
    min1 = 1000
    result = ""
    for i in range(len(df_combined)):

        distance = model_para.wv.wmdistance(q, para[i])
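The example is cut off here; judging from the min1 and result initializations above, the rest of the loop presumably keeps the paragraph with the smallest Word Mover's Distance. An assumed continuation, not the original code:

        if distance < min1:  # assumed: track the closest paragraph
            min1 = distance
            result = df_combined.iloc[i][1]
    return result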