Example #1
def train_model(book_name):
    print(book_name + ' fasttext model train start')
    # extract the file name (drop the extension) to use as the model name
    model_name = book_name.split('.')
    # train and save model
    fasttext.skipgram(book_name, model_name[0] + "_fasttext_model")
    print(book_name + " fasttext model saved")
Example #2

def create_word_vectors(input_file, output_dir, dim, word_ngram):

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    filename = output_dir.split('/')[-2]
    output_file = os.path.join(output_dir, filename)

    fasttext.skipgram(input_file, output_file, dim=dim, word_ngrams=word_ngram)
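A minimal, hypothetical invocation of create_word_vectors as defined above; the corpus path and output directory are placeholders, and the directory must end with a slash so its name can double as the output file name:

# hypothetical call with placeholder paths
create_word_vectors('corpus.txt', 'book_vectors/', dim=100, word_ngram=1)
# -> writes book_vectors/book_vectors.bin and book_vectors/book_vectors.vec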
Example #3
def fasttext_model(doc, model):
    fasttext.skipgram(doc,
                      model,
                      dim=param.dim,
                      ws=param.ws,
                      min_count=param.min_count,
                      t=param.t,
                      thread=param.thread)
Example #4
def train_embedding_fasttext():
    '''
    Generate character-level and word-level word embeddings with fastText.
    :return:
    '''
    # Skipgram model
    logging.info(
        'generating CHAR embedding %s with fasttext using %s algorithm',
        'char2vec_fastskip256', 'Skipgram')
    model = fasttext.skipgram(CHAR_LEVEL_CORPUS,
                              os.path.join(MODEL_DIR, 'char2vec_fastskip256'),
                              word_ngrams=2,
                              ws=5,
                              min_count=10,
                              dim=256)
    del (model)

    # CBOW model
    logging.info(
        'generating CHAR embedding %s with fasttext using %s algorithm',
        'char2vec_fastcbow256', 'CBOW')
    model = fasttext.cbow(CHAR_LEVEL_CORPUS,
                          os.path.join(MODEL_DIR, 'char2vec_fastcbow256'),
                          word_ngrams=2,
                          ws=5,
                          min_count=10,
                          dim=256)
    del (model)

    # Skipgram model
    logging.info(
        'generating WORD embedding %s with fasttext using %s algorithm',
        'word2vec_fastskip256', 'Skipgram')
    model = fasttext.skipgram(WORD_LEVEL_CORPUS,
                              os.path.join(MODEL_DIR, 'word2vec_fastskip256'),
                              word_ngrams=2,
                              ws=5,
                              min_count=10,
                              dim=256)
    del (model)

    # CBOW model
    logging.info(
        'generating WORD embedding %s with fasttext using %s algorithm',
        'word2vec_fastcbow256', 'CBOW')
    model = fasttext.cbow(WORD_LEVEL_CORPUS,
                          os.path.join(MODEL_DIR, 'word2vec_fastcbow256'),
                          word_ngrams=2,
                          ws=5,
                          min_count=10,
                          dim=256)
    del (model)
Example #5
    def generate_model(self, file_name='doc.txt', model_name='model',
                       load=True):
        u"""document名・model名を指定してmodelを生成する。

        Args:
            file_name : String,  形態素解析したドキュメントデータ
            model_name: String,  skip-gramの結果生成されるファイルの名前
            load      : Boolean, model生成後、そのmodelをloadするか否か
        """
        ft.skipgram(file_name, model_name, **self.__params_fasttext)
        if load:
            self.model = ft.load_model(model_name + '.bin')
            self.df_word = self.model2df()
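A hedged sketch of what the options forwarded through self.__params_fasttext might look like; the values below are illustrative assumptions, not taken from the original class:

import fasttext as ft

# assumed contents of self.__params_fasttext
params_fasttext = {'dim': 100, 'ws': 5, 'min_count': 5}
ft.skipgram('doc.txt', 'model', **params_fasttext)
model = ft.load_model('model.bin')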
Example #6
def train_w2v(train_file, model_file):
    # doc: https://pypi.org/project/fasttext/
    model = fasttext.skipgram(train_file, model_file,
                              lr=0.01, dim=256,
                              min_count=1, thread=30,
                              t=1e-4, ws=5, neg=5,
                              epoch=10, silent=False)
Example #7
    def train(self, txt_path, config=DEF_CONFIG):
        if self.mode == "skipgram":
            self.model = fasttext.skipgram(txt_path, self.model_path, **config)
        elif self.mode == "cbow":
            self.model = fasttext.cbow(txt_path, self.model_path, **config)
        elif self.mode == "supervised":
            self.model = fasttext.supervised(txt_path, self.model_path, **config)
Example #8

def word_embedding(fname='filtered_tweets.txt'):
    vec_dim = 10
    maximum = 24
    tokenizer = RegexpTokenizer(r'\w+')

    with open(fname) as f:
        tweets = f.readlines()
    m = len(tweets)

    for i in range(0, m):
        tweets[i] = tokenizer.tokenize(tweets[i])

    model = fasttext.skipgram(fname, 'model', dim=vec_dim)

    for i in range(0, m):
        print('\n\ntweet #' + str(i))
        tweets[i] = np.array([model[w] for w in tweets[i]])
        if len(tweets[i]) == 0:
            tweets[i] = np.zeros([maximum, vec_dim])
        elif len(tweets[i]) < maximum:
            padlen = maximum - len(tweets[i])
            padding = np.zeros([padlen, vec_dim])
            tweets[i] = np.concatenate((np.array(tweets[i]), padding), axis=0)
        tweets[i] = tweets[i].flatten()

    return tweets
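Since word_embedding() returns one flattened vector per tweet, the result can be stacked into a feature matrix; a small sketch, assuming no tweet exceeds `maximum` tokens (the function pads but never truncates):

import numpy as np

X = np.vstack(word_embedding('filtered_tweets.txt'))
print(X.shape)  # (num_tweets, maximum * vec_dim) = (num_tweets, 240)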
Example #9
def train_fasttext(path):
    # https://pypi.python.org/pypi/fasttext
    import fasttext
    model = fasttext.skipgram(path,
                              'embeddings_models/model_fasttext_' +
                              str(TRAINING_SENTENCES),
                              dim=300)
Example #10
    def Item2vec(self, fileL):
        file_list = self.Fname(fileL)

        # train the model on the word-segmented corpus
        filename_cut = 'news_fasttext_train.txt'
        model = fasttext.skipgram(filename_cut, 'model')

        fout = open('item2vec.txt', 'w+')
        i = 9
        for file in file_list:
            i += 1
            # accumulate the keyword vectors
            vec = []
            filename_key = file + '_keywords.txt'
            fin = open(filename_key, 'r')
            while True:
                line = fin.readline()
                if line:
                    keyword = line.strip()
                    # print keyword
                    k_vec = model[keyword]
                    # print k_vec
                    if len(vec) == 0:
                        vec = k_vec
                    else:
                        vec = list(map(lambda x: x[0] + x[1], zip(vec, k_vec)))
                else:
                    break
            fin.close()

            fout.write(str(vec) + '\n')

        fout.close()
Example #11
def building_word_vector_model(option, sentences, embed_dim, workers, window, y_train):
    """
        Builds the word vector
        Args:
            type = {bool} 0 for Word2vec. 1 for gensim Fastext. 2 for Fasttext 2018.
            sentences = {list} list of tokenized words
            embed_dim = {int} embedding dimension of the word vectors
            workers = {int} no. of worker threads to train the model (faster training with multicore machines)
            window = {int} max distance between current and predicted word
            y_train = y_train
        Returns:
            model = Word2vec/Gensim fastText/ Fastext_2018 model trained on the training corpus
    """
    if option == 0:
        print("Training a word2vec model")
        model = Word2Vec(sentences = sentences, size = embed_dim, workers = workers, window = window, iter = 10)
        print("Training complete")

    elif option == 1:
        print("Training a Gensim FastText model")
        model = FastText(sentences = sentences, size = embed_dim, workers = workers, window = window, iter = 10)
        print("Training complete")

    elif option == 2:
        print("Training a Fasttext model from Facebook Research")
        y_train = ["__label__positive" if i == 1 else "__label__negative" for i in y_train]

        with open("imdb_train.txt","w") as text_file:
            for i in range(len(sentences)):
                print(sentences[i],y_train[i],file = text_file)

        model = fasttext.skipgram("imdb_train.txt","model_ft_2018_imdb",dim = embed_dim)
        print("Training complete")

    return model
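A hedged call sketch for the function above; the toy corpus and labels are placeholders (each word repeats often enough to clear the default min_count), and a real corpus would be needed for useful vectors:

# toy stand-ins for a tokenized corpus and its binary labels
sentences = [['good', 'movie', 'with', 'a', 'great', 'plot']] * 20
y_train = [1] * 20

# option 0 -> gensim Word2Vec; 1 -> gensim FastText; 2 -> Facebook fastText
w2v_model = building_word_vector_model(0, sentences, embed_dim=50,
                                       workers=2, window=3, y_train=y_train)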
Example #12
    def learn_embeddings(self,
                         inp_path,
                         out_path,
                         emb_epoch=40,
                         emb_lr=0.01,
                         emb_dim=100,
                         encoding_type='utf-8'):
        import fasttext
        fasttext.skipgram(inp_path,
                          out_path,
                          epoch=emb_epoch,
                          lr=emb_lr,
                          dim=emb_dim)
        from gensim.models.wrappers import FastText
        return FastText.load_fasttext_format(out_path + '.bin',
                                             encoding=encoding_type)
Example #13

def execute():
    # Verify that mandatory arguments are present
    if "-i" not in args:
        return "ERROR: No input file was given"

    if "-t" not in args:
        return "ERROR: No model type was given"

    # Extract arguments
    train_file = args[args.index("-i")+1]
    model_type = args[args.index("-t")+1]

    # Extract optional arguments
    epoch = get_optional_param('--epoch',5)
    ngrams = get_optional_param('--ngrams',1)
    label_prefix = get_optional_param('--label','__label__')

    # Create temporary file
    tmp, modelname = tempfile.mkstemp()

    # Use specified classifier with parameters and output model to the name of the temporary file
    if model_type == "supervised":
        classifier = fasttext.supervised(train_file, modelname, epoch=epoch, word_ngrams=ngrams, label_prefix=label_prefix)

    elif model_type == "skipgram":
        classifier = fasttext.skipgram(train_file, modelname, epoch=epoch, word_ngrams=ngrams, label_prefix=label_prefix)

    elif model_type == "cbow":
        classifier = fasttext.cbow(train_file, modelname, epoch=epoch, word_ngrams=ngrams, label_prefix=label_prefix)

    # Return the temporary file name
    return modelname
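execute() relies on a module-level args list and a get_optional_param() helper that are not shown; a minimal sketch of both, where the type coercion is an assumption:

import sys

# e.g. ['-i', 'corpus.txt', '-t', 'skipgram', '--epoch', '10']
args = sys.argv[1:]

def get_optional_param(flag, default):
    # return the value following `flag`, cast to the default's type, else the default
    if flag in args:
        return type(default)(args[args.index(flag) + 1])
    return default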
Example #14
def create_fast_text_model(folder, merged_spellcheck_path):
    start_time_fasttext = time.time()
    path = './' + str(folder) + '/model'
    model = fasttext.skipgram(merged_spellcheck_path, path)
    print("Time used to create Fasttext model: ",
          get_time(start_time_fasttext))
    return model
Example #15
def trainFastText(data_path,
                  embedding_size=300,
                  context_window=5,
                  min_count=5,
                  save_path='outputs/FastText'):
    # fits a fastText model; the training data must all be in a single file
    print('Training FastText Model...')

    # train model
    model = fasttext.skipgram(data_path,
                              save_path + '/fasttext',
                              dim=embedding_size,
                              ws=context_window,
                              min_count=min_count)

    # keep only the vectors for words shared with data/words.txt and save them as a dict
    with open('data/words.txt', 'r') as f:
        words = f.read().split('\n')[:-1]

    shared_words = list(set(words).intersection(set(model.words)))

    fasttext_dict = {}
    for word in shared_words:
        fasttext_dict[word] = model[word]

    np.save('DSMs/fasttext.npy', fasttext_dict)
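The word-to-vector dictionary saved at the end can be reloaded later with numpy; a short sketch:

import numpy as np

# reload the dict written by trainFastText()
fasttext_dict = np.load('DSMs/fasttext.npy', allow_pickle=True).item()
print(len(fasttext_dict), 'shared words,',
      len(next(iter(fasttext_dict.values()))), 'dimensions each')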
Example #16

def fasttext_lstm():
    dimens = 100
    #ftmodel = ft.supervised('data/trainprocess.txt', 'model/train', label_prefix='__label__')
    #ftmodel = ft.load_model('model/model_sentiment.bin', encoding = 'utf-8', label_prefix='__label__')
    ftmodel = ft.skipgram('data/trainprocess.txt', 'skip_gram', dim=dimens)
    # print(len(ftmodel['langgg']))
    # print(ftmodel.words)
    train_embed = []
    test_embed = []

    for text in train_data:
        tokens = separate(text)
        embed = []
        for token in tokens:
            vec = ftmodel[token]
            embed.append(vec)
        train_embed.append(embed)
        #print(embed)

    for text in test_data:
        tokens = separate(text)
        embed = []
        for token in tokens:
            vec = ftmodel[token]
            embed.append(vec)
        test_embed.append(embed)

    train_embed = np.array(train_embed)
    test_embed = np.array(test_embed)
    return train_embed, train_labels, test_embed, test_labels
Example #17
def pre_train_fasttext():
    thefile = open('data2.txt', 'w', encoding='utf-8')
    for i in merge:
        sentence = clean_sent(i)
        thefile.write("%s\n" % sentence)
    thefile.close()
    # fasttext expects a file path, not a file object
    model = fasttext.skipgram('data2.txt', 'model')
    return model
Example #18
def train_embedding_fasttext():
    
    # Skipgram model
    model = fasttext.skipgram(model_dir + 'train_char.txt', model_dir + 'char2vec_fastskip256', word_ngrams=2, ws=5, min_count=10, dim=256)
    del(model)

    # CBOW model
    model = fasttext.cbow(model_dir + 'train_char.txt', model_dir + 'char2vec_fastcbow256', word_ngrams=2, ws=5, min_count=10, dim=256)
    del(model)

    # Skipgram model
    model = fasttext.skipgram(model_dir + 'train_word.txt', model_dir + 'word2vec_fastskip256', word_ngrams=2, ws=5, min_count=10, dim=256)
    del(model)

    # CBOW model
    model = fasttext.cbow(model_dir + 'train_word.txt', model_dir + 'word2vec_fastcbow256', word_ngrams=2, ws=5, min_count=10, dim=256)
    del(model)
Example #19
def trainWord2Vector():
    model1 = fasttext.skipgram(
        '/Users/didi/workspace/data/tensorflow/resume_data/resume_train.txt',
        'model',
        lr=0.01,
        dim=300)
    # for word in model1.words:
    #     print(word)
    print(model1['模型'])  # vector for the word '模型' ("model")
Example #20

    def train_model(self, train_txt, vector_len=100):
        """Trains a new fasttext model"""

        print("Creating new fasttext model...")
        output_path = self._ft_dir + "ft_model"
        # Using skipgram model to learn vector representations
        self.model = ft.skipgram(train_txt, output_path, dim=vector_len)
        self._retrieve_word_embeddings()
        print("Done")
Example #21
    def fasttext_model(self, doc, model):
        """
        Write the data without labels to a file and train the fastText model on it.
        :param doc -> string: data path
        :param model -> string: fasttext model path
        :return:
        """
        with open('./cnews/without.dat', 'w', encoding='utf8') as f:
            data = read_file(doc)
            for d, _ in data:
                f.write(d)
                f.flush()
        fasttext.skipgram('./cnews/without.dat',
                          model,
                          dim=self.dim,
                          ws=self.ws,
                          min_count=self.min_count)
Example #22
def trainSkipgram(inFile, mdlfile):
    # Skipgram model
    model = fasttext.skipgram(input_file=inFile,
                              output=mdlfile,
                              lr=0.1,
                              dim=200,
                              epoch=10,
                              word_ngrams=3,
                              bucket=5000000)
Example #23
def trainWordToVec():
    """

    Will train word2vec from wikipedia text
    make sure that hewiki-latest-pages-articles.xml.bz2 is downloaded to
    the data folder

    this function creates word2vec model
    :return:

    """

    print("WordToVec model exists: {}".format(os.path.isfile(utils.word2VecFiles + ".bin")))
    from gensim.corpora import WikiCorpus

    # stop if model already has been created
    if os.path.isfile(utils.word2VecFiles + ".bin"):
        return

    # download from wikipedia
    if not os.path.isfile(utils.wikiTar):
        import urllib.request
        tarLocation = 'https://dumps.wikimedia.org/hewiki/latest/hewiki-latest-pages-articles.xml.bz2'
        wikiConn = urllib.request.urlopen(tarLocation)
        with open(utils.wikiTar, 'wb') as wikiSaver:
            wikiSaver.write(wikiConn.read())

    # parse tar file
    if not os.path.isfile(utils.wikiFull):
        i = 0

        output = open(utils.wikiFull, 'w')
        wiki = WikiCorpus(utils.wikiTar, lemmatize=False, dictionary={})
        for text in wiki.get_texts():
            article = " ".join([t for t in text])
            output.write("{}\n".format(article))
            i += 1
            if i % 500 == 0:
                print("{} items loaded".format(str(i)))

        output.close()
    # train word2vec
    fasttext.skipgram(utils.wikiFull, utils.word2VecFiles)
    print("step 1 - {0} created".format(word2VecFiles))
Example #24

def text2vec_fast(data_file, method='cbow', modelname='model', **kargs):
    clean_data(data_file, 'cleaned.txt')
    if method == 'skipgram':
        model = fasttext.skipgram('cleaned.txt', modelname, **kargs)
    else:
        model = fasttext.cbow('cleaned.txt', modelname, **kargs)

    vector_dict, index_dict = extract_word_vectors(modelname + '.vec')
    os.remove('cleaned.txt')
    return vector_dict, index_dict
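A hedged usage sketch; 'raw_corpus.txt' is a placeholder path, and clean_data() / extract_word_vectors() are helpers assumed to live elsewhere in the original module:

vectors, index = text2vec_fast('raw_corpus.txt', method='skipgram',
                               modelname='model', dim=100, ws=5)
print(len(vectors), 'word vectors extracted')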
Example #25
    def train(self, questions_path, vector_length=100):
        self.dim = vector_length

        model_name = 'model_' + os.path.split(questions_path)[-1].split('.')[0]
        model_path = os.path.join(self.files_path, model_name)

        self.model = fasttext.skipgram(questions_path,
                                       model_path,
                                       dim=self.dim,
                                       thread=4)
Example #26

def fasttext_tfidf():
    dimens = 150
    vectorizer = TfidfVectorizer(analyzer=separate)
    vectorizer = vectorizer.fit(train_data)
    train_tfidf = vectorizer.transform(train_data)
    test_tfidf  = vectorizer.transform(test_data)
    vocab = vectorizer.vocabulary_
    print(type(vocab))
    print(test_tfidf.shape)
    ftmodel = ft.skipgram('data/trainprocess.txt', 'skip_gram', dim=dimens)
    # print(ftmodel.words)
    train_embed = []
    test_embed = []

    for j in range(len(train_data)):
        text = train_data[j]
        tokens = separate(text)
        embed = []
        for i in range(dimens):
            embed.append(0)
        for token in tokens:
            vec = ftmodel[token]
            multi = 1
            if (token in vocab.keys()):
                multi = train_tfidf[j, vocab[token]]
            for i in range(dimens):
                embed[i] += vec[i] * multi
        for i in range(dimens):
            embed[i] = embed[i]/(len(tokens))
        train_embed.append(embed)
        #print(embed)

    for j in range(len(test_data)):
        text = test_data[j]
        tokens = separate(text)
        embed = []
        for i in range(dimens):
            embed.append(0)
        for token in tokens:
            vec = ftmodel[token]
            multi = 1
            if (token in vocab.keys()):
                multi = test_tfidf[j, vocab[token]]
            for i in range(dimens):
                embed[i] += vec[i] * multi
        for i in range(dimens):
            embed[i] = embed[i] / (len(tokens))
        test_embed.append(embed)

    train_embed = np.array(train_embed)
    test_embed = np.array(test_embed)
    print(train_embed.shape)
    print(test_embed.shape)
    return train_embed, train_labels, test_embed, test_labels
Example #27

def run_model(db_path, model_path, model_type, model_params):
    print('Training FastText model...')
    start_time = clock()
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    if not os.path.isdir(model_path):
        os.makedirs(model_path)
    temp_fpath = os.path.join(model_path, '__ft_temp.txt')

    get_text_command = 'SELECT preprocessed FROM rss_data ORDER BY cachedate DESC'
    results = cursor.execute(get_text_command)
    with open(temp_fpath, 'w', newline='', encoding='utf-8') as tmpfile:
        for r in results:
            if r[0] is not None:
                tmpfile.write(r[0] + '.\n')
    conn.close()

    model_fpath = os.path.join(model_path, 'fasttext_model')
    model = None
    p_ngrams = model_params.get('ngram', 1)
    p_dim = model_params.get('dim', 100)
    p_ws = model_params.get('ws', 5)
    p_epoch = model_params.get('epoch', 5)
    p_loss = model_params.get('loss', 'ns')
    p_min_count = model_params.get('min_count', 5)
    p_silent = model_params.get('silent', 1)
    if model_type == 'cbow':
        model = fasttext.cbow(temp_fpath,
                              model_fpath,
                              word_ngrams=p_ngrams,
                              dim=p_dim,
                              ws=p_ws,
                              epoch=p_epoch,
                              loss=p_loss,
                              silent=p_silent,
                              min_count=p_min_count)
    else:
        model = fasttext.skipgram(temp_fpath,
                                  model_fpath,
                                  word_ngrams=p_ngrams,
                                  dim=p_dim,
                                  ws=p_ws,
                                  epoch=p_epoch,
                                  loss=p_loss,
                                  silent=p_silent,
                                  min_count=p_min_count)

    os.remove(temp_fpath)

    end_time = clock()
    print('Model trained in {} seconds.'.format(int(end_time - start_time)))

    return model
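A sketch of how run_model() might be invoked; the paths are placeholders and the keys simply mirror the model_params.get() calls above:

# placeholder paths; parameter keys match the .get() lookups in run_model()
params = {'ngram': 1, 'dim': 100, 'ws': 5, 'epoch': 5,
          'loss': 'ns', 'min_count': 5, 'silent': 1}
model = run_model('rss.db', 'models/fasttext/', 'skipgram', params)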
Example #28
def train(inp = "wiki2.he.text",out_model = "wiki.he.fasttext.model", alg = "CBOW"):
    start = time.time()
    if alg == "skipgram":
        # Skipgram model
        model = fasttext.skipgram(inp, out_model)
        print(model.words) # list of words in dictionary
    else:
        # CBOW model
        model = fasttext.cbow(inp, out_model)
        print(model.words) # list of words in dictionary
    print(time.time()-start)
    # the skipgram/cbow call above already writes out_model.bin and out_model.vec,
    # so no separate save step is needed
Example #29
    def train_word_repr(self):
        documents = []

        for xlsx in os.listdir(self.excel_dir):
            print('reading Excel: %s ...' % xlsx)
            df = pd.read_excel(os.path.join(COMMENTS_DIR, xlsx),
                               encoding='GBK',
                               index_col=None)
            df = df.dropna(how='any')

            documents += df['content'].tolist()

        lines = []
        for i in range(len(documents)):
            line = " ".join(jieba.cut(str(documents[i]).strip()))
            lines.append(line)

        with open('./comments.txt', mode='wt', encoding='utf-8') as f:
            f.write("".join(lines))

        fasttext.skipgram('comments.txt', 'fastTextRepr')
Example #30
def grid_search(dims, lr, train, full_data, test, test_labels, epoch, ngrams,
                ws):
    """
    Performs a grid seach over the parameters dims and lr.
    params: dims : list of dimension parameters
               lr : list of learning rate parameters
    Returns : Best parameters and best accuracy. 
    """

    # Initializing best accuracy and best parameters
    best_accuracy = 0.0
    best_params = (0.0, 0.0)

    # preprocessing the test set
    actual = [1 if '__label__positive' in t else -1 for t in test_labels]

    # Iterating over parameters
    for grams in ngrams:
        print('ngrams = ', grams)
        for w in ws:
            print('ws = ', w)
            for k in epoch:
                print('epoch = ', str(k))
                for i in dims:
                    print('dim = ', str(i))
                    for j in lr:
                        print('learning rate = ', str(j))

                        #writing files
                        write_to_file(train, 'train.txt')
                        write_to_file(full_data, 'data.txt')

                        #building model.
                        model = fasttext.skipgram('data.txt',
                                                  'model',
                                                  dim=i,
                                                  lr=j,
                                                  epoch=k,
                                                  word_ngrams=grams,
                                                  ws=w)
                        classifier = fasttext.supervised(
                            'train.txt', 'model', label_prefix='__label__')
                        labels = classifier.predict(test)
                        pred = [1 if t == ['positive'] else -1 for t in labels]

                        #Computing accuracy.
                        accuracy = 1 - np.mean(
                            np.abs(np.array(pred) - np.array(actual)) / 2)
                        print('accuracy = ' + str(accuracy))
                        if (accuracy > best_accuracy):
                            best_accuracy = accuracy
                            best_params = (i, j, k, w, grams)
    return best_accuracy, best_params
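A hedged call sketch for the grid search; train, full_data, test, test_labels and write_to_file() are assumed to be prepared elsewhere, and the lists below define the search grid:

best_acc, best_params = grid_search(dims=[50, 100], lr=[0.05, 0.1],
                                    train=train, full_data=full_data,
                                    test=test, test_labels=test_labels,
                                    epoch=[5], ngrams=[1, 2], ws=[5])
print('best accuracy:', best_acc)
print('best (dim, lr, epoch, ws, ngrams):', best_params)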
Example #31
    def test_train_skipgram_model(self):
        # set params
        lr=0.005
        dim=10
        ws=5
        epoch=5
        min_count=1
        neg=5
        word_ngrams=1
        loss='ns'
        bucket=2000000
        minn=3
        maxn=6
        thread=4
        lr_update_rate=10000
        t=1e-4
        silent=1

        # train skipgram model
        model = ft.skipgram(input_file, output, lr, dim, ws, epoch, min_count,
                neg, word_ngrams, loss, bucket, minn, maxn, thread, lr_update_rate,
                t, silent)

        # Make sure the model is generated correctly
        self.assertEqual(model.dim, dim)
        self.assertEqual(model.ws, ws)
        self.assertEqual(model.epoch, epoch)
        self.assertEqual(model.min_count, min_count)
        self.assertEqual(model.neg, neg)
        self.assertEqual(model.loss_name, loss)
        self.assertEqual(model.bucket, bucket)
        self.assertEqual(model.minn, minn)
        self.assertEqual(model.maxn, maxn)
        self.assertEqual(model.lr_update_rate, lr_update_rate)
        self.assertEqual(model.t, t)

        # Make sure .bin and .vec are generated
        self.assertTrue(path.isfile(output + '.bin'))
        self.assertTrue(path.isfile(output + '.vec'))

        # Make sure the vector have the right dimension
        self.assertEqual(len(model['the']), dim)

        # Make sure we support unicode character
        unicode_str = 'Καλημέρα'
        self.assertTrue(unicode_str in model.words)
        self.assertTrue(unicode_str in model)
        self.assertEqual(len(model[unicode_str]), model.dim)
Example #32
    def test_train_skipgram_model_default(self):
        default_args = default_params.read_file(params_txt)
        model = ft.skipgram(input_file, output)

        # Make sure the default params of skipgram is equal
        # to fasttext(1) default params
        self.assertEqual(model.model_name, 'skipgram')
        self.assertEqual(model.dim, int(default_args['dim']))
        self.assertEqual(model.ws, int(default_args['ws']))
        self.assertEqual(model.epoch, int(default_args['epoch']))
        self.assertEqual(model.min_count, int(default_args['minCount']))
        self.assertEqual(model.neg, int(default_args['neg']))
        self.assertEqual(model.word_ngrams, int(default_args['wordNgrams']))
        self.assertEqual(model.loss_name, default_args['loss'])
        self.assertEqual(model.bucket, int(default_args['bucket']))
        self.assertEqual(model.minn, int(default_args['minn']))
        self.assertEqual(model.maxn, int(default_args['maxn']))
        self.assertEqual(model.lr_update_rate,
                float(default_args['lrUpdateRate']))
        self.assertEqual(model.t, float(default_args['t']))
Example #33

def main():
    data_path = '/Users/ruizhang/Documents/NLP_dataset/'


    #############
    #
    ############
    # Load train set
    train_file = data_path +'dbpedia_csv/train.csv'
    df = pd.read_csv(train_file, header=None, names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'dbpedia_csv/test.csv'
    df_test = pd.read_csv(test_file, header=None, names=['class', 'name', 'description'])

    # Mapping from class number to class name
    class_dict = {
        1: 'Company',
        2: 'EducationalInstitution',
        3: 'Artist',
        4: 'Athlete',
        5: 'OfficeHolder',
        6: 'MeanOfTransportation',
        7: 'Building',
        8: 'NaturalPlace',
        9: 'Village',
        10: 'Animal',
        11: 'Plant',
        12: 'Album',
        13: 'Film',
        14: 'WrittenWork'
    }
    df['class_name'] = df['class'].map(class_dict)
    df.head()

    #############
    #
    ############
    desc = df.groupby('class')
    desc.describe().transpose()

    # Transform datasets
    df_train_clean = clean_dataset(df, True, False)
    df_test_clean = clean_dataset(df_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'dbpedia.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    test_file_clean = data_path + 'dbpedia.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    # Train a classifier
    output_file = data_path + 'dp_model'
    classifier = fasttext.supervised(train_file_clean, output_file, label_prefix='__label__')

    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    sentence1 = ['Picasso was a famous painter born in Malaga, Spain. He revolutionized the art in the 20th century.']
    labels1 = classifier.predict(sentence1)
    class1 = int(labels1[0][0])
    print("Sentence: ", sentence1[0])
    print("Label: %d; label name: %s" % (class1, class_dict[class1]))

    sentence2 = ['One of my favourite tennis players in the world is Rafa Nadal.']
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class2 as string
    print("Sentence: ", sentence2[0])
    print("Label: %s; label name: %s; certainty: %f" % (class2, class_dict[int(class2)], prob2))

    sentence3 = ['Say what one more time, I dare you, I double-dare you m**********r!']
    number_responses = 3
    labels3 = classifier.predict_proba(sentence3, k=number_responses)
    print("Sentence: ", sentence3[0])
    for l in range(number_responses):
        class3, prob3 = labels3[0][l]
        print("Label: %s; label name: %s; certainty: %f" % (class3, class_dict[int(class3)], prob3))

    # Load train set
    train_file = data_path + 'amazon_review_polarity_train.csv'
    df_sentiment_train = pd.read_csv(train_file, header=None, names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'amazon_review_polarity_test.csv'
    df_sentiment_test = pd.read_csv(test_file, header=None, names=['class', 'name', 'description'])

    # Transform datasets
    df_train_clean = clean_dataset(df_sentiment_train, True, False)
    df_test_clean = clean_dataset(df_sentiment_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'amazon.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    test_file_clean = data_path + 'amazon.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    dim = 10
    lr = 0.1
    epoch = 5
    min_count = 1
    word_ngrams = 2
    bucket = 10000000
    thread = 12
    label_prefix = '__label__'

    # Train a classifier
    output_file = data_path + 'amazon_model'
    classifier = fasttext.supervised(train_file_clean, output_file, dim=dim, lr=lr, epoch=epoch,
                                     min_count=min_count, word_ngrams=word_ngrams, bucket=bucket,
                                     thread=thread, label_prefix=label_prefix)

    # Evaluate classifier
    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    class_dict = {
        1: "Negative",
        2: "Positive"
    }

    sentence1 = ["The product design is nice but it's working as expected"]
    labels1 = classifier.predict_proba(sentence1)
    class1, prob1 = labels1[0][0]  # it returns class as string
    print("Sentence: ", sentence1[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class1, class_dict[int(class1)], prob1))

    sentence2 = ["I bought the product a month ago and it was working correctly. But now is not working great"]
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class as string
    print("Sentence: ", sentence2[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class2, class_dict[int(class2)], prob2))

    url = "https://twitter.com/miguelgfierro/status/805827479139192832"
    response = urlopen(url).read()
    title = str(response).split('<title>')[1].split('</title>')[0]
    print(title)

    # # Format tweet
    # tweet = unescape(title)
    # print(tweet)
    #
    # # Classify tweet
    # label_tweet = classifier.predict_proba([tweet])
    # class_tweet, prob_tweet = label_tweet[0][0]
    # print("Label: %s; label name: %s; certainty: %f" % (class_tweet, class_dict[int(class_tweet)], prob_tweet))


    wiki_dataset_original = data_path + 'enwik9'
    wiki_dataset = data_path + 'text9'
    if not os.path.isfile(wiki_dataset):
        os.system("perl wikifil.pl " + wiki_dataset_original + " > " + wiki_dataset)

    output_skipgram = data_path + 'skipgram'
    if os.path.isfile(output_skipgram + '.bin'):
        skipgram = fasttext.load_model(output_skipgram + '.bin')
    else:
        skipgram = fasttext.skipgram(wiki_dataset, output_skipgram, lr=0.02, dim=50, ws=5,
                                     epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6,
                                     thread=4, t=1e-4, lr_update_rate=100)
    print(np.asarray(skipgram['king']))

    print("Number of words in the model: ", len(skipgram.words))

    # Get the vector of some word
    Droyals = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['queen']), 2)).sum()
    print(Droyals)
    Dpeople = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople)
    Dpeople2 = np.sqrt(pow(np.asarray(skipgram['man']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople2)

    print(len(skipgram.words))
    targets = ['man', 'woman', 'king', 'queen', 'brother', 'sister', 'father', 'mother', 'grandfather', 'grandmother',
               'cat', 'dog', 'bird', 'squirrel', 'horse', 'pig', 'dove', 'wolf', 'kitten', 'puppy']
    classes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    X_target = []
    for w in targets:
        X_target.append(skipgram[w])
    X_target = np.asarray(X_target)
    word_list = list(skipgram.words)[:10000]
    X_subset = []
    for w in word_list:
        X_subset.append(skipgram[w])
    X_subset = np.asarray(X_subset)
    X_target = np.concatenate((X_subset, X_target))
    print(X_target.shape)
    X_tsne = TSNE(n_components=2, perplexity=40, init='pca', method='exact',
                  random_state=0, n_iter=200, verbose=2).fit_transform(X_target)
    print(X_tsne.shape)
    X_tsne_target = X_tsne[-20:, :]
    print(X_tsne_target.shape)
    plot_words(X_tsne_target, targets, classes=classes)
    plot_words(X_tsne_target, targets, xlimits=[0.5, 0.7], ylimits=[-3.7, -3.6])
Example #34
import fasttext

INPUT_TXT = '/path/to/file.txt'
OUTPUT_PATH_SKIPGRAM = '/tmp/skipgram'
OUTPUT_PATH_CBOW = '/tmp/cbow'

# Learn the word representation using the skipgram model
skipgram = fasttext.skipgram(INPUT_TXT, OUTPUT_PATH_SKIPGRAM, lr=0.02, dim=300, ws=5,
        epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6,
        thread=4, t=1e-4, lr_update_rate=100)

# Get the vector of some word
print(skipgram['word'])

# Learn the word representation using the cbow model
cbow = fasttext.cbow(INPUT_TXT, OUTPUT_PATH_CBOW, lr=0.02, dim=300, ws=5,
        epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6,
        thread=4, t=1e-4, lr_update_rate=100)

# Get the vector of some word
print(cbow['word'])

# Load the pre-trained skipgram model
SKIPGRAM_BIN = OUTPUT_PATH_SKIPGRAM + '.bin'
skipgram = fasttext.load_model(SKIPGRAM_BIN)
print(skipgram['word'])

# Load the pre-trained cbow model
CBOW_BIN = OUTPUT_PATH_CBOW + '.bin'
cbow = fasttext.load_model(CBOW_BIN)
print(cbow['word'])
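Vectors from either model can be compared directly; a small cosine-similarity helper using numpy (an added assumption, not part of the original snippet), usable for any two words seen in the training corpus:

import numpy as np

def cosine(u, v):
    # cosine similarity between two word vectors
    u, v = np.asarray(u), np.asarray(v)
    return float(u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v)))

print(cosine(skipgram['king'], skipgram['queen']))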