Example #1
def main():
    model, encoder_model, decoder_model = create_models(300, 512, 300)
    model.load_weights(sys.argv[1])

    ft_en = FastText('embeddings/wiki.en.bin')
    ft_tl = FastText('embeddings/wiki.tl.bin')

    start_seq = ft_en.get_numpy_vector(SOS, normalized=True).reshape(1, 1, -1)

    chars = '.,?!()'

    while True:
        input_sentence = input('Input Tagalog: ').lower()  # e.g. 'kamusta ka ?'

        for c in chars:
            input_sentence = input_sentence.replace(c, ' ' + c + ' ')

        print('Embedding...')
        tokens = input_sentence.split()
        # embed the tokens into a fixed-size (1, 15, 300) batch, zero-padded,
        # truncating to the model's 15-token window
        input_seq = np.zeros((1, 15, 300), dtype='float32')
        for i, w in enumerate(tokens[:15]):
            input_seq[0, i] = ft_tl.get_numpy_vector(w, normalized=True)
        print(input_seq)

        print('Translating...')

        decoded_sentence = decode_sequence(input_seq, encoder_model,
                                           decoder_model, ft_en, start_seq)
        print('-')
        print('Input sentence:', input_sentence)
        print('Decoded sentence:', decoded_sentence)
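The decode_sequence helper is not shown in this snippet. A minimal greedy-decoding sketch, assuming a standard Keras LSTM encoder/decoder pair and an EOS marker defined alongside the SOS used above (all names below are assumptions, not the original implementation):

def decode_sequence(input_seq, encoder_model, decoder_model, ft_en, start_seq):
    # encode the source sentence into the initial LSTM states
    states = encoder_model.predict(input_seq)
    target_seq = start_seq
    decoded_words = []
    for _ in range(15):  # same maximum length as the input window
        output, h, c = decoder_model.predict([target_seq] + states)
        # map the predicted embedding back to the closest English word
        word = ft_en.words_for_vector(output[0, -1], k=1)[0][0]
        if word == EOS:  # assumed end-of-sequence token, defined like SOS
            break
        decoded_words.append(word)
        target_seq = output[:, -1:, :]  # feed the prediction back in
        states = [h, c]
    return ' '.join(decoded_words)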
Example #2
    def load_model(self):
        if not os.path.exists(self.model_path):
            raise FileNotFoundError('model file not found!')
        if self.model_name == 'fasttext':
            self.model = FastText(self.model_path)
        else:
            self.model = gensim.models.Word2Vec.load(self.model_path, mmap='r')
Example #3
def get_fasttext_matrix(vocab, initial_embedding_np):
    """
    return an embeddings matrix
    :param self:
    :param embeddings_file:
    :param initial_embedding_np:
    :return: np array of [V,E]
    """
    from pyfasttext import FastText

    logging.info('Loading the FastText embeddings')
    model = FastText(cfg.embeddings_path)

    cnt = 0
    vec_array = initial_embedding_np
    old_avg = np.average(vec_array)
    old_std = np.std(vec_array)
    vec_array = vec_array.astype(np.float32)
    new_avg, new_std = 0, 0

    for word in vocab._item2idx:
        vec = model[word]
        vec = np.array(vec, np.float32)
        word_idx = vocab.encode(word)
        cnt += 1
        vec_array[word_idx] = vec
        new_avg += np.average(vec)
        new_std += np.std(vec)

    new_avg /= cnt
    new_std /= cnt
    logging.info(
        '%d known embeddings. old mean: %f, new mean: %f, old std: %f, new std: %f'
        % (cnt, old_avg, new_avg, old_std, new_std))
    return vec_array
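A minimal usage sketch for get_fasttext_matrix; the Vocab stand-in below is hypothetical and only mimics the two members the function touches (_item2idx and encode):

import numpy as np

class Vocab:
    # hypothetical stand-in for the project's vocabulary class
    def __init__(self, words):
        self._item2idx = {w: i for i, w in enumerate(words)}

    def encode(self, word):
        return self._item2idx[word]

vocab = Vocab(['the', 'cat', 'sat'])
initial = np.random.normal(0.0, 0.1, (3, 300))  # [V, E] starting matrix
embedding_matrix = get_fasttext_matrix(vocab, initial)  # reads cfg.embeddings_path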
Example #4
    def __init__(self, vocab_path, vocab):
        self.pad_token = '<blank>'
        self.unk_token = '<unk>'
        self.model = FastText(vocab_path)
        self.vocab = ['<blank>', '<unk>'] + vocab
        self.token2id = {}
        self.id2token = {}
        self.embed_dim = 300  # hard-coded for now
        if not os.path.exists('embeddings.npy'):
            self.embeddings = np.random.rand(self.size(), self.embed_dim)
        else:
            self.embeddings = np.load('embeddings.npy')
        self.logger = logging.getLogger("sentiment")

        i = 0
        for token in [self.pad_token, self.unk_token]:
            self.embeddings[i] = np.zeros([self.embed_dim])
            self.token2id[token] = i
            self.id2token[i] = token
            i += 1
        for token in vocab:
            self.token2id[token] = i
            self.id2token[i] = token
            i += 1
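Since __init__ only reads embeddings.npy when the file already exists, a matching save step is implied elsewhere in the class; a one-method sketch of that hypothetical counterpart:

    def save_embeddings(self):
        # hypothetical counterpart to the np.load(...) in __init__
        np.save('embeddings.npy', self.embeddings)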
Example #5
def main():
    model = FastText('model_text8.bin')

    target_words = [
        'granada', 'python', 'harmony', 'mafia', 'yoga', 'goth', 'cyberpunk',
        'nasa', 'japan', 'boolean', 'football', 'algorithm', 'china', 'usa',
        'internet', 'harvard', 'earth', 'horse', 'angel', 'rock'
    ]
    for t_word in target_words:
        # get embedding
        target_word_embedding = model.get_numpy_vector(t_word)
        print('Target word:', t_word)
        #print('Embedding shape:', target_word_embedding.shape)
        #print('Embedding:', target_word_embedding[0:10], '...')

        # find the closest words and collect their embeddings
        closest_words = model.nearest_neighbors(t_word, k=15)
        # NOTE: the second dimension must match the model's vector size
        nn_word_embedding = np.zeros(shape=(15, 128))
        for i, (word, similarity) in enumerate(closest_words):
            nn_word_embedding[i] = model.get_numpy_vector(word)

        # cluster the neighbourhood with k-means
        cluster_model = KMeans(n_clusters=3, init='k-means++')
        prediction = cluster_model.fit_predict(nn_word_embedding)
        print(prediction)
        for j, (word, similarity) in enumerate(closest_words):
            print('Word:', word, '- Cluster #%d' % (prediction[j] + 1))
Example #6
def collect_docs(p, lang_detection_model_name=None, lang='en'):

    model = None
    if lang_detection_model_name is not None:
        from pyfasttext import FastText
        model_path = SparkFiles.get(lang_detection_model_name)
        model = FastText(model_path)

    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  #domain...
        r'localhost|'  #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',
        re.IGNORECASE)

    result = []
    lines = list(p)
    indices = [i for i, line in enumerate(lines) if regex.search(line.strip())]
    for idx in indices:
        content = lines[idx + 1]
        paras = re.findall('<PAR>(.*?)</PAR>', content, re.DOTALL)

        if model is not None:
            # keep only paragraphs predicted to be in the target language
            langs = model.predict(paras)
            en_paras = list(filter(lambda p: lang in p[1], zip(paras, langs)))
            paras = list(map(lambda pair: pair[0], en_paras))

        if paras:
            url = lines[idx].strip()
            result.append((url, paras))

    return result
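The language filter above relies on pyfasttext's predict returning one list of labels per input line; a tiny standalone illustration of that filtering step (the model path below is an assumption):

from pyfasttext import FastText

model = FastText('lid.176.bin')  # assumed language-ID model file
paras = ['Hello world.', 'Bonjour le monde.']
langs = model.predict(paras)     # e.g. [['en'], ['fr']]
en_paras = [p for p, labels in zip(paras, langs) if 'en' in labels]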
Example #7
    def train(self, trainingfile):
        """Starts model building"""

        logger.info(
            f'Training started with: learningRate:{self.config.learningRate!s}, '
            f'epoch:{self.config.epoch!s}, ngrams:{self.config.ngrams!s}')
        model = FastText()
        if self.supervised:
            model.supervised(input=trainingfile,
                             output=self.filepath,
                             epoch=self.config.epoch,
                             lr=self.config.learningRate,
                             wordNgrams=self.config.ngrams,
                             verbose=2,
                             minCount=1)
        elif self.config.method == "cbow":
            model.cbow(input=trainingfile,
                       output='model',
                       epoch=self.config.epoch,
                       lr=self.config.learningRate)
        else:
            model.skipgram(input=trainingfile,
                           output='model',
                           epoch=self.config.epoch,
                           lr=self.config.learningRate)
Example #8
def fasttext(text):
    test_data = text.replace('\n', ' ')
    model = FastText('./model_audit.bin')
    test = test_data + '\n'
    # predict_proba_single returns [(label, prob), ...]; take the top label's probability
    pred = model.predict_proba_single(test, k=2)
    out = pred[0][1]
    return out
Example #9
    def predict(self, test_set, test_labels_vector=None, report_accuracy=True):
        """
        uses the trained model to predict the test set
        :param test_set: the test set
        :param test_labels_vector: the labels vector of the test set for accuracy computation
        :param report_accuracy: whether to compute and report prediction accuracy
        """

        if self.model_name:
            from pyfasttext import FastText
            predictor = FastText()
            predictor.load_model('ft_extras/' + self.model_name + '.bin')
            predicted_labels = predictor.predict_proba(test_set)
            if report_accuracy and test_labels_vector:
                test_set_size = len(test_set)
                correct_predictions = 0
                invalid_labels = 0
                for index, labels in enumerate(predicted_labels):
                    if len(labels) != 0:
                        best_label = max(labels, key=lambda label: label[1])
                        if best_label[0] == test_labels_vector[index]:
                            correct_predictions += 1
                    else:
                        invalid_labels += 1
                print('Prediction accuracy:{}\n'.format(correct_predictions / (test_set_size - invalid_labels)))
        else:
            print('Please use the train method to train a model first.')
            return
Example #10
def use_pyfasttext_model():
    # OK
    # Models can be trained with the fasttext command-line tool (../doc/fastText_train.png)
    # or with the pyfasttext package used in this file.
    """
    # OK: 1. Loading a model trained with the pyfasttext package
    model = FastText("../data/lxw_model_sg_pyfasttext.bin")
    print(model["先生"])     # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))    # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])   # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])   # OOV
    print(model.get_numpy_vector("陈贺"))

    model = FastText("../data/lxw_model_cbow_pyfasttext.bin")
    print(model["先生"])
    print(model.get_numpy_vector("先生"))    # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])   # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])   # OOV
    print(model.get_numpy_vector("陈贺"))
    # NOTE: A quick test shows the two different models produce the same vector for a given
    # OOV word (same behavior as the fasttext package; see NO_2_use_fasttext_model for
    # details), while vectors for in-vocabulary words differ.
    """

    # OK: 2. Loading a model trained with the fasttext command-line tool
    model = FastText("../data/880w_fasttext_skip_gram.bin")
    print(model["先生"])  # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))
    # print(model["刘晓伟"])   # OK. OOV
    # print(model["陈贺"])   # OK. OOV

    # Sentence and text vectors.
    sentence_vec = model.get_numpy_sentence_vector("刘晓伟 是 个 好人")
    print(sentence_vec)
    """
Example #11
    def get_vector(self, text, get_type=2):
        '''
        Get word vectors for the tokens of a segmented text.
        :param text: text to segment
        :param get_type: segmentation mode (1 = full, 2 = precise, other = search engine)
        :return: token vectors, np array of shape (n, 300)
        '''
        word_np = []
        if self.model is None:
            # load lazily and cache so repeated calls don't re-read the .bin file
            self.model = FastText(self.fasttext_bin)
        model = self.model
        if get_type == 1:
            seg_list = jieba.cut(text, cut_all=True)  # full mode
        elif get_type == 2:
            seg_list = jieba.cut(text, cut_all=False)  # precise mode
        else:
            seg_list = jieba.cut_for_search(text)  # search-engine mode

        for li in list(seg_list):
            word_np.append(np.array(model[li]))
        if len(word_np) == 0:
            word_np = np.zeros((1, 300))
        else:
            word_np = np.array(word_np)
        return word_np
Example #12
    def test_vector(self):
        model = FastText()

        model.supervised(input='/input/tests/data/text.txt',
                         output='model',
                         epoch=1,
                         lr=0.7)
Example #13
def make_embedding_matrix(word_index, fname):
    model = FastText(os.path.join('embeddings', fname))
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM),
                                dtype='float32')
    for word, i in word_index.items():
        embedding_matrix[i] = model.get_numpy_vector(word, normalized=True)
    return embedding_matrix
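A hypothetical call site for make_embedding_matrix, assuming a Keras-style tokenizer word_index (1-based ids) and the module-level EMBEDDING_DIM the function relies on:

EMBEDDING_DIM = 300  # must match the .bin model's vector size
word_index = {'the': 1, 'cat': 2}  # e.g. tokenizer.word_index
matrix = make_embedding_matrix(word_index, 'wiki.en.bin')  # filename assumed
print(matrix.shape)  # (3, 300); row 0 is left as the zero padding row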
Example #14
def create_predict(HudongItem_csv):
    # read entries from Neo4j
    db = Neo4j()
    db.connectDB()

    predict_List = readCSVbyColumn(HudongItem_csv, 'title')
    file_object = open('vector.txt', 'a')

    model = FastText('wiki.zh.bin')

    count = 0
    vis = set()
    for p in predict_List:
        cur = HudongItem(db.matchHudongItembyTitle(p))
        count += 1
        title = cur.title
        if title in vis:
            continue
        vis.add(title)
        wv_list = model[title]
        line = str(title)
        for val in wv_list:  # don't reuse the outer loop variable `p`
            line += ' ' + str(val)[:7]  # truncate each component to 7 chars
        file_object.write(line + "\n")
        print(str(count) + ' / ' + str(len(predict_List)))

    file_object.close()
Example #15
    def load(self):
        print('Loading fasttext model.')
        self.ftmodel = FastText()
        self.ftmodel.load_model(self.file)
        # probe with a known-vocabulary word to get the embedding dimensionality
        self.vdim = len(self.ftmodel['is'])
        print('Finished loading fasttext model.')
        return self
Example #16
def get_language(text):
    """Given a list of lines, return a list of (line, lang)"""
    if not hasattr(settings, '_lang_detector'):
        lid_model = FastText()
        lid_model.load_model(settings.LID_MODEL_PATH)
        settings._lang_detector = lid_model
    langs = settings._lang_detector.predict([text])
    return langs[0]
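Because the detector is cached on the settings module, only the first call pays the model-loading cost; a usage sketch (output labels are illustrative, the model path comes from settings.LID_MODEL_PATH):

print(get_language('This is clearly English text.'))  # loads the model, e.g. ['en']
print(get_language('Encore une phrase en français.'))  # reuses the cache, e.g. ['fr']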
Example #17
    def load(self):
        try:
            self.ft = FastText(self.filepath)
        except Exception:
            return "Failed to Load FT file"
        logger.info(f"loaded file {self.filepath}")
        self.loaded = True
        return "success"
Example #18
def text():
	model = FastText('wiki.zh.bin')
	print('load over..')
	s1 = '启航'
	s2 = '董启航'
	s3 = '董启文'
	print(model.nearest_neighbors('桃', k=5))
#text()
Example #19
def pyfasttext_sample():
    """https://pypi.org/project/pyfasttext/
    """
    model = FastText()
    # model.load_model('output/model_cooking_6.bin')
    model.load_model('output/model_cooking_5.ftz')
    result = model.predict_file('data/cooking/pre_cooking.valid', 2)
    for i, r in enumerate(result):
        print(i, r)
Example #20
def train_pyfasttext_model():
    # Skipgram model
    model_sg = FastText()
    # equivalent to: `./fasttext skipgram -input ../data/880w_news_title_content_seg_sort_uniq_head_2.txt -output lxw_model_sg_pyfasttext`
    model_sg.skipgram(
        input="../data/880w_news_title_content_seg_sort_uniq_head_2.txt",
        output="../data/lxw_model_sg_pyfasttext")
    # automatically writes ../data/lxw_model_sg_pyfasttext.bin and ../data/lxw_model_sg_pyfasttext.vec
    print(model_sg.words)  # list of words in the dictionary

    # CBOW model
    model_cbow = FastText()
    # equivalent to: `./fasttext cbow -input ../data/880w_news_title_content_seg_sort_uniq_head_2.txt -output lxw_model_cbow_pyfasttext`
    model_cbow.cbow(
        input="../data/880w_news_title_content_seg_sort_uniq_head_2.txt",
        output="../data/lxw_model_cbow_pyfasttext")
    # automatically writes ../data/lxw_model_cbow_pyfasttext.bin and ../data/lxw_model_cbow_pyfasttext.vec
    print(model_cbow.words)  # list of words in the dictionary
    print(type(model_cbow.words))  # <class 'list'>
Example #21
    def load_fasttext(self, iword_dict):

        embed_dict = {}
        print(self.fasttext_path)
        model = FastText(self.fasttext_path)
        for word, key in iword_dict.items():
            embed_dict[key] = model[word]
            # print(embed_dict[key])
        print('Embeddings loaded: %d' % len(embed_dict))
        return embed_dict
Example #22
def build_w2v(relevant_tokens, model_file='wiki.cy.bin'):
    # using this library because it's more memory friendly for python :)
    from pyfasttext import FastText
    model = FastText(model_file)

    w2v = {}
    for token in relevant_tokens:
        vec = model.get_numpy_vector(token)
        w2v[token] = vec
    return w2v
Example #23
def load_wv(path, filter_model_path=None, kind=('gensim', 'srcd', 'fasttext')):
    if isinstance(kind, str):
        return load_wv(path, [kind])

    # First, load the filtering model, if any. (FastText not supported atm.)
    to_try = kind[0]
    filter_wv = None
    if filter_model_path:
        filter_wv = load_wv(filter_model_path, kind=['gensim', 'srcd'])

    # Alright, now let's load the model, taking the filter into account.
    if to_try == 'gensim':
        try:
            if filter_wv and to_try != "fasttext":
                raise ValueError(
                    f"`filter_model_path` not supported with {to_try}")
            return gensim.models.Word2Vec.load(path).wv
        except Exception:
            if len(kind) > 1:
                return load_wv(path,
                               filter_model_path=filter_model_path,
                               kind=kind[1:])
            raise
    elif to_try == 'srcd':
        try:
            if filter_wv and to_try != "fasttext":
                raise ValueError(
                    f"`filter_model_path` not supported with {to_try}")
            with open(path, "rb") as fin:
                words, _, embeddings = pickle.load(fin)
            return build_gensim_wv(words, embeddings)
        except Exception:
            if len(kind) > 1:
                return load_wv(path,
                               filter_model_path=filter_model_path,
                               kind=kind[1:])
            raise
    elif to_try == 'fasttext':
        try:
            fasttext_wv = FTWrapper(FastText(path))
            if not filter_wv:
                return fasttext_wv
            words = filter_wv.index2word
            embeddings = [np.array(fasttext_wv[w]) for w in words]
            return build_gensim_wv(words, embeddings)
        except Exception:
            if len(kind) > 1:
                return load_wv(path,
                               filter_model_path=filter_model_path,
                               kind=kind[1:])
            raise
    else:
        raise ValueError(f"unknown kind {to_try}")
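FTWrapper is not defined in this snippet; a minimal sketch, assuming it only has to adapt a pyfasttext model to the gensim-like surface used above (__getitem__ plus index2word):

class FTWrapper:
    """Hypothetical adapter: gensim-like access over a pyfasttext model."""

    def __init__(self, ft_model):
        self.ft = ft_model
        self.index2word = ft_model.words  # vocabulary, gensim-style attribute

    def __getitem__(self, word):
        # numpy output so callers can np.array()/slice it directly
        return self.ft.get_numpy_vector(word)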
Example #24
def train():
    train_file = const.train_processed_binary_file_name
    validate_file = const.validate_processed_binary_file_name

    current_best_score = 0
    current_best_name = ''
    lr = 0.01

    for epoch_i in range(1, 30):
        start_time = datetime.datetime.now().replace(microsecond=0)

        model_file_name = 'data/model_' + str(lr) + '_' + str(epoch_i)

        model = FastText()
        model.supervised(input=train_file,
                         output=model_file_name,
                         lr=lr,
                         epoch=epoch_i,
                         loss='softmax',
                         wordNgrams=3,
                         thread=12,
                         ws=5,
                         minn=2,
                         maxn=4,
                         dim=50)

        micro_precision, micro_recall, micro_f1, macro_precision, macro_recall, macro_f1 = validate_model(
            model_file_name, validate_file)

        end_time = datetime.datetime.now().replace(microsecond=0)

        result_log = ("epoch:" + str(epoch_i) + ': micro precision:' +
                      str(round(micro_precision, 4)) + ', micro_recall:' +
                      str(round(micro_recall, 4)) + ', micro_f1:' +
                      str(round(micro_f1, 4)) + ', macro_precision:' +
                      str(round(macro_precision, 4)) + ', macro_recall:' +
                      str(round(macro_recall, 4)) + ', macro_f1:' +
                      str(round(macro_f1, 4)) + ', lr:' + str(lr) +
                      ', duration:' + str(end_time - start_time))

        if current_best_score < micro_f1:
            current_best_score = micro_f1
            print(result_log + ' ====> Model improved!!!!')
            if current_best_name != '':
                os.remove(current_best_name)
            current_best_name = model_file_name + '.bin'

        else:
            print(result_log)
            os.remove(model_file_name + '.bin')
            os.remove(model_file_name + '.vec')

        sys.stdout.flush()
Example #25
def text():
	model = FastText('wiki.zh.bin')
	print('load over..')
	s1 = '水果是指多汁且有甜味的植物果实,不但含有丰富的营养且能够帮助消化。水果是对部分可以食用的植物果实和种子的统称。水果有降血压、减缓衰老、减肥瘦身、皮肤保养、明目、抗癌、降低胆固醇等保健作用。一般的水果都是生食,不经过加工,洗干净就直接吃了,这样维生素很少损失,弥补了蔬菜的不足。'
	s2 = '在全球层面上,亚投行建立的主要背景是新兴大国的异军突起。'
	s3 = '亚洲基础设施投资银行Asian Infrastructure Investment Bank ,简称亚投行,AIIB是一个政府间性质的亚洲区域多边开发机构。重点支持基础设施建设,成立宗旨是为了促进亚洲区域的建设互联互通化和经济一体化的进程,并且加强中国及其他亚洲国家和地区的合作,是首个由中国倡议设立的多边金融机构,总部设在北京,法定资本1000亿美元。截至2017年10月,亚投行有70个正式成员国。2013年10月2日,习近平主席提出筹建倡议,2014年10月24日,包括中国、印度、新加坡等在内21个首批意向创始成员国的财长和授权代表在北京签约,共同决定成立投行。2015年12月25日,亚洲基础设施投资银行正式成立。2016年1月16日至18日,亚投行开业仪式暨理事会和董事会成立大会在北京举行。亚投行的治理结构分理事会、董事会、管理层三层。理事会是最高决策机构,每个成员在亚投行有正副理事各一名。董事会有12名董事,其中域内9名,域外3名。管理层由行长和5位副行长组成。'
	
	s1 = s1[:100]
	s2 = s2[:100]
	print(s2)
	s3 = s3[:100]
	print(model.similarity(s1, s2))
	print(model.similarity(s3, s2))
Example #26
def embeddings(data):

    model = FastText('skipModelNilc.bin')

    word_vec = []
    for token in data:
        vec = model[token].tolist()
        word_vec.append(vec)

    print(word_vec)

    print("Word Embeddings concluido")

    return word_vec
Example #27
def init():
    global processtext
    processtext = ProcessText()
    
    global labels_list
    with open("both_labels.pkl", "rb") as f:
        labels_list = pickle.load(f)
    
    global contcmp
    contcmp = ContCmp("root_feature_file.allid")
    #loadModel()
    
    global fasttext_model
    fasttext_model = FastText()
    fasttext_model.load_model('3Ngram_3mincount_1wminlabel.bin')
Example #28
def main():
    model = FastText('model_text8.bin')

    target_word = 'dog'

    # get embedding
    target_word_embedding = model.get_numpy_vector(target_word)
    print('Target word:', target_word)
    print('Embedding shape:', target_word_embedding.shape)
    print('Embedding:', target_word_embedding[0:10], '...')

    # find closest words
    closest_words = model.nearest_neighbors(target_word, k=15)
    for word, similarity in closest_words:
        print('Word:', word, 'similarity:', similarity)
Example #29
def print_subwords(fname):
    model = FastText(fname)
    maxn = model.args['maxn']
    res = {}

    for word in model.words:
        for subword, arr in zip(model.get_subwords(word),
                                model.get_numpy_subword_vectors(word)):
            # skip the entry for the full word itself (anything longer than
            # maxn is the word, not a real character n-gram)
            if len(subword) > maxn:
                continue

            res[subword] = arr

    for key in sorted(res.keys()):
        print('{} {}'.format(key, ' '.join(str(val) for val in res[key])))
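pyfasttext's get_subwords(word) includes the full word itself along with its character n-grams, which is why the length check above filters it out; a quick standalone probe (the model path below is an assumption):

from pyfasttext import FastText

model = FastText('model.bin')  # assumed path to any trained .bin model
print(model.args['minn'], model.args['maxn'])  # n-gram length bounds
print(model.get_subwords('where'))  # the full word plus its n-grams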
Example #30
    def make_embeddings_simple_in_memory(self,
                                         name="fasttext-crawl",
                                         hasHeader=True):
        nbWords = 0
        print('loading embeddings...')
        begin = True
        description = self._get_description(name)
        if description is not None:
            embeddings_path = description["path"]
            embeddings_type = description["type"]
            self.lang = description["lang"]
            print("path:", embeddings_path)
            if self.extension == 'bin':
                self.model = FastText(embeddings_path)
                nbWords = self.model.nwords
                self.embed_size = 300
            else:
                if embeddings_type == "glove":
                    hasHeader = False
                with open(embeddings_path) as f:
                    for line in f:
                        line = line.strip()
                        line = line.split(' ')
                        if begin:
                            if hasHeader:
                                # the first line gives the number of words and the embedding size
                                nbWords = int(line[0])
                                self.embed_size = int(line[1].replace("\n", ""))
                                begin = False
                                continue
                            else:
                                begin = False
                        word = line[0]
                        #if embeddings_type == 'glove':
                        vector = np.array(
                            [float(val) for val in line[1:len(line)]],
                            dtype='float32')
                        #else:
                        #    vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32')
                        if self.embed_size == 0:
                            self.embed_size = len(vector)
                        self.model[word] = vector
                if nbWords == 0:
                    nbWords = len(self.model)
        print('embeddings loaded for', nbWords, "words and",
              self.embed_size, "dimensions")