Example 1
def train():
    config = CONFIG()
    print('Loading word2id ===========================')
    word2id = load_word2id(config.word2id_file)
    config.vocab_size = len(word2id)
    print('Loading word2vec ==========================')
    word2vec = load_corpus_word2vec(config.corpus_w2v_file)
    print('Loading train corpus ========================')
    train = load_corpus(config.train_file,
                        word2id,
                        max_sen_len=config.max_sen_len)
    x_tr = train[:-1]
    y_tr = train[-1]
    print('Loading test corpus ==========================')
    test = load_corpus(config.test_file,
                       word2id,
                       max_sen_len=config.max_sen_len)
    x_te = test[:-1]
    y_te = test[-1]
    print('Training model ===============================')
    lstm = LSTM(config, embeddings=word2vec)
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        lstm.fit(sess, x_tr, y_tr, x_te, y_te, config.save_dir,
                 config.print_per_batch)
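For the slicing in this example to work, the load_corpus variant used here has to return a sequence whose last element is the label array. A minimal sketch of that assumed shape (array sizes and dtypes are illustrative, not taken from the snippet):

import numpy as np

# Hypothetical return value of load_corpus as used above: everything but
# the last element is input data, the last element is the label matrix.
train = (
    np.zeros((1000, 75), dtype=np.int64),    # padded word-id sequences
    np.zeros((1000, 2), dtype=np.float32),   # one-hot labels
)
x_tr = train[:-1]   # tuple of input arrays
y_tr = train[-1]    # label array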
Example 2
def train():
    with open('train_config.json') as train_config_file:
        train_config = json.load(train_config_file)
    train_data_path = train_config['train_data_path']
    test_data_path = train_config['test_data_path']
    vocab_path = train_config['vocab_path']

    train_input_data, train_input_label = load_corpus(
        file_path=train_data_path, make_vocab=True, vocab_path=vocab_path)
    val_input_data, val_input_label = load_corpus(file_path=test_data_path,
                                                  make_vocab=False)

    vocab = Vocabulary(vocab_path)

    model = Spacing(vocab_len=len(vocab))

    print(model)

    trainer = Trainer(model=model,
                      vocab=vocab,
                      train_data=train_input_data,
                      train_label=train_input_label,
                      val_data=val_input_data,
                      val_label=val_input_label,
                      config=train_config)
    trainer.train(total_epoch=10, validation_epoch=1)
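The only keys this snippet reads from train_config.json are train_data_path, test_data_path, and vocab_path; the Trainer presumably consumes further keys from the same dict. A minimal sketch of producing a config file with just those assumed keys (paths are placeholders):

import json

# Minimal train_config.json covering only the keys read above; the real
# Trainer config very likely needs additional entries.
minimal_config = {
    "train_data_path": "data/train.txt",
    "test_data_path": "data/test.txt",
    "vocab_path": "data/vocab.txt",
}
with open("train_config.json", "w") as f:
    json.dump(minimal_config, f, indent=2)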
Example 3
 def __init__(self):
     """
     Initial config
     :param bi_grams_path: path to bi-grams set
     :param tri_grams_path: path to tri-grams set
     """
     self.bi_grams = utils.load_corpus('bi')
     self.tri_grams = utils.load_corpus('tri')
Example 4
def init():
    print 'Loading askubuntu training samples..'
    askubuntu_training_samples = utils.load_samples(
        '../data/askubuntu/train_random.txt')
    print len(askubuntu_training_samples)

    print 'Loading askubuntu dev samples..'
    askubuntu_dev_samples = utils.load_samples('../data/askubuntu/dev.txt')
    print len(askubuntu_dev_samples)

    print 'Loading askubuntu test samples..'
    askubuntu_test_samples = utils.load_samples('../data/askubuntu/test.txt')
    print len(askubuntu_test_samples)

    print 'Loading askubuntu corpus..'
    askubuntu_question_map = utils.load_corpus(
        '../data/askubuntu/text_tokenized.txt')
    print len(askubuntu_question_map)

    print 'Loading android dev samples..'
    android_dev_samples = utils.load_samples_stupid_format(
        '../data/android/dev.pos.txt', '../data/android/dev.neg.txt')
    print len(android_dev_samples)

    print 'Loading android test samples..'
    android_test_samples = utils.load_samples_stupid_format(
        '../data/android/test.pos.txt', '../data/android/test.neg.txt')
    print len(android_test_samples)

    print 'Loading android corpus..'
    android_question_map = utils.load_corpus('../data/android/corpus.tsv')
    print len(android_question_map)
    
    print 'Loading stop words..'
    stop_words = utils.load_stop_words('../data/english_stop_words.txt')
    print len(stop_words)

    corpus_texts = map(lambda (t, b): t + ' ' + b,
                       askubuntu_question_map.values() + android_question_map.values())
    
    print 'Loading embeddings..'
    embedding_map = utils.load_embeddings(
        '../data/pruned_android_vector.txt', corpus_texts, stop_words)  # pruned_askubuntu_android_vector.txt
    print len(embedding_map)
    print

    utils.store_embedding_map(embedding_map)

    return (
        askubuntu_training_samples,
        askubuntu_dev_samples,
        askubuntu_test_samples,
        askubuntu_question_map,
        android_dev_samples,
        android_test_samples,
        android_question_map,
        embedding_map)
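Note that this example is Python 2: the bare print statements and the tuple-unpacking lambda in the corpus_texts line are rejected by Python 3. A rough Python 3 equivalent of that one line, assuming each question map stores (title, body) tuples as values:

# Python 3 sketch of the corpus_texts construction above, assuming the
# question maps hold (title, body) tuples.
corpus_texts = [
    title + ' ' + body
    for title, body in list(askubuntu_question_map.values())
    + list(android_question_map.values())
]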
Example 5
 def __init__(self,
              config_root_path="",
              bi_grams_path='bi_grams.txt',
              tri_grams_path='tri_grams.txt',
              crf_config_path='crf_config.json',
              features_path='crf_features.txt',
              model_path='tokenizer.crfsuite',
              load_data_f_file=load_data_from_file,
              base_lib='sklearn_crfsuite'):
     """
     Initial config
     :param config_root_path: path to directory where you put config files such as bi_grams.txt, tri_grams.txt, ...
     :param bi_grams_path: path to bi-grams set
     :param tri_grams_path: path to tri-grams set
     :param crf_config_path: path to crf model config file
     :param features_path: path to feature config file
     :param model_path: path to save or load model to/from file
     :param load_data_f_file: method using to load data from file to return sentences and labels
     :param base_lib: library to use for CRF algorithm, default: sklearn_crfsuite, other choices are pycrfsuite
     """
     self.bi_grams = load_corpus("bi")
     self.tri_grams = load_corpus("tri")
     self.crf_config = load_config(config_root_path + crf_config_path)
     self.features_cfg_arr = load_crf_config(config_root_path +
                                             features_path)
     self.center_id = int((len(self.features_cfg_arr) - 1) / 2)
     self.function_dict = {
         'bias':
         lambda word, *args: 1.0,
         'word.lower()':
         lambda word, *args: word.lower(),
         'word.isupper()':
         lambda word, *args: word.isupper(),
         'word.istitle()':
         lambda word, *args: word.istitle(),
         'word.isdigit()':
         lambda word, *args: word.isdigit(),
         'word.bi_gram()':
         lambda word, word1, relative_id, *args: self._check_bi_gram(
             [word, word1], relative_id),
         'word.tri_gram()':
         lambda word, word1, word2, relative_id, *args: self.
         _check_tri_gram([word, word1, word2], relative_id)
     }
     self.model_path = model_path
     self.load_data_from_file = load_data_f_file
     self.tagger = None
     self.base_lib = base_lib
Example 6
def main(gpu, path_model, path_corpus, path_config, path_target, path_program,
         path_json, path_img, win_size, path_word2vec):
    MAX_LENGTH = 70

    config = utils.Config(path_config)
    word_dim = config.getint("word_dim")
    state_dim = config.getint("state_dim")
    batch_size = config.getint("batch_size")

    print "[info] CONFIG: %s" % path_config
    print "[info] PRE-TRAINED WORD EMBEDDINGS: %s" % path_word2vec
    print "[info] LOADED MODEL: %s" % path_model
    print "[info] WORD DIM: %d" % word_dim
    print "[info] STATE DIM: %d" % state_dim
    print "[info] BATCH SIZE: %d" % batch_size

    sents_train, sents_val, vocab, ivocab = \
            utils.load_corpus(path_corpus=path_corpus, max_length=MAX_LENGTH)

    cuda.get_device(gpu).use()

    model = utils.load_cxt_model(path_model, path_config, vocab)
    model.to_gpu(gpu)

    sents = parse(vocab, path_target)
    probs = inspect(model, sents)

    words, probs = aggregate(sents, probs, vocab, ivocab, win_size)

    tokens = [ivocab[w] for w in words]

    prob_dist, grid_text = collate(tokens, probs, path_program)

    generate_json(prob_dist, 0.05, path_program, path_json)
    draw_heatmap(np.array(prob_dist), grid_text, path_img)
Example 7
def main(args):
    sentences, index2word = utils.load_corpus(corpus_file=args.corpus)
    vocab_size = len(index2word)

    # create input
    couples, labels = utils.skip_grams(sentences, args.window_size, vocab_size)
    print('Shape of couples: ' + str(couples.shape))
    print('Shape of labels: ' + str(labels.shape))

    # metrics
    nb_batch = len(labels) // args.batch_size
    samples_per_epoch = args.batch_size * nb_batch

    model = build_model(vocab_size, args.vec_dim, args.batch_size)

    if (args.multi_gpu):
        model = multi_gpu_model(model)

    opt = RMSprop(lr=5e-4, decay=5e-6)
    checkpoint = ModelCheckpoint(os.path.join(args.ckpt_path,
                                              'Word2Vec_{epoch:03d}.h5'),
                                 period=args.ckpt_period,
                                 save_weights_only=True)
    early_stop = EarlyStopping(monitor='loss', patience=10)

    model.compile(optimizer=opt, loss='mse', metrics=['accuracy'])
    model.fit_generator(generator=batch_generator(couples, labels,
                                                  args.batch_size, nb_batch),
                        steps_per_epoch=samples_per_epoch,
                        epochs=args.epochs,
                        callbacks=[checkpoint, early_stop],
                        verbose=1)

    # save weights
    utils.save_weights(model, index2word, args.vec_dim)
Example 8
def init():
    print 'Loading training samples..'
    training_samples = utils.load_samples('../data/askubuntu/train_random.txt')
    print len(training_samples)

    print 'Loading dev samples..'
    dev_samples = utils.load_samples('../data/askubuntu/dev.txt')
    print len(dev_samples)

    print 'Loading test samples..'
    test_samples = utils.load_samples('../data/askubuntu/test.txt')
    print len(test_samples)

    print 'Loading corpus..'
    question_map = utils.load_corpus('../data/askubuntu/text_tokenized.txt')
    print len(question_map)

    print 'Loading stop words..'
    stop_words = utils.load_stop_words('../data/english_stop_words.txt')
    print len(stop_words)

    corpus_texts = map(lambda (t, b): t + ' ' + b, question_map.values())

    print 'Loading embeddings..'
    embedding_map = utils.load_embeddings(
        '../data/pruned_askubuntu_android_vector.txt', corpus_texts,
        stop_words)
    print len(embedding_map)
    print

    utils.store_embedding_map(embedding_map)

    return (training_samples, dev_samples, test_samples, question_map,
            embedding_map)
Example 9
def test():
    config = CONFIG()
    print('Loading word2id ===========================')
    word2id = load_word2id(config.word2id_path)
    config.vocab_size = len(word2id)
    print('Loading test corpus =========================')
    x, y = load_corpus(config.test_path, word2id, max_sen_len=config.max_sen_len)
    # x, y = x[:10], y[:10]
    model = TextCNN(config)
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(config.save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        yhat = model.predict(sess, x)

    cat, cat2id = cat_to_id()
    y_cls = np.argmax(y, 1)
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_cls, yhat, target_names=cat))
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_cls, yhat)
    print(cm)
Example 10
def train():
    config = CONFIG()
    print('Loading word2id ===========================')
    word2id = load_word2id(config.word2id_path)
    print('Loading word2vec ==========================')
    word2vec = load_corpus_word2vec(config.corpus_word2vec_path)
    print('Loading train corpus ========================')
    x_tr, y_tr = load_corpus(config.train_path, word2id, max_sen_len=config.max_sen_len)
    print('Loading dev corpus ==========================')
    x_val, y_val = load_corpus(config.dev_path, word2id, max_sen_len=config.max_sen_len)
    print('Training model ===============================')
    tc = TextCNN(config, embeddings=word2vec)
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        tc.fit(sess, x_tr, y_tr, x_val, y_val, config.save_dir, config.print_per_batch)
Example 11
    def generate_tweet(self):
        corpus = load_corpus('tweets.txt')
        if corpus:
            tweet = self.markov(corpus)
            return tweet

        return None
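The generation itself is delegated to self.markov. A minimal sketch of the kind of bigram Markov generator such a method might wrap, assuming load_corpus('tweets.txt') yields an iterable of tweet strings (every name and detail here is illustrative, not the project's actual implementation):

import random
from collections import defaultdict

def markov_sketch(corpus_lines, max_words=20):
    # Build a bigram table: word -> list of observed next words.
    transitions = defaultdict(list)
    for line in corpus_lines:
        words = line.split()
        for current_word, next_word in zip(words, words[1:]):
            transitions[current_word].append(next_word)
    if not transitions:
        return None
    # Walk the chain from a random start word until it dead-ends.
    word = random.choice(list(transitions.keys()))
    output = [word]
    for _ in range(max_words - 1):
        followers = transitions.get(word)
        if not followers:
            break
        word = random.choice(followers)
        output.append(word)
    return ' '.join(output)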
Example 12
def train_model(nameCorpus='cookbook',
                epochs=10,
                vector_size=300,
                min_count=1,
                window=10):
    global w2v

    try:
        print('loading corpus: {0}...\n'.format(nameCorpus))
        corpus = utils.load_corpus(nameCorpus)
        print('corpus already loaded\n')
    except NameError:
        print('the corpus {0} is not available'.format(nameCorpus))
        return

    corpus = list(corpus)
    if not corpus:
        print('The corpus is empty. There is nothing to train.')
        return

    w2v = Word2vec(vector_size, min_count, window)
    print('training... please wait\n')
    w2v.train(corpus, epochs=epochs)
    print('The model with {0} corpus is trained'.format(nameCorpus))
    return w2v.model.wv
Example 13
 def _load_vocab(self, side: str) -> List[str]:
     vocab_path = self.model_dir / f"{side}.vcb"
     vocab: List[str] = ["NULL", "UNK"]
     for line in load_corpus(vocab_path):
         index_str, word, _ = line.split()
         assert int(index_str) == len(vocab)
         vocab.append(word)
     return vocab
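Each line of the .vcb file is expected to hold an index, a word, and one more field (in GIZA-style vocabulary files this is usually a frequency count), with indices starting at 2 because positions 0 and 1 are reserved for NULL and UNK. A small self-contained check of that assumed format:

# Hypothetical contents of a src.vcb / trg.vcb file that this loader accepts.
sample_vcb = """2 the 10542
3 house 873
4 green 412"""
vocab = ["NULL", "UNK"]
for line in sample_vcb.splitlines():
    index_str, word, _ = line.split()
    assert int(index_str) == len(vocab)
    vocab.append(word)
print(vocab)  # ['NULL', 'UNK', 'the', 'house', 'green']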
Example 14
def test(model, config):
    max_sentences = config.get("max_sentences", 1e9)
    max_tokens = config.get("max_tokens", 1e9)

    corpus_prefix = Path(config['corpus_prefix']) / "subword"
    model_path = corpus_prefix / "spm.model"
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(str(model_path))
    test_src = load_corpus(corpus_prefix / Path(config["test_source"]).name,
                           tokenizer)
    num_test_sents = len(test_src)

    eos_id = tokenizer.eos_id()
    test_ids = list(range(num_test_sents))

    test_itr = create_batch_itr(test_src,
                                max_tokens=max_tokens,
                                max_sentences=max_sentences,
                                shuffle=False)
    test_itr = tqdm(test_itr, desc='test')
    for batch_ids in test_itr:
        src_batch = make_batch(test_src, batch_ids, eos_id)
        src_mask = padding_mask(src_batch, eos_id)
        src_encode = model.encode(src_batch, src_mask, train=False)

        trg_ids = [np.array([tokenizer.PieceToId('<s>')] * len(batch_ids))]
        eos_ids = np.array([eos_id] * len(batch_ids))
        while (trg_ids[-1] != eos_ids).any():
            if len(trg_ids) > config['generation_limit']:
                print("Warning: Sentence generation did not finish in",
                      config['generation_limit'],
                      "iterations.",
                      file=sys.stderr)
                trg_ids.append(eos_ids)
                break

            trg_mask = [
                subsequent_mask(len(trg_ids))
                for _ in padding_mask(trg_ids, eos_id)
            ]
            out = model.decode(src_encode,
                               trg_ids,
                               src_mask,
                               trg_mask,
                               train=False)
            y = TF.pick(out, [out.shape()[0] - 1], 0)
            y = np.array(y.argmax(1))
            trg_ids.append(y)

        hyp = [
            hyp_sent[:np.where(hyp_sent == eos_id)[0][0]]
            for hyp_sent in np.array(trg_ids).T
        ]
        for ids in hyp:
            sent = tokenizer.DecodeIds(ids.tolist())
            print(sent)
Example 15
def run(args):
    (
        adj,
        features,
        y_train,
        y_val,
        y_test,
        train_mask,
        val_mask,
        test_mask,
        train_size,
        test_size,
    ) = load_corpus(args.select_data)

    train_mask = train_mask + val_mask
    y_train = y_train + y_val

    adj_dense = preprocess_adj(adj).toarray().astype(np.float32)
    features_dense = preprocess_features(features).toarray().astype(np.float32)

    y_train = y_train.astype(np.float32)
    y_test = y_test.astype(np.float32)
    train_mask = train_mask.astype(np.float32)
    test_mask = test_mask.astype(np.float32)

    gcn_model = GCN(
        tf.convert_to_tensor(adj_dense),
        layers=args.layers,
        hidden_size=args.hidden_size,
        dropout=args.dropout,
    )

    loss_fn = masked_softmax_cross_entropy

    # acc_fn = masked_accuracy

    optimizer = Adam(learning_rate=args.lr)
    # print("Model Layers: ", gcn_model.trainable_variables)
    model_textGCN = TextGCN(model=gcn_model,
                            loss=loss_fn,
                            optimizer=optimizer,
                            args=args)

    model_textGCN.train(features_dense, y_train, train_mask)

    sns.distplot(model_textGCN.train_accuracy)
    plt.savefig("train_acc.png")

    plt.clf()

    sns.distplot(model_textGCN.train_losses)
    plt.savefig("train_losses.png")

    eval_result = model_textGCN.evaluate(features_dense, y_test, test_mask)

    print(f"Final Evaluation Result: {eval_result}")
Example 16
def calculate_class_score(sentence, class_name):
    score = 0
    sentence = normalize(sentence)
    sentence = remove_stopwords(sentence)
    sentence = stemming(sentence)
    dados = load_corpus()
    for word in sentence:
        if word in dados[class_name]:
            score += dados[class_name][word]
    return score
Example 17
def calculate_score(sentence):
    high_score = 0
    classname = 'default'
    dados = load_corpus()
    for classe in dados.keys():
        pontos = 0
        pontos = calculate_class_score(sentence, classe)
        if pontos > high_score:
            high_score = pontos
            classname = classe
    return {'classname': classname, 'high_score': high_score}
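Taken together, the two functions above assume load_corpus() returns a dict of the form {class_name: {word: count}} and pick the class whose word counts best match the input sentence. A minimal hedged usage sketch (the sentence is a placeholder):

# Hypothetical call once a corpus has been trained and saved.
result = calculate_score('example sentence to classify')
print(result['classname'], result['high_score'])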
Example 18
def main():

    net = SkipGramNetwork(hp.VOCAB_SIZE, hp.EMBED_SIZE).to(device)
    print(net)

    if args.restore:
        net.load_state_dict(torch.load(args.restore))
        vocab, inverse_vocab = utils.load_data(args.restore)
        print("Model restored from disk.")
    else:
        sentences = utils.load_corpus(args.corpus)
        word_freqs = utils.word_counts(sentences)
        sentences, word_freqs = utils.trunc_vocab(sentences,
                                                  word_freqs)  # TODO
        sentences = utils.subsample(sentences, word_freqs)

        vocab, inverse_vocab = utils.construct_vocab(sentences)  # TODO
        skipgrams = skip_grams(sentences, vocab)  # TODO
        utils.save_data(args.save, vocab, inverse_vocab)

        loader = DataLoader(skipgrams, batch_size=hp.BATCH_SIZE, shuffle=True)
        loss_hist = train(
            net,
            loader)  # TODO returns loss function evaluations as python list
        """ You can plot loss_hist for your writeup:
            plt.plot(loss_hist)
            plt.show()
        """
        plt.plot(loss_hist)
        plt.show()

    # the weights of the embedding matrix are the lookup table
    lookup_table = net.embeddings.weight.data.cpu().numpy()
    """ TODO: Implement what you need in order to answer the writeup questions. """

    nearest = most_similar(lookup_table, lookup_table[vocab['journeyed']])
    nearest_words = [inverse_vocab[w] for w in nearest if w in inverse_vocab]
    print('Nearest to {0}: {1}'.format('journeyed', nearest_words))

    #    print('Dimension Reduction and Plotting')
    #    reduced = TSNE().fit_transform(lookup_table)
    #    plt.scatter(reduced[:,0], reduced[:,1])
    #    plt.show()

    tsne = TSNE(perplexity=30,
                n_components=2,
                init='pca',
                n_iter=5000,
                method='exact')
    plot_only = 500
    low_dim_embs = tsne.fit_transform(lookup_table[:plot_only, :])
    labels = [inverse_vocab[i] for i in range(plot_only)]
    plot_with_labels(low_dim_embs, labels, 'tsne.png')
Example 19
    def test_PWID(self):
        """
        Simply run through a corpus to make sure that it is loaded and analyzed
        :return:
        """

        text = load_corpus(self.passwords[0])
        pwid = PWID(ultra_verbose=True, fast=False)

        matches, score = pwid.identify_passwords(text)

        assert len(matches) > 0
Example 20
    def test_PWID_no_passwords(self):

        pwid = PWID(ultra_verbose=True, fast=False)

        for file_path in self.no_passwords:

            logger.info('Executing on %s' % file_path)

            text = load_corpus(file_path)
            matches, score = pwid.identify_passwords(text)

            print score, matches
            assert score <= 0
Example 21
 def _load_distortion_table(self, align_model: str) -> Dict[Tuple[int, int], Dict[int, float]]:
     table: Dict[Tuple[int, int], Dict[int, float]] = {}
     for line in load_corpus(self.model_dir / f"src_trg_{align_model}.d3.final"):
         fields = line.split(maxsplit=5)
         j = int(fields[0])
         i = int(fields[1])
         tlen = int(fields[3])
         prob = float(fields[4])
         key = (i, tlen)
         probs = table.get(key)
         if probs is None:
             probs = {}
             table[key] = probs
         probs[j] = prob
     return table
Example 22
 def _load_head_distortion_table(self, align_model: str) -> Dict[Tuple[int, int], Dict[int, float]]:
     table: Dict[Tuple[int, int], Dict[int, float]] = {}
     for line in load_corpus(self.model_dir / f"src_trg_{align_model}.d4.final"):
         fields = line.split()
         trg_word_class = int(fields[3])
         src_word_class = int(fields[4])
         key = (src_word_class, trg_word_class)
         probs = table.get(key)
         if probs is None:
             probs = {}
             table[key] = probs
         for index, prob_str in enumerate(fields[9:]):
             if prob_str != "0":
                 dj = index - MAX_SENT_LENGTH
                 probs[dj] = float(prob_str)
     return table
Example 23
def main():
    """
    Task: Transform a corpus of text into word vectors according to this context-window principle.
    1. Load the data - this is done for you.
    2. Construct a vocabulary across the entire corpus. This should map a string (word) to id.
    3. Use the vocabulary (as a word-to-id mapping) and corpus to construct the sparse word vectors.
    """
    sentences = utils.load_corpus(args.corpus)
    # print (len(sentences))

    vocab_full, inverse_vocab_full = utils.construct_vocab(sentences)
    # print (vocab_full)
    # print (inverse_vocab_full)

    counts = utils.word_counts(sentences)
    new_corpus, new_counts = utils.trunc_vocab(sentences, counts)

    # print (len(new_corpus))
    # print ("**********************************************")
    # print (new_counts)

    global inverse_vocab_truncated
    vocab_truncated = {}
    inverse_vocab_truncated = {}

    for word in new_counts:
        vocab_truncated[word] = vocab_full[word]
        inverse_vocab_truncated[vocab_full[word]] = word

    print(vocab_truncated)
    # print (inverse_vocab_truncated)
    favorite = input("Enter Favorite Word\n")
    global idtoindex
    idtoindex = {}

    i = 0

    for word_id in inverse_vocab_truncated:
        idtoindex[word_id] = i
        i += 1

    # print (idtoindex)
    lookup_table = word_vectors(new_corpus, vocab_truncated)

    most_similar(lookup_table, lookup_table[vocab_truncated[favorite]])
    # print (lookup_table)
    """ TODO: Implement what you need to answer the writeup questions. """
Example 24
File: train.py Project: luizfan/nlp
def learning(training_data):
    corpus_words = load_corpus()
    for data in training_data:
        phrase = data['phrase']
        phrase = normalize(phrase)
        phrase = remove_stopwords(phrase)
        phrase = stemming(phrase)

        class_name = data['class']
        if class_name not in list(corpus_words.keys()):
            corpus_words[class_name] = {}
        for word in phrase:
            if word not in list(corpus_words[class_name].keys()):
                corpus_words[class_name][word] = 1
            else:
                corpus_words[class_name][word] += 1
    return corpus_words
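Judging by the lookups, learning() expects training_data to be a list of dicts with 'phrase' and 'class' keys and returns per-class word counts layered on top of whatever load_corpus() already holds. A minimal usage sketch with placeholder data:

# Hypothetical training set; class names and phrases are placeholders.
training_data = [
    {'phrase': 'good morning everyone', 'class': 'greeting'},
    {'phrase': 'see you later', 'class': 'farewell'},
]
corpus_words = learning(training_data)
print(corpus_words)  # e.g. {'greeting': {'good': 1, ...}, 'farewell': {...}}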
Example 25
 def _load_nonhead_distortion_table(self, align_model: str) -> Dict[int, Dict[int, float]]:
     table: Dict[int, Dict[int, float]] = {}
     ext = "db4" if platform.system() == "Windows" else "D4"
     is_key_line = True
     for line in load_corpus(self.model_dir / f"src_trg_{align_model}.{ext}.final"):
         fields = line.split()
         if is_key_line:
             trg_word_class = int(fields[3])
         else:
             probs = table.get(trg_word_class)
             if probs is None:
                 probs = {}
                 table[trg_word_class] = probs
             for index, prob_str in enumerate(fields):
                 if prob_str != "0":
                     dj = index - MAX_SENT_LENGTH
                     probs[dj] = float(prob_str)
         is_key_line = not is_key_line
     return table
Example 26
 def _load_lexicon(
     self,
     src_vocab: List[str],
     trg_vocab: List[str],
     align_model: str,
     threshold: float = 0.0,
     include_special_tokens: bool = False,
 ) -> Lexicon:
     lexicon = Lexicon()
     model_path = self.model_dir / f"src_trg_{align_model}.t{self.file_suffix}"
     for line in load_corpus(model_path):
         src_index_str, trg_index_str, prob_str = line.split(maxsplit=3)
         src_index = int(src_index_str)
         trg_index = int(trg_index_str)
         if include_special_tokens or (src_index > 1 and trg_index > 1):
             src_word = src_vocab[src_index]
             trg_word = trg_vocab[trg_index]
             prob = float(prob_str)
             if prob > threshold:
                 lexicon[src_word, trg_word] = prob
     return lexicon
Example 27
    def test_PWID_anomalies(self):
        """
        None of the anomalies should register a positive score.
        Anomalies are more likely than other texts to register a positive score from the filters and produce false
        positives.
        :return:
        """

        pwid = PWID(ultra_verbose=True, fast=False)

        for file_path in self.anomalies:

            logger.info('Executing on %s' % file_path)

            text = load_corpus(file_path)
            matches, score = pwid.identify_passwords(text)

            print score, matches
            print pwid.filter.aggregate_score

            assert score <= 0
Example 28
                uni_words = list(dictionary.keys())
                uni_ids = list(dictionary.values())
                embedding_matrix = self.tf_sess.run(
                    tf.nn.embedding_lookup(
                        emb_net.normalized_embeddings,
                        tf.constant(uni_ids, dtype=tf.int32)))
                word2emb = dict(zip(uni_words, embedding_matrix))
                tl.files.save_any_to_npy(save_dict=word2emb,
                                         name=embedding_file_path)
                with open('%s/%s.json' % (OutputDir, self.model_name),
                          'w') as o_file:
                    for k in word2emb:
                        o_file.write(
                            '%s\n' %
                            json.dumps({k: word2emb[k].flatten().tolist()},
                                       ensure_ascii=False))
                o_file.close()


if __name__ == '__main__':
    fmt = "%(asctime)s %(levelname)s %(message)s"
    logging.basicConfig(format=fmt, level=logging.INFO)

    ## load corpus
    with utils.timer('Loading corpus'):
        corpus = utils.load_corpus(corpus_file, debug)

    ## train word2vec with skip-gram & negative sampling
    with utils.timer('word2vec training'):
        Word2Vec(corpus, g_params).train()
Example 29
import sys
import random
sys.path.append('data_process/')
import utils

recall_cands_file = sys.argv[1]
ce_score_file = sys.argv[2]
outfile = sys.argv[3]

random_seed = 111
rng = random.Random(random_seed)

neg_cnt = 4
ce_threshold_neg = 0.1
ce_threshold_pos = 0.9

q_text, p_text, p_title = utils.load_corpus(corpus='marco', q_type='train')
pos_qp, pos_qp_add = utils.load_pos_examples(p_text)
cand_qp_all, train_qids = utils.load_candidates(recall_cands_file, col=4)
ce_score = utils.load_ce_score(ce_score_file, train_qids)

# neg examples
neg_qp = {}
for qid, pids in cand_qp_all.items():
    if qid not in pos_qp:
        continue
    select_pid = []
    pos_cnt = len(pos_qp[qid])
    for index in range(50):
        _pid = pids[index]
        if len(select_pid) == neg_cnt * pos_cnt:
            break
Example 30
    batch_size = 160
    output_channels = 20
    n_hidden = 256
    n_epoch = 5
    learning_rate = 0.01
    drop_keep_prob = 0.4
    num_filters = 256
    kernel_size = 3


config = CONFIG()
word2id = load_word2id('./data/word_to_id.txt')
print('Loading word2vec ==========================')
word2vec = load_corpus_word2vec('./data/corpus_word2vec.txt')
print('Loading train corpus ========================')
train = load_corpus('./data/train/', word2id, max_sen_len=config.max_sen_len)
print('Loading dev corpus ==========================')
dev = load_corpus('./data/dev/', word2id, max_sen_len=config.max_sen_len)
print('Loading test corpus =========================')
test = load_corpus('./data/test/', word2id, max_sen_len=config.max_sen_len)

x_tr, y_tr = train
x_val, y_val = dev

config = CONFIG()
tc = TextCNN(config=config, embeddings=word2vec)

with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    tc.fit(sess, x_tr, y_tr, x_val, y_val)