Example #1
def main(unused_argv):
    if not FLAGS.data_dir:
        raise ValueError("--data_dir is required.")
    if not FLAGS.output_dir:
        raise ValueError("--output_dir is required.")

    encoder = encoder_manager.EncoderManager()

    # Maybe load unidirectional encoder.
    if FLAGS.uni_checkpoint_path:
        print("Loading unidirectional model...")
        uni_config = configuration.model_config()
        encoder.load_model(uni_config, FLAGS.uni_vocab_file,
                           FLAGS.uni_embeddings_file,
                           FLAGS.uni_checkpoint_path)

    # Maybe load bidirectional encoder.
    if FLAGS.bi_checkpoint_path:
        print("Loading bidirectional model...")
        bi_config = configuration.model_config(bidirectional_encoder=True)
        encoder.load_model(bi_config, FLAGS.bi_vocab_file,
                           FLAGS.bi_embeddings_file, FLAGS.bi_checkpoint_path)

    evaluate(encoder, FLAGS.output_dir, evaltest=True, loc=FLAGS.data_dir)

    encoder.close()
Example #2
def main(unused_argv):
    if not FLAGS.data_dir:
        raise ValueError("--data_dir is required.")

    encoder = encoder_manager.EncoderManager()

    # Maybe load unidirectional encoder.
    if FLAGS.uni_checkpoint_path:
        print("Loading unidirectional model...")
        uni_config = configuration.model_config()
        encoder.load_model(
            uni_config, FLAGS.uni_vocab_file,
            FLAGS.uni_embeddings_file, FLAGS.uni_checkpoint_path)

    # Maybe load bidirectional encoder.
    if FLAGS.bi_checkpoint_path:
        print("Loading bidirectional model...")
        bi_config = configuration.model_config(bidirectional_encoder=True)
        encoder.load_model(
            bi_config, FLAGS.bi_vocab_file, FLAGS.bi_embeddings_file,
            FLAGS.bi_checkpoint_path)

    if FLAGS.eval_task in ["MR", "CR", "SUBJ", "MPQA"]:
        eval_classification.eval_nested_kfold(
            encoder, FLAGS.eval_task, FLAGS.data_dir, use_nb=False)
    elif FLAGS.eval_task == "SICK":
        eval_sick.evaluate(encoder, evaltest=True, loc=FLAGS.data_dir)
    elif FLAGS.eval_task == "MSRP":
        eval_msrp.evaluate(encoder, evalcv=True, evaltest=True,
                           use_feats=True, loc=FLAGS.data_dir)
    elif FLAGS.eval_task == "TREC":
        eval_trec.evaluate(encoder, evalcv=True, evaltest=True, loc=FLAGS.data_dir)
    else:
        raise ValueError("Unrecognized eval_task: %s" % FLAGS.eval_task)

    encoder.close()
Example #3
def get_encoder():
    # Download and extract the bidirectional model. (shell script)
    # cd models/
    # wget "http://download.tensorflow.org/models/skip_thoughts_bi_2017_02_16.tar.gz"
    # tar -xvf skip_thoughts_bi_2017_02_16.tar.gz
    # rm skip_thoughts_bi_2017_02_16.tar.gz
    # cd ..
    #
    # Set paths to the model.
    pretrained_path = 'models/skip_thoughts_bi_2017_02_16/'
    VOCAB_FILE = os.path.join(pretrained_path, 'vocab.txt')
    EMBEDDING_MATRIX_FILE = os.path.join(pretrained_path, 'embeddings.npy')
    CHECKPOINT_PATH = os.path.join(pretrained_path, 'model.ckpt-500008')

    # Set up the encoder. Here we are using a single bidirectional model.
    # To use a unidirectional model as well, call load_model() again with
    # configuration.model_config() and paths to the unidirectional model's
    # files. The encoder will use the concatenation of all loaded models.
    encoder = encoder_manager.EncoderManager()
    encoder.load_model(configuration.model_config(bidirectional_encoder=True),
                       vocabulary_file=VOCAB_FILE,
                       embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                       checkpoint_path=CHECKPOINT_PATH)

    return encoder
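A minimal sketch of the combination described in the comment above: loading both the unidirectional and bidirectional models into one manager so that encode() returns their concatenated vectors. The unidirectional paths and checkpoint number follow the other examples on this page; where the archives were extracted is an assumption.

import os

from skip_thoughts import configuration
from skip_thoughts import encoder_manager

# Assumed extraction locations (see the wget/tar steps above).
uni_path = 'models/skip_thoughts_uni_2017_02_02/'
bi_path = 'models/skip_thoughts_bi_2017_02_16/'

encoder = encoder_manager.EncoderManager()
# Unidirectional model: the default model_config().
encoder.load_model(configuration.model_config(),
                   vocabulary_file=os.path.join(uni_path, 'vocab.txt'),
                   embedding_matrix_file=os.path.join(uni_path, 'embeddings.npy'),
                   checkpoint_path=os.path.join(uni_path, 'model.ckpt-501424'))
# Bidirectional model: encode() now returns the concatenation of both.
encoder.load_model(configuration.model_config(bidirectional_encoder=True),
                   vocabulary_file=os.path.join(bi_path, 'vocab.txt'),
                   embedding_matrix_file=os.path.join(bi_path, 'embeddings.npy'),
                   checkpoint_path=os.path.join(bi_path, 'model.ckpt-500008'))

encodings = encoder.encode(['Hello, world.'])
encoder.close()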
Example #4
def extract_by_skip_thought(sent_list: List[str]):
    """
    To make it compatible with the toolkit, we need the input to be a list of sentences
    :param sent_list:
    :return:
    """
    skip_thought_dir = os.path.join('/home/junpeiz/Project/Twitter/data',
                                    'skipThoughts', 'pretrained',
                                    'skip_thoughts_uni_2017_02_02')
    # Set paths to the model.
    VOCAB_FILE = os.path.join(skip_thought_dir, "vocab.txt")
    EMBEDDING_MATRIX_FILE = os.path.join(skip_thought_dir, "embeddings.npy")
    CHECKPOINT_PATH = os.path.join(skip_thought_dir, "model.ckpt-501424")
    # The following directory should contain files rt-polarity.neg and
    # rt-polarity.pos.
    # MR_DATA_DIR = "/dir/containing/mr/data"

    # Set up the encoder. Here we are using a single unidirectional model.
    # To use a bidirectional model as well, call load_model() again with
    # configuration.model_config(bidirectional_encoder=True) and paths to the
    # bidirectional model's files. The encoder will use the concatenation of
    # all loaded models.
    encoder = encoder_manager.EncoderManager()
    encoder.load_model(configuration.model_config(),
                       vocabulary_file=VOCAB_FILE,
                       embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                       checkpoint_path=CHECKPOINT_PATH)

    encoding_list = encoder.encode(sent_list)
    return encoding_list
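A hypothetical call, for illustration (the model directory above is machine-specific):

vectors = extract_by_skip_thought(['first sentence .', 'second sentence .'])
# One 2400-dimensional skip-thought vector per input sentence.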
Example #5
    def __init__(self, use_char=False):
        super(SkipThought, self).__init__()
        self.use_char2vec = use_char

        cur_path = os.path.abspath(os.path.dirname(__file__))
        # Set paths to the model.
        VOCAB_FILE = os.path.join(
            cur_path, "../../models/skip_thoughts_uni_2017_02_02/vocab.txt")
        EMBEDDING_MATRIX_FILE = os.path.join(
            cur_path,
            "../../models/skip_thoughts_uni_2017_02_02/embeddings.npy")
        CHECKPOINT_PATH = os.path.join(
            cur_path,
            "../../models/skip_thoughts_uni_2017_02_02/model.ckpt-501424")

        self.encoder = encoder_manager.EncoderManager()
        self.encoder.load_model(configuration.model_config(),
                                vocabulary_file=VOCAB_FILE,
                                embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                                checkpoint_path=CHECKPOINT_PATH)

        if self.use_char2vec:
            PROJ_MODEL_PATH = os.path.join(
                cur_path,
                "../../models/char_word2vec/skip-thought_linear_projection.m")
            self.char_w2v = CharWord2vec()
            with open(PROJ_MODEL_PATH, 'rb') as f:
                self.proj = pickle.load(f)
Example #6
def load_model(vocab_file, embedding_matrix_file, checkpoint_path,
               bidirectional_encoder):
    encoder = encoder_manager.EncoderManager()
    config = configuration.model_config(
        bidirectional_encoder=bidirectional_encoder)
    encoder.load_model(config,
                       vocabulary_file=vocab_file,
                       embedding_matrix_file=embedding_matrix_file,
                       checkpoint_path=checkpoint_path)
    return encoder
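For illustration, the helper might be invoked like this (the paths are placeholders, not from the source):

encoder = load_model(
    vocab_file='models/skip_thoughts_uni_2017_02_02/vocab.txt',
    embedding_matrix_file='models/skip_thoughts_uni_2017_02_02/embeddings.npy',
    checkpoint_path='models/skip_thoughts_uni_2017_02_02/model.ckpt-501424',
    bidirectional_encoder=False)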
Example #7
def setup_encoder():
    VOCAB_FILE = '/data/ryli/kcli/skip-thoughts/pretrained/skip_thoughts_uni_2017_02_02/vocab.txt'
    EMBEDDING_MATRIX_FILE = '/data/ryli/kcli/skip-thoughts/pretrained/skip_thoughts_uni_2017_02_02/embeddings.npy'
    CHECKPOINT_PATH = '/data/ryli/kcli/skip-thoughts/pretrained/skip_thoughts_uni_2017_02_02/model.ckpt-501424'

    encoder = encoder_manager.EncoderManager()
    encoder.load_model(configuration.model_config(),
                       vocabulary_file=VOCAB_FILE,
                       embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                       checkpoint_path=CHECKPOINT_PATH)

    return encoder
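As in the other examples on this page, the returned manager can then encode sentences directly:

encoder = setup_encoder()
vectors = encoder.encode(['a sentence to embed .'])
encoder.close()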
Example #8
    def __init__(self, withSVM=False):
        [lib, con,
         neu] = cPickle.load(open(os.getcwd() + '/sampleData.pkl', 'rb'))

        self.bias_dict = {}

        for tree in lib:
            sentence = tree.get_words()
            self.bias_dict[sentence] = 1

        for tree in con:
            sentence = tree.get_words()
            self.bias_dict[sentence] = -1

        for tree in neu:
            sentence = tree.get_words()
            self.bias_dict[sentence] = 0

        self.encoder = encoder_manager.EncoderManager()
        self.data_encodings = []
        self.data = self.bias_dict.keys()

        self.blacklist = []

        #f = open('skipthoughts.pkl', 'rb')
        # right now, we're using a unidirectional skip model;
        # we can try the bidirectional model later

        dir_path = os.path.dirname(os.path.realpath(__file__))

        VOCAB_FILE = dir_path + "/../data/vocab.txt"
        EMBEDDING_MATRIX_FILE = dir_path + "/../data/embeddings.npy"
        CHECKPOINT_PATH = dir_path + "/../data/model.ckpt-501424"

        self.encoder.load_model(configuration.model_config(),
                                vocabulary_file=VOCAB_FILE,
                                embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                                checkpoint_path=CHECKPOINT_PATH)

        self.sentiment = SentimentIntensityAnalyzer()

        self.clf = None

        self.withSVM = withSVM

        if withSVM:
            print('using the SVM!')
            with open('./svm.pkl', 'rb') as f:
                self.clf = cPickle.load(f)
Example #9
def main():

    parser = argparse.ArgumentParser(
        description="encoding sentences example for skip_thoughts.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('vocab_file', help="specify the vocab_file")
    parser.add_argument('embedding_matrix_file',
                        help='specify the embedding_matrix_file')
    parser.add_argument('checkpoint_path', help="specify the checkpoint_path")
    parser.add_argument('mr_data_dir', help="specify the mr_data_dir")
    parser.add_argument('--model_name', default="skip_thoughts")
    parser.add_argument('--bidirect',
                        choices=["True", "False"],
                        default="False")

    args = parser.parse_args()

    args.bidirect = (args.bidirect == "True")

    encoder = encoder_manager.EncoderManager(args.model_name)
    encoder.load_model(
        configuration.model_config(bidirectional_encoder=args.bidirect),
        vocabulary_file=args.vocab_file,
        embedding_matrix_file=args.embedding_matrix_file,
        checkpoint_path=args.checkpoint_path)

    data = []
    with open(os.path.join(args.mr_data_dir, 'rt-polarity.neg'), 'rb') as f:
        data.extend([line.decode('latin-1').strip() for line in f])
    with open(os.path.join(args.mr_data_dir, 'rt-polarity.pos'), 'rb') as f:
        data.extend([line.decode('latin-1').strip() for line in f])

    encodings = encoder.encode(data)

    def get_nn(ind, num=10):
        encoding = encodings[ind]
        scores = sd.cdist([encoding], encodings, 'cosine')[0]
        sorted_ids = np.argsort(scores)
        print("Senetence:")
        print("", data[ind])
        print("\nNearest neighbors:")
        for i in range(1, num + 1):
            print(" %d. %s (%.3f)" %
                  (i, data[sorted_ids[i]], scores[sorted_ids[i]]))

    get_nn(0)
Example #10
def restore_skipthought(model_dir, model_name, skipthought_embedding,
                        skipthought_vocab):
    """
    :rtype: encoder_manager.EncoderManager()
    :return:
    """
    check_point_path = os.path.join(model_dir, model_name)
    skip_thought_embedding_matrix = os.path.join(model_dir,
                                                 skipthought_embedding)
    skip_thought_vocab = os.path.join(model_dir, skipthought_vocab)

    encoder = encoder_manager.EncoderManager()
    encoder.load_model(configuration.model_config(),
                       vocabulary_file=skip_thought_vocab,
                       embedding_matrix_file=skip_thought_embedding_matrix,
                       checkpoint_path=check_point_path)
    return encoder
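A hypothetical call with placeholder arguments (the real values depend on where the pretrained files live):

encoder = restore_skipthought(
    model_dir='/path/to/skip_thoughts_uni_2017_02_02',
    model_name='model.ckpt-501424',
    skipthought_embedding='embeddings.npy',
    skipthought_vocab='vocab.txt')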
Example #11
    def __init__(self, modelPath, checkpointPath):
        """Initialize the skip-thought model.

        Arguments:
            modelPath {str} -- the path to the model directory
            checkpointPath {str} -- the filename of model.ckpt-xxxx
        """
        self.modelPath = modelPath
        self.checkpointPath = os.path.join(modelPath, "..", checkpointPath)
        self.vocabFile = os.path.join(modelPath, "vocab.txt")
        self.embeddingMatrixFile = os.path.join(modelPath, "embeddings.npy")

        self.encoder = encoder_manager.EncoderManager()
        self.encoder.load_model(configuration.model_config(),
                                vocabulary_file=self.vocabFile,
                                embedding_matrix_file=self.embeddingMatrixFile,
                                checkpoint_path=self.checkpointPath)
Example #12
	def __init__(self, withSVM=False):
		[lib, con, neu] = cPickle.load(open('sampleData.pkl', 'rb'))

		self.bias_dict = {}

		for tree in lib:
			sentence = tree.get_words()
			self.bias_dict[sentence] = 1

		for tree in con:
			sentence = tree.get_words()
			self.bias_dict[sentence] = -1

		for tree in neu:
			sentence = tree.get_words()
			self.bias_dict[sentence] = 0

		self.encoder = encoder_manager.EncoderManager()
		self.data_encodings = []
		self.data = self.bias_dict.keys()

		self.blacklist = []

		#f = open('skipthoughts.pkl', 'rb')
		# right now, we're using a unidirectional skip model;
		# we can try the bidirectional model later
		VOCAB_FILE = "/Users/az/Desktop/projects/modemo/backend/modules/tf/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/vocab.txt"
		EMBEDDING_MATRIX_FILE = "/Users/az/Desktop/projects/modemo/backend/modules/tf/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/embeddings.npy"
		CHECKPOINT_PATH = "/Users/az/Desktop/projects/modemo/backend/modules/tf/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/model.ckpt-501424"

		self.encoder.load_model(configuration.model_config(), vocabulary_file=VOCAB_FILE, embedding_matrix_file=EMBEDDING_MATRIX_FILE, checkpoint_path=CHECKPOINT_PATH)

		self.sentiment = SentimentIntensityAnalyzer()

		self.clf = None

		if withSVM:
			print('using the SVM!')
			with open('./svm.pkl', 'rb') as f:
				self.clf = cPickle.load(f)
Example #13
    # Traditional measures like Levenshtein distance, dynamic time warping, Jaro, etc.

    print(_generate_log("Average Embedding", ae_sims, sim_names))
    print(_generate_log("InferSent", inf_sims, sim_names))
    print(_generate_log("SkipThought", st_sims, sim_names))


if __name__ == '__main__':
    # Load in InferSent
    infersent = torch.load(MODEL_PATH)  # rely on "models.py" as well
    infersent.set_glove_path(GLOVE_PATH)

    # Load in SkipThought
    config_gpu = tf.ConfigProto()
    config_gpu.gpu_options.allow_growth = True

    with tf.Graph().as_default(), tf.Session(config=config_gpu) as session:
        skipthought = encoder_manager.EncoderManager()

        skipthought.load_model(
            configuration.model_config(bidirectional_encoder=True),
            vocabulary_file=VOCAB_FILE,
            embedding_matrix_file=EMBEDDING_MATRIX_FILE,
            checkpoint_path=CHECKPOINT_PATH)

    # Load in average embedding
    avg_emb = AverageEmbedder(word_emb_dim=300)
    avg_emb.set_glove_path(GLOVE_PATH)

    IPython.embed()
Example #14
import numpy as np
import pandas as pd
from skip_thoughts import configuration
from skip_thoughts import encoder_manager
from sklearn.feature_extraction.text import TfidfVectorizer

VOCAB_FILE = ".\\skip_thoughts_bi_2017_02_16\\vocab.txt"
EMBEDDING_MATRIX_FILE = ".\\skip_thoughts_bi_2017_02_16\\embeddings.npy"
CHECKPOINT_PATH = ".\\skip_thoughts_bi_2017_02_16\\model.ckpt-500008"

encoder = encoder_manager.EncoderManager()
encoder.load_model(configuration.model_config(bidirectional_encoder=True),
                   vocabulary_file=VOCAB_FILE,
                   embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                   checkpoint_path=CHECKPOINT_PATH)

def neural_features(dataset_loc):

    english_dataset = pd.read_csv(dataset_loc)
    headline = english_dataset['headline']
    body = english_dataset['content']
    labels = [int(x) for x in english_dataset['label']]

    labels_done = []
    flag = True
    body_encodings = np.zeros((len(body), 2400))
    j = 0
    for i in range(len(body)):
        flag = True
        try:
            current_body_encoding = encoder.encode(body[i:i+1])
        except Exception:
            flag = False