Example #1
    def __init__(self, x_max_lengths=None, y_max_length=8):

        self.name = 'wikipedia'

        # max() below fails on the default None, so an explicit list of
        # bucket lengths is required.
        if x_max_lengths is None:
            raise ValueError('x_max_lengths must be a list of bucket lengths')

        self.x_max_length = max(x_max_lengths)
        self.x_max_lengths = x_max_lengths
        self.y_max_length = y_max_length

        # Vocabulary as defined in two dictionaries
        # self.vocabulary | token -> idx
        # self.reversed_vocabulary | idx -> token
        self.vocabulary = self.read_vocabulary()
        self.vocab_size = len(self.vocabulary)
        self.reversed_vocabulary = dict(
            zip(self.vocabulary.values(), self.vocabulary.keys()))

        self.test = WikipediaTestData(self,
                                      path=directory('/data/wikipedia_data/') +
                                      'test.txt',
                                      x_max_length=self.x_max_length,
                                      y_max_length=self.y_max_length)

        self.train = WikipediaBucketedTrainData(
            self,
            paths=[
                directory('/data/wikipedia_data/') + 'train-33.txt',
                directory('/data/wikipedia_data/') + 'train-62.txt',
                directory('/data/wikipedia_data/') + 'train-118.txt',
                directory('/data/wikipedia_data/') + 'train-504.txt'
            ],
            x_max_lengths=self.x_max_lengths,
            y_max_length=self.y_max_length)
Example #2
    def setupMainWindow(self):
        self.setWindowTitle("Classifier")

        self.documents_directory = utils.directory(config[KEY_DOCUMENTS])
        self.documents = self.documents_directory.get_all_files()

        self.categories_directory = utils.directory(config[KEY_CATEGORIES])
        self.categories = self.categories_directory.get_all_directories()

        QtGui.QShortcut(QtGui.QKeySequence("Ctrl+Z"), self, self.undoAction)
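Here utils.directory() returns a wrapper object rather than a path string. A minimal sketch of such a wrapper, assuming only the two methods called above (the class name and layout are hypothetical; the real implementation is not part of this listing):

import os

class Directory:
    # Hypothetical wrapper matching the .get_all_files() /
    # .get_all_directories() calls above.
    def __init__(self, path):
        self.path = path

    def get_all_files(self):
        # Paths of the plain files directly inside self.path
        return [e.path for e in os.scandir(self.path) if e.is_file()]

    def get_all_directories(self):
        # Paths of the subdirectories directly inside self.path
        return [e.path for e in os.scandir(self.path) if e.is_dir()]

def directory(path):
    return Directory(path)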
Example #3
def wikipedia(n, should_filter_pretrained=False):

    d = WikipediaDataset(x_max_lengths=[33, 62, 118, 504], y_max_length=8)
    ep = EmbeddingProcessor(d.vocabulary,
                            path=directory('/data/compvec_wikipedia/'))
    d.store_dataset(path=directory('/data/compvec_wikipedia/'))

    if should_filter_pretrained:
        filter_pretrained(ep)

    runs(n, d, ep, neural=True, random=False)
Example #4
def single_wordnet(n, should_filter_pretrained=False):

    d = WordnetDataset(x_max_length=32, y_max_length=1)
    ep = EmbeddingProcessor(d.vocabulary,
                            path=directory('/data/compvec_wordnet_single/'))

    d.store_dataset(path=directory('/data/compvec_wordnet_single/'))

    if should_filter_pretrained:
        filter_pretrained(ep)

    runs(n, d, ep, yc=False, neural=False, random=False)
    runs(n, d, ep, yc=True, neural=False, random=False)
Example #5
def multi_wordnet(n, should_filter_pretrained=False):
    d = WordnetDataset(
        test_data_path=directory('/data/compvec_wordnet_single/') +
        'test_data.gz',
        x_max_length=32,
        y_max_length=6)
    ep = EmbeddingProcessor(d.vocabulary,
                            path=directory('/data/compvec_wordnet_multi/'))

    d.store_dataset(path=directory('/data/compvec_wordnet_multi/'))

    if should_filter_pretrained:
        filter_pretrained(ep)

    runs(n, d, ep, neural=True, random=True)
Example #6
    def __init__(self,
                 run_group_name,
                 dataset,
                 embedding_processor,
                 yc=True,
                 pretraining=None,
                 learning_rate=1e-3,
                 batch_size=512,
                 embedding_size=300,
                 stop_gradients_y_n=False,
                 dropout_keep_p=0.75,
                 margin=0.25,
                 composition='sum',
                 loss='mse',
                 refine_after_x_steps=0,
                 no=None):

        self.dataset = dataset
        self.embedding_processor = embedding_processor
        self.y_composition = yc
        self.no = no

        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.dropout_keep_p = dropout_keep_p
        self.pretraining = pretraining
        self.refine_after_x_steps = refine_after_x_steps

        self.run_dir = directory('/out/run-%s' % run_group_name,
                                 ['logs', 'tsne', 'embeddings', 'output'])
        self.data_dir = embedding_processor.path

        self.x_max_buckets = None

        if hasattr(self.dataset, 'x_max_lengths'):
            self.x_max_buckets = self.dataset.x_max_lengths

        self.graph = tf.Graph()
        with self.graph.as_default():

            # Model
            self.model = Model(embedding_size=embedding_size,
                               x_max_buckets=self.x_max_buckets,
                               x_max_length=self.dataset.x_max_length,
                               y_max_length=self.dataset.y_max_length,
                               y_composition=self.y_composition,
                               vocab_size=self.dataset.vocab_size,
                               margin=margin,
                               composition=composition,
                               loss=loss,
                               stop_gradients_y_n=stop_gradients_y_n)

            # Evaluator
            self.evaluation = Evaluation(self.model, self.dataset)

            # Assign Tensorflow Operations
            self.assign_ops()

        # Setup Writers for Tensorboard
        self.setup_writers()
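Most of the snippets above use directory() as a path factory instead: it takes a root-relative path (plus, in the run-directory call above, a list of subdirectories to create) and returns a path string that callers concatenate file names onto. A minimal sketch under those assumptions (the real helper is not shown in this listing, and the root handling is guessed):

import os

def directory(path, subdirectories=None, root='.'):
    # Hypothetical reconstruction: resolve `path` under a project root,
    # create it and any requested subdirectories, and return it with a
    # trailing slash so callers can append file names directly.
    resolved = os.path.join(root, path.strip('/'))
    os.makedirs(resolved, exist_ok=True)
    for sub in subdirectories or []:
        os.makedirs(os.path.join(resolved, sub), exist_ok=True)
    return resolved + '/'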
Example #7
def deploy_file(path, kwargs, config):
	with utils.directory(os.path.dirname(path)):
		config.update(kwargs)
		if 'FunctionName' not in config:
			clip.exit('You must provide a function name', err=True)
		# Zip up directory
		utils.make_zip(config['FunctionName'])
		# Upload!
		upload(config['FunctionName'], config)
Example #8
def deploy_dir(path, kwargs):
	with utils.directory(path):
		config = LambdaConfig().load_from_cwd().update_config(kwargs)
		config.verify()
		# Remove ignore paths
		for e in config.get('ignore', []) + ['.git/', '.gitignore']:
			utils.delete_resource(e)
		# Run install command
		if 'install' in config:
			utils.shell(config.get('install'))
		upload(config.get_config())
Example #9
def favicon():
    '''
    Serves GET requests for the favicon, the icon shown in the browser tab.
    This project does not need one, so a blank icon is served.

    Note:
        This handler is required because browsers automatically request
        '{base_url}/favicon.ico'; without it, that request would be routed
        to the date_page() function and mishandled.
    '''
    return send_from_directory(utils.directory(APP.root_path), 'favicon.ico')
Example #10
    def __init__(self, model, dataset):
        self.m = model
        self.d = dataset

        path = directory('/data/wordnetsingle/') + 'test_data.gz'
        self.compveceval_test = WordnetData.from_path(path, self.d.vocabulary,
                                                      self.d.x_max_length,
                                                      self.d.y_max_length)

        self.compveceval = CompVecEvalEvaluation(self.m, self.compveceval_test)
        self.senteval = SentEvalEvaluation(self.m, self.d)
        self.wordsim = WordSimEvaluation(self.m, self.d)
Example #11
    def __init__(self, vocabulary=None, path=None):

        if vocabulary is None:
            self.vocabulary = dict()
        else:
            self.vocabulary = vocabulary

        if path is None:
            self.path = directory('/data/compositional_wordnet')
        else:
            self.path = path

        self.reversed_vocabulary = dict(
            zip(self.vocabulary.values(), self.vocabulary.keys()))
Example #12
def run(tasks, dir=None, silent=False):
    if dir is None:
        dir = os.getcwd()
    okapi.silent = silent
    with utils.directory(dir):
        config = utils.load_config()
        okapi.log('Running tasks on {}'.format(config.project if hasattr(
            config, 'project') else os.path.basename(dir)))
        setattr(config, 'ok', okapi)
        for task in tasks:
            if not hasattr(config, task):
                okapi.log('"{}" not a valid task, skipping!'.format(task))
                continue
            okapi.run(getattr(config, task))
    okapi.log('All tasks complete!')
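For context, a hypothetical invocation of run(); the task names here are made up and stand in for callables defined on the project's config module:

run(['build', 'test'], dir='/path/to/project', silent=False)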
Example #13
def deploy_dir(path, kwargs):
	with utils.directory(path):
		config = utils.load_config()
		config['config'].update(kwargs)
		if 'FunctionName' not in config['config']:
			clip.exit('You must provide a function name', err=True)
		# Remove ignore paths
		for e in config.get('ignore', []) + ['.git/', '.gitignore']:
			utils.delete_resource(e)
		# Run install command
		if 'install' in config:
			utils.shell(config['install'])
		# Zip up directory
		utils.make_zip(config['config']['FunctionName'])
		# Upload!
		params = config['config']
		upload(params['FunctionName'], params)
Example #14
    def read_fallback_embeddings(vocabulary, pretrain=None, evaluation='MR'):

        original_embedding_path = original_embedding_file(pretrain)

        if original_embedding_path is None:
            return dict()

        ped = EmbeddingProcessor(vocabulary)
        processed_embeddings_path = '%s/%s-%s.vec.gz' % (directory('/data/senteval_embeddings'), pretrain, evaluation)

        if not os.path.isfile(processed_embeddings_path):
            ped.process_pretrained_embeddings(input_filename=original_embedding_path,
                                              output_filename=processed_embeddings_path)

        embeddings = ped.read_embeddings(processed_embeddings_path)

        return embeddings
Example #15
    def read_vocabulary(self,
                        path=None,
                        vocab_min_frequency=18,
                        vocab_size_limit=5171164):
        """
        Read vocabulary from vocab file.
        """

        vocab_ns = dict()

        if path is None:
            path = directory('/data/wikipedia_data/') + 'vocab.txt'

        with open(path) as f:

            for i, line in enumerate(f):

                if i % 10000 == 0:
                    sys.stdout.write("\rReading vocabulary… %6.2f%%" %
                                     ((100 * i) / float(VOCAB_SIZE), ))

                s = line.split()

                if len(s) != 2:
                    continue

                t, n = s[0], int(s[1])
                if n >= vocab_min_frequency and i < vocab_size_limit - 2:
                    vocab_ns[t] = n
                else:
                    break  # vocabulary file is already sorted

            print("\rVocabulary read %d" % (len(vocab_ns) + 2))

        # Sort the vocabulary by frequency, frequent words on top
        vocab_ns = sorted(vocab_ns.items(),
                          key=operator.itemgetter(1),
                          reverse=True)

        # Ids start at 2, leaving the two lowest ids free for the PAD and
        # UNK symbols assigned below.
        vocabulary = {token: i for i, (token, count) in enumerate(vocab_ns, 2)}
        vocabulary['PAD'] = PAD_SYMBOL
        vocabulary['UNK'] = UNK_SYMBOL

        return vocabulary
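For reference, read_vocabulary() expects every line of vocab.txt to hold a whitespace-separated token and its frequency, most frequent first; lines of any other shape are skipped. Illustrative contents (these tokens and counts are made up):

the 1061396
of 593677
and 416629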
Example #16
    parser.add_argument('data_dir')
    parser.add_argument('-a', '--annotator', default='default')
    parser.add_argument('-i', '--interface', default='interface')
    # Assumed flag: the parser defines no interface_dir, but args.interface_dir
    # is read below, so a definition like this one is needed.
    parser.add_argument('--interface_dir', default='.')
    parser.add_argument('--past_reports', default=False, action='store_true')
    parser.add_argument('-m', '--models', action='append')
    parser.add_argument('-d', '--device', default='cpu')
    parser.add_argument('-r', '--reload', default=False, action='store_true')
    parser.add_argument('-p', '--port')
    args = parser.parse_args()
    if args.data_dir is None:
        raise NotImplementedError
    startup['annotations_dir'] = join(args.data_dir,
                                      args.annotator + '_annotations')
    if not exists(startup['annotations_dir']):
        mkdir(startup['annotations_dir'])
    np.random.seed(0)
    startup['file_generator'] = FileGenerator(args.data_dir,
                                              startup['annotations_dir'],
                                              reload=args.reload)
    try:
        startup['file'] = next(startup['file_generator'])
    except StopIteration:
        startup['file'] = None
    with directory(args.interface_dir):
        # importlib.import_module (imported at module level) replaces the
        # original exec()/eval() on user-supplied input.
        interface_module = importlib.import_module(args.interface)
    models_to_load = args.models if args.models is not None else []
    startup['interface'] = interface_module.FullModelInterface(
        models_to_load=models_to_load, device=args.device)
    startup['include_past_reports'] = args.past_reports
    app.run(debug=True, port=args.port)
Example #17
    def root(self, path):
        with utils.directory(path):
            yield
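The deploy helpers, run(), and root() above all use utils.directory as a context manager (root() itself presumably carries a @contextmanager decorator not shown in the snippet). A minimal sketch of that flavour, assuming it simply swaps the working directory and restores it afterwards:

import os
from contextlib import contextmanager

@contextmanager
def directory(path):
    # Hypothetical reconstruction: chdir into `path` for the duration of
    # the with-block, restoring the previous working directory on exit.
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)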
Example #18
def deploy_file(path, kwargs):
    with utils.directory(os.path.dirname(path)):
        config = LambdaConfig().load_from_front_matter(path).update_config(
            kwargs)
        config.verify()
        upload(config.get_config())