Example #1
def create_tgrep2_corpus_file(gzipped_corpus_file_path, format_):
    """Use TGrep2 to create a .t2c corpus file from the gzipped file of phrase-structure trees.

    :param str gzipped_corpus_file_path: absolute path to the gzipped corpus file.
    :param str format_: the format in which the corpus has just been written to disk.
    :returns: the absolute path to the .t2c file or ``False``.

    """
    if format_ == u'treebank' and h.command_line_program_installed('tgrep2'):

        out_path = '%s.t2c' % os.path.splitext(gzipped_corpus_file_path)[0]
        with open(os.devnull, "w") as fnull:
            call(['tgrep2', '-p', gzipped_corpus_file_path, out_path], stdout=fnull, stderr=fnull)
        if os.path.exists(out_path):
            return out_path
        return False
    return False
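
A minimal usage sketch for the helper above. The gzipped treebank path is hypothetical; the call only produces a ``.t2c`` file when ``format_`` is ``u'treebank'`` and the ``tgrep2`` binary is installed, and returns ``False`` otherwise.

gzipped_path = '/tmp/corpus_1/corpus_1.tbk.gz'  # hypothetical gzipped treebank

t2c_path = create_tgrep2_corpus_file(gzipped_path, u'treebank')
if t2c_path:
    # /tmp/corpus_1/corpus_1.tbk.t2c now exists and can be searched with
    # ``tgrep2 -c``, as the tgrep2() controller action in Examples #6 and #7 does.
    print t2c_path
else:
    # Either the format was not 'treebank' or TGrep2 is not installed.
    pass
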
Example #2
def create_tgrep2_corpus_file(gzipped_corpus_file_path, format_):
    """Use TGrep2 to create a .t2c corpus file from the gzipped file of phrase-structure trees.

    :param str gzipped_corpus_file_path: absolute path to the gzipped corpus file.
    :param str format_: the format in which the corpus has just been written to disk.
    :returns: the absolute path to the .t2c file or ``False``.

    """
    if format_ == u'treebank' and h.command_line_program_installed('tgrep2'):

        out_path = '%s.t2c' % os.path.splitext(gzipped_corpus_file_path)[0]
        with open(os.devnull, "w") as fnull:
            call(['tgrep2', '-p', gzipped_corpus_file_path, out_path],
                 stdout=fnull,
                 stderr=fnull)
        if os.path.exists(out_path):
            return out_path
        return False
    return False
Example #3
    def test_writetofile_content_specified(self):
        """Tests file writing/retrieval of a corpus whose forms are specified in the ``content`` attribute.

        """

        tgrep2_installed = h.command_line_program_installed('tgrep2')

        # Get ids of all sentences.
        sentences = Session.query(model.Form).\
            filter(model.Form.syntactic_category.\
                has(model.SyntacticCategory.name==u'S')).all()
        len_sentences = len(sentences)
        sentences = u','.join(map(str, map(lambda f: f.id, sentences)))

        # Get ids of all sentences with more than 5 words.
        long_sentences = Session.query(model.Form).\
            filter(and_(
                model.Form.syntactic_category.has(model.SyntacticCategory.name==u'S'),
                model.Form.transcription.op('regexp')(u'^([^ ]+ ){5}[^ ]+'))).all()
        len_long_sentences = len(long_sentences)
        long_sentences = u','.join(map(str, map(lambda f: f.id, long_sentences)))

        content = u','.join([sentences, long_sentences, long_sentences, long_sentences])
        anticipated_length = len_sentences + (3 * len_long_sentences)
        name = u'Corpus of sentences with 6+ word sentences repeated'
        description = u'Ordered by content field; duplicates of sentences with 6 or more words.'

        # Generate some valid corpus creation input parameters.
        params = self.corpus_create_params.copy()
        params.update({
            'name': name,
            'description': description,
            'content': content
        })
        params = json.dumps(params)

        # Create the corpus
        original_corpus_count = Session.query(Corpus).count()
        response = self.app.post(url('corpora'), params, self.json_headers,
                                 self.extra_environ_admin)
        resp = json.loads(response.body)
        corpus_id = resp['id']
        new_corpus_count = Session.query(Corpus).count()
        corpus = Session.query(Corpus).get(corpus_id)
        corpus_dir = os.path.join(self.corpora_path, 'corpus_%d' % corpus_id)
        corpus_dir_contents = os.listdir(corpus_dir)
        assert new_corpus_count == original_corpus_count + 1
        assert resp['name'] == name
        assert resp['description'] == description
        assert corpus_dir_contents == []
        assert response.content_type == 'application/json'
        assert resp['content'] == content
        # The ``forms`` attribute is a collection without repeats, which is why the following holds:
        assert len(corpus.forms) == len_sentences

        # Write the corpus to file as a treebank
        sleep(1)
        params = json.dumps({u'format': u'treebank'})
        response = self.app.put(url('/corpora/%d/writetofile' % corpus_id), params,
            headers=self.json_headers, extra_environ=self.extra_environ_admin)
        resp2 = json.loads(response.body)
        corpus_dir_contents = os.listdir(corpus_dir)
        corpus_tbk_path = os.path.join(corpus_dir, 'corpus_%d.tbk' % corpus_id)
        corpus_tbk_gzipped_path = '%s.gz' % corpus_tbk_path
        corpus_tbk_gzipped_size = get_file_size(corpus_tbk_gzipped_path)
        corpus_tbk_file_length = h.get_file_length(corpus_tbk_path)
        corpus_tbk_t2c_path = os.path.join(corpus_dir, 'corpus_%d.tbk.t2c' % corpus_id)
        corpus_file_id = resp2['files'][0]['id']
        assert resp['id'] == resp2['id']
        assert resp['name'] == resp2['name']
        assert resp2['datetime_modified'] > resp['datetime_modified']
        assert os.path.exists(corpus_tbk_path)
        if tgrep2_installed:
            assert os.path.exists(corpus_tbk_t2c_path)
        else:
            assert not os.path.exists(corpus_tbk_t2c_path)
        assert os.path.exists(corpus_tbk_gzipped_path)
        assert get_file_size(corpus_tbk_path) > corpus_tbk_gzipped_size
        assert anticipated_length == corpus_tbk_file_length

        # Retrieve the corpus file directly from the filesystem.
        corpus_file_object = open(corpus_tbk_path, 'rb')
        corpus_file_content = corpus_file_object.read()

        # Attempt to retrieve the gzipped corpus file via request as a restricted
        # user and expect to fail.
        response = self.app.get(url('/corpora/%d/servefile/%d' % (
            corpus_id, corpus_file_id)), status=403,
            headers=self.json_headers, extra_environ=self.extra_environ_contrib)
        resp = json.loads(response.body)
        assert resp == h.unauthorized_msg

        # Retrieve the gzipped corpus file via request.
        response = self.app.get(url('/corpora/%d/servefile/%d' % (
            corpus_id, corpus_file_id)),
            headers=self.json_headers, extra_environ=self.extra_environ_admin)
        assert len(response.body) < len(corpus_file_content)
        unzipped_corpus_file_content = decompress_gzip_string(response.body)
        assert unzipped_corpus_file_content == corpus_file_content

        # Write the corpus to file as a list of transcriptions, one per line.
        sleep(1)
        params = json.dumps({u'format': u'transcriptions only'})
        response = self.app.put(url('/corpora/%d/writetofile' % corpus_id), params,
            headers=self.json_headers, extra_environ=self.extra_environ_admin)
        old_resp2 = resp2
        resp2 = json.loads(response.body)
        corpus_dir_contents = os.listdir(corpus_dir)
        corpus_TO_path = os.path.join(corpus_dir, 'corpus_%d_transcriptions.txt' % corpus_id)
        corpus_TO_gzipped_path = '%s.gz' % corpus_TO_path
        corpus_TO_gzipped_size = get_file_size(corpus_TO_gzipped_path)
        corpus_TO_file_length = h.get_file_length(corpus_TO_path)
        if tgrep2_installed:
            # Five files should be present: tbk, tbk.gz, tbk.t2c, txt and txt.gz
            assert len(corpus_dir_contents) == 5
        else:
            # Four files should be present: tbk, tbk.gz, txt and txt.gz
            assert len(corpus_dir_contents) == 4
        assert resp2['datetime_modified'] > old_resp2['datetime_modified']
        assert os.path.exists(corpus_TO_path)
        assert os.path.exists(corpus_TO_gzipped_path)
        assert get_file_size(corpus_TO_path) > corpus_TO_gzipped_size
        assert anticipated_length == corpus_TO_file_length

        # Finally delete the corpus and expect it, its file data and corpus file 
        # objects to have been deleted.
        assert os.path.exists(corpus_TO_path)
        assert os.path.exists(corpus_TO_gzipped_path)
        assert os.path.exists(corpus_tbk_path)
        assert os.path.exists(corpus_tbk_gzipped_path)
        if tgrep2_installed:
            assert os.path.exists(corpus_tbk_t2c_path)
        else:
            assert not os.path.exists(corpus_tbk_t2c_path)
        corpus_file_ids = [cf['id'] for cf in resp2['files']]
        self.app.delete(url('corpus', id=corpus_id), headers=self.json_headers,
            extra_environ=self.extra_environ_admin)
        assert Session.query(model.Corpus).get(corpus_id) is None
        for corpus_file_id in corpus_file_ids:
            assert Session.query(model.CorpusFile).get(corpus_file_id) is None
        assert not os.path.exists(corpus_TO_path)
        assert not os.path.exists(corpus_TO_gzipped_path)
        assert not os.path.exists(corpus_tbk_path)
        assert not os.path.exists(corpus_tbk_t2c_path)
        assert not os.path.exists(corpus_tbk_gzipped_path)
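
The test above relies on the distinction between a corpus's ``content`` attribute (a comma-separated string of form ids in which order and duplicates are preserved, and which determines how many lines are written to file) and its ``forms`` collection (deduplicated). A minimal sketch of that arithmetic, with hypothetical ids:

content = u'1,2,3,2,3'            # comma-separated form ids; repeats allowed
ids = content.split(u',')
assert len(ids) == 5              # lines anticipated in the written corpus file
assert len(set(ids)) == 3         # size of the deduplicated ``forms`` collection
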
Example #4
    def test_aaa_initialize(self):
        """Initialize the database using pseudo-data generated from random lorem ipsum sentences.

        These are located in ``onlinelinguisticdatabase/tests/data/corpora``.
        The data contain morphologically analyzed sentences, their component
        morphemes, and syntactic categories.  The sentences have phrase
        structure trees in bracket notation.

        The test will try to load the lorem ipsum dataset from a MySQL/SQLite
        dump file in ``onlinelinguisticdatabase/tests/data/corpora``.  If the
        dump file corresponding to ``loremipsum_path`` does not exist, it will
        import the lorem ipsum data directly from the text files and create
        the dump file so that future tests can run more speedily.  The
        ``loremipsum100_path``, ``loremipsum1000_path``, ``loremipsum10000_path``
        and ``loremipsum30000_path`` files are available and contain 100, 1,000,
        10,000 and 30,000 sentences, respectively.

        Setting the ``via_request`` variable to ``True`` will cause all of the
        forms to be created via request, i.e., via
        ``self.app.post(url('forms'))...``.  This is much slower but may be
        desirable since values for the morphological analysis attributes
        will be generated.

        .. note::

            In order to run ``mysqldump`` with the MySQL user listed in
            ``test.ini``, that user must have permission to lock and update
            tables::

                mysql -u root -p<root_password>
                grant lock tables, update on old_test.* to 'old'@'localhost';

        .. warning::

            Loading the .txt or .sql files with the ``via_request`` option set to
            ``True`` will take a very long time.  This might be an argument for
            separating the interface and logic components of the controllers so
            that a "core" HTTP-less OLD application could be exposed.  This
            would facilitate the creation of models with system-generated data
            and validation but without the HTTP overhead...

        """


        ########################################################################
        # Configure lorem ipsum data set import
        ########################################################################

        # Set ``loremipsum_path`` to ``self.loremipsum100_path``,
        # ``self.loremipsum1000_path`` or ``self.loremipsum10000_path``.
        # WARNING: the larger ones will take a long time.
        # Use the 10,000-sentence lorem ipsum dataset when you need to check
        # that very large corpora are handled correctly.
        loremipsum_path = self.loremipsum100_path

        # Set ``via_request`` to ``True`` to create all forms via HTTP requests.
        via_request = True

        self._add_SEARCH_to_web_test_valid_methods()

        # Add an application settings object so that morpheme references will work out right.
        application_settings = h.generate_default_application_settings()
        Session.add(application_settings)
        Session.commit()

        def create_model(line, categories, via_request=False):
            """Create a model (form or syncat) using the string in ``line``."""
            model_name = 'Form'  # 'Form' or 'SyntacticCategory'; the ``model`` module is used below
            elements = unicode(line).split('\t')
            non_empty_elements = filter(None, elements)
            try:
                ol, mb, mg, ml, sc, sx = non_empty_elements
            except Exception:
                try:
                    ol, mb, mg, ml, sc = non_empty_elements
                    sx = u''
                except Exception:
                    try:
                        model_name = 'SyntacticCategory'
                        n, t = non_empty_elements
                    except Exception:
                        return categories
            if via_request:
                if model_name == 'SyntacticCategory':
                    params = self.syntactic_category_create_params.copy()
                    params.update({
                        'name': n,
                        'type': t
                    })
                    params = json.dumps(params)
                    response = self.app.post(url('syntacticcategories'), params, self.json_headers,
                                  self.extra_environ_admin)
                    cat_id = json.loads(response.body)['id']
                    categories[n] = cat_id
                else:
                    params = self.form_create_params.copy()
                    params.update({
                        'transcription': ol,
                        'morpheme_break': mb,
                        'morpheme_gloss': mg,
                        'translations': [{'transcription': ml, 'grammaticality': u''}],
                        'syntax': sx,
                        'syntactic_category': categories.get(sc, u'')
                    })
                    params = json.dumps(params)
                    self.app.post(url('forms'), params, self.json_headers,
                                  self.extra_environ_admin)
            else:
                if model_name == 'SyntacticCategory':
                    syntactic_category = model.SyntacticCategory()
                    syntactic_category.name = n
                    syntactic_category.type = t
                    Session.add(syntactic_category)
                    categories[n] = syntactic_category.id
                else:
                    form = model.Form()
                    form.transcription = ol
                    form.morpheme_break = mb
                    form.morpheme_gloss = mg
                    translation = model.Translation()
                    translation.transcription = ml
                    form.translations.append(translation)
                    form.syntax = sx
                    form.syntacticcategory_id = categories.get(sc, None)
                    Session.add(form)
            return categories

        def add_loremipsum_to_db(loremipsum_path, via_request=False):
            """Add the contents of the file at ``loremipsum_path`` to the database."""
            categories = {}
            with open(loremipsum_path, 'r') as f:
                i = 0
                for l in f:
                    if i % 100 == 0:
                        if not via_request: Session.commit()
                        log.debug('%d lines processed' % i)
                    i = i + 1
                    categories = create_model(l.replace('\n', ''), categories,
                                             via_request)
                Session.commit()

        loremipsum_path_no_ext = os.path.splitext(loremipsum_path)[0]
        sqlalchemy_URL = self.config['sqlalchemy.url']
        sqlalchemy_URL_list = sqlalchemy_URL.split(':')
        olddump_script_path = os.path.join(self.test_scripts_path, 'olddump.sh')
        oldload_script_path = os.path.join(self.test_scripts_path, 'oldload.sh')
        RDBMS = sqlalchemy_URL_list[0]

        if RDBMS == 'mysql':
            mysql_dump_path = '%s_mysql.sql' % loremipsum_path_no_ext
            username = sqlalchemy_URL_list[1][2:]
            password = sqlalchemy_URL_list[2].split('@')[0]
            dbname = sqlalchemy_URL_list[3].split('/')[1]
            if os.path.exists(mysql_dump_path):
                log.debug('The lorem ipsum MySQL dump file exists.  Loading it...')
                # Clear the current DB completely
                h.clear_all_models(retain=[])
                # Load the dump file to the DB
                shell_script = '#!/bin/sh\nmysql -u %s -p%s %s < %s' % (
                    username, password, dbname, mysql_dump_path)
                with open(oldload_script_path, 'w') as f:
                    f.write(shell_script)
                os.chmod(oldload_script_path, 0744)
                # Load the DB
                with open(os.devnull, 'w') as f:
                    call([oldload_script_path], stdout=f, stderr=f)
                # Destroy the load script
                os.remove(oldload_script_path)
                log.debug('Loaded.')
            else:
                log.debug('Have to import the lorem ipsum dataset from the text file and create the MySQL dump file.')
                # Populate the database from the lorem ipsum text file and dump it
                add_loremipsum_to_db(loremipsum_path, via_request=via_request)
                # Write the DB dump shell script
                shell_script = '#!/bin/sh\nmysqldump -u %s -p%s --no-create-info %s > %s' % (
                    username, password, dbname, mysql_dump_path)
                with open(olddump_script_path, 'w') as f:
                    f.write(shell_script)
                os.chmod(olddump_script_path, 0744)
                # Dump the DB
                with open(os.devnull, 'w') as f:
                    call([olddump_script_path], stdout=f, stderr=f)
                # Destroy the dump script
                os.remove(olddump_script_path)
                log.debug('Imported and dumped.')
        elif RDBMS == 'sqlite' and h.command_line_program_installed('sqlite3'):
            sqlite_dump_path = '%s_sqlite.sql' % loremipsum_path_no_ext
            sqlite_db = sqlalchemy_URL.split('/')[-1]
            dbpath = os.path.join(self.here, sqlite_db)
            if os.path.exists(sqlite_dump_path):
                log.debug('The lorem ipsum SQLite dump file exists.  Loading it...')
                # Clear the current DB completely
                h.clear_all_models(retain=[])
                # Load the dump file to the DB
                shell_script = '#!/bin/sh\nsqlite3 %s < %s' % (
                    dbpath, sqlite_dump_path)
                with open(oldload_script_path, 'w') as f:
                    f.write(shell_script)
                os.chmod(oldload_script_path, 0744)
                # Load the DB
                with open(os.devnull, 'w') as f:
                    call([oldload_script_path], stdout=f, stderr=f)
                # Destroy the load script
                os.remove(oldload_script_path)
                log.debug('Loaded.')
            else:
                log.debug('Have to import the lorem ipsum dataset from the text file and create the SQLite dump file.')
                # Populate the database from the lorem ipsum text file and dump it
                add_loremipsum_to_db(loremipsum_path, via_request=via_request)
                # Write the DB dump shell script
                shell_script = '#!/bin/sh\nsqlite3 %s ".dump" | grep -v "^CREATE" > %s' % (
                    dbpath, sqlite_dump_path)
                with open(olddump_script_path, 'w') as f:
                    f.write(shell_script)
                os.chmod(olddump_script_path, 0744)
                # Dump the DB
                with open(os.devnull, 'w') as f:
                    call([olddump_script_path], stdout=f, stderr=f)
                # Destroy the dump script
                os.remove(olddump_script_path)
                log.debug('Imported and dumped.')
        forms = h.get_forms()
        log.debug('Lorem ipsum data loaded.  There are now %d forms in the db.' % len(forms))

        # Restrict one sentential form in the db.
        restricted_tag = h.generate_restricted_tag()
        Session.add(restricted_tag)
        Session.commit()
        a_form = Session.query(model.Form).\
            filter(model.Form.syntactic_category.\
                has(model.SyntacticCategory.name==u'S')).first()
        a_form_id = a_form.id
        a_form.tags.append(restricted_tag)
        Session.commit()
        restricted_form = Session.query(model.Form).\
            filter(model.Form.tags.any(model.Tag.name==u'restricted')).first()
        assert a_form_id == restricted_form.id
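
The MySQL and SQLite branches above repeat the same four steps: write a one-off shell script, make it executable, run it with output silenced, then delete it. That pattern could be factored into a small helper; the following is a sketch (not part of the original test code) that reuses only calls already appearing above:

def run_one_off_script(script_path, shell_script):
    """Write ``shell_script`` to ``script_path``, run it silently, then remove it."""
    with open(script_path, 'w') as f:
        f.write(shell_script)
    os.chmod(script_path, 0744)
    with open(os.devnull, 'w') as devnull:
        call([script_path], stdout=devnull, stderr=devnull)
    os.remove(script_path)
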
Example #5
    def test_writetofile_all_sentences(self):
        """Tests file writing/retrieval of a corpus containing all sentences.

        That is, that ``PUT /corpora/id/writetofile`` and
        ``GET /corpora/id/servefile`` both work with a corpus defined by a form
        search model that returns all sentences.

        """

        restricted_form_id = Session.query(model.Form).filter(
                model.Form.tags.any(model.Tag.name==u'restricted')).first().id
        tgrep2_installed = h.command_line_program_installed('tgrep2')

        # Create a form search model that retrieves all sentences
        query = {'filter': ['Form', 'syntactic_category', 'name', '=', 'S']}
        params = json.dumps({
            'name': u'Get all sentences',
            'description': u'Query to return all sentences in the database.',
            'search': query
        })
        response = self.app.post(url('formsearches'), params, self.json_headers,
                                 self.extra_environ_admin)
        resp = json.loads(response.body)
        form_search_id = resp['id']

        # Perform the search to get the resulting forms.
        params = json.dumps({
            'query': query,
            'paginator': {'page': 1, 'items_per_page': 1}})
        response = self.app.post(url('/forms/search'), params,
                                 self.json_headers, self.extra_environ_admin)
        resp = json.loads(response.body)
        sentence_count = resp['paginator']['count']

        # Generate some valid corpus creation input parameters.
        params = self.corpus_create_params.copy()
        params.update({
            'name': u'Corpus of sentences',
            'description': u'No ordering, no duplicates.',
            'form_search': form_search_id
        })
        params = json.dumps(params)

        # Create the corpus
        #assert os.listdir(self.corpora_path) == []
        original_corpus_count = Session.query(Corpus).count()
        response = self.app.post(url('corpora'), params, self.json_headers,
                                 self.extra_environ_admin)
        resp = json.loads(response.body)
        corpus_id = resp['id']
        new_corpus_count = Session.query(Corpus).count()
        corpus = Session.query(Corpus).get(corpus_id)
        corpus_dir = os.path.join(self.corpora_path, 'corpus_%d' % corpus_id)
        corpus_dir_contents = os.listdir(corpus_dir)
        assert new_corpus_count == original_corpus_count + 1
        assert resp['name'] == u'Corpus of sentences'
        assert resp['description'] == u'No ordering, no duplicates.'
        assert corpus_dir_contents == []
        assert response.content_type == 'application/json'
        assert resp['content'] == u''
        assert len(corpus.forms) == sentence_count
        assert resp['form_search']['id'] == form_search_id

        # Try to TGrep2-search the corpus without first writing it to file
        # and expect to fail.
        tgrep2pattern = json.dumps({'tgrep2pattern': u'S < NP-SBJ'})
        if h.command_line_program_installed('tgrep2'):
            # Expect the search to fail because the corpus has not yet been
            # written to file as a treebank.
            response = self.app.request(url(controller='corpora', action='tgrep2', id=corpus_id),
                    method='SEARCH', body=tgrep2pattern, headers=self.json_headers,
                    environ=self.extra_environ_admin, status=400)
            tgrep2resp = json.loads(response.body)
            assert tgrep2resp['error'] == 'Corpus %d has not been written to file as a treebank.'

        # Write the corpus to file
        sleep(1)
        params = json.dumps({'format': 'treebank'})
        response = self.app.put(url('/corpora/%d/writetofile' % corpus_id), params,
            headers=self.json_headers, extra_environ=self.extra_environ_admin)
        resp2 = json.loads(response.body)
        corpus_dir_contents = os.listdir(corpus_dir)
        corpus_tbk_path = os.path.join(corpus_dir, 'corpus_%d.tbk' % corpus_id)
        corpus_tbk_mod_time = h.get_file_modification_time(corpus_tbk_path)
        corpus_tbk_gzipped_path = '%s.gz' % corpus_tbk_path
        corpus_tbk_file_length = h.get_file_length(corpus_tbk_path)
        corpus_tbk_t2c_path = os.path.join(corpus_dir, 'corpus_%d.tbk.t2c' % corpus_id)
        corpus_file_id = resp2['files'][0]['id']
        assert resp['id'] == resp2['id']
        assert resp['name'] == resp2['name']
        assert resp2['datetime_modified'] > resp['datetime_modified']
        assert os.path.exists(corpus_tbk_path)
        if tgrep2_installed:
            assert os.path.exists(corpus_tbk_t2c_path)
        else:
            assert not os.path.exists(corpus_tbk_t2c_path)
        assert os.path.exists(corpus_tbk_gzipped_path)
        assert get_file_size(corpus_tbk_path) > get_file_size(corpus_tbk_gzipped_path)
        assert sentence_count == corpus_tbk_file_length

        # Retrieve the corpus file directly from the filesystem.
        corpus_file_object = open(corpus_tbk_path, 'rb')
        corpus_file_content = corpus_file_object.read()

        # Attempt to retrieve the gzipped corpus file via request as a restricted
        # user and expect to fail.  This is because there is one restricted
        # sentential form in the db, cf. the ``initialize`` "test".
        response = self.app.get(url('/corpora/%d/servefile/%d' % (
            corpus_id, corpus_file_id)), params, status=403,
            headers=self.json_headers, extra_environ=self.extra_environ_contrib)
        resp = json.loads(response.body)
        assert resp == h.unauthorized_msg

        # Retrieve the gzipped corpus file via request.
        response = self.app.get(url('/corpora/%d/servefile/%d' % (
            corpus_id, corpus_file_id)), params,
            headers=self.json_headers, extra_environ=self.extra_environ_admin)
        unzipped_corpus_file_content = decompress_gzip_string(response.body)
        assert unzipped_corpus_file_content == corpus_file_content
        assert response.content_type == u'application/x-gzip'

        # Now update the corpus by changing the form search, re-write-to-file
        # and make sure everything works.

        # Create a form search model that retrieves all sentences with even-numbered
        # ids and the restricted form.
        query = {'filter': ['and', [
                    ['Form', 'syntactic_category', 'name', '=', 'S'],
                    ['or', [['Form', 'id', '=', restricted_form_id],
                            ['Form', 'id', 'regex', '[02468]$']]]]]}
        params = json.dumps({
            'name': u'Get even-numbered or restricted sentences',
            'description': u'Query to return all sentences in the database that have even-numbered ids or are restricted.',
            'search': query
        })
        response = self.app.post(url('formsearches'), params, self.json_headers,
                                 self.extra_environ_admin)
        resp = json.loads(response.body)
        form_search_id = resp['id']

        # Perform the search to get the resulting forms.
        params = json.dumps({
            'query': query,
            'paginator': {'page': 1, 'items_per_page': 1}})
        response = self.app.post(url('/forms/search'), params,
                                 self.json_headers, self.extra_environ_admin)
        resp = json.loads(response.body)
        sentence_count = resp['paginator']['count']

        # Update the above-created corpus.
        params = self.corpus_create_params.copy()
        params.update({
            'name': u'Corpus of even-numbered sentences',
            'description': u'No ordering, no duplicates.',
            'form_search': form_search_id
        })
        params = json.dumps(params)
        original_corpus_count = Session.query(Corpus).count()
        response = self.app.put(url('corpus', id=corpus_id), params, self.json_headers,
                                 self.extra_environ_admin)
        resp = json.loads(response.body)
        new_corpus_count = Session.query(Corpus).count()
        corpus = Session.query(Corpus).get(corpus_id)
        corpus_dir = os.path.join(self.corpora_path, 'corpus_%d' % corpus_id)
        corpus_dir_contents = os.listdir(corpus_dir)
        assert new_corpus_count == original_corpus_count
        assert resp['name'] == u'Corpus of even-numbered sentences'
        assert resp['description'] == u'No ordering, no duplicates.'
        assert corpus_dir_contents != [] # Already a previously written corpus file there
        assert response.content_type == 'application/json'
        assert resp['content'] == u''
        assert len(corpus.forms) == sentence_count
        assert resp['form_search']['id'] == form_search_id

        # Write the corpus to file
        sleep(1)
        params = json.dumps({'format': 'treebank'})
        response = self.app.put(url('/corpora/%d/writetofile' % corpus_id), params,
            headers=self.json_headers, extra_environ=self.extra_environ_admin)
        resp2 = json.loads(response.body) # Response is a JSON repr. of the corpus
        corpus_dir_contents = os.listdir(corpus_dir)
        corpus_tbk_path = os.path.join(corpus_dir, 'corpus_%d.tbk' % corpus_id)
        old_corpus_tbk_mod_time = corpus_tbk_mod_time
        corpus_tbk_mod_time = h.get_file_modification_time(corpus_tbk_path) 
        corpus_tbk_gzipped_path = '%s.gz' % corpus_tbk_path
        corpus_tbk_file_length = h.get_file_length(corpus_tbk_path) # no. of lines
        corpus_tbk_t2c_path = os.path.join(corpus_dir, 'corpus_%d.tbk.t2c' % corpus_id)
        corpus_file_id = resp2['files'][0]['id']
        assert old_corpus_tbk_mod_time < corpus_tbk_mod_time
        assert len(resp2['files']) == 1
        assert resp['id'] == resp2['id']
        assert resp['name'] == resp2['name']
        assert resp2['datetime_modified'] > resp['datetime_modified']
        assert os.path.exists(corpus_tbk_path)
        assert os.path.exists(corpus_tbk_gzipped_path)
        if tgrep2_installed:
            assert os.path.exists(corpus_tbk_t2c_path)
        else:
            assert not os.path.exists(corpus_tbk_t2c_path)
        assert get_file_size(corpus_tbk_path) > get_file_size(corpus_tbk_gzipped_path)
        assert sentence_count == corpus_tbk_file_length

        # Retrieve the corpus file directly from the filesystem.
        corpus_file_object = open(corpus_tbk_path, 'rb')
        corpus_file_content = corpus_file_object.read()

        # Attempt to retrieve the gzipped corpus file via request as a restricted
        # user and expect to fail.  This is because the one restricted sentential 
        # form in the db is in the corpus.
        response = self.app.get(url('/corpora/%d/servefile/%d' % (
            corpus_id, corpus_file_id)), params, status=403,
            headers=self.json_headers, extra_environ=self.extra_environ_contrib)
        resp = json.loads(response.body)
        assert resp == h.unauthorized_msg

        # Retrieve the gzipped corpus file via request.
        response = self.app.get(url('/corpora/%d/servefile/%d' % (
            corpus_id, corpus_file_id)), params,
            headers=self.json_headers, extra_environ=self.extra_environ_admin)
        unzipped_corpus_file_content = decompress_gzip_string(response.body)
        assert unzipped_corpus_file_content == corpus_file_content

        # Write the corpus to file again without any changes and expect a vacuous recreation
        sleep(1)
        params = json.dumps({'format': 'treebank'})
        response = self.app.put(url('/corpora/%d/writetofile' % corpus_id), params,
            headers=self.json_headers, extra_environ=self.extra_environ_admin)
        old_resp2 = resp2
        resp2 = json.loads(response.body) # Response is a JSON repr. of the corpus
        corpus_tbk_path = os.path.join(corpus_dir, 'corpus_%d.tbk' % corpus_id)
        old_corpus_tbk_mod_time = corpus_tbk_mod_time
        corpus_tbk_mod_time = h.get_file_modification_time(corpus_tbk_path) 
        assert old_corpus_tbk_mod_time < corpus_tbk_mod_time
        assert len(resp2['files']) == 1
        assert resp2['datetime_modified'] > old_resp2['datetime_modified']
        assert os.path.exists(corpus_tbk_path)

        # TGrep2-search the corpus-as-treebank
        # {'order_by': {'order_by_model': '', 'order_by_attribute': '', 'order_by_direction': ''}}
        # {'paginator': {'page': 0, 'items_per_page': 0}}
        
        tgrep2pattern = u'S < NP-SBJ'
        query = {'paginator': {'page': 1, 'items_per_page': 10}, 'tgrep2pattern': tgrep2pattern}
        json_query = json.dumps(query)
        if not h.command_line_program_installed('tgrep2'):
            response = self.app.request(url(controller='corpora', action='tgrep2', id=corpus_id),
                    method='SEARCH', body=json_query, headers=self.json_headers,
                    environ=self.extra_environ_admin, status=400)
            resp = json.loads(response.body)
            assert resp["error"] ==  "TGrep2 is not installed."
        else:
            # TGrep2-search the corpus-as-treebank
            response = self.app.request(url(controller='corpora', action='tgrep2', id=corpus_id),
                    method='SEARCH', body=json_query, headers=self.json_headers,
                    environ=self.extra_environ_admin)
            resp = json.loads(response.body)
            for f in resp['items']:
                assert '(S ' in f['syntax'] and '(NP-SBJ ' in f['syntax']

            # A slightly more complex TGrep2 search
            tgrep2pattern = u'S < NP-SBJ << DT'
            query['tgrep2pattern'] = tgrep2pattern
            json_query = json.dumps(query)
            response = self.app.request(url(controller='corpora', action='tgrep2', id=corpus_id),
                    method='SEARCH', body=json_query, headers=self.json_headers,
                    environ=self.extra_environ_admin)
            resp = json.loads(response.body)
            for f in resp['items']:
                assert ('(S ' in f['syntax'] and '(NP-SBJ ' in f['syntax'] and 
                    '(DT ' in f['syntax'])

            # Another TGrep2 search
            tgrep2pattern = u'NP-SBJ < DT . VP'
            query['tgrep2pattern'] = tgrep2pattern
            json_query = json.dumps(query)
            response = self.app.request(url(controller='corpora', action='tgrep2', id=corpus_id),
                    method='SEARCH', body=json_query, headers=self.json_headers,
                    environ=self.extra_environ_admin)
            resp = json.loads(response.body)
            match_count = resp['paginator']['count']
            for f in resp['items']:
                assert ('(NP-SBJ ' in f['syntax'] and '(DT ' in f['syntax'] and 
                    '(VP ' in f['syntax'])

            # Failed tgrep2 search with invalid corpus id.
            response = self.app.request(url(controller='corpora', action='tgrep2', id=123456789),
                    method='SEARCH', body=json_query, headers=self.json_headers,
                    environ=self.extra_environ_admin, status=404)
            resp = json.loads(response.body)
            assert resp['error'] == u'There is no corpus with id 123456789'

            # Restricted user will not get all of the results.
            response = self.app.request(url(controller='corpora', action='tgrep2', id=corpus_id),
                    method='SEARCH', body=json_query, headers=self.json_headers,
                    environ=self.extra_environ_view)
            resp = json.loads(response.body)
            restricted_match_count = resp['paginator']['count']
            assert isinstance(restricted_match_count, int) and restricted_match_count < match_count

            # Failed TGrep2 search: bad JSON in request body
            json_query = json_query[:-1]
            response = self.app.request(url(controller='corpora', action='tgrep2', id=corpus_id),
                    method='SEARCH', body=json_query, headers=self.json_headers,
                    environ=self.extra_environ_admin, status=400)
            resp = json.loads(response.body)
            assert resp == h.JSONDecodeErrorResponse

            # Failed TGrep2 search: malformed params
            tgrep2pattern = json.dumps({'TGrep2pattern': u'NP-SBJ < DT . VP'})
            response = self.app.request(url(controller='corpora', action='tgrep2', id=corpus_id),
                    method='SEARCH', body=tgrep2pattern, headers=self.json_headers,
                    environ=self.extra_environ_admin, status=400)
            resp = json.loads(response.body)
            assert resp['errors']['tgrep2pattern'] == \
                    "A tgrep2pattern attribute must be supplied and must have a unicode/string value"

            # Empty string TGrep2 pattern results in no forms being returned.
            tgrep2pattern = json.dumps({'tgrep2pattern': u''})
            response = self.app.request(url(controller='corpora', action='tgrep2', id=corpus_id),
                    method='SEARCH', body=tgrep2pattern, headers=self.json_headers,
                    environ=self.extra_environ_admin)
            resp = json.loads(response.body)
            assert resp == []
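
For reference, the SEARCH requests above always carry a JSON body with an obligatory ``tgrep2pattern`` attribute and optional ``paginator`` and ``order_by`` attributes (the shapes hinted at in the commented-out lines earlier in this test). A sketch of a fuller request body follows; the ``order_by`` values are illustrative assumptions, not taken from the original tests:

tgrep2_search_body = json.dumps({
    'tgrep2pattern': u'S < NP-SBJ',                   # obligatory
    'paginator': {'page': 1, 'items_per_page': 10},   # optional
    'order_by': {'order_by_model': 'Form',            # optional; these values
                 'order_by_attribute': 'id',          # are illustrative only
                 'order_by_direction': 'asc'}
})
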
Example #6
    def tgrep2(self, id):
        """Search the corpus-as-treebank using Tgrep2.

        :URL: ``SEARCH/POST /corpora/id/tgrep2``.
        :Request body: JSON object with obligatory 'tgrep2pattern' attribute and
            optional 'paginator' and 'order_by' attributes.
        :param str id: the ``id`` value of the corpus.
        :returns: an array of forms as JSON objects

        """
        if not h.command_line_program_installed('tgrep2'):
            response.status_int = 400
            return {'error': 'TGrep2 is not installed.'}
        corpus = Session.query(Corpus).get(id)
        if corpus:
            try:
                treebank_corpus_file_object = filter(lambda cf: cf.format == u'treebank',
                        corpus.files)[0]
                corpus_dir_path = get_corpus_dir_path(corpus)
                tgrep2_corpus_file_path = os.path.join(corpus_dir_path,
                        '%s.t2c' % treebank_corpus_file_object.filename)
            except Exception:
                response.status_int = 400
                return {'error': 'Corpus %d has not been written to file as a treebank.'}
            if not os.path.exists(tgrep2_corpus_file_path):
                response.status_int = 400
                return {'error': 'Corpus %d has not been written to file as a treebank.'}
            #if not authorized_to_access_corpus_file(session['user'], treebank_corpus_file_object):
            #    response.status_int = 403
            #    return h.unauthorized_msg
            try:
                request_params = json.loads(unicode(request.body, request.charset))
                try:
                    tgrep2pattern = request_params['tgrep2pattern']
                    assert isinstance(tgrep2pattern, basestring)
                except Exception:
                    response.status_int = 400
                    return {'errors': {'tgrep2pattern':
                        'A tgrep2pattern attribute must be supplied and must have a unicode/string value'}}
                tmp_path = os.path.join(corpus_dir_path, '%s%s.txt' % (session['user'].username, h.generate_salt()))
                with open(os.devnull, "w") as fnull:
                    with open(tmp_path, 'w') as stdout:
                        # The -wu option causes TGrep2 to print only the root symbol of each matching tree
                        process = Popen(['tgrep2', '-c', tgrep2_corpus_file_path, '-wu', tgrep2pattern],
                            stdout=stdout, stderr=fnull)
                        process.communicate()
                match_ids = filter(None, map(get_form_ids_from_tgrep2_output_line, open(tmp_path, 'r')))
                os.remove(tmp_path)
                if match_ids:
                    query = h.eagerload_form(Session.query(Form)).filter(Form.id.in_(match_ids))
                    query = h.filter_restricted_models('Form', query)
                    query = h.add_order_by(query, request_params.get('order_by'), self.query_builder)
                    result = h.add_pagination(query, request_params.get('paginator'))
                elif request_params.get('paginator'):
                    paginator = request_params['paginator']
                    paginator['count'] = 0
                    result = {'paginator': paginator, 'items': []}
                else:
                    result = []
                return result
            except h.JSONDecodeError:
                response.status_int = 400
                return h.JSONDecodeErrorResponse
            except Invalid, e:
                response.status_int = 400
                return {'errors': e.unpack_errors()}
            except Exception, e:
                response.status_int = 400
                return {'error': 'Unable to perform TGrep2 search: %s.' % e}
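
Outside of the controller, the same search can be reproduced directly against a ``.t2c`` file using the TGrep2 invocation from the action above. A sketch with hypothetical paths, assuming ``tgrep2`` is installed:

import os
from subprocess import Popen

t2c_path = '/tmp/corpus_1/corpus_1.tbk.t2c'   # hypothetical indexed corpus
tmp_path = '/tmp/tgrep2_matches.txt'          # hypothetical output file
with open(os.devnull, 'w') as fnull:
    with open(tmp_path, 'w') as stdout:
        # -c names the .t2c corpus to search; -wu mirrors the option used above.
        Popen(['tgrep2', '-c', t2c_path, '-wu', u'S < NP-SBJ'],
              stdout=stdout, stderr=fnull).communicate()
matching_lines = [line for line in open(tmp_path) if line.strip()]
os.remove(tmp_path)
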
Example #7
    def tgrep2(self, id):
        """Search the corpus-as-treebank using Tgrep2.

        :URL: ``SEARCH/POST /corpora/id/tgrep2``.
        :Request body: JSON object with obligatory 'tgrep2pattern' attribute and
            optional 'paginator' and 'order_by' attributes.
        :param str id: the ``id`` value of the corpus.
        :returns: an array of forms as JSON objects

        """
        if not h.command_line_program_installed('tgrep2'):
            response.status_int = 400
            return {'error': 'TGrep2 is not installed.'}
        corpus = Session.query(Corpus).get(id)
        if corpus:
            try:
                treebank_corpus_file_object = filter(
                    lambda cf: cf.format == u'treebank', corpus.files)[0]
                corpus_dir_path = get_corpus_dir_path(corpus)
                tgrep2_corpus_file_path = os.path.join(
                    corpus_dir_path,
                    '%s.t2c' % treebank_corpus_file_object.filename)
            except Exception:
                response.status_int = 400
                return {
                    'error':
                    'Corpus %d has not been written to file as a treebank.'
                }
            if not os.path.exists(tgrep2_corpus_file_path):
                response.status_int = 400
                return {
                    'error':
                    'Corpus %d has not been written to file as a treebank.'
                }
            #if not authorized_to_access_corpus_file(session['user'], treebank_corpus_file_object):
            #    response.status_int = 403
            #    return h.unauthorized_msg
            try:
                request_params = json.loads(
                    unicode(request.body, request.charset))
                try:
                    tgrep2pattern = request_params['tgrep2pattern']
                    assert isinstance(tgrep2pattern, basestring)
                except Exception:
                    response.status_int = 400
                    return {
                        'errors': {
                            'tgrep2pattern':
                            'A tgrep2pattern attribute must be supplied and must have a unicode/string value'
                        }
                    }
                tmp_path = os.path.join(
                    corpus_dir_path,
                    '%s%s.txt' % (session['user'].username, h.generate_salt()))
                with open(os.devnull, "w") as fnull:
                    with open(tmp_path, 'w') as stdout:
                        # The -wu option causes TGrep2 to print only the root symbol of each matching tree
                        process = Popen([
                            'tgrep2', '-c', tgrep2_corpus_file_path, '-wu',
                            tgrep2pattern
                        ],
                                        stdout=stdout,
                                        stderr=fnull)
                        process.communicate()
                match_ids = filter(
                    None,
                    map(get_form_ids_from_tgrep2_output_line,
                        open(tmp_path, 'r')))
                os.remove(tmp_path)
                if match_ids:
                    query = h.eagerload_form(Session.query(Form)).filter(
                        Form.id.in_(match_ids))
                    query = h.filter_restricted_models('Form', query)
                    query = h.add_order_by(query,
                                           request_params.get('order_by'),
                                           self.query_builder)
                    result = h.add_pagination(query,
                                              request_params.get('paginator'))
                elif request_params.get('paginator'):
                    paginator = request_params['paginator']
                    paginator['count'] = 0
                    result = {'paginator': paginator, 'items': []}
                else:
                    result = []
                return result
            except h.JSONDecodeError:
                response.status_int = 400
                return h.JSONDecodeErrorResponse
            except Invalid, e:
                response.status_int = 400
                return {'errors': e.unpack_errors()}
            except Exception, e:
                response.status_int = 400
                return {'error': 'Unable to perform TGrep2 search: %s.' % e}
Example #8
    def test_writetofile_content_specified(self):
        """Tests file writing/retrieval of a corpus whose forms are specified in the ``content`` attribute.

        """

        tgrep2_installed = h.command_line_program_installed('tgrep2')

        # Get ids of all sentences.
        sentences = Session.query(model.Form).\
            filter(model.Form.syntactic_category.\
                has(model.SyntacticCategory.name==u'S')).all()
        len_sentences = len(sentences)
        sentences = u','.join(map(str, map(lambda f: f.id, sentences)))

        # Get ids of all sentences with more than 5 words.
        long_sentences = Session.query(model.Form).\
            filter(and_(
                model.Form.syntactic_category.has(model.SyntacticCategory.name==u'S'),
                model.Form.transcription.op('regexp')(u'^([^ ]+ ){5}[^ ]+'))).all()
        len_long_sentences = len(long_sentences)
        long_sentences = u','.join(
            map(str, map(lambda f: f.id, long_sentences)))

        content = u','.join(
            [sentences, long_sentences, long_sentences, long_sentences])
        anticipated_length = len_sentences + (3 * len_long_sentences)
        name = u'Corpus of sentences with 6+ word sentences repeated'
        description = u'Ordered by content field; duplicates of sentences with 6 or more words.'

        # Generate some valid corpus creation input parameters.
        params = self.corpus_create_params.copy()
        params.update({
            'name': name,
            'description': description,
            'content': content
        })
        params = json.dumps(params)

        # Create the corpus
        original_corpus_count = Session.query(Corpus).count()
        response = self.app.post(url('corpora'), params, self.json_headers,
                                 self.extra_environ_admin)
        resp = json.loads(response.body)
        corpus_id = resp['id']
        new_corpus_count = Session.query(Corpus).count()
        corpus = Session.query(Corpus).get(corpus_id)
        corpus_dir = os.path.join(self.corpora_path, 'corpus_%d' % corpus_id)
        corpus_dir_contents = os.listdir(corpus_dir)
        assert new_corpus_count == original_corpus_count + 1
        assert resp['name'] == name
        assert resp['description'] == description
        assert corpus_dir_contents == []
        assert response.content_type == 'application/json'
        assert resp['content'] == content
        # The ``forms`` attribute is a collection without repeats, which is why the following holds:
        assert len(corpus.forms) == len_sentences

        # Write the corpus to file as a treebank
        sleep(1)
        params = json.dumps({u'format': u'treebank'})
        response = self.app.put(url('/corpora/%d/writetofile' % corpus_id),
                                params,
                                headers=self.json_headers,
                                extra_environ=self.extra_environ_admin)
        resp2 = json.loads(response.body)
        corpus_dir_contents = os.listdir(corpus_dir)
        corpus_tbk_path = os.path.join(corpus_dir, 'corpus_%d.tbk' % corpus_id)
        corpus_tbk_gzipped_path = '%s.gz' % corpus_tbk_path
        corpus_tbk_gzipped_size = get_file_size(corpus_tbk_gzipped_path)
        corpus_tbk_file_length = h.get_file_length(corpus_tbk_path)
        corpus_tbk_t2c_path = os.path.join(corpus_dir,
                                           'corpus_%d.tbk.t2c' % corpus_id)
        corpus_file_id = resp2['files'][0]['id']
        assert resp['id'] == resp2['id']
        assert resp['name'] == resp2['name']
        assert resp2['datetime_modified'] > resp['datetime_modified']
        assert os.path.exists(corpus_tbk_path)
        if tgrep2_installed:
            assert os.path.exists(corpus_tbk_t2c_path)
        else:
            assert not os.path.exists(corpus_tbk_t2c_path)
        assert os.path.exists(corpus_tbk_gzipped_path)
        assert get_file_size(corpus_tbk_path) > corpus_tbk_gzipped_size
        assert anticipated_length == corpus_tbk_file_length

        # Retrieve the corpus file directly from the filesystem.
        corpus_file_object = open(corpus_tbk_path, 'rb')
        corpus_file_content = corpus_file_object.read()

        # Attempt to retrieve the gzipped corpus file via request as a restricted
        # user and expect to fail.
        response = self.app.get(url('/corpora/%d/servefile/%d' %
                                    (corpus_id, corpus_file_id)),
                                status=403,
                                headers=self.json_headers,
                                extra_environ=self.extra_environ_contrib)
        resp = json.loads(response.body)
        assert resp == h.unauthorized_msg

        # Retrieve the gzipped corpus file via request.
        response = self.app.get(url('/corpora/%d/servefile/%d' %
                                    (corpus_id, corpus_file_id)),
                                headers=self.json_headers,
                                extra_environ=self.extra_environ_admin)
        assert len(response.body) < len(corpus_file_content)
        unzipped_corpus_file_content = decompress_gzip_string(response.body)
        assert unzipped_corpus_file_content == corpus_file_content

        # Write the corpus to file as a list of transcriptions, one per line.
        sleep(1)
        params = json.dumps({u'format': u'transcriptions only'})
        response = self.app.put(url('/corpora/%d/writetofile' % corpus_id),
                                params,
                                headers=self.json_headers,
                                extra_environ=self.extra_environ_admin)
        old_resp2 = resp2
        resp2 = json.loads(response.body)
        corpus_dir_contents = os.listdir(corpus_dir)
        corpus_TO_path = os.path.join(
            corpus_dir, 'corpus_%d_transcriptions.txt' % corpus_id)
        corpus_TO_gzipped_path = '%s.gz' % corpus_TO_path
        corpus_TO_gzipped_size = get_file_size(corpus_TO_gzipped_path)
        corpus_TO_file_length = h.get_file_length(corpus_TO_path)
        if tgrep2_installed:
            # Five files should be present: tbk, tbk.gz, tbk.t2c, txt and txt.gz
            assert len(corpus_dir_contents) == 5
        else:
            # Four files should be present: tbk, tbk.gz, txt and txt.gz
            assert len(corpus_dir_contents) == 4
        assert resp2['datetime_modified'] > old_resp2['datetime_modified']
        assert os.path.exists(corpus_TO_path)
        assert os.path.exists(corpus_TO_gzipped_path)
        assert get_file_size(corpus_TO_path) > corpus_TO_gzipped_size
        assert anticipated_length == corpus_TO_file_length

        # Finally delete the corpus and expect it, its file data and corpus file
        # objects to have been deleted.
        assert os.path.exists(corpus_TO_path)
        assert os.path.exists(corpus_TO_gzipped_path)
        assert os.path.exists(corpus_tbk_path)
        assert os.path.exists(corpus_tbk_gzipped_path)
        if tgrep2_installed:
            assert os.path.exists(corpus_tbk_t2c_path)
        else:
            assert not os.path.exists(corpus_tbk_t2c_path)
        corpus_file_ids = [cf['id'] for cf in resp2['files']]
        self.app.delete(url('corpus', id=corpus_id),
                        headers=self.json_headers,
                        extra_environ=self.extra_environ_admin)
        assert Session.query(model.Corpus).get(corpus_id) is None
        for corpus_file_id in corpus_file_ids:
            assert Session.query(model.CorpusFile).get(corpus_file_id) is None
        assert not os.path.exists(corpus_TO_path)
        assert not os.path.exists(corpus_TO_gzipped_path)
        assert not os.path.exists(corpus_tbk_path)
        assert not os.path.exists(corpus_tbk_t2c_path)
        assert not os.path.exists(corpus_tbk_gzipped_path)
Example #9
    def test_aaa_initialize(self):
        """Initialize the database using pseudo-data generated from random lorem ipsum sentences.

        These are located in ``onlinelinguisticdatabase/tests/data/corpora``.
        The data contain morphologically analyzed sentences, their component
        morphemes, and syntactic categories.  The sentences have phrase
        structure trees in bracket notation.

        The test will try to load the lorem ipsum dataset from a MySQL/SQLite
        dump file in ``onlinelinguisticdatabase/tests/data/corpora``.  If the
        dump file corresponding to ``loremipsum_path`` does not exist, it will
        import the lorem ipsum data directly from the text files and create
        the dump file so that future tests can run more speedily.  The
        ``loremipsum100_path``, ``loremipsum1000_path``, ``loremipsum10000_path``
        and ``loremipsum30000_path`` files are available and contain 100, 1,000,
        10,000 and 30,000 sentences, respectively.

        Setting the ``via_request`` variable to ``True`` will cause all of the
        forms to be created via request, i.e., via
        ``self.app.post(url('forms'))...``.  This is much slower but may be
        desirable since values for the morphological analysis attributes
        will be generated.

        .. note::

            In order to run ``mysqldump`` with the MySQL user listed in
            ``test.ini``, that user must have permission to lock and update
            tables (alter and file privileges may also be required ...)::

                mysql -u root -p<root_password>
                grant lock tables, update on old_test.* to 'old'@'localhost';

        .. warning::

            Loading the .txt or .sql files with the ``via_request`` option set to
            ``True`` will take a very long time.  This might be an argument for
            separating the interface and logic components of the controllers so
            that a "core" HTTP-less OLD application could be exposed.  This
            would facilitate the creation of models with system-generated data
            and validation but without the HTTP overhead...

        """

        ########################################################################
        # Configure lorem ipsum data set import
        ########################################################################

        # Set ``loremipsum_path`` to ``self.loremipsum100_path``,
        # ``self.loremipsum1000_path`` or ``self.loremipsum10000_path``.
        # WARNING: the larger ones will take a long time.
        # Use the 10,000-sentence lorem ipsum dataset when you need to check
        # that very large corpora are handled correctly.
        loremipsum_path = self.loremipsum100_path

        # Set ``via_request`` to ``True`` to create all forms via HTTP requests.
        via_request = True

        self._add_SEARCH_to_web_test_valid_methods()

        # Add an application settings so that morpheme references will work out right.
        application_settings = h.generate_default_application_settings()
        Session.add(application_settings)
        Session.commit()

        def create_model(line, categories, via_request=False):
            """Create a model (form or syncat) using the string in ``line``."""
            model_name = 'Form'  # string label; avoid shadowing the ``model`` module
            elements = unicode(line).split('\t')
            non_empty_elements = filter(None, elements)
            try:
                ol, mb, mg, ml, sc, sx = non_empty_elements
            except Exception:
                try:
                    ol, mb, mg, ml, sc = non_empty_elements
                    sx = u''
                except Exception:
                    try:
                        model_name = 'SyntacticCategory'
                        n, t = non_empty_elements
                    except Exception:
                        return categories
            if via_request:
                if model_name == 'SyntacticCategory':
                    params = self.syntactic_category_create_params.copy()
                    params.update({'name': n, 'type': t})
                    params = json.dumps(params)
                    response = self.app.post(url('syntacticcategories'),
                                             params, self.json_headers,
                                             self.extra_environ_admin)
                    cat_id = json.loads(response.body)['id']
                    categories[n] = cat_id
                else:
                    params = self.form_create_params.copy()
                    params.update({
                        'transcription': ol,
                        'morpheme_break': mb,
                        'morpheme_gloss': mg,
                        'translations': [{'transcription': ml,
                                          'grammaticality': u''}],
                        'syntax': sx,
                        'syntactic_category': categories.get(sc, u'')
                    })
                    params = json.dumps(params)
                    self.app.post(url('forms'), params, self.json_headers,
                                  self.extra_environ_admin)
            else:
                if model_name == 'SyntacticCategory':
                    syntactic_category = model.SyntacticCategory()
                    syntactic_category.name = n
                    syntactic_category.type = t
                    Session.add(syntactic_category)
                    categories[n] = syntactic_category.id
                else:
                    form = model.Form()
                    form.transcription = ol
                    form.morpheme_break = mb
                    form.morpheme_gloss = mg
                    translation = model.Translation()
                    translation.transcription = ml
                    form.translations.append(translation)
                    form.syntax = sx
                    form.syntacticcategory_id = categories.get(sc, None)
                    Session.add(form)
            return categories

        def add_loremipsum_to_db(loremipsum_path, via_request=False):
            """Add the contents of the file at ``loremipsum_path`` to the database."""
            categories = {}
            with open(loremipsum_path, 'r') as f:
                i = 0
                for l in f:
                    if i % 100 == 0:
                        if not via_request: Session.commit()
                        log.debug('%d lines processed' % i)
                    i = i + 1
                    categories = create_model(l.replace('\n', ''), categories,
                                              via_request)
                Session.commit()

        loremipsum_path_no_ext = os.path.splitext(loremipsum_path)[0]
        sqlalchemy_URL = self.config['sqlalchemy.url']
        sqlalchemy_URL_list = sqlalchemy_URL.split(':')
        olddump_script_path = os.path.join(self.test_scripts_path,
                                           'olddump.sh')
        oldload_script_path = os.path.join(self.test_scripts_path,
                                           'oldload.sh')
        RDBMS = sqlalchemy_URL_list[0]

        if RDBMS == 'mysql':
            mysql_dump_path = '%s_mysql.sql' % loremipsum_path_no_ext
            username = sqlalchemy_URL_list[1][2:]
            password = sqlalchemy_URL_list[2].split('@')[0]
            dbname = sqlalchemy_URL_list[3].split('/')[1]
            if os.path.exists(mysql_dump_path):
                log.debug(
                    'The lorem ipsum MySQL dump file exists.  Loading it...')
                # Clear the current DB completely
                h.clear_all_models(retain=[])
                # Load the dump file to the DB
                shell_script = '#!/bin/sh\nmysql -u %s -p%s %s < %s' % (
                    username, password, dbname, mysql_dump_path)
                with open(oldload_script_path, 'w') as f:
                    f.write(shell_script)
                os.chmod(oldload_script_path, 0744)
                # Load the DB
                with open(os.devnull, 'w') as f:
                    call([oldload_script_path], stdout=f, stderr=f)
                # Destroy the load script
                os.remove(oldload_script_path)
                log.debug('Loaded.')
            else:
                log.debug(
                    'Have to import the lorem ipsum dataset from the text file and create the MySQL dump file.'
                )
                # Populate the database from the lorem ipsum text file and dump it
                add_loremipsum_to_db(loremipsum_path, via_request=via_request)
                # Write the DB dump shell script
                # Note: the --single-transaction option seems to be required (on Mac MySQL 5.6 using InnoDB tables ...)
                # see http://forums.mysql.com/read.php?10,108835,112951#msg-112951
                shell_script = '#!/bin/sh\nmysqldump -u %s -p%s --single-transaction --no-create-info --result-file=%s %s' % (
                    username, password, mysql_dump_path, dbname)
                with open(olddump_script_path, 'w') as f:
                    f.write(shell_script)
                os.chmod(olddump_script_path, 0744)
                # Dump the DB
                with open(os.devnull, 'w') as f:
                    call([olddump_script_path], stdout=f, stderr=f)
                # Destroy the dump script
                os.remove(olddump_script_path)
                log.debug('Imported and dumped.')
        elif RDBMS == 'sqlite' and h.command_line_program_installed('sqlite3'):
            sqlite_dump_path = '%s_sqlite.sql' % loremipsum_path_no_ext
            sqlite_db = sqlalchemy_URL.split('/')[-1]
            dbpath = os.path.join(self.here, sqlite_db)
            if os.path.exists(sqlite_dump_path):
                log.debug(
                    'The lorem ipsum SQLite dump file exists.  Loading it...')
                # Clear the current DB completely
                h.clear_all_models(retain=[])
                # Load the dump file to the DB
                shell_script = '#!/bin/sh\nsqlite3 %s < %s' % (
                    dbpath, sqlite_dump_path)
                with open(oldload_script_path, 'w') as f:
                    f.write(shell_script)
                os.chmod(oldload_script_path, 0744)
                # Load the DB
                with open(os.devnull, 'w') as f:
                    call([oldload_script_path], stdout=f, stderr=f)
                # Destroy the load script
                os.remove(oldload_script_path)
                log.debug('Loaded.')
            else:
                log.debug(
                    'Have to import the lorem ipsum dataset from the text file and create the SQLite dump file.'
                )
                # Populate the database from the lorem ipsum text file and dump it
                add_loremipsum_to_db(loremipsum_path, via_request=via_request)
                # Write the DB dump shell script
                shell_script = '#!/bin/sh\nsqlite3 %s ".dump" | grep -v "^CREATE" > %s' % (
                    dbpath, sqlite_dump_path)
                with open(olddump_script_path, 'w') as f:
                    f.write(shell_script)
                os.chmod(olddump_script_path, 0744)
                # Dump the DB
                with open(os.devnull, 'w') as f:
                    call([olddump_script_path], stdout=f, stderr=f)
                # Destroy the dump script
                os.remove(olddump_script_path)
                log.debug('Imported and dumped.')
        forms = h.get_forms()
        log.debug(
            'Lorem ipsum data loaded.  There are now %d forms in the db.' %
            len(forms))

        # Restrict one sentential form in the db.
        restricted_tag = h.generate_restricted_tag()
        Session.add(restricted_tag)
        Session.commit()
        a_form = Session.query(model.Form).\
            filter(model.Form.syntactic_category.\
                has(model.SyntacticCategory.name==u'S')).first()
        a_form_id = a_form.id
        a_form.tags.append(restricted_tag)
        Session.commit()
        restricted_form = Session.query(model.Form).\
            filter(model.Form.tags.any(model.Tag.name==u'restricted')).first()
        assert a_form_id == restricted_form.id
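The ``create_model`` helper in the test above expects each line of the lorem ipsum files to be a tab-separated record: a form record carries the orthographic transcription, morpheme break, morpheme gloss, translation, syntactic category name and, optionally, a bracket-notation parse, while a syntactic category record carries only a name and a type. A minimal sketch of how a form record is unpacked, using invented field values (the actual data files are not reproduced in these examples):

# Hypothetical tab-separated form record of the kind create_model expects;
# the field values are invented for illustration only.
line = u'le chien dort\tle chien dor-t\tthe dog sleep-3SG\tThe dog sleeps.\tS\t(S (NP-SBJ (DT le) (N chien)) (VP (V dort)))'
ol, mb, mg, ml, sc, sx = filter(None, unicode(line).split('\t'))
# ol -> transcription, mb -> morpheme_break, mg -> morpheme_gloss,
# ml -> the translation's transcription, sc -> syntactic category name,
# sx -> the phrase-structure tree stored in the form's ``syntax`` attribute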
Example #10
    def test_writetofile_all_sentences(self):
        """Tests file writing/retrieval of a corpus containing all sentences.

        That is, that ``PUT /corpora/id/writetofile`` and
        ``GET /corpora/id/servefile`` both work with a corpus defined by a form
        search model that returns all sentences.

        """

        restricted_form_id = Session.query(model.Form).filter(
            model.Form.tags.any(model.Tag.name == u'restricted')).first().id
        tgrep2_installed = h.command_line_program_installed('tgrep2')

        # Create a form search model that retrieves all sentences
        query = {'filter': ['Form', 'syntactic_category', 'name', '=', 'S']}
        params = json.dumps({
            'name': u'Get all sentences',
            'description': u'Query to return all sentences in the database.',
            'search': query
        })
        response = self.app.post(url('formsearches'), params,
                                 self.json_headers, self.extra_environ_admin)
        resp = json.loads(response.body)
        form_search_id = resp['id']

        # Perform the search to get the resulting forms.
        params = json.dumps({
            'query': query,
            'paginator': {
                'page': 1,
                'items_per_page': 1
            }
        })
        response = self.app.post(url('/forms/search'), params,
                                 self.json_headers, self.extra_environ_admin)
        resp = json.loads(response.body)
        sentence_count = resp['paginator']['count']

        # Generate some valid corpus creation input parameters.
        params = self.corpus_create_params.copy()
        params.update({
            'name': u'Corpus of sentences',
            'description': u'No ordering, no duplicates.',
            'form_search': form_search_id
        })
        params = json.dumps(params)

        # Create the corpus
        #assert os.listdir(self.corpora_path) == []
        original_corpus_count = Session.query(Corpus).count()
        response = self.app.post(url('corpora'), params, self.json_headers,
                                 self.extra_environ_admin)
        resp = json.loads(response.body)
        corpus_id = resp['id']
        new_corpus_count = Session.query(Corpus).count()
        corpus = Session.query(Corpus).get(corpus_id)
        corpus_dir = os.path.join(self.corpora_path, 'corpus_%d' % corpus_id)
        corpus_dir_contents = os.listdir(corpus_dir)
        assert new_corpus_count == original_corpus_count + 1
        assert resp['name'] == u'Corpus of sentences'
        assert resp['description'] == u'No ordering, no duplicates.'
        assert corpus_dir_contents == []
        assert response.content_type == 'application/json'
        assert resp['content'] == u''
        assert len(corpus.forms) == sentence_count
        assert resp['form_search']['id'] == form_search_id

        # Try to TGrep2-search the corpus without first writing it to file
        # and expect to fail.
        tgrep2pattern = json.dumps({'tgrep2pattern': u'S < NP-SBJ'})
        if tgrep2_installed:
            # Expect failure because the corpus has not yet been written to
            # file as a treebank.
            response = self.app.request(url(controller='corpora',
                                            action='tgrep2',
                                            id=corpus_id),
                                        method='SEARCH',
                                        body=tgrep2pattern,
                                        headers=self.json_headers,
                                        environ=self.extra_environ_admin,
                                        status=400)
            tgrep2resp = json.loads(response.body)
            assert tgrep2resp['error'] == \
                'Corpus %d has not been written to file as a treebank.' % corpus_id

        # Write the corpus to file
        sleep(1)
        params = json.dumps({'format': 'treebank'})
        response = self.app.put(url('/corpora/%d/writetofile' % corpus_id),
                                params,
                                headers=self.json_headers,
                                extra_environ=self.extra_environ_admin)
        resp2 = json.loads(response.body)
        corpus_dir_contents = os.listdir(corpus_dir)
        corpus_tbk_path = os.path.join(corpus_dir, 'corpus_%d.tbk' % corpus_id)
        corpus_tbk_mod_time = h.get_file_modification_time(corpus_tbk_path)
        corpus_tbk_gzipped_path = '%s.gz' % corpus_tbk_path
        corpus_tbk_file_length = h.get_file_length(corpus_tbk_path)
        corpus_tbk_t2c_path = os.path.join(corpus_dir,
                                           'corpus_%d.tbk.t2c' % corpus_id)
        corpus_file_id = resp2['files'][0]['id']
        assert resp['id'] == resp2['id']
        assert resp['name'] == resp2['name']
        assert resp2['datetime_modified'] > resp['datetime_modified']
        assert os.path.exists(corpus_tbk_path)
        if tgrep2_installed:
            assert os.path.exists(corpus_tbk_t2c_path)
        else:
            assert not os.path.exists(corpus_tbk_t2c_path)
        assert os.path.exists(corpus_tbk_gzipped_path)
        assert get_file_size(corpus_tbk_path) > get_file_size(
            corpus_tbk_gzipped_path)
        assert sentence_count == corpus_tbk_file_length

        # Retrieve the corpus file directly from the filesystem.
        with open(corpus_tbk_path, 'rb') as corpus_file_object:
            corpus_file_content = corpus_file_object.read()

        # Attempt to retrieve the gzipped corpus file via request as a restricted
        # user and expect to fail.  This is because there is one restricted
        # sentential form in the db, cf. the ``initialize`` "test".
        response = self.app.get(url('/corpora/%d/servefile/%d' %
                                    (corpus_id, corpus_file_id)),
                                params,
                                status=403,
                                headers=self.json_headers,
                                extra_environ=self.extra_environ_contrib)
        resp = json.loads(response.body)
        assert resp == h.unauthorized_msg

        # Retrieve the gzipped corpus file via request.
        response = self.app.get(url('/corpora/%d/servefile/%d' %
                                    (corpus_id, corpus_file_id)),
                                params,
                                headers=self.json_headers,
                                extra_environ=self.extra_environ_admin)
        unzipped_corpus_file_content = decompress_gzip_string(response.body)
        assert unzipped_corpus_file_content == corpus_file_content
        assert response.content_type == u'application/x-gzip'

        # Now update the corpus by changing the form search, re-write-to-file
        # and make sure everything works.

        # Create a form search model that retrieves all sentences with even-numbered
        # ids and the restricted form.
        query = {
            'filter': [
                'and', [
                    ['Form', 'syntactic_category', 'name', '=', 'S'],
                    ['or', [
                        ['Form', 'id', '=', restricted_form_id],
                        ['Form', 'id', 'regex', '[02468]$']
                    ]]
                ]
            ]
        }
        params = json.dumps({
            'name': u'Get even-numbered or restricted sentences',
            'description': u'Query to return all sentences in the database '
                           u'that have even-numbered ids or are restricted.',
            'search': query
        })
        response = self.app.post(url('formsearches'), params,
                                 self.json_headers, self.extra_environ_admin)
        resp = json.loads(response.body)
        form_search_id = resp['id']

        # Perform the search to get the resulting forms.
        params = json.dumps({
            'query': query,
            'paginator': {
                'page': 1,
                'items_per_page': 1
            }
        })
        response = self.app.post(url('/forms/search'), params,
                                 self.json_headers, self.extra_environ_admin)
        resp = json.loads(response.body)
        sentence_count = resp['paginator']['count']

        # Update the above-created corpus.
        params = self.corpus_create_params.copy()
        params.update({
            'name': u'Corpus of even-numbered sentences',
            'description': u'No ordering, no duplicates.',
            'form_search': form_search_id
        })
        params = json.dumps(params)
        original_corpus_count = Session.query(Corpus).count()
        response = self.app.put(url('corpus', id=corpus_id), params,
                                self.json_headers, self.extra_environ_admin)
        resp = json.loads(response.body)
        new_corpus_count = Session.query(Corpus).count()
        corpus = Session.query(Corpus).get(corpus_id)
        corpus_dir = os.path.join(self.corpora_path, 'corpus_%d' % corpus_id)
        corpus_dir_contents = os.listdir(corpus_dir)
        assert new_corpus_count == original_corpus_count
        assert resp['name'] == u'Corpus of even-numbered sentences'
        assert resp['description'] == u'No ordering, no duplicates.'
        # A corpus file from the previous write is already in the directory.
        assert corpus_dir_contents != []
        assert response.content_type == 'application/json'
        assert resp['content'] == u''
        assert len(corpus.forms) == sentence_count
        assert resp['form_search']['id'] == form_search_id

        # Write the corpus to file
        sleep(1)
        params = json.dumps({'format': 'treebank'})
        response = self.app.put(url('/corpora/%d/writetofile' % corpus_id),
                                params,
                                headers=self.json_headers,
                                extra_environ=self.extra_environ_admin)
        # The response is a JSON representation of the corpus.
        resp2 = json.loads(response.body)
        corpus_dir_contents = os.listdir(corpus_dir)
        corpus_tbk_path = os.path.join(corpus_dir, 'corpus_%d.tbk' % corpus_id)
        old_corpus_tbk_mod_time = corpus_tbk_mod_time
        corpus_tbk_mod_time = h.get_file_modification_time(corpus_tbk_path)
        corpus_tbk_gzipped_path = '%s.gz' % corpus_tbk_path
        # Number of lines in the treebank file.
        corpus_tbk_file_length = h.get_file_length(corpus_tbk_path)
        corpus_tbk_t2c_path = os.path.join(corpus_dir,
                                           'corpus_%d.tbk.t2c' % corpus_id)
        corpus_file_id = resp2['files'][0]['id']
        assert old_corpus_tbk_mod_time < corpus_tbk_mod_time
        assert len(resp2['files']) == 1
        assert resp['id'] == resp2['id']
        assert resp['name'] == resp2['name']
        assert resp2['datetime_modified'] > resp['datetime_modified']
        assert os.path.exists(corpus_tbk_path)
        assert os.path.exists(corpus_tbk_gzipped_path)
        if tgrep2_installed:
            assert os.path.exists(corpus_tbk_t2c_path)
        else:
            assert not os.path.exists(corpus_tbk_t2c_path)
        assert get_file_size(corpus_tbk_path) > get_file_size(
            corpus_tbk_gzipped_path)
        assert sentence_count == corpus_tbk_file_length

        # Retrieve the corpus file directly from the filesystem.
        with open(corpus_tbk_path, 'rb') as corpus_file_object:
            corpus_file_content = corpus_file_object.read()

        # Attempt to retrieve the gzipped corpus file via request as a restricted
        # user and expect to fail.  This is because the one restricted sentential
        # form in the db is in the corpus.
        response = self.app.get(url('/corpora/%d/servefile/%d' %
                                    (corpus_id, corpus_file_id)),
                                params,
                                status=403,
                                headers=self.json_headers,
                                extra_environ=self.extra_environ_contrib)
        resp = json.loads(response.body)
        assert resp == h.unauthorized_msg

        # Retrieve the gzipped corpus file via request.
        response = self.app.get(url('/corpora/%d/servefile/%d' %
                                    (corpus_id, corpus_file_id)),
                                params,
                                headers=self.json_headers,
                                extra_environ=self.extra_environ_admin)
        unzipped_corpus_file_content = decompress_gzip_string(response.body)
        assert unzipped_corpus_file_content == corpus_file_content

        # Write the corpus to file again without any changes and expect a vacuous recreation
        sleep(1)
        params = json.dumps({'format': 'treebank'})
        response = self.app.put(url('/corpora/%d/writetofile' % corpus_id),
                                params,
                                headers=self.json_headers,
                                extra_environ=self.extra_environ_admin)
        old_resp2 = resp2
        # The response is a JSON representation of the corpus.
        resp2 = json.loads(response.body)
        corpus_tbk_path = os.path.join(corpus_dir, 'corpus_%d.tbk' % corpus_id)
        old_corpus_tbk_mod_time = corpus_tbk_mod_time
        corpus_tbk_mod_time = h.get_file_modification_time(corpus_tbk_path)
        assert old_corpus_tbk_mod_time < corpus_tbk_mod_time
        assert len(resp2['files']) == 1
        assert resp2['datetime_modified'] > old_resp2['datetime_modified']
        assert os.path.exists(corpus_tbk_path)

        # TGrep2-search the corpus-as-treebank.  The SEARCH request body may
        # also include optional parameters of the following form:
        # {'order_by': {'order_by_model': '', 'order_by_attribute': '', 'order_by_direction': ''}}
        # {'paginator': {'page': 0, 'items_per_page': 0}}
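
        # TGrep2 link operators used in the patterns below (a rough gloss;
        # consult the TGrep2 documentation for authoritative definitions):
        #   A < B   -- A immediately dominates B
        #   A << B  -- A dominates B
        #   A . B   -- A immediately precedes B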

        tgrep2pattern = u'S < NP-SBJ'
        query = {
            'paginator': {
                'page': 1,
                'items_per_page': 10
            },
            'tgrep2pattern': tgrep2pattern
        }
        json_query = json.dumps(query)
        if not tgrep2_installed:
            response = self.app.request(url(controller='corpora',
                                            action='tgrep2',
                                            id=corpus_id),
                                        method='SEARCH',
                                        body=json_query,
                                        headers=self.json_headers,
                                        environ=self.extra_environ_admin,
                                        status=400)
            resp = json.loads(response.body)
            assert resp["error"] == "TGrep2 is not installed."
        else:
            # TGrep2-search the corpus-as-treebank
            response = self.app.request(url(controller='corpora',
                                            action='tgrep2',
                                            id=corpus_id),
                                        method='SEARCH',
                                        body=json_query,
                                        headers=self.json_headers,
                                        environ=self.extra_environ_admin)
            resp = json.loads(response.body)
            for f in resp['items']:
                assert '(S ' in f['syntax'] and '(NP-SBJ ' in f['syntax']

            # A slightly more complex TGrep2 search
            tgrep2pattern = u'S < NP-SBJ << DT'
            query['tgrep2pattern'] = tgrep2pattern
            json_query = json.dumps(query)
            response = self.app.request(url(controller='corpora',
                                            action='tgrep2',
                                            id=corpus_id),
                                        method='SEARCH',
                                        body=json_query,
                                        headers=self.json_headers,
                                        environ=self.extra_environ_admin)
            resp = json.loads(response.body)
            for f in resp['items']:
                assert ('(S ' in f['syntax'] and '(NP-SBJ ' in f['syntax']
                        and '(DT ' in f['syntax'])

            # Another TGrep2 search
            tgrep2pattern = u'NP-SBJ < DT . VP'
            query['tgrep2pattern'] = tgrep2pattern
            json_query = json.dumps(query)
            response = self.app.request(url(controller='corpora',
                                            action='tgrep2',
                                            id=corpus_id),
                                        method='SEARCH',
                                        body=json_query,
                                        headers=self.json_headers,
                                        environ=self.extra_environ_admin)
            resp = json.loads(response.body)
            match_count = resp['paginator']['count']
            for f in resp['items']:
                assert ('(NP-SBJ ' in f['syntax'] and '(DT ' in f['syntax']
                        and '(VP ' in f['syntax'])

            # Failed tgrep2 search with invalid corpus id.
            response = self.app.request(url(controller='corpora',
                                            action='tgrep2',
                                            id=123456789),
                                        method='SEARCH',
                                        body=json_query,
                                        headers=self.json_headers,
                                        environ=self.extra_environ_admin,
                                        status=404)
            resp = json.loads(response.body)
            assert resp['error'] == u'There is no corpus with id 123456789'

            # Restricted user will not get all of the results.
            response = self.app.request(url(controller='corpora',
                                            action='tgrep2',
                                            id=corpus_id),
                                        method='SEARCH',
                                        body=json_query,
                                        headers=self.json_headers,
                                        environ=self.extra_environ_view)
            resp = json.loads(response.body)
            restricted_match_count = resp['paginator']['count']
            assert isinstance(restricted_match_count, int)
            assert restricted_match_count < match_count

            # Failed TGrep2 search: bad JSON in request body
            json_query = json_query[:-1]
            response = self.app.request(url(controller='corpora',
                                            action='tgrep2',
                                            id=corpus_id),
                                        method='SEARCH',
                                        body=json_query,
                                        headers=self.json_headers,
                                        environ=self.extra_environ_admin,
                                        status=400)
            resp = json.loads(response.body)
            assert resp == h.JSONDecodeErrorResponse

            # Failed TGrep2 search: malformed params
            tgrep2pattern = json.dumps({'TGrep2pattern': u'NP-SBJ < DT . VP'})
            response = self.app.request(url(controller='corpora',
                                            action='tgrep2',
                                            id=corpus_id),
                                        method='SEARCH',
                                        body=tgrep2pattern,
                                        headers=self.json_headers,
                                        environ=self.extra_environ_admin,
                                        status=400)
            resp = json.loads(response.body)
            assert resp['errors']['tgrep2pattern'] == \
                    "A tgrep2pattern attribute must be supplied and must have a unicode/string value"

            # Empty string TGrep2 pattern results in no forms being returned.
            tgrep2pattern = json.dumps({'tgrep2pattern': u''})
            response = self.app.request(url(controller='corpora',
                                            action='tgrep2',
                                            id=corpus_id),
                                        method='SEARCH',
                                        body=tgrep2pattern,
                                        headers=self.json_headers,
                                        environ=self.extra_environ_admin)
            resp = json.loads(response.body)
            assert resp == []
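``decompress_gzip_string`` is a test helper whose definition is not shown in these examples. A minimal sketch of what such a helper might look like in Python 2, assuming the response body is a complete gzip stream:

import gzip
from StringIO import StringIO

def decompress_gzip_string(gzipped_string):
    """Return the decompressed content of a gzip-compressed byte string."""
    # Wrap the in-memory bytes in a file-like object so GzipFile can read it.
    gzip_file = gzip.GzipFile(fileobj=StringIO(gzipped_string), mode='rb')
    try:
        return gzip_file.read()
    finally:
        gzip_file.close()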