Example #1
0
def test_truncate(space_path, space, tmpdir, dispatcher):
    """Truncating a space to 40 columns renumbers the surviving labels.

    The original space must keep its column ids; the truncated copy gets
    dense, order-preserving ids starting from 0.
    """
    output_path = str(tmpdir.join('truncated.h5'))

    command = (
        'space truncate '
        '-s {space_path} '
        '-o {output} '
        '--nvaa '
        '--tagset bnc '
        '--size 40'
        ''.format(space_path=space_path, output=output_path)
    )
    dispatcher.dispatch(command.split())

    truncated = read_space_from_file(output_path)

    # The source space is untouched by truncation.
    original_ids = {
        ('be', 'VERB'): 3,
        ('not', 'ADV'): 11,
        ('do', 'VERB'): 16,
        ('right', 'ADV'): 19,
        ('first', 'ADJ'): 28,
        ('have', 'VERB'): 61,
    }
    for (stem, tag), expected_id in original_ids.items():
        assert space.column_labels.loc[stem, tag]['id'] == expected_id

    # The truncated space reassigns compact ids in the original order.
    truncated_ids = {
        ('be', 'VERB'): 0,
        ('not', 'ADV'): 3,
        ('do', 'VERB'): 6,
        ('right', 'ADV'): 8,
        ('first', 'ADJ'): 14,
        ('have', 'VERB'): 38,
    }
    for (stem, tag), expected_id in truncated_ids.items():
        assert truncated.column_labels.loc[stem, tag]['id'] == expected_id
Example #2
0
def test_truncate(space_path, space, tmpdir, dispatcher):
    """Truncation leaves the source space intact and renumbers column ids."""
    output = str(tmpdir.join('truncated.h5'))

    command = (
        'space truncate -s {space_path} -o {output} '
        '--nvaa --tagset bnc --size 40'
    ).format(space_path=space_path, output=output)
    dispatcher.dispatch(command.split())

    truncated = read_space_from_file(output)

    # Ids in the original space keep their old values.
    assert space.column_labels.loc['be', 'VERB']['id'] == 3
    assert space.column_labels.loc['not', 'ADV']['id'] == 11
    assert space.column_labels.loc['do', 'VERB']['id'] == 16
    assert space.column_labels.loc['right', 'ADV']['id'] == 19
    assert space.column_labels.loc['first', 'ADJ']['id'] == 28
    assert space.column_labels.loc['have', 'VERB']['id'] == 61

    # The truncated space gets fresh, compact ids.
    assert truncated.column_labels.loc['be', 'VERB']['id'] == 0
    assert truncated.column_labels.loc['not', 'ADV']['id'] == 3
    assert truncated.column_labels.loc['do', 'VERB']['id'] == 6
    assert truncated.column_labels.loc['right', 'ADV']['id'] == 8
    assert truncated.column_labels.loc['first', 'ADJ']['id'] == 14
    assert truncated.column_labels.loc['have', 'VERB']['id'] == 38
Example #3
0
def verb_space_builder(channel):
    """Execnet worker: accumulate a count-weighted tensor per verb.

    Receives pickled ``((verb_stem, verb_tag), group)`` tasks over
    *channel*.  For every subject/object argument pair in the group the
    Kronecker product of the two argument vectors, weighted by the pair's
    count, is added to the verb's running tensor.  On the
    ``('message', 'terminate')`` item the accumulated result dict is
    pickled and sent back over the channel.
    """
    import pickle
    from scipy import sparse

    from fowler.corpora.execnet import logger, initialize_channel
    from fowler.corpora.models import read_space_from_file

    _, data = initialize_channel(channel)
    space = read_space_from_file(data['space_file'])

    # Maps (verb_stem, verb_tag) -> accumulated sparse tensor.
    result = {}
    for item in channel:

        if item == ('message', 'terminate'):
            # Ship the accumulated tensors back before shutting down.
            if result:
                channel.send(('result', pickle.dumps(result)))
            break

        type_, data = item
        if type_ == 'task':
            (verb_stem, verb_tag), group = pickle.loads(data)

            logger.debug(
                'Processing verb %s_%s with %s argument pairs.',
                verb_stem,
                verb_tag,
                len(group),
            )

            for subj_stem, subj_tag, obj_stem, obj_tag, count in group[['subj_stem', 'subj_tag', 'obj_stem', 'obj_tag', 'count']].values:

                try:
                    subject_vector = space[subj_stem, subj_tag]
                    object_vector = space[obj_stem, obj_tag]
                except KeyError:
                    # An argument vector may be missing from the space;
                    # skip the pair (too frequent to log each one).
                    continue

                if not subject_vector.size:
                    logger.warning('Subject %s %s is empty!', subj_stem, subj_tag)
                    continue

                if not object_vector.size:
                    logger.warning('Object %s %s is empty!', obj_stem, obj_tag)
                    continue

                subject_object_tensor = sparse.kron(subject_vector, object_vector)
                t = subject_object_tensor * count

                # Bug fix: result keys are (stem, tag) tuples, but the old
                # membership test checked the bare verb_stem, which is never
                # a key -- so every pair *overwrote* the accumulated tensor
                # instead of adding to it.
                if (verb_stem, verb_tag) not in result:
                    result[verb_stem, verb_tag] = t
                else:
                    result[verb_stem, verb_tag] += t

        channel.send(('message', 'send_next'))
Example #4
0
def test_line_normalize(space_path, tmpdir, dispatcher):
    """After line normalization no matrix entry may exceed 1."""
    output = str(tmpdir.join('line-normalized.h5'))

    command = (
        'space line-normalize '
        '-s {space_path} '
        '-o {output} '
        ''.format(space_path=space_path, output=output)
    )
    dispatcher.dispatch(command.split())

    normalized_space = read_space_from_file(output)

    # Collect every entry greater than 1; there must be none.
    violations = normalized_space.matrix > 1
    assert len(np.argwhere(violations)) == 0
Example #5
0
def test_line_normalize(space_path, tmpdir, dispatcher):
    """Line-normalized matrices contain no values greater than 1."""
    target = str(tmpdir.join('line-normalized.h5'))

    argv = (
        'space line-normalize -s {space_path} -o {output} '
        ''.format(space_path=space_path, output=target)
        .split()
    )
    dispatcher.dispatch(argv)

    normalized_space = read_space_from_file(target)

    # No cell of the normalized matrix may be above 1.
    too_large = np.argwhere(normalized_space.matrix > 1)
    assert len(too_large) == 0
Example #6
0
def ittf(
    space,
    output,
    raw_space=('', '', 'Space with feature co-occurrence counts.'),
    times=('', ('n', 'logn'), 'Multiply the resulted values by n or logn.'),
):
    """Weight a space by inverse total term frequency (ITTF).

    :param space: the space to be weighted (supplied by the dispatcher).
    :param output: path the weighted space is written to.
    :param raw_space: path to a space with feature co-occurrence counts.
    :param times: either ``'n'`` or ``'logn'`` -- what the ITTF weights
        are multiplied by.
    :raises ValueError: if ``times`` is neither ``'n'`` nor ``'logn'``.
    """
    raw_space = read_space_from_file(raw_space)

    # Per-feature cardinality: the number of non-zero entries in each
    # feature's row of the raw co-occurrence space.
    feature_cardinality = np.array(
        [v.nnz for v in raw_space.get_target_rows(*space.column_labels.index)]
    )

    n = space.matrix.todense()

    # log(cardinality) - log(n + 1); the +1 avoids log(0) for zero counts.
    # Renamed from `ittf` so the local does not shadow the function name.
    weights = np.log(feature_cardinality) - np.log(n + 1)

    if times == 'n':
        matrix = np.multiply(n, weights)
    elif times == 'logn':
        matrix = np.multiply(np.log(n + 1), weights)
    else:
        # Previously an unexpected value fell through to a NameError on
        # `matrix`; fail with a clear message instead.
        raise ValueError("times must be 'n' or 'logn', got {!r}".format(times))

    Space(matrix, space.row_labels, space.column_labels).write(output)
Example #7
0
def ittf(
        space,
        output,
        raw_space=('', '', 'Space with feature co-occurrence counts.'),
        times=('', ('n', 'logn'),
               'Multiply the resulted values by n or logn.'),
):
    """Weight a space by inverse total term frequency (ITTF).

    :param space: the space to be weighted (supplied by the dispatcher).
    :param output: path the weighted space is written to.
    :param raw_space: path to a space with feature co-occurrence counts.
    :param times: either ``'n'`` or ``'logn'`` -- what the ITTF weights
        are multiplied by.
    :raises ValueError: if ``times`` is neither ``'n'`` nor ``'logn'``.
    """
    raw_space = read_space_from_file(raw_space)

    # How many contexts each feature co-occurs with (non-zeros per row).
    feature_cardinality = np.array(
        [v.nnz for v in raw_space.get_target_rows(*space.column_labels.index)])

    n = space.matrix.todense()

    # log(cardinality) - log(n + 1); +1 guards against log(0).
    # Renamed from `ittf` to avoid shadowing the enclosing function.
    weights = np.log(feature_cardinality) - np.log(n + 1)

    if times == 'n':
        matrix = np.multiply(n, weights)
    elif times == 'logn':
        matrix = np.multiply(np.log(n + 1), weights)
    else:
        # The old code silently fell through and crashed later with a
        # NameError on `matrix`; raise a descriptive error instead.
        raise ValueError("times must be 'n' or 'logn', got {!r}".format(times))

    Space(matrix, space.row_labels, space.column_labels).write(output)
Example #8
0
 def space(self):
     """Read and return the space named by the 'space' keyword argument.

     Raises ValueError when the space file can not be read.
     """
     try:
         space_file = self.kwargs['space']
         return read_space_from_file(space_file)
     except AttributeError:
         raise ValueError('Could not read the space file.')
Example #9
0
 def verb_space(self):
     """Return the verb space, or None when no verb space file is given."""
     verb_space_file = self.kwargs['verb_space']
     if verb_space_file:
         return read_space_from_file(verb_space_file)
Example #10
0
def space(space_path):
    """Return the space stored at ``space_path``."""
    loaded_space = read_space_from_file(space_path)
    return loaded_space
Example #11
0
 def space(self):
     """Return the space read from ``self.matrix``.

     .. deprecated:: SpaceMixin should be used instead, and
        ``global__matrix`` should be renamed to ``global__space``.
     """
     space_file = self.matrix
     return read_space_from_file(space_file)
Example #12
0
 def space(self):
     """Load the space given by the 'space' keyword argument.

     A missing or unreadable source surfaces as ValueError.
     """
     try:
         return read_space_from_file(self.kwargs['space'])
     except AttributeError:
         # Translate the low-level failure into the documented error.
         raise ValueError('Could not read the space file.')
Example #13
0
 def verb_space(self):
     """Load the verb space if a file was supplied; otherwise return None."""
     if not self.kwargs['verb_space']:
         return None
     return read_space_from_file(self.kwargs['verb_space'])
Example #14
0
 def space(self):
     """Return the space read from the "space" keyword argument."""
     space_file = self.kwargs["space"]
     return read_space_from_file(space_file)
Example #15
0
 def space(self):
     """Return the space read from the 'space' keyword argument."""
     space_file = self.kwargs['space']
     return read_space_from_file(space_file)
Example #16
0
def conditional_space(conditional_space_path):
    """Return the conditional space stored at ``conditional_space_path``."""
    loaded_space = read_space_from_file(conditional_space_path)
    return loaded_space
Example #17
0
 def space(self):
     """Return the space read from ``self.matrix``.

     .. deprecated:: use SpaceMixin instead, and rename
        ``global__matrix`` to ``global__space``.
     """
     matrix_file = self.matrix
     return read_space_from_file(matrix_file)
Example #18
0
def verb_space_builder(channel):
    """Execnet worker: accumulate a Kronecker-product tensor per verb.

    Receives pickled ``((verb_stem, verb_tag), group)`` tasks over
    *channel*.  For every subject/object argument pair in the group the
    Kronecker product of the two argument vectors is added to the verb's
    running tensor.  On the ``('message', 'terminate')`` item the
    accumulated result dict is pickled and sent back over the channel.
    """
    import pickle

    from scipy import sparse

    from fowler.corpora.execnet import logger, initialize_channel
    from fowler.corpora.models import read_space_from_file

    _, data = initialize_channel(channel)
    space = read_space_from_file(data['space_file'])

    # Maps (verb_stem, verb_tag) -> accumulated sparse tensor.
    result = {}
    for item in channel:

        if item == ('message', 'terminate'):
            # Ship the accumulated tensors back before shutting down.
            if result:
                channel.send(('result', pickle.dumps(result)))
            break

        type_, data = item
        if type_ == 'task':
            # for (subj_stem, subj_tag, obj_stem, obj_tag), group in pickle.loads(data):

            # (subj_stem, subj_tag, obj_stem, obj_tag), group = pickle.loads(data)
            (verb_stem, verb_tag), group = pickle.loads(data)

            logger.debug(
                'Processing verb %s_%s with %s argument pairs.',
                verb_stem,
                verb_tag,
                len(group),
                )

            for subj_stem, subj_tag, obj_stem, obj_tag, count in group[['subj_stem', 'subj_tag', 'obj_stem', 'obj_tag', 'count']].values:
                # XXX consider only the triples for which `count > 1000`.

                try:
                    subject_vector = space[subj_stem, subj_tag]
                    object_vector = space[obj_stem, obj_tag]
                except KeyError:
                    # Don't log the exceptions as there are many of them!
                    continue

                if not subject_vector.size:
                    # logger.warning('Subject %s %s is empty!', subj_stem, subj_tag)
                    continue

                if not object_vector.size:
                    # logger.warning('Object %s %s is empty!', obj_stem, obj_tag)
                    continue

                subject_object_tensor = sparse.kron(subject_vector, object_vector)

                # XXX multiply by the count?
                t = subject_object_tensor

                # Accumulate under the (stem, tag) tuple key.
                if (verb_stem, verb_tag) not in result:
                    result[verb_stem, verb_tag] = t
                else:
                    result[verb_stem, verb_tag] += t

        # Ask the master for the next chunk of work.
        channel.send(('message', 'send_next'))
Example #19
0
def space(space_path):
    """Return the space read from ``space_path``."""
    result = read_space_from_file(space_path)
    return result
Example #20
0
 def space(self):
     """Load and return the space named by the 'space' keyword argument."""
     path = self.kwargs['space']
     return read_space_from_file(path)
Example #21
0
def conditional_space(conditional_space_path):
    """Return the conditional space read from ``conditional_space_path``."""
    result = read_space_from_file(conditional_space_path)
    return result