def test_truncate(space_path, space, tmpdir, dispatcher):
    """Truncating a space to 40 columns keeps the columns, renumbered densely."""
    path = str(tmpdir.join('truncated.h5'))

    command = (
        'space truncate '
        '-s {space_path} '
        '-o {output} '
        '--nvaa '
        '--tagset bnc '
        '--size 40'
    ).format(space_path=space_path, output=path)
    dispatcher.dispatch(command.split())

    truncated = read_space_from_file(path)

    # (token, tag) -> (id in the original space, id after truncation).
    expected_ids = {
        ('be', 'VERB'): (3, 0),
        ('not', 'ADV'): (11, 3),
        ('do', 'VERB'): (16, 6),
        ('right', 'ADV'): (19, 8),
        ('first', 'ADJ'): (28, 14),
        ('have', 'VERB'): (61, 38),
    }

    for (token, tag), (original_id, truncated_id) in expected_ids.items():
        assert space.column_labels.loc[token, tag]['id'] == original_id
        assert truncated.column_labels.loc[token, tag]['id'] == truncated_id
def test_truncate(space_path, space, tmpdir, dispatcher):
    """Check column ids before and after ``space truncate --size 40``."""
    output_path = str(tmpdir.join('truncated.h5'))

    arguments = (
        'space truncate '
        '-s {space_path} '
        '-o {output} '
        '--nvaa '
        '--tagset bnc '
        '--size 40'
        ''.format(space_path=space_path, output=output_path)
    )
    dispatcher.dispatch(arguments.split())

    truncated = read_space_from_file(output_path)

    # Ids in the full space are sparse…
    assert space.column_labels.loc['be', 'VERB']['id'] == 3
    assert space.column_labels.loc['not', 'ADV']['id'] == 11
    assert space.column_labels.loc['do', 'VERB']['id'] == 16
    assert space.column_labels.loc['right', 'ADV']['id'] == 19
    assert space.column_labels.loc['first', 'ADJ']['id'] == 28
    assert space.column_labels.loc['have', 'VERB']['id'] == 61

    # …while truncation re-numbers the surviving columns contiguously.
    assert truncated.column_labels.loc['be', 'VERB']['id'] == 0
    assert truncated.column_labels.loc['not', 'ADV']['id'] == 3
    assert truncated.column_labels.loc['do', 'VERB']['id'] == 6
    assert truncated.column_labels.loc['right', 'ADV']['id'] == 8
    assert truncated.column_labels.loc['first', 'ADJ']['id'] == 14
    assert truncated.column_labels.loc['have', 'VERB']['id'] == 38
def verb_space_builder(channel):
    """execnet remote: accumulate, per verb, count-weighted Kronecker products
    of its subject and object vectors.

    Protocol: after ``initialize_channel`` provides the space file, each
    ``('task', data)`` item carries a pickled ``((verb_stem, verb_tag), group)``
    pair; a ``('message', 'terminate')`` item flushes the accumulated result
    (pickled dict keyed by ``(verb_stem, verb_tag)``) and stops.
    """
    # Imports live inside the function because execnet executes this body on a
    # remote interpreter where module-level names are not available.
    import pickle

    from scipy import sparse

    from fowler.corpora.execnet import logger, initialize_channel
    from fowler.corpora.models import read_space_from_file

    _, data = initialize_channel(channel)
    space = read_space_from_file(data['space_file'])

    result = {}
    for item in channel:
        if item == ('message', 'terminate'):
            if result:
                channel.send(('result', pickle.dumps(result)))
            break

        type_, data = item
        if type_ == 'task':
            (verb_stem, verb_tag), group = pickle.loads(data)

            logger.debug(
                'Processing verb %s_%s with %s argument pairs.',
                verb_stem,
                verb_tag,
                len(group),
            )

            columns = ['subj_stem', 'subj_tag', 'obj_stem', 'obj_tag', 'count']
            for subj_stem, subj_tag, obj_stem, obj_tag, count in group[columns].values:
                try:
                    subject_vector = space[subj_stem, subj_tag]
                    object_vector = space[obj_stem, obj_tag]
                except KeyError:
                    # Arguments missing from the space are skipped; not logged
                    # because there are many of them.
                    continue

                if not subject_vector.size:
                    logger.warning('Subject %s %s is empty!', subj_stem, subj_tag)
                    continue
                if not object_vector.size:
                    logger.warning('Object %s %s is empty!', obj_stem, obj_tag)
                    continue

                t = sparse.kron(subject_vector, object_vector) * count

                # BUG FIX: the membership test previously used the bare stem
                # (`verb_stem not in result`) while the dict is keyed by
                # (stem, tag) tuples, so the test was always true and each new
                # tensor OVERWROTE the accumulated one instead of adding to it.
                if (verb_stem, verb_tag) not in result:
                    result[verb_stem, verb_tag] = t
                else:
                    result[verb_stem, verb_tag] += t

            channel.send(('message', 'send_next'))
def test_line_normalize(space_path, tmpdir, dispatcher):
    """After line normalization no matrix entry may exceed 1."""
    output = str(tmpdir.join('line-normalized.h5'))

    command = (
        'space line-normalize '
        '-s {space_path} '
        '-o {output} '
    ).format(space_path=space_path, output=output)
    dispatcher.dispatch(command.split())

    normalized_space = read_space_from_file(output)

    violations = normalized_space.matrix > 1
    assert len(np.argwhere(violations)) == 0
def test_line_normalize(space_path, tmpdir, dispatcher):
    """Line-normalized spaces must not contain values greater than 1."""
    result_path = str(tmpdir.join('line-normalized.h5'))

    dispatcher.dispatch(
        'space line-normalize '
        '-s {space_path} '
        '-o {output} '
        ''.format(
            space_path=space_path,
            output=result_path,
        ).split()
    )

    normalized_space = read_space_from_file(result_path)

    # argwhere yields one row per offending cell; expect none.
    offending_cells = np.argwhere(normalized_space.matrix > 1)
    assert len(offending_cells) == 0
def ittf(
    space,
    output,
    raw_space=('', '', 'Space with feature co-occurrence counts.'),
    times=('', ('n', 'logn'), 'Multiply the resulted values by n or logn.'),
):
    """Weight *space* by inverse total term frequency and write it to *output*.

    ``raw_space`` supplies the raw co-occurrence counts from which each
    feature's cardinality (number of non-zero entries) is taken.  The default
    tuples are CLI option descriptors consumed by the dispatcher.
    """
    raw_space = read_space_from_file(raw_space)

    # Non-zero count per feature row in the raw space, aligned with the
    # columns of the weighted space.
    feature_cardinality = np.array(
        [v.nnz for v in raw_space.get_target_rows(*space.column_labels.index)]
    )

    n = space.matrix.todense()
    # Renamed from `ittf` to avoid shadowing this function's own name.
    ittf_weights = np.log(feature_cardinality) - np.log(n + 1)

    if times == 'n':
        matrix = np.multiply(n, ittf_weights)
    elif times == 'logn':
        matrix = np.multiply(np.log(n + 1), ittf_weights)
    else:
        # Previously an unrecognised value fell through both branches and
        # crashed with NameError on the unbound `matrix`; fail with a clear
        # message instead.
        raise ValueError("times must be 'n' or 'logn', got {!r}".format(times))

    Space(matrix, space.row_labels, space.column_labels).write(output)
def ittf(
    space,
    output,
    raw_space=('', '', 'Space with feature co-occurrence counts.'),
    times=('', ('n', 'logn'), 'Multiply the resulted values by n or logn.'),
):
    """Apply inverse total term frequency weighting and store the result.

    The tuple defaults are CLI option descriptors interpreted by the
    command dispatcher.
    """
    raw_space = read_space_from_file(raw_space)

    target_rows = raw_space.get_target_rows(*space.column_labels.index)
    feature_cardinality = np.array([row.nnz for row in target_rows])

    n = space.matrix.todense()
    weights = np.log(feature_cardinality) - np.log(n + 1)

    if times == 'n':
        matrix = np.multiply(n, weights)
    elif times == 'logn':
        matrix = np.multiply(np.log(n + 1), weights)

    Space(matrix, space.row_labels, space.column_labels).write(output)
def space(self):
    """Load the semantic space from the path given by the ``space`` kwarg.

    Raises:
        ValueError: when the space file could not be read.
    """
    try:
        return read_space_from_file(self.kwargs['space'])
    except AttributeError:
        # NOTE(review): a missing 'space' key would raise KeyError, not
        # AttributeError — presumably this guards against a failure inside
        # read_space_from_file or an unexpected self.kwargs object; confirm
        # the intended failure mode.
        raise ValueError('Could not read the space file.')
def verb_space(self):
    """Load the verb space if one was configured, otherwise return None."""
    verb_space_path = self.kwargs['verb_space']
    if not verb_space_path:
        return None
    return read_space_from_file(verb_space_path)
def space(space_path):
    """Fixture: the space read from ``space_path``."""
    loaded = read_space_from_file(space_path)
    return loaded
def space(self):
    """Read the space from the file path stored in ``self.matrix``."""
    # TODO: this is deprecated, SpaceMixin should be used, and
    # global__matrix should be renamed to global__space.
    return read_space_from_file(self.matrix)
def space(self):
    """Load the semantic space from the path given by the ``space`` kwarg."""
    # Consistency: the rest of the file uses single-quoted strings; this was
    # the lone double-quoted key (the key's value is unchanged).
    return read_space_from_file(self.kwargs['space'])
def space(self):
    """Load the semantic space configured via the ``space`` kwarg."""
    space_path = self.kwargs['space']
    return read_space_from_file(space_path)
def conditional_space(conditional_space_path):
    """Fixture: the conditional space read from ``conditional_space_path``."""
    loaded = read_space_from_file(conditional_space_path)
    return loaded
def verb_space_builder(channel):
    """execnet remote: sum, per verb, the Kronecker products of its
    subject and object vectors.

    Protocol: after ``initialize_channel`` supplies the space file, each
    ``('task', data)`` item carries a pickled ``((verb_stem, verb_tag),
    group)`` pair; ``('message', 'terminate')`` flushes the accumulated
    result (a pickled dict keyed by ``(verb_stem, verb_tag)``) and stops.
    """
    # Imports are local because execnet runs this body in a remote
    # interpreter where module-level names are unavailable.
    import pickle

    from scipy import sparse

    from fowler.corpora.execnet import logger, initialize_channel
    from fowler.corpora.models import read_space_from_file

    _, data = initialize_channel(channel)
    space = read_space_from_file(data['space_file'])

    result = {}
    for item in channel:
        if item == ('message', 'terminate'):
            # Flush the accumulated tensors before shutting down.
            if result:
                channel.send(('result', pickle.dumps(result)))
            break

        type_, data = item
        if type_ == 'task':
            # Earlier grouping experiments, kept for reference:
            # for (subj_stem, subj_tag, obj_stem, obj_tag), group in pickle.loads(data):
            # (subj_stem, subj_tag, obj_stem, obj_tag), group = pickle.loads(data)
            (verb_stem, verb_tag), group = pickle.loads(data)

            logger.debug(
                'Processing verb %s_%s with %s argument pairs.',
                verb_stem,
                verb_tag,
                len(group),
            )

            for subj_stem, subj_tag, obj_stem, obj_tag, count in group[['subj_stem', 'subj_tag', 'obj_stem', 'obj_tag', 'count']].values:
                # XXX consider only the triples for which `count > 1000`.
                try:
                    subject_vector = space[subj_stem, subj_tag]
                    object_vector = space[obj_stem, obj_tag]
                except KeyError:
                    # Don't log the exceptions as there are many of them!
                    continue

                if not subject_vector.size:
                    # logger.warning('Subject %s %s is empty!', subj_stem, subj_tag)
                    continue

                if not object_vector.size:
                    # logger.warning('Object %s %s is empty!', obj_stem, obj_tag)
                    continue

                subject_object_tensor = sparse.kron(subject_vector, object_vector)

                # XXX multiply by the count?
                # (`count` is unpacked above but deliberately unused here.)
                t = subject_object_tensor

                if (verb_stem, verb_tag) not in result:
                    result[verb_stem, verb_tag] = t
                else:
                    result[verb_stem, verb_tag] += t

            channel.send(('message', 'send_next'))