def sum_folder(channel):
    """Fold task results into one running pandas object and stream it back.

    The worker reads its configuration from ``initialize_channel(channel)``:
    ``data['instance']`` and ``data['folder_name']`` select the bound callable
    to run for every task, and the optional ``data['kwargs']`` are forwarded
    to it on each call.

    Protocol, as implemented below:

    * ``('task', payload)`` -- call ``folder(payload, **kwargs)``, merge every
      object it yields into the running result (concatenate, group by the
      index levels, sum), then reply ``('message', 'send_next')``.
    * ``('message', 'terminate')`` -- send what is left of the running result
      as ``('result', <pickle>)``, if there is any, and stop.

    Every tenth intermediate result the running result is sorted and its tail
    half is sent to the other side early, so only the head half is kept here.

    All imports are function-local; presumably the function is shipped to a
    remote interpreter by execnet and must be self-contained -- confirm
    before hoisting them.

    NOTE(review): ``sum_folder`` is defined a second time further down in
    this source; if both live in the same module the later one wins.

    :param channel: channel-like object that is iterable and has ``send()``.
    """
    import pickle
    import logging

    from more_itertools import peekable
    import pandas as pd

    from fowler.corpora.execnet import initialize_channel

    _, data = initialize_channel(channel)

    logger = logging.getLogger('execnet.fum_folder')

    kwargs = data.get('kwargs', {})
    instance = data['instance']
    folder_name = data['folder_name']

    folder = getattr(instance, folder_name)

    result = None
    for item in channel:

        if item == ('message', 'terminate'):
            if result is not None:
                logger.debug('Sending the final result, size: %s', len(result))
                channel.send(('result', pickle.dumps(result)))
            break

        type_, data = item
        if type_ == 'task':

            # peekable() makes the lazy iterator truth-testable without
            # losing its first element.
            intermediate_results = peekable(enumerate(folder(data, **kwargs)))

            if intermediate_results:
                if result is None:
                    # The very first intermediate result seeds the accumulator.
                    _, result = next(intermediate_results)

                # TODO: It would be nice to catch any exception here
                # (especially the one that happens inside of the folder() call)
                # and report it to the master.
                # Same applies to the next() call above.
                for i, r in intermediate_results:
                    logger.debug('Iteration: %s, result size: %s', i, len(result))

                    result = pd.concat(
                        [result, r],
                        copy=False,
                    ).groupby(level=result.index.names).sum()

                    if (i % 10) == 9:
                        # NOTE(review): ``.sort()`` was removed from pandas
                        # objects in 0.20; ``sort_values`` is the likely
                        # replacement, but its arguments depend on whether
                        # ``result`` is a Series or a DataFrame -- confirm.
                        result.sort(ascending=False, inplace=True)

                        half = len(result) // 2
                        # With fewer than two rows ``half`` is 0, and
                        # ``head(-0)`` is ``head(0)``: trimming would throw the
                        # whole accumulated result away. Flush only when there
                        # is really something to hand over.
                        if half:
                            logger.debug('Sending a result. Result size: %s', half)
                            channel.send(
                                ('result', pickle.dumps(result.tail(half))))
                            result = result.head(-half)

            channel.send(('message', 'send_next'))
def verb_space_builder(channel):
    """Accumulate count-weighted subject/object Kronecker products per verb.

    The space is loaded once with ``read_space_from_file(data['space_file'])``
    where ``data`` comes from ``initialize_channel(channel)``.

    Protocol, as implemented below:

    * ``('task', payload)`` -- ``payload`` unpickles to
      ``((verb_stem, verb_tag), group)``. For every row of ``group`` (columns
      ``subj_stem``, ``subj_tag``, ``obj_stem``, ``obj_tag``, ``count``) the
      subject and object vectors are looked up in the space, their Kronecker
      product is scaled by ``count`` and added to the entry for the verb.
      Afterwards ``('message', 'send_next')`` is sent.
    * ``('message', 'terminate')`` -- send the accumulated dict as
      ``('result', <pickle>)`` if it is non-empty, and stop.

    Rows whose subject or object is missing from the space are skipped
    silently; rows with an empty vector are skipped with a warning.

    All imports are function-local; presumably the function is shipped to a
    remote interpreter by execnet and must be self-contained -- confirm
    before hoisting them.

    NOTE(review): ``verb_space_builder`` is defined a second time further
    down in this source; if both live in the same module the later one wins.

    :param channel: channel-like object that is iterable and has ``send()``.
    """
    import pickle

    from scipy import sparse

    from fowler.corpora.execnet import logger, initialize_channel
    from fowler.corpora.models import read_space_from_file

    _, data = initialize_channel(channel)

    space = read_space_from_file(data['space_file'])

    result = {}
    for item in channel:

        if item == ('message', 'terminate'):
            if result:
                channel.send(('result', pickle.dumps(result)))
            break

        type_, data = item
        if type_ == 'task':
            # NOTE(review): unpickling is only safe if the sender is trusted.
            (verb_stem, verb_tag), group = pickle.loads(data)
            verb = (verb_stem, verb_tag)

            logger.debug(
                'Processing verb %s_%s with %s argument pairs.',
                verb_stem,
                verb_tag,
                len(group),
            )

            for subj_stem, subj_tag, obj_stem, obj_tag, count in group[['subj_stem', 'subj_tag', 'obj_stem', 'obj_tag', 'count']].values:

                try:
                    subject_vector = space[subj_stem, subj_tag]
                    object_vector = space[obj_stem, obj_tag]
                except KeyError:
                    # Deliberately not logged.
                    continue

                if not subject_vector.size:
                    logger.warning('Subject %s %s is empty!', subj_stem, subj_tag)
                    continue

                if not object_vector.size:
                    logger.warning('Object %s %s is empty!', obj_stem, obj_tag)
                    continue

                subject_object_tensor = sparse.kron(subject_vector, object_vector)
                t = subject_object_tensor * count

                # The dict is keyed by the (stem, tag) pair, so membership has
                # to be tested with the pair as well; testing the bare stem
                # never matches and every row would overwrite the sum so far.
                if verb not in result:
                    result[verb] = t
                else:
                    result[verb] += t

            channel.send(('message', 'send_next'))
def sum_folder(channel):
    """Fold task results into one running pandas object and stream it back.

    Configuration is taken from ``initialize_channel(channel)``: the callable
    run for each task is ``getattr(data['instance'], data['folder_name'])``
    and ``data.get('kwargs', {})`` is passed to it as keyword arguments.

    Protocol, as implemented below:

    * ``('task', payload)`` -- iterate ``folder(payload, **kwargs)`` and merge
      each yielded object into the running result by concatenating, grouping
      by the index levels and summing; then reply
      ``('message', 'send_next')``.
    * ``('message', 'terminate')`` -- send the remaining running result as
      ``('result', <pickle>)`` when there is one, then leave the loop.

    On every tenth intermediate result the running result is sorted and its
    tail half is sent early; only the head half stays in this process.

    Imports are kept inside the function; presumably it is executed in a
    remote interpreter via execnet and has to be self-contained -- confirm
    before moving them to module level.

    NOTE(review): an earlier definition of ``sum_folder`` appears above in
    this source; if both live in the same module this one replaces it.

    :param channel: channel-like object that is iterable and has ``send()``.
    """
    import pickle
    import logging

    from more_itertools import peekable
    import pandas as pd

    from fowler.corpora.execnet import initialize_channel

    _, data = initialize_channel(channel)

    logger = logging.getLogger('execnet.fum_folder')

    kwargs = data.get('kwargs', {})
    instance = data['instance']
    folder_name = data['folder_name']

    folder = getattr(instance, folder_name)

    result = None
    for item in channel:

        if item == ('message', 'terminate'):
            if result is not None:
                logger.debug('Sending the final result, size: %s', len(result))
                channel.send(('result', pickle.dumps(result)))
            break

        type_, data = item
        if type_ == 'task':

            # Wrapping in peekable() lets the emptiness check below look at
            # the iterator without consuming its first element.
            intermediate_results = peekable(enumerate(folder(data, **kwargs)))

            if intermediate_results:
                if result is None:
                    # Nothing accumulated yet: start from the first element.
                    _, result = next(intermediate_results)

                # TODO: It would be nice to catch any exception here
                # (especially the one that happens inside of the folder() call)
                # and report it to the master.
                # Same applies to the next() call above.
                for i, r in intermediate_results:
                    logger.debug('Iteration: %s, result size: %s', i, len(result))

                    result = pd.concat(
                        [result, r],
                        copy=False,
                    ).groupby(level=result.index.names).sum()

                    if (i % 10) == 9:
                        # NOTE(review): pandas removed ``.sort()`` in 0.20;
                        # ``sort_values`` is the likely replacement, but the
                        # right call depends on whether ``result`` is a Series
                        # or a DataFrame -- confirm.
                        result.sort(ascending=False, inplace=True)

                        half = len(result) // 2
                        # ``half`` is 0 for results shorter than two rows, and
                        # ``head(-0)`` equals ``head(0)``, i.e. it would wipe
                        # the accumulator. Skip the early flush in that case.
                        if half:
                            logger.debug('Sending a result. Result size: %s', half)
                            channel.send(('result', pickle.dumps(result.tail(half))))
                            result = result.head(-half)

            channel.send(('message', 'send_next'))
def verb_space_builder(channel):
    """Sum subject/object Kronecker products per verb and send them back.

    The space is read once from ``data['space_file']``, where ``data`` is the
    second value returned by ``initialize_channel(channel)``.

    Each ``('task', payload)`` item unpickles to
    ``((verb_stem, verb_tag), group)``. For every row of ``group`` the subject
    and object vectors are fetched from the space and their Kronecker product
    is added to the tensor stored under ``(verb_stem, verb_tag)``. Rows with
    an argument missing from the space, or with an empty vector, are skipped
    without logging. ``('message', 'send_next')`` is sent after each task.

    On ``('message', 'terminate')`` the accumulated dict is sent as
    ``('result', <pickle>)`` when it is non-empty, and the loop ends.

    :param channel: channel-like object that is iterable and has ``send()``.
    """
    import pickle

    from scipy import sparse

    from fowler.corpora.execnet import logger, initialize_channel
    from fowler.corpora.models import read_space_from_file

    _, data = initialize_channel(channel)
    space = read_space_from_file(data['space_file'])

    columns = ['subj_stem', 'subj_tag', 'obj_stem', 'obj_tag', 'count']
    verb_tensors = {}

    for message in channel:
        if message == ('message', 'terminate'):
            if verb_tensors:
                channel.send(('result', pickle.dumps(verb_tensors)))
            break

        kind, payload = message
        if kind != 'task':
            continue

        (verb_stem, verb_tag), group = pickle.loads(payload)
        verb = (verb_stem, verb_tag)
        logger.debug(
            'Processing verb %s_%s with %s argument pairs.',
            verb_stem,
            verb_tag,
            len(group),
        )

        # XXX consider only the triples for which `count > 1000`.
        for subj_stem, subj_tag, obj_stem, obj_tag, _count in group[columns].values:
            try:
                subject_vector = space[subj_stem, subj_tag]
                object_vector = space[obj_stem, obj_tag]
            except KeyError:
                # Don't log the exceptions as there are many of them!
                continue

            # Empty argument vectors are dropped silently as well.
            if not subject_vector.size or not object_vector.size:
                continue

            # XXX multiply by the count?
            pair_tensor = sparse.kron(subject_vector, object_vector)

            if verb in verb_tensors:
                verb_tensors[verb] += pair_tensor
            else:
                verb_tensors[verb] = pair_tensor

        channel.send(('message', 'send_next'))