def cooccurrence(
    pool=None,
    context=('c', 'context.csv', 'The file with context words.'),
    targets=('t', 'targets.csv', 'The file with target words.'),
    input_dir=(
        'i',
        local('./downloads/google_ngrams/5_cooccurrence'),
        'The path to the directory with the co-occurence.',
    ),
    output=('o', 'matrix.h5', 'The output matrix file.'),
):
    """Build the co-occurrence matrix."""
    # Load one frame per input file, in parallel over the worker pool.
    frames = pool.map(
        load_cooccurrence,
        ((path, targets, context) for path in input_dir.listdir(sort=True)),
    )

    # Drop frames that contain no records.
    frames = [frame for frame in frames if len(frame)]

    # Pairwise tree reduction: sum frames two at a time until a single
    # frame remains.  An odd frame is carried over to the next round.
    while len(frames) > 1:
        logger.info('Pairs left %s', len(frames))

        leftover = [frames.pop()] if len(frames) % 2 else []
        frames = list(pool.map(group_sum, get_pairs(frames))) + leftover

    matrix, = frames
    write_space(output, context, targets, matrix)
def cooccurrence(
    corpus,
    execnet_hub,
    targets,
    context,
    paths_progress_iter,
    output=('o', 'space.h5', 'The output space file.'),
):
    """Build the co-occurrence matrix.

    Fans counting out to execnet workers (one ``sum_folder`` task per
    path), merges the partial counts the workers send back into a
    single frame, and writes the result with ``write_space``.
    """
    # Keep any MultiIndex sorted so the workers' label lookups are fast.
    # FIX: ``sortlevel`` was deprecated in pandas 0.20 and removed in
    # 0.23; ``sort_index`` is the supported replacement.
    if targets.index.nlevels > 1:
        targets.sort_index(inplace=True)
    if context.index.nlevels > 1:
        context.sort_index(inplace=True)

    def init(channel):
        # Sent once per worker before it starts processing paths: the
        # shared lookup tables, the corpus instance, and the folder name.
        channel.send(
            (
                'data',
                pickle.dumps(
                    {
                        'kwargs': {
                            'targets': targets,
                            'context': context,
                        },
                        'instance': corpus,
                        'folder_name': 'cooccurrence',
                    },
                )
            )
        )

    results = execnet_hub.run(
        remote_func=sum_folder,
        iterable=paths_progress_iter,
        init_func=init,
    )

    # Wrap each non-empty result in a singleton list so chunked() below
    # groups whole results; seed the accumulator with the first one.
    results = ([r] for r in results if r is not None)
    result = next(results)[0]

    # Merge incoming partial counts 100 at a time to bound the size of
    # each concat/groupby pass.
    for i, chunk in enumerate(chunked(results, 100)):
        logger.info('Received result chunk #%s.', i)
        chunked_result = [c[0] for c in chunk]

        with Timer() as timed:
            result = pd.concat(
                chunked_result + [result],
                copy=False,
            ).groupby(level=result.index.names).sum()

        logger.info(
            'Computed the result by merging a chunk of received results and the result in %.2f seconds.',
            timed.elapsed,
        )

    result = result.to_frame('count')
    result.reset_index(inplace=True)

    write_space(output, context, targets, result)
def write(self, file_name):
    """Write the vector space to a file."""
    # Convert to COO so the non-zero cells come out as parallel
    # (row, col, data) arrays.
    sparse = coo_matrix(self.matrix)

    frame = pd.DataFrame(
        {
            'count': sparse.data,
            'id_target': sparse.row,
            'id_context': sparse.col,
        }
    )
    frame = frame.set_index(['id_target', 'id_context'])

    write_space(file_name, self.column_labels, self.row_labels, frame)
def write(self, file_name):
    """Write the vector space to a file."""
    # COO layout exposes the non-zero entries as three aligned arrays.
    entries = coo_matrix(self.matrix)

    columns = {
        'count': entries.data,
        'id_target': entries.row,
        'id_context': entries.col,
    }
    matrix = pd.DataFrame(columns).set_index(['id_target', 'id_context'])

    write_space(file_name, self.column_labels, self.row_labels, matrix)
def cooccurrence(
    bnc,
    pool,
    targets,
    context,
    window_size=('', 5, 'Window size.'),
    chunk_size=('', 7, 'Length of the chunk at the reduce stage.'),
    stem=('', False, 'Use word stems instead of word strings.'),
    output=('o', 'matrix.h5', 'The output matrix file.'),
):
    """Build the co-occurrence matrix."""
    totals = Counter()

    progress = Bar(
        'Reading BNC',
        max=len(bnc.fileids()),
        suffix='%(index)d/%(max)d, elapsed: %(elapsed_td)s',
    )

    # Walk the corpus 100 file ids at a time: fan each batch out to the
    # worker pool, then fold the partial counters into the running total.
    for batch in chunked(progress.iter(bnc.fileids()), 100):
        jobs = (
            (bnc.root, fileids, window_size, stem, targets, context)
            for fileids in batch
        )
        partial_counters = pool.imap_unordered(bnc_cooccurrence, jobs)

        totals += sum_counters(partial_counters, pool=pool, chunk_size=chunk_size)
        logger.debug('There are %d co-occurrence records so far.', len(totals))

    # Flatten the counter into (target, context, count) rows.
    matrix = pd.DataFrame(
        ([t, c, n] for (t, c), n in totals.items()),
        columns=('id_target', 'id_context', 'count'),
    )
    matrix.set_index(['id_target', 'id_context'], inplace=True)

    write_space(output, context, targets, matrix)