Example 1
def cooccurrence(
    pool=None,
    context=('c', 'context.csv', 'The file with context words.'),
    targets=('t', 'targets.csv', 'The file with target words.'),
    input_dir=(
        'i',
        local('./downloads/google_ngrams/5_cooccurrence'),
        'The path to the directory with the co-occurrence.',
    ),
    output=('o', 'matrix.h5', 'The output matrix file.'),
):
    """Build the co-occurrence matrix."""
    file_names = input_dir.listdir(sort=True)
    pieces = pool.map(load_cooccurrence, ((f, targets, context) for f in file_names))

    # Get rid of empty frames
    pieces = list(filter(len, pieces))

    while len(pieces) > 1:
        logger.info('Pairs left %s', len(pieces))

        if divmod(len(pieces), 2)[1]:
            odd = [pieces.pop()]
        else:
            odd = []

        pieces = list(pool.map(group_sum, get_pairs(pieces))) + odd

    matrix, = pieces

    write_space(output, context, targets, matrix)
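
The loop above is a pairwise (tree) reduction: the loaded frames are summed in adjacent pairs until a single matrix remains, with one frame set aside whenever the count is odd and appended back after the pass. The helpers get_pairs and group_sum are not part of this listing; the following is a minimal hypothetical sketch consistent with how they are called, assuming each piece is a DataFrame indexed by (id_target, id_context) with a count column. The project's own implementation may differ.

import pandas as pd


def get_pairs(pieces):
    """Yield consecutive (left, right) pairs from an even-length list of frames."""
    iterator = iter(pieces)
    return zip(iterator, iterator)


def group_sum(pair):
    """Sum two co-occurrence frames, adding counts for identical index entries."""
    left, right = pair
    return pd.concat([left, right]).groupby(level=left.index.names).sum()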
Example 2
def cooccurrence(
        pool=None,
        context=('c', 'context.csv', 'The file with context words.'),
        targets=('t', 'targets.csv', 'The file with target words.'),
        input_dir=(
            'i',
            local('./downloads/google_ngrams/5_cooccurrence'),
            'The path to the directory with the co-occurrence.',
        ),
        output=('o', 'matrix.h5', 'The output matrix file.'),
):
    """Build the co-occurrence matrix."""
    file_names = input_dir.listdir(sort=True)
    pieces = pool.map(load_cooccurrence,
                      ((f, targets, context) for f in file_names))

    # Get rid of empty frames
    pieces = list(filter(len, pieces))

    while len(pieces) > 1:
        logger.info('Pairs left %s', len(pieces))

        if divmod(len(pieces), 2)[1]:
            odd = [pieces.pop()]
        else:
            odd = []

        pieces = list(pool.map(group_sum, get_pairs(pieces))) + odd

    matrix, = pieces

    write_space(output, context, targets, matrix)
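
Every example in this listing finishes by calling write_space, which is not shown here. A minimal sketch, assuming the space is persisted as three pandas objects (the co-occurrence counts plus the target and context label frames) in a single HDF5 file; the project's actual storage layout may differ.

import pandas as pd


def write_space(file_name, context, targets, matrix):
    """Store the co-occurrence counts together with the row and column labels."""
    with pd.HDFStore(file_name, mode='w') as store:
        store['matrix'] = matrix
        store['targets'] = targets
        store['context'] = context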
Example 3
def cooccurrence(
    corpus,
    execnet_hub,
    targets,
    context,
    paths_progress_iter,
    output=('o', 'space.h5', 'The output space file.'),
):
    """Build the co-occurrence matrix."""

    if targets.index.nlevels > 1:
        targets.sortlevel(inplace=True)
    if context.index.nlevels > 1:
        context.sortlevel(inplace=True)

    def init(channel):
        channel.send(
            (
                'data',
                pickle.dumps(
                    {
                        'kwargs': {
                            'targets': targets,
                            'context': context,
                        },
                        'instance': corpus,
                        'folder_name': 'cooccurrence',
                    },
                )
            )
        )

    results = execnet_hub.run(
        remote_func=sum_folder,
        iterable=paths_progress_iter,
        init_func=init,
    )

    results = ([r] for r in results if r is not None)
    result = next(results)[0]

    for i, chunk in enumerate(chunked(results, 100)):
        logger.info('Received result chunk #%s.', i)
        chunked_result = [c[0] for c in chunk]

        with Timer() as timed:
            result = pd.concat(
                chunked_result + [result],
                copy=False,
            ).groupby(level=result.index.names).sum()

        logger.info(
            'Computed the result by merging a chunk of received results and the result in %.2f seconds.',
            timed.elapsed,
        )

    result = result.to_frame('count')
    result.reset_index(inplace=True)

    write_space(output, context, targets, result)
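
Here partial results arrive from execnet workers and are merged incrementally: each batch of up to 100 results (chunked is presumably more_itertools.chunked) is concatenated with the running total and re-aggregated with a groupby-sum over the index. The Timer used for logging is not shown in this listing; a hypothetical context manager compatible with the usage above could be:

import time


class Timer:
    """Measure the wall-clock time spent inside a ``with`` block."""

    def __enter__(self):
        self._start = time.monotonic()
        return self

    def __exit__(self, *exc_info):
        self.elapsed = time.monotonic() - self._start
        return False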
Example 4
    def write(self, file_name):
        """Write the vector space to a file."""

        coo = coo_matrix(self.matrix)

        matrix = pd.DataFrame({
            'count': coo.data,
            'id_target': coo.row,
            'id_context': coo.col,
        }).set_index(['id_target', 'id_context'])

        write_space(file_name, self.column_labels, self.row_labels, matrix)
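
The conversion above relies on scipy's COO ("coordinate") sparse format, whose .row, .col and .data arrays line up element-wise, so the resulting frame has exactly one row per non-zero cell. A small self-contained illustration:

import numpy as np
from scipy.sparse import coo_matrix

dense = np.array([[0, 2], [3, 0]])
coo = coo_matrix(dense)

# Parallel arrays: row indices, column indices and values of the non-zero cells.
print(coo.row, coo.col, coo.data)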
Example 5
    def write(self, file_name):
        """Write the vector space to a file."""

        coo = coo_matrix(self.matrix)

        matrix = pd.DataFrame(
            {
                'count': coo.data,
                'id_target': coo.row,
                'id_context': coo.col,
            }
        ).set_index(['id_target', 'id_context'])

        write_space(file_name, self.column_labels, self.row_labels, matrix)
Example 6
def cooccurrence(
        bnc,
        pool,
        targets,
        context,
        window_size=('', 5, 'Window size.'),
        chunk_size=('', 7, 'Length of the chunk at the reduce stage.'),
        stem=('', False, 'Use word stems instead of word strings.'),
        output=('o', 'matrix.h5', 'The output matrix file.'),
):
    """Build the co-occurrence matrix."""
    records = Counter()

    all_fileids = Bar(
        'Reading BNC',
        max=len(bnc.fileids()),
        suffix='%(index)d/%(max)d, elapsed: %(elapsed_td)s',
    ).iter(bnc.fileids())

    for fileids_chunk in chunked(all_fileids, 100):

        counters = pool.imap_unordered(
            bnc_cooccurrence,
            ((bnc.root, fileids, window_size, stem, targets, context)
             for fileids in fileids_chunk),
        )

        records += sum_counters(counters, pool=pool, chunk_size=chunk_size)

        logger.debug('There are %d co-occurrence records so far.',
                     len(records))

    matrix = pd.DataFrame(
        ([t, c, n] for (t, c), n in records.items()),
        columns=('id_target', 'id_context', 'count'),
    )
    matrix.set_index(['id_target', 'id_context'], inplace=True)

    write_space(output, context, targets, matrix)
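
This variant reads the BNC in chunks of 100 files, lets each worker return a Counter of (id_target, id_context) pairs via bnc_cooccurrence, and folds them into a running total with sum_counters. Neither helper appears in this listing; a hypothetical sum_counters, assuming it merely reduces an iterable of Counters and can optionally sum chunks in parallel worker processes, might look like this. The project's implementation may differ.

from collections import Counter
from functools import reduce
from operator import add

from more_itertools import chunked


def _sum_chunk(chunk):
    """Add up a list of Counters."""
    return reduce(add, chunk, Counter())


def sum_counters(counters, pool=None, chunk_size=7):
    """Reduce an iterable of Counters into a single Counter, chunk by chunk."""
    counters = list(counters)

    if pool is not None:
        # Repeatedly sum fixed-size chunks in the worker pool until few remain.
        while len(counters) > chunk_size:
            counters = list(
                pool.imap_unordered(_sum_chunk, chunked(counters, chunk_size))
            )

    return _sum_chunk(counters)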