Code example #1
from functools import partial

# score_mp and find_mutation_minpairs are assumed to be defined elsewhere
# in the same module.


def find_mutation_minpairs_all_words(corpus_context,
                                     num_cores=-1,
                                     stop_check=None,
                                     call_back=None):
    # Bind the corpus context so the helper only needs the word argument.
    function = partial(find_mutation_minpairs, corpus_context)
    if call_back is not None:
        call_back('Calculating neighborhood densities...')
        call_back(0, len(corpus_context))
        cur = 0
    if num_cores == -1:
        for w in corpus_context:
            if stop_check is not None and stop_check():
                return
            # Guard the progress callback: cur only exists when call_back
            # was given.
            if call_back is not None:
                cur += 1
                call_back(cur)
            res = function(w)
            setattr(w.original, corpus_context.attribute.name, res[0])
    else:
        iterable = ((w,) for w in corpus_context)
        neighbors = score_mp(iterable, function, num_cores, call_back,
                             stop_check, chunk_size=1)
        for n in neighbors:
            # Have to look up the key, then look up the object, because
            # multiprocessing pickles (copies) the word objects.
            setattr(corpus_context.corpus.find(corpus_context.corpus.key(n[0])),
                    corpus_context.attribute.name, n[1][0])
Code example #2
from functools import partial

# score_mp and find_mutation_minpairs are assumed to be defined elsewhere
# in the same module.


def find_mutation_minpairs_all_words(corpus_context,
                                     num_cores=-1,
                                     stop_check=None,
                                     call_back=None):
    function = partial(find_mutation_minpairs, corpus_context)
    if call_back is not None:
        call_back('Calculating neighborhood densities...')
        call_back(0, len(corpus_context))
        cur = 0
    if num_cores == -1:
        for w in corpus_context:
            if stop_check is not None and stop_check():
                return
            # Guard the progress callback: cur only exists when call_back
            # was given.
            if call_back is not None:
                cur += 1
                call_back(cur)
            res = function(w)
            setattr(w.original, corpus_context.attribute.name, res[0])
    else:
        iterable = ((w,) for w in corpus_context)

        neighbors = score_mp(iterable,
                             function,
                             num_cores,
                             call_back,
                             stop_check,
                             chunk_size=1)
        for n in neighbors:
            # Have to look up the key, then look up the object, because
            # multiprocessing pickles (copies) the word objects.
            setattr(
                corpus_context.corpus.find(corpus_context.corpus.key(n[0])),
                corpus_context.attribute.name, n[1][0])
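
Both variants of this function drive the optional call_back with three different call shapes: a status string, an (initial, total) pair, and then a running counter. The sketch below is a minimal compatible callback; the ProgressPrinter class is invented for illustration and is not part of the library.

class ProgressPrinter:
    """Toy progress callback matching the three call shapes used above."""
    def __init__(self):
        self.total = 0

    def __call__(self, *args):
        if len(args) == 2:
            self.total = args[1]              # call_back(0, len(corpus_context))
        elif isinstance(args[0], str):
            print(args[0])                    # call_back('Calculating ...')
        else:
            print('%d/%d' % (args[0], self.total))  # call_back(cur)

# e.g. find_mutation_minpairs_all_words(corpus_context,
#                                       call_back=ProgressPrinter())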
Code example #3
from functools import partial

# score_mp and neighborhood_density are assumed to be defined elsewhere
# in the same module.


def neighborhood_density_all_words(corpus_context,
                                   algorithm='edit_distance',
                                   max_distance=1,
                                   num_cores=-1,
                                   stop_check=None,
                                   call_back=None):
    """Calculate the neighborhood density of all words in the corpus and
    adds them as attributes of the words.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    algorithm : str
        The algorithm used to determine distance
    max_distance : float, optional
        Maximum edit distance from the queried word to consider a word a neighbor.
    num_cores : int, optional
        Number of worker processes to use; -1 (the default) runs serially.
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function
    """
    function = partial(neighborhood_density,
                       corpus_context,
                       algorithm=algorithm,
                       max_distance=max_distance)
    if call_back is not None:
        call_back('Calculating neighborhood densities...')
        call_back(0, len(corpus_context))
        cur = 0
    if num_cores == -1:
        for w in corpus_context:
            if stop_check is not None and stop_check():
                return
            # Guard the progress callback: cur only exists when call_back
            # was given.
            if call_back is not None:
                cur += 1
                call_back(cur)
            res = function(w)
            setattr(w.original, corpus_context.attribute.name, res[0])
    else:
        iterable = ((w,) for w in corpus_context)

        neighbors = score_mp(iterable,
                             function,
                             num_cores,
                             call_back,
                             stop_check,
                             chunk_size=1)
        for n in neighbors:
            # Have to look up the key, then look up the object, because
            # multiprocessing pickles (copies) the word objects.
            setattr(
                corpus_context.corpus.find(corpus_context.corpus.key(n[0])),
                corpus_context.attribute.name, n[1][0])
Code example #4
from functools import partial

# score_mp and neighborhood_density are assumed to be defined elsewhere
# in the same module.


def neighborhood_density_all_words(corpus_context,
                                   algorithm='edit_distance',
                                   max_distance=1,
                                   num_cores=-1,
                                   stop_check=None,
                                   call_back=None):
    """Calculate the neighborhood density of every word in the corpus and
    add the result to each word as an attribute.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    algorithm : str
        The algorithm used to determine distance
    max_distance : float, optional
        Maximum edit distance from the queried word to consider a word a neighbor.
    num_cores : int, optional
        Number of worker processes to use; -1 (the default) runs serially.
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function
    """
    # Bind everything except the word itself, so the helper can be mapped
    # over the corpus.
    function = partial(neighborhood_density,
                       corpus_context,
                       algorithm=algorithm,
                       max_distance=max_distance)
    if call_back is not None:
        call_back('Calculating neighborhood densities...')
        call_back(0, len(corpus_context))
        cur = 0
    if num_cores == -1:
        for w in corpus_context:
            if stop_check is not None and stop_check():
                return
            # Guard the progress callback: cur only exists when call_back
            # was given.
            if call_back is not None:
                cur += 1
                call_back(cur)
            res = function(w)
            setattr(w.original, corpus_context.attribute.name, res[0])
    else:
        iterable = ((w,) for w in corpus_context)
        neighbors = score_mp(iterable, function, num_cores, call_back,
                             stop_check, chunk_size=1)
        for n in neighbors:
            # Have to look up the key, then look up the object, because
            # multiprocessing pickles (copies) the word objects.
            setattr(corpus_context.corpus.find(corpus_context.corpus.key(n[0])),
                    corpus_context.attribute.name, n[1][0])
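
In every version above, functools.partial binds everything except the word itself, producing a one-argument callable that can be applied per word serially or handed to score_mp for multiprocessing. Below is a runnable toy illustration of that pattern; toy_density is a hypothetical stand-in for neighborhood_density, not the real function.

from functools import partial

def toy_density(corpus, word, algorithm='edit_distance', max_distance=1):
    # Stand-in for neighborhood_density: treats words within max_distance
    # in length as neighbors and returns a (count, neighbors) pair, like
    # the real helper's res[0]/res[1].
    neighbors = [w for w in corpus
                 if w != word and abs(len(w) - len(word)) <= max_distance]
    return (len(neighbors), neighbors)

corpus = ['cat', 'cab', 'bat', 'casts']
function = partial(toy_density, corpus, max_distance=1)
print(function('cat'))   # only the word argument is left open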
Code example #5
from functools import partial

# score_mp and find_mutation_minpairs are assumed to be defined elsewhere
# in the same module.


def find_mutation_minpairs_all_words(corpus_context,
                                     tierdict,
                                     tier_type=None,
                                     num_cores=-1,
                                     collapse_homophones=False,
                                     stop_check=None,
                                     call_back=None):
    function = partial(find_mutation_minpairs,
                       corpus_context,
                       tier_type=tier_type,
                       collapse_homophones=collapse_homophones)
    if call_back is not None:
        call_back('Calculating neighborhood densities...')
        call_back(0, len(corpus_context))
        cur = 0

    results = dict()
    last_value_removed = None
    last_key_removed = None
    if num_cores == -1 or num_cores == 1:
        for w in corpus_context:
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                call_back(cur)
            # Restore the previous word to its tier bucket, then remove the
            # current word from its own bucket so it cannot match itself.
            if last_value_removed:
                tierdict[last_key_removed].append(last_value_removed)
            w_sequence = getattr(w, corpus_context.sequence_type)
            last_key_removed = str(w_sequence)
            for i, item in enumerate(tierdict[last_key_removed]):
                if str(item) == str(w):
                    last_value_removed = tierdict[last_key_removed].pop(i)
                    break
            res = find_mutation_minpairs(corpus_context, w,
                                         tier_type=tier_type,
                                         collapse_homophones=collapse_homophones)
            results[str(w)] = res[1]
            setattr(w.original, corpus_context.attribute.name, res[0])

    else:
        iterable = ((w,) for w in corpus_context)
        neighbors = score_mp(iterable, function, num_cores, call_back,
                             stop_check, chunk_size=1)
        for n in neighbors:
            # Have to look up the key, then look up the object, because
            # multiprocessing pickles (copies) the word objects.
            setattr(corpus_context.corpus.find(corpus_context.corpus.key(n[0])),
                    corpus_context.attribute.name, n[1][0])

    return results
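
The append/pop bookkeeping in the serial loop above is easy to misread: each word is popped out of its own tierdict bucket before scoring so it cannot match itself, and the previously popped word is restored at the top of the next iteration. Here is a self-contained trace of just that bookkeeping, using plain strings instead of word objects.

tierdict = {'K.AE.T': ['cat'], 'B.AE.T': ['bat', 'Bat']}
words = [('cat', 'K.AE.T'), ('bat', 'B.AE.T')]

last_value_removed = None
last_key_removed = None
for w, tier in words:
    if last_value_removed:
        tierdict[last_key_removed].append(last_value_removed)  # restore
    last_key_removed = tier
    for i, item in enumerate(tierdict[last_key_removed]):
        if str(item) == str(w):
            last_value_removed = tierdict[last_key_removed].pop(i)
            break
    print(w, tierdict)   # the current word is absent from its own bucket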
Code example #6
from functools import partial

# score_mp and neighborhood_density are assumed to be defined elsewhere
# in the same module.


def neighborhood_density_all_words(corpus_context,
                                   tierdict,
                                   tier_type=None,
                                   sequence_type=None,
                                   algorithm='edit_distance',
                                   max_distance=1,
                                   output_format='spelling',
                                   num_cores=-1,
                                   settable_attr=None,
                                   collapse_homophones=False,
                                   stop_check=None,
                                   call_back=None):
    """Calculate the neighborhood density of every word in the corpus and
    add the result to each word as an attribute.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    tierdict : dict
        Mapping from tier string to the list of words sharing that tier
    algorithm : str
        The algorithm used to determine distance
    max_distance : float, optional
        Maximum edit distance from the queried word to consider a word a neighbor.
    num_cores : int, optional
        Number of worker processes to use; -1 (the default) or 1 runs serially.
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function
    settable_attr : Attribute
        Attribute that neighbourhood density results will be assigned to
        (its .name is passed to setattr)
    """
    function = partial(neighborhood_density,
                       corpus_context,
                       tierdict=tierdict,
                       tier_type=tier_type,
                       sequence_type=sequence_type,
                       algorithm=algorithm,
                       max_distance=max_distance,
                       collapse_homophones=collapse_homophones)
    if call_back is not None:
        call_back('Calculating neighborhood densities...')
        call_back(0, len(corpus_context))
        cur = 0

    results = dict()
    last_value_removed = None
    last_key_removed = None
    if num_cores == -1 or num_cores == 1:
        for w in corpus_context:
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                call_back(cur)
            # Restore the previous word to its tier bucket, then remove the
            # current word from its own bucket so it is not counted as its
            # own neighbor.
            if last_value_removed:
                tierdict[last_key_removed].append(last_value_removed)
            w_sequence = getattr(w, corpus_context.sequence_type)
            last_key_removed = str(w_sequence)
            for i, item in enumerate(tierdict[last_key_removed]):
                if str(item) == str(w):
                    last_value_removed = tierdict[last_key_removed].pop(i)
                    break
            res = neighborhood_density(corpus_context, w, tierdict,
                                       tier_type=tier_type,
                                       sequence_type=sequence_type,
                                       algorithm=algorithm,
                                       max_distance=max_distance,
                                       collapse_homophones=collapse_homophones)
            results[str(w)] = [getattr(r, output_format) for r in res[1]]
            setattr(w.original, settable_attr.name, res[0])


        # An earlier version of this loop subtracted 1 from res[0] because
        # each word was counted as its own neighbor; popping the word from
        # tierdict above addresses the same problem.
    else:
        iterable = ((w,) for w in corpus_context)
        neighbors = score_mp(iterable, function, num_cores, call_back,
                             stop_check, chunk_size=1)
        for n in neighbors:
            # Have to look up the key, then look up the object, because
            # multiprocessing pickles (copies) the word objects.
            setattr(corpus_context.corpus.find(corpus_context.corpus.key(n[0])),
                    settable_attr.name, n[1][0])

    return results
Code example #7
from functools import partial

# score_mp and neighborhood_density are assumed to be defined elsewhere
# in the same module.


def neighborhood_density_all_words(corpus_context,
                                   tierdict,
                                   tier_type=None,
                                   sequence_type=None,
                                   algorithm='edit_distance',
                                   max_distance=1,
                                   output_format='spelling',
                                   num_cores=-1,
                                   settable_attr=None,
                                   collapse_homophones=False,
                                   stop_check=None,
                                   call_back=None):
    """Calculate the neighborhood density of all words in the corpus and
    adds them as attributes of the words.

    Parameters
    ----------
    corpus_context : CorpusContext
        Context manager for a corpus
    tierdict : dict
        Mapping from tier string to the list of words sharing that tier
    algorithm : str
        The algorithm used to determine distance
    max_distance : float, optional
        Maximum edit distance from the queried word to consider a word a neighbor.
    num_cores : int, optional
        Number of worker processes to use; -1 (the default) or 1 runs serially.
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function
    settable_attr : Attribute
        Attribute that neighbourhood density results will be assigned to
        (its .name is passed to setattr)
    """
    function = partial(neighborhood_density,
                       corpus_context,
                       tierdict=tierdict,
                       tier_type=tier_type,
                       sequence_type=sequence_type,
                       algorithm=algorithm,
                       max_distance=max_distance,
                       collapse_homophones=collapse_homophones)
    if call_back is not None:
        call_back('Calculating neighborhood densities...')
        call_back(0, len(corpus_context))
        cur = 0

    results = dict()
    last_value_removed = None
    last_key_removed = None
    if num_cores == -1 or num_cores == 1:

        for w in corpus_context:
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                call_back(cur)
            # Restore the previous word to its tier bucket, then remove the
            # current word from its own bucket so it is not counted as its
            # own neighbor.
            if last_value_removed:
                tierdict[last_key_removed].append(last_value_removed)
            w_sequence = getattr(w, corpus_context.sequence_type)
            last_key_removed = str(w_sequence)
            for i, item in enumerate(tierdict[last_key_removed]):
                if str(item) == str(w):
                    last_value_removed = tierdict[last_key_removed].pop(i)
                    break
            res = neighborhood_density(corpus_context,
                                       w,
                                       tierdict,
                                       tier_type=tier_type,
                                       sequence_type=sequence_type,
                                       algorithm=algorithm,
                                       max_distance=max_distance,
                                       collapse_homophones=collapse_homophones)
            results[str(w)] = [getattr(r, output_format) for r in res[1]]
            setattr(w.original, settable_attr.name, res[0])

        # An earlier version of this loop subtracted 1 from res[0] because
        # each word was counted as its own neighbor; popping the word from
        # tierdict above addresses the same problem.
    else:
        iterable = ((w,) for w in corpus_context)
        neighbors = score_mp(iterable,
                             function,
                             num_cores,
                             call_back,
                             stop_check,
                             chunk_size=1)
        for n in neighbors:
            # Have to look up the key, then look up the object, because
            # multiprocessing pickles (copies) the word objects.
            setattr(
                corpus_context.corpus.find(corpus_context.corpus.key(n[0])),
                settable_attr.name,
                n[1][0])

    return results
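
The key-then-object lookup in the multiprocessing branches exists because worker processes receive pickled copies of the word objects, so mutating them in a worker never touches the originals. The sketch below reproduces that pattern with a plain dict standing in for corpus.key()/corpus.find(); the Word class and score function are invented for illustration.

import multiprocessing

class Word:
    def __init__(self, spelling):
        self.spelling = spelling

def score(spelling):
    # Workers return (key, result); returning the Word object itself would
    # only hand back a pickled copy.
    return (spelling, len(spelling))

if __name__ == '__main__':
    corpus = {s: Word(s) for s in ['cat', 'casts']}
    with multiprocessing.Pool(2) as pool:
        for key, value in pool.map(score, corpus):
            setattr(corpus[key], 'density', value)  # re-find the original
    print(corpus['cat'].density)   # 3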
Code example #8
from functools import partial

# score_mp and find_mutation_minpairs are assumed to be defined elsewhere
# in the same module.


def find_mutation_minpairs_all_words(corpus_context,
                                     tierdict,
                                     tier_type=None,
                                     num_cores=-1,
                                     collapse_homophones=False,
                                     stop_check=None,
                                     call_back=None):

    function = partial(find_mutation_minpairs,
                       corpus_context,
                       tier_type=tier_type,
                       collapse_homophones=collapse_homophones)
    if call_back is not None:
        call_back('Calculating neighborhood densities...')
        call_back(0, len(corpus_context))
        cur = 0

    results = dict()
    last_value_removed = None
    last_key_removed = None
    if num_cores == -1 or num_cores == 1:
        for w in corpus_context:
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                call_back(cur)
            # Restore the previous word to its tier bucket, then remove the
            # current word from its own bucket so it cannot match itself.
            if last_value_removed:
                tierdict[last_key_removed].append(last_value_removed)
            w_sequence = getattr(w, corpus_context.sequence_type)
            last_key_removed = str(w_sequence)
            for i, item in enumerate(tierdict[last_key_removed]):
                if str(item) == str(w):
                    last_value_removed = tierdict[last_key_removed].pop(i)
                    break
            res = find_mutation_minpairs(
                corpus_context,
                w,
                tier_type=tier_type,
                collapse_homophones=collapse_homophones)
            results[str(w)] = res[1]
            setattr(w.original, corpus_context.attribute.name, res[0])

    else:
        iterable = ((w,) for w in corpus_context)
        neighbors = score_mp(iterable,
                             function,
                             num_cores,
                             call_back,
                             stop_check,
                             chunk_size=1)
        for n in neighbors:
            # Have to look up the key, then look up the object, because
            # multiprocessing pickles (copies) the word objects.
            setattr(
                corpus_context.corpus.find(corpus_context.corpus.key(n[0])),
                corpus_context.attribute.name, n[1][0])

    return results
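
None of the examples show how tierdict is constructed; from the lookups above it must map the string form of a word's tier to the list of words sharing that tier. Below is a possible construction under that assumption (build_tierdict and the Word class are hypothetical, not the library's code).

from collections import defaultdict

class Word:
    def __init__(self, spelling, transcription):
        self.spelling = spelling
        self.transcription = transcription

def build_tierdict(words, sequence_type='transcription'):
    # Bucket words by the string form of their tier, matching the
    # str(w_sequence) keys used in the functions above.
    tierdict = defaultdict(list)
    for w in words:
        tierdict[str(getattr(w, sequence_type))].append(w)
    return tierdict

words = [Word('cat', 'K.AE.T'), Word('bat', 'B.AE.T')]
print({k: [w.spelling for w in v] for k, v in build_tierdict(words).items()})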