Example #1
def __z_test_word_list__(word_list_i, word_list_j, corpus_list, high, low):
    # type: (dict, dict, dict, float, float) -> dict
    """
    Takes two word lists and runs a z-test on every word in those two word lists,
    returning the results in a dict that maps each word to its corresponding z-score.

    Args:
        word_list_i: the first word list, a dictionary mapping word to word count
        word_list_j: the second word list, a dictionary mapping word to word count
        corpus_list: the word list of the whole corpus, a dictionary mapping word to word count
        high: the upper bound of the word filter; only words whose corpus count is below this are analyzed
        low: the lower bound of the word filter; only words whose corpus count is above this are analyzed

    Returns:
        a dictionary mapping each analyzed word to its z-score
    """
    total_count_i = sum(word_list_i.values())
    total_count_j = sum(word_list_j.values())
    total_list = merge_list([word_list_j, word_list_i])
    word_z_score_dict = {}
    for word in total_list:
        if low < corpus_list[word] < high:  # apply the word filter
            try:
                p_i = word_list_i[word] / total_count_i
            except KeyError:
                p_i = 0
            try:
                p_j = word_list_j[word] / total_count_j
            except KeyError:
                p_j = 0
            z_score = __z_test__(p_i, p_j, total_count_i, total_count_j)
            word_z_score_dict.update({word.decode('utf-8'): z_score})
    return word_z_score_dict
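
The helper __z_test__ is not shown in these snippets. A minimal sketch of a standard two-proportion z-test, which is presumably what it computes (the exact formula in the original codebase may differ):

import math

def __z_test__(p_i, p_j, n_i, n_j):
    """Two-proportion z-test: compare proportion p_i (sample size n_i)
    against proportion p_j (sample size n_j)."""
    # pooled proportion under the null hypothesis that both samples share one rate;
    # callers filter words first, so p_hat is assumed to lie strictly between 0 and 1
    p_hat = (p_i * n_i + p_j * n_j) / (n_i + n_j)
    standard_error = math.sqrt(p_hat * (1 - p_hat) * (1 / n_i + 1 / n_j))
    return (p_i - p_j) / standard_error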
Example #2
def test_all_to_para(word_lists, option='CustomP', low=0.0, high=None):
    """
    this method takes Wordlist and and then analyze each single word(*compare to the total passage(all the chunks)*),
    and then pack that into the return

    :param word_lists:   Array
                        each element of array represent a chunk, and it is a dictionary type
                        each element in the dictionary maps word inside that chunk to its frequency

    :param option:  some default option to set for High And Low(see the document for High and Low)
                    1. using standard deviation to find outlier
                        TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                        TopIQR: only analyze the Top outlier of word, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outlier of word, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the Left outlier of word, determined by IQR
                                    (median - 1.5 * Standard > word frequency)

    :param low:  this method will only analyze the word with higher frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')
    :param high: this method will only analyze the word with lower frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')

    :return:    contain a array
                each element of array is a array, represent a chunk and it is sorted via z_score
                each element array is a tuple: (word, corresponding z_score)
    """

    # init
    corpus_list = merge_list(word_lists)
    all_results = []  # the value to return
    total_word_count = sum(corpus_list.values())
    num_word = len(corpus_list)

    # handle option (word filter)
    high, low = __word_filter__(option, low, high, num_word, total_word_count,
                                corpus_list)

    # calculation
    for word_list in word_lists:
        word_z_score_dict = __z_test_word_list__(word_list_i=word_list,
                                                 word_list_j=corpus_list,
                                                 corpus_list=corpus_list,
                                                 high=high,
                                                 low=low)

        sorted_list = sorted(word_z_score_dict.items(),
                             key=itemgetter(1),
                             reverse=True)
        all_results.append(sorted_list)

    return all_results
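
The __word_filter__ helper (wordfilter in the older examples) is not shown. Judging from its call signature and the option names documented above, it turns an option preset into concrete (high, low) word-count bounds. A rough sketch under those assumptions; the real implementation may differ:

def __word_filter__(option, low, high, num_word, total_word_count, corpus_list):
    """Sketch only: convert an option preset into (high, low) count bounds."""
    counts = list(corpus_list.values())
    if option == 'CustomP':
        # low/high are proportions of the total word count; None means unbounded
        low = low * total_word_count
        high = high * total_word_count if high is not None else total_word_count + 1
    elif option == 'TopStdE':
        # right outliers: more than two standard deviations above the mean count
        average = total_word_count / num_word
        std = (sum((count - average) ** 2 for count in counts) / num_word) ** 0.5
        low, high = average + 2 * std, total_word_count + 1
    # ... the remaining presets (MidStdE, LowStdE, TopIQR, ...) follow the same pattern
    return high, low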
Example #3
def testall(WordLists, option="CustomP", Low=0.0, High=None):
    """
    this method takes Wordlist and and then analyze each single word(*compare to the total passage(all the chunks)*),
    and then pack that into the return

    :param WordLists:   Array
                        each element of array represent a chunk, and it is a dictionary type
                        each element in the dictionary maps word inside that chunk to its frequency

    :param option:  some default option to set for High And Low(see the document for High and Low)
                    1. using standard deviation to find outlier
                        TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                        TopIQR: only analyze the Top outlier of word, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outlier of word, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the Left outlier of word, determined by IQR
                                    (median - 1.5 * Standard > word frequency)

    :param Low:  this method will only analyze the word with higher frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')
    :param High: this method will only analyze the word with lower frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')

    :return:    contain a array
                each element of array is a array, represent a chunk and it is sorted via z_score
                each element array is a tuple: (word, corresponding z_score)
    """

    # init
    MergeList = merge_list(WordLists)
    AllResults = []  # the value to return
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)

    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount, MergeList)  # handle option (word filter)

    # calculation
    for wordlist in WordLists:
        ResultList = {}
        ListWordCount = sum(wordlist.values())

        for word in wordlist.keys():
            if Low < MergeList[word] < High:
                z_score = ztest(
                    wordlist[word] / ListWordCount, MergeList[word] / TotalWordCount, ListWordCount, TotalWordCount
                )
                ResultList.update({word: z_score})

        ResultList = sorted(ResultList.items(), key=itemgetter(1), reverse=True)
        AllResults.append(ResultList)

    return AllResults
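
All of these examples rely on a merge_list helper that is not shown. Presumably it merges a list of word-count dictionaries by summing the counts per word; a minimal sketch under that assumption:

from collections import Counter

def merge_list(word_lists):
    """Merge a list of word-count dicts into one dict, summing the counts."""
    merged = Counter()
    for word_list in word_lists:
        merged.update(word_list)  # Counter.update adds counts rather than replacing
    return dict(merged)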
Example #4
def __z_test_word_list__(word_list_i, word_list_j, corpus_list, high, low):
    # type: (dict, dict, dict, float, float) -> dict
    """
    Takes two word lists and runs a z-test on every word in those two word lists,
    returning the results in a dict that maps each word to its corresponding z-score.

    Args:
        word_list_i: the first word list, a dictionary mapping word to word count
        word_list_j: the second word list, a dictionary mapping word to word count
        corpus_list: the word list of the whole corpus, a dictionary mapping word to word count
        high: the upper bound of the word filter; only words whose corpus count is below this are analyzed
        low: the lower bound of the word filter; only words whose corpus count is above this are analyzed

    Returns:
        a dictionary mapping each analyzed word to its z-score
    """
    total_count_i = sum(word_list_i.values())
    total_count_j = sum(word_list_j.values())
    total_list = merge_list([word_list_j, word_list_i])
    word_z_score_dict = {}
    for word in total_list:
        if low < corpus_list[word] < high:  # apply the word filter
            try:
                p_i = word_list_i[word] / total_count_i
            except KeyError:
                p_i = 0
            try:
                p_j = word_list_j[word] / total_count_j
            except KeyError:
                p_j = 0
            # keep 4 digits after the decimal point of the z-score
            z_score = truncate(
                __z_test__(p_i, p_j, total_count_i, total_count_j), 4)
            # discard insignificant results, i.e. those with |z-score| < 1.96 (the two-tailed 5% critical value)
            if abs(z_score) >= 1.96:
                word_z_score_dict.update({word.decode('utf-8'): z_score})
    return word_z_score_dict
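
This variant additionally assumes a truncate helper and hard-codes 1.96, the critical z-value for a two-tailed test at the 5% significance level. A plausible sketch of truncate (keep a fixed number of decimal digits without rounding); the original helper may differ:

import math

def truncate(value, digits):
    """Drop everything past `digits` decimal places without rounding."""
    factor = 10 ** digits
    return math.trunc(value * factor) / factor  # trunc rounds toward zero, so negatives work too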
Example #5
def __z_test_word_list__(word_list_i, word_list_j, corpus_list, high, low):
    # type: (dict, dict, dict, float, float) -> dict
    """
    Takes two word lists and runs a z-test on every word in those two word lists,
    returning the results in a dict that maps each word to its corresponding z-score.

    Args:
        word_list_i: the first word list, a dictionary mapping word to word count
        word_list_j: the second word list, a dictionary mapping word to word count
        corpus_list: the word list of the whole corpus, a dictionary mapping word to word count
        high: the upper bound of the word filter; only words whose corpus count is below this are analyzed
        low: the lower bound of the word filter; only words whose corpus count is above this are analyzed

    Returns:
        a dictionary mapping each analyzed word to its z-score
    """
    total_count_i = sum(word_list_i.values())
    total_count_j = sum(word_list_j.values())
    total_list = merge_list([word_list_j, word_list_i])
    word_z_score_dict = {}
    for word in total_list:
        if low < corpus_list[word] < high:  # apply the word filter
            try:
                p_i = word_list_i[word] / total_count_i
            except KeyError:
                p_i = 0
            try:
                p_j = word_list_j[word] / total_count_j
            except KeyError:
                p_j = 0
            # keep 4 digits after the decimal point of the z-score
            z_score = truncate(__z_test__(p_i, p_j, total_count_i, total_count_j), 4)
            # discard insignificant results, i.e. those with |z-score| < 1.96 (the two-tailed 5% critical value)
            if abs(z_score) >= 1.96:
                word_z_score_dict.update({word.decode('utf-8'): z_score})
    return word_z_score_dict
Example #6
def test_all_to_para(word_lists, option='CustomP', low=0.0, high=None):
    """

    All paragraphs are really references to documents. The UI has been updated to "documents" but all the variables
    below still use paragraphs.

    this method takes Wordlist and and then analyze each single word(*compare to the total passage(all the chunks)*),
    and then pack that into the return

    :param word_lists:   Array
                        each element of array represent a chunk, and it is a dictionary type
                        each element in the dictionary maps word inside that chunk to its frequency

    :param option:  some default option to set for High And Low(see the document for High and Low)
                    1. using standard deviation to find outlier
                        TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                        TopIQR: only analyze the Top outlier of word, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outlier of word, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the Left outlier of word, determined by IQR
                                    (median - 1.5 * Standard > word frequency)

    :param low:  this method will only analyze the word with higher frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')
    :param high: this method will only analyze the word with lower frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')

    :return:    contain a array
                each element of array is a array, represent a chunk and it is sorted via z_score
                each element array is a tuple: (word, corresponding z_score)
    """

    # init
    corpus_list = merge_list(word_lists)
    all_results = []  # the value to return
    total_word_count = sum(corpus_list.values())
    num_word = len(corpus_list)

    # handle option (word filter)
    high, low = __word_filter__(option, low, high, num_word, total_word_count, corpus_list)

    # calculation
    for word_list in word_lists:
        word_z_score_dict = __z_test_word_list__(word_list_i=word_list, word_list_j=corpus_list,
                                                 corpus_list=corpus_list, high=high, low=low)
        sorted_list = sorted(word_z_score_dict.items(), key=lambda item: abs(item[1]), reverse=True)
        all_results.append(sorted_list)

    return all_results
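
Note that this variant sorts by the absolute z-score rather than the raw value (compare Example #2), so strongly under-used words (large negative z-scores) rank just as high as strongly over-used ones:

scores = {'cat': -3.2, 'the': 2.1, 'dog': 0.5}
print(sorted(scores.items(), key=lambda item: abs(item[1]), reverse=True))
# [('cat', -3.2), ('the', 2.1), ('dog', 0.5)]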
Example #7
def KWtest(Matrixs, Words, WordLists, option="CustomP", Low=0.0, High=1.0):
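    """Give the Kruskal-Wallis test result on the top words; the parameters are documented in Example #8."""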
    # begin handle options
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)

    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount, MergeList)
    # end handle options

    Len = max(len(matrix) for matrix in Matrixs)
    # the length of the longest sample set (any shorter sample set is padded and masked to this length)

    word_pvalue_dict = {}  # the result dict

    for i in range(1, len(Matrixs[0][0])):  # focusing on a specific word
        word = Words[i - 1]
        if Low < MergeList[word] < High:
            samples = []
            for k in range(len(Matrixs)):  # focusing on a group
                sample = []
                for j in range(len(Matrixs[k])):  # focusing on all the segments of that group
                    # add the sample into the sample list
                    sample.append(Matrixs[k][j][i])

                # combine all the samples of each sample list;
                # pad and mask the short ones so that all the sample sets have the same length
                samples.append(
                    ma.masked_array(
                        sample + [0] * (Len - len(sample)), mask=[0] * len(sample) + [1] * (Len - len(sample))
                    )
                )

            # do the KW test
            try:
                pvalue = kruskalwallis(samples)[1]
            except ValueError as error:
                if error.args[0] == "All numbers are identical in kruskal":  # check the error message
                    pvalue = "Invalid"
                else:
                    raise

            # put the result in the dict
            word_pvalue_dict.update({word: pvalue})
    return sorted(word_pvalue_dict.items(), key=itemgetter(1))
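
This snippet assumes import numpy.ma as ma and a Kruskal-Wallis routine such as scipy.stats.mstats.kruskalwallis, which honors masks. The padding trick makes every sample the same length while the mask keeps the padding zeros out of the statistics:

import numpy.ma as ma

sample, Len = [3, 1], 4
padded = ma.masked_array(sample + [0] * (Len - len(sample)),
                         mask=[0] * len(sample) + [1] * (Len - len(sample)))
print(padded.mean())  # 2.0 -- the masked padding zeros are ignored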
Example #8
def KWtest(Matrixs, Words, WordLists, option='CustomP', Low=0.0, High=1.0):
    """
    give the kruskal wallis test result on the topword
    :param Matrixs: every element is a group Matrix that contain the word counts, each represent a segement.
    :param Words: all the words (Matrixs and words are parallel)
    :param WordLists: a list of dictionary that has the word map to its word count.
                        each dictionary represent the information inside a segment
    :param option: some default option to set for High And Low(see the document for High and Low)
                    1. using standard deviation to find outlier
                        TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                        TopIQR: only analyze the Top outlier of word, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outlier of word, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the Left outlier of word, determined by IQR
                                    (median - 1.5 * Standard > word frequency)
    :param Low: this method will only analyze the word with higher frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')
    :param High: this method will only analyze the word with lower frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')

    :return:
          a sorted dict (list of tuples) that the first element of the word and the second element is it corresponding p value
    """
    # begin handle options
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)

    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount,
                           MergeList)
    # end handle options

    Len = max(len(matrix) for matrix in Matrixs)
    # the length of the longest sample set (any shorter sample set is padded and masked to this length)

    word_pvalue_dict = {}  # the result dict

    for i in range(1, len(Matrixs[0][0])):  # focusing on a specific word
        word = Words[i - 1]
        if word not in MergeList:
            continue
        if Low < MergeList[word] < High:
            samples = []
            for k in range(len(Matrixs)):  # focusing on a group
                sample = []
                for j in range(len(Matrixs[k])):  # focusing on all the segments of that group
                    # add the sample into the sample list
                    sample.append(Matrixs[k][j][i])

                # combine all the samples of each sample list;
                # pad and mask the short ones so that all the sample sets have the same length
                samples.append(
                    ma.masked_array(sample + [0] * (Len - len(sample)),
                                    mask=[0] * len(sample) + [1] * (Len - len(sample))))

            # do the KW test
            try:
                pvalue = kruskalwallis(samples)[1]
            except ValueError as error:
                if error.args[0] == 'All numbers are identical in kruskal':  # check the error message
                    pvalue = 'Invalid'
                else:
                    raise

            # put the result in the dict
            word_pvalue_dict.update({word.decode('utf-8'): pvalue})
    return sorted(word_pvalue_dict.items(), key=itemgetter(1))
Example #9
def testgroup(GroupWordLists, option='CustomP', Low=0.0, High=1.0):
    """
    this method takes ChunkWordlist and and then analyze each single word(compare to all the other group),
    and then pack that into the return

    :param GroupWordLists:   Array
                        each element of array represent a chunk, and it is a dictionary type
                        each element in the dictionary maps word inside that chunk to its frequency

    :param option:  some default option to set for High And Low(see the document for High and Low)
                    1. using standard deviation to find outlier
                        TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                        TopIQR: only analyze the Top outlier of word, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outlier of word, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the Left outlier of word, determined by IQR
                                    (median - 1.5 * Standard > word frequency)

    :param Low:  this method will only analyze the word with higher frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')
    :param High: this method will only analyze the word with lower frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')

    :return:    contain a array
                each element of array is a dictionary map a tuple to a list
                    tuple consist of 3 element (group number 1, list number, group number 2)
                        means compare the words in list number of group number 1 to all the word in group number 2
                    the list contain tuples, sorted by p value:
                        tuple means (word, p value)
                        this is word usage of word in group (group number 1), list (list number),
                        compare to the word usage of the same word in group (group number 2)
    """

    # init
    GroupLists = []
    GroupWordCounts = []
    GroupNumWords = []
    for Chunk in GroupWordLists:
        GroupLists.append(merge_list(Chunk))
        GroupWordCounts.append(sum(GroupLists[-1].values()))
        GroupNumWords.append(len(GroupLists[-1]))
    TotalList = merge_list(GroupLists)
    TotalWordCount = sum(GroupWordCounts)
    TotalNumWords = len(TotalList)
    AllResults = {}  # the value to return

    High, Low = wordfilter(option, Low, High, TotalNumWords, TotalWordCount,
                           TotalList)

    # calculation
    for i in range(len(GroupWordLists)):  # individual chunk
        for j in range(len(GroupWordLists)):  # group compare
            if i != j:  # compare each chunk in group i to the merged word list of group j
                wordlistnumber = 0  # the label of the word list in GroupWordLists[i]
                for wordlist in GroupWordLists[i]:  # focusing on a specific word list in group i
                    iTotalWordCount = sum(wordlist.values())
                    for word in wordlist.keys():

                        # handle option
                        if Low < TotalList[word] < High:
                            iWordCount = wordlist[word]
                            iWordProp = iWordCount / iTotalWordCount
                            try:
                                jWordCount = GroupLists[j][word]
                            except KeyError:
                                jWordCount = 0
                            jTotalWordCount = GroupWordCounts[j]
                            jWordProp = jWordCount / jTotalWordCount

                            z_score = ztest(iWordProp, jWordProp,
                                            iTotalWordCount, jTotalWordCount)
                            try:
                                AllResults[(i, wordlistnumber, j)].append(
                                    (word.decode('utf-8'), z_score))
                            except KeyError:
                                AllResults.update({
                                    (i, wordlistnumber, j):
                                    [(word.decode('utf-8'), z_score)]
                                })
                    wordlistnumber += 1
    # sort the output
    for key in AllResults.keys():
        AllResults[key] = sorted(AllResults[key], key=lambda tup: tup[1], reverse=True)
    return AllResults
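
A small usage sketch, assuming merge_list, wordfilter, and ztest are in scope, and byte-string word keys as the .decode('utf-8') calls imply:

groups = [
    [{b'a': 2, b'b': 1}],              # group 0: one chunk
    [{b'a': 1, b'b': 3}, {b'b': 2}],   # group 1: two chunks
]
results = testgroup(groups)
# results[(0, 0, 1)] is the sorted (word, z-score) list for
# chunk 0 of group 0 compared against all of group 1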
Example #10
def testall(WordLists, option='CustomP', Low=0.0, High=None):
    """
    this method takes Wordlist and and then analyze each single word(*compare to the total passage(all the chunks)*),
    and then pack that into the return

    :param WordLists:   Array
                        each element of array represent a chunk, and it is a dictionary type
                        each element in the dictionary maps word inside that chunk to its frequency

    :param option:  some default option to set for High And Low(see the document for High and Low)
                    1. using standard deviation to find outlier
                        TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                        TopIQR: only analyze the Top outlier of word, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outlier of word, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the Left outlier of word, determined by IQR
                                    (median - 1.5 * Standard > word frequency)

    :param Low:  this method will only analyze the word with higher frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')
    :param High: this method will only analyze the word with lower frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')

    :return:    contain a array
                each element of array is a array, represent a chunk and it is sorted via z_score
                each element array is a tuple: (word, corresponding z_score)
    """

    # init
    MergeList = merge_list(WordLists)
    AllResults = []  # the value to return
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)

    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount,
                           MergeList)  # handle option (word filter)

    # calculation
    for wordlist in WordLists:
        ResultList = {}
        ListWordCount = sum(wordlist.values())

        for word in wordlist.keys():
            if Low < MergeList[word] < High:
                z_score = ztest(wordlist[word] / ListWordCount,
                                MergeList[word] / TotalWordCount,
                                ListWordCount, TotalWordCount)
                ResultList.update({word.decode('utf-8'): z_score})

        ResultList = sorted(ResultList.items(),
                            key=itemgetter(1),
                            reverse=True)
        AllResults.append(ResultList)

    return AllResults
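
Usage follows the same pattern; a sketch, again assuming the undefined helpers (merge_list, wordfilter, ztest) are in scope and byte-string keys because of the .decode('utf-8') call:

chunks = [{b'the': 3, b'cat': 1}, {b'the': 2, b'dog': 4}]
for chunk_result in testall(chunks):
    print(chunk_result)  # one sorted list of (word, z-score) tuples per chunk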
Example #11
def KWtest(Matrixs, Words, WordLists, option='CustomP', Low=0.0, High=1.0):
    """
    give the kruskal wallis test result on the topword
    :param Matrixs: every element is a group Matrix that contain the word counts, each represent a segement.
    :param Words: all the words (Matrixs and words are parallel)
    :param WordLists: a list of dictionary that has the word map to its word count.
                        each dictionary represent the information inside a segment
    :param option: some default option to set for High And Low(see the document for High and Low)
                    1. using standard deviation to find outlier
                        TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                        TopIQR: only analyze the Top outlier of word, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outlier of word, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the Left outlier of word, determined by IQR
                                    (median - 1.5 * Standard > word frequency)
    :param Low: this method will only analyze the word with higher frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')
    :param High: this method will only analyze the word with lower frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')

    :return:
          a sorted dict (list of tuples) that the first element of the word and the second element is it corresponding p value
    """
    # begin handle options
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)

    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount, MergeList)
    # end handle options

    Len = max(len(matrix) for matrix in Matrixs)
    # the length of the longest sample set (any shorter sample set is padded and masked to this length)

    word_pvalue_dict = {}  # the result dict

    for i in range(1, len(Matrixs[0][0])):  # focusing on a specific word
        word = Words[i - 1]
        if word not in MergeList:
            continue
        if Low < MergeList[word] < High:
            samples = []
            for k in range(len(Matrixs)):  # focusing on a group
                sample = []
                for j in range(len(Matrixs[k])):  # focusing on all the segments of that group
                    # add the sample into the sample list
                    sample.append(Matrixs[k][j][i])

                # combine all the samples of each sample list;
                # pad and mask the short ones so that all the sample sets have the same length
                samples.append(ma.masked_array(sample + [0] * (Len - len(sample)),
                                               mask=[0] * len(sample) + [1] * (Len - len(sample))))

            # do the KW test
            try:
                pvalue = kruskalwallis(samples)[1]
            except ValueError as error:
                if error.args[0] == 'All numbers are identical in kruskal':  # check the error message
                    pvalue = 'Invalid'
                else:
                    raise

            # put the result in the dict
            word_pvalue_dict.update({word.decode('utf-8'): pvalue})
    return sorted(word_pvalue_dict.items(), key=itemgetter(1))
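
One caveat (it applies to Example #7 as well): pvalue can be the string 'Invalid', and in Python 3 sorted cannot compare strings with floats, so the final sort would raise a TypeError. A sketch of a sort key that keeps numeric p-values first and pushes invalid entries to the end:

def pvalue_sort_key(item):
    word, pvalue = item
    # (False, p) sorts before (True, 0.0), so numeric p-values come first
    return (isinstance(pvalue, str), pvalue if not isinstance(pvalue, str) else 0.0)

# sorted(word_pvalue_dict.items(), key=pvalue_sort_key)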
Example #12
def testgroup(GroupWordLists, option='CustomP', Low=0.0, High=1.0):
    """
    this method takes ChunkWordlist and and then analyze each single word(compare to all the other group),
    and then pack that into the return

    :param GroupWordLists:   Array
                        each element of array represent a chunk, and it is a dictionary type
                        each element in the dictionary maps word inside that chunk to its frequency

    :param option:  some default option to set for High And Low(see the document for High and Low)
                    1. using standard deviation to find outlier
                        TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                        TopIQR: only analyze the Top outlier of word, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outlier of word, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the Left outlier of word, determined by IQR
                                    (median - 1.5 * Standard > word frequency)

    :param Low:  this method will only analyze the word with higher frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')
    :param High: this method will only analyze the word with lower frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')

    :return:    contain a array
                each element of array is a dictionary map a tuple to a list
                    tuple consist of 3 element (group number 1, list number, group number 2)
                        means compare the words in list number of group number 1 to all the word in group number 2
                    the list contain tuples, sorted by p value:
                        tuple means (word, p value)
                        this is word usage of word in group (group number 1), list (list number),
                        compare to the word usage of the same word in group (group number 2)
    """

    # init
    GroupLists = []
    GroupWordCounts = []
    GroupNumWords = []
    for Chunk in GroupWordLists:
        GroupLists.append(merge_list(Chunk))
        GroupWordCounts.append(sum(GroupLists[-1].values()))
        GroupNumWords.append(len(GroupLists[-1]))
    TotalList = merge_list(GroupLists)
    TotalWordCount = sum(GroupWordCounts)
    TotalNumWords = len(TotalList)
    AllResults = {}  # the value to return

    High, Low = wordfilter(option, Low, High, TotalNumWords, TotalWordCount, TotalList)

    # calculation
    for i in range(len(GroupWordLists)):  # individual chunk
        for j in range(len(GroupWordLists)):  # group compare
            if i != j:  # compare each chunk in group i to the merged word list of group j
                wordlistnumber = 0  # the label of the word list in GroupWordLists[i]
                for wordlist in GroupWordLists[i]:  # focusing on a specific word list in group i
                    iTotalWordCount = sum(wordlist.values())
                    for word in wordlist.keys():

                        # handle option
                        if Low < TotalList[word] < High:
                            iWordCount = wordlist[word]
                            iWordProp = iWordCount / iTotalWordCount
                            try:
                                jWordCount = GroupLists[j][word]
                            except KeyError:
                                jWordCount = 0
                            jTotalWordCount = GroupWordCounts[j]
                            jWordProp = jWordCount / jTotalWordCount

                            z_score = ztest(iWordProp, jWordProp, iTotalWordCount, jTotalWordCount)
                            try:
                                AllResults[(i, wordlistnumber, j)].append((word.decode('utf-8'), z_score))
                            except KeyError:
                                AllResults.update({(i, wordlistnumber, j): [(word.decode('utf-8'), z_score)]})
                    wordlistnumber += 1
    # sort the output
    for key in AllResults.keys():
        AllResults[key] = sorted(AllResults[key], key=lambda tup: tup[1], reverse=True)
    return AllResults
Example #13
def test_group_to_group(group_para_lists,
                        option='CustomP',
                        low=0.0,
                        high=None):
    """
    this function will give you the result of all the group compare with each other groups

    Args:
        group_para_lists: a list, each element of the list is list, each lists represent a group.
                            each element in the group list is a dictionary, map a word to a word count
                            each dictionary represent a paragraph(chunk) in the corresponding group
        option: some default option to set for High And Low(see the document for High and Low)
                1. using standard deviation to find outlier
                    TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                (word frequency > average + 2 * Standard_Deviation)
                    MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                    LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                (average - 2 * Standard_Deviation > word frequency)

                2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                    TopIQR: only analyze the Top outlier of word, determined by IQR
                                (word frequency > median + 1.5 * Standard)
                    MidIQR: only analyze the non-outlier of word, determined by IQR
                                (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                    LowIQR: only analyze the Left outlier of word, determined by IQR
                                (median - 1.5 * Standard > word frequency)
        low:  this method will only analyze the word with higher frequency than this value
                (this value is the lower bound of the word being analyzed)
                (this parameter will be overwritten if the option is not 'Custom')
        high: this method will only analyze the word with lower frequency than this value
                (this value is the upper bound of the word being analyzed)
                (this parameter will be overwritten if the option is not 'Custom')

    Returns:
        a dictionary of a tuple mapped to a list:
        tuple: the tuple consist of two elements:
                the two index is two groups to compare.
        list: a list of tuples represent the comparison result of the two index that :
                first element in the tuple is a string, representing a word
                second element is a float representing the corresponding z-score you get when you compare the word
                in the two different paragraph (the index is represented in the in the first tuple we talked about)

    """
    # init
    group_word_lists = []  # the word list of each group (word to word count within the whole group)
    group_word_count = []  # the total word count of each group
    for chunk in group_para_lists:
        group_word_lists.append(merge_list(chunk))
        group_word_count.append(sum(group_word_lists[-1].values()))
    # the word list of the corpus (each word maps to the sum of its counts across all the groups)
    corpus_list = merge_list(group_word_lists)
    total_word_count = sum(group_word_count)  # the total word count of the corpus
    # the number of unique words
    # example: 'the a ha the' has 3 unique words: 'the', 'a', and 'ha'
    total_num_words = len(corpus_list)
    num_group = len(group_word_lists)  # number of groups
    all_results = {}

    high, low = __word_filter__(option, low, high, total_num_words,
                                total_word_count, corpus_list)

    # comparison map: a list of tuples.
    # each tuple holds two group indices (for example, the first group has index 0)
    # i_index has to be smaller than j_index to avoid repetition
    comp_map = itertools.product(range(num_group), range(num_group))
    comp_map = [(i_index, j_index) for (i_index, j_index) in comp_map
                if i_index < j_index]

    for group_comp_index, group_base_index in comp_map:
        group_comp_list = group_word_lists[group_comp_index]
        group_base_list = group_word_lists[group_base_index]
        word_z_score_dict = __z_test_word_list__(word_list_i=group_comp_list,
                                                 word_list_j=group_base_list,
                                                 corpus_list=corpus_list,
                                                 high=high,
                                                 low=low)
        # sort the dictionary
        sorted_word_zscore_tuple_list = sorted(word_z_score_dict.items(),
                                               key=operator.itemgetter(1),
                                               reverse=True)
        # store the sorted result under its comparison key
        all_results.update({
            (group_comp_index, group_base_index):
            sorted_word_zscore_tuple_list
        })

    return all_results
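
The product-then-filter idiom builds the unordered pairs of group indices; itertools.combinations expresses the same thing directly:

import itertools

num_group = 3
comp_map = [(i, j) for i, j in itertools.product(range(num_group), range(num_group)) if i < j]
assert comp_map == list(itertools.combinations(range(num_group), 2))
print(comp_map)  # [(0, 1), (0, 2), (1, 2)]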
Example #14
def test_para_to_group(group_para_lists, option='CustomP', low=0.0, high=1.0):
    """
    this method analyze each single word(compare to all the other group),
    and then pack that into the return

    :param group_para_lists:   Array
                        each element of array represent a chunk, and it is a dictionary type
                        each element in the dictionary maps word inside that chunk to its frequency

    :param option:  some default option to set for High And Low(see the document for High and Low)
                    1. using standard deviation to find outlier
                        TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                        TopIQR: only analyze the Top outlier of word, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outlier of word, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the Left outlier of word, determined by IQR
                                    (median - 1.5 * Standard > word frequency)

    :param low:  this method will only analyze the word with higher frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')
    :param high: this method will only analyze the word with lower frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')

    :return:    contain a array
                each element of array is a dictionary map a tuple to a list
                    tuple consist of 3 element (group number 1, list number, group number 2)
                        means compare the words in list number of group number 1 to all the word in group number 2
                    the list contain tuples, sorted by p value:
                        tuple means (word, p value)
                        this is word usage of word in group (group number 1), list (list number),
                        compare to the word usage of the same word in group (group number 2)
    """

    # init
    group_lists = []  # the word list of each group (word to word count within the whole group)
    group_word_count = []  # the total word count of each group
    group_num_words = []  # a list of number of unique words in each group
    for chunk in group_para_lists:
        group_lists.append(merge_list(chunk))
        group_word_count.append(sum(group_lists[-1].values()))
        group_num_words.append(len(group_lists[-1]))
    corpus_list = merge_list(group_lists)
    total_word_count = sum(group_word_count)
    total_num_words = len(corpus_list)
    num_group = len(group_lists)  # number of groups
    all_results = {}  # the value to return

    high, low = __word_filter__(option, low, high, total_num_words,
                                total_word_count, corpus_list)

    # calculation

    # comparison map: a list of tuples.
    # each tuple holds two group indices (for example, the first group has index 0)
    # the two group indices cannot be equal
    comp_map = itertools.product(range(num_group), range(num_group))
    comp_map = [(i_index, j_index) for (i_index, j_index) in comp_map
                if i_index != j_index]

    # compare each paragraph in group_comp to group_base (group comp means group for comparison)
    for group_comp_index, group_base_index in comp_map:

        # all the paragraphs of the comparison group, as an array
        group_comp_paras = group_para_lists[group_comp_index]
        # the word list of base group
        group_base_list = group_lists[group_base_index]

        # enumerate through all the paragraphs in group_comp_paras
        for para_index, paras in enumerate(group_comp_paras):
            word_z_score_dict = __z_test_word_list__(
                word_list_i=paras,
                word_list_j=group_base_list,
                corpus_list=corpus_list,
                high=high,
                low=low)
            # sort the dictionary
            sorted_word_zscore_tuple_list = sorted(word_z_score_dict.items(),
                                                   key=operator.itemgetter(1),
                                                   reverse=True)
            # store the sorted result under its comparison key
            all_results.update({
                (group_comp_index, para_index, group_base_index):
                sorted_word_zscore_tuple_list
            })

    return all_results
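
Here the pairs are ordered (every group is compared against every other group in both directions), which is exactly itertools.permutations:

import itertools

num_group = 3
comp_map = [(i, j) for i, j in itertools.product(range(num_group), range(num_group)) if i != j]
assert comp_map == list(itertools.permutations(range(num_group), 2))
print(comp_map)  # [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]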
Example #15
def test_group_to_group(group_para_lists, option='CustomP', low=0.0, high=None):
    """

    All paragraphs are really references to documents. The UI has been updated to "documents" but all the variables
    below still use paragraphs.

    this function will give you the result of all the group compare with each other groups

    Args:
        group_para_lists: a list, each element of the list is list, each lists represent a group.
                            each element in the group list is a dictionary, map a word to a word count
                            each dictionary represent a paragraph(chunk) in the corresponding group
        option: some default option to set for High And Low(see the document for High and Low)
                1. using standard deviation to find outlier
                    TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                (word frequency > average + 2 * Standard_Deviation)
                    MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                    LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                (average - 2 * Standard_Deviation > word frequency)

                2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                    TopIQR: only analyze the Top outlier of word, determined by IQR
                                (word frequency > median + 1.5 * Standard)
                    MidIQR: only analyze the non-outlier of word, determined by IQR
                                (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                    LowIQR: only analyze the Left outlier of word, determined by IQR
                                (median - 1.5 * Standard > word frequency)
        low:  this method will only analyze the word with higher frequency than this value
                (this value is the lower bound of the word being analyzed)
                (this parameter will be overwritten if the option is not 'Custom')
        high: this method will only analyze the word with lower frequency than this value
                (this value is the upper bound of the word being analyzed)
                (this parameter will be overwritten if the option is not 'Custom')

    Returns:
        a dictionary of a tuple mapped to a list:
        tuple: the tuple consist of two elements:
                the two index is two groups to compare.
        list: a list of tuples represent the comparison result of the two index that :
                first element in the tuple is a string, representing a word
                second element is a float representing the corresponding z-score you get when you compare the word
                in the two different paragraph (the index is represented in the in the first tuple we talked about)

    """
    # init
    group_word_lists = []  # group list is the word list of each group (word to word count within the whole group)
    group_word_count = []  # the total word count of each group
    for chunk in group_para_lists:
        group_word_lists.append(merge_list(chunk))
        group_word_count.append(sum(group_word_lists[-1].values()))
    # the word list of the corpus (each word maps to the sum of its counts across all the groups)
    corpus_list = merge_list(group_word_lists)
    total_word_count = sum(group_word_count)  # the total word count of the corpus
    # the number of unique words
    # example: 'the a ha the' has 3 unique words: 'the', 'a', and 'ha'
    total_num_words = len(corpus_list)
    num_group = len(group_word_lists)  # number of groups
    all_results = {}

    high, low = __word_filter__(option, low, high, total_num_words, total_word_count, corpus_list)

    # comparison map: a list of tuples.
    # each tuple holds two group indices (for example, the first group has index 0)
    # i_index has to be smaller than j_index to avoid repetition
    comp_map = itertools.product(range(num_group), range(num_group))
    comp_map = [(i_index, j_index) for (i_index, j_index) in comp_map if i_index < j_index]

    for group_comp_index, group_base_index in comp_map:
        group_comp_list = group_word_lists[group_comp_index]
        group_base_list = group_word_lists[group_base_index]
        word_z_score_dict = __z_test_word_list__(word_list_i=group_comp_list, word_list_j=group_base_list,
                                                 corpus_list=corpus_list, high=high, low=low)
        # sort the dictionary
        sorted_word_zscore_tuple_list = sorted(word_z_score_dict.items(), key=lambda item: abs(item[1]), reverse=True)
        # store the sorted result under its comparison key
        all_results.update({(group_comp_index, group_base_index): sorted_word_zscore_tuple_list})

    return all_results
Example #16
def test_para_to_group(group_para_lists, option='CustomP', low=0.0, high=1.0):
    """

    All paragraphs are really references to documents. The UI has been updated to "documents" but all the variables
    below still use paragraphs.

    this method analyze each single word(compare to all the other group),
    and then pack that into the return

    :param group_para_lists:   Array
                        each element of array represent a chunk, and it is a dictionary type
                        each element in the dictionary maps word inside that chunk to its frequency

    :param option:  some default option to set for High And Low(see the document for High and Low)
                    1. using standard deviation to find outlier
                        TopStdE: only analyze the Right outlier of word, determined by standard deviation
                                    (word frequency > average + 2 * Standard_Deviation)
                        MidStdE: only analyze the Non-Outlier of word, determined by standard deviation
                                    (average + 2 * Standard_Deviation > word frequency > average - 2 * Standard_Deviation)
                        LowStdE: only analyze the Left Outlier of word, determined by standard deviation
                                    (average - 2 * Standard_Deviation > word frequency)

                    2. using IQR to find outlier *THIS METHOD DO NOT WORK WELL, BECAUSE THE DATA USUALLY ARE HIGHLY SKEWED*
                        TopIQR: only analyze the Top outlier of word, determined by IQR
                                    (word frequency > median + 1.5 * Standard)
                        MidIQR: only analyze the non-outlier of word, determined by IQR
                                    (median + 1.5 * Standard > word frequency > median - 1.5 * Standard)
                        LowIQR: only analyze the Left outlier of word, determined by IQR
                                    (median - 1.5 * Standard > word frequency)

    :param low:  this method will only analyze the word with higher frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')
    :param high: this method will only analyze the word with lower frequency than this value
                    (this parameter will be overwritten if the option is not 'Custom')

    :return:    contain a array
                each element of array is a dictionary map a tuple to a list
                    tuple consist of 3 element (group number 1, list number, group number 2)
                        means compare the words in list number of group number 1 to all the word in group number 2
                    the list contain tuples, sorted by p value:
                        tuple means (word, p value)
                        this is word usage of word in group (group number 1), list (list number),
                        compare to the word usage of the same word in group (group number 2)
    """

    # init
    group_lists = []  # group list is the word list of each group (word to word count within the whole group)
    group_word_count = []  # the total word count of each group
    group_num_words = []  # a list of number of unique words in each group
    for chunk in group_para_lists:
        group_lists.append(merge_list(chunk))
        group_word_count.append(sum(group_lists[-1].values()))
        group_num_words.append(len(group_lists[-1]))
    corpus_list = merge_list(group_lists)
    total_word_count = sum(group_word_count)
    total_num_words = len(corpus_list)
    num_group = len(group_lists)  # number of groups
    all_results = {}  # the value to return

    high, low = __word_filter__(option, low, high, total_num_words, total_word_count, corpus_list)

    # calculation

    # comparison map: a list of tuples.
    # each tuple holds two group indices (for example, the first group has index 0)
    # the two group indices cannot be equal
    comp_map = itertools.product(range(num_group), range(num_group))
    comp_map = [(i_index, j_index) for (i_index, j_index) in comp_map if i_index != j_index]

    # compare each paragraph in group_comp to group_base (group comp means group for comparison)
    for group_comp_index, group_base_index in comp_map:

        # all the paragraphs of the comparison group, as an array
        group_comp_paras = group_para_lists[group_comp_index]
        # the word list of base group
        group_base_list = group_lists[group_base_index]

        # enumerate through all the paragraphs in group_comp_paras
        for para_index, paras in enumerate(group_comp_paras):
            word_z_score_dict = __z_test_word_list__(word_list_i=paras, word_list_j=group_base_list,
                                                     corpus_list=corpus_list, high=high, low=low)
            # sort the dictionary
            sorted_word_zscore_tuple_list = sorted(word_z_score_dict.items(), key=lambda item: abs(item[1]), reverse=True)
            # store the sorted result under its comparison key
            all_results.update({(group_comp_index, para_index, group_base_index): sorted_word_zscore_tuple_list})

    return all_results
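
A usage sketch (assuming merge_list, __word_filter__, and __z_test_word_list__ are in scope, with byte-string keys because __z_test_word_list__ decodes them); the three-part key identifies which paragraph of which group was compared against which base group:

groups = [
    [{b'a': 2, b'b': 1}, {b'a': 1}],  # group 0: two paragraphs
    [{b'a': 1, b'b': 3}],             # group 1: one paragraph
]
results = test_para_to_group(groups)
# results[(0, 1, 1)] holds the sorted (word, z-score) list for
# paragraph 1 of group 0 compared against all of group 1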