def __z_test_word_list__(word_list_i, word_list_j, corpus_list, high, low):
    # type: (dict, dict, dict, float, float) -> dict
    """Run a z-test on every word that appears in either word list.

    Args:
        word_list_i: the first word list, a dictionary mapping word to count
        word_list_j: the second word list, a dictionary mapping word to count
        corpus_list: word counts over the whole corpus, used for filtering
        high: only test words whose corpus count is strictly below this
        low: only test words whose corpus count is strictly above this

    Returns:
        a dictionary mapping each tested word to its z-score
    """
    total_count_i = sum(word_list_i.values())
    total_count_j = sum(word_list_j.values())
    total_list = merge_list([word_list_j, word_list_i])
    word_z_score_dict = {}
    for word in total_list:
        if low < corpus_list[word] < high:  # taking care of the word filter
            # a word missing from one list contributes a proportion of 0
            p_i = word_list_i.get(word, 0) / total_count_i
            p_j = word_list_j.get(word, 0) / total_count_j
            z_score = __z_test__(p_i, p_j, total_count_i, total_count_j)
            word_z_score_dict.update({word.decode('utf-8'): z_score})
    return word_z_score_dict
def test_all_to_para(word_lists, option='CustomP', low=0.0, high=None):
    """Z-test every word of every chunk against the whole passage.

    Each chunk's words are compared to the merged word counts of all chunks,
    honoring the high/low word filter selected by ``option``.

    :param word_lists: array of dictionaries, one per chunk, each mapping a
        word inside that chunk to its frequency
    :param option: preset for High and Low (see the word filter document):
        'TopStdE', 'MidStdE', 'LowStdE' use the standard deviation to pick
        right / non- / left outliers; 'TopIQR', 'MidIQR', 'LowIQR' use the
        IQR (works poorly on highly skewed data); otherwise custom bounds
    :param low: only analyze words with frequency above this value
        (overwritten unless the option is 'Custom')
    :param high: only analyze words with frequency below this value
        (overwritten unless the option is 'Custom')
    :return: an array with one element per chunk; each element is a list of
        (word, z_score) tuples sorted by z_score in descending order
    """
    # init: merge all chunks into one corpus-wide word list
    corpus_list = merge_list(word_lists)
    total_word_count = sum(corpus_list.values())
    num_word = len(corpus_list)

    # handle option (word filter)
    high, low = __word_filter__(option, low, high, num_word,
                                total_word_count, corpus_list)

    # calculation: one sorted result list per chunk
    all_results = []
    for word_list in word_lists:
        scores = __z_test_word_list__(word_list_i=word_list,
                                      word_list_j=corpus_list,
                                      corpus_list=corpus_list,
                                      high=high, low=low)
        all_results.append(
            sorted(scores.items(), key=itemgetter(1), reverse=True))
    return all_results
def testall(WordLists, option="CustomP", Low=0.0, High=None):
    """Z-test every word of every chunk against the whole passage.

    :param WordLists: array of dictionaries, one per chunk, each mapping a
        word inside that chunk to its frequency
    :param option: preset for High and Low (see the word filter document):
        'TopStdE', 'MidStdE', 'LowStdE' use the standard deviation to pick
        right / non- / left outliers; 'TopIQR', 'MidIQR', 'LowIQR' use the
        IQR (works poorly on highly skewed data); otherwise custom bounds
    :param Low: only analyze words with frequency above this value
        (overwritten unless the option is 'Custom')
    :param High: only analyze words with frequency below this value
        (overwritten unless the option is 'Custom')
    :return: one list per chunk of (word, z_score) tuples, sorted by
        z_score in descending order
    """
    # init
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)
    # handle option (word filter)
    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount,
                           MergeList)

    AllResults = []  # the value to return
    for wordlist in WordLists:
        ListWordCount = sum(wordlist.values())
        scores = {}
        for word in wordlist:
            if not (Low < MergeList[word] < High):
                continue  # word filtered out
            scores[word] = ztest(wordlist[word] / ListWordCount,
                                 MergeList[word] / TotalWordCount,
                                 ListWordCount, TotalWordCount)
        AllResults.append(
            sorted(scores.items(), key=itemgetter(1), reverse=True))
    return AllResults
def __z_test_word_list__(word_list_i, word_list_j, corpus_list, high, low):
    # type: (dict, dict, dict, float, float) -> dict
    """Z-test every word of two word lists, keeping significant results.

    Args:
        word_list_i: the first word list, a dictionary mapping word to count
        word_list_j: the second word list, a dictionary mapping word to count
        corpus_list: word counts over the whole corpus, used for filtering
        high: only test words whose corpus count is strictly below this
        low: only test words whose corpus count is strictly above this

    Returns:
        a dictionary mapping each significant word (|z| >= 1.96) to its
        z-score, truncated to 4 decimal places
    """
    total_count_i = sum(word_list_i.values())
    total_count_j = sum(word_list_j.values())
    total_list = merge_list([word_list_j, word_list_i])
    word_z_score_dict = {}
    for word in total_list:
        if low < corpus_list[word] < high:  # taking care of the word filter
            # a word missing from one list contributes a proportion of 0
            p_i = word_list_i.get(word, 0) / total_count_i
            p_j = word_list_j.get(word, 0) / total_count_j
            # keep 4 digits after the decimal point of z_score
            z_score = truncate(
                __z_test__(p_i, p_j, total_count_i, total_count_j), 4)
            # get rid of the insignificant results, insignificant means
            # those with absolute values smaller than 1.96
            if abs(z_score) >= 1.96:
                word_z_score_dict.update({word.decode('utf-8'): z_score})
    return word_z_score_dict
def __z_test_word_list__(word_list_i, word_list_j, corpus_list, high, low):
    # type: (dict, dict) -> dict
    """Take two word lists and z-test all the words they contain.

    Args:
        word_list_i: the first word list, a dictionary mapping word to count
        word_list_j: the second word list, a dictionary mapping word to count

    Returns:
        a dictionary mapping each significant word to its z-score
    """
    total_count_i = sum(word_list_i.values())
    total_count_j = sum(word_list_j.values())
    total_list = merge_list([word_list_j, word_list_i])

    word_z_score_dict = {}
    for word in total_list:
        # apply the word filter
        if not (low < corpus_list[word] < high):
            continue
        try:
            p_i = word_list_i[word] / total_count_i
        except KeyError:
            p_i = 0
        try:
            p_j = word_list_j[word] / total_count_j
        except KeyError:
            p_j = 0
        # truncate the z-score to 4 digits after the decimal point
        raw_score = __z_test__(p_i, p_j, total_count_i, total_count_j)
        z_score = truncate(raw_score, 4)
        # only keep significant results: |z| >= 1.96
        if abs(z_score) >= 1.96:
            word_z_score_dict[word.decode('utf-8')] = z_score
    return word_z_score_dict
def test_all_to_para(word_lists, option='CustomP', low=0.0, high=None):
    """Z-test every word of every document against the whole corpus.

    All paragraphs are really references to documents. The UI has been
    updated to "documents" but the variables below still say paragraphs.

    :param word_lists: array of dictionaries, one per chunk, each mapping a
        word inside that chunk to its frequency
    :param option: preset for High and Low (see the word filter document):
        'TopStdE', 'MidStdE', 'LowStdE' use the standard deviation to pick
        right / non- / left outliers; 'TopIQR', 'MidIQR', 'LowIQR' use the
        IQR (works poorly on highly skewed data); otherwise custom bounds
    :param low: only analyze words with frequency above this value
        (overwritten unless the option is 'Custom')
    :param high: only analyze words with frequency below this value
        (overwritten unless the option is 'Custom')
    :return: an array with one element per chunk; each element is a list of
        (word, z_score) tuples sorted by the magnitude of z_score,
        descending
    """
    # init: merge all chunks into one corpus-wide word list
    corpus_list = merge_list(word_lists)
    total_word_count = sum(corpus_list.values())
    num_word = len(corpus_list)

    # handle option (word filter)
    high, low = __word_filter__(option, low, high, num_word,
                                total_word_count, corpus_list)

    # calculation: one sorted result list per chunk
    all_results = []
    for word_list in word_lists:
        scores = __z_test_word_list__(word_list_i=word_list,
                                      word_list_j=corpus_list,
                                      corpus_list=corpus_list,
                                      high=high, low=low)
        # rank by absolute z-score so strong negatives surface too
        ranked = sorted(scores.items(),
                        key=lambda item: abs(item[1]),
                        reverse=True)
        all_results.append(ranked)
    return all_results
def KWtest(Matrixs, Words, WordLists, option="CustomP", Low=0.0, High=1.0):
    """Run the Kruskal-Wallis test on every word across the given groups.

    :param Matrixs: one count matrix per group; each row represents a
        segment, each column (from index 1) a word
    :param Words: all the words (parallel with the matrix columns)
    :param WordLists: a list of dictionaries mapping word to word count,
        one per segment
    :param option: preset for High and Low (see wordfilter)
    :param Low: only analyze words with frequency above this value
        (overwritten unless the option is 'Custom')
    :param High: only analyze words with frequency below this value
        (overwritten unless the option is 'Custom')
    :return: a list of (word, pvalue) tuples sorted by p-value
    """
    # begin handle options
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)
    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount,
                           MergeList)
    # end handle options

    # the length of the longest sample set: all shorter sample sets are
    # padded and masked up to this length
    Len = max(len(matrix) for matrix in Matrixs)
    word_pvalue_dict = {}  # the result list
    for i in range(1, len(Matrixs[0][0])):  # focusing on a specific word
        word = Words[i - 1]
        if word not in MergeList:
            # guard against words absent from the merged list; without this
            # the lookup below raises KeyError
            continue
        if Low < MergeList[word] < High:
            samples = []
            for k in range(len(Matrixs)):  # focusing on a group
                # collect this word's count in every segment of the group
                sample = [Matrixs[k][j][i] for j in range(len(Matrixs[k]))]
                # pad with zeros and mask the padding so every sample set
                # has the same length
                samples.append(
                    ma.masked_array(
                        sample + [0] * (Len - len(sample)),
                        mask=[0] * len(sample) + [1] * (Len - len(sample))))
            # do the KW test
            try:
                pvalue = kruskalwallis(samples)[1]
            except ValueError as error:
                if error.args[0] == "All numbers are identical in kruskal":
                    pvalue = "Invalid"
                else:
                    raise  # re-raise unchanged, preserving the traceback
            # put the result in the dict
            word_pvalue_dict.update({word: pvalue})
    return sorted(word_pvalue_dict.items(), key=itemgetter(1))
def KWtest(Matrixs, Words, WordLists, option='CustomP', Low=0.0, High=1.0):
    """Give the Kruskal-Wallis test result on the top words.

    :param Matrixs: every element is a group matrix that contains the word
        counts; each row represents a segment
    :param Words: all the words (Matrixs and Words are parallel)
    :param WordLists: a list of dictionaries mapping word to word count,
        one per segment
    :param option: preset for High and Low (see the word filter document)
    :param Low: only analyze words with frequency above this value
        (overwritten unless the option is 'Custom')
    :param High: only analyze words with frequency below this value
        (overwritten unless the option is 'Custom')
    :return: a sorted list of (word, p value) tuples
    """
    # begin handle options
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)
    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount,
                           MergeList)
    # end handle options

    # the length of the longest sample set; shorter sets get masked padding
    Len = max(len(matrix) for matrix in Matrixs)
    word_pvalue_dict = {}  # the result list
    for i in range(1, len(Matrixs[0][0])):  # focusing on a specific word
        word = Words[i - 1]
        if word not in MergeList:
            continue  # word missing from the merged list: skip it
        if not (Low < MergeList[word] < High):
            continue  # filtered out
        samples = []
        for group in Matrixs:  # focusing on a group
            # this word's count in every segment of the group
            sample = [row[i] for row in group]
            pad = Len - len(sample)
            # mask the zero padding so all sample sets share one length
            samples.append(ma.masked_array(sample + [0] * pad,
                                           mask=[0] * len(sample) + [1] * pad))
        # do the KW test
        try:
            pvalue = kruskalwallis(samples)[1]
        except ValueError as error:
            # get the argument of the error
            if error.args[0] == 'All numbers are identical in kruskal':
                pvalue = 'Invalid'
            else:
                raise ValueError(error)
        # put the result in the dict
        word_pvalue_dict[word.decode('utf-8')] = pvalue
    return sorted(word_pvalue_dict.items(), key=itemgetter(1))
def testgroup(GroupWordLists, option='CustomP', Low=0.0, High=1.0):
    """Compare the word usage of every chunk against every other group.

    :param GroupWordLists: array of groups; each group is a list of
        dictionaries mapping a word inside that chunk to its frequency
    :param option: preset for High and Low (see the word filter document):
        'TopStdE', 'MidStdE', 'LowStdE' use the standard deviation to pick
        right / non- / left outliers; 'TopIQR', 'MidIQR', 'LowIQR' use the
        IQR (works poorly on highly skewed data); otherwise custom bounds
    :param Low: only analyze words with frequency above this value
        (overwritten unless the option is 'Custom')
    :param High: only analyze words with frequency below this value
        (overwritten unless the option is 'Custom')
    :return: a dictionary mapping a (group number 1, list number,
        group number 2) tuple to a list of (word, z_score) tuples sorted by
        z_score descending: the word usage of the given list in group 1
        compared to the usage of the same words in group 2
    """
    # init: per-group merged word lists, word counts, vocabulary sizes
    GroupLists = []
    GroupWordCounts = []
    GroupNumWords = []
    for Chunk in GroupWordLists:
        GroupLists.append(merge_list(Chunk))
        GroupWordCounts.append(sum(GroupLists[-1].values()))
        GroupNumWords.append(len(GroupLists[-1]))
    TotalList = merge_list(GroupLists)
    TotalWordCount = sum(GroupWordCounts)
    TotalNumWords = len(TotalList)
    AllResults = {}  # the value to return
    High, Low = wordfilter(option, Low, High, TotalNumWords, TotalWordCount,
                           TotalList)
    # calculation
    for i in range(len(GroupWordLists)):  # individual chunk
        for j in range(len(GroupWordLists)):  # group to compare against
            if i == j:
                continue
            # each chunk in word list i, compared to every word in group j
            wordlistnumber = 0  # the label of the word list in GroupWordLists[i]
            for wordlist in GroupWordLists[i]:
                iTotalWordCount = sum(wordlist.values())
                for word in wordlist.keys():
                    # handle option (word filter)
                    if not (Low < TotalList[word] < High):
                        continue
                    iWordProp = wordlist[word] / iTotalWordCount
                    # a word absent from group j counts as zero occurrences
                    jWordCount = GroupLists[j].get(word, 0)
                    jTotalWordCount = GroupWordCounts[j]
                    jWordProp = jWordCount / jTotalWordCount
                    z_score = ztest(iWordProp, jWordProp,
                                    iTotalWordCount, jTotalWordCount)
                    # setdefault replaces the old bare ``except`` which
                    # could hide unrelated errors
                    AllResults.setdefault((i, wordlistnumber, j), []).append(
                        (word.decode('utf-8'), z_score))
                wordlistnumber += 1
    # sort each comparison's result by z-score, descending
    # (avoid shadowing the builtins ``tuple`` and ``list``)
    for key in AllResults.keys():
        AllResults[key] = sorted(AllResults[key],
                                 key=lambda tup: tup[1], reverse=True)
    return AllResults
def testall(WordLists, option='CustomP', Low=0.0, High=None):
    """Z-test every word of every chunk against the whole passage.

    :param WordLists: array of dictionaries, one per chunk, each mapping a
        word inside that chunk to its frequency
    :param option: preset for High and Low (see the word filter document):
        'TopStdE', 'MidStdE', 'LowStdE' use the standard deviation to pick
        right / non- / left outliers; 'TopIQR', 'MidIQR', 'LowIQR' use the
        IQR (works poorly on highly skewed data); otherwise custom bounds
    :param Low: only analyze words with frequency above this value
        (overwritten unless the option is 'Custom')
    :param High: only analyze words with frequency below this value
        (overwritten unless the option is 'Custom')
    :return: one list per chunk of (word, z_score) tuples, sorted by
        z_score in descending order
    """
    # init
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)
    # handle option (word filter)
    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount,
                           MergeList)

    AllResults = []  # the value to return
    for wordlist in WordLists:
        ListWordCount = sum(wordlist.values())
        # z-test every word of this chunk that survives the filter
        scores = {
            word.decode('utf-8'): ztest(wordlist[word] / ListWordCount,
                                        MergeList[word] / TotalWordCount,
                                        ListWordCount, TotalWordCount)
            for word in wordlist if Low < MergeList[word] < High
        }
        AllResults.append(
            sorted(scores.items(), key=itemgetter(1), reverse=True))
    return AllResults
def KWtest(Matrixs, Words, WordLists, option='CustomP', Low=0.0, High=1.0):
    """Give the Kruskal-Wallis test result on the top words.

    :param Matrixs: every element is a group matrix that contains the word
        counts; each row represents a segment
    :param Words: all the words (Matrixs and Words are parallel)
    :param WordLists: a list of dictionaries mapping word to word count,
        one per segment
    :param option: preset for High and Low (see the word filter document)
    :param Low: only analyze words with frequency above this value
        (overwritten unless the option is 'Custom')
    :param High: only analyze words with frequency below this value
        (overwritten unless the option is 'Custom')
    :return: a sorted list of (word, p value) tuples
    """
    # begin handle options
    MergeList = merge_list(WordLists)
    TotalWordCount = sum(MergeList.values())
    NumWord = len(MergeList)
    High, Low = wordfilter(option, Low, High, NumWord, TotalWordCount,
                           MergeList)
    # end handle options

    # longest sample set: shorter ones are padded with masked zeros
    Len = max(len(matrix) for matrix in Matrixs)
    word_pvalue_dict = {}  # the result list
    num_columns = len(Matrixs[0][0])
    for i in range(1, num_columns):  # focusing on a specific word
        word = Words[i - 1]
        try:
            MergeList[word]
        except KeyError:
            continue  # word missing from the merged list
        if not (Low < MergeList[word] < High):
            continue  # filtered out
        samples = []
        for group in Matrixs:  # focusing on a group
            # gather the word's count in each segment of the group
            sample = [segment_row[i] for segment_row in group]
            pad = Len - len(sample)
            # mask the padding so all sample sets share the same length
            masked = ma.masked_array(sample + [0] * pad,
                                     mask=[0] * len(sample) + [1] * pad)
            samples.append(masked)
        # do the KW test
        try:
            pvalue = kruskalwallis(samples)[1]
        except ValueError as error:
            # get the argument of the error
            if error.args[0] == 'All numbers are identical in kruskal':
                pvalue = 'Invalid'
            else:
                raise ValueError(error)
        # put the result in the dict
        word_pvalue_dict[word.decode('utf-8')] = pvalue
    return sorted(word_pvalue_dict.items(), key=itemgetter(1))
def testgroup(GroupWordLists, option='CustomP', Low=0.0, High=1.0):
    """Compare the word usage of every chunk against every other group.

    :param GroupWordLists: array of groups; each group is a list of
        dictionaries mapping a word inside that chunk to its frequency
    :param option: preset for High and Low (see the word filter document):
        'TopStdE', 'MidStdE', 'LowStdE' use the standard deviation to pick
        right / non- / left outliers; 'TopIQR', 'MidIQR', 'LowIQR' use the
        IQR (works poorly on highly skewed data); otherwise custom bounds
    :param Low: only analyze words with frequency above this value
        (overwritten unless the option is 'Custom')
    :param High: only analyze words with frequency below this value
        (overwritten unless the option is 'Custom')
    :return: a dictionary mapping a (group number 1, list number,
        group number 2) tuple to a list of (word, z_score) tuples sorted by
        z_score descending: the word usage of the given list in group 1
        compared to the usage of the same words in group 2
    """
    # init: per-group merged word lists, word counts, vocabulary sizes
    GroupLists = []
    GroupWordCounts = []
    GroupNumWords = []
    for Chunk in GroupWordLists:
        GroupLists.append(merge_list(Chunk))
        GroupWordCounts.append(sum(GroupLists[-1].values()))
        GroupNumWords.append(len(GroupLists[-1]))
    TotalList = merge_list(GroupLists)
    TotalWordCount = sum(GroupWordCounts)
    TotalNumWords = len(TotalList)
    AllResults = {}  # the value to return
    High, Low = wordfilter(option, Low, High, TotalNumWords, TotalWordCount,
                           TotalList)
    # calculation
    for i in range(len(GroupWordLists)):  # individual chunk
        for j in range(len(GroupWordLists)):  # group to compare against
            if i == j:
                continue
            # each chunk in word list i, compared to every word in group j
            wordlistnumber = 0  # the label of the word list in GroupWordLists[i]
            for wordlist in GroupWordLists[i]:
                iTotalWordCount = sum(wordlist.values())
                for word in wordlist.keys():
                    # handle option (word filter)
                    if not (Low < TotalList[word] < High):
                        continue
                    iWordProp = wordlist[word] / iTotalWordCount
                    # a word absent from group j counts as zero occurrences
                    jWordCount = GroupLists[j].get(word, 0)
                    jTotalWordCount = GroupWordCounts[j]
                    jWordProp = jWordCount / jTotalWordCount
                    z_score = ztest(iWordProp, jWordProp,
                                    iTotalWordCount, jTotalWordCount)
                    # setdefault replaces the old bare ``except`` which
                    # could hide unrelated errors
                    AllResults.setdefault((i, wordlistnumber, j), []).append(
                        (word.decode('utf-8'), z_score))
                wordlistnumber += 1
    # sort each comparison's result by z-score, descending
    # (avoid shadowing the builtins ``tuple`` and ``list``)
    for key in AllResults.keys():
        AllResults[key] = sorted(AllResults[key],
                                 key=lambda tup: tup[1], reverse=True)
    return AllResults
def test_group_to_group(group_para_lists, option='CustomP', low=0.0,
                        high=None):
    """Compare every group's word usage with every other group.

    Args:
        group_para_lists: a list of groups; each group is a list of
            dictionaries mapping a word to its count, one dictionary per
            paragraph (chunk) of that group
        option: preset for High and Low (see the word filter document):
            'TopStdE', 'MidStdE', 'LowStdE' use the standard deviation to
            pick right / non- / left outliers; 'TopIQR', 'MidIQR', 'LowIQR'
            use the IQR (works poorly on highly skewed data); otherwise the
            custom bounds apply
        low: lower bound on the frequency of the words analyzed
            (overwritten unless the option is 'Custom')
        high: upper bound on the frequency of the words analyzed
            (overwritten unless the option is 'Custom')

    Returns:
        a dictionary mapping a (comparison group index, base group index)
        tuple to a list of (word, z_score) tuples sorted by z-score in
        descending order
    """
    # init: per-group merged word lists and total word counts
    group_word_lists = []
    group_word_count = []
    for chunk in group_para_lists:
        group_word_lists.append(merge_list(chunk))
        group_word_count.append(sum(group_word_lists[-1].values()))

    # the word list of the whole corpus
    corpus_list = merge_list(group_word_lists)
    total_word_count = sum(group_word_count)
    # number of unique words, e.g. 'the a ha the' has 3 unique words
    total_num_words = len(corpus_list)
    num_group = len(group_word_lists)

    high, low = __word_filter__(option, low, high, total_num_words,
                                total_word_count, corpus_list)

    # every unordered pair of distinct groups, smaller index first
    # (combinations(range(n), 2) == product pairs with i_index < j_index)
    all_results = {}
    for comp_index, base_index in itertools.combinations(range(num_group), 2):
        word_z_score_dict = __z_test_word_list__(
            word_list_i=group_word_lists[comp_index],
            word_list_j=group_word_lists[base_index],
            corpus_list=corpus_list, high=high, low=low)
        # pack the sorted result for this pair of groups
        all_results[(comp_index, base_index)] = sorted(
            word_z_score_dict.items(), key=operator.itemgetter(1),
            reverse=True)
    return all_results
def test_para_to_group(group_para_lists, option='CustomP', low=0.0, high=1.0):
    """Z-test each paragraph (chunk) of every group against every other group.

    For every ordered pair of distinct groups (comp, base), each paragraph of
    the comp group is z-tested word-by-word against the merged word counts of
    the base group.

    :param group_para_lists: a list of groups; each group is a list of
        dictionaries, each dictionary mapping a word inside one paragraph
        (chunk) to its frequency.
    :param option: preset used by ``__word_filter__`` to derive the High and
        Low frequency bounds (see the documentation for High and Low):

        1. using standard deviation to find outliers:
           TopStdE: only analyze the right outliers
           (word frequency > average + 2 * standard deviation)
           MidStdE: only analyze the non-outliers
           (average - 2 * std dev < word frequency < average + 2 * std dev)
           LowStdE: only analyze the left outliers
           (word frequency < average - 2 * standard deviation)
        2. using IQR to find outliers
           *THIS METHOD DOES NOT WORK WELL, BECAUSE THE DATA ARE USUALLY
           HIGHLY SKEWED*:
           TopIQR: word frequency > median + 1.5 * IQR
           MidIQR: median - 1.5 * IQR < word frequency < median + 1.5 * IQR
           LowIQR: word frequency < median - 1.5 * IQR

    :param low: only analyze words with frequency higher than this value
        (overwritten when the option is not a 'Custom' one).
    :param high: only analyze words with frequency lower than this value
        (overwritten when the option is not a 'Custom' one).
    :return: a dictionary mapping the tuple
        (comp group index, paragraph index, base group index) to a list of
        (word, z-score) tuples sorted by z-score in descending order.
    """
    # init: merge each group's paragraphs into one word list and gather totals
    group_lists = []       # per-group word -> count over the whole group
    group_word_count = []  # per-group total word count
    for chunk in group_para_lists:
        group_lists.append(merge_list(chunk))
        group_word_count.append(sum(group_lists[-1].values()))

    corpus_list = merge_list(group_lists)  # word -> count over the corpus
    total_word_count = sum(group_word_count)
    total_num_words = len(corpus_list)     # number of unique words
    num_group = len(group_lists)           # number of groups
    all_results = {}                       # the value to return

    high, low = __word_filter__(option, low, high, total_num_words,
                                total_word_count, corpus_list)

    # comparison map: every ordered pair of distinct group indices
    # (same order as itertools.product filtered by i != j)
    comp_map = itertools.permutations(range(num_group), 2)

    # compare each paragraph in group_comp to group_base
    # (group comp means group for comparison)
    for group_comp_index, group_base_index in comp_map:
        # all the paragraphs of the comparison group
        group_comp_paras = group_para_lists[group_comp_index]
        # the merged word list of the base group
        group_base_list = group_lists[group_base_index]

        for para_index, paras in enumerate(group_comp_paras):
            word_z_score_dict = __z_test_word_list__(
                word_list_i=paras,
                word_list_j=group_base_list,
                corpus_list=corpus_list,
                high=high,
                low=low)
            # sort the words by z-score, highest first
            sorted_word_zscore_tuple_list = sorted(
                word_z_score_dict.items(),
                key=operator.itemgetter(1),
                reverse=True)
            all_results[(group_comp_index, para_index, group_base_index)] = \
                sorted_word_zscore_tuple_list
    return all_results
def test_group_to_group(group_para_lists, option='CustomP', low=0.0, high=None):
    """Z-test the merged word counts of every group against every other group.

    All paragraphs are really references to documents. The UI has been
    updated to "documents" but all the variables below still use paragraphs.

    Args:
        group_para_lists: a list; each element of the list is a list that
            represents a group. Each element of a group list is a dictionary
            mapping a word to a word count; each dictionary represents one
            paragraph (chunk) of that group.
        option: preset used by __word_filter__ to derive the High and Low
            frequency bounds (see the documentation for High and Low):

            1. using standard deviation to find outliers:
               TopStdE: only analyze the right outliers
               (word frequency > average + 2 * standard deviation)
               MidStdE: only analyze the non-outliers
               (average - 2 * std dev < word frequency < average + 2 * std dev)
               LowStdE: only analyze the left outliers
               (word frequency < average - 2 * standard deviation)
            2. using IQR to find outliers
               *THIS METHOD DOES NOT WORK WELL, BECAUSE THE DATA ARE USUALLY
               HIGHLY SKEWED*:
               TopIQR: word frequency > median + 1.5 * IQR
               MidIQR: median - 1.5 * IQR < word freq < median + 1.5 * IQR
               LowIQR: word frequency < median - 1.5 * IQR

        low: only analyze words with frequency higher than this value
            (overwritten when the option is not a 'Custom' one).
        high: only analyze words with frequency lower than this value
            (overwritten when the option is not a 'Custom' one).

    Returns:
        a dictionary mapping a tuple (comp group index, base group index)
        to a list of (word, z-score) tuples sorted by the absolute value
        of the z-score in descending order.
    """
    # init: merge each group's paragraphs into one word list and gather totals
    group_word_lists = []  # per-group word -> count over the whole group
    group_word_count = []  # per-group total word count
    for chunk in group_para_lists:
        group_word_lists.append(merge_list(chunk))
        group_word_count.append(sum(group_word_lists[-1].values()))

    # the word list of the corpus (word -> sum of all its counts)
    corpus_list = merge_list(group_word_lists)
    total_word_count = sum(group_word_count)
    # the number of unique words
    # example: 'the a ha the' has 3 unique words: 'the', 'a', and 'ha'
    total_num_words = len(corpus_list)
    num_group = len(group_word_lists)  # number of groups
    all_results = {}

    high, low = __word_filter__(option, low, high, total_num_words,
                                total_word_count, corpus_list)

    # comparison map: every unordered pair of group indices (i < j),
    # so no pair of groups is compared twice
    # (same order as itertools.product filtered by i < j)
    comp_map = itertools.combinations(range(num_group), 2)

    for group_comp_index, group_base_index in comp_map:
        group_comp_list = group_word_lists[group_comp_index]
        group_base_list = group_word_lists[group_base_index]
        word_z_score_dict = __z_test_word_list__(
            word_list_i=group_comp_list,
            word_list_j=group_base_list,
            corpus_list=corpus_list,
            high=high,
            low=low)
        # sort by the magnitude of the z-score, largest first
        sorted_word_zscore_tuple_list = sorted(
            word_z_score_dict.items(),
            key=lambda item: abs(item[1]),
            reverse=True)
        all_results[(group_comp_index, group_base_index)] = \
            sorted_word_zscore_tuple_list
    return all_results
def test_para_to_group(group_para_lists, option='CustomP', low=0.0, high=1.0):
    """Z-test each paragraph of every group against every other group.

    All paragraphs are really references to documents. The UI has been
    updated to "documents" but all the variables below still use paragraphs.

    NOTE(review): this redefines ``test_para_to_group`` declared earlier in
    this file; at import time this later definition shadows the earlier one
    (the two differ in their sort key: this one sorts by |z-score|).

    :param group_para_lists: a list of groups; each group is a list of
        dictionaries, each dictionary mapping a word inside one paragraph
        (chunk) to its frequency.
    :param option: preset used by ``__word_filter__`` to derive the High and
        Low frequency bounds (see the documentation for High and Low):

        1. using standard deviation to find outliers:
           TopStdE: only analyze the right outliers
           (word frequency > average + 2 * standard deviation)
           MidStdE: only analyze the non-outliers
           (average - 2 * std dev < word frequency < average + 2 * std dev)
           LowStdE: only analyze the left outliers
           (word frequency < average - 2 * standard deviation)
        2. using IQR to find outliers
           *THIS METHOD DOES NOT WORK WELL, BECAUSE THE DATA ARE USUALLY
           HIGHLY SKEWED*:
           TopIQR: word frequency > median + 1.5 * IQR
           MidIQR: median - 1.5 * IQR < word frequency < median + 1.5 * IQR
           LowIQR: word frequency < median - 1.5 * IQR

    :param low: only analyze words with frequency higher than this value
        (overwritten when the option is not a 'Custom' one).
    :param high: only analyze words with frequency lower than this value
        (overwritten when the option is not a 'Custom' one).
    :return: a dictionary mapping the tuple
        (comp group index, paragraph index, base group index) to a list of
        (word, z-score) tuples sorted by the absolute value of the z-score
        in descending order.
    """
    # init: merge each group's paragraphs into one word list and gather totals
    group_lists = []       # per-group word -> count over the whole group
    group_word_count = []  # per-group total word count
    for chunk in group_para_lists:
        group_lists.append(merge_list(chunk))
        group_word_count.append(sum(group_lists[-1].values()))

    corpus_list = merge_list(group_lists)  # word -> count over the corpus
    total_word_count = sum(group_word_count)
    total_num_words = len(corpus_list)     # number of unique words
    num_group = len(group_lists)           # number of groups
    all_results = {}                       # the value to return

    high, low = __word_filter__(option, low, high, total_num_words,
                                total_word_count, corpus_list)

    # comparison map: every ordered pair of distinct group indices
    # (same order as itertools.product filtered by i != j)
    comp_map = itertools.permutations(range(num_group), 2)

    # compare each paragraph in group_comp to group_base
    # (group comp means group for comparison)
    for group_comp_index, group_base_index in comp_map:
        # all the paragraphs of the comparison group
        group_comp_paras = group_para_lists[group_comp_index]
        # the merged word list of the base group
        group_base_list = group_lists[group_base_index]

        for para_index, paras in enumerate(group_comp_paras):
            word_z_score_dict = __z_test_word_list__(
                word_list_i=paras,
                word_list_j=group_base_list,
                corpus_list=corpus_list,
                high=high,
                low=low)
            # sort by the magnitude of the z-score, largest first
            sorted_word_zscore_tuple_list = sorted(
                word_z_score_dict.items(),
                key=lambda item: abs(item[1]),
                reverse=True)
            all_results[(group_comp_index, para_index, group_base_index)] = \
                sorted_word_zscore_tuple_list
    return all_results