import math
from datetime import datetime

import networkx as nx

# Helper functions such as getDictFromFile(), dumpJsonToFile(), getPairs(),
# getAssociationForPair(), isWord(), countTerms(), transformDocToWindowOpt(),
# and the PorterStemmer module are defined elsewhere in this project.

def compAssocForPairsInStemClass(stemClasses):
    vocabDict = getDictFromFile('wiki-small-vocab.json')
    counter = 0
    total = len(stemClasses)

    for stem, classList in stemClasses.items():
        # only consider stem classes with multiple members
        if len(classList) < 2:
            continue

        pairs = getPairs(classList)
        print('stem:', stem)
        print('\tstem class:', classList, '\n')

        for i in range(len(pairs)):
            '''
                Compute a co-occurrence or association metric for each pair.
                This measures how strong the association is between the words.
            '''
            dice = getAssociationForPair(vocabDict, pairs[i])
            print('\tpair:', pairs[i])
            print('\tdice:', dice)
            print('\t', counter, 'of', total, '\n')

        print()
        counter += 1
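# getPairs() is defined elsewhere in this project. A minimal sketch, assuming
# it returns all unordered pairs of words from a stem class (the pairs[i][0]
# and pairs[i][1] indexing elsewhere in this file suggests 2-tuples):
from itertools import combinations

def getPairsSketch(classList):
    # all unordered 2-combinations of the words in the stem class
    return list(combinations(classList, 2))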
def getStemsClassesSizeKPlus(k=2):
    stemClasses = getDictFromFile('wiki-small-vocab-stem-classes.json')
    chosenStemClasses = {}

    for stem, classList in stemClasses.items():
        if len(classList) >= k:
            chosenStemClasses[stem] = classList

    diff = len(stemClasses) - len(chosenStemClasses)
    print('old:', len(stemClasses))
    print('new:', len(chosenStemClasses))
    print('getStemsClassesSizeKPlus() - diff:', diff, '\n')

    return chosenStemClasses
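# A hedged usage sketch chaining the two functions above (the k value is
# illustrative): keep only stem classes with at least 3 members, then score
# every pair in each surviving class.
def runStemClassAssocSketch():
    stemClasses = getStemsClassesSizeKPlus(k=3)
    compAssocForPairsInStemClass(stemClasses)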
def getTopKFromDict(k):
    print('\n' * 2)
    print('top', k)

    if k < 1:
        return

    outlinksDict = getDictFromFile('./outlinksDict.json')
    result = sorted(outlinksDict.items(), key=lambda x: x[1], reverse=True)

    for i in range(len(result)):
        print(i + 1, result[i])
        if i == k - 1:
            break
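# An alternative sketch for the same task: heapq.nlargest() finds the top k
# entries in O(n log k) time without sorting the whole dictionary.
import heapq

def getTopKFromDictHeapSketch(k):
    if k < 1:
        return
    outlinksDict = getDictFromFile('./outlinksDict.json')
    topK = heapq.nlargest(k, outlinksDict.items(), key=lambda x: x[1])
    for i, item in enumerate(topK):
        print(i + 1, item)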
def getKAlphabeticalWords(k=1000):
    index = getDictFromFile('wiki-small-vocab.json')
    sortedKeys = list(index.keys())
    sortedKeys.sort()
    counter = 0

    for i in range(len(sortedKeys)):
        if isWord(sortedKeys[i]):
            counter += 1
            print(sortedKeys[i])

            # stop after k words (the original hardcoded 1000 here,
            # ignoring the k parameter)
            if counter == k:
                break
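# isWord() is defined elsewhere. A minimal sketch, assuming it simply tests
# whether a vocabulary key is purely alphabetic (this is a guess; the real
# helper may apply stricter rules):
def isWordSketch(token):
    return token.isalpha()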
def optimizeStemClass(oldStemClass, windowSize, threshold):
    '''
        Algorithm from:
        Search Engines - Information Retrieval in Practice (pages 191-192)
    '''
    vocabDict = getDictFromFile('wiki-small-vocab.json')
    counter = 0
    total = len(oldStemClass)

    for stem, classList in oldStemClass.items():
        pairs = getPairs(classList)

        '''
            Construct a graph where the vertices represent words and the
            edges are between words whose co-occurrence metric is above
            a threshold T.
        '''
        G = nx.Graph()
        G.add_nodes_from(classList)

        for i in range(len(pairs)):
            '''
                Compute a co-occurrence or association metric for each pair.
                This measures how strong the association is between the words.
            '''
            dice = getAssociationForPair(vocabDict, pairs[i], windowSize)
            if dice >= threshold:
                G.add_edge(pairs[i][0], pairs[i][1])

            if counter % 10 == 0:
                print(counter, 'of', total, 'dice:', dice, '\n')

        if len(G.edges()) != 0:
            print('Graph:')
            print('nodes:', G.nodes())
            print('edges:', G.edges())

            # nx.connected_component_subgraphs() was removed in NetworkX 2.4;
            # nx.connected_components() yields the component node sets directly.
            conComp = list(nx.connected_components(G))
            print()
            print('New stem class for stem:', stem, ':')

            for component in conComp:
                if len(component) > 1:
                    print('\t', component)

        counter += 1
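# getAssociationForPair() is defined elsewhere. A minimal sketch of the
# document-level case, assuming it computes the Dice coefficient used
# throughout this file, nab / (na + nb) (the rank-equivalent form without the
# factor of 2), from the posting lists under vocabDict[term]['f']. The
# optional windowSize argument used above would restrict counts to text
# windows and is not modeled here.
def getAssociationForPairSketch(vocabDict, pair, windowSize=None):
    a, b = pair
    aFileSet = set(vocabDict[a]['f'])
    bFileSet = set(vocabDict[b]['f'])
    na, nb = len(aFileSet), len(bFileSet)
    nab = len(aFileSet & bFileSet)
    return nab / (na + nb) if (na + nb) != 0 else 0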
def getStemclasses():
    stemClasses = {}
    vocabDict = getDictFromFile('wiki-small-vocab.json')
    counter = 0

    for voc, vocDict in vocabDict.items():
        # group each vocabulary term under its Porter stem
        stem = PorterStemmer.useStemer(voc)
        stemClasses.setdefault(stem, [])
        stemClasses[stem].append(voc)

        if counter % 10000 == 0:
            print('\t', counter, voc)
        counter += 1

    dumpJsonToFile('wiki-small-vocab-stem-classes.json', stemClasses, False)
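# getDictFromFile() and dumpJsonToFile() are I/O helpers defined elsewhere.
# Minimal sketches, assuming they are thin wrappers around the json module
# (the meaning of the third dumpJsonToFile() argument is a guess; it is
# treated here as a pretty-print flag):
import json

def getDictFromFileSketch(filename):
    with open(filename) as infile:
        return json.load(infile)

def dumpJsonToFileSketch(filename, data, prettyPrint=True):
    with open(filename, 'w') as outfile:
        json.dump(data, outfile, indent=4 if prettyPrint else None)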
def genCSVFile():
    outfile = open('sorted-1-2-gram.org.csv', 'w')
    outfile.write('Rank,Term,Freq,C\n')

    gramsDict = getDictFromFile('1-2-gram.json')
    sortedDict = sorted(gramsDict.items(), key=lambda x: x[1], reverse=True)

    # total count of grams in the collection (hardcoded)
    total = 770552
    rank = 1

    for term, freq in sortedDict:
        # Zipf's law check: c = rank * (freq / total) should be
        # roughly constant across ranks.
        c = (freq / total) * rank
        outfile.write(str(rank) + ',' + term + ',' + str(freq) + ',' + str(round(c, 5)) + '\n')
        rank += 1

    outfile.close()
def compAssocForPairsInStemClassThreshold(stemClasses, threshold):
    vocabDict = getDictFromFile('wiki-small-vocab.json')
    counter = 0
    total = len(stemClasses)

    for stem, classList in stemClasses.items():
        # only consider stem classes with multiple members
        if len(classList) < 2:
            continue

        G = nx.Graph()
        G.add_nodes_from(classList)

        pairs = getPairs(classList)
        stemDice = 0

        for i in range(len(pairs)):
            dice = getAssociationForPair(vocabDict, pairs[i])
            if dice >= threshold:
                G.add_edge(pairs[i][0], pairs[i][1])
                # note: this keeps only the last above-threshold score
                stemDice = dice

        if len(G.edges()) != 0:
            # nx.connected_component_subgraphs() was removed in NetworkX 2.4;
            # nx.connected_components() yields the component node sets directly.
            conComp = list(nx.connected_components(G))

            print('stem:', stem)
            print('\tdice:', stemDice)
            print('\told stem class:', classList, '\n')
            print('\tNew stem class for stem:')

            for component in conComp:
                if len(component) > 1:
                    print('\t', component)
            print()

        counter += 1
def getAssocMeasuresDocs(a, N, k=10):
    '''
        Rank every vocabulary term b by its association with term a, using
        document-level co-occurrence counts (na, nb: documents containing
        a resp. b; nab: documents containing both; N: collection size):
            MIM        = nab / (na * nb)
            EMIM       = nab * log10(N * nab / (na * nb))
            chi-square = (nab - na * nb / N)^2 / (na * nb)
            Dice       = nab / (na + nb)
    '''
    vocabDict = getDictFromFile('wiki-small-vocab.json')
    a = a.lower()

    if a not in vocabDict:
        print('term:', a, 'not in vocab')
        return

    aFileSet = set(vocabDict[a]['f'])
    vocabDict[a]['MIM'] = -1
    vocabDict[a]['EMIM'] = -1
    vocabDict[a]['CHI-SQUARE'] = -1
    vocabDict[a]['DICE'] = -1
    Na = len(aFileSet)

    for b, bDict in vocabDict.items():
        if b == a:
            continue

        bFileSet = set(bDict['f'])
        Nb = len(bFileSet)
        intersect = aFileSet & bFileSet

        MIM = -1
        EMIM = -1
        dice = -1
        chiSquare = -1

        Nab = len(intersect)
        NaTimesNb = Na * Nb

        if Nab != 0:
            MIM = Nab / (Na * Nb)
            dice = Nab / (Na + Nb)
            EMIM = Nab * math.log(N * MIM, 10)

        if NaTimesNb != 0:
            numer = Nab - (NaTimesNb / N)
            chiSquare = (numer * numer) / NaTimesNb

        bDict['MIM'] = MIM
        bDict['EMIM'] = EMIM
        bDict['CHI-SQUARE'] = chiSquare
        bDict['DICE'] = dice

    for sortCriteria in ['MIM', 'EMIM', 'CHI-SQUARE', 'DICE']:
        print()
        sort = sorted(vocabDict.items(), key=lambda x: x[1][sortCriteria], reverse=True)
        sort = sort[:k]

        print(a, 'vs')
        counter = 1
        for term, termDict in sort:
            print('\t', counter, 'term:', term, sortCriteria + ':', termDict[sortCriteria])
            counter += 1
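# Example invocation (the term and the collection size are illustrative;
# N should be the number of documents in the collection for the
# document-level measures):
#   getAssocMeasuresDocs('tropical', N=6043, k=10)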
def getAssocMeasuresWindow(a, N, filename, k=10):
    '''
        Same association measures as getAssocMeasuresDocs(), but counted
        over text windows instead of whole documents.
    '''
    prev = datetime.now()
    vocabDict = getDictFromFile(filename)
    a = a.lower()

    if a not in vocabDict:
        print('term:', a, 'not in vocab')
        return

    transformDocToWindowOpt(vocabDict, a)
    totalVocab = len(vocabDict)
    pos = 0

    vocabDict[a]['MIM'] = -1
    vocabDict[a]['EMIM'] = -1
    vocabDict[a]['CHI-SQUARE'] = -1
    vocabDict[a]['DICE'] = -1

    for b, bDict in vocabDict.items():
        pos += 1
        if b == a:
            continue

        count = countTerms(vocabDict[a]['f']['windows'], a, b)
        Na = count['left']
        Nab = count['both']

        transformDocToWindowOpt(vocabDict, b)
        count = countTerms(vocabDict[b]['f']['windows'], b, a)
        Nb = count['left']

        MIM = -1
        EMIM = -1
        dice = -1
        chiSquare = -1

        if pos % 100 == 0:
            print(pos, 'of', totalVocab)
            print('\tNa:', Na, a)
            print('\tNb:', Nb, b)
            print('\tNab:', Nab)
            delta = datetime.now() - prev
            print('\ttotal seconds:', delta.seconds)

        NaTimesNb = Na * Nb
        if Nab != 0:
            MIM = Nab / (Na * Nb)
            dice = Nab / (Na + Nb)
            EMIM = Nab * math.log(N * MIM, 10)

        if NaTimesNb != 0:
            numer = Nab - (NaTimesNb / N)
            chiSquare = (numer * numer) / NaTimesNb

        bDict['MIM'] = MIM
        bDict['EMIM'] = EMIM
        bDict['CHI-SQUARE'] = chiSquare
        bDict['DICE'] = dice

    for sortCriteria in ['MIM', 'EMIM', 'CHI-SQUARE', 'DICE']:
        print()
        sort = sorted(vocabDict.items(), key=lambda x: x[1][sortCriteria], reverse=True)
        sort = sort[:k]

        print(a, 'vs')
        for term, termDict in sort:
            print('\tterm:', term, sortCriteria + ':', termDict[sortCriteria])
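# countTerms() and transformDocToWindowOpt() are helpers defined elsewhere.
# transformDocToWindowOpt() presumably replaces a term's document-level
# postings under vocabDict[term]['f'] with a {'windows': [...]} structure; it
# is omitted here because it needs the original document text. A minimal
# sketch of countTerms(), assuming each window is an iterable of tokens,
# 'left' counts windows containing the first term, and 'both' counts windows
# containing both terms:
def countTermsSketch(windows, a, b):
    count = {'left': 0, 'both': 0}
    for window in windows:
        windowSet = set(window)
        if a in windowSet:
            count['left'] += 1
            if b in windowSet:
                count['both'] += 1
    return count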