Example #1
0
def compAssocForPairsInStemClass(stemClasses):
	"""Print the association (dice) score for every word pair in every stem class.

	stemClasses: dict mapping a stem to the list of vocabulary words sharing it.
	Classes with fewer than two members have no pairs and are skipped.
	Output is print-only; nothing is returned.
	"""
	vocabDict = getDictFromFile('wiki-small-vocab.json')
	total = len(stemClasses)

	# Bug fix: the old code only incremented the counter for classes that
	# survived the size filter, so the "counter of total" progress line
	# under-reported progress. enumerate() counts every class visited.
	for counter, (stem, classList) in enumerate(stemClasses.items()):

		# only consider stem classes with multiple members
		if len(classList) < 2:
			continue

		pairs = getPairs(classList)
		print('stem:', stem)
		print('\tstem class:', classList, '\n')
		for pair in pairs:
			# Compute a co-occurrence or association metric for each pair.
			# This measures how strong the association is between the words.
			dice = getAssociationForPair(vocabDict, pair)

			print('\tpair:', pair)
			print('\tdice:', dice)
			print('\t', counter, 'of', total, '\n')

		print()
Example #2
0
def getStemsClassesSizeKPlus(k=2):
    """Load the stem classes and keep only those with at least k members.

    k: minimum class size to keep (default 2).
    Returns the filtered dict of stem -> word list.
    """
    stemClasses = getDictFromFile('wiki-small-vocab-stem-classes.json')

    # Dict comprehension preserves insertion order, so the surviving
    # entries appear in the same order as the original loop produced.
    chosenStemClasses = {
        stem: members
        for stem, members in stemClasses.items()
        if len(members) >= k
    }

    diff = len(stemClasses) - len(chosenStemClasses)

    print('old:', len(stemClasses))
    print('new:', len(chosenStemClasses))
    print('getStemsClassesSizeKPlus() - diff:', diff, '\n')
    return chosenStemClasses
Example #3
0
def getTopKFromDict(k):
	"""Print the top-k entries of the outlinks dict, ranked by value descending.

	k: number of entries to print; non-positive k prints nothing.
	"""
	print('\n' * 2)
	print('top', k)
	if k < 1:
		return

	outlinksDict = getDictFromFile('./outlinksDict.json')
	ranked = sorted(outlinksDict.items(), key=lambda item: item[1], reverse=True)

	# Slicing to k replaces the original break-at-k loop; if there are
	# fewer than k entries, everything is printed, exactly as before.
	for rank, entry in enumerate(ranked[:k], start=1):
		print(rank, entry)
Example #4
0
def getKAlphabeticalWords(k=1000):
    """Print the first k vocabulary keys, in alphabetical order, that pass isWord().

    k: maximum number of words to print (default 1000).
    Output is print-only; nothing is returned.
    """
    index = getDictFromFile('wiki-small-vocab.json')
    sortedKeys = sorted(index.keys())

    counter = 0
    for key in sortedKeys:

        if isWord(key):
            counter += 1
            print(key)

        # Bug fix: the stop condition was hard-coded to 1000, silently
        # ignoring the k parameter. It now honors k.
        if counter == k:
            break
Example #5
0
def optimizeStemClass(oldStemClass, windowSize, threshold):
    '''
    Refine stem classes by splitting each class into connected components
    of a word-association graph.
    Algorithm from: Search Engines Information Retrieval in Practice (page 191-192)

    oldStemClass: dict mapping a stem to its list of member words.
    windowSize: co-occurrence window passed to getAssociationForPair().
    threshold: minimum association score for two words to share an edge.
    Output is print-only; nothing is returned.
    '''
    vocabDict = getDictFromFile('wiki-small-vocab.json')
    counter = 0
    total = len(oldStemClass)
    for stem, classList in oldStemClass.items():

        pairs = getPairs(classList)
        # Construct a graph where the vertices represent words and the
        # edges are between words whose co-occurrence metric is above a
        # threshold T.
        G = nx.Graph()
        G.add_nodes_from(classList)

        # Bug fix: dice was previously unbound at the progress print when
        # a class produced no pairs (e.g. a single-member class).
        dice = -1
        for pair in pairs:
            # Compute a co-occurrence or association metric for each pair.
            # This measures how strong the association is between the words.
            dice = getAssociationForPair(vocabDict, pair, windowSize)

            if dice >= threshold:
                G.add_edge(pair[0], pair[1])

        if counter % 10 == 0:
            print(counter, 'of', total, 'dice:', dice, '\n')

        if len(G.edges()) != 0:

            print('Graph:')
            print('nodes:', G.nodes())
            print('edges:', G.edges())

            print()
            print('New stem class for stem:', stem, ':')
            # Fix: nx.connected_component_subgraphs() was removed in
            # networkx 2.4; connected_components() yields the node sets
            # directly, which is all this code used from the subgraphs.
            for component in nx.connected_components(G):
                if len(component) > 1:
                    print('\t', list(component))

        counter += 1
Example #6
0
def getStemclasses():
    """Group the vocabulary into stem classes and dump them to JSON.

    Each vocabulary word is stemmed with the Porter stemmer; words sharing
    a stem are collected into one list. The result is written to
    'wiki-small-vocab-stem-classes.json'. Nothing is returned.
    """
    stemClasses = {}
    vocabDict = getDictFromFile('wiki-small-vocab.json')

    # Only the keys are needed here; the old loop iterated .items() but
    # never used the value. enumerate() replaces the manual counter.
    for counter, voc in enumerate(vocabDict):

        stem = PorterStemmer.useStemer(voc)
        # setdefault returns the (possibly new) list, so grouping is one line
        stemClasses.setdefault(stem, []).append(voc)

        # periodic progress line — the vocab is large
        if counter % 10000 == 0:
            print('\t', counter, voc)

    dumpJsonToFile('wiki-small-vocab-stem-classes.json', stemClasses, False)
def genCSVFile():
    """Write the 1/2-gram frequencies, ranked by frequency, to a CSV file.

    Columns: Rank, Term, Freq, C where C = (freq / total) * rank
    (the Zipf-law constant check). Nothing is returned.
    """
    gramsDict = getDictFromFile('1-2-gram.json')
    sortedDict = sorted(gramsDict.items(), key=lambda x: x[1], reverse=True)

    # NOTE(review): 770552 is presumably the total token count of the
    # corpus — confirm against the collection statistics.
    total = 770552

    # Fix: use a context manager so the file is closed even if a write
    # fails; the old code leaked the handle on any exception.
    with open('sorted-1-2-gram.org.csv', 'w') as outfile:
        outfile.write('Rank,Term,Freq,C\n')

        for rank, (term, freq) in enumerate(sortedDict, start=1):
            c = (freq / total) * rank
            outfile.write(
                str(rank) + ', ' + term + ', ' + str(freq) + ', ' +
                str(round(c, 5)) + '\n')
def optimizeStemClass(oldStemClass, windowSize, threshold):
    """Refine stem classes into connected components of a word-association graph.

    oldStemClass: dict mapping a stem to its list of member words.
    windowSize: co-occurrence window passed to getAssociationForPair().
    threshold: minimum association score for two words to share an edge.
    Output is print-only; nothing is returned.
    """
    vocabDict = getDictFromFile('wiki-small-vocab.json')
    counter = 0
    total = len(oldStemClass)
    for stem, classList in oldStemClass.items():

        pairs = getPairs(classList)

        # Vertices are the class members; edges join strongly associated words.
        G = nx.Graph()
        G.add_nodes_from(classList)

        # Bug fix: dice was previously unbound at the progress print when
        # a class produced no pairs (e.g. a single-member class).
        dice = -1
        for pair in pairs:

            dice = getAssociationForPair(vocabDict, pair, windowSize)

            if dice >= threshold:
                G.add_edge(pair[0], pair[1])

        if counter % 10 == 0:
            print(counter, 'of', total, 'dice:', dice, '\n')

        if len(G.edges()) != 0:

            print('Graph:')
            print('nodes:', G.nodes())
            print('edges:', G.edges())

            print()
            print('New stem class for stem:', stem, ':')
            # Fix: nx.connected_component_subgraphs() was removed in
            # networkx 2.4; connected_components() yields the node sets
            # directly, which is all this code used from the subgraphs.
            for component in nx.connected_components(G):
                if len(component) > 1:
                    print('\t', list(component))

        counter += 1
Example #9
0
def compAssocForPairsInStemClassThreshold(stemClasses, threshold):
    """Split each stem class into connected components of words whose pairwise
    association score meets the threshold, and print the resulting classes.

    stemClasses: dict mapping a stem to its list of member words.
    threshold: minimum association score for two words to share an edge.
    Output is print-only; nothing is returned.
    """
    vocabDict = getDictFromFile('wiki-small-vocab.json')
    counter = 0
    total = len(stemClasses)

    for stem, classList in stemClasses.items():

        # classes of one word have no pairs to score
        if len(classList) < 2:
            continue

        G = nx.Graph()
        G.add_nodes_from(classList)
        pairs = getPairs(classList)
        stemDice = 0
        for pair in pairs:

            dice = getAssociationForPair(vocabDict, pair)

            if dice >= threshold:
                G.add_edge(pair[0], pair[1])
                stemDice = dice  # remember the last score that qualified

        if len(G.edges()) != 0:
            print('stem:', stem)
            print('\tdice:', stemDice)
            print('\told stem class:', classList, '\n')
            print('\tNew stem class for stem:')

            # Fix: nx.connected_component_subgraphs() was removed in
            # networkx 2.4; connected_components() yields the node sets
            # directly, which is all this code used from the subgraphs.
            for component in nx.connected_components(G):
                if len(component) > 1:
                    print('\t', list(component))

        print()
        counter += 1
Example #10
0
def compAssocForPairsInStemClass(stemClasses):
    """Print the association (dice) score for every word pair in every stem class.

    stemClasses: dict mapping a stem to the list of vocabulary words sharing it.
    Classes with fewer than two members have no pairs and are skipped.
    Output is print-only; nothing is returned.
    """
    vocabDict = getDictFromFile('wiki-small-vocab.json')
    total = len(stemClasses)

    # Bug fix: the old code only incremented the counter for classes that
    # survived the size filter, so the "counter of total" progress line
    # under-reported progress. enumerate() counts every class visited.
    for counter, (stem, classList) in enumerate(stemClasses.items()):

        if len(classList) < 2:
            continue

        pairs = getPairs(classList)
        print('stem:', stem)
        print('\tstem class:', classList, '\n')
        for pair in pairs:

            dice = getAssociationForPair(vocabDict, pair)

            print('\tpair:', pair)
            print('\tdice:', dice)
            print('\t', counter, 'of', total, '\n')

        print()
def getAssocMeasuresDocs(a, N, k=10):
    """Rank every vocabulary term by document-level association with term a.

    Computes four association measures (MIM, EMIM, chi-square, Dice) from
    document co-occurrence counts, then prints the top-k terms per measure.

    a: query term (lowercased before lookup).
    N: collection size used by EMIM and chi-square — presumably the total
       number of documents; TODO confirm against the caller.
    k: number of top-ranked terms to print per measure.
    Output is print-only; nothing is returned.
    """
    vocabDict = getDictFromFile('wiki-small-vocab.json')
    a = a.lower()

    if (a not in vocabDict):
        print('term:', a, 'not in vocab')
        return

    # 'f' appears to hold the term's document (posting) list; a set makes
    # the intersection below cheap.
    aFileSet = set(vocabDict[a]['f'])
    # Sentinel scores so the query term itself sorts to the bottom of
    # every ranking rather than matching itself.
    vocabDict[a]['MIM'] = -1
    vocabDict[a]['EMIM'] = -1
    vocabDict[a]['CHI-SQUARE'] = -1
    vocabDict[a]['DICE'] = -1

    # Na = number of documents containing a
    Na = len(aFileSet)

    for b, bDict in vocabDict.items():

        # skip the query term itself
        if (b == a):
            continue

        bFileSet = set(bDict['f'])
        Nb = len(bFileSet)
        # documents containing both a and b
        intersect = aFileSet & bFileSet

        # -1 sentinels: terms with no co-occurrence rank last
        MIM = -1
        EMIM = -1
        dice = -1
        chiSquare = -1

        Nab = len(intersect)
        NaTimesNb = Na * Nb

        if (Nab != 0):

            # Nab != 0 implies Na and Nb are both nonzero, so these are safe.
            # MIM and Dice are the rank-equivalent simplified forms
            # (Dice here omits the conventional factor of 2).
            MIM = Nab / (Na * Nb)
            dice = Nab / (Na + Nb)
            # expected mutual information, base-10 log
            EMIM = Nab * math.log(N * MIM, 10)

        if (NaTimesNb != 0):
            # simplified chi-square: (observed - expected)^2 / expected-like term
            numer = Nab - (NaTimesNb / N)
            chiSquare = (numer * numer) / NaTimesNb

        # store scores in place on the vocab entries for sorting below
        bDict['MIM'] = MIM
        bDict['EMIM'] = EMIM
        bDict['CHI-SQUARE'] = chiSquare
        bDict['DICE'] = dice

    # print the top-k terms for each of the four measures
    for sortCriteria in ['MIM', 'EMIM', 'CHI-SQUARE', 'DICE']:

        print()

        sort = sorted(vocabDict.items(),
                      key=lambda x: x[1][sortCriteria],
                      reverse=True)
        sort = sort[:k]

        print(a, 'vs')
        counter = 1
        for termDict in sort:
            # each entry is a (term, per-term-dict) tuple
            term, termDict = termDict
            print('\t', counter, 'term:', term, sortCriteria + ':',
                  termDict[sortCriteria])
            counter += 1
def getAssocMeasuresWindow(a, N, filename, k=10):
    """Rank every vocabulary term by window-level association with term a.

    Like getAssocMeasuresDocs, but counts co-occurrence within text windows
    (via transformDocToWindowOpt/countTerms) instead of whole documents.
    Computes MIM, EMIM, chi-square, and Dice, then prints the top-k terms
    per measure.

    a: query term (lowercased before lookup).
    N: normalization count used by EMIM and chi-square — presumably the
       total number of windows; TODO confirm against the caller.
    filename: JSON vocab file to load.
    k: number of top-ranked terms to print per measure.
    Output is print-only; nothing is returned.
    """
    # wall-clock reference for the periodic progress report below
    prev = datetime.now()

    vocabDict = getDictFromFile(filename)
    a = a.lower()

    if (a not in vocabDict):
        print('term:', a, 'not in vocab')
        return

    # converts a's document representation into windows in place
    transformDocToWindowOpt(vocabDict, a)
    totalVocab = len(vocabDict)
    pos = 0

    # Sentinel scores so the query term itself sorts to the bottom of
    # every ranking rather than matching itself.
    vocabDict[a]['MIM'] = -1
    vocabDict[a]['EMIM'] = -1
    vocabDict[a]['CHI-SQUARE'] = -1
    vocabDict[a]['DICE'] = -1

    for b, bDict in vocabDict.items():

        pos += 1

        # skip the query term itself
        if (b == a):
            continue

        # Na: windows of a containing a; Nab: windows containing both a and b
        count = countTerms(vocabDict[a]['f']['windows'], a, b)
        Na = count['left']
        Nab = count['both']

        # Nb: windows of b containing b (b is transformed lazily, per term)
        transformDocToWindowOpt(vocabDict, b)
        count = countTerms(vocabDict[b]['f']['windows'], b, a)
        Nb = count['left']

        # -1 sentinels: terms with no co-occurrence rank last
        MIM = -1
        EMIM = -1
        dice = -1
        chiSquare = -1

        # periodic progress report — this pass is slow over a large vocab
        if (pos % 100 == 0):
            print(pos, 'of', totalVocab)
            print('\tNa:', Na, a)
            print('\tNb:', Nb, b)
            print('\tNab:', Nab)
            delta = datetime.now() - prev
            print('\ttotal seconds:', delta.seconds)

        NaTimesNb = Na * Nb

        if (Nab != 0):
            # Rank-equivalent simplified forms (Dice omits the usual factor
            # of 2); Nab != 0 implies Na and Nb are nonzero, so these are safe.
            MIM = Nab / (Na * Nb)
            dice = Nab / (Na + Nb)
            # expected mutual information, base-10 log
            EMIM = Nab * math.log(N * MIM, 10)

        if (NaTimesNb != 0):
            # simplified chi-square: (observed - expected)^2 / expected-like term
            numer = Nab - (NaTimesNb / N)
            chiSquare = (numer * numer) / NaTimesNb

        # store scores in place on the vocab entries for sorting below
        bDict['MIM'] = MIM
        bDict['EMIM'] = EMIM
        bDict['CHI-SQUARE'] = chiSquare
        bDict['DICE'] = dice

    # print the top-k terms for each of the four measures
    for sortCriteria in ['MIM', 'EMIM', 'CHI-SQUARE', 'DICE']:

        print()

        sort = sorted(vocabDict.items(),
                      key=lambda x: x[1][sortCriteria],
                      reverse=True)
        sort = sort[:k]

        print(a, 'vs')
        for termDict in sort:
            # each entry is a (term, per-term-dict) tuple
            term, termDict = termDict
            print('\tterm:', term, sortCriteria + ':', termDict[sortCriteria])