Example #1
def quantize(abstractions):
    print('calculating random trigrams...')
    allGrams = {}
    c, i, u, l = 1, 20, 0, len(abstractions)
    for abstraction in abstractions:
        u = feedbackMessage(c, i, u, l, 'generating n-grams:')
        allGrams = trigrammatize(abstraction, allGrams)
        c += 1
    ranks = []
    cc, i, u, l = 1, 20, 0, len(allGrams)
    levels = {}  # histogram: frequency score -> how many trigrams share it
    for gram1 in list(allGrams.keys()):
        u = feedbackMessage(cc, i, u, l, 'calculating n-gram frequencies:')
        for gram2 in allGrams[gram1]:
            # iterate over a copy so entries can be deleted as they are consumed
            for gram3 in list(allGrams[gram1][gram2].keys()):
                score = allGrams[gram1][gram2][gram3]
                levels[score] = levels.get(score, 0) + 1
                ranks.append((score, gram1 + '_' + gram2 + '_' + gram3))
                del allGrams[gram1][gram2][gram3]
        cc += 1
    print('done. calculating tipping point...')
    # this call must stay active: the filter below needs tippingPoint
    tippingPoint = findConfluenceDynamic(levels)
    print('done.\nfiltering data.\nextracting best feature sets...')
    ranks = [rank for rank in ranks if rank[0] >= tippingPoint]
    ranks.sort(reverse=True)
    print('done.\nclustering...')
    return ranks
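
Every example on this page calls a feedbackMessage(c, i, u, l, label) helper that is not shown. From the call sites, c is the running item count, l the total, i the desired number of progress ticks, and u the count at the last printed update; the helper returns the new u. A minimal sketch under those assumptions (not the original implementation):

def feedbackMessage(c, i, u, l, label):
    # hypothetical reconstruction: emit roughly `i` progress lines over `l` items;
    # `u` remembers the count at which the previous line was printed
    if l and c >= u + max(l // i, 1):
        print(label, str(100 * c // l) + '%')
        return c
    return u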
Example #2
def allWords(lines):
    # flatten every line into one token list, reporting progress as we go
    words = []
    c, i, u, l = 1, 20, 0, len(lines)
    for text in lines:
        u = feedbackMessage(c, i, u, l, 'loading file:')
        words += tokenize(text)
        c += 1
    return words
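
tokenize is likewise external to this listing. Assuming it behaves like a plain whitespace tokenizer (a stand-in, not the real helper), allWords can be exercised together with the feedbackMessage sketch above:

def tokenize(text):
    # illustrative stand-in only: lowercase and split on whitespace
    return text.lower().split()

print(allWords(['The quick brown fox', 'jumps over the lazy dog']))
# after the progress lines:
# ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']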
Example #3
def clusterize(freqDist, originalAbstractions, originals,
               docsByLine, lines, returnIndividualSenteces):
    trigrams = [pair[1] for pair in freqDist]
    clusters = startClustering(trigrams)
    abstractions = [set(abstraction) for abstraction in originalAbstractions]
    territory = [originals[z] for z in range(len(abstractions))]
    indexes = list(range(len(abstractions)))
    print(len(abstractions), 'initial vectors,', len(trigrams),
          'initial feature sets')
    c, i, u, l = 1, 20, 0, len(trigrams)
    # abstractions = list of sentences without noise
    while abstractions and trigrams:
        u = feedbackMessage(c, i, u, l, 'clustering: ' +
                            str(len(abstractions)) + ' remaining sentence vectors, ' +
                            str(len(trigrams)) + ' remaining feature sets -')
        c += 1
        trigram = trigrams[0]
        features = set(trigram.split('_'))
        unclassifiedAbstractions = []
        unclassifiedOriginals = []
        unclassifiedIndexes = []
        for j in range(len(abstractions)):
            abstraction = abstractions[j]
            # a sentence joins the cluster when it contains every token of the trigram
            if features.intersection(abstraction) == features:
                where = clusters[trigram]
                if returnIndividualSenteces:
                    where.documents.append(territory[j])
                    where.indexes.append(indexes[j])
                else:
                    original = lines[docsByLine[indexes[j]]]
                    # keep a 100-character snippet starting after the first ', '
                    original = original[original.index(', ') + 2:original.index(', ') + 102]
                    where.documents.append(original)
                    where.indexes.append(docsByLine[indexes[j]])
                where.vectors.append(abstraction)
                where.space += list(abstraction)
            # disabled: carry only unmatched sentences into the next pass
            #   else:
            #       unclassifiedAbstractions.append(abstraction)
            #       unclassifiedOriginals.append(territory[j])
            #       unclassifiedIndexes.append(indexes[j])
        # disabled along with the block above:
        #   abstractions = unclassifiedAbstractions
        #   territory = unclassifiedOriginals
        #   indexes = unclassifiedIndexes
        trigrams = trigrams[1:]
    if not returnIndividualSenteces:
        for key in clusters:
            clusters[key].documents = sorted(set(clusters[key].documents))
            clusters[key].indexes = sorted(set(clusters[key].indexes))
    return clusters
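
startClustering and the objects it hands back are not defined on this page either. Judging from the attributes clusterize touches, a container along these lines would satisfy it (a sketch of the assumed interface, not the original class):

class Cluster(object):
    # hypothetical container matching the attributes clusterize uses
    def __init__(self):
        self.documents = []  # sentence snippets (or full sentences) assigned here
        self.indexes = []    # corpus positions of those sentences
        self.vectors = []    # the matching abstraction sets
        self.space = []      # all matched feature tokens, with repeats

def startClustering(trigrams):
    # one empty cluster per ranked trigram feature set
    return {trigram: Cluster() for trigram in trigrams}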
Example #4
def advancedRead(path, relevant):
    # `relevant` is unused while the recognition pass below stays disabled
    with open(path, 'r') as rd:
        lines = rd.readlines()
    newLines = []
    entities = {}
    c, i, u, l = 1, 20, 0, len(lines)
    for line in lines:
        u = feedbackMessage(c, i, u, l, 'segmenting file:')
        # disabled: entity-recognition pass that would rewrite each line
        #   recognition = entityRecognition(line.strip(), entities, noise)
        #   newLine = recognition[0]
        #   entities = recognition[1]
        #   newLines.append(newLine)
        newLines.append(line)
        c += 1
    return [newLines, entities]
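
With the feedbackMessage sketch above in scope, advancedRead can be smoke-tested against any small text file (the file name and contents are illustrative):

with open('sample.txt', 'w') as out:
    out.write('first line\nsecond line\n')

newLines, entities = advancedRead('sample.txt', relevant=None)
print(len(newLines), 'lines read,', len(entities), 'entities recognized')
# final line: 2 lines read, 0 entities recognized (recognition is disabled above)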