Example #1
def draw(o, v, r, k, i, filename):
    o, v, r, k, i = rescale_points(o, v, r, k, i)

    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, WIDTH, HEIGHT)
    ctx = cairo.Context(surface)
    ctx.scale(WIDTH/1.0, HEIGHT/1.0) # Normalizing the canvas

    # Draw shape boundary
    ctx.set_source_rgb(0.2, 0.2, 0.6)
    ctx.set_line_width(0.005)
    draw_consecutive_lines(ctx, o)
    ctx.stroke()

    # Draw internal Delaunay triangulation
    ctx.set_source_rgba(0.3, 0.2, 0.5, 0.6)
    ctx.set_line_width(0.002)
    edges = set([])
    for t in i:
        for edge in xuniqueCombinations(t, 2):
            edges.add(tuple(edge))
    draw_edges(ctx, edges)
    ctx.stroke()

    # Draw Voronoi skeleton
    ctx.set_source_rgba(0.5, 0.2, 0.2, 0.8)
    #draw_edges(ctx, v, r)
    draw_edges(ctx, v, k)

    surface.write_to_png(filename)
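
All of the examples on this page rely on an xuniqueCombinations(seq, n) generator from the xpermutations module, which yields each n-element combination of seq exactly once. The name and call pattern are taken from the examples; the body below is only an illustrative stand-in built on itertools, not the original recipe:

from itertools import combinations

def xuniqueCombinations(items, n):
    # Yield each unique n-element combination of items in input order, e.g.
    # xuniqueCombinations(['l', 'o', 'v', 'e'], 2) -> ['l', 'o'], ['l', 'v'], ...
    for combo in combinations(items, n):
        yield list(combo)
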
Example #2
def fNaiveBayesTraining():
    
    numFold = 10
    
    wordFeatureDivideNumStart = 10 # default = 3
    wordFeatureDivideNumStop = 5 # default = 10, not included
    wordFeatureDivideNumStep = -2 # default =10
    
#    listMyType = ['stp-', 'wnl-', 'ptr-']
    #typeTextPreprocess = ''
    #typeTextPreprocess = 'stp-'
    typeTextPreprocess = 'wnl-'
    #typeTextPreprocess = 'ptr-'
    
    
    myRe = '((^Title: |^Abstract: )(.*))'
    p = re.compile(myRe)
    
    #filesInput = ['pure-doc-dx.txt', 'pure-doc-tx.txt']
    #filesInput = ['intervention.txt', 'patient.txt', 'outcome.txt']
    
    #for uc in xuniqueCombinations(['l','o','v','e'],2): print ''.join(uc)
    
    
    
    #listFilesInputCombinations = [ [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'patient.txt']
    #                  ,[typeTextPreprocess+'intervention.txt', typeTextPreprocess+'outcome.txt']
    #                  ,[typeTextPreprocess+'patient.txt', typeTextPreprocess+'outcome.txt']
    #                  ]
    #filesInput = [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'patient.txt']
    #filesInput = [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'outcome.txt']
    #filesInput = [typeTextPreprocess+'patient.txt', typeTextPreprocess+'outcome.txt']
    
    
    dirMain = ''
    dirInput = 'Output2_TrainingSet/'
    #dirInput = 'Output2Train/'
    
    
    for wordFeatureDivideNum in range(wordFeatureDivideNumStart,wordFeatureDivideNumStop, wordFeatureDivideNumStep):
    #    print 'wordFeatureDivideNum: ', wordFeatureDivideNum
        
        dirOutput = 'Output3_Divide'+str(wordFeatureDivideNum)+'/'
        
        #for typeTextPreprocess in listMyType:
        dirCwd = os.getcwd()+'/'
        if os.path.isdir(dirCwd+dirOutput):
            try:
        #            shutil.rmtree(LDASubDataDir, ignore_errors, onerror)
                shutil.rmtree(dirCwd+dirOutput)
            except:
                raise
        os.mkdir(dirCwd+dirOutput)
        
        
        for idxCrossValidation in range(0,numFold):
            print idxCrossValidation
            
        #    listFilesInput = ['stp-'+str(idxCrossValidation)+'-Per-int-Test-.txt', 'stp-'+str(idxCrossValidation)+'-Per-out-Test-.txt', 'stp-'+str(idxCrossValidation)+'-Per-pat-Test-.txt']
#            listFilesInput = [typeTextPreprocess+str(idxCrossValidation)+'-int-Test-.csv', typeTextPreprocess+str(idxCrossValidation)+'-out-Test-.csv', typeTextPreprocess+str(idxCrossValidation)+'-pat-Test-.csv']
            listFilesInput = [typeTextPreprocess+str(idxCrossValidation)+'-int-Train-.csv', typeTextPreprocess+str(idxCrossValidation)+'-out-Train-.csv', typeTextPreprocess+str(idxCrossValidation)+'-pat-Train-.csv']
        
        #exit()
            
        #    listFilesInput = ['stp-0-Per-int-Test-.txt', 'stp-0-Per-out-Test-.txt', 'stp-0-Per-pat-Test-.txt']
            print "Unique Combinations of 2 letters from :",listFilesInput
            for fileOne in listFilesInput:
        #        outputFileNameDiff = fileOne[10:13]
                outputFileNameDiff = fileOne[6:9]
                
                print 'outputFileNameDiff: ', outputFileNameDiff
            
                listMyWords = []
                listDoc = []
        
        #         example from CrossValidationStep2
        #         outputPercentageFilenameBase = typeTextPreprocess+str(idxCrossValidation)+'-Per'
        
        #         filePioTxt= dirCwd+dirInput+typeTextPreprocess+fileOne
                filePioTxt= dirCwd+dirInput+fileOne
                with open(filePioTxt) as fTxtOrg:
                    listDocOrg = fTxtOrg.readlines()
                print 'len(listDocOrg): ', len(listDocOrg)
        
                for rowOfListDocOrg in listDocOrg:
            #                print 'rowOfListDocOrg: ', rowOfListDocOrg
            #            myResult = p.search(rowOfListDocOrg)
            #            if myResult <> None:
            #                myData = re.sub('^Title: |^Abstract: ','',myResult.group())
                    listMyWords.extend(rowOfListDocOrg.split())
            #                listDoc.append((myData.split(),fileOne[9:11]))
                    print '(rowOfListDocOrg.split(),outputFileNameDiff): ', (outputFileNameDiff, rowOfListDocOrg.split())
                    listDoc.append((rowOfListDocOrg.split(),outputFileNameDiff))
            
            #exit()
            
            listFilesInputCombinations = []
            for uc in xpermutations.xuniqueCombinations(listFilesInput,2):
                listFilesInputCombinations.append(uc)
            #    print ' '.join(uc)
            print 'listFilesInputCombinations: ', listFilesInputCombinations
            #exit()
            
            for filesInput in listFilesInputCombinations:    
                listDoc = []
                listMyWords = []
                
                outputPercentageFilenameBase = 'Per'
                
                for fileOne in filesInput:
                    outputFileNameDiff = fileOne[6:9]
                    print 'outputFileNameDiff: ', outputFileNameDiff
                    
            #        outputFileNameDiff = fileOne[0:3]
            #        print 'outputFileNameDiff: ', outputFileNameDiff
                #    exit()
                    outputPercentageFilenameBase = outputPercentageFilenameBase + '-'+ outputFileNameDiff
        #            filePioTxt= dirMain+dirInput+typeTextPreprocess+fileOne
                    filePioTxt= dirMain+dirInput+fileOne
                    with open(filePioTxt) as fTxtOrg:
                        listDocOrg = fTxtOrg.readlines()
                    print 'len(listDocOrg): ', len(listDocOrg)
                #    exit()
                #    with open(dirMain+'output'+typeTextPreprocess+fileOne[8:11]+'.csv', 'wb') as outf:
                #    with open(dirMain+typeTextPreprocess+'output-'+outputFileNameDiff+'.csv', 'wb') as outf:
            
            
                    
            
                    for rowOfListDocOrg in listDocOrg:
        #                print 'rowOfListDocOrg: ', rowOfListDocOrg
            #            myResult = p.search(rowOfListDocOrg)
            #            if myResult <> None:
            #                myData = re.sub('^Title: |^Abstract: ','',myResult.group())
                        listMyWords.extend(rowOfListDocOrg.split())
            #                listDoc.append((myData.split(),fileOne[9:11]))
                        print '(rowOfListDocOrg.split(),outputFileNameDiff): ', (outputFileNameDiff, rowOfListDocOrg.split())
                        listDoc.append((rowOfListDocOrg.split(),outputFileNameDiff))
                print 'type(listDoc): ', type(listDoc)
                print 'listDoc[0]: ', listDoc[0]
                print 'listDoc[1]: ', listDoc[1]
                
                random.shuffle(listDoc)
                #print len(listDoc), myData.split()
                print 'len(listMyWords): ', len(listMyWords)
            #    exit()
                
                all_words = nltk.FreqDist(listMyWords)
                print 'len(all_words): ', len(all_words)
                #print 'type(all_words): ', type(all_words), len(all_words)
        #        word_features = all_words.keys()[:len(all_words)/10]
                word_features = all_words.keys()[:len(all_words)/wordFeatureDivideNum]
                print 'word_features: ', len(word_features), type(word_features), word_features
                #exit()
                
                
                
            #    favorDiagnostic = ['intervention', 'risk', 'therapy', 'disease', 'participants', 'effects', 'subjects', 'patient', 'response', 'outcomes', 'events','outcome', 'findings', 'performance', 'statistically', 'evaluation', 'population']
            #    print '\ndocument_features(favorDiagnostic): ', document_features(favorDiagnostic)
                
                
                featuresets = [(document_features(d, word_features), c) for (d,c) in listDoc]
#                print '\nfeaturesets: ', len(featuresets), featuresets
                # featuresets(1/3):  360 [({'bolus': False, 'magnetic': False, 'colonoscopy': False ... }, 'int')
                # featuresets(1/2):  360 [({'bolus': False, 'ali': False, 'caused': False, 'magnetic': False ... }, 'int')
#                exit()
                sizeTest = len(listDoc)/numFold
                print '\nlen(listDoc): ', len(listDoc), '\nsizeTraining:', len(listDoc)-sizeTest,'\nsizeTesting: ', sizeTest
                
                train_set, test_set = featuresets[sizeTest:], featuresets[:sizeTest]
                classifier = nltk.NaiveBayesClassifier.train(train_set)
                print 'nltk.classify.accuracy(classifier, test_set): ', nltk.classify.accuracy(classifier, test_set), '\n'
                
                
                cpdist = classifier._feature_probdist
                print 'classifier.most_informative_features(10):', classifier.most_informative_features(10)
                
        #        print dirMain+dirOutput+str(idxCrossValidation)+outputPercentageFilenameBase+'.csv'
        #        exit()
                with open(dirMain+dirOutput+typeTextPreprocess+str(idxCrossValidation)+'-'+outputPercentageFilenameBase+'.csv', 'wb') as outf:
                    outcsv = csv.writer(outf)
                    for fname, fval in classifier.most_informative_features(len(word_features)):
                        def labelprob(l):
                            return cpdist[l,fname].prob(fval)
                        
                        labels = sorted([l for l in classifier._labels if 
                                fval in cpdist[l,fname].samples()], 
                            key=labelprob)
                        if len(labels) == 1:
                            continue
                        l0 = labels[0]
                        l1 = labels[-1]
                        if cpdist[l0,fname].prob(fval) == 0:
                            ratio = 'INF'
                        else:
                            ratio = '%8.1f' % (cpdist[l1,fname].prob(fval) / cpdist[l0,fname].prob(fval))
                    
                        if cpdist[l0,fname].prob(fval) == 0:
                            ratio1 = 'INF'
                        else:
                    #        ratio = '%8.1f' % (cpdist[l1,fname].prob(fval) / cpdist[l0,fname].prob(fval))
                            ratio1 = '%8.2f' % (cpdist[l1,fname].prob(fval) / (cpdist[l1,fname].prob(fval)+cpdist[l0,fname].prob(fval)))
                    
                        if cpdist[l0,fname].prob(fval) == 0:
                            ratio2 = 'INF'
                        else:
                            ratio2 = '%8.2f' % ( cpdist[l0,fname].prob(fval) / (cpdist[l1,fname].prob(fval) + cpdist[l0,fname].prob(fval)))
                    
                    #    print '%24s = %-14r %6s : %-6s = %s : 1.0' % (fname, fval, l1[:6], l0[:6], ratio)
                        print '%24s = %-14r %6s : %-6s = %s : 1.0 : %s : %s' % (fname, fval, l1[:6], l0[:6], ratio, ratio1, ratio2)
                #        outf.write(fname, fval, l1[:6], l0[:6], ratio, ratio1, ratio2)
                #        outf.write(fname, fval)
                        outcsv.writerow((fname, fval, l1[:6], l0[:6], ratio, '1', ratio1, ratio2))
        
        
        
        
    exit()
    
    
    exit()
    #0.81
    classifier.show_most_informative_features(n=10)
    def show_most_informative_features22(self, n=10):
        # Determine the most relevant features, and display them.
        
        cpdist = self._feature_probdist
        print 'Most Informative Features'
        
        for fname, fval in self.most_informative_features(n):
            def labelprob(l):
                return cpdist[l,fname].prob(fval)
            
            labels = sorted([l for l in self._labels if 
                    fval in cpdist[l,fname].samples()], 
                key=labelprob)
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0,fname].prob(fval) == 0:
                ratio = 'INF'
            else:
                ratio = '%8.1f' % (cpdist[l1,fname].prob(fval) / 
                    cpdist[l0,fname].prob(fval))
            print '%24s = %-14r %6s : %-6s = %s : 1.0' % (fname, fval, l1[:6], l0[:6], ratio)
    
    show_most_informative_features22(n=10)
    exit()
    qq = classifier.most_informative_features(10)
    print qq
    classifier.probdist(test_set)
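
Examples #2 and #3 call a document_features(d, word_features) helper that is not shown here. A minimal sketch in the spirit of the NLTK book's bag-of-words features, matching the {'bolus': False, 'magnetic': False, ...} shape visible in the comments above (the exact original implementation is assumed, not reproduced):

def document_features(document, word_features):
    # Map each candidate feature word to True/False depending on whether it
    # appears in the tokenized document.
    document_words = set(document)
    return dict((word, word in document_words) for word in word_features)
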
Example #3
def fNaiveBayesTraining(numFold=10):
    wordFeatureDivideNumStart10times = 4 # default = 3
    wordFeatureDivideNumStop10times = 5 # default = 10, not included
    wordFeatureDivideNumStep10times = 1 # default =10
    
#    listMyType = ['stp-', 'wnl-', 'ptr-']
    #typeTextPreprocess = ''
    #typeTextPreprocess = 'stp-'
    typeTextPreprocess = 'wnl-'
    #typeTextPreprocess = 'ptr-'
    
    
    myRe = '((^Title: |^Abstract: )(.*))'
    p = re.compile(myRe)
    
    #listFilesInputPair = ['pure-doc-dx.txt', 'pure-doc-tx.txt']
    #listFilesInputPair = ['intervention.txt', 'patient.txt', 'outcome.txt']
    
    #for uc in xuniqueCombinations(['l','o','v','e'],2): print ''.join(uc)
    
    
    
    #listFilesInputCombinations = [ [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'patient.txt']
    #                  ,[typeTextPreprocess+'intervention.txt', typeTextPreprocess+'outcome.txt']
    #                  ,[typeTextPreprocess+'patient.txt', typeTextPreprocess+'outcome.txt']
    #                  ]
    #listFilesInputPair = [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'patient.txt']
    #listFilesInputPair = [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'outcome.txt']
    #listFilesInputPair = [typeTextPreprocess+'patient.txt', typeTextPreprocess+'outcome.txt']
    
    
    dirMain = ''
    dirInputTrainingSet = 'Output2_TrainingSet/'
    dirInputTestingSet = 'Output2_TestingSet/'
    dirOutput_accuracy = 'Output3_accuracy/'
    #dirInputTrainingSet = 'Output2Train/'

    dirCwd = os.getcwd()+'/'
    
#    dirOutput_accuracy
    if os.path.isdir(dirCwd+dirOutput_accuracy):
        try:
        #            shutil.rmtree(LDASubDataDir, ignore_errors, onerror)
            shutil.rmtree(dirCwd+dirOutput_accuracy)
        except:
            raise
    os.mkdir(dirCwd+dirOutput_accuracy)

    with open(dirMain+dirOutput_accuracy+typeTextPreprocess+'-accuracy.csv', 'a') as outfAccuracy:
        myAccruacyData = 'ratioWordFeature,' + 'listFilesInputPair,' + 'idxCrossValidation,' + 'accuracy\n'
        outfAccuracy.write(myAccruacyData)

        print 'myAccruacyData: ', myAccruacyData
    

    
    
    featuresetsTrain = []
#    wordFeatureRatio10times = 0.1
    for wordFeatureRatio10times in range(wordFeatureDivideNumStart10times,wordFeatureDivideNumStop10times, wordFeatureDivideNumStep10times):
#        print 'wordFeatureRatio10times: ', wordFeatureRatio10times
        ratioWordFeature = wordFeatureRatio10times/10.0
        print 'ratioWordFeature: ', ratioWordFeature
#        continue
        
        dirOutput = 'Output3_Divide'+str(ratioWordFeature)+'/'
        
        #for typeTextPreprocess in listMyType:
        if os.path.isdir(dirCwd+dirOutput):
            try:
        #            shutil.rmtree(LDASubDataDir, ignore_errors, onerror)
                shutil.rmtree(dirCwd+dirOutput)
            except:
                raise
        os.mkdir(dirCwd+dirOutput)
        
        
        for idxCrossValidation in range(0,numFold):
#            print idxCrossValidation
            
#            listFilesInputFilenameStem = [typeTextPreprocess+str(idxCrossValidation)+'-int-Train-.csv', typeTextPreprocess+str(idxCrossValidation)+'-out-Train-.csv', typeTextPreprocess+str(idxCrossValidation)+'-pat-Train-.csv']
#            listFilesInputFilenameStem = [typeTextPreprocess+str(idxCrossValidation)+'-int', typeTextPreprocess+str(idxCrossValidation)+'-out', typeTextPreprocess+str(idxCrossValidation)+'-pat']
#            listFilesInputFilenameStem = [str(idxCrossValidation)+'-int', str(idxCrossValidation)+'-out', str(idxCrossValidation)+'-pat']
            listFilesInputFilenameStem = ['int', 'out', 'pat']
        #    listFilesInputTrain = ['stp-'+str(idxCrossValidation)+'-Per-int-Test-.txt', 'stp-'+str(idxCrossValidation)+'-Per-out-Test-.txt', 'stp-'+str(idxCrossValidation)+'-Per-pat-Test-.txt']
#            listFilesInputTest = [typeTextPreprocess+str(idxCrossValidation)+'-int-Test-.csv', typeTextPreprocess+str(idxCrossValidation)+'-out-Test-.csv', typeTextPreprocess+str(idxCrossValidation)+'-pat-Test-.csv']
#            listFilesInputTrain = [typeTextPreprocess+str(idxCrossValidation)+'-int-Train-.csv', typeTextPreprocess+str(idxCrossValidation)+'-out-Train-.csv', typeTextPreprocess+str(idxCrossValidation)+'-pat-Train-.csv']
        
        #exit()
            
        #    listFilesInputTrain = ['stp-0-Per-int-Test-.txt', 'stp-0-Per-out-Test-.txt', 'stp-0-Per-pat-Test-.txt']
#            print "Unique Combinations of 2 letters from :",listFilesInputTrain
            
            listFilesInputCombinations = []
#            for uc in xpermutations.xuniqueCombinations(listFilesInputTrain, 2):
            for uc in xpermutations.xuniqueCombinations(listFilesInputFilenameStem, 2):
                listFilesInputCombinations.append(uc)
#                print type(uc), ' '.join(uc)
#            exit()
            
            for listFilesInputPair in listFilesInputCombinations:    
                listDocTrain = []
                listDocTest = []
                
                listMyWordsTrain = []
                listMyWordsTest = []

                outputPercentageFilenameMiddle = 'Per'
                
                for fileOne in listFilesInputPair:
                    outputFileNameDiff = fileOne[0:3]
                    print 'fileOne: ', fileOne, 'outputFileNameDiff: ', outputFileNameDiff
#                    exit()
                    
            #        outputFileNameDiff = fileOne[0:3]
            #        print 'outputFileNameDiff: ', outputFileNameDiff
                #    exit()
                    outputPercentageFilenameMiddle = outputPercentageFilenameMiddle + '-'+ outputFileNameDiff
        #            fileOneTrain= dirMain+dirInputTrainingSet+typeTextPreprocess+fileOne
#                    fileOneTrain= dirMain+dirInputTrainingSet+fileOne+'-Train-.csv'

                    fileOneTrain= dirMain+dirInputTrainingSet+typeTextPreprocess+str(idxCrossValidation)+'-'+fileOne+'-Train-.csv'
                    print 'fileOneTrain: ', fileOneTrain
#                    exit()
                    with open(fileOneTrain) as fTxtOrgTrain:
                        listDocOrgTrain = fTxtOrgTrain.readlines()
#                    print 'len(listDocOrgTrain): ', len(listDocOrgTrain), listDocOrgTrain
                    
                    fileOneTest= dirMain+dirInputTestingSet+typeTextPreprocess+str(idxCrossValidation)+'-'+fileOne+'-Test-.csv'
                    print 'fileOneTest: ', fileOneTest
                    with open(fileOneTest) as fTxtOrgTest:
                        listDocOrgTest = fTxtOrgTest.readlines()
#                    print 'len(listDocOrgText): ', len(listDocOrgTrain), listDocOrgTrain
                    
                #    exit()
                #    with open(dirMain+'output'+typeTextPreprocess+fileOne[8:11]+'.csv', 'wb') as outf:
                #    with open(dirMain+typeTextPreprocess+'output-'+outputFileNameDiff+'.csv', 'wb') as outf:
            
            
                    
            
                    for rowOfListDocOrgTrain in listDocOrgTrain:
        #                print 'rowOfListDocOrgTrain: ', rowOfListDocOrgTrain
            #            myResult = p.search(rowOfListDocOrgTrain)
            #            if myResult <> None:
            #                myData = re.sub('^Title: |^Abstract: ','',myResult.group())
                        listMyWordsTrain.extend(rowOfListDocOrgTrain.split())
            #                listDocTrain.append((myData.split(),fileOne[9:11]))
#                        print '(rowOfListDocOrgTrain.split(),outputFileNameDiff): ', (outputFileNameDiff, rowOfListDocOrgTrain.split())
                        listDocTrain.append((rowOfListDocOrgTrain.split(), outputFileNameDiff))
#               (for fileOne in listFilesInputPair:) END

                    for rowOfListDocOrgTest in listDocOrgTest:
                        listMyWordsTest.extend(rowOfListDocOrgTest.split())
                        listDocTest.append((rowOfListDocOrgTest.split(), outputFileNameDiff))
                    


#                print 'type(listDocTrain): ', type(listDocTrain)
#                print 'listDocTrain[0]: ', listDocTrain[0]
#                print 'listDocTrain[1]: ', listDocTrain[1]
#                random.shuffle(listDocTrain)
#                print 'len(listMyWordsTrain): ', len(listMyWordsTrain)
            #    exit()
                
                allWordsTrain = nltk.FreqDist(listMyWordsTrain)
#                allWordsTest = nltk.FreqDist(listMyWordsTest)
#                print 'len(allWordsTrain): ', len(allWordsTrain)
                #print 'type(allWordsTrain): ', type(allWordsTrain), len(allWordsTrain)
        #        word_features_Train = allWordsTrain.keys()[:len(allWordsTrain)/10]
                word_features_Train = allWordsTrain.keys()[:int(len(allWordsTrain)*ratioWordFeature)]
#                word_features_Test = allWordsTrain.keys()[:len(allWordsTest)]
#                print 'word_features_Train: ', len(word_features_Train), type(word_features_Train), word_features_Train
                #exit()
                
                
                
            #    favorDiagnostic = ['intervention', 'risk', 'therapy', 'disease', 'participants', 'effects', 'subjects', 'patient', 'response', 'outcomes', 'events','outcome', 'findings', 'performance', 'statistically', 'evaluation', 'population']
            #    print '\ndocument_features(favorDiagnostic): ', document_features(favorDiagnostic)
                
                
                featuresetsTrain = [(document_features(d, word_features_Train), c) for (d,c) in listDocTrain]
                featuresetsText = [(document_features(d, word_features_Train), c) for (d,c) in listDocTest]
                print 'sys.getsizeof(featuresetsTrain): ', sys.getsizeof(featuresetsTrain), 'ratioWordFeature: ', ratioWordFeature
                print 'sys.getsizeof(featuresetsText): ', sys.getsizeof(featuresetsText), 'ratioWordFeature: ', ratioWordFeature
#                print '\nfeaturesets: ', len(featuresetsTrain), featuresetsTrain
#                continue
                # featuresetsTrain(1/3):  360 [({'bolus': False, 'magnetic': False, 'colonoscopy': False ... }, 'int')
                # featuresetsTrain(1/2):  360 [({'bolus': False, 'ali': False, 'caused': False, 'magnetic': False ... }, 'int')
#                exit()
#                sizeTest = len(listDocTrain)/numFold
#                print '\nlen(listDocTrain): ', len(listDocTrain), '\nsizeTraining:', len(listDocTrain)-sizeTest,'\nsizeTesting: ', sizeTest
                
#                train_set, test_set = featuresetsTrain[sizeTest:], featuresetsTrain[:sizeTest]
#                classifier = nltk.NaiveBayesClassifier.train(train_set)
                classifier = nltk.NaiveBayesClassifier.train(featuresetsTrain)

#                with open(dirMain+dirOutputMergeFile+typeTextPreprocess+'-'+str(idxCrossValidation)+'-Train-'+'.csv', 'a') as outfFullTrain:
                with open(dirMain+dirOutput_accuracy+typeTextPreprocess+'-accuracy.csv', 'a') as outfAccuracy:
#                    myAccruacyData = 'ratioWordFeature,' + str(ratioWordFeature) +','+ '-'.join(listFilesInputPair) + ',idxCrossValidation,' + str(idxCrossValidation)+',accuracy,' + str(nltk.classify.accuracy(classifier, featuresetsText)) +'\n'
                    myAccruacyData = str(ratioWordFeature) +','+ '-'.join(listFilesInputPair) +','+ str(idxCrossValidation) +','+ str(nltk.classify.accuracy(classifier, featuresetsText)) +'\n'
                    print 'myAccruacyData: ', myAccruacyData
                    outfAccuracy.write(myAccruacyData)
#                    exit()
#                    outfAccuracy.write(myAccruacyData)
#                    outfAccuracy.write()
                    print myAccruacyData
                
                
                cpdist = classifier._feature_probdist
#                print 'classifier.most_informative_features(10):', classifier.most_informative_features(10)
                
        #        print dirMain+dirOutput+str(idxCrossValidation)+outputPercentageFilenameMiddle+'.csv'
        #        exit()
                with open(dirMain+dirOutput+typeTextPreprocess+str(idxCrossValidation)+'-'+outputPercentageFilenameMiddle+'.csv', 'wb') as outf:
                    outcsv = csv.writer(outf)
                    for fname, fval in classifier.most_informative_features(len(word_features_Train)):
                        def labelprob(l):
                            return cpdist[l,fname].prob(fval)
                        
                        labels = sorted([l for l in classifier._labels if 
                                fval in cpdist[l,fname].samples()], 
                            key=labelprob)
                        if len(labels) == 1:
                            continue
                        l0 = labels[0]
                        l1 = labels[-1]
                        if cpdist[l0,fname].prob(fval) == 0:
                            ratio = 'INF'
                        else:
                            ratio = '%8.1f' % (cpdist[l1,fname].prob(fval) / cpdist[l0,fname].prob(fval))
                    
                        if cpdist[l0,fname].prob(fval) == 0:
                            ratio1 = 'INF'
                        else:
                    #        ratio = '%8.1f' % (cpdist[l1,fname].prob(fval) / cpdist[l0,fname].prob(fval))
                            ratio1 = '%8.2f' % (cpdist[l1,fname].prob(fval) / (cpdist[l1,fname].prob(fval)+cpdist[l0,fname].prob(fval)))
                    
                        if cpdist[l0,fname].prob(fval) == 0:
                            ratio2 = 'INF'
                        else:
                            ratio2 = '%8.2f' % ( cpdist[l0,fname].prob(fval) / (cpdist[l1,fname].prob(fval) + cpdist[l0,fname].prob(fval)))
                    
                    #    print '%24s = %-14r %6s : %-6s = %s : 1.0' % (fname, fval, l1[:6], l0[:6], ratio)
#                        print '%24s = %-14r %6s : %-6s = %s : 1.0 : %s : %s' % (fname, fval, l1[:6], l0[:6], ratio, ratio1, ratio2)
                #        outf.write(fname, fval, l1[:6], l0[:6], ratio, ratio1, ratio2)
                #        outf.write(fname, fval)
                        outcsv.writerow((fname, fval, l1[:6], l0[:6], ratio, '1', ratio1, ratio2))
#                print 'listFilesInputCombinations: ', listFilesInputCombinations

        print 'len(listDocOrgTrain): ', len(listDocOrgTrain)
        
        
        
    exit()
    
    
    exit()
    #0.81
    classifier.show_most_informative_features(n=10)
    def show_most_informative_features22(self, n=10):
        # Determine the most relevant features, and display them.
        
        cpdist = self._feature_probdist
        print 'Most Informative Features'
        
        for fname, fval in self.most_informative_features(n):
            def labelprob(l):
                return cpdist[l,fname].prob(fval)
            
            labels = sorted([l for l in self._labels if 
                    fval in cpdist[l,fname].samples()], 
                key=labelprob)
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0,fname].prob(fval) == 0:
                ratio = 'INF'
            else:
                ratio = '%8.1f' % (cpdist[l1,fname].prob(fval) / 
                    cpdist[l0,fname].prob(fval))
            print '%24s = %-14r %6s : %-6s = %s : 1.0' % (fname, fval, l1[:6], l0[:6], ratio)
    
    show_most_informative_features22(n=10)
    exit()
    qq = classifier.most_informative_features(10)
    print qq
    classifier.probdist(test_set)
def fNaiveBayesTraining(numFold=10):
    global global_list_Word_features
    global global_ratioWordFeature
    global global_dirOutput
    global global_listFilesInputPair
    
    
    
    myRe = '((^Title: |^Abstract: )(.*))'
    p = re.compile(myRe)

    listFilesInputFilenameStemTmp = os.listdir(dirMain + dirInput)
    for itemOflistFilesInputFilenameStemTmp in listFilesInputFilenameStemTmp:
        listFilesInputFilenameStem.append(itemOflistFilesInputFilenameStemTmp[0:-4])



    
    #global_listFilesInputPair = ['pure-doc-dx.txt', 'pure-doc-tx.txt']
    #global_listFilesInputPair = ['intervention.txt', 'patient.txt', 'outcome.txt']
    
    #for uc in xuniqueCombinations(['l','o','v','e'],2): print ''.join(uc)
    
    
    
    #listFilesInputCombinations = [ [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'patient.txt']
    #                  ,[typeTextPreprocess+'intervention.txt', typeTextPreprocess+'outcome.txt']
    #                  ,[typeTextPreprocess+'patient.txt', typeTextPreprocess+'outcome.txt']
    #                  ]
    #global_listFilesInputPair = [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'patient.txt']
    #global_listFilesInputPair = [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'outcome.txt']
    #global_listFilesInputPair = [typeTextPreprocess+'patient.txt', typeTextPreprocess+'outcome.txt']
    
    
    
#    dirOutput_accuracy
    logging.info('dirMain+dirOutput_accuracy = ' + dirMain+dirOutput_accuracy)
    if os.path.isdir(dirMain+dirOutput_accuracy):
        try:
        #            shutil.rmtree(LDASubDataDir, ignore_errors, onerror)
            shutil.rmtree(dirMain+dirOutput_accuracy)
        except:
            raise
    os.mkdir(dirMain+dirOutput_accuracy)    

    with open(dirMain+dirOutput_accuracy+typeTextPreprocess+'-accuracy.csv', 'a') as outfAccuracy:
        myAccruacyData = 'global_ratioWordFeature,' + 'global_listFilesInputPair,' + 'idxCrossValidation,' + 'accuracy\n'
        outfAccuracy.write(myAccruacyData)

#        print 'myAccruacyData: ', myAccruacyData
        logging.debug('myAccruacyData: ' + myAccruacyData)
        
        
    with open(dirMain+dirOutput_accuracy+typeTextPreprocess+'-pmid.csv', 'wb') as outfPmid:
        outfPmid.write('First line:\n')

    
    with open(dirMain+dirOutput_accuracy+typeTextPreprocess+'-PreRecFmea.csv', 'a') as outfPreRecFmea:
#        myPreRecFmeaData = str(posPrecision) +','+ str(posRecall) +','+ str(posRmeasure) +','+ str(negPrecision) +','+ str(negRecall) +','+ str(negFmeasure) +'\n'
#        myPreRecFmeaData = 'global_ratioWordFeature,' + 'global_listFilesInputPair,' + 'idxCrossValidation,' + 'posPrecision,'+ 'posRecall,'+ 'posRmeasure,'+ 'negPrecision,'+ 'negRecall,'+ 'negFmeasure\n'
        myPreRecFmeaData = 'global_ratioWordFeature,' + 'global_listFilesInputPair,' + 'idxCrossValidation,' + 'testType,'+ 'testValue\n'
        outfPreRecFmea.write(myPreRecFmeaData)


#    myCsv = csv.writer(open(dirMain+dirOutput_accuracy+typeTextPreprocess+'-PreRecFmea1.csv', 'a'), dialect='excel')
    myCsv = csv.writer(open(dirMain+dirOutput_accuracy+typeTextPreprocess+'-PreRecFmea1.csv', 'wb'), dialect='excel')
#    myListTmp = [global_ratioWordFeature, global_listFilesInputPair[0][0], idxCrossValidation , 'posPrecision', posPrecision]
    myListTmp = ['global_ratioWordFeature1', 'global_listFilesInputPair', 'idxCrossValidation', 'testType', 'testValue']
    myCsv.writerow(myListTmp)

    
#    wordFeatureRatio10times = 0.1
    for wordFeatureRatio10times in range(wordFeatureRatioStart10times,wordFeatureRatioStop10times, wordFeatureRatioStep10times):
#        print 'wordFeatureRatio10times: ', wordFeatureRatio10times
        global_ratioWordFeature = wordFeatureRatio10times/100.0
#        print 'global_ratioWordFeature: ', global_ratioWordFeature
        logging.info('global_ratioWordFeature: ' + str(global_ratioWordFeature))
        
#        continue
        
        global_dirOutput = 'Output3_Divide'+str(global_ratioWordFeature)+'/'
        
        #for typeTextPreprocess in listMyType:
        if os.path.isdir(dirMain+global_dirOutput):
            try:
        #            shutil.rmtree(LDASubDataDir, ignore_errors, onerror)
                shutil.rmtree(dirMain+global_dirOutput)
            except:
                raise
        os.mkdir(dirMain+global_dirOutput)

# ================================================================================================================
        for idxCrossValidation in range(0,numFold):
          
            listFilesInputCombinations = []
        #            for uc in xpermutations.xuniqueCombinations(listFilesInputTrain, 2):
#            for uc in xpermutations.xuniqueCombinations(listFilesInputFilenameStem, 2):
#                listFilesInputCombinations.append(uc)
        #                print type(uc), ' '.join(uc)
        #            exit()


#            flagComplements = False
#            flagComplements = True
            if flagComplements:
                for uc in xpermutations.xuniqueCombinations(listFilesInputFilenameStem, 1):
#                    listFilesInputCombinations.append([uc, list(set(listFilesInputFilenameStem).difference(uc))])
                    listRemoveUc = list(listFilesInputFilenameStem)
                    listRemoveUc.remove(uc[0])
#                    print listRemoveUc
                    
                    logging.debug([uc, listRemoveUc])
                    listFilesInputCombinations.append([uc, listRemoveUc])
                    

#                    listFilesInputCombinations.append([uc, list(listFilesInputFilenameStem).remove('intervention')])
#                    print 'uc: ', type(uc), ' '.join(uc)                    
                    
#                    print 'uc if flagComplements: ', type(uc), ' '.join(uc), ' ','set(listFilesInputFilenameStem).difference(uc): ', set(listFilesInputFilenameStem).difference(uc)
#                    logging.debug(['uc = ', uc, 'set(listFilesInputFilenameStem).difference(uc): ', set(listFilesInputFilenameStem).difference(uc)])
#                print 'listFilesInputCombinations: ', type(listFilesInputCombinations), listFilesInputCombinations
#                logging.debug(['idxCrossValidation = ', str(idxCrossValidation), 'listFilesInputCombinations', listFilesInputCombinations])
            else:
            #            for uc in xpermutations.xuniqueCombinations(listFilesInputTrain, 2):
                for uc in xpermutations.xuniqueCombinations(listFilesInputFilenameStem, 2):
#                    listFilesInputCombinations.append(uc)
                    listFilesInputCombinations.append([[uc[0]], [uc[1]]])
#                    print 'uc if flagComplements else: ', type(uc), uc
                    logging.debug(['uc if flagComplements else: ', type(uc), uc])
                print '\nlistFilesInputCombinations: ', type(listFilesInputCombinations), listFilesInputCombinations
#                exit()
#                for uc in xpermutations.xuniqueCombinations(listFilesInputFilenameStem, 2):
#            exit()            
#            print "listFilesInputCombinations = ", listFilesInputCombinations
#            listFilesInputCombinations = []

            
            
            
            for global_listFilesInputPair in listFilesInputCombinations:    
#                logging.info(['global_listFilesInputPair = ', global_listFilesInputPair])
                p = Process(target=fSubprocess, args=(idxCrossValidation,))
                p.start()
                #dicCorpus = parent_conn.recv()
                #print parent_conn.recv()   # prints "[42, None, 'hello']"
                p.join()
        with open(dirMain+dirOutput+outputPercentageFilenameBase+'-'+outputFileNameDiff+'-Train-'+'.txt', 'wb') as outf2:
            for oneRowOfListTrainWithDiff in listTrainWithDiff:
#                listAllDocWords.extend(oneRowOfListTrainWithDiff[0])
                outf2.write(' '.join(oneRowOfListTrainWithDiff[0])+'\n')

        with open(dirMain+dirOutput+outputPercentageFilenameBase+'-'+outputFileNameDiff+'-Test-'+'.txt', 'wb') as outf3:
            for oneRowOflistValidationWithDiff in listValidationWithDiff:
                print 'oneRowOflistValidationWithDiff: ', oneRowOflistValidationWithDiff
                outf3.write(' '.join(oneRowOflistValidationWithDiff[0])+'\n')

        idxCrossValidation = idxCrossValidation + 1 

exit()

listFilesInputCombinations = []
for uc in xpermutations.xuniqueCombinations(listFilesInput,2):
    listFilesInputCombinations.append(uc)
#    print ' '.join(uc)
print 'listFilesInputCombinations: ', listFilesInputCombinations
#exit()

for filesnamePair in listFilesInputCombinations:    
    listDoc2filesWithDiff = []
    listWordsOf2files = []
    
#    outputPercentageFilenameBase = typeTextPreprocess+'Per'
#    outputPercentageFilenameBase = typeTextPreprocess+str(idxCrossValidation)+'-Per'

    
    for oneFilename in filesnamePair:
        outputFileNameDiff = oneFilename[0:3]
def fNaiveBayesTraining(numFold=10):
    global global_list_Word_features
    global ratioWordFeature
    global dirOutput
    global listFilesInputPair
    
    
    
    myRe = '((^Title: |^Abstract: )(.*))'
    p = re.compile(myRe)
    
    #listFilesInputPair = ['pure-doc-dx.txt', 'pure-doc-tx.txt']
    #listFilesInputPair = ['intervention.txt', 'patient.txt', 'outcome.txt']
    
    #for uc in xuniqueCombinations(['l','o','v','e'],2): print ''.join(uc)
    
    
    
    #listFilesInputCombinations = [ [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'patient.txt']
    #                  ,[typeTextPreprocess+'intervention.txt', typeTextPreprocess+'outcome.txt']
    #                  ,[typeTextPreprocess+'patient.txt', typeTextPreprocess+'outcome.txt']
    #                  ]
    #listFilesInputPair = [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'patient.txt']
    #listFilesInputPair = [typeTextPreprocess+'intervention.txt', typeTextPreprocess+'outcome.txt']
    #listFilesInputPair = [typeTextPreprocess+'patient.txt', typeTextPreprocess+'outcome.txt']
    
    
    
#    dirOutput_accuracy
    if os.path.isdir(dirCwd+dirOutput_accuracy):
        try:
        #            shutil.rmtree(LDASubDataDir, ignore_errors, onerror)
            shutil.rmtree(dirCwd+dirOutput_accuracy)
        except:
            raise
    os.mkdir(dirCwd+dirOutput_accuracy)

    with open(dirMain+dirOutput_accuracy+typeTextPreprocess+'-accuracy.csv', 'a') as outfAccuracy:
        myAccruacyData = 'ratioWordFeature,' + 'listFilesInputPair,' + 'idxCrossValidation,' + 'accuracy\n'
        outfAccuracy.write(myAccruacyData)

        print 'myAccruacyData: ', myAccruacyData
    

    
#    wordFeatureRatio10times = 0.1
    for wordFeatureRatio10times in range(wordFeatureRatioStart10times,wordFeatureRatioStop10times, wordFeatureRatioStep10times):
#        print 'wordFeatureRatio10times: ', wordFeatureRatio10times
        ratioWordFeature = wordFeatureRatio10times/100.0
        print 'ratioWordFeature: ', ratioWordFeature
#        continue
        
        dirOutput = 'Output3_Divide'+str(ratioWordFeature)+'/'
        
        #for typeTextPreprocess in listMyType:
        if os.path.isdir(dirCwd+dirOutput):
            try:
        #            shutil.rmtree(LDASubDataDir, ignore_errors, onerror)
                shutil.rmtree(dirCwd+dirOutput)
            except:
                raise
        os.mkdir(dirCwd+dirOutput)

# ================================================================================================================
        for idxCrossValidation in range(0,numFold):
          
            listFilesInputCombinations = []
        #            for uc in xpermutations.xuniqueCombinations(listFilesInputTrain, 2):
            for uc in xpermutations.xuniqueCombinations(listFilesInputFilenameStem, 2):
                listFilesInputCombinations.append(uc)
        #                print type(uc), ' '.join(uc)
        #            exit()
            
            for listFilesInputPair in listFilesInputCombinations:    
            
                p = Process(target=fSubprocess, args=(idxCrossValidation,))
                p.start()
                #dicCorpus = parent_conn.recv()
                #print parent_conn.recv()   # prints "[42, None, 'hello']"
                p.join()
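
The last two variants above hand each (fold, file-pair) run to a worker via multiprocessing.Process(target=fSubprocess, ...). fSubprocess itself is not included; a minimal sketch of the pattern, with a hypothetical worker body standing in for the real training code:

from multiprocessing import Process

def fSubprocess(idxCrossValidation):
    # Hypothetical stand-in for the real worker: in the examples above it reads
    # the module-level globals (file pair, feature ratio, output dir) set by
    # fNaiveBayesTraining and runs one training/evaluation pass for that fold.
    print('training fold %d' % idxCrossValidation)

if __name__ == '__main__':
    for idxCrossValidation in range(10):
        # One worker per fold, run sequentially (start() followed directly by join()).
        p = Process(target=fSubprocess, args=(idxCrossValidation,))
        p.start()
        p.join()
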
Example #7
def main():
    """
    Compute row entropy (H_MON) from exemplar matrices as described in:
    
        Dominik Schlechtweg, Stefanie Eckmann, Enrico Santus, Sabine Schulte im Walde and Daniel Hole. 
            2017. German in Flux: Detecting Metaphoric Change via Word Entropy. In Proceedings of 
            CoNLL 2017. Vancouver, Canada.
    """

    # Get the arguments
    args = docopt("""Compute row entropy (H_MON) from exemplar matrices.

    Usage:
        H_MON.py <testset_file> <exemplar_matrix> <time_boundary_1-time_boundary_2> <window_size> <exemplar_number> <vector_number> <output_dir>
        
    Arguments:
        <testset_file> = a file containing term-pairs corresponding to developments with their
                           starting points and the type of the development
        <exemplar_matrix> = the pkl file of the source exemplar matrix
        <time_boundary_1-time_boundary_2> = the time boundaries to slice up the corpus
        <window_size> = size of sliding time interval
        <exemplar_number> = number of exemplars to construct one target vector
        <vector_number> = number of target vectors to average over
        <output_dir> = where to save the results
        
    """)

    matrice1_pkl = args['<exemplar_matrix>']
    testset_file = args['<testset_file>']
    output_dir = args['<output_dir>']
    time_boundaries_str = args['<time_boundary_1-time_boundary_2>']
    global_lower_bound, global_upper_bound = int(
        time_boundaries_str.split("-")[0]), int(
            time_boundaries_str.split("-")[1])
    window_size = int(args['<window_size>'])
    N = int(args['<exemplar_number>'])
    V = int(args['<vector_number>'])
    join_sign = ':'
    non_values = [-999.0, -888.0]

    # Generate cut-points
    cut_points = [i for i in range(global_lower_bound, global_upper_bound)]

    # Load the term-pairs
    targets, test_set = load_test_pairs(testset_file)

    # Process matrice

    matrice1_name = os.path.splitext(basename(matrice1_pkl))[0]
    matrice1_folder = os.path.dirname(matrice1_pkl) + "/"

    # Load the space from a .pkl file
    exemplar_space, _, vocab_map1, vocab_size1, column_map1, id2column_map1 = get_space(
        matrice1_folder, matrice1_name, False, False, False)

    for cut_point in cut_points:

        print cut_point

        target_values = {}

        current_lower_bound, current_upper_bound = cut_point - window_size, cut_point + window_size

        for target in targets:

            print target

            values = []

            exem_list = [(exem.split(join_sign)[0], exem.split(join_sign)[1],
                          exem.split(join_sign)[2], exem.split(join_sign)[3])
                         for exem in vocab_map1 if exem.split(join_sign)[0] +
                         join_sign + exem.split(join_sign)[1] == target]
            exem_dict = dict([((lemma, pos, int(date), int(identifier)),
                               int(date))
                              for (lemma, pos, date, identifier) in exem_list])

            # Get contexts in window
            window = [(lemma, pos, date, identifier)
                      for (lemma, pos, date, identifier) in exem_dict
                      if current_lower_bound <= date <= current_upper_bound]
            print 'Window size is: %d' % (len(window))
            random.shuffle(window)

            # Get combinations of exemplars of size N
            exem_combos = xuniqueCombinations(window, N)

            for i, combo in enumerate(exem_combos):

                if i >= V:
                    break

                print 'Calculating combination %d of %d...' % (i, V)

                # Initialize sparse matrix
                sparse_mat_dict = defaultdict(lambda: defaultdict(lambda: 0))

                cols = []
                for (lemma, pos, date, identifier) in combo:

                    exem_tar = join_sign.join(
                        (lemma, pos, str(date), str(identifier)))

                    row = exemplar_space.get_row(exem_tar)

                    data = row.get_mat().data
                    indices = row.get_mat().indices

                    for i, key in enumerate(data):

                        cxt = id2column_map1[indices[i]]
                        cols.append(cxt)
                        sparse_mat_dict[target][
                            cxt] = sparse_mat_dict[target][cxt] + key

                # Bring to sparse matrix format
                rows = set([target])
                cols = set(cols)
                sparse_mat_tup = [(key, context, sparse_mat_dict[key][context])
                                  for key in sparse_mat_dict
                                  for context in sparse_mat_dict[key]]

                [id2row], [row2id] = extract_indexing_structs_mod(rows, [0])
                [id2column], [column2id
                              ] = extract_indexing_structs_mod(cols, [0])
                sparse_mat = read_sparse_space_data_mod(
                    sparse_mat_tup, row2id, column2id)

                # Create a space from co-occurrence counts in sparse format
                sparse_space = Space(sparse_mat, id2row, id2column, row2id,
                                     column2id)
                sparse_space.__class__ = Space_extension

                # Get all row entropies
                r_entropies_dict1, r_entr_ranked1 = get_r_entropies(
                    row2id, sparse_space, None, output_dir, True)

                values.append(r_entropies_dict1[target])

            # Computing mean of values
            print 'Computing mean of values...'
            print 'Averaging over %d values' % (len(values))
            try:
                target_values[target] = mean(values)
            except StatisticsError:
                target_values[target] = non_values[1]

        # Make output
        unscored_output = []
        for (x, y, label, relation) in test_set:
            unscored_output.append((x, y, label, relation, target_values[x]))

        # Save results
        save_results(
            unscored_output,
            output_dir + 'row_entropy_exem_' + str(current_lower_bound) + '-' +
            str(current_upper_bound) + '_' + str(N) + 's' + '.txt')
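
The quantity produced by get_r_entropies is the row entropy of a target's context counts (H_MON in the cited paper). A minimal sketch of that computation on a plain count dictionary, assuming the standard Shannon entropy over the normalized row (the helper name and sample data are illustrative only):

import math

def row_entropy(context_counts):
    # Shannon entropy H = -sum(p * log2 p) over the normalized context
    # distribution of one target row; zero counts contribute nothing.
    total = float(sum(context_counts.values()))
    h = 0.0
    for count in context_counts.values():
        if count > 0:
            p = count / total
            h -= p * math.log(p, 2)
    return h

print(row_entropy({'ctx_a': 3, 'ctx_b': 1}))  # ~0.811 bits
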
Example #8
def get_voronoi_and_salience(poff_filename):
    points = poff_reader(poff_filename)

    # Convex hull
    convex_hull = convexHull(points)
    convex_hull_idx = map(lambda p: points.index(p), convex_hull)
    convex_hull_edges = get_edges(convex_hull_idx)

    # Get boundary edges
    points_idx = range(0, len(points))
    boundary_edges = get_edges(points_idx)

    # Add one dimension
    o_points = map(lambda x: x + (0,), points)

    # Add a virtual infinite vertex
    #inf_vertex = (0, 0, float('inf'))
    inf_vertex = (0, 0, 9999999)
    points = o_points + [inf_vertex]
    inf_idx = points.index(inf_vertex)

    # Get triangles (in 3D)
    t_obj = Triangulation(points)
    tetrahedrons = t_obj.get_elements_indices()

    triangles = filter(lambda x: inf_idx not in x,
                       set([tuple(sorted(i)) for t in tetrahedrons
                            for i in xuniqueCombinations(t, 3)]))

    # Calculate neighbours table
    neighbours = get_neighbours(triangles)

    # Classify all infinite Delaunay faces as EXTERIOR and push them
    # to a queue
    side_table = {}
    q = Queue.Queue()
    for face in triangles:
        edges = set([tuple(sorted(e)) for e in xuniqueCombinations(face, 2)])
        if edges & (convex_hull_edges - boundary_edges):
            side_table[face] = EXTERIOR
            q.put(face)
            continue
        if edges & (boundary_edges & convex_hull_edges):
            side_table[face] = INTERIOR
            q.put(face)
            continue
    # Collect all EXTERIOR triangles
    o_triangles_idx = filter(lambda x: side_table[x] is EXTERIOR, side_table.keys())
    o_triangles = idx_to_real(o_triangles_idx, o_points)

    # Mark every triangle as EXTERIOR or INTERIOR
    while not q.empty():
        face = q.get()
        for nface in neighbours[face]:
            if nface not in side_table:
                nedge = tuple(sorted(list(set(nface) & set(face))))
                assert len(nedge) == 2
                if nedge in boundary_edges:
                    side_table[nface] = not side_table[face]
                else:
                    side_table[nface] = side_table[face]
                q.put(nface)

    # Collect all INTERIOR triangles
    i_triangles_idx = filter(lambda x: side_table[x] is INTERIOR, triangles)
    i_triangles = idx_to_real(i_triangles_idx, o_points)

    # Filter neighbours dictionary to only INTERIOR triangles
    i_neighbours = {}
    for k, d in neighbours.items():
        if side_table[k] is INTERIOR:
            i_neighbours[k] = filter(lambda x: side_table[x] is INTERIOR, d)

    # Calculate areas and circumcenters of triangles
    areas = {}
    circumcenters = {}
    for t_idx, t in zip(i_triangles_idx, i_triangles):
        areas[t_idx] = abs((t[2][0] - t[0][0]) * (t[1][1] - t[0][1]) - \
                           (t[1][0] - t[0][0]) * (t[2][1] - t[0][1])) / 2
        circumcenters[t_idx] = get_circumcenter(t)

    # Find triangles that have two edges on the boundary
    boundary_triangles_idx = set([])
    for t in i_triangles_idx:
        edges = set([tuple(sorted(e)) for e in xuniqueCombinations(t, 2)])
        if len(edges & boundary_edges) == 2:
            boundary_triangles_idx.add(t)

    # traverse all triangles, and calculate the salience value
    voronoi_edges = []
    r = {}
    k = {}
    sum_area = sum(areas.values())
    visited_t = set([])
    for start_t in boundary_triangles_idx:
        q = Queue.Queue()
        q.put(start_t)
        while not q.empty():
            t = q.get()
            visited_t.add(t)
            for new_t in i_neighbours[t]:
                if new_t in visited_t:
                    continue
                nedge = tuple(sorted(list(set(new_t) & set(t))))
                assert len(nedge) == 2
                area_1 = get_any_part_area(nedge, i_neighbours, areas)
                area_2 = sum_area - area_1
                v_edge = tuple(sorted([circumcenters[t], circumcenters[new_t]]))
                if v_edge[0] == v_edge[1]:
                    q.put(new_t)
                    continue
                if v_edge not in voronoi_edges:
                    voronoi_edges.append(v_edge)
                r[v_edge] = min(area_1, area_2) / max(area_1, area_2)
                k[v_edge] = min(area_1, area_2) / length(v_edge)
                q.put(new_t)
        break

    # Return the points, Voronoi skeleton edges, the salience measures r and k, and the interior triangles
    return o_points, voronoi_edges, r, k, i_triangles
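
Examples #1 and #8 are two halves of the same pipeline: get_voronoi_and_salience produces the shape points, Voronoi skeleton edges and the r/k salience maps, and draw renders them. A minimal usage sketch, assuming hypothetical file names shape.poff and shape.png:

# get_voronoi_and_salience and draw are the functions defined in Examples #8 and #1 above.
o, v, r, k, i = get_voronoi_and_salience('shape.poff')
draw(o, v, r, k, i, 'shape.png')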