def draw(o, v, r, k, i, filename):
    o, v, r, k, i = rescale_points(o, v, r, k, i)
    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, WIDTH, HEIGHT)
    ctx = cairo.Context(surface)
    ctx.scale(WIDTH/1.0, HEIGHT/1.0)  # Normalizing the canvas

    # Draw shape boundary
    ctx.set_source_rgb(0.2, 0.2, 0.6)
    ctx.set_line_width(0.005)
    draw_consecutive_lines(ctx, o)
    ctx.stroke()

    # Draw internal delaunay triangulation
    ctx.set_source_rgba(0.3, 0.2, 0.5, 0.6)
    ctx.set_line_width(0.002)
    edges = set([])
    for t in i:
        for edge in xuniqueCombinations(t, 2):
            edges.add(tuple(edge))
    draw_edges(ctx, edges)
    ctx.stroke()

    # Draw voronoi skeleton
    ctx.set_source_rgba(0.5, 0.2, 0.2, 0.8)
    #draw_edges(ctx, v, r)
    draw_edges(ctx, v, k)

    surface.write_to_png(filename)
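# NOTE: rescale_points, draw_consecutive_lines and draw_edges are defined
# elsewhere in this module. A minimal sketch of the two drawing helpers,
# assuming each edge is a pair of (x, y, ...) point tuples already rescaled
# to the unit square and that the optional third argument carries per-edge
# weights (the names and behaviour below are assumptions, not the originals):
def draw_consecutive_lines_sketch(ctx, points):
    # Trace the outline by connecting consecutive points and closing the loop.
    ctx.move_to(points[0][0], points[0][1])
    for p in points[1:]:
        ctx.line_to(p[0], p[1])
    ctx.close_path()

def draw_edges_sketch(ctx, edges, weights=None):
    # Draw each edge as an isolated segment; 'weights' (if given) maps an edge
    # to a salience value that a caller could use to filter or restyle edges.
    for p1, p2 in edges:
        ctx.move_to(p1[0], p1[1])
        ctx.line_to(p2[0], p2[1])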
def fNaiveBayesTraining():
    numFold = 10
    wordFeatureDivideNumStart = 10  # default = 3
    wordFeatureDivideNumStop = 5    # default = 10, not included
    wordFeatureDivideNumStep = -2   # default = 10
    # listMyType = ['stp-', 'wnl-', 'ptr-']
    typeTextPreprocess = 'wnl-'     # other options: '', 'stp-', 'ptr-'
    myRe = '((^Title: |^Abstract: )(.*))'
    p = re.compile(myRe)
    dirMain = ''
    dirInput = 'Output2_TrainingSet/'

    for wordFeatureDivideNum in range(wordFeatureDivideNumStart, wordFeatureDivideNumStop, wordFeatureDivideNumStep):
        dirOutput = 'Output3_Divide' + str(wordFeatureDivideNum) + '/'
        dirCwd = os.getcwd() + '/'
        # Recreate the output directory for this feature-divisor setting
        if os.path.isdir(dirCwd + dirOutput):
            try:
                shutil.rmtree(dirCwd + dirOutput)
            except:
                raise
        os.mkdir(dirCwd + dirOutput)

        for idxCrossValidation in range(0, numFold):
            print idxCrossValidation
            listFilesInput = [typeTextPreprocess + str(idxCrossValidation) + '-int-Train-.csv',
                              typeTextPreprocess + str(idxCrossValidation) + '-out-Train-.csv',
                              typeTextPreprocess + str(idxCrossValidation) + '-pat-Train-.csv']
            print "Unique Combinations of 2 letters from :", listFilesInput

            for fileOne in listFilesInput:
                outputFileNameDiff = fileOne[6:9]
                print 'outputFileNameDiff: ', outputFileNameDiff
                listMyWords = []
                listDoc = []
                filePioTxt = dirCwd + dirInput + fileOne
                with open(filePioTxt) as fTxtOrg:
                    listDocOrg = fTxtOrg.readlines()
                print 'len(listDocOrg): ', len(listDocOrg)
                for rowOfListDocOrg in listDocOrg:
                    # myResult = p.search(rowOfListDocOrg)
                    # if myResult != None:
                    #     myData = re.sub('^Title: |^Abstract: ', '', myResult.group())
                    listMyWords.extend(rowOfListDocOrg.split())
                    print '(rowOfListDocOrg.split(),outputFileNameDiff): ', (outputFileNameDiff, rowOfListDocOrg.split())
                    listDoc.append((rowOfListDocOrg.split(), outputFileNameDiff))

            listFilesInputCombinations = []
            for uc in xpermutations.xuniqueCombinations(listFilesInput, 2):
                listFilesInputCombinations.append(uc)
            print 'listFilesInputCombinations: ', listFilesInputCombinations

            for filesInput in listFilesInputCombinations:
                listDoc = []
                listMyWords = []
                outputPercentageFilenameBase = 'Per'
                for fileOne in filesInput:
                    outputFileNameDiff = fileOne[6:9]
                    print 'outputFileNameDiff: ', outputFileNameDiff
                    outputPercentageFilenameBase = outputPercentageFilenameBase + '-' + outputFileNameDiff
                    filePioTxt = dirMain + dirInput + fileOne
                    with open(filePioTxt) as fTxtOrg:
                        listDocOrg = fTxtOrg.readlines()
                    print 'len(listDocOrg): ', len(listDocOrg)
                    for rowOfListDocOrg in listDocOrg:
                        listMyWords.extend(rowOfListDocOrg.split())
                        print '(rowOfListDocOrg.split(),outputFileNameDiff): ', (outputFileNameDiff, rowOfListDocOrg.split())
                        listDoc.append((rowOfListDocOrg.split(), outputFileNameDiff))

                print 'type(listDoc): ', type(listDoc)
                print 'listDoc[0]: ', listDoc[0]
                print 'listDoc[1]: ', listDoc[1]
                random.shuffle(listDoc)
                print 'len(listMyWords): ', len(listMyWords)

                all_words = nltk.FreqDist(listMyWords)
                print 'len(all_words): ', len(all_words)
                word_features = all_words.keys()[:len(all_words) / wordFeatureDivideNum]
                print 'word_features: ', len(word_features), type(word_features), word_features

                # featuresets example: 360 items like ({'bolus': False, 'magnetic': False, ...}, 'int')
                featuresets = [(document_features(d, word_features), c) for (d, c) in listDoc]

                sizeTest = len(listDoc) / numFold
                print '\nlen(listDoc): ', len(listDoc), '\nsizeTraining:', len(listDoc) - sizeTest, '\nsizeTesting: ', sizeTest
                train_set, test_set = featuresets[sizeTest:], featuresets[:sizeTest]
                classifier = nltk.NaiveBayesClassifier.train(train_set)
                print 'nltk.classify.accuracy(classifier, test_set): ', nltk.classify.accuracy(classifier, test_set), '\n'
                cpdist = classifier._feature_probdist
                print 'classifier.most_informative_features(10):', classifier.most_informative_features(10)

                with open(dirMain + dirOutput + typeTextPreprocess + str(idxCrossValidation) + '-' + outputPercentageFilenameBase + '.csv', 'wb') as outf:
                    outcsv = csv.writer(outf)
                    for fname, fval in classifier.most_informative_features(len(word_features)):
                        def labelprob(l):
                            return cpdist[l, fname].prob(fval)
                        labels = sorted([l for l in classifier._labels
                                         if fval in cpdist[l, fname].samples()], key=labelprob)
                        if len(labels) == 1:
                            continue
                        l0 = labels[0]
                        l1 = labels[-1]
                        if cpdist[l0, fname].prob(fval) == 0:
                            ratio = 'INF'
                        else:
                            ratio = '%8.1f' % (cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval))
                        if cpdist[l0, fname].prob(fval) == 0:
                            ratio1 = 'INF'
                        else:
                            ratio1 = '%8.2f' % (cpdist[l1, fname].prob(fval) / (cpdist[l1, fname].prob(fval) + cpdist[l0, fname].prob(fval)))
                        if cpdist[l0, fname].prob(fval) == 0:
                            ratio2 = 'INF'
                        else:
                            ratio2 = '%8.2f' % (cpdist[l0, fname].prob(fval) / (cpdist[l1, fname].prob(fval) + cpdist[l0, fname].prob(fval)))
                        print '%24s = %-14r %6s : %-6s = %s : 1.0 : %s : %s' % (fname, fval, l1[:6], l0[:6], ratio, ratio1, ratio2)
                        outcsv.writerow((fname, fval, l1[:6], l0[:6], ratio, '1', ratio1, ratio2))
                exit()  # debugging stop: only the first pair of the first fold is processed
            exit()

    # 0.81
    # Unreachable scratch/debug code: everything below runs only if the exit()
    # calls above are removed.
    classifier.show_most_informative_features(n=10)

    def show_most_informative_features22(self, n=10):
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print 'Most Informative Features'
        for fname, fval in self.most_informative_features(n):
            def labelprob(l):
                return cpdist[l, fname].prob(fval)
            labels = sorted([l for l in self._labels
                             if fval in cpdist[l, fname].samples()], key=labelprob)
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = 'INF'
            else:
                ratio = '%8.1f' % (cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval))
            print '%24s = %-14r %6s : %-6s = %s : 1.0' % (fname, fval, l1[:6], l0[:6], ratio)

    show_most_informative_features22(n=10)
    exit()
    qq = classifier.most_informative_features(10)
    print qq
    classifier.probdist(test_set)
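# document_features() is called above but not defined in this section. A
# minimal sketch of what it is assumed to do, following the standard NLTK
# bag-of-words feature pattern (the implementation below is an assumption,
# not the original helper):
def document_features_sketch(document, word_features):
    # Map each candidate feature word to True/False depending on whether it
    # occurs in the tokenised document.
    document_words = set(document)
    features = {}
    for word in word_features:
        features[word] = (word in document_words)
    return features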
def fNaiveBayesTraining(numFold=10):
    wordFeatureDivideNumStart10times = 4  # default = 3
    wordFeatureDivideNumStop10times = 5   # default = 10, not included
    wordFeatureDivideNumStep10times = 1   # default = 10
    # listMyType = ['stp-', 'wnl-', 'ptr-']
    typeTextPreprocess = 'wnl-'           # other options: '', 'stp-', 'ptr-'
    myRe = '((^Title: |^Abstract: )(.*))'
    p = re.compile(myRe)
    dirMain = ''
    dirInputTrainingSet = 'Output2_TrainingSet/'
    dirInputTestingSet = 'Output2_TestingSet/'
    dirOutput_accuracy = 'Output3_accuracy/'
    dirCwd = os.getcwd() + '/'

    # Recreate the accuracy output directory
    if os.path.isdir(dirCwd + dirOutput_accuracy):
        try:
            shutil.rmtree(dirCwd + dirOutput_accuracy)
        except:
            raise
    os.mkdir(dirCwd + dirOutput_accuracy)

    with open(dirMain + dirOutput_accuracy + typeTextPreprocess + '-accuracy.csv', 'a') as outfAccuracy:
        myAccruacyData = 'ratioWordFeature,' + 'listFilesInputPair,' + 'idxCrossValidation,' + 'accuracy\n'
        outfAccuracy.write(myAccruacyData)
        print 'myAccruacyData: ', myAccruacyData

    featuresetsTrain = []
    for wordFeatureRatio10times in range(wordFeatureDivideNumStart10times, wordFeatureDivideNumStop10times, wordFeatureDivideNumStep10times):
        ratioWordFeature = wordFeatureRatio10times / 10.0
        print 'ratioWordFeature: ', ratioWordFeature

        dirOutput = 'Output3_Divide' + str(ratioWordFeature) + '/'
        if os.path.isdir(dirCwd + dirOutput):
            try:
                shutil.rmtree(dirCwd + dirOutput)
            except:
                raise
        os.mkdir(dirCwd + dirOutput)

        for idxCrossValidation in range(0, numFold):
            listFilesInputFilenameStem = ['int', 'out', 'pat']

            listFilesInputCombinations = []
            for uc in xpermutations.xuniqueCombinations(listFilesInputFilenameStem, 2):
                listFilesInputCombinations.append(uc)

            for listFilesInputPair in listFilesInputCombinations:
                listDocTrain = []
                listDocTest = []
                listMyWordsTrain = []
                listMyWordsTest = []
                outputPercentageFilenameMiddle = 'Per'
                for fileOne in listFilesInputPair:
                    outputFileNameDiff = fileOne[0:3]
                    print 'fileOne: ', fileOne, 'outputFileNameDiff: ', outputFileNameDiff
                    outputPercentageFilenameMiddle = outputPercentageFilenameMiddle + '-' + outputFileNameDiff

                    fileOneTrain = dirMain + dirInputTrainingSet + typeTextPreprocess + str(idxCrossValidation) + '-' + fileOne + '-Train-.csv'
                    print 'fileOneTrain: ', fileOneTrain
                    with open(fileOneTrain) as fTxtOrgTrain:
                        listDocOrgTrain = fTxtOrgTrain.readlines()

                    fileOneTest = dirMain + dirInputTestingSet + typeTextPreprocess + str(idxCrossValidation) + '-' + fileOne + '-Test-.csv'
                    print 'fileOneTest: ', fileOneTest
                    with open(fileOneTest) as fTxtOrgTest:
                        listDocOrgTest = fTxtOrgTest.readlines()

                    for rowOfListDocOrgTrain in listDocOrgTrain:
                        listMyWordsTrain.extend(rowOfListDocOrgTrain.split())
                        listDocTrain.append((rowOfListDocOrgTrain.split(), outputFileNameDiff))

                    for rowOfListDocOrgTest in listDocOrgTest:
                        listMyWordsTest.extend(rowOfListDocOrgTest.split())
                        listDocTest.append((rowOfListDocOrgTest.split(), outputFileNameDiff))
                # (for fileOne in listFilesInputPair:) END

                allWordsTrain = nltk.FreqDist(listMyWordsTrain)
                word_features_Train = allWordsTrain.keys()[:int(len(allWordsTrain) * ratioWordFeature)]

                featuresetsTrain = [(document_features(d, word_features_Train), c) for (d, c) in listDocTrain]
                featuresetsText = [(document_features(d, word_features_Train), c) for (d, c) in listDocTest]
                print 'sys.getsizeof(featuresetsTrain): ', sys.getsizeof(featuresetsTrain), 'ratioWordFeature: ', ratioWordFeature
                print 'sys.getsizeof(featuresetsText): ', sys.getsizeof(featuresetsText), 'ratioWordFeature: ', ratioWordFeature

                classifier = nltk.NaiveBayesClassifier.train(featuresetsTrain)

                with open(dirMain + dirOutput_accuracy + typeTextPreprocess + '-accuracy.csv', 'a') as outfAccuracy:
                    myAccruacyData = str(ratioWordFeature) + ',' + '-'.join(listFilesInputPair) + ',' + str(idxCrossValidation) + ',' + str(nltk.classify.accuracy(classifier, featuresetsText)) + '\n'
                    print 'myAccruacyData: ', myAccruacyData
                    outfAccuracy.write(myAccruacyData)
                print myAccruacyData

                cpdist = classifier._feature_probdist
                with open(dirMain + dirOutput + typeTextPreprocess + str(idxCrossValidation) + '-' + outputPercentageFilenameMiddle + '.csv', 'wb') as outf:
                    outcsv = csv.writer(outf)
                    for fname, fval in classifier.most_informative_features(len(word_features_Train)):
                        def labelprob(l):
                            return cpdist[l, fname].prob(fval)
                        labels = sorted([l for l in classifier._labels
                                         if fval in cpdist[l, fname].samples()], key=labelprob)
                        if len(labels) == 1:
                            continue
                        l0 = labels[0]
                        l1 = labels[-1]
                        if cpdist[l0, fname].prob(fval) == 0:
                            ratio = 'INF'
                        else:
                            ratio = '%8.1f' % (cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval))
                        if cpdist[l0, fname].prob(fval) == 0:
                            ratio1 = 'INF'
                        else:
                            ratio1 = '%8.2f' % (cpdist[l1, fname].prob(fval) / (cpdist[l1, fname].prob(fval) + cpdist[l0, fname].prob(fval)))
                        if cpdist[l0, fname].prob(fval) == 0:
                            ratio2 = 'INF'
                        else:
                            ratio2 = '%8.2f' % (cpdist[l0, fname].prob(fval) / (cpdist[l1, fname].prob(fval) + cpdist[l0, fname].prob(fval)))
                        outcsv.writerow((fname, fval, l1[:6], l0[:6], ratio, '1', ratio1, ratio2))

                print 'len(listDocOrgTrain): ', len(listDocOrgTrain)
                exit()  # debugging stop: only the first pair of the first fold is processed
            exit()
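# xpermutations.xuniqueCombinations is used throughout to enumerate the
# unordered pairs of input categories. If that helper module is not available,
# an equivalent generator (a sketch, not the original module) can be built on
# the standard library:
from itertools import combinations

def xuniqueCombinations_sketch(items, n):
    # Yield every unordered selection of n distinct items as a list, which is
    # how the pair loops above consume the results.
    for combo in combinations(items, n):
        yield list(combo)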
def fNaiveBayesTraining(numFold=10):
    # dirMain, dirInput, dirOutput_accuracy, typeTextPreprocess, flagComplements,
    # listFilesInputFilenameStem and the wordFeatureRatio*10times settings are
    # module-level globals in this version.
    global global_list_Word_features
    global global_ratioWordFeature
    global global_dirOutput
    global global_listFilesInputPair

    myRe = '((^Title: |^Abstract: )(.*))'
    p = re.compile(myRe)

    listFilesInputFilenameStemTmp = os.listdir(dirMain + dirInput)
    for itemOflistFilesInputFilenameStemTmp in listFilesInputFilenameStemTmp:
        listFilesInputFilenameStem.append(itemOflistFilesInputFilenameStemTmp[0:-4])

    # dirOutput_accuracy
    logging.info('dirMain+dirOutput_accuracy = ' + dirMain + dirOutput_accuracy)
    if os.path.isdir(dirMain + dirOutput_accuracy):
        try:
            shutil.rmtree(dirMain + dirOutput_accuracy)
        except:
            raise
    os.mkdir(dirMain + dirOutput_accuracy)

    with open(dirMain + dirOutput_accuracy + typeTextPreprocess + '-accuracy.csv', 'a') as outfAccuracy:
        myAccruacyData = 'global_ratioWordFeature,' + 'global_listFilesInputPair,' + 'idxCrossValidation,' + 'accuracy\n'
        outfAccuracy.write(myAccruacyData)
        logging.debug('myAccruacyData: ' + myAccruacyData)

    with open(dirMain + dirOutput_accuracy + typeTextPreprocess + '-pmid.csv', 'wb') as outfPmid:
        outfPmid.write('First line:\n')

    with open(dirMain + dirOutput_accuracy + typeTextPreprocess + '-PreRecFmea.csv', 'a') as outfPreRecFmea:
        myPreRecFmeaData = 'global_ratioWordFeature,' + 'global_listFilesInputPair,' + 'idxCrossValidation,' + 'testType,' + 'testValue\n'
        outfPreRecFmea.write(myPreRecFmeaData)

    myCsv = csv.writer(open(dirMain + dirOutput_accuracy + typeTextPreprocess + '-PreRecFmea1.csv', 'wb'), dialect='excel')
    myListTmp = ['global_ratioWordFeature1', 'global_listFilesInputPair', 'idxCrossValidation', 'testType', 'testValue']
    myCsv.writerow(myListTmp)

    for wordFeatureRatio10times in range(wordFeatureRatioStart10times, wordFeatureRatioStop10times, wordFeatureRatioStep10times):
        global_ratioWordFeature = wordFeatureRatio10times / 100.0
        logging.info('global_ratioWordFeature: ' + str(global_ratioWordFeature))

        global_dirOutput = 'Output3_Divide' + str(global_ratioWordFeature) + '/'
        if os.path.isdir(dirMain + global_dirOutput):
            try:
                shutil.rmtree(dirMain + global_dirOutput)
            except:
                raise
        os.mkdir(dirMain + global_dirOutput)

        # ================================================================
        for idxCrossValidation in range(0, numFold):
            listFilesInputCombinations = []
            # flagComplements = False
            # flagComplements = True
            if flagComplements:
                # One-vs-rest: pair each stem with the complement of the remaining stems
                for uc in xpermutations.xuniqueCombinations(listFilesInputFilenameStem, 1):
                    listRemoveUc = list(listFilesInputFilenameStem)
                    listRemoveUc.remove(uc[0])
                    logging.debug([uc, listRemoveUc])
                    listFilesInputCombinations.append([uc, listRemoveUc])
            else:
                # One-vs-one: every unordered pair of stems
                for uc in xpermutations.xuniqueCombinations(listFilesInputFilenameStem, 2):
                    listFilesInputCombinations.append([[uc[0]], [uc[1]]])
                    logging.debug(['uc if flagComplements else: ', type(uc), uc])
            print '\nlistFilesInputCombinations: ', type(listFilesInputCombinations), listFilesInputCombinations

            for global_listFilesInputPair in listFilesInputCombinations:
                # logging.info(['global_listFilesInputPair = ', global_listFilesInputPair])
                p = Process(target=fSubprocess, args=(idxCrossValidation,))
                p.start()
                # dicCorpus = parent_conn.recv()
                # print parent_conn.recv()  # prints "[42, None, 'hello']"
                p.join()
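# In this version the worker receives only idxCrossValidation through args;
# the active file pair, feature ratio and output directory travel as module
# globals. Children inherit those values under the fork start method on Unix,
# but not under spawn (e.g. on Windows), so passing them explicitly is the
# safer pattern. A sketch (the extended fSubprocess signature is an
# assumption, not the original):
#
#     p = Process(target=fSubprocess,
#                 args=(idxCrossValidation, global_listFilesInputPair,
#                       global_ratioWordFeature, global_dirOutput))
#     p.start()
#     p.join()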
# Fragment from the cross-validation split step; the enclosing loop header is
# not included in this section.
            with open(dirMain+dirOutput+outputPercentageFilenameBase+'-'+outputFileNameDiff+'-Train-'+'.txt', 'wb') as outf2:
                for oneRowOfListTrainWithDiff in listTrainWithDiff:
                    # listAllDocWords.extend(oneRowOfListTrainWithDiff[0])
                    outf2.write(' '.join(oneRowOfListTrainWithDiff[0]) + '\n')

            with open(dirMain+dirOutput+outputPercentageFilenameBase+'-'+outputFileNameDiff+'-Test-'+'.txt', 'wb') as outf3:
                for oneRowOflistValidationWithDiff in listValidationWithDiff:
                    print 'oneRowOflistValidationWithDiff: ', oneRowOflistValidationWithDiff
                    outf3.write(' '.join(oneRowOflistValidationWithDiff[0]) + '\n')

            idxCrossValidation = idxCrossValidation + 1

        exit()

        listFilesInputCombinations = []
        for uc in xpermutations.xuniqueCombinations(listFilesInput, 2):
            listFilesInputCombinations.append(uc)
        print 'listFilesInputCombinations: ', listFilesInputCombinations

        for filesnamePair in listFilesInputCombinations:
            listDoc2filesWithDiff = []
            listWordsOf2files = []
            # outputPercentageFilenameBase = typeTextPreprocess+'Per'
            # outputPercentageFilenameBase = typeTextPreprocess+str(idxCrossValidation)+'-Per'
            for oneFilename in filesnamePair:
                outputFileNameDiff = oneFilename[0:3]
def fNaiveBayesTraining(numFold=10):
    # dirCwd, dirMain, dirOutput_accuracy, typeTextPreprocess, listFilesInputFilenameStem
    # and the wordFeatureRatio*10times settings are module-level globals here.
    global global_list_Word_features
    global ratioWordFeature
    global dirOutput
    global listFilesInputPair

    myRe = '((^Title: |^Abstract: )(.*))'
    p = re.compile(myRe)

    # dirOutput_accuracy
    if os.path.isdir(dirCwd + dirOutput_accuracy):
        try:
            shutil.rmtree(dirCwd + dirOutput_accuracy)
        except:
            raise
    os.mkdir(dirCwd + dirOutput_accuracy)

    with open(dirMain + dirOutput_accuracy + typeTextPreprocess + '-accuracy.csv', 'a') as outfAccuracy:
        myAccruacyData = 'ratioWordFeature,' + 'listFilesInputPair,' + 'idxCrossValidation,' + 'accuracy\n'
        outfAccuracy.write(myAccruacyData)
        print 'myAccruacyData: ', myAccruacyData

    for wordFeatureRatio10times in range(wordFeatureRatioStart10times, wordFeatureRatioStop10times, wordFeatureRatioStep10times):
        ratioWordFeature = wordFeatureRatio10times / 100.0
        print 'ratioWordFeature: ', ratioWordFeature

        dirOutput = 'Output3_Divide' + str(ratioWordFeature) + '/'
        if os.path.isdir(dirCwd + dirOutput):
            try:
                shutil.rmtree(dirCwd + dirOutput)
            except:
                raise
        os.mkdir(dirCwd + dirOutput)

        # ================================================================
        for idxCrossValidation in range(0, numFold):
            listFilesInputCombinations = []
            for uc in xpermutations.xuniqueCombinations(listFilesInputFilenameStem, 2):
                listFilesInputCombinations.append(uc)

            for listFilesInputPair in listFilesInputCombinations:
                p = Process(target=fSubprocess, args=(idxCrossValidation,))
                p.start()
                # dicCorpus = parent_conn.recv()
                # print parent_conn.recv()  # prints "[42, None, 'hello']"
                p.join()
def main():
    """
    Compute row entropy (H_MON) from exemplar matrices as described in:
    Dominik Schlechtweg, Stefanie Eckmann, Enrico Santus, Sabine Schulte im Walde
    and Daniel Hole. 2017. German in Flux: Detecting Metaphoric Change via Word
    Entropy. In Proceedings of CoNLL 2017. Vancouver, Canada.
    """

    # Get the arguments
    args = docopt("""Compute row entropy (H_MON) from exemplar matrices.

    Usage:
        H_MON.py <testset_file> <exemplar_matrix> <time_boundary_1-time_boundary_2> <window_size> <exemplar_number> <vector_number> <output_dir>

    Arguments:
        <testset_file> = a file containing term-pairs corresponding to developments with their starting points and the type of the development
        <exemplar_matrix> = the pkl file of the source exemplar matrix
        <time_boundary_1-time_boundary_2> = the time boundaries to slice up the corpus
        <window_size> = size of sliding time interval
        <exemplar_number> = number of exemplars to construct one target vector
        <vector_number> = number of target vectors to average over
        <output_dir> = where to save the results
    """)

    matrice1_pkl = args['<exemplar_matrix>']
    testset_file = args['<testset_file>']
    output_dir = args['<output_dir>']
    time_boundaries_str = args['<time_boundary_1-time_boundary_2>']
    global_lower_bound, global_upper_bound = int(time_boundaries_str.split("-")[0]), int(time_boundaries_str.split("-")[1])
    window_size = int(args['<window_size>'])
    N = int(args['<exemplar_number>'])
    V = int(args['<vector_number>'])

    join_sign = ':'
    non_values = [-999.0, -888.0]

    # Generate cut-points
    cut_points = [i for i in range(global_lower_bound, global_upper_bound)]

    # Load the term-pairs
    targets, test_set = load_test_pairs(testset_file)

    # Process matrice
    matrice1_name = os.path.splitext(basename(matrice1_pkl))[0]
    matrice1_folder = os.path.dirname(matrice1_pkl) + "/"

    # Receive a .pkl file
    exemplar_space, _, vocab_map1, vocab_size1, column_map1, id2column_map1 = get_space(
        matrice1_folder, matrice1_name, False, False, False)

    for cut_point in cut_points:
        print cut_point
        target_values = {}
        current_lower_bound, current_upper_bound = cut_point - window_size, cut_point + window_size

        for target in targets:
            print target
            values = []
            exem_list = [(exem.split(join_sign)[0], exem.split(join_sign)[1],
                          exem.split(join_sign)[2], exem.split(join_sign)[3])
                         for exem in vocab_map1
                         if exem.split(join_sign)[0] + join_sign + exem.split(join_sign)[1] == target]
            exem_dict = dict([((lemma, pos, int(date), int(identifier)), int(date))
                              for (lemma, pos, date, identifier) in exem_list])

            # Get contexts in window
            window = [(lemma, pos, date, identifier)
                      for (lemma, pos, date, identifier) in exem_dict
                      if current_lower_bound <= date <= current_upper_bound]
            print 'Window size is: %d' % (len(window))
            random.shuffle(window)

            # Get combinations of exemplars of size N
            exem_combos = xuniqueCombinations(window, N)
            for i, combo in enumerate(exem_combos):
                if i >= V:
                    break
                print 'Calculating combination %d of %d...' % (i, V)

                # Initialize sparse matrix
                sparse_mat_dict = defaultdict(lambda: defaultdict(lambda: 0))
                cols = []
                for (lemma, pos, date, identifier) in combo:
                    exem_tar = join_sign.join((lemma, pos, str(date), str(identifier)))
                    row = exemplar_space.get_row(exem_tar)
                    data = row.get_mat().data
                    indices = row.get_mat().indices
                    for j, key in enumerate(data):
                        cxt = id2column_map1[indices[j]]
                        cols.append(cxt)
                        sparse_mat_dict[target][cxt] = sparse_mat_dict[target][cxt] + key

                # Bring to sparse matrix format
                rows = set([target])
                cols = set(cols)
                sparse_mat_tup = [(key, context, sparse_mat_dict[key][context])
                                  for key in sparse_mat_dict
                                  for context in sparse_mat_dict[key]]
                [id2row], [row2id] = extract_indexing_structs_mod(rows, [0])
                [id2column], [column2id] = extract_indexing_structs_mod(cols, [0])
                sparse_mat = read_sparse_space_data_mod(sparse_mat_tup, row2id, column2id)

                # Create a space from co-occurrence counts in sparse format
                sparse_space = Space(sparse_mat, id2row, id2column, row2id, column2id)
                sparse_space.__class__ = Space_extension

                # Get all row entropies
                r_entropies_dict1, r_entr_ranked1 = get_r_entropies(row2id, sparse_space, None, output_dir, True)
                values.append(r_entropies_dict1[target])

            # Computing mean of values
            print 'Computing mean of values...'
            print 'Averaging over %d values' % (len(values))
            try:
                target_values[target] = mean(values)
            except StatisticsError:
                target_values[target] = non_values[1]

        # Make output
        unscored_output = []
        for (x, y, label, relation) in test_set:
            unscored_output.append((x, y, label, relation, target_values[x]))

        # Save results
        save_results(unscored_output,
                     output_dir + 'row_entropy_exem_' + str(current_lower_bound) + '-' + str(current_upper_bound) + '_' + str(N) + 's' + '.txt')
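# For reference, the row entropy returned by get_r_entropies() for a target is
# read here as the Shannon entropy of the target's normalised context counts,
# which is how word entropy is defined in the cited paper. A minimal sketch of
# that quantity (assuming a plain {context: count} dict; the real computation
# lives in get_r_entropies):
def row_entropy_sketch(context_counts):
    # H(w) = -sum_c p(c|w) * log2 p(c|w), with p(c|w) = count(w, c) / total(w)
    from math import log
    total = float(sum(context_counts.values()))
    return -sum((n / total) * log(n / total, 2)
                for n in context_counts.values() if n > 0)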
def get_voronoi_and_salience(poff_filename):
    points = poff_reader(poff_filename)

    # Convex hull
    convex_hull = convexHull(points)
    convex_hull_idx = map(lambda p: points.index(p), convex_hull)
    convex_hull_edges = get_edges(convex_hull_idx)

    # Get boundary edges
    points_idx = range(0, len(points))
    boundary_edges = get_edges(points_idx)

    # Add one dimension
    o_points = map(lambda x: x + (0,), points)

    # Add a virtual infinite vertex
    #inf_vertex = (0, 0, float('inf'))
    inf_vertex = (0, 0, 9999999)
    points = o_points + [inf_vertex]
    inf_idx = points.index(inf_vertex)

    # Get triangles (in 3D)
    t_obj = Triangulation(points)
    tetraherons = t_obj.get_elements_indices()
    triangles = filter(lambda x: inf_idx not in x,
                       set([tuple(sorted(i)) for t in tetraherons
                            for i in xuniqueCombinations(t, 3)]))

    # Calculate neighbours table
    neighbours = get_neighbours(triangles)

    # Classify all infinite Delaunay faces as EXTERIOR and push them to a queue
    side_table = {}
    q = Queue.Queue()
    for face in triangles:
        edges = set([tuple(sorted(e)) for e in xuniqueCombinations(face, 2)])
        if edges & (convex_hull_edges - boundary_edges):
            side_table[face] = EXTERIOR
            q.put(face)
            continue
        if edges & (boundary_edges & convex_hull_edges):
            side_table[face] = INTERIOR
            q.put(face)
            continue

    # Collect all EXTERIOR triangles
    o_triangles_idx = filter(lambda x: side_table[x] is EXTERIOR, side_table.keys())
    o_triangles = idx_to_real(o_triangles_idx, o_points)

    # Mark every triangle as EXTERIOR or INTERIOR
    while not q.empty():
        face = q.get()
        for nface in neighbours[face]:
            if nface not in side_table:
                nedge = tuple(sorted(list(set(nface) & set(face))))
                assert len(nedge) == 2
                if nedge in boundary_edges:
                    side_table[nface] = not side_table[face]
                else:
                    side_table[nface] = side_table[face]
                q.put(nface)

    # Collect all INTERIOR triangles
    i_triangles_idx = filter(lambda x: side_table[x] is INTERIOR, triangles)
    i_triangles = idx_to_real(i_triangles_idx, o_points)

    # Filter neighbours dictionary to only INTERIOR triangles
    i_neighbours = {}
    for k, d in neighbours.items():
        if side_table[k] is INTERIOR:
            i_neighbours[k] = filter(lambda x: side_table[x] is INTERIOR, d)

    # Calculate areas and circumcenters of triangles
    areas = {}
    circumcenters = {}
    for t_idx, t in zip(i_triangles_idx, i_triangles):
        areas[t_idx] = abs((t[2][0] - t[0][0]) * (t[1][1] - t[0][1]) -
                           (t[1][0] - t[0][0]) * (t[2][1] - t[0][1])) / 2
        circumcenters[t_idx] = get_circumcenter(t)

    # Find triangles that have two edges on the boundary
    boundary_triangles_idx = set([])
    for t in i_triangles_idx:
        edges = set([tuple(sorted(e)) for e in xuniqueCombinations(t, 2)])
        if len(edges & boundary_edges) == 2:
            boundary_triangles_idx.add(t)

    # Traverse all triangles and calculate the salience value
    voronoi_edges = []
    r = {}
    k = {}
    sum_area = sum(areas.values())
    visited_t = set([])
    for start_t in boundary_triangles_idx:
        q = Queue.Queue()
        q.put(start_t)
        while not q.empty():
            t = q.get()
            visited_t.add(t)
            for new_t in i_neighbours[t]:
                if new_t in visited_t:
                    continue
                nedge = tuple(sorted(list(set(new_t) & set(t))))
                assert len(nedge) == 2
                area_1 = get_any_part_area(nedge, i_neighbours, areas)
                area_2 = sum_area - area_1
                v_edge = tuple(sorted([circumcenters[t], circumcenters[new_t]]))
                if v_edge[0] == v_edge[1]:
                    q.put(new_t)
                    continue
                if v_edge not in voronoi_edges:
                    voronoi_edges.append(v_edge)
                    r[v_edge] = min(area_1, area_2) / max(area_1, area_2)
                    k[v_edge] = min(area_1, area_2) / length(v_edge)
                q.put(new_t)
                break

    # Do something with r, k and voronoi_edges
    return o_points, voronoi_edges, r, k, i_triangles
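# Combining get_voronoi_and_salience() with the draw() routine at the top of
# this section, a minimal driver (the .poff input path and the PNG name below
# are placeholders) could look like:
if __name__ == '__main__':
    # Compute the interior triangulation, Voronoi skeleton and per-edge
    # salience measures for a shape, then render them to a PNG.
    o, v, r, k, i = get_voronoi_and_salience('shape.poff')  # hypothetical input
    draw(o, v, r, k, i, 'shape_skeleton.png')                # hypothetical output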