# -*- coding: utf-8 -*-
import __init__
import argparse

import cpLib.conceptDB as db
import cpLib.conceptExtraction as cpe

if __name__ == "__main__":
    # CLI entry point: read a tab-separated concept-pair file, shuffle the
    # pairs, and print one tab-joined pair per line to stdout.
    parser = argparse.ArgumentParser(description='shuffle a wordPair file')
    parser.add_argument("vocFilePath", help='voc file')
    parser.add_argument("inputConceptPairPath", help='concept pair file')
    args = parser.parse_args()

    conceptPairStrList = [
        l.split('\t')
        for l in open(args.inputConceptPairPath).read().splitlines()
    ]

    # BUG FIX: the original did `strict = args.compose`, but no --compose
    # option is registered on this parser, so the script always crashed with
    # AttributeError. The value was never used afterwards; the line is dropped.

    # BUG FIX: the original ignored the mandatory vocFilePath argument and
    # always loaded the hard-coded '../data/voc/npy/wikiEn-skipgram.npy'.
    d = db.DB(args.vocFilePath, False)

    conceptPairList = cpe.buildConceptPairList(d, conceptPairStrList, True)
    shuffledConceptPairList = cpe.shuffledConceptPairList(conceptPairList)

    for conceptPair in shuffledConceptPairList:
        print('\t'.join([str(s) for s in conceptPair]))
def setUp(self):
    """Load the miniature googleNews vocabulary used by every test."""
    self.d = db.DB('../data/voc/npy/googleNews_mini.npy')
def printPredictedConceptClass(d, clf, cpStrList, strict):
    """Print one tab-separated line per concept: string, predicted class, probabilities.

    d         -- vocabulary database used to build concept vectors
    clf       -- trained classifier exposing predict / predict_proba
    cpStrList -- raw concept strings to classify
    strict    -- forwarded to cpe.buildConceptList (composition behaviour)
    """
    concepts = cpe.buildConceptList(d, cpStrList, strict)
    predictions = clf.predict(concepts)
    probabilities = clf.predict_proba(concepts)
    for conceptStr, predicted, proba in zip(cpStrList, predictions, probabilities):
        print('\t'.join(str(field) for field in [conceptStr, predicted, proba]))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Predict concept class according to a trained classifier')
    parser.add_argument("vocFilePath", help='voc file')
    parser.add_argument("trainedClfPath", help='trained classifier file')
    parser.add_argument("inputConceptPath", help='concept file')
    parser.add_argument("--compose",
                        help='try to compose concept',
                        action='store_false')
    args = parser.parse_args()

    cpStrList = open(args.inputConceptPath).read().splitlines()
    printPredictedConceptClass(db.DB(args.vocFilePath),
                               dill.load(open(args.trainedClfPath)),
                               cpStrList, args.compose)
import cpLib.conceptDB as db

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=
        'Convert a word2vec vocabulary/vector file from text to numpy format')
    parser.add_argument('txtFile', help='voc and vectors file in text format')
    parser.add_argument('npFile', help='path to store the voc in numpy format')
    args = parser.parse_args()

    inputTxtFilePath = args.txtFile
    npFilePath = args.npFile
    # Project convention: the word->index dict lives next to the .npy file
    # under the same name with a 'dict' suffix (see the polar converter).
    dictFilePath = npFilePath + 'dict'

    if inputTxtFilePath.endswith('.txt') and npFilePath.endswith('.npy'):
        d = db.DB(inputTxtFilePath)

        with open(inputTxtFilePath, 'r') as inputTxtFile:
            print('')
            # First line of the word2vec text format is the header.
            print('vector dim: ' + inputTxtFile.readline())
            # Map each word to its row index in the vector matrix.
            vocIndexDict = {
                line.split()[0]: i
                for i, line in enumerate(inputTxtFile)
            }

        np.save(npFilePath, d.vect)
        with open(dictFilePath, 'w') as dictFile:
            json.dump(vocIndexDict, dictFile)
    else:
        print('input file error')
return list(conceptStrSet) if __name__ == "__main__": parser = argparse.ArgumentParser( description='Find best pair match given a trained pair classifier') parser.add_argument("vocFilePath", help='voc file') parser.add_argument("trainedClfPath", help='trained classifier file') parser.add_argument("sourceConcept", help='concept source') parser.add_argument("targetClass", help='class to search the best match for') parser.add_argument("--domain", help='restrict target domain') args = parser.parse_args() d = db.DB(args.vocFilePath) clf = dill.load(open(args.trainedClfPath)) classIndex = clf.classes_.tolist().index(args.targetClass) conceptSource = d.get(args.sourceConcept) otherConceptStrList = [c for c in d.voc.keys() if c != conceptSource.word ] if args.domain is None else extractSubDomain( open(args.domain), conceptSource) conceptPairList = zip([conceptSource.word] * len(otherConceptStrList), [args.targetClass] * len(otherConceptStrList), otherConceptStrList) X = cpe.buildConceptPairList(d, conceptPairList, True) yProba = clf.predict_proba(X)
l.split('\t') for l in open(inputConceptPath[0]).read().splitlines() ], inputConceptPath[1] if __name__ == "__main__": parser = argparse.ArgumentParser( description= 'Predict concept pair class according to a trained classifier') parser.add_argument("vocFilePath", help='voc file') parser.add_argument("trainedClfPath", help='trained classifier file') parser.add_argument("inputConceptPairPathAndClassList", nargs='+', help='concept pair file list followed by class name') parser.add_argument("--compose", help='try to compose concept', action='store_false') args = parser.parse_args() vocFilePath = args.vocFilePath trainedClfPath = args.trainedClfPath inputConceptPairPathAndClassList = args.inputConceptPairPathAndClassList annotedConceptPairStrList = [ extractAnnotedConceptPairStr(f) for f in pairwise(args.inputConceptPairPathAndClassList) ] strict = args.compose detailConceptPairClfError(db.DB(vocFilePath), dill.load(open(trainedClfPath)), annotedConceptPairStrList, strict)
return pc.carthToPolar(vectorLine)[1:] if __name__ == '__main__': parser = argparse.ArgumentParser( description= 'Convert a word2vec vocabulary/vector from carthesian to polar') parser.add_argument('inputFilePath', help='input database') parser.add_argument('outputFolderPath', help='folder path to store the output database') parser.add_argument("--angular", help='drop norm of vectors (angular)', action='store_true') args = parser.parse_args() d = db.DB(args.inputFilePath) newVectFile, transformName = [], '' if args.angular: newVectFile = np.apply_along_axis(lineToAngular, 1, d.vect) transformName = '_angular' else: newVectFile = np.apply_along_axis(lineToPolar, 1, d.vect) transformName = '_polar' vectInPath, dictInPath = path( args.inputFilePath), path(args.inputFilePath + 'dict') outParentPath = path(args.outputFolderPath) vectOutPath = path(outParentPath / vectInPath.namebase + transformName + vectInPath.ext) dictOutPath = path(outParentPath / dictInPath.namebase + transformName +
sku.detailClassificationError(clf, cpList, cpList, yTrue, True) def extractAnnotedConceptStr(inputConceptPath): return open(inputConceptPath[0]).read().splitlines(), inputConceptPath[1] if __name__ == "__main__": parser = argparse.ArgumentParser( description='detail concept classifier error') parser.add_argument("vocFilePath", help='voc file') parser.add_argument("trainedClfPath", help='trained classifier file') parser.add_argument("inputConceptPathandClassList", nargs='+', help='concept file list followed by class name') parser.add_argument("--compose", help='try to compose concept', action='store_false') args = parser.parse_args() vocFilePath = args.vocFilePath trainedClfPath = args.trainedClfPath annotedConceptStrList = [ extractAnnotedConceptStr(f) for f in pairwise(args.inputConceptPathandClassList) ] strict = args.compose detailConceptClfError(db.DB(vocFilePath), dill.load(open(trainedClfPath)), annotedConceptStrList, strict)
def test_buildFromNpyFile(self):
    """DB loaded from .npy: vector of the first voc entry ('</s>') must match row 0 of the raw matrix."""
    d = db.DB('../data/voc/npy/googleNews_mini.npy')
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual.
    self.assertEqual(d.get('</s>').vect[0], d.vect[0][0])
def test_buildFromTxtFile(self):
    """DB loaded from .txt: vector of the first voc entry ('</s>') must match row 0 of the raw matrix."""
    d = db.DB('../data/voc/txt/googleNews_mini.txt')
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual.
    self.assertEqual(d.get('</s>').vect[0], d.vect[0][0])