Beispiel #1
0
 def __init__(self,
              paramSet,
              scorerBuilder,
              ltopics,
              posClass,
              folder,
              verbose=True,
              cache=False):
     '''
     Set up a topic-scoring experiment: compose the experiment id, create
     its working folder and (optionally) a folder for cached function results.
     :param paramSet: list-like of paramsets passed to scorerBuilder
             each 'paramset' is a dict of parameters
     :param scorerBuilder: callable building scoring functions from a paramset
     :param ltopics: list of (topic, labels) - topic is (modelId, topicId),
             labels is a string or dict of 'label':{0 or 1}
     :param posClass: string or a list of strings defining which label is positive class
     :param folder: base folder; the experiment folder (named by the composed
             id) is created inside it
     :param verbose: if true, print results during experiment runtime
     :param cache: if True, force scorerBuilder to cache function results
     '''
     # attributes assigned before IdComposer.__init__ presumably take part
     # in the composed id - TODO confirm against IdComposer
     self.setOfParams = paramSet
     self.scorerBuilder = scorerBuilder.__name__  # id uses the builder's name only
     self.__builder = scorerBuilder  # the actual callable, kept private
     self.ltopics = ltopics
     self.posClass = posClass
     IdComposer.__init__(self, class_='TSE')
     self.verbose = verbose
     self.__log = createLogger(fullClassName(self), INFO)
     self.baseFolder = folder
     # experiment folder is named by the composed id, so it is unique per setup
     self.folder = path.join(folder, self.id)
     if not path.exists(self.folder): os.mkdir(self.folder)
     self.cache = cache
     if cache:
         # cache folder lives under the base folder (shared), not the experiment folder
         self.cacheFolder = path.join(self.baseFolder, 'function_cache')
         if not path.exists(self.cacheFolder): os.mkdir(self.cacheFolder)
     self.__rSetUp = False  # NOTE(review): presumably a lazy R-setup flag - verify usage
Beispiel #2
0
 def __init__(self,
              selector,
              mapper,
              score,
              mapperIsFactory=True,
              timer=False,
              useTopic=False):
     '''
     Score a topic by selecting its documents, mapping them to vectors or
     scalars, and applying a score function to the result.
     :param selector: returns list of documents for the topic
     :param mapper: a mapper function on (selected) topic documents returning
             vectors or a scalar; alternatively a 'factory' - a callable
             accepting 'dictionary', 'text2tokens' and 'corpus' parameters
             (attributes of the topic model the processed topic belongs to)
             that builds such a mapper, for per-model customization
     :param score: score function on the matrix/vector of transformed documents
     :param mapperIsFactory: if True treat mapper as a factory as described above
     :param timer: timing flag, stored privately - presumably enables timing
             of the scoring steps; verify usage
     :param useTopic: if True, topic is also sent as parameter to mapper
     '''
     # id-forming attributes must be in place before composing the id
     self.selector = selector
     self.score = deduceId(score)
     self.mapper = resolveId(mapper)
     IdComposer.__init__(self)
     # private working objects, not part of the id
     self.__factory = mapperIsFactory
     self.__mapper = resolve(mapper)
     self.__score = score
     self.__timer = timer
     self.__useTopic = useTopic
Beispiel #3
0
 def __init__(self,
              selector,
              mapper,
              metric,
              weightFilter=None,
              weighted=True,
              algorithm='clustering',
              center='mean',
              mapperIsFactory=True):
     '''
     Graph-based coherence: topic items are nodes, pairwise metric values
     form the weight matrix, and a graph measure is averaged over nodes.
     :param selector: selector of topic items (nodes of the graph)
     :param mapper: mapper of topic items to vectors
     :param metric: weight metric on pairs of vectors, for forming weight matrix
     :param weightFilter: None, or a pair/list of two elements - weights in
                 the weight matrix outside this interval will be set to 0
     :param weighted: if True, graph algorithms will work with weighted graphs
     :param algorithm: graph-based measure the coherence is based on:
                 'clustering' (clustering coefficient),
                 'closeness' (closeness centrality),
                 'communicability' (communicability centrality)
     :param center: 'mean' or 'median' - to use when averaging a measure over nodes
     :param mapperIsFactory: if True treat mapper as a factory building the
                 actual mapper (passed through to TopicDocumentsScore)
     '''
     # id-forming attributes, assigned before composing the id
     self.selector = selector
     self.mapper = resolveId(mapper)
     self.metric = metric
     self.weightFilter = weightFilter
     self.center = center
     self.algorithm = algorithm
     self.weighted = weighted
     IdComposer.__init__(self)
     # selection+mapping is delegated; the score slot is None since the
     # graph measure is computed here, not by TopicDocumentsScore
     self.__tes = TopicDocumentsScore(selector, mapper, None,
                                      mapperIsFactory)
Beispiel #4
0
 def __init__(self, fname):
     '''
     Wrap a file with vectors stored in txt format.
     :param fname: path to the vector file
     '''
     # the id-forming attribute is derived from the file name
     fileId = self.__idFromFname(fname)
     self.file = fileId
     IdComposer.__init__(self)
     self.__fname = fname  # full path kept privately, not part of the id
 def __init__(self, corpus=None, text2tokens=None, label=None):
     '''
     Create an empty token<->index mapping, optionally tagged with the
     corpus, tokenizer and label it is associated with.
     '''
     self._tok2ind = {}
     self._ind2tok = {}
     self._maxind = None
     corpus, text2tokens = resolveIds(corpus, text2tokens)
     # assign only the attributes that were given, so that absent ones
     # do not take part in the composed id
     if corpus is not None:
         self.corpus = corpus
     if text2tokens is not None:
         self.text2tokens = text2tokens
     if label is not None:
         self.label = label
     IdComposer.__init__(self)
 def __init__(self, corpus, txt2tok, dict, context=None, initSize=100):
     '''
     Build an (initially empty) in-memory store of corpus documents.
     :param corpus: corpus or corpus id
     :param txt2tok: text-to-tokens function or its id
     :param dict: dictionary or its id (NOTE: shadows the builtin 'dict';
             parameter name kept for backward compatibility with callers)
     :param context: optional context used to resolve the components
     :param initSize: initial capacity of the internal document array
     '''
     self.__cr = ContextResolver(context)
     corpus, txt2tok, dict = self.__cr.resolve(corpus, txt2tok, dict)
     self.corpus, self.txt2tok, self.dict = \
         resolveIds(corpus, txt2tok, dict)
     IdComposer.__init__(self, attributes=['corpus', 'txt2tok', 'dict'])
     # dtype=object: np.object was a deprecated alias of the builtin 'object'
     # (removed in numpy >= 1.24); behavior is identical
     self.__corpus = np.empty(initSize, dtype=object)
     self.__nd = 0  # number of documents
     self.__index = None
Beispiel #7
0
 def __init__(self, fname):
     '''
     Wrap a file with vectors stored in the original google (word2vec)
     format, with optional .bin extension.
     :param fname: path to the vector file
     '''
     # id-forming attribute derived from the file name
     self.file = self.__idFromFname(fname)
     IdComposer.__init__(self)
     # the actual path is kept privately and does not enter the id
     self.__fname = fname
Beispiel #8
0
 def __init__(self, threshold):
     '''
     :param threshold: integer (for top-K selection),
         a number between 0 and 1 (as topic-weight threshold)
         or 'above-random' that sets topic-weight threshold to 1/num_topics
     '''
     # string form is the id-forming attribute; raw value is kept privately
     self.threshold = threshold2str(threshold)
     IdComposer.__init__(self)
     self.__threshold = threshold
Beispiel #9
0
 def __init__(self, text2tokens, filter, transformer, lowercase=False):
     '''
     :param text2tokens: underlying tokenization function
     :param filter: callable, accepts a string token, returns True (filter out) or False (keep token)
     :param transformer: callable, accepts a string token, returns a string
     :param lowercase: lowercasing flag - enters the composed id only when True
     '''
     self.text2tokens = text2tokens
     self.filter, self.transformer = filter, transformer
     # NOTE(review): 'lowercase' is set before IdComposer.__init__ only when
     # True - presumably so the flag enters the composed id only if enabled,
     # keeping ids of pre-existing non-lowercasing instances unchanged; the
     # unconditional assignment below stores the actual value. Confirm
     # against IdComposer before simplifying this.
     if lowercase: self.lowercase = True
     IdComposer.__init__(self)
     self.lowercase = lowercase
Beispiel #10
0
 def __init__(self, distance, center='mean', exp=1.0):
     '''
     :param distance: distance function
     :param center: 'mean' or 'median'
     :param exp: for 'mean', each distance is raised to the power exp;
             not stored for 'median' since raising to a positive power
             does not influence the median
     '''
     self.distance = distance
     self.center = center
     if center == 'mean':
         # exp enters the composed id only for the 'mean' variant
         self.exp = exp
     elif center != 'median':
         raise Exception('Invalid measure of center %s' % center)
     IdComposer.__init__(self)
 def __init__(self, corpus, models=None):
     '''
     :param corpus: corpus or corpus id
     :param models: list of model Ids, or None
     '''
     self.corpus = resolveIds(corpus)
     # id-forming string: model ids joined by '_', or None when no models given
     if models:
         self.models = '_'.join(str(m) for m in models)
     else:
         self.models = None
     IdComposer.__init__(self)
     # private caches and the original model list, not part of the id
     self.__modelTopicOrder = {}
     self.__models = models
     self.__ctiCache = {}
 def __init__(self, dimReduce=None, covariance='diag',
              score = 'll', seed=12345):
     '''
     :param dimReduce: None, or number of components for dimensionality
             reduction - presumably via PCA, as in similar components
             in this codebase; TODO confirm
     :param covariance: full, diag, spherical
     :param score: ll (log likelihood), aic, bic
     :param seed: random seed for initialization
     '''
     self.seed = seed
     self.covariance = covariance
     self.dimReduce = dimReduce
     self.score = score
     IdComposer.__init__(self)
Beispiel #13
0
 def __init__(self, docLowerLimit, docUpperLimit, words2keep):
     '''
     Vocabulary filtering based on document frequency.
     :param docLowerLimit: words are removed unless they occur in at least
             this many documents; if None the filter is not applied
     :param docUpperLimit: words are removed if they occur in more than
             this many documents (integer or fraction);
             if None the filter is not applied
     :param words2keep: after applying document filters, keep only this many words
     '''
     self.docLowerLimit, self.docUpperLimit = docLowerLimit, docUpperLimit
     self.words2keep = words2keep
     IdComposer.__init__(self)
Beispiel #14
0
 def __init__(self, model, id=None, cacheTopics=True):
     '''
     :param model: gensim LdaModel or compatible
     :param id: additional data used for model identification
     :param cacheTopics: if True topic matrix of gensim model is copied
             and not calculated on every access
     '''
     IdComposer.__init__(self,
                         ['corpus', 'dictionary', 'text2tokens', 'options'])
     # composed id is overridden when an explicit id is given
     self.id = id
     if model is not None:
         if not isinstance(model, LdaModel_mod):
             raise TypeError('model must be of type ldamodel')
         self.__init_gensim_data(model)
     self.cacheTopics = cacheTopics
Beispiel #15
0
 def __init__(self,
              dimReduce=None,
              covariance='diag',
              center='mean',
              seed=12345):
     '''
     :param dimReduce: None or number of components to reduce to with PCA
     :param covariance: full, diag, spherical
     :param center: measure of center - presumably 'mean' or 'median',
             as elsewhere in this codebase; TODO confirm
     :param seed: random seed for initialization
     '''
     self.covariance = covariance
     self.dimReduce = dimReduce
     self.center = center
     IdComposer.__init__(self)
     # seed is assigned after id composition, so it does not enter the id -
     # presumably intentional (same model identity regardless of seed)
     self.seed = seed
Beispiel #16
0
 def __init__(self, measure, index, topWords, windowSize=0, wordTransform=None, standard=True):
     '''
     Palmetto-based topic coherence.
     :param measure: coherence measure, one of:
             'umass', 'uci', 'npmi', 'c_a', 'c_p', 'c_v'
     :param index: pytopia id of Palmetto Lucene index
     :param topWords: number of top topic words used to calculate coherence
     :param windowSize: word co-occurrence window size - 0 presumably means
             the measure's default; verify usage
     :param wordTransform: optional transformation applied to topic words -
             presumably; verify usage
     :param standard: flag stored as an id-forming attribute - verify usage
     '''
     supported = ('umass', 'uci', 'npmi', 'c_a', 'c_p', 'c_v')
     if measure not in supported:
         raise Exception('unknown coherence measure: %s' % measure)
     # id-forming attributes, assigned before composing the id
     self.measure, self.index = measure, index
     self.topWords, self.windowSize = topWords, windowSize
     self.wordTransform = wordTransform
     self.standard = standard
     IdComposer.__init__(self)
     self.coherence = None  # computed later, not part of the id
Beispiel #17
0
 def __init__(self, distance, center='mean', exp=1.0):
     '''
     :param distance: distance function
     :param exp: each distance is raised to power exp > 0
     :param center: 'mean' or 'median'
     '''
     self.distance, self.center = distance, center
     if center not in ('mean', 'median'):
         raise Exception('Invalid measure of center %s' % center)
     if center == 'mean':
         self.exp = exp
     # exp is not stored for 'median': raising to a positive power
     # does not influence the median
     IdComposer.__init__(self)
 def __init__(self, vectorizer, corpus=None, verbose=False):
     '''
     :param vectorizer: callable accepting text objects or text ids and
      returning vectors of the same type (ndarray or scipy sparse), shape and stored scalar type.
     :param corpus: if None and if vectorizer has a 'corpus' property it is used as corpus
     '''
     self.vectorizer = vectorizer
     # fall back to the vectorizer's own corpus when none is given
     if corpus is None:
         if not hasattr(vectorizer, 'corpus'):
             raise Exception('corpus must be specified')
         corpus = vectorizer.corpus
     self.corpus = resolveIds(corpus)
     # only the vectorizer takes part in the composed id
     IdComposer.__init__(self, attributes=['vectorizer'])
     self.verbose = verbose
Beispiel #19
0
 def __init__(self,
              type,
              vectors=None,
              threshold=None,
              topWords=10,
              corpus='us_politics',
              text2tokens='RsssuckerTxt2Tokens',
              dict='us_politics_dict',
              cache=None,
              **params):
     '''
     Coherence measure configured by type, vectors, threshold and any
     additional keyword parameters; all of these form the composed id.
     (NOTE: parameters 'type' and 'dict' shadow builtins; names are kept
     for backward compatibility with existing callers.)
     '''
     self.type = type
     self.vectors = vectors
     self.threshold = threshold
     self.__p = params  # raw kwargs kept privately
     # every extra keyword becomes an attribute and an id-forming name
     attrs = ['type', 'vectors', 'threshold']
     for k in params:
         setattr(self, k, params[k])
         attrs.append(k)
     self.topWords = topWords
     IdComposer.__init__(self, attributes=attrs, class_='Coherence')
     # assigned after id composition, so these do not enter the id
     self.corpus = corpus
     self.text2tokens = text2tokens
     self.dictionary = dict
     self.cache = cache
 def __setstate__(self, state):
     # restore own attributes first, then delegate the id-composer part;
     # tuple order must match __getstate__
     self.lowercase, self.token2words, self.topFreq, self.topWord, idcState = state
     IdComposer.__setstate__(self, idcState)
 def __init__(self, corpus, text2tokens, dictionary):
     '''
     Compose an id from the resolved corpus, dictionary and text2tokens ids.
     '''
     resolved = resolveIds(corpus, dictionary, text2tokens)
     self.corpus, self.dictionary, self.text2tokens = resolved
     IdComposer.__init__(self, ['corpus', 'dictionary', 'text2tokens'])
 def __init__(self, corpus, text2tokens, lowercase):
     '''
     :param corpus: corpus or corpus id
     :param text2tokens: tokenization function or its id
     :param lowercase: flag, takes part in the composed id
     '''
     resolvedCorpus, resolvedTxt2Tok = resolveIds(corpus, text2tokens)
     self.corpus = resolvedCorpus
     self.text2tokens = resolvedTxt2Tok
     self.lowercase = lowercase
     IdComposer.__init__(self)
 def __getstate__(self):
     # own attributes first, id-composer state last - order must match __setstate__
     return self.lowercase, self.token2words, self.topFreq, self.topWord,  \
            IdComposer.__getstate__(self)
 def __init__(self, text2tokens, word2vector, tokenMod=None, avg=None):
     '''
     Text-to-vector mapping built from a tokenizer and a word-vector lookup.
     '''
     # TODO: remove tokenMod, clients should use function composition instead
     # TODO: solve passing params by id
     self.text2tokens = text2tokens
     self.word2vector = word2vector
     self.tokenMod = tokenMod
     self.avg = avg
     IdComposer.__init__(self)
Beispiel #25
0
 def __getstate__(self):
     # id-composer state first, then the private id->index map;
     # order must match __setstate__
     return IdComposer.__getstate__(self), self.__id2ind
Beispiel #26
0
 def __setstate__(self, state):
     # inverse of __getstate__: state is (idComposerState, id2ind)
     IdComposer.__setstate__(self, state[0])
     self.__id2ind = state[1]
 def __getstate__(self):
     # own index structures first, id-composer state last - matches __setstate__
     return self._word2doc, self._doc2word, self._numDocs, \
             IdComposer.__getstate__(self)
Beispiel #28
0
 def __init__(self, corpus, model, dictionary=None, txt2tokens=None):
     '''
     Compose an id from the resolved corpus, model, dictionary and
     text2tokens ids; dictionary and txt2tokens are optional.
     '''
     self.corpus, self.model = resolveIds(corpus, model)
     self.dictionary, self.text2tokens = resolveIds(dictionary, txt2tokens)
     attrs = ['corpus', 'model', 'dictionary', 'text2tokens']
     IdComposer.__init__(self, attributes=attrs)
 def __setstate__(self, state):
     # inverse of __getstate__: restore index structures, delegate the rest
     self._word2doc, self._doc2word, self._numDocs, idcState = state
     IdComposer.__setstate__(self, idcState)
Beispiel #30
0
 def __init__(self, model, topicId, vector=None):
     '''
     A topic identified by its parent model and the topic's id in the model.
     :param model: model object or model id
     :param topicId: id of the topic within the model
     :param vector: optional topic vector, stored for later access
     '''
     self.model, self.topicId = resolveId(model), topicId
     IdComposer.__init__(self, ['model', 'topicId'])
     self._vector = vector