def __init__(self, selector, mapper, score, mapperIsFactory=True,
             timer=False, useTopic=False):
    '''
    :param selector: returns list of documents for the topic
    :param mapper: a mapper function on (selected) topic documents, returning
        vectors or scalars, or alternatively a 'factory' - a callable that
        accepts 'dictionary', 'text2tokens' and 'corpus' parameters
        (attributes of the topic model to which the processed topic belongs)
        and builds such a mapper. This allows creating a customized mapper
        for each model (a sketch follows this method).
    :param score: score function on the matrix/vector of transformed documents
    :param mapperIsFactory: if True, treat mapper as a factory as described above
    :param useTopic: if True, the topic is also passed as a parameter to the mapper
    '''
    self.selector, self.score = selector, deduceId(score)
    self.mapper = resolveId(mapper)
    IdComposer.__init__(self)
    self.__factory = mapperIsFactory
    self.__mapper = resolve(mapper)
    self.__score = score
    self.__timer = timer
    self.__useTopic = useTopic
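# A minimal sketch of a mapper 'factory' as described above; all names here
# are hypothetical. The factory receives the model's dictionary, text2tokens
# and corpus, and returns a mapper tailored to that model (here assuming a
# gensim-style dictionary with doc2bow and documents exposing a .text field):
def bowMapperFactory(dictionary, text2tokens, corpus):
    def mapper(doc):
        # map one topic document to a bag-of-words vector
        return dictionary.doc2bow(text2tokens(doc.text))
    return mapper
# such a factory would be passed as 'mapper' together with mapperIsFactory=True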
def __init__(self, paramSet, scorerBuilder, ltopics, posClass, folder,
             verbose=True, cache=False):
    '''
    :param paramSet: list-like of paramsets passed to scorerBuilder;
        each 'paramset' is a dict of parameters
    :param scorerBuilder: callable building scoring functions from a paramset
    :param ltopics: list of (topic, labels) - topic is (modelId, topicId),
        labels is a string or a dict of 'label': {0 or 1}
    :param posClass: string or list of strings defining which label is the positive class
    :param verbose: if True, print results during experiment runtime
    :param cache: if True, force scorerBuilder to cache function results
    '''
    self.setOfParams = paramSet
    self.scorerBuilder = scorerBuilder.__name__
    self.__builder = scorerBuilder
    self.ltopics = ltopics
    self.posClass = posClass
    IdComposer.__init__(self, class_='TSE')
    self.verbose = verbose
    self.__log = createLogger(fullClassName(self), INFO)
    self.baseFolder = folder
    self.folder = path.join(folder, self.id)
    if not path.exists(self.folder): os.mkdir(self.folder)
    self.cache = cache
    if cache:
        self.cacheFolder = path.join(self.baseFolder, 'function_cache')
        if not path.exists(self.cacheFolder): os.mkdir(self.cacheFolder)
    self.__rSetUp = False
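# A sketch (hypothetical names) of how a scorerBuilder could be wrapped to
# memoize its scorers' per-topic results, matching the intent of cache=True.
# functools.wraps preserves __name__, which the constructor above reads:
from functools import wraps
def cachingBuilder(builder):
    @wraps(builder)
    def build(**paramset):
        scorer, memo = builder(**paramset), {}
        def cachedScorer(topic):
            if topic not in memo: memo[topic] = scorer(topic)
            return memo[topic]
        return cachedScorer
    return build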
def __init__(self, selector, mapper, metric, weightFilter=None, weighted=True,
             algorithm='clustering', center='mean', mapperIsFactory=True):
    '''
    :param selector: selector of topic items (nodes of the graph)
    :param mapper: mapper of topic items to vectors
    :param metric: weight metric on pairs of vectors, for forming the weight matrix
    :param weightFilter: if not None, a pair/list of two elements;
        weights in the weight matrix outside this interval will be set to 0
        (see the sketch after this method)
    :param weighted: if True, graph algorithms will work with weighted graphs
    :param algorithm: graph-based measure the coherence is based on:
        'clustering' - clustering coefficient
        'closeness' - closeness centrality
        'communicability' - communicability centrality
    :param center: 'mean' or 'median' - used when averaging a measure over nodes
    '''
    self.selector, self.mapper = selector, resolveId(mapper)
    self.metric, self.weightFilter = metric, weightFilter
    self.center = center
    self.algorithm = algorithm
    self.weighted = weighted
    IdComposer.__init__(self)
    self.__tes = TopicDocumentsScore(selector, mapper, None, mapperIsFactory)
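# Sketch of the weightFilter semantics referenced above (the helper name is
# hypothetical): weights outside the [lo, hi] interval are zeroed.
import numpy as np
def applyWeightFilter(W, weightFilter):
    lo, hi = weightFilter
    W = W.copy()
    W[(W < lo) | (W > hi)] = 0.0
    return W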
def __init__(self, fname):
    '''
    :param fname: file with vectors stored in txt format
    '''
    self.file = self.__idFromFname(fname)
    IdComposer.__init__(self)
    self.__fname = fname
def __init__(self, fname):
    '''
    :param fname: file with vectors stored in the original Google format,
        with optional .bin extension
    '''
    self.file = self.__idFromFname(fname)
    IdComposer.__init__(self)
    self.__fname = fname
def __init__(self, corpus, txt2tok, dict, context=None, initSize=100):
    self.__cr = ContextResolver(context)
    corpus, txt2tok, dict = self.__cr.resolve(corpus, txt2tok, dict)
    self.corpus, self.txt2tok, self.dict = resolveIds(corpus, txt2tok, dict)
    IdComposer.__init__(self, attributes=['corpus', 'txt2tok', 'dict'])
    self.__corpus = np.empty(initSize, dtype=np.object)  # preallocated document storage
    self.__nd = 0  # number of documents
    self.__index = None
def __init__(self, threshold):
    '''
    :param threshold: integer (for top-K selection), a number between 0 and 1
        (used as a topic-weight threshold), or 'above-random', which sets the
        topic-weight threshold to 1/num_topics
    '''
    self.threshold = threshold2str(threshold)
    IdComposer.__init__(self)
    self.__threshold = threshold
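# Sketch of how the three documented threshold forms could be resolved into a
# selection rule (the helper name is hypothetical):
def resolveThreshold(threshold, numTopics):
    if threshold == 'above-random':
        return 1.0 / numTopics        # topic-weight cutoff
    if isinstance(threshold, int):
        return threshold              # top-K document selection
    return threshold                  # topic-weight cutoff in (0, 1)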
def __init__(self, corpus=None, text2tokens=None, label=None):
    self._tok2ind = dict()
    self._ind2tok = dict()
    self._maxind = None
    corpus, text2tokens = resolveIds(corpus, text2tokens)
    if corpus is not None: self.corpus = corpus
    if text2tokens is not None: self.text2tokens = text2tokens
    if label is not None: self.label = label
    IdComposer.__init__(self)
def __init__(self, text2tokens, filter, transformer, lowercase=False):
    '''
    :param filter: callable, accepts a string token, returns True (filter out)
        or False (keep token)
    :param transformer: callable, accepts a string token, returns a string
    '''
    self.text2tokens = text2tokens
    self.filter, self.transformer = filter, transformer
    # set the attribute before IdComposer.__init__ so that 'lowercase'
    # becomes part of the composed id only when it is True
    if lowercase: self.lowercase = True
    IdComposer.__init__(self)
    self.lowercase = lowercase
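# Illustrative filter and transformer conforming to the documented interface
# (names and word list are hypothetical):
STOPWORDS = set(['the', 'a', 'of'])
def stopwordFilter(token):
    return token in STOPWORDS         # True means the token is filtered out
def stripPluralS(token):
    return token[:-1] if token.endswith('s') else token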
def __init__(self, distance, center='mean', exp=1.0):
    self.distance = distance
    self.center = center
    if center == 'mean':
        self.exp = exp
        IdComposer.__init__(self)
    elif center == 'median':
        # raising to a positive power does not influence the median
        IdComposer.__init__(self)
    else:
        raise Exception('Invalid measure of center %s' % center)
def __init__(self, corpus, models=None):
    '''
    :param corpus: pytopia corpus or corpus id
    :param models: list of model ids, or None
    '''
    self.corpus = resolveIds(corpus)
    self.models = '_'.join(str(m) for m in models) if models else None
    IdComposer.__init__(self)
    self.__modelTopicOrder = {}
    self.__models = models
    self.__ctiCache = {}
def __init__(self, dimReduce=None, covariance='diag', score='ll', seed=12345):
    '''
    :param dimReduce: None, or number of components to reduce to with PCA
    :param covariance: 'full', 'diag' or 'spherical'
    :param score: 'll' (log likelihood), 'aic' or 'bic'
    :param seed: random seed for initialization
    '''
    self.seed = seed
    self.covariance = covariance
    self.dimReduce = dimReduce
    self.score = score
    IdComposer.__init__(self)
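# The parameters map naturally onto scikit-learn; a sketch under the
# assumption of a GaussianMixture backend (not necessarily the actual one):
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
def gmmScore(vectors, dimReduce=None, covariance='diag', score='ll', seed=12345):
    if dimReduce is not None:
        vectors = PCA(n_components=dimReduce, random_state=seed).fit_transform(vectors)
    gmm = GaussianMixture(covariance_type=covariance, random_state=seed).fit(vectors)
    if score == 'll': return gmm.score(vectors)   # mean per-sample log-likelihood
    return gmm.aic(vectors) if score == 'aic' else gmm.bic(vectors)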
def __init__(self, docLowerLimit, docUpperLimit, words2keep):
    '''
    :param docLowerLimit: words are removed if they do not occur in at least
        this many documents; if None, the filter is not applied
    :param docUpperLimit: words are removed if they occur in more than this
        many documents (integer or fraction); if None, the filter is not applied
    :param words2keep: after applying the document filters, keep only this many words
    '''
    self.docLowerLimit = docLowerLimit
    self.docUpperLimit = docUpperLimit
    self.words2keep = words2keep
    IdComposer.__init__(self)
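# The three limits correspond closely to gensim's Dictionary.filter_extremes;
# a sketch assuming a gensim-backed dictionary (note that no_above expects a
# fraction, while docUpperLimit may also be an absolute count):
def applyLimits(dictionary, docLowerLimit, docUpperLimit, words2keep):
    dictionary.filter_extremes(
        no_below=docLowerLimit if docLowerLimit is not None else 0,
        no_above=docUpperLimit if docUpperLimit is not None else 1.0,
        keep_n=words2keep)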
def __init__(self, model, id=None, cacheTopics=True):
    '''
    :param model: gensim LdaModel or compatible
    :param id: additional data used for model identification
    :param cacheTopics: if True, the topic matrix of the gensim model is copied
        and not recalculated on every access
    '''
    IdComposer.__init__(self, ['corpus', 'dictionary', 'text2tokens', 'options'])
    self.id = id
    if model is not None:
        if not isinstance(model, LdaModel_mod):
            raise TypeError('model must be of type LdaModel_mod')
        self.__init_gensim_data(model)
    self.cacheTopics = cacheTopics
def __init__(self, distance, center='mean', exp=1.0):
    '''
    :param distance: distance function
    :param center: 'mean' or 'median'
    :param exp: each distance is raised to the power exp > 0
    '''
    self.distance = distance
    self.center = center
    if center == 'mean':
        self.exp = exp
        IdComposer.__init__(self)
    elif center == 'median':
        # raising to a positive power does not influence the median
        IdComposer.__init__(self)
    else:
        raise Exception('Invalid measure of center %s' % center)
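# Why the 'median' branch ignores exp: x -> x**exp is monotonic for exp > 0,
# so it preserves the ordering of distances and maps the median element to the
# median of the mapped values, while the mean generally shifts. A quick check:
import numpy as np
d = np.array([0.1, 0.2, 0.9])
assert np.median(d ** 2) == np.median(d) ** 2   # median commutes with the power
# np.mean(d ** 2) != np.mean(d) ** 2 in general, hence exp matters only for 'mean'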
def __init__(self, vectorizer, corpus=None, verbose=False):
    '''
    :param vectorizer: callable accepting text objects or text ids and returning
        vectors of the same type (ndarray or scipy sparse), shape and stored scalar type
    :param corpus: if None and the vectorizer has a 'corpus' attribute,
        that attribute is used as the corpus
    '''
    self.vectorizer = vectorizer
    if corpus is None:
        if hasattr(vectorizer, 'corpus'):
            self.corpus = resolveIds(vectorizer.corpus)
        else:
            raise Exception('corpus must be specified')
    else:
        self.corpus = resolveIds(corpus)
    IdComposer.__init__(self, attributes=['vectorizer'])
    self.verbose = verbose
def __init__(self, measure, index, topWords, windowSize=0,
             wordTransform=None, standard=True):
    '''
    :param measure: string describing the coherence measure
    :param index: pytopia id of a Palmetto Lucene index
    :param topWords: number of top topic words used to calculate coherence
    '''
    if measure not in ['umass', 'uci', 'npmi', 'c_a', 'c_p', 'c_v']:
        raise Exception('unknown coherence measure: %s' % measure)
    self.measure = measure
    self.index = index
    self.topWords = topWords
    self.windowSize = windowSize
    self.wordTransform = wordTransform
    self.standard = standard
    IdComposer.__init__(self)
    self.coherence = None
def __init__(self, dimReduce=None, covariance='diag', center='mean', seed=12345):
    '''
    :param dimReduce: None, or number of components to reduce to with PCA
    :param covariance: 'full', 'diag' or 'spherical'
    :param center: 'mean' or 'median'
    :param seed: random seed for initialization
    '''
    self.covariance = covariance
    self.dimReduce = dimReduce
    self.center = center
    IdComposer.__init__(self)
    self.seed = seed
def __init__(self, type, vectors=None, threshold=None, topWords=10,
             corpus='us_politics', text2tokens='RsssuckerTxt2Tokens',
             dict='us_politics_dict', cache=None, **params):
    self.type, self.vectors, self.threshold = type, vectors, threshold
    self.__p = params
    attrs = ['type', 'vectors', 'threshold']
    for k, v in params.iteritems():
        setattr(self, k, v)
        attrs.append(k)
    self.topWords = topWords
    IdComposer.__init__(self, attributes=attrs, class_='Coherence')
    self.corpus, self.text2tokens, self.dictionary = corpus, text2tokens, dict
    self.cache = cache
def __init__(self, corpus, text2tokens, lowercase):
    self.corpus, self.text2tokens = resolveIds(corpus, text2tokens)
    self.lowercase = lowercase
    IdComposer.__init__(self)
def __init__(self, corpus):
    '''
    :param corpus: pytopia corpus or id
    '''
    self.corpus = resolve(corpus).id
    IdComposer.__init__(self)
def __init__(self, text2tokens, word2vector, tokenMod=None, avg=None):
    # todo: remove tokenMod, clients should use function composition instead
    # todo: solve passing params by id
    self.text2tokens, self.word2vector = text2tokens, word2vector
    self.tokenMod = tokenMod
    self.avg = avg
    IdComposer.__init__(self)
def __init__(self, corpus, text2tokens, dictionary):
    self.corpus, self.dictionary, self.text2tokens = \
        resolveIds(corpus, dictionary, text2tokens)
    IdComposer.__init__(self, ['corpus', 'dictionary', 'text2tokens'])
def __init__(self, model, topicId, vector=None):
    self.model = resolveId(model)
    self.topicId = topicId
    IdComposer.__init__(self, ['model', 'topicId'])
    self._vector = vector
def __init__(self, corpus, model, dictionary=None, txt2tokens=None):
    self.corpus, self.model = resolveIds(corpus, model)
    self.dictionary, self.text2tokens = resolveIds(dictionary, txt2tokens)
    IdComposer.__init__(
        self, attributes=['corpus', 'model', 'dictionary', 'text2tokens'])
def __init__(self, text2tokens, dictionary):
    self.text2tokens, self.dictionary = resolveIds(text2tokens, dictionary)
    IdComposer.__init__(self)
def __init__(self, measure):
    '''
    :param measure: measure of distance between two distributions
    '''
    self.measure = measure
    IdComposer.__init__(self)