Example #1
    @classmethod
    def from_conf(cls,
                  conf,
                  _max_expe=2e6,
                  private_keywords=[],
                  expdesign=None):
        gt = cls(private_keywords=private_keywords)
        _spec = conf.pop('_spec', None)
        if not _spec:
            if not expdesign:
                expdesign = ExpDesign
            conf['_name_expe'] = '_default_expe'
            conf['_expe_hash'] = hash_objects(
                {k: v for k, v in conf.items()
                 if k not in private_keywords})
            gt._tensors.append(ExpTensor.from_expe(conf))
            gt._ds_.append(expdesign)
            return gt

        exp = []
        size_expe = len(_spec)
        consume_expe = 0
        while consume_expe < size_expe:
            o = _spec[consume_expe]
            if isinstance(o, tuple):
                # entry is a (name, spec, expdesign) tuple
                name, o, _type = o

            if isinstance(o, ExpGroup):
                # flatten the group in place and grow the workload accordingly
                size_expe += len(o) - 1
                _spec = _spec[:consume_expe] + o + _spec[consume_expe + 1:]
            elif isinstance(o, list):
                exp.append(o)
                gt._ds_.append(_type)
                consume_expe += 1
            else:
                o['_name_expe'] = name
                o['_expe_hash'] = hash_objects(
                    {k: v for k, v in o.items()
                     if k not in private_keywords})
                exp.append(o)
                gt._ds_.append(_type)
                consume_expe += 1

            if size_expe > _max_expe:
                lgg.warning(
                    'Number of experiments exceeds the hard limit of %d (please review ExpTensor).'
                    % _max_expe)

        gt._tensors.extend([ExpTensor.from_expe(conf, spec) for spec in exp])
        return gt
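
All four examples lean on the same pymake helper, hash_objects, to turn a
configuration dict or a string into a stable digest. As a rough sketch only
(the real pymake implementation may differ), it behaves like a deterministic
serialization followed by a hex digest; json.dumps with sorted keys and md5
are assumptions made here for illustration:

import hashlib
import json

def hash_objects(obj):
    # Serialize deterministically (sorted keys for dicts, default=str for
    # values json cannot encode natively), then take a hex digest.
    payload = json.dumps(obj, sort_keys=True, default=str).encode('utf8')
    return hashlib.md5(payload).hexdigest()

In Example #1 this gives every experiment a content hash (_expe_hash) over
its public keys only, so two specs that differ solely in private keywords
map to the same hash by design.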
Example #2
    def fit(self):
        voca = Vocabulary(exclude_stopwords=True)
        writer = self.get_writer(reset=self.expe.reset, online=True)
        self.writer = writer

        for _it, path in enumerate(self.doc_yielder(self.expe.path)):

            fullpath = path
            shortpath = '/' + fullpath[len(os.path.expanduser(self.expe.path)):].strip('/')

            is_known = False
            is_duplicated = False

            if self.getfirst(shortpath, 'shortpath'):
                # don't update document
                # could compute a diff here...
                is_known = True # assume already indexed
            else:
                text = extract_pdf(fullpath)
                text = voca.remove_stopwords(text)
                #bow = voca.doc2bow(text)
                if text in (None, ''):
                    # nothing extractable; skip this file
                    continue

                doc = dict(shortpath=shortpath, fullpath=fullpath)
                doc['content'] = text
                doc['hash'] = hash_objects(text)

                first_m = self.getfirst(doc['hash'], 'hash')
                if first_m:
                    #if not 'content' in first_m:
                    #    writer.delete_by_term('hash', doc['hash'])
                    #    continue
                    # don't update document
                    self.log.warning("Duplicate file detected: %s renaming to %s" % (first_m['shortpath'], shortpath))
                    first_m['shortpath'] = shortpath
                    writer.update_document(**first_m)
                    is_duplicated = True
                else:
                    if self.expe.extract_structure:
                        # structured content
                        structured = self.extract_structured_kw(doc)
                        doc.update(structured)

            if not (is_known or is_duplicated):
                print("indexing `%s'" % (path))
                try:
                    writer.add_document(**doc)
                except Exception as e:
                    print('indexing doc %s failed!' % fullpath)

        return
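
The hash in Example #2 works as a content fingerprint: a renamed copy of an
already indexed PDF hashes to the same value and is caught before being
re-indexed. A minimal standalone sketch of that idea, with an in-memory dict
standing in for the whoosh-style writer (seen and register are hypothetical
names, not part of the original code):

seen = {}  # hash -> shortpath

def register(shortpath, text):
    # reuse the hash_objects helper sketched after Example #1
    key = hash_objects(text)
    if key in seen:
        print('duplicate of %s: %s' % (seen[key], shortpath))
        return False
    seen[key] = shortpath
    return True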
Example #3
    def _make_hash(self, skip_check=False):
        _hash = []
        n_duplicate = 0
        for _id, _d in enumerate(self._lod):
            d = _d.copy()
            for k in self._private_keywords:
                if k in d and k != '_repeat':
                    d.pop(k)
            o = hash_objects(d)
            if o in _hash:
                n_duplicate += 1
            _hash.append(o)

        if n_duplicate > 0 and not skip_check:
            lgg.warning('Duplicate experiments: %d' % n_duplicate)
            ask_sure_exit('Continue [y/n]?')
        self._hash = _hash
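
Because _hash is a plain list, the membership test o in _hash is linear and
the whole scan quadratic in the number of experiments. A set-based variant
with the same counting semantics runs in linear time (count_duplicates is a
hypothetical helper, not part of pymake):

def count_duplicates(lod, private_keywords=()):
    seen = set()
    n_duplicate = 0
    for _d in lod:
        # keep public keys, plus '_repeat', which is never stripped
        d = {k: v for k, v in _d.items()
             if k not in private_keywords or k == '_repeat'}
        o = hash_objects(d)
        if o in seen:
            n_duplicate += 1
        seen.add(o)
    return n_duplicate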
Example #4
    def gen_data_matlab(self):
        from scipy.io import savemat
        from pymake.util.utils import hash_objects
        expe = self.expe

        expe['driver'] = 'gt'
        training_ratio = 100
        testset_ratio = 20
        validset_ratio = 10

        corpus_name = expe.corpus
        training_ratio = str(int(expe.get('training_ratio', training_ratio)))
        testset_ratio = str(int(expe.get('testset_ratio', testset_ratio)))
        validset_ratio = str(int(expe.get('validset_ratio', validset_ratio)))
        repeat = expe.get('_repeat', '')

        expe['training_ratio'] = training_ratio
        expe['testset_ratio'] = testset_ratio
        expe['validset_ratio'] = validset_ratio

        # Debug how validset is computed
        #expe['testset_ratio'] -= 0.1/1.1

        frontend = self.load_frontend()
        is_symmetric = frontend.is_symmetric()

        ### Sparse matrix with test indices set to 1
        Ytest = frontend.data_test

        ### Adjacency matrix
        g = frontend.data
        g.clear_filters()
        y = frontend.adj()

        # set the weight
        for i, j, w in frontend.get_edges():
            y[i, j] = w
            if is_symmetric:
                y[j, i] = w

        # Derive a reproducible 32-bit seed (the Matlab 'state') from the
        # corpus name and the repeat id.
        seed = []
        for c in list(corpus_name):
            seed.append(str(ord(c)))
        seed.append(repeat)

        seed = ''.join([chr(int(i)) for i in list(''.join(seed))])
        seed = int(hash_objects(seed), 32) % 2**32

        out = os.path.join(self.get_data_path(), 'mat') + '/'
        if repeat:
            out += repeat + '/'
        os.makedirs(out, exist_ok=True)

        fnout = out + corpus_name + '_' + '-'.join(
            [training_ratio, testset_ratio, validset_ratio]) + '.mat'
        print('saving: %s' % fnout)
        savemat(
            fnout, {
                'Y': y.astype(float),
                'Ytest': Ytest.astype(float),
                'is_symmetric': is_symmetric,
                'state': seed
            })
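
The 'state' entry is how the Matlab side gets a reproducible RNG seed: the
corpus name and repeat id are folded into a string, hashed, and reduced
modulo 2**32. A compact restatement of that derivation (make_seed is a
hypothetical wrapper, and it assumes hash_objects returns a digest parseable
in base 32, as a hex string is):

def make_seed(corpus_name, repeat=''):
    digits = ''.join(str(ord(c)) for c in corpus_name) + str(repeat)
    mixed = ''.join(chr(int(i)) for i in digits)
    return int(hash_objects(mixed), 32) % 2**32

# same inputs, same seed; changing the repeat id changes the seed
assert make_seed('nips', 3) == make_seed('nips', 3)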