@classmethod
def from_conf(cls, conf, _max_expe=2e6, private_keywords=None, expdesign=None):
    # Avoid the mutable-default pitfall of `private_keywords=[]`.
    if private_keywords is None:
        private_keywords = []

    gt = cls(private_keywords=private_keywords)
    _spec = conf.pop('_spec', None)
    if not _spec:
        if not expdesign:
            expdesign = ExpDesign
        conf['_name_expe'] = '_default_expe'
        conf['_expe_hash'] = hash_objects(
            dict((k, v) for k, v in conf.items() if k not in private_keywords))
        gt._tensors.append(ExpTensor.from_expe(conf))
        gt._ds_.append(expdesign)
        return gt

    exp = []
    size_expe = len(_spec)
    consume_expe = 0
    while consume_expe < size_expe:
        o = _spec[consume_expe]

        if isinstance(o, tuple):
            # _type => expdesign
            name, o, _type = o

        if isinstance(o, ExpGroup):
            # Flatten the group in place; `name` and `_type` from the
            # enclosing tuple carry over to the spliced-in items.
            size_expe += len(o) - 1
            _spec = _spec[:consume_expe] + o + _spec[consume_expe + 1:]
        elif isinstance(o, list):
            exp.append(o)
            gt._ds_.append(_type)
            consume_expe += 1
        else:
            o['_name_expe'] = name
            o['_expe_hash'] = hash_objects(
                dict((k, v) for k, v in o.items() if k not in private_keywords))
            exp.append(o)
            gt._ds_.append(_type)
            consume_expe += 1

    if size_expe > _max_expe:
        lgg.warning('Number of experiments exceeds the hard limit of %d (please review ExpTensor).' % _max_expe)

    gt._tensors.extend([ExpTensor.from_expe(conf, spec) for spec in exp])
    return gt
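
# Hedged usage sketch (illustrative, not the project's documented API):
# from_conf is presumably invoked on the design-tensor class with a conf
# dict whose optional '_spec' holds (name, expe, expdesign) tuples or
# ExpGroup bundles. The class name `MyDesign`, the conf keys and the
# receiving class below are assumptions.
#
#   conf = {'corpus': 'clique4', '_spec': [
#       ('expe_a', {'model': 'immsb', 'iterations': 10}, MyDesign),
#       ('group_b', ExpGroup([{'model': 'ilfm'}, {'model': 'immsb'}]), MyDesign),
#   ]}
#   gt = ExpTensorV2.from_conf(conf, private_keywords=['_data_type'])
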
def fit(self):
    voca = Vocabulary(exclude_stopwords=True)
    writer = self.get_writer(reset=self.expe.reset, online=True)
    setattr(self, 'writer', writer)

    for _it, path in enumerate(self.doc_yielder(self.expe.path)):
        fullpath = path
        shortpath = '/' + fullpath[len(os.path.expanduser(self.expe.path)):].rstrip('/').lstrip('/')

        is_known = False
        is_duplicated = False

        if self.getfirst(shortpath, 'shortpath'):
            # Don't update the document.
            # could compute a diff here...
            is_known = True  # assume already indexed
        else:
            text = extract_pdf(fullpath)
            text = voca.remove_stopwords(text)
            #bow = voca.doc2bow(text)
            if text in (None, ''):
                # Nothing extractable; skip this file.
                continue

            doc = dict(shortpath=shortpath, fullpath=fullpath)
            doc['content'] = text
            doc['hash'] = hash_objects(text)

            first_m = self.getfirst(doc['hash'], 'hash')
            if first_m:
                #if not 'content' in first_m:
                #    writer.delete_by_term('hash', doc['hash'])
                #    continue
                # Don't re-index the content; just record the new location.
                self.log.warning("Duplicate file detected: renaming %s to %s" % (first_m['shortpath'], shortpath))
                first_m['shortpath'] = shortpath
                writer.update_document(**first_m)
                is_duplicated = True
            elif self.expe.extract_structure:
                # structured content
                structured = self.extract_structured_kw(doc)
                doc.update(structured)

        if not (is_known or is_duplicated):
            print("indexing `%s'" % path)
            try:
                writer.add_document(**doc)
            except Exception as e:
                print('indexing doc %s failed! (%s)' % (fullpath, e))
    return
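
# Hedged driver sketch (assumption, not the project's actual entry point):
# fit() expects an expe namespace carrying `path`, `reset` and
# `extract_structure`, plus a whoosh-style writer from get_writer(). The
# names `ExpSpace` and `IRExtractor` below are hypothetical.
#
#   expe = ExpSpace(path='~/papers/', reset=False, extract_structure=True)
#   model = IRExtractor(expe)   # hypothetical class exposing fit()
#   model.fit()
#   model.writer.commit()       # whoosh-style writers persist on commit
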
def _make_hash(self, skip_check=False):
    _hash = []
    n_duplicate = 0
    for _id, _d in enumerate(self._lod):
        d = _d.copy()
        # Strip private keywords before hashing, but keep '_repeat' so
        # distinct repetitions of the same expe are not counted as duplicates.
        for k in self._private_keywords:
            if k in d and k != '_repeat':
                d.pop(k)
        o = hash_objects(d)
        if o in _hash:
            n_duplicate += 1
        _hash.append(o)

    if n_duplicate > 0 and not skip_check:
        lgg.warning('Duplicate experiments: %d' % n_duplicate)
        ask_sure_exit('Continue [y/n]?')

    self._hash = _hash
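
# Minimal sketch of the duplicate detection above, assuming hash_objects
# returns a deterministic digest of its (normalized) argument:
#
#   lod = [{'corpus': 'c1', '_repeat': 0}, {'corpus': 'c1', '_repeat': 0}]
#   # Both dicts hash identically once private keywords are stripped, so
#   # n_duplicate == 1 and ask_sure_exit() prompts before continuing.
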
def gen_data_matlab(self):
    from scipy.io import savemat
    from pymake.util.utils import hash_objects

    expe = self.expe
    expe['driver'] = 'gt'

    # Defaults (percent), overridable from the expe settings.
    training_ratio = 100
    testset_ratio = 20
    validset_ratio = 10

    corpus_name = expe.corpus
    training_ratio = str(int(expe.get('training_ratio', training_ratio)))
    testset_ratio = str(int(expe.get('testset_ratio', testset_ratio)))
    validset_ratio = str(int(expe.get('validset_ratio', validset_ratio)))
    repeat = expe.get('_repeat', '')

    expe['training_ratio'] = training_ratio
    expe['testset_ratio'] = testset_ratio
    expe['validset_ratio'] = validset_ratio
    # Debug how validset is computed
    #expe['testset_ratio'] -= 0.1/1.1

    frontend = self.load_frontend()
    is_symmetric = frontend.is_symmetric()

    ### sparse matrix with test indexes at 1
    Ytest = frontend.data_test

    ### Adjacency matrix
    g = frontend.data
    g.clear_filters()
    y = frontend.adj()

    # Set the edge weights (both directions if the graph is symmetric).
    for i, j, w in frontend.get_edges():
        y[i, j] = w
        if is_symmetric:
            y[j, i] = w

    # Build a deterministic 32-bit RNG state from the corpus name and
    # the repeat id.
    seed = []
    for c in list(corpus_name):
        seed.append(str(ord(c)))
    seed.append(repeat)
    seed = ''.join([chr(int(i)) for i in list(''.join(seed))])
    seed = int(hash_objects(seed), 32) % 2**32

    out = os.path.join(self.get_data_path(), 'mat') + '/'
    if repeat:
        out += repeat + '/'
    os.makedirs(out, exist_ok=True)

    fnout = out + corpus_name + '_' + '-'.join(
        [training_ratio, testset_ratio, validset_ratio]) + '.mat'
    print('saving: %s' % fnout)
    savemat(fnout, {
        'Y': y.astype(float),
        'Ytest': Ytest.astype(float),
        'is_symmetric': is_symmetric,
        'state': seed,
    })
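
# Hedged round-trip check (the file name and keys below are illustrative):
# the .mat produced above can be read back with scipy to verify its content.
#
#   from scipy.io import loadmat
#   m = loadmat('clique4_100-20-10.mat')   # hypothetical output file
#   m['Y'].shape, m['is_symmetric'], int(m['state'])
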