def get_atoms(cls):
    atoms = dict()
    modules = get_pymake_settings('_spec')
    modules = [modules] if type(modules) is str else modules
    for module in modules:
        s = cls(module, class_filter=cls.module)
        for surname, _module in s.packages.items():
            name = _module.__name__
            module = s._cls_browse[name]
            expd = getattr(import_module(module.module), name)()
            content = {}
            content['script_name'] = surname
            content['module_name'] = '.'.join((_module.__module__, _module.__name__))
            content['_module'] = module
            content['exp'] = expd._specs()
            atoms[name] = content
    return atoms
def fetch(self, *args):
    i = self.output_path.find('.pmk/')
    path = self.output_path
    local_path = path[i:]

    user = get_pymake_settings('ssh_user')
    machine = get_pymake_settings('ssh_machine')
    remote_loc = get_pymake_settings('ssh_remote')
    local_loc = os.path.dirname(path) + '/'

    ext = '.inf'
    _file = os.path.join(remote_loc, local_path + ext)
    cmd = ['scp', '%s@%s:%s' % (user, machine, _file), local_loc]
    os.makedirs(os.path.dirname(path), exist_ok=True)
    subprocess.call(cmd)
def load_stirling(style='npy'):
    stirling_path = get_pymake_settings('project_stirling')
    fn = os.path.join(stirling_path, 'stirling.npy')
    npy_exists = os.path.isfile(fn)
    if style == 'npy' and npy_exists:
        return np.load(fn)
    else:
        stirlg = lookup_stirling()
        return stirlg.load()
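# A minimal, self-contained sketch of the caching pattern behind load_stirling():
# serve a precomputed table from a .npy file when it exists, otherwise build it
# and save it for the next call. The recurrence below builds a small table of
# unsigned Stirling numbers of the first kind purely as an illustration; the
# real table, its size, and its location come from the pymake settings.
import os
import numpy as np

def build_stirling_table(n_max=10):
    # c(n, k) = c(n-1, k-1) + (n-1) * c(n-1, k), with c(0, 0) = 1
    c = np.zeros((n_max + 1, n_max + 1))
    c[0, 0] = 1.0
    for n in range(1, n_max + 1):
        for k in range(1, n + 1):
            c[n, k] = c[n - 1, k - 1] + (n - 1) * c[n - 1, k]
    return c

def load_table(fn='stirling.npy'):
    if os.path.isfile(fn):
        return np.load(fn)          # cached copy found, reuse it
    table = build_stirling_table()  # otherwise rebuild...
    np.save(fn, table)              # ...and cache for subsequent calls
    return table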
def get_packages(cls, **kwargs):
    module_name = get_pymake_settings('_spec')
    if 'class_filter' not in kwargs:
        kwargs['class_filter'] = cls.module
    if isinstance(module_name, list):
        packs = {}
        for m in module_name:
            packs.update(cls(m, **kwargs).packages)
        return packs
    else:
        return cls(module_name, **kwargs).packages
def get_conf_from_file(target, mp):
    """ Return a dictionary of properties for an expe file.
        @mp: map parameters
        format: model_K_hyper_N
        @template_file: the key order matters to align the dictionary.
    """
    masterkeys = _MASTERKEYS.copy()
    template_file = masterkeys.keys()
    ##template_file = 'networks/generator/Graph13/debug11/immsb_10_auto_0_all.*'

    data_path = get_pymake_settings('project_data')
    # Ignore the relative data path prefix
    if target.startswith(data_path):
        target = target.replace(data_path, '')

    path = target.lstrip('/').split('/')

    _prop = os.path.splitext(path.pop())[0]
    _prop = path + _prop.split('_')

    prop = {}
    cpt_hook_master = 0
    cpt_hook_user = 0

    # @Debug/Improve the nasty Hook here
    def update_pt(cur, master, user):
        return cur - master + user

    #prop = {k: _prop[i] for i, k in enumerate(template_file) if k in mp}
    for i, k in enumerate(template_file):
        if k not in mp:
            cpt_hook_master += 1
            continue
        pt = update_pt(i, cpt_hook_master, cpt_hook_user)
        hook = tree_hook(k, _prop[pt])
        if hook:
            cpt_hook_user += 1
            pt = update_pt(i, cpt_hook_master, cpt_hook_user)
        prop[k] = _prop[pt]

    return prop
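# A small, self-contained illustration of the parsing idea above: the directory
# components and the '_'-separated tokens of the file name are aligned with an
# ordered key template. The template and the path below are made up for the
# example; the real keys come from _MASTERKEYS.
import os

def parse_expe_path(target, template=('corpus', 'debug', 'model', 'K', 'hyper', 'N')):
    parts = target.lstrip('/').split('/')
    stem = os.path.splitext(parts.pop())[0]   # drop the extension
    tokens = parts + stem.split('_')          # directory fields + filename fields
    return dict(zip(template, tokens))

# parse_expe_path('networks/debug11/immsb_10_auto_100.inf')
# -> {'corpus': 'networks', 'debug': 'debug11', 'model': 'immsb',
#     'K': '10', 'hyper': 'auto', 'N': '100'}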
def get_atoms(cls):
    atoms = dict()
    modules = get_pymake_settings('_script')
    modules = [modules] if type(modules) is str else modules
    for module in modules:
        s = cls(module, class_filter=cls.module)

        ## get decorator for each class
        #class2met2dec = {}
        #for method, _class in classs.packages.items():
        #    append decorator information to filter @atpymake

        for surname, _module in s.packages.items():
            name = _module.__name__
            module = s._cls_browse[name]
            methods = list(module.methods.keys())
            for m in methods.copy():
                _m = getattr(s.packages[name.lower()], m)
                if not inspect.isfunction(_m) and m != '__call__':
                    methods.remove(m)
                elif '__call__' == m:
                    methods.remove('__call__')
                    methods.append(name.lower())
                elif m.startswith('_'):
                    methods.remove(m)
                elif m in dir(cls.module):
                    methods.remove(m)

            content = {}
            content['scriptname'] = name
            content['scriptsurname'] = surname
            content['module_file'] = module.file
            content['module'] = _module.__module__
            content['_module'] = _module
            #content['module_name'] = '.'.join((module.name, module.module))
            content['module_super'] = module.super
            content['methods'] = set(methods)
            atoms[name] = content
    return atoms
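# A minimal sketch of the method filtering done above: keep only the public,
# locally defined callables of a script class, dropping anything inherited
# from the base class, and expose __call__ under the class's lowercased name.
# The Base/Demo classes here are made up for the example.
import inspect

class Base:
    def run(self):
        pass

class Demo(Base):
    def fit(self):
        pass
    def _private(self):
        pass
    def __call__(self):
        pass

def public_methods(klass, base=Base):
    methods = set()
    for name, member in inspect.getmembers(klass, inspect.isfunction):
        if name.startswith('_') and name != '__call__':
            continue                     # drop private helpers
        if name in dir(base):
            continue                     # drop methods inherited from the base
        methods.add(klass.__name__.lower() if name == '__call__' else name)
    return methods

print(public_methods(Demo))  # -> {'fit', 'demo'} (set order may vary)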
def get_atoms(cls, _type='short'):
    if _type == 'short':
        shrink_module_name = True
    elif _type == 'topos':
        shrink_module_name = False
    else:
        # guard: otherwise an unknown _type would leave shrink_module_name undefined
        raise ValueError('unknown atoms type: %s' % _type)

    packages = get_pymake_settings('_model')
    atoms = OrderedDict()
    for pkg in packages:
        if len(pkg) > 8:
            prefix = pkg[:3]
            if '.' in pkg:
                prefix += ''.join(map(lambda x: x[0], pkg.split('.')[1:]))
        else:
            prefix = True
        atoms.update(ModelsLoader.get_packages(pkg, prefix=prefix, max_depth=3,
                                               shrink_module_name=shrink_module_name))
    return atoms
def load_data(self, fn):
    ''' Load data in the data path folder defined in the pmk.cfg '''
    path = get_pymake_settings('project_data')
    path = os.path.join(path, fn)

    f, ext = os.path.splitext(path)
    if ext in ('.csv', '.txt'):
        import pandas as pd
        func = pd.read_csv
        kwargs = {}
    elif ext in ('.npy',):
        func = sparse.load
        kwargs = {}
    elif ext in ('.npz',):
        func = sparse.load_npz
        kwargs = {}
    else:
        raise NotImplementedError('extension not known: %s' % ext)

    self.log.info('Loading data: %s(%s, **%s)' % (func.__name__, path, kwargs))
    data = func(path, **kwargs)
    self.log.info('%s data shape: %s' % (fn, str(data.shape)))
    return data
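# A minimal sketch of the same extension-based dispatch, written as a lookup
# table instead of an if/elif chain. The loader mapping below is an assumed
# illustration (pandas for .csv/.txt, numpy for .npy, scipy.sparse for .npz),
# not the pymake API itself.
import os
import numpy as np
import pandas as pd
from scipy import sparse

LOADERS = {
    '.csv': pd.read_csv,
    '.txt': pd.read_csv,
    '.npy': np.load,
    '.npz': sparse.load_npz,
}

def load_any(path):
    ext = os.path.splitext(path)[1]
    try:
        loader = LOADERS[ext]
    except KeyError:
        raise NotImplementedError('extension not known: %s' % ext)
    return loader(path)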
class tfidf(IndexManager):
    ''' Index documents.
        * Whoosh based.
        * format supported :
            * pdf
    '''

    _DATA_PATH = os.path.join(get_pymake_settings('project_data'), 'tfidf')

    _SCHEMA = {'document': ws.fields.Schema(hash=ws.fields.ID(stored=True, unique=True),
                                            shortpath=ws.fields.ID(stored=True, unique=True),
                                            fullpath=ws.fields.ID(stored=True, unique=True),
                                            title=ws.fields.KEYWORD(stored=True),
                                            authors=ws.fields.KEYWORD(stored=True),     # names of the authors, '||' separated
                                            references=ws.fields.KEYWORD(stored=True),  # names of the references, '||' separated
                                            date=ws.fields.KEYWORD(stored=True),        # date of publication (@todo: find it by cross reference!)
                                            content=ws.fields.TEXT),
               #source = '',  # name of the journal/conf etc
               #type = '',    # journal/conf etc
               }

    def __init__(self, expe):
        self.expe = expe
        super().__init__(default_index='document')

    def doc_yielder(self, path):
        ''' find all pdf and yield doc2bow doc '''

        path = os.path.expanduser(path)

        if os.path.isfile(path):
            self.expe.path = path.rpartition('/')[0] + '/'
            for p in [path]:
                yield p
        elif not os.path.exists(path):
            self.log.error('path error: %s' % path)
            exit()

        for root, dirnames, filenames in os.walk(path):
            for filename in filenames:
                if not filename.endswith(('.pdf', '.PDF')):
                    continue
                fullpath = os.path.join(root, filename)
                if match_pattern(fullpath, self.expe.get('exclude_path')):
                    continue
                yield fullpath

    def doc2xml(self, hit):
        import shutil

        # 0. Init cermine usage (one path/pdf at a time).
        filename = os.path.basename(hit['fullpath'])
        fullpath = hit['fullpath']
        shortpath = hit['shortpath']
        pwd = os.getenv('PWD')
        os.chdir(os.path.join(pwd, 'data/lib/cermine/'))
        cermine_tar_dir = 'pdf_temp/' + filename.rpartition('.')[0] + '/'
        if not os.path.exists(cermine_tar_dir):
            os.makedirs(cermine_tar_dir)
        shutil.copy(hit['fullpath'], cermine_tar_dir)

        # 1. run Cermine
        jar = 'cermine-impl-1.14-SNAPSHOT-jar-with-dependencies.jar'
        classes = 'pl.edu.icm.cermine.ContentExtractor'
        try:
            self.log.info('extracting content of: %s' % (shortpath))
            output = subprocess.check_output(['java', '-cp', jar, classes, '-path', cermine_tar_dir])
        except Exception as e:
            self.log.error('Cermine Error %s : ' % e)
            self.log.error('Please try install/upgrade Cermine for pdf data extraction.')
            os.remove(cermine_tar_dir + filename)  # remove the copied pdf
            os.chdir(pwd)
            return {}

        # 2. get the xml information
        cermine_file = cermine_tar_dir + filename.rpartition('.')[0] + '.cermxml'
        if not os.path.isfile(cermine_file):
            self.log.error('Cermine failed...')
            return {}

        xml_strings = open(cermine_file).read()
        os.remove(cermine_tar_dir + filename)  # remove the copied pdf
        os.chdir(pwd)
        return xml_strings

    # Two assumptions:
    # * the string is a pdf,
    # * it is structured as a scientific paper (journal?).
    def extract_structured_kw(self, hit):
        structured = {}
        xml_strings = self.doc2xml(hit)

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            self.log.error('Please install BeautifulSoup4 to parse xml doc.')
            return {}

        try:
            soup = BeautifulSoup(xml_strings, 'lxml')
        except Exception as e:
            self.log.error('BeautifulSoup fail to parse a file: %s : ' % e)
            return {}

        #titles = soup.findAll(re.compile(".*title.*"))

        # Main title
        # max probable title from cermine
        front = soup.front
        front_titles = front.findAll(re.compile(".*title.*"))
        #print(front_titles)
        main_title = ' '.join([o.string or '' for o in front_titles]).strip()
        structured['title'] = main_title

        authors = soup.findAll(attrs={'contrib-type': 'author'})
        authors = [o.findAll('string-name') for o in authors]
        authors = sum(authors, [])
        authors = ' || '.join([o.string for o in authors])
        structured['authors'] = authors

        # Institution, Journal, Year etc...
        pass

        # References
        references = [' '.join(str(r).split()) for r in soup.findAll('mixed-citation')]
        structured['references'] = ' || '.join(references)

        return structured

    def fit(self):
        voca = Vocabulary(exclude_stopwords=True)
        writer = self.get_writer(reset=self.expe.reset, online=True)
        setattr(self, 'writer', writer)

        for _it, path in enumerate(self.doc_yielder(self.expe.path)):

            fullpath = path
            shortpath = '/' + fullpath[len(os.path.expanduser(self.expe.path)):].rstrip('/').lstrip('/')

            is_known = False
            is_duplicated = False
            if self.getfirst(shortpath, 'shortpath'):
                # don't update document
                # could compute a diff here...
                is_known = True  # assume already indexed
            else:
                text = extract_pdf(fullpath)
                text = voca.remove_stopwords(text)
                #bow = voca.doc2bow(text)
                if text in (None, ''):
                    # do nothing
                    continue

                doc = dict(shortpath=shortpath, fullpath=fullpath)
                doc['content'] = text
                doc['hash'] = hash_objects(text)

                first_m = self.getfirst(doc['hash'], 'hash')
                if first_m:
                    #if not 'content' in first_m:
                    #    writer.delete_by_term('hash', doc['hash'])
                    #    continue
                    # don't update document
                    self.log.warning("Duplicate file detected: %s renaming to %s" % (first_m['shortpath'], shortpath))
                    first_m['shortpath'] = shortpath
                    writer.update_document(**first_m)
                    is_duplicated = True
                else:
                    if self.expe.extract_structure:
                        # structured content
                        structured = self.extract_structured_kw(doc)
                        doc.update(structured)

            if not (is_known or is_duplicated):
                print("indexing `%s'" % (path))
                try:
                    writer.add_document(**doc)
                except Exception as e:
                    print('indexing doc %s failed!' % fullpath)

        return

    def close(self):
        if hasattr(self, 'writer'):
            try:
                self.writer.close()
            except Exception as e:
                print('Whoosh error: %s' % e)
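# For context, a minimal standalone sketch of the Whoosh calls the class above
# delegates to through IndexManager (get_writer, add_document, update_document).
# It uses the public whoosh API directly; the schema, directory and query are
# illustrative only.
import os
from whoosh import index
from whoosh.fields import Schema, ID, TEXT
from whoosh.qparser import QueryParser

schema = Schema(shortpath=ID(stored=True, unique=True), content=TEXT)

ix_dir = '/tmp/ir_index_demo'
os.makedirs(ix_dir, exist_ok=True)
ix = index.create_in(ix_dir, schema)

writer = ix.writer()
writer.add_document(shortpath='/paper.pdf', content='extracted pdf text')
writer.commit()

with ix.searcher() as searcher:
    q = QueryParser('content', ix.schema).parse('pdf')
    print([hit['shortpath'] for hit in searcher.search(q)])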
def forest_tensor(target_files, map_parameters):
    """ It has to be ordered the same way as the file properties.
        Fuse directories to find available files, then construct the tensor
        according to the set space formed by the objects found.
        @in target_files has to be an OrderedDict to align the tensor access.
    """
    # Expe analyser / Tabulyze It

    # res shape ([expe], [model], [measure])
    # =================================================================================
    # Expe: [debug, corpus] -- from the dirname
    # Model: [name, K, hyper, h**o] -- from the expe filename
    # measure:
    #   * 0: global precision,
    #   * 1: local precision,
    #   * 2: recall

    ### Output: rez.shape rez_map_l rez_map
    if not target_files:
        lgg.info('Target Files empty')
        return None

    #dim = get_conf_dim_from_files(target_files, map_parameters) # Rely on Expe...
    dim = dict((k, len(v)) if isinstance(v, (list, tuple)) else (k, len([v]))
               for k, v in map_parameters.items())

    rez_map = list(map_parameters.keys())  # order !
    # Expert knowledge value
    new_dims = _New_Dims
    # Update Mapping
    [dim.update(d) for d in new_dims]
    [rez_map.append(list(n.keys())[0]) for n in new_dims]

    # Create the shape of the Analysis/Results Tensor
    #rez_map = dict(zip(rez_map_l, range(len(rez_map_l))))
    shape = []
    for n in rez_map:
        shape.append(dim[n])

    # Create the numpy array to store all experience values, with various settings
    rez = np.zeros(shape) * np.nan

    not_finished = []
    info_file = []
    for _f in target_files:
        prop = get_conf_from_file(_f, map_parameters)
        pt = np.empty(rez.ndim)

        assert (len(pt) - len(new_dims) == len(prop))
        for k, v in prop.items():
            try:
                v = int(v)
            except ValueError:
                pass
            try:
                idx = map_parameters[k].index(v)
            except Exception as e:
                lgg.error(prop)
                lgg.error('key:value error -- %s, %s' % (k, v))
                raise ValueError
            pt[rez_map.index(k)] = idx

        f = os.path.join(get_pymake_settings('project_data'), _f)
        d = load(f)
        if not d:
            not_finished.append('%s not finish...\n' % _f)
            continue

        try:
            pt = list(pt.astype(int))
            for i, v in enumerate(_Key_measures):
                pt[-1] = i
                ### HOOK
                # v: the measure name
                # json_v: the value of the measure
                if v == 'homo_model_e':
                    try:
                        json_v = d.get('homo_model_o') - d.get(v)
                    except Exception:
                        pass
                elif v == 'f1':
                    precision = d.get('Precision')
                    try:
                        recall = d.get('Recall')
                        recall * 2  # raises if the recall value is missing
                    except Exception:
                        # future remove
                        recall = d.get('Rappel')
                    json_v = 2 * precision * recall / (precision + recall)
                else:
                    if v == 'Recall':
                        try:
                            d.get(v) * 2  # raises if the key is missing
                        except Exception:
                            v = 'Rappel'
                    json_v = d.get(v)
                rez[tuple(pt)] = json_v

        except IndexError as e:
            lgg.error(e)
            lgg.error('Index Error: Files are probably missing here to complete the results...\n')

    #info_file.append( '%s %s; \t K=%s\n' % (corpus_type, f, K) )
    lgg.debug(''.join(not_finished))
    #lgg.debug(''.join(info_file))
    rez = np.ma.masked_array(rez, np.isnan(rez))
    return rez
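# A compact, self-contained illustration of what forest_tensor() builds: a
# nan-initialised result tensor with one axis per configuration parameter,
# filled by mapping each run's parameter values to their index along each
# axis, then masked so missing runs are ignored by aggregations. The axes and
# runs below are invented for the example.
import numpy as np

axes = {'corpus': ['nips', 'reuters'],
        'model': ['immsb', 'ilfm'],
        'measure': ['precision', 'recall']}
order = list(axes.keys())

rez = np.full([len(axes[a]) for a in order], np.nan)

runs = [({'corpus': 'nips', 'model': 'immsb', 'measure': 'recall'}, 0.71),
        ({'corpus': 'reuters', 'model': 'ilfm', 'measure': 'precision'}, 0.64)]

for prop, value in runs:
    pt = tuple(axes[a].index(prop[a]) for a in order)
    rez[pt] = value

rez = np.ma.masked_array(rez, np.isnan(rez))
print(rez.mean())  # nan cells are masked out of the aggregate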
def __init__(self, default_index='model'):
    self._DATA_PATH = os.path.join(get_pymake_settings('PWD'), '.pmk')
    self._index_basename = 'ir_index'
    self._default_index = default_index
    self._ix = {}  # Index store by key
def full_fig_path(self, fn):
    figs_path = get_pymake_settings('project_figs')
    path = os.path.join(figs_path, self.expe.get('_refdir', ''), self.specname(fn))
    make_path(path)
    return path
def get_data_path(self):
    path = get_pymake_settings('project_data')
    path = os.path.join(path, '')
    return path