def from_feather_to_csv_all():
    for seq_len in range(2, 13):
        print('seq_len = ' + str(seq_len) + '...')
        for nF in range(1, 9999):  # 1,...,(n-1)
            fichtr = 'clicks_X_train_' + str(seq_len) + '-' + str(nF) + '.feather'
            csv_name = fichtr.replace('.feather', '_para_spark.csv')
            # Skip chunks already converted (either pending or already moved to ok_en_hdfs/)
            if not os_path_isfile(s_input_path + 'ok_en_hdfs/' + csv_name):
                if not os_path_isfile(s_input_path + csv_name):
                    if os_path_isfile(s_input_path + fichtr):
                        # Convert every train/valid/test split of X and y for this chunk
                        for prefix in ('X_train', 'X_valid', 'X_test', 'y_train', 'y_valid', 'y_test'):
                            fich = 'clicks_' + prefix + '_' + str(seq_len) + '-' + str(nF) + '.feather'
                            fich = from_feather_to_csv(fich)
def process_single_target(target_dirname):
    # Load chandat
    chandat_fpath = os_path_join(target_dirname, CHANDAT_FNAME)
    try:
        chandat_obj = get_mat_obj_from_h5py(chandat_fpath)
    except OSError:
        chandat_obj = loadmat(chandat_fpath)
    # Load old_stft_obj, regenerating it if the file is missing
    old_stft_fpath = os_path_join(target_dirname, OLD_STFT_FNAME)
    if os_path_isfile(old_stft_fpath):
        try:
            old_stft_obj = get_mat_obj_from_h5py(old_stft_fpath)
        except OSError:
            old_stft_obj = loadmat(old_stft_fpath)
    else:
        old_stft_obj = r2_dnn_stft(target_dirname, saving_to_disk=False)
    # Run the r3 -> r6 pipeline, freeing each intermediate as soon as it is consumed
    new_stft_object = r3_dnn_apply(target_dirname, old_stft_obj=old_stft_obj, saving_to_disk=False)
    del old_stft_obj
    chandat_dnn_object = r4_dnn_istft(target_dirname, chandat_obj=chandat_obj,
                                      new_stft_object=new_stft_object, is_saving_chandat_dnn=False)
    del new_stft_object
    chandat_image_obj = r5_dnn_image(target_dirname, chandat_obj=chandat_obj,
                                     chandat_dnn_obj=chandat_dnn_object, is_saving_chandat_image=False)
    del chandat_obj, chandat_dnn_object
    r6_dnn_image_display(target_dirname, dnn_image_obj=chandat_image_obj, show_fig=False)
    # Remove target-level files
    for file in TARGET_FILES_TO_REMOVE:
        file_path = os_path_join(target_dirname, file)
        if os_path_isfile(file_path):
            try:
                os_remove(file_path)
            except OSError as err:
                raise OSError('Error: unable to remove file {}'.format(file_path)) from err
def expand_bot_path(filename):
    '''Resolve a filename relative to this module: look in "core/" first, then one level above.'''
    # try "core/"
    first_try = os_path_join(os_path_dirname(__file__), filename)
    if os_path_isfile(first_try):
        return first_try
    # try "core/.."
    second_try = os_path_join(os_path_dirname(__file__), '..', filename)
    if os_path_isfile(second_try):
        return second_try
    raise IOError('File "{0}" not found under "{1}" or "{2}"'.format(filename, first_try, second_try))
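# A minimal usage sketch for expand_bot_path; "config.json" is a hypothetical
# filename, not one the original module ships with:
def _example_expand_bot_path():
    try:
        config_path = expand_bot_path('config.json')
    except IOError as err:
        print(err)
    else:
        print('found:', config_path)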
def preparar_RDD(seq_len=0):
    from elephas.utils.rdd_utils import to_simple_rdd
    from os import rename as os_rename
    for nF in range(1, 99):  # 1,...,(n-1)
        fichtr = 'clicks_X_train_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv'
        if os_path_isfile(s_input_path + fichtr):
            print('Reading train+valid files ' + str(nF) + ' - numAds ' + str(seq_len) + '...')
            X_train = read_csv(s_input_path + 'clicks_X_train_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv', dtype=np_float64, header=None).values
            y_train = read_csv(s_input_path + 'clicks_y_train_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv', dtype=int, header=None).values
            X_valid = read_csv(s_input_path + 'clicks_X_valid_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv', dtype=np_float64, header=None).values
            y_valid = read_csv(s_input_path + 'clicks_y_valid_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv', dtype=int, header=None).values
            print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)
            X_train, y_train = mi_reshape(X_train, to_categorical(y_train), seq_len)
            X_valid, y_valid = mi_reshape(X_valid, to_categorical(y_valid), seq_len)
            X_train = np_concat((X_train, X_valid), axis=0)  # Fold the valid set into the train set for Spark
            y_train = np_concat((y_train, y_valid), axis=0)  # Fold the valid set into the train set for Spark
            print(X_train.shape, y_train.shape)
            print('Creating RDD (train+valid) ' + str(nF) + ' - numAds ' + str(seq_len) + '...')
            rdd_ini = to_simple_rdd(sc, X_train, y_train)
            # Convert each ndarray [ i.e. array(...) ] into a plain list [ i.e. [...] ]
            # (list comprehension rather than map(), so it also works on Python 3):
            rdd_lista = rdd_ini.map(lambda i: [s.tolist() for s in i])
            # And now save as text:
            rdd_lista.coalesce(numSparkWorkers, True).saveAsTextFile(s_spark_inputpath + 'clicks_train_seq' + str(seq_len) + '-' + str(nF) + '_rdd')  # Force it to be saved in at least 4 chunks
            print('Ok. RDD (train+valid) ' + str(nF) + ' - numAds ' + str(seq_len) + ' saved to HDFS.')
            os_rename(s_input_path + fichtr,
                      s_input_path + 'ok_en_hdfs/' + 'clicks_X_train_' + str(seq_len) + '-' + str(nF) + '_para_spark.csv')
def process_single_scan_battery_keras(model_folder, source_scan_battery_dirname):
    # Make sure model_folder and source_scan_battery_dirname exist.
    if not os_path_isdir(model_folder):
        raise OSError('{}: model folder {} does not exist'.format(SCRIPT_FNAME, model_folder))
    if not os_path_isdir(source_scan_battery_dirname):
        raise OSError('{}: source scan battery folder {} does not exist'.format(SCRIPT_FNAME, source_scan_battery_dirname))
    # model/scan_batteries folders.
    model_scan_batteries_dirname = os_path_join(model_folder, SCAN_BATTERIES_DIRNAME)
    model_scan_battery_dirname = os_path_join(model_scan_batteries_dirname,
                                              os_path_basename(source_scan_battery_dirname))
    # Copy source scan_batteries folder into model scan_batteries folder
    # TODO: Could also just copy the entire scan_batteries folder (all 3 types) into model_folder
    # copy_anything(source_scan_battery_dirname, model_scan_battery_dirname)
    # Grab all targets with glob
    model_scan_battery_target_prefix = os_path_join(model_scan_battery_dirname, TARGET_PREFIX + '*')
    target_dirnames = glob_glob(model_scan_battery_target_prefix)
    if not target_dirnames:
        raise ValueError('{}: no targets found with prefix {}'.format(SCRIPT_FNAME, model_scan_battery_target_prefix))
    for target_dirname in target_dirnames:
        process_single_target(target_dirname)
    # Remove scan battery-level folders
    for folder in SCAN_BATTERY_FOLDERS_TO_REMOVE:
        folder_path = os_path_join(model_scan_battery_dirname, folder)
        if os_path_isdir(folder_path):
            try:
                shutil.rmtree(folder_path)
            except OSError as err:
                raise OSError('Error: unable to remove folder {}'.format(folder_path)) from err
    # Remove scan battery-level files
    for file in SCAN_BATTERY_FILES_TO_REMOVE:
        file_path = os_path_join(model_scan_battery_dirname, file)
        if os_path_isfile(file_path):
            try:
                os_remove(file_path)
            except OSError as err:
                raise OSError('Error: unable to remove file {}'.format(file_path)) from err
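# A minimal usage sketch driving the per-scan-battery pipeline above for one
# trained model (the model and scan_batteries paths are hypothetical examples,
# not from the original project):
def _example_process_model():
    for battery_dirname in glob_glob(os_path_join('scan_batteries', '*')):
        process_single_scan_battery_keras('models/model_1', battery_dirname)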
def release(self):
    if not os_path_isfile(self._path):
        raise Exception('Can\'t release unacquired Lock!')
    if current_thread() != self._owner:
        raise Exception('Can\'t release Lock, not the right Thread!')
    os_remove(self._path)
    self._owner = None
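# A minimal sketch of how acquire()/release() pair up around a critical
# section (assumes the LockFile class that appears further below):
def _example_lockfile_usage():
    lock = LockFile()   # the lock file is created next to DBPATH
    lock.acquire()      # busy-waits while another owner holds the .lock file
    try:
        pass            # ... exclusive access to the shared database ...
    finally:
        lock.release()  # only the acquiring thread may remove the .lock file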
def increase_index_and_move(src_folder, dst_folder, file, extension, src_index, dst_index, max_index):
    # Helper function to format the full source and destination path
    path = lambda f, i: os_path_join(f, extension.format(file, i))
    # If the source file's index is less than
    # the maximum number of backups allowed
    if src_index <= max_index:
        src = path(src_folder, src_index)
        dst = path(dst_folder, dst_index)
        # If the destination file exists,
        # call this function recursively to shift it out of the way first
        if os_path_isfile(dst):
            increase_index_and_move(src_folder=dst_folder,
                                    dst_folder=dst_folder,
                                    file=file,
                                    extension=extension,
                                    src_index=dst_index,
                                    dst_index=dst_index + 1,
                                    max_index=max_index)
        cleanup = ""
    # If the destination file's index is equal to or
    # greater than the maximum number of backups allowed
    else:
        src = path(src_folder, max_index - 1)
        dst = path(dst_folder, max_index)
        cleanup = path(src_folder, src_index)
    # Move source file to destination
    try:
        shutil_move(src, dst)
        return cleanup
    # If the source file is not found
    except FileNotFoundError:
        return ""
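# A minimal sketch (hypothetical folder and naming scheme) of rotating
# numbered backups: moving "backup.1" to "backup.2" first shifts any existing
# "backup.2" out of the way, recursively, up to max_index; the returned path,
# when non-empty, is a stale file the caller should delete:
def _example_backup_rotation():
    cleanup = increase_index_and_move(src_folder='backups', dst_folder='backups',
                                      file='backup', extension='{}.{}',
                                      src_index=1, dst_index=2, max_index=5)
    if cleanup:
        print('stale backup to delete:', cleanup)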
def _storeArticle(article):
    """
    _storeArticle(dict) -> bool

    Private helper method to store an article.
    param article:dict - the article to write to disk
    """
    # make a path according to the article's topics
    path = re_sub('http://www.spiegel.de/', '', article['link']).split('/')
    filename = path.pop(-1)
    storePath = os_path_join(BASE_PATH, os_path_join(*path))
    # create directories if needed
    if not os_path_exists(storePath):
        os_makedirs(storePath)
    # write article as json to the file
    with open(os_path_join(storePath, filename), 'w') as o:
        json.dump(article, o)
    # append the article name to the log (create the log if missing)
    if os_path_isfile(BASE_PATH + 'article_log'):
        log = open(BASE_PATH + 'article_log', 'a')
    else:
        log = open(BASE_PATH + 'article_log', 'w')
    log.write(article['link'] + '\n')
    log.close()
    return True
def wrapper(*args, **kwargs):
    # TODO: find 'self' object - untested
    report = kwargs.get('self')
    if report is None:
        for i in args:
            if hasattr(i, 'dirLog'):
                report = i
    if report is None:
        report = globals().get('self')
    if report is None:
        raise LogError
    # Configure logger
    import logging
    bolNewLog = False
    try:
        if not os_path_isfile(report.dirLog):
            open(report.dirLog, 'w').close()
            bolNewLog = True
        strFormat = '%(asctime)s %(levelname)s: %(message)s'
        logging.basicConfig(filename=report.dirLog, filemode='a', format=strFormat, level=logging.DEBUG)
    except Exception as err:
        print(err)
        raise LogError
    if callable(fun):
        fun(*args, **kwargs)  # forward the original positional and keyword arguments
    logging.shutdown()
    del logging
    gc_collect()
    return bolNewLog
def from_feather_to_csv(fich='clicks_X_valid_4-1.feather', s_input_path='kaggle/Outbrain/In/python/'):
    fich_dest = fich.replace('.feather', '_para_spark.csv')
    if not os_path_isfile(s_input_path + fich_dest):
        from feather import read_dataframe as fthr_read_dataframe
        from numpy import savetxt as np_savetxt
        X = fthr_read_dataframe(s_input_path + fich)
        np_savetxt(s_input_path + fich_dest, X, delimiter=',')
        print(fich_dest, X.values.shape, ' Ok.')
    return fich_dest
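# A minimal sketch of the conversion step: each .feather chunk is written out
# once as a Spark-friendly CSV, and a second call is a no-op because the
# target file already exists (paths are the defaults above):
def _example_feather_conversion():
    csv_name = from_feather_to_csv('clicks_X_valid_4-1.feather')
    print('Spark-ready file:', csv_name)  # clicks_X_valid_4-1_para_spark.csv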
def test_main_option_output(self):
    tmpfile = os_path_join(self.tmpdir,
                           'test_main_option_output.' + str(randint(100000, 999999)))
    cmds = (['python3', '-m', 'passphrase', '--output', tmpfile],
            ['python3', '-m', 'passphrase', '-o', tmpfile])
    for cmd in cmds:
        result = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode('utf-8')
        self.assertTrue(os_path_isfile(tmpfile))
        with open(tmpfile, mode='rt', encoding='utf-8') as tfile:
            self.assertEqual(result, tfile.read())
def from_feather_to_csv_all():
    from os.path import isfile as os_path_isfile
    for seq_len in range(2, 13):
        for nF in range(1, 9999):  # 1,...,(n-1)
            fichtr = 'clicks_X_train_' + str(seq_len) + '-' + str(nF) + '.feather'
            if not os_path_isfile(s_input_path + fichtr):
                break  # No more chunks for this seq_len
            for prefix in ('X_train', 'X_valid', 'X_test', 'y_train', 'y_valid', 'y_test'):
                fich = 'clicks_' + prefix + '_' + str(seq_len) + '-' + str(nF) + '.feather'
                fich = from_feather_to_csv(fich)
def get_filepaths_in_folder(folderpath: str, ignore: List[str], recursive: bool = False) -> List[str]:
    folderpath = os_path_normpath(folderpath)
    print('get_filepaths_in_folder:', folderpath)
    list_filepaths = []
    for f in os_listdir(folderpath):
        if f not in ignore:
            f_path = myjoin(folderpath, f)
            if os_path_isfile(f_path):
                list_filepaths.append(f_path)
            elif recursive:
                list_filepaths += get_filepaths_in_folder(f_path, ignore, recursive)
    return list_filepaths
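# A minimal sketch (hypothetical directory layout): collect every file under
# "data/", skipping VCS and cache folders, descending into subfolders:
def _example_get_filepaths():
    for p in get_filepaths_in_folder('data', ignore=['.git', '__pycache__'], recursive=True):
        print(p)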
def start(self, wait_on=True):
    """Create and launch LXC container with switchpp.

    Args:
        wait_on(bool): Whether to wait until the device reports its status

    """
    self.class_logger.info("Starting LXC for switch with ip:%s port:%s..." % (self.ipaddr, self.port))
    # Check if it is an altamodel.
    if os_path_isfile(os_path_join(self.build_path, "bin", "ons-fulcrum")):
        self.class_logger.info("AltaModel is found.")
        self.__class__.SWITCH_APP = {"FulcrumApp"}
    log_wrap_out, log_wrap_err = loggers.pipe_loggers("switchpp%s" % (self.id, ), self.popen_logfile)
    # sudo env LD_LIBRARY_PATH=$PWD/lib ./bin/ons-lxc -n 1 -i br0 -a 10.0.5.101/24 -p 52
    lxc_id = str(int(self.port) - 8080)
    command = ["./ons-ctl", "start",
               "-n", lxc_id,
               "-i", self.vlab_iface,
               "-a", "%s/24" % self.ipaddr,
               "-p", str(self.ports_count)]
    self.class_logger.debug("LXC start command: %s" % (" ".join(command)))
    process = Popen(command, stdout=log_wrap_out, stderr=log_wrap_err, close_fds=True,
                    cwd=os_path_join(self.build_path, "bin"))
    process = Popen(['lxc-wait', '-n', lxc_id, '-s', 'RUNNING'],
                    stdout=log_wrap_out, stderr=log_wrap_err, close_fds=True)
    process.wait()
    # let's wait until device is up and running:
    if wait_on:
        time.sleep(5)
        self.waiton(timeout=self.startup_time)
    # Set On(True) status
    self.status = True
    return self.xmlproxy
def __init__(self, folder, file, reset=False, lazy_update=False):
    self._file = os_path_join(folder, file)
    self._cache = cache = {}
    self._last = None
    if reset:
        return
    # If cache file already exists
    try:
        with open(self._file, 'rb') as file:
            for filepath, checksum in pickle_load(file).items():
                # If file still exists
                if os_path_isfile(filepath):
                    cache[filepath] = checksum
        if lazy_update:
            self._lcache = cache.copy()
    except (FileNotFoundError, EOFError):
        pass
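# A minimal sketch of this checksum cache (the owning class name is not shown
# above, so CacheClass stands in for it): entries for files deleted since the
# last run are dropped on load, and reset=True starts from an empty cache:
def _example_checksum_cache(CacheClass):
    cache = CacheClass(folder='.', file='.my_cache', lazy_update=True)
    # ... compare stored checksums against freshly computed ones ...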
def update_db(self):
    DB_update_folder = DB_DIR + "/Updates"
    if not os_path_exists(DB_update_folder):
        return
    # Get all files that match versioning
    file_versions = []
    for file in os_listdir(DB_update_folder):
        filepath = os_path_join(DB_update_folder, file)
        # Try to get the file's name excluding extension (valid filename example: v0.0.0.sql)
        version_string = Version.version_string(os_path_splitext(os_path_basename(filepath))[0])
        # Include only files with proper version names within update range
        if os_path_isfile(filepath) and version_string and self.version_is_usable(Version(version_string)):
            file_versions.append({"path": filepath, "version": Version(version_string)})
    # Apply the updates in ascending version order
    file_versions.sort(key=lambda file_version: file_version["version"])
    for file in file_versions:
        # Note: the "<" redirection only takes effect if call_shell_command runs the command through a shell
        if self.call_shell_command(["sudo", "mysql", "-u", "root", "<", file["path"]]):
            raise Exception(f"Failed to update DB with file {file['path']}")
def cvs_add(self, input_path: str) -> Tuple[str, str]:
    """Adds single file to staged changes

    Returns pair: result status AND filename

    Possible return statuses:
        'does not exist'
        'success'
        'not a file'
        'already added'
    """
    if not os_path_exists(self._CVS_DIR_PATH):
        raise CodecException('add: init repo first')
    if not os_path_exists(input_path):
        return 'does not exist', input_path
    elif os_path_isfile(input_path):
        repo_changes: 'ChangesCodec.RepositoryChanges'
        if os_path_exists(self._STAGED_PATH):
            repo_changes = self._decode_changes(self._STAGED_PATH)
        else:
            repo_changes = self.RepositoryChanges()
        if input_path in repo_changes.addition:
            return 'already added', input_path
        repo_changes.addition.append(input_path)
        self._encode_changes(repo_changes, self._STAGED_PATH)
        return 'success', input_path
    else:
        return 'not a file', input_path
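# A minimal sketch of staging a file and branching on each documented status
# ("notes.txt" is a hypothetical filename, codec an instance of this class):
def _example_cvs_add(codec):
    status, name = codec.cvs_add('notes.txt')
    if status == 'success':
        print(name, 'staged')
    elif status == 'already added':
        print(name, 'was staged earlier')
    else:
        print('cannot stage', name, '-', status)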
def acquire(self):
    # Busy-wait until no other owner holds the lock file, then create it
    while os_path_isfile(self._path):
        time_sleep(1)
    open(self._path, 'w').close()
    self._owner = current_thread()
                        type=str, action='store', nargs='?', default='')
    parser.add_argument("--no_splash_screen",
                        help="Does not display the splash screen",
                        action='store_true')
    args = parser.parse_args()
    if args.filename:
        from pywinauto_recorder.recorder import overlay_add_mode_icon
        from pywinauto_recorder.recorder import overlay_add_progress_icon
        import traceback
        import codecs
        recorder = None
        if os_path_isfile(args.filename):
            with codecs.open(args.filename, "r", encoding='utf-8') as python_file:
                data = python_file.read()
            print("Replaying: " + args.filename)
            replay(data, args.filename)
        else:
            print("Error: file '" + args.filename + "' not found.")
            input("Press Enter to continue...")
        print("Exit")
    else:
        from pywinauto_recorder.player import *
        from pywinauto_recorder.recorder import *
        from win32api import GetSystemMetrics
        display_splash_screen()
def existfile(filename):
    # True only for an existing regular file that is writable
    return os_path_exists(filename) and os_path_isfile(filename) and os_access(filename, os_W_OK)
                            ,\'{author}\',{timestamp},\'{comment}\'',
    u'articletags': u'{tagid},{articleid}',
    u'tags': u'{tagid},{tagname}'
}

CREATE = 'create table {}({})'
TABLE_QUERY = {
    'articles': u'articleid integer primary key, link text, \
content text, site text, author text, timestamp integer, \
comment text',
    u'articletags': u'tagid integer,articleid integer',
    u'tags': u'tagid integer primary key, tagname text unique'
}

# Create the database schema on first run
if not os_path_isfile(DBPATH):
    db = sqlite3.connect(DBPATH)
    c = db.cursor()
    for table, query in TABLE_QUERY.items():
        c.execute(CREATE.format(table, query))
    db.commit()
    db.close()


class LockFile:
    def __init__(self, path=DBPATH):
        self._path = os_join(path, '.lock')
        self._owner = None

    def acquire(self):
        while os_path_isfile(self._path):
            time_sleep(1)
        open(self._path, 'w').close()
        self._owner = current_thread()
def GET(self):
    URIs_especiales = {
        '_raices': 'self.indexar_json()',
        '_configuracion': 'self.indexar_configuracion()'
    }
    # 0 if the value is less than 0
    acciones_parametros_especiales = {
        '_limite': ' datos_almacenados[ 0: 0 if int(valor_parametro_especial) < 0 else int(valor_parametro_especial) ] ',
        '_desde': ' datos_almacenados[ 0 if int(valor_parametro_especial) - 1 < 0 else int(valor_parametro_especial) - 1 : ]',
        '_total': ' {"total objetos":len(datos_almacenados)}'
    }
    trozos_URI, parametros, parametros_especiales = self.trocear_URI(parametros=True)
    if len(trozos_URI) == 0:
        if self.CONFIGURACION['SERVIDOR_ESTATICO']:
            try:
                with open(self.CONFIGURACION['PAGINA_ESTATICA_DIRECTORIO'] + '/' + self.CONFIGURACION['PAGINA_ESTATICA_ARCHIVO'], 'r') as PAGINA_ESTATICA:
                    self.devolver_estado(200, PAGINA_ESTATICA.read(),
                                         nombre_archivo=self.CONFIGURACION['PAGINA_ESTATICA_ARCHIVO'])
            except Exception as e:
                self.indexar_json()
        else:
            self.indexar_json()
    elif trozos_URI[0] in URIs_especiales.keys():
        if not self.CONFIGURACION['URI_ESPECIALES']:
            self.devolver_estado(403)
            return
        eval(URIs_especiales[trozos_URI[0]])
    elif trozos_URI[0] == self.CONFIGURACION['PAGINA_ESTATICA_DIRECTORIO']:
        directorio = self.CONFIGURACION['PAGINA_ESTATICA_DIRECTORIO']
        for x in range(1, len(trozos_URI)):
            directorio += "/" + trozos_URI[x]
        if os_path_isdir(directorio):
            if self.CONFIGURACION['INDEXAR_DIRECTORIOS']:
                codigo_estado, contenido, nombre_archivo = almacenamiento.leer_directorio(
                    directorio, trozos_URI,
                    self.CONFIGURACION['PAGINA_ESTATICA_ARCHIVO'],
                    self.CONFIGURACION['BUSCAR_PAGINA_ESTATICA_AL_INDEXAR_DIRECTORIO'])
            else:
                codigo_estado, contenido, nombre_archivo = 403, False, False
            self.devolver_estado(codigo_estado, contenido, nombre_archivo)
        elif os_path_isfile(directorio):
            codigo_estado, contenido, nombre_archivo = almacenamiento.leer_archivo(directorio, trozos_URI)
            self.devolver_estado(codigo_estado, contenido, nombre_archivo)
        else:
            self.devolver_estado(404)
    else:
        try:
            datos_almacenados = almacenamiento.leer_json(self.CONFIGURACION, trozos_URI)
            # If parameters exist, check each one against every retrieved item; this only works with objects
            if len(parametros) > 0:
                if not isinstance(datos_almacenados, list):
                    self.devolver_estado(400)
                    return True
                for parametro in parametros:
                    indice = 0
                    while indice < len(datos_almacenados):
                        if not isinstance(datos_almacenados[indice], dict):
                            self.devolver_estado(400)
                            return True
                        if not str(datos_almacenados[indice][parametro.split("=")[0]]) == str(parametro.split("=")[1]):
                            datos_almacenados.remove(datos_almacenados[indice])
                        else:
                            indice += 1
            # If special parameters exist and are allowed, iterate over and execute them
            if len(parametros_especiales) > 0:
                if not self.CONFIGURACION['PARAMETROS_ESPECIALES']:
                    self.devolver_estado(403, 'PARAMETROS_ESPECIALES_DESACTIVADOS')
                    return True
                if not isinstance(datos_almacenados, list):
                    self.devolver_estado(400)
                    return True
                for parametro in parametros_especiales:
                    parametro_especial = parametro.split("=")[0]
                    try:
                        valor_parametro_especial = parametro.split("=")[1]
                    except IndexError:
                        valor_parametro_especial = None  # For cases like ?_total
                    if parametro_especial in acciones_parametros_especiales.keys():
                        datos_almacenados = eval(acciones_parametros_especiales[parametro_especial])
            self.devolver_estado(200, datos_almacenados, es_json=True)
        except Exception as e:
            self.captura_error(str(e), cod_error=404, msg_error=str(e))
def collect(infolder,
            line=comment_LINE,
            block=comment_BLOCK,
            tags=WORDS,
            marks=MARKS,
            include=INCLUDE,
            exclude=EXCLUDE,
            overwrite=False):
    # Process block comment marks
    blocks_open, blocks_close = comment_block_comments(block)

    # TODO: Make hidden files OS independent, probably using
    #       https://docs.python.org/3.4/library/tempfile.html ?

    # FIXME: for some reason, if a comment-type ever existed in the TODO
    #        file, but after a while its posts are all gone, the keyword
    #        still remains there, according to the current TODO file,
    #        which still have the "QUESTIONS" keyword, and comment

    # TODO: Add explicit-remove/browsing capabilities of the .*_cache files
    #       (for example: if git reverted changes --> remove hash from cache file)
    #       The best solution would be a complete CLI tool, to read and manage
    #       and use the cutils command line tools

    # Compile regular expression patterns
    pattern1 = re_compile(_COMMENT.format(r'|'.join(map(comment_escape, line)),
                                          blocks_open,
                                          r'|'.join(map(comment_escape, tags)),
                                          r'|'.join(map(comment_escape, marks)),
                                          blocks_close),
                          flags=re_IGNORECASE | re_DOTALL | re_MULTILINE | re_VERBOSE)
    pattern2 = re_compile(r'\n')

    # Get previously generated collection of all posts
    COLLECTED = os_path_join(infolder, '.ccom_todo')
    try:
        with open(COLLECTED, 'rb') as file:
            collected = pickle_load(file)
    except (FileNotFoundError, EOFError):
        collected = table_Table(row=OrderedDict)

    # Clear cache -- remove all non-existing files
    for filepath in collected.rows():
        if not os_path_isfile(filepath):
            del collected[filepath]

    # Exception containers
    except_dirs  = []  # relative path to dir from root
    except_files = []  # relative path to file from root
    except_names = []  # filename (with extension) anywhere
    except_exts  = []  # extension anywhere

    # If 'exclude' is dictionary like object
    try:
        _empty = ()
        # Exceptions relative to root
        for key, container in zip(('folders', 'files'), (except_dirs, except_files)):
            container.extend(os_path_join(infolder, p) for p in exclude.get(key, _empty))
        # Exceptions anywhere
        for key, container in zip(('names', 'extensions'), (except_names, except_exts)):
            container.extend(exclude.get(key, _empty))
    # If 'exclude' is an iterable object
    except AttributeError:
        except_names = exclude

    # Include containers
    permit_names = []  # filename (with extension) anywhere
    permit_exts  = []  # extension anywhere

    # If 'include' is dictionary like object
    try:
        _empty = ()
        # Includes anywhere
        for key, container in zip(('names', 'extensions'), (permit_names, permit_exts)):
            container.extend(include.get(key, _empty))
    # If 'include' is an iterable object
    except AttributeError:
        permit_names = include

    # Scan through all files and folders
    with check_Checker(infolder, file='.ccom_cache') as checker:
        for root, dirs, filenames in os_walk(infolder):
            # If skip this folder and all subfolders
            if root in except_dirs:
                dirs.clear()
                continue
            # Check all files in folder
            for filename in filenames:
                filepath = os_path_join(root, filename)[2:]
                # If skip this exact file
                if filepath in except_files:
                    continue
                name, extension = os_path_splitext(filename)
                # If file or extension is not banned and it is on the
                # white-list and it changed since last time checked and
                # this is not an overwrite-call
                if (filename not in except_names and
                        extension not in except_exts and
                        (extension in permit_exts or filename in permit_names) and
                        checker.ischanged(filepath) and
                        not overwrite):
                    with open(filepath, encoding='utf-8') as file:
                        _search(collected, pattern1, pattern2, file.read(), filepath, marks)

    # Save collection of all posts
    with open(COLLECTED, 'wb') as file:
        pickle_dump(collected, file, pickle_HIGHEST_PROTOCOL)

    # Open the todo file and write out the results
    with open('TODO', 'w', encoding='utf-8') as todo:
        # Make it compatible with cver.py
        todo.write('## INFO ##\n'*2)
        # Format TODO file as yaml
        for key in itertools_chain(tags, marks.values()):
            KEY = key.upper()
            try:
                types = collected[KEY].items()
                len_pos = todo.tell()
                # Offset for separator comment and
                # leading and trailing new lines
                todo.write(' '*82)
                todo.write('{}:\n'.format(KEY))
                index = 1
                for filename, posts in types:
                    for i, (linenumber, content) in enumerate(posts, start=index):
                        todo.write(_ITEM.format(msg='\n'.join(content),
                                                index=i,
                                                short=_SHORT,
                                                long=_SHORT*2,
                                                sep='- '*38,
                                                file=filename,
                                                line=linenumber))
                        index = i + 1
                    todo.write('\n')
                # Move back to tag separator comment
                todo.seek(len_pos)
                todo.write('\n#{:-^78}#\n'.format(
                    ' {} POSTS IN {} FILES '.format(index - 1, len(types))))
                # Move back to the end
                todo.seek(0, 2)
            except KeyError:
                continue
        print('CCOM: placed {!r}'.format(os_path_join(infolder, 'TODO')))
def parse_NYC_snd_datafile(self, fpath='', with_regex=True):
    from json import dumps as j_dumps
    from os.path import isfile as os_path_isfile
    # Read the raw SND export from the path passed in
    with open(fpath, 'r') as f:
        x = f.readlines()
    print('\n', SND_NON_S_PATH, '\n', SND_S_PATH, '\n')
    # Characters Allowed: [a-z0-9A-Z],[-&'/]
    # Borough:
    #   1 = MN
    #   2 = Bronx
    #   3 = Brooklyn
    #   4 = Queens
    #   5 = Staten Island
    ## GFT:
    # blank None of the below, e.g., either a name of a street that has no
    #       hyphenated house numbers and no part of which is within
    #       Edgewater Park, or a name of a tunnel, etc
    # A     Addressable place name
    # B     Name of bridge
    # C     Business Improvement Districts
    # D     Duplicate Address Pseudo-Street name (DAPS)
    # E     Street is entirely within Edgewater Park
    # F     Street is partially within Edgewater Park
    # G     Non-Addressable Place name (NAP) of a complex
    # H     All house numbers on this street are hyphenated
    # I     Intersection Name
    # J     Non-Physical Boundary Features
    # M     Some house numbers on this street are hyphenated, some are not
    # N     NAP of a 'stand-alone' geographic feature (not a complex
    #       or a constituent entity of a complex)
    # O     Shore Line
    # P     Pseudo-street name (BEND, CITY LIMIT, DEAD END and their aliases)
    # R     Rail line
    # S     Front-truncated street name
    # T     Tunnel
    # U     Miscellaneous Structures
    # X     NAP of a constituent entity of a complex
    # Z     Ramp
    ## ignore B,G,N,O,R,T
    ## watch H,M

    def get_snd_non_s(x):
        # non-type 'S' / size 34
        a = {
            # 'rec_type':x[0:1],
            'boro': x[36:37].strip(),  # 1,2,3,4,5
            'stname': x[2:34].strip(),  # full street name
            'primary_flag': x[34:35].strip(),  # P(=primary) or V(=non-primary)
            'principal_flag': x[35:36].strip(),  # F or S
            'sc5': x[36:42].strip(),
            'lgc': x[42:44].strip(),  # Local Group Code
            'spv': x[44:47].strip(),  # Spelling Variation
            # filler = x[47:49]
            'numeric_ind': x[49:50].strip(),  # Numeric Name Indicator
            'GFT': x[50:51].strip(),  # (see description above)
            # 'len_full_name':x[51:53].strip(),
            'full_stname': x[53:85].strip(),
            'min_SNL': x[85:87].strip(),
            'stn20': x[87:107].strip(),
            'ht_name_type_code': x[107:108].strip(),  # blank or R(= roadbed), G(= generic), U(= undivided)
            # filler = x[109:200]
        }
        return a

    def get_snd_s(x):
        a = {
            # filler = x[0:1]  # always '1'
            # 'rectype':x[0:1],
            'boro': x[1:2].strip(),  # only 1 or 2 (MN and BX)
            'stname': x[2:34].strip(),  # front truncated name
            # filler = x[34:49]  # P or V
            'numeric_ind': x[49:50].strip(),  # blank or N
            'GFT': x[50:51],  # always 'S'
            # 'len_full_name':x[51:53].strip(),
            'num_of_progens': x[53:54].strip(),  # either 1 or 2 ?
            'progen_word_1': x[54:55].strip(),  # E or W
            'progen_gft_1': x[55:56].strip(),
            'progen_b10sc_1': x[56:67].strip(),
            'sc5_1': x[56:62].strip(),
            # filler = x[67:70]
            'progen_word_2': x[70:71].strip(),  # E or W
            'progen_gft_2': x[71:72].strip(),
            'progen_b10sc_2': x[72:83].strip(),
            'sc5_2': x[72:78].strip(),
            # filler = x[83:86]
            # filler = x[86:200]
        }
        return a

    non_s, s, xlen = [], [], len(x)
    for i in range(1, xlen):
        rec = re_sub(r'(\r\n)$', r'', x[i])
        if rec[34] == 'P' or rec[34] == 'V':  # non-'S'
            r = get_snd_non_s(rec)
            r['source_ln_num'] = i
            non_s.append(r)
        else:
            r = get_snd_s(rec)
            r['source_ln_num'] = i
            s.append(r)
    assert len(non_s) + len(s) == xlen - 1

    p = j_dumps(non_s)
    df_non_s = pd.read_json(p)
    ns_cols = sorted(get_snd_non_s('').keys())
    df_non_s = df_non_s.loc[:, ns_cols]
    # - Remove Blank Columns
    remove_cols = []
    # ---- PROVE THAT OK TO REMOVE 'min_SNL' b/c NO VALUES EXIST
    test_col = 'min_SNL'
    t = df_non_s[test_col].unique().tolist()
    assert True == (len(t) == 1) == (t[0] == '')
    remove_cols.append(test_col)
    # ---- PROVE THAT OK TO REMOVE 'stn20' b/c NO VALUES EXIST
    test_col = 'stn20'
    t = df_non_s[test_col].unique().tolist()
    assert True == (len(t) == 1) == (t[0] == '')
    remove_cols.append(test_col)
    # --
    df_non_s = df_non_s.drop(remove_cols, axis=1)
    print('\n', len(df_non_s), 'non-S-Type records\n')
    assert False == os_path_isfile(SND_NON_S_PATH)
    df_non_s.to_csv(SND_NON_S_PATH)
    assert True == os_path_isfile(SND_NON_S_PATH)

    p = j_dumps(s)
    df_s = pd.read_json(p)
    s_cols = sorted(get_snd_s('').keys())
    df_s = df_s.loc[:, s_cols]
    l_funct = lambda s: 0 if len(str(s).strip()) == 0 else int(s)
    df_s['progen_b10sc_2'] = df_s.progen_b10sc_2.map(l_funct)
    df_s['sc5_2'] = df_s.sc5_2.map(l_funct)
    # - Remove Blank Columns
    remove_cols = []
    # ---- PROVE THAT OK TO REMOVE 'progen_gft_2' b/c NO VALUES EXIST
    test_col = 'progen_gft_2'
    t = df_s[test_col].unique().tolist()
    assert True == (len(t) == 1) == (t[0] == '')
    remove_cols.append(test_col)
    # --
    df_s = df_s.drop(remove_cols, axis=1)
    print('\n', len(df_s), 'S-Type records\n')
    assert False == os_path_isfile(SND_S_PATH)
    df_s.to_csv(SND_S_PATH)
    assert True == os_path_isfile(SND_S_PATH)
    return 'success'