def get_repo_symbolic_name_from_dirname(git_dir: str) -> str:
    """Return the symbolic name of the repo checked out in `git_dir`.

    E.g., "alphamatic/amp", "ParticleDev/commodity_research"
    """
    dbg.dassert_exists(git_dir)
    # Query git for the remotes and keep only the fetch line, e.g.,
    # "origin  git@github.com:alphamatic/amp (fetch)".
    cmd = "cd %s; (git remote -v | grep fetch)" % git_dir
    # TODO(gp): Make it more robust, by checking both fetch and push.
    _, remote_line = si.system_to_string(cmd)
    tokens: List[str] = remote_line.split()
    _LOG.debug("data=%s", tokens)
    dbg.dassert_eq(len(tokens), 3, "data='%s'", str(tokens))
    # The middle token is the remote URL, e.g., "git@github.com:alphamatic/amp".
    url = tokens[1]
    match = re.match(r"^.*\.com:(.*)$", url)
    dbg.dassert(match, "Can't parse '%s'", url)
    repo_name = match.group(1)  # type: ignore
    _LOG.debug("repo_name=%s", repo_name)
    # Sanity-check the expected "org/repo" shape.
    match = re.match(r"^\S+/\S+$", repo_name)
    dbg.dassert(match, "repo_name='%s'", repo_name)
    # Drop a trailing ".git", e.g., from
    # "origin  git@github.com:ParticleDev/ORG_Particle.git (fetch)".
    suffix = ".git"
    if repo_name.endswith(suffix):
        repo_name = repo_name[: -len(suffix)]
    return repo_name
def create_dir(
    dir_name: str,
    incremental: bool,
    abort_if_exists: bool = False,
    ask_to_delete: bool = False,
) -> None:
    """Create a directory `dir_name` if it doesn't exist.

    :param incremental: if False then the directory is deleted and re-created,
        otherwise an existing directory is kept as-is
    :param abort_if_exists: assert if the directory already exists
    :param ask_to_delete: if it is not incremental and the dir exists, asks
        before deleting
    """
    import errno

    dbg.dassert_is_not(dir_name, None)
    dbg.dassert(
        os.path.normpath(dir_name) != ".", msg="Can't create the current dir"
    )
    if abort_if_exists:
        dbg.dassert_not_exists(dir_name)
    # Behavior matrix:
    #                   dir exists / dir does not exist
    # incremental       no-op        mkdir
    # not incremental   rm+mkdir     mkdir
    if os.path.isdir(dir_name):
        if incremental:
            # The dir exists and we want to keep it as it is (i.e.,
            # incremental), so we are done.
            return
        if ask_to_delete:
            si.query_yes_no(
                "Do you really want to delete dir '%s'?" % dir_name,
                abort_on_no=True,
            )
        # The dir exists and we want to create it from scratch (i.e., not
        # incremental), so we need to delete the dir.
        _LOG.debug("Deleting dir '%s'", dir_name)
        if os.path.islink(dir_name):
            delete_file(dir_name)
        else:
            shutil.rmtree(dir_name)
    _LOG.debug("Creating directory '%s'", dir_name)
    # Note that makedirs raises OSError if the target directory already exists.
    # A race condition can happen when another process creates our target
    # directory, while we have just found that it doesn't exist, so we need to
    # handle this situation gracefully.
    try:
        os.makedirs(dir_name)
    except OSError as e:
        # Use the symbolic errno constant instead of the magic number 17.
        if e.errno != errno.EEXIST:
            # Bare `raise` preserves the original traceback.
            raise
        # Somebody else created the directory in the meantime: ignore.
def find_file_in_git_tree(file_in: str, super_module: bool = True) -> str:
    """Find the path of a file `file_in` in the outermost git submodule (i.e.,
    in the super-module)."""
    root_dir = get_client_root(super_module=super_module)
    # Search the whole client, excluding git metadata.
    cmd = "find %s -name '%s' | grep -v .git" % (root_dir, file_in)
    _, file_name = si.system_to_one_line(cmd)
    _LOG.debug("file_name=%s", file_name)
    dbg.dassert(
        file_name != "", "Can't find file '%s' in dir '%s'", file_in, root_dir
    )
    abs_file_name: str = os.path.abspath(file_name)
    dbg.dassert_exists(abs_file_name)
    return abs_file_name
def validate_datetime(timestamp: DATETIME_TYPE) -> pd.Timestamp:
    """
    Assert that `timestamp` is tz-aware UTC and return it as a `pd.Timestamp`.

    :param timestamp: datetime object or pd.Timestamp
    :return: tz-aware pd.Timestamp
    """
    dbg.dassert_type_in(timestamp, [pd.Timestamp, datetime.datetime])
    ts = pd.Timestamp(timestamp)
    dbg.dassert(ts.tzinfo, "Timestamp should be tz-aware.")
    dbg.dassert_eq(ts.tzinfo.zone, "UTC", "Timezone should be UTC.")
    return ts
def type_to_string(type_as_str: str) -> str:
    """Return a short string representing the type of an object, e.g.,
    "core.dataflow.Node" (instead of "class <'core.dataflow.Node'>")"""
    # Accept a `type` object as well and normalize it to its repr string.
    if isinstance(type_as_str, type):
        type_as_str = str(type_as_str)
    dbg.dassert_isinstance(type_as_str, str)
    # Peel off the "<class '...'>" wrapper, e.g., from
    # <class 'core.dataflow.Zscore'>.
    prefix = "<class '"
    dbg.dassert(type_as_str.startswith(prefix), type_as_str)
    suffix = "'>"
    dbg.dassert(type_as_str.endswith(suffix), type_as_str)
    return type_as_str[len(prefix) : -len(suffix)]
def get_path_from_git_root(file_name: str, super_module: bool) -> str:
    """Return the path of `file_name` relative to the root of the git tree.

    :param super_module: like get_client_root()
    """
    root_prefix = get_client_root(super_module) + "/"
    abs_path = os.path.abspath(file_name)
    dbg.dassert(abs_path.startswith(root_prefix))
    # Strip the client root to obtain the git-relative path.
    return abs_path[len(root_prefix) :]
def change_filename_extension(filename: str, old_ext: str, new_ext: str) -> str:
    """Change extension of a filename (e.g. "data.csv" to "data.json").

    :param filename: the old filename (including extension)
    :param old_ext: the extension of the old filename
    :param new_ext: the extension to replace the old extension
    :return: a filename with the new extension
    """
    dbg.dassert(
        filename.endswith(old_ext),
        "Extension '%s' doesn't match file '%s'",
        old_ext,
        filename,
    )
    # Remove the old extension by slicing the suffix off. Note that
    # `str.rstrip()` must NOT be used here: it strips a *set of characters*,
    # not a suffix (e.g., "mycsv.csv".rstrip(".csv") == "my").
    new_filename = filename[: len(filename) - len(old_ext)]
    # Add the new extension.
    new_filename = new_filename + new_ext
    return new_filename
def to_datetime( dates: Union[pd.Series, pd.Index]) -> Union[pd.Series, pd.Index]: """ Convert string dates to datetime. This works like `pd.to_datetime`, but supports more date formats and shifts the dates to the end of period instead of the start. :param dates: series or index of dates to convert :return: datetime dates """ # TODO(Julia): Support ISO 8601 weeks. # This function doesn't deal with mixed formats. dbg.dassert_isinstance(dates, Iterable) dbg.dassert(not isinstance(dates, str)) # Try converting to datetime using `pd.to_datetime`. format_example_index = -1 date_example = dates.tolist()[format_example_index] format_fix = _handle_incorrect_conversions(date_example) if format_fix is not None: format_, date_modifiction_func = format_fix dates = dates.map(date_modifiction_func) date_example = dates.tolist()[format_example_index] else: format_ = None datetime_dates = pd.to_datetime(dates, format=format_, errors="coerce") # Shift to end of period if conversion has been successful. if not pd.isna(datetime_dates).all(): datetime_example = datetime_dates.tolist()[format_example_index] if (not pd.isna(datetime_example) and datetime_example.strftime("%Y-%m-%d") == date_example): return datetime_dates shift_func = _shift_to_period_end(date_example) if shift_func is not None: datetime_dates = datetime_dates.map(shift_func) return datetime_dates # If standard conversion fails, attempt our own conversion. format_determination_output = _determine_date_format(date_example) if format_determination_output is None: return datetime_dates format_, date_modification_func = format_determination_output dates = dates.map(date_modification_func) return pd.to_datetime(dates, format=format_)
def check_et_timezone(dt: DATETIME_TYPE) -> bool:
    """Assert that `dt` is tz-aware and in a US Eastern timezone; return
    True."""
    # TODO(gp): Check if dateutils is better.
    import pytz

    tzinfo = dt.tzinfo
    dbg.dassert(tzinfo, "Timestamp should be tz-aware.")
    zone = tzinfo.zone  # type: ignore
    # Accept both canonical names for the Eastern timezone.
    eastern_zones = (
        pytz.timezone("US/Eastern").zone,
        pytz.timezone("America/New_York").zone,
    )
    is_eastern = zone in eastern_zones
    dbg.dassert(
        is_eastern,
        "dt=%s (type=%s) tzinfo=%s (type=%s) tzinfo.zone=%s",
        dt,
        type(dt),
        tzinfo,
        type(tzinfo),
        zone,
    )
    return True
def _system(
    cmd: str,
    abort_on_error: bool,
    suppress_error: Optional[Any],
    suppress_output: bool,
    blocking: bool,
    wrapper: Optional[Any],
    output_file: Optional[Any],
    tee: bool,
    dry_run: bool,
    log_level: Union[int, str],
) -> Tuple[int, str]:
    """Execute a shell command.

    :param cmd: string with command to execute
    :param abort_on_error: whether we should assert in case of error or not
    :param suppress_error: set of error codes to suppress
    :param suppress_output: whether to print the output or not
        - If "on_debug_level" then print the output if the log level is DEBUG
    :param blocking: blocking system call or not
    :param wrapper: another command to prepend the execution of cmd
    :param output_file: redirect stdout and stderr to this file
    :param tee: if True, tee stdout and stderr to output_file
    :param dry_run: just print the final command but not execute it
    :param log_level: print the command to execute at level "log_level".
        - If "echo" then print the command line to screen as print and not
          logging
    :return: return code (int), output of the command (str)
    """
    # Keep a copy of the original command for the "echo" log mode below.
    orig_cmd = cmd[:]
    # Prepare the command line.
    cmd = "(%s)" % cmd
    dbg.dassert_imply(tee, output_file is not None)
    if output_file is not None:
        # Create the enclosing dir of the output file, if needed.
        dir_name = os.path.dirname(output_file)
        if not os.path.exists(dir_name):
            _LOG.debug("Dir '%s' doesn't exist: creating", dir_name)
            dbg.dassert(bool(dir_name), "dir_name='%s'", dir_name)
            os.makedirs(dir_name)
        if tee:
            # Show the output on screen and also save it to the file.
            cmd += " 2>&1 | tee %s" % output_file
        else:
            # Redirect all the output to the file.
            cmd += " 2>&1 >%s" % output_file
    else:
        # Merge stderr into stdout so it can be captured below.
        cmd += " 2>&1"
    if wrapper:
        cmd = wrapper + " && " + cmd
    #
    # TODO(gp): Add a check for the valid values.
    # TODO(gp): Make it "ECHO".
    if isinstance(log_level, str):
        # "echo" mode: print the original (unwrapped) command to screen.
        dbg.dassert_eq(log_level, "echo")
        print("> %s" % orig_cmd)
        _LOG.debug("> %s", cmd)
    else:
        _LOG.log(log_level, "> %s", cmd)
    #
    dbg.dassert_in(suppress_output, ("ON_DEBUG_LEVEL", True, False))
    if suppress_output == "ON_DEBUG_LEVEL":
        # print("eff_lev=%s" % eff_level)
        # print("lev=%s" % logging.DEBUG)
        # NOTE(review): the return value of this call is discarded — looks
        # like leftover debugging; confirm before removing.
        _LOG.getEffectiveLevel()
        # Suppress the output if the verbosity level is higher than DEBUG,
        # otherwise print.
        suppress_output = _LOG.getEffectiveLevel() > logging.DEBUG
    #
    output = ""
    if dry_run:
        _LOG.warning("Not executing cmd\n%s\nas per user request", cmd)
        rc = 0
        return rc, output
    # Execute the command.
    try:
        stdout = subprocess.PIPE
        stderr = subprocess.STDOUT
        p = subprocess.Popen(
            cmd,
            shell=True,
            executable="/bin/bash",
            stdout=stdout,
            stderr=stderr,
        )
        output = ""
        if blocking:
            # Blocking call: get the output.
            while True:
                line = p.stdout.readline().decode("utf-8")  # type: ignore
                if not line:
                    break
                if not suppress_output:
                    print((line.rstrip("\n")))
                output += line
            p.stdout.close()  # type: ignore
            rc = p.wait()
        else:
            # Not blocking.
            # Wait until process terminates (without using p.wait()).
            # Poll for up to max_cnt * 0.1 seconds.
            max_cnt = 20
            cnt = 0
            while p.poll() is None:
                # Process hasn't exited yet, let's wait some time.
                time.sleep(0.1)
                cnt += 1
                _LOG.debug("cnt=%s, rc=%s", cnt, p.returncode)
                if cnt > max_cnt:
                    break
            if cnt > max_cnt:
                # Timeout: we assume it worked.
                rc = 0
            else:
                rc = p.returncode
        if suppress_error is not None:
            # Map the suppressed error codes to success.
            dbg.dassert_isinstance(suppress_error, set)
            if rc in suppress_error:
                rc = 0
    except OSError as e:
        # E.g., the shell itself could not be spawned.
        rc = -1
        _LOG.error("error=%s", str(e))
    _LOG.debug("rc=%s", rc)
    if abort_on_error and rc != 0:
        msg = (
            "\n"
            + prnt.frame("cmd='%s' failed with rc='%s'" % (cmd, rc))
            + "\nOutput of the failing command is:\n%s\n%s\n%s"
            % (prnt.line(">"), output, prnt.line("<"))
        )
        _LOG.error("%s", msg)
        raise RuntimeError("cmd='%s' failed with rc='%s'" % (cmd, rc))
    # dbg.dassert_type_in(output, (str, ))
    return rc, output
def _get_form4_13_payload(
    self,
    form_type: str,
    cik: Optional[peconf.CIK_TYPE] = None,
    cusip: Optional[peconf.CUSIP_TYPE] = None,
    start_datetime: Optional[str] = None,
    end_datetime: Optional[str] = None,
    date_mode: Optional[str] = None,
    output_type: str = "dataframes",
) -> Dict[str, List[peconf.SERVER_RESPONSE_TYPE]]:
    """
    Get payload data for forms 4 or 13 and a company.

    :param form_type: Form type. Allowed range of values: form4, form13.
    :param cik: Central Index Key as integer. Could be list of P1_CIK or
        just one identifier.
    :param cusip: Committee on Uniform Securities Identification Procedures
        number. Could be list or just one identifier.
    :param start_datetime: Get data where filing date is >= start_date. Date
        format is "YYYY-MM-DDTHH-MI-SS". None means the entire available
        date range.
    :param end_datetime: Get data where filing date is <= end_date. Date
        format is "YYYY-MM-DDTHH-MI-SS". None means the entire available
        date range.
    :param date_mode: Define whether dates are interpreted as publication
        dates or knowledge dates
    :param output_type: Output format: 'dict' or 'dataframes'.
    :return: Dict with a data tables.
    """
    # CIK and CUSIP are mutually exclusive selectors.
    phdbg.dassert(
        cik is None or cusip is None,
        msg="You cannot pass CIK and CUSIP parameters "
        "at the same time.",
    )
    phdbg.dassert(
        form_type in ("form13", "form4"),
        msg="The form_type parameter should be form13 or form4.",
    )
    phdbg.dassert(
        output_type in ("dict", "dataframes"),
        msg="The output_type parameter should be a dict "
        "or dataframes.",
    )
    # Build the query parameters, keeping only the arguments that were passed.
    params: Dict[str, Any] = self._set_optional_params(
        {},
        start_datetime=start_datetime,
        end_datetime=end_datetime,
        cik=cik,
        cusip=cusip,
        date_mode=date_mode,
    )
    url = f'{self.base_url}{self._api_routes["PAYLOAD"]}/{form_type}'
    # Concatenate the per-chunk tables produced by the paging generator.
    compound_data: peconf.SERVER_RESPONSE_TYPE = {}
    for chunk in self._payload_form_cik_cusip_generator(
        method="GET", url=url, headers=self.headers, params=params
    ):
        for table_name, rows in chunk.items():
            if table_name in compound_data:
                compound_data[table_name] += rows
            else:
                compound_data[table_name] = rows
    return self._process_form_4_13_10_output(
        compound_data, output_type=output_type
    )