Example #1
0
def get_repo_symbolic_name_from_dirname(git_dir: str) -> str:
    """Return the symbolic name of the repo checked out in `git_dir`.

    The name is extracted from the fetch remote reported by `git remote -v`,
    e.g., "alphamatic/amp", "ParticleDev/commodity_research".

    :param git_dir: directory of the git client; it must exist
    :return: the repo name as "<owner>/<repo>", without a trailing ".git"
    """
    dbg.dassert_exists(git_dir)
    # TODO(gp): Make it more robust, by checking both fetch and push.
    fetch_cmd = "cd %s; (git remote -v | grep fetch)" % git_dir
    # The expected output is a single line with three fields, like:
    #   origin  <user>@<host>.com:alphamatic/amp (fetch)
    _, remote_line = si.system_to_string(fetch_cmd)
    fields: List[str] = remote_line.split()
    _LOG.debug("data=%s", fields)
    dbg.dassert_eq(len(fields), 3, "data='%s'", str(fields))
    # The second field is the remote URL: keep what follows ".com:".
    remote_url = fields[1]
    url_match = re.match(r"^.*\.com:(.*)$", remote_url)
    dbg.dassert(url_match, "Can't parse '%s'", remote_url)
    name = url_match.group(1)  # type: ignore
    _LOG.debug("repo_name=%s", name)
    # We expect something like "alphamatic/amp".
    shape_match = re.match(r"^\S+/\S+$", name)
    dbg.dassert(shape_match, "repo_name='%s'", name)
    # Some remotes carry a ".git" suffix, like:
    #   origin  <user>@<host>.com:ParticleDev/ORG_Particle.git (fetch)
    git_suffix = ".git"
    if not name.endswith(git_suffix):
        return name
    return name[: len(name) - len(git_suffix)]
Example #2
0
def create_dir(
    dir_name: str,
    incremental: bool,
    abort_if_exists: bool = False,
    ask_to_delete: bool = False,
) -> None:
    """Create the directory `dir_name`, optionally wiping an existing one.

    The behavior is summarized by:

                          dir exists    dir does not exist
        incremental       no-op         mkdir
        not incremental   rm + mkdir    mkdir

    :param dir_name: path of the directory to create. It can't be None and
        can't normalize to the current directory (".")
    :param incremental: if True an existing directory is kept as it is; if
        False an existing directory is deleted and re-created
    :param abort_if_exists: if True, assert that `dir_name` doesn't exist
    :param ask_to_delete: if the dir exists and has to be deleted (i.e., not
        incremental), ask the user for confirmation before deleting
    :raises OSError: if the directory can't be created, e.g., because
        `dir_name` already exists but is not a directory
    """
    dbg.dassert_is_not(dir_name, None)
    dbg.dassert(
        os.path.normpath(dir_name) != ".", msg="Can't create the current dir"
    )
    if abort_if_exists:
        dbg.dassert_not_exists(dir_name)
    if os.path.isdir(dir_name):
        if incremental:
            # The dir exists and we want to keep it (i.e., incremental), so we
            # are done.
            return
        if ask_to_delete:
            si.query_yes_no(
                "Do you really want to delete dir '%s'?" % dir_name,
                abort_on_no=True,
            )
        # The dir exists and we want to create it from scratch (i.e., not
        # incremental), so we need to delete the dir.
        _LOG.debug("Deleting dir '%s'", dir_name)
        if os.path.islink(dir_name):
            # A symlink to a dir: remove the link itself, not the target.
            delete_file(dir_name)
        else:
            shutil.rmtree(dir_name)
    _LOG.debug("Creating directory '%s'", dir_name)
    # A race condition can happen when another process creates our target
    # directory right after we have found that it doesn't exist.
    # `exist_ok=True` tolerates exactly that case (the path exists and is a
    # directory), while still raising if the path exists but is not a
    # directory, instead of silently returning without a usable directory.
    os.makedirs(dir_name, exist_ok=True)
Example #3
0
def find_file_in_git_tree(file_in: str, super_module: bool = True) -> str:
    """Find the path of a file named `file_in` in the git client tree.

    :param file_in: base name of the file to look for (passed to
        `find -name`)
    :param super_module: forwarded to `get_client_root()` to select the root
        of the tree to search (by default the outermost super-module)
    :return: absolute path of the file found; the path is asserted to exist
    """
    root_dir = get_client_root(super_module=super_module)
    # Skip the results whose path contains the literal string ".git".
    # The dot is escaped on purpose: an unescaped `.` matches any character,
    # so paths containing e.g. "legit" or "digital" would be filtered out too.
    cmd = "find %s -name '%s' | grep -v '\\.git'" % (root_dir, file_in)
    _, file_name = si.system_to_one_line(cmd)
    _LOG.debug("file_name=%s", file_name)
    dbg.dassert(
        file_name != "",
        "Can't find file '%s' in dir '%s'",
        file_in,
        root_dir,
    )
    file_name = os.path.abspath(file_name)
    dbg.dassert_exists(file_name)
    return file_name
Example #4
0
def validate_datetime(timestamp: DATETIME_TYPE) -> pd.Timestamp:
    """
    Check that `timestamp` is tz-aware and in UTC, and return it as a
    `pd.Timestamp`.

    :param timestamp: `datetime.datetime` or `pd.Timestamp` to validate
    :return: the timestamp converted to a tz-aware `pd.Timestamp`
    """
    dbg.dassert_type_in(timestamp, [pd.Timestamp, datetime.datetime])
    as_pandas = pd.Timestamp(timestamp)
    tz = as_pandas.tzinfo
    dbg.dassert(tz, "Timestamp should be tz-aware.")
    # NOTE(review): the check reads the `zone` attribute of the tzinfo, so it
    # presumably expects a pytz-style timezone object -- confirm with callers.
    dbg.dassert_eq(tz.zone, "UTC", "Timezone should be UTC.")
    return as_pandas
Example #5
0
def type_to_string(type_as_str: str) -> str:
    """Return the short name of a type, stripping the "<class '...'>" wrapper.

    E.g., "core.dataflow.Node" instead of "<class 'core.dataflow.Node'>".

    :param type_as_str: either a type object or the string obtained by
        calling `str()` on a type
    :return: the text between "<class '" and "'>"
    """
    # Accept a type object too, by converting it to its string form.
    text = str(type_as_str) if isinstance(type_as_str, type) else type_as_str
    dbg.dassert_isinstance(text, str)
    # The string must look like:
    #   <class 'core.dataflow.Zscore'>
    head, tail = "<class '", "'>"
    dbg.dassert(text.startswith(head), text)
    dbg.dassert(text.endswith(tail), text)
    # Drop the wrapper on both sides.
    return text[len(head):-len(tail)]
Example #6
0
def get_path_from_git_root(file_name: str, super_module: bool) -> str:
    """Return the path of `file_name` relative to the root of the git client.

    :param file_name: path of a file; it is made absolute before comparing
    :param super_module: like get_client_root()
    :return: the portion of the absolute path that follows "<git root>/"
    """
    root_prefix = get_client_root(super_module) + "/"
    full_path = os.path.abspath(file_name)
    # The file must live under the git root.
    dbg.dassert(full_path.startswith(root_prefix))
    # Strip the root (including the trailing slash) from the absolute path.
    return full_path[len(root_prefix):]
Example #7
0
def change_filename_extension(filename: str, old_ext: str, new_ext: str) -> str:
    """Change extension of a filename (e.g. "data.csv" to "data.json").

    :param filename: the old filename (including extension)
    :param old_ext: the extension of the old filename; `filename` must end
        with it
    :param new_ext: the extension to replace the old extension
    :return: a filename with the new extension
    """
    dbg.dassert(
        filename.endswith(old_ext),
        "Extension '%s' doesn't match file '%s'",
        old_ext,
        filename,
    )
    # Remove the old extension by slicing off exactly `len(old_ext)` chars.
    # `str.rstrip(old_ext)` can't be used here: it treats its argument as a
    # set of characters, so e.g. "docs.csv".rstrip(".csv") returns "do".
    stem = filename[: len(filename) - len(old_ext)]
    # Add the new extension.
    return stem + new_ext
Example #8
0
def to_datetime(
        dates: Union[pd.Series, pd.Index]) -> Union[pd.Series, pd.Index]:
    """
    Convert string dates to datetime.

    This works like `pd.to_datetime`, but supports more date formats and shifts
    the dates to the end of period instead of the start.

    The format is inferred from the last element only, so mixed formats are
    not supported.

    :param dates: series or index of dates to convert
    :return: datetime dates
    """
    # TODO(Julia): Support ISO 8601 weeks.
    dbg.dassert_isinstance(dates, Iterable)
    dbg.dassert(not isinstance(dates, str))
    # The last element is the sample used to decide how to parse all dates.
    sample_idx = -1
    sample = dates.tolist()[sample_idx]
    # Pre-process the dates that `pd.to_datetime` would convert incorrectly.
    format_ = None
    known_fix = _handle_incorrect_conversions(sample)
    if known_fix is not None:
        format_, fix_func = known_fix
        dates = dates.map(fix_func)
        sample = dates.tolist()[sample_idx]
    # Try converting to datetime using `pd.to_datetime`.
    converted = pd.to_datetime(dates, format=format_, errors="coerce")
    if pd.isna(converted).all():
        # The standard conversion failed for every date: attempt our own
        # conversion.
        custom_format = _determine_date_format(sample)
        if custom_format is None:
            return converted
        format_, modification_func = custom_format
        dates = dates.map(modification_func)
        return pd.to_datetime(dates, format=format_)
    # The conversion succeeded: shift to end of period, unless the sample
    # round-trips to the same "%Y-%m-%d" string.
    converted_sample = converted.tolist()[sample_idx]
    round_trips = (
        not pd.isna(converted_sample)
        and converted_sample.strftime("%Y-%m-%d") == sample
    )
    if round_trips:
        return converted
    shift_func = _shift_to_period_end(sample)
    if shift_func is not None:
        converted = converted.map(shift_func)
    return converted
Example #9
0
def check_et_timezone(dt: DATETIME_TYPE) -> bool:
    """Assert that `dt` is tz-aware and its timezone is US Eastern Time.

    :param dt: timestamp to check
    :return: always True; a failed check is reported through `dbg.dassert`
    """
    # TODO(gp): Check if dateutils is better.
    import pytz

    tzinfo = dt.tzinfo
    dbg.dassert(tzinfo, "Timestamp should be tz-aware.")
    zone = tzinfo.zone  # type: ignore
    # Both names are accepted for the Eastern timezone.
    et_zones = (
        pytz.timezone("US/Eastern").zone,
        pytz.timezone("America/New_York").zone,
    )
    is_et = zone in et_zones
    dbg.dassert(
        is_et,
        "dt=%s (type=%s) tzinfo=%s (type=%s) tzinfo.zone=%s",
        dt,
        type(dt),
        tzinfo,
        type(tzinfo),
        zone,
    )
    return True
def _system(
    cmd: str,
    abort_on_error: bool,
    suppress_error: Optional[Any],
    suppress_output: bool,
    blocking: bool,
    wrapper: Optional[Any],
    output_file: Optional[Any],
    tee: bool,
    dry_run: bool,
    log_level: Union[int, str],
) -> Tuple[int, str]:
    """Execute a shell command.

    The command is run through `/bin/bash` with stderr merged into stdout.

    :param cmd: string with command to execute
    :param abort_on_error: whether we should raise in case of error or not
    :param suppress_error: set of error codes to suppress (i.e., to report as
        0), or None
    :param suppress_output: whether to print the output or not
        - If "ON_DEBUG_LEVEL" then print the output if the log level is DEBUG
    :param blocking: blocking system call or not
        - If not blocking, the process is polled for about 2 seconds: if it
          hasn't exited by then, a return code of 0 is assumed. The output is
          not collected in this mode
    :param wrapper: another command to prepend the execution of cmd (joined
        with `&&`)
    :param output_file: redirect stdout and stderr to this file. The path
        must have a directory component, which is created if missing
    :param tee: if True, tee stdout and stderr to output_file (requires
        `output_file`)
    :param dry_run: just print the final command but not execute it
    :param log_level: print the command to execute at level "log_level".
        - If "echo" then print the command line to screen as print and not
          logging
    :return: return code (int), output of the command (str)
    :raises RuntimeError: if `abort_on_error` is True and the return code is
        not 0
    """
    orig_cmd = cmd[:]
    # Prepare the command line.
    cmd = "(%s)" % cmd
    dbg.dassert_imply(tee, output_file is not None)
    if output_file is not None:
        dir_name = os.path.dirname(output_file)
        if not os.path.exists(dir_name):
            _LOG.debug("Dir '%s' doesn't exist: creating", dir_name)
            dbg.dassert(bool(dir_name), "dir_name='%s'", dir_name)
            os.makedirs(dir_name)
        if tee:
            cmd += " 2>&1 | tee %s" % output_file
        else:
            # The order of the redirections matters: stdout must be pointed
            # to the file first, and only then stderr is duplicated onto it.
            # With "2>&1 >file" stderr would keep going to the original
            # stdout and never reach the file.
            cmd += " >%s 2>&1" % output_file
    else:
        cmd += " 2>&1"
    if wrapper:
        cmd = wrapper + " && " + cmd
    #
    # TODO(gp): Add a check for the valid values.
    # TODO(gp): Make it "ECHO".
    if isinstance(log_level, str):
        dbg.dassert_eq(log_level, "echo")
        print("> %s" % orig_cmd)
        _LOG.debug("> %s", cmd)
    else:
        _LOG.log(log_level, "> %s", cmd)
    #
    dbg.dassert_in(suppress_output, ("ON_DEBUG_LEVEL", True, False))
    if suppress_output == "ON_DEBUG_LEVEL":
        # Suppress the output if the verbosity level is higher than DEBUG,
        # otherwise print.
        suppress_output = _LOG.getEffectiveLevel() > logging.DEBUG
    #
    output = ""
    if dry_run:
        _LOG.warning("Not executing cmd\n%s\nas per user request", cmd)
        rc = 0
        return rc, output
    # Execute the command.
    try:
        stdout = subprocess.PIPE
        stderr = subprocess.STDOUT
        p = subprocess.Popen(cmd,
                             shell=True,
                             executable="/bin/bash",
                             stdout=stdout,
                             stderr=stderr)
        output = ""
        if blocking:
            # Blocking call: get the output line by line, echoing it unless
            # suppressed.
            while True:
                line = p.stdout.readline().decode("utf-8")  # type: ignore
                if not line:
                    break
                if not suppress_output:
                    print(line.rstrip("\n"))
                output += line
            p.stdout.close()  # type: ignore
            rc = p.wait()
        else:
            # Not blocking.
            # Wait until process terminates (without using p.wait()), polling
            # every 0.1 secs for at most `max_cnt` iterations.
            max_cnt = 20
            cnt = 0
            while p.poll() is None:
                # Process hasn't exited yet, let's wait some time.
                time.sleep(0.1)
                cnt += 1
                _LOG.debug("cnt=%s, rc=%s", cnt, p.returncode)
                if cnt > max_cnt:
                    break
            if cnt > max_cnt:
                # Timeout: we assume it worked.
                rc = 0
            else:
                rc = p.returncode
        if suppress_error is not None:
            dbg.dassert_isinstance(suppress_error, set)
            # The error codes to suppress are reported as success.
            if rc in suppress_error:
                rc = 0
    except OSError as e:
        rc = -1
        _LOG.error("error=%s", str(e))
    _LOG.debug("rc=%s", rc)
    if abort_on_error and rc != 0:
        msg = ("\n" + prnt.frame("cmd='%s' failed with rc='%s'" % (cmd, rc)) +
               "\nOutput of the failing command is:\n%s\n%s\n%s" %
               (prnt.line(">"), output, prnt.line("<")))
        _LOG.error("%s", msg)
        raise RuntimeError("cmd='%s' failed with rc='%s'" % (cmd, rc))
    return rc, output
Example #11
0
    def _get_form4_13_payload(
        self,
        form_type: str,
        cik: Optional[peconf.CIK_TYPE] = None,
        cusip: Optional[peconf.CUSIP_TYPE] = None,
        start_datetime: Optional[str] = None,
        end_datetime: Optional[str] = None,
        date_mode: Optional[str] = None,
        output_type: str = "dataframes",
    ) -> Dict[str, List[peconf.SERVER_RESPONSE_TYPE]]:
        """
        Get payload data for forms 4 or 13 and a company.

        The chunks yielded by the payload generator are merged key by key into
        a single response before the output is processed.

        :param form_type: Form type. Allowed range of values: form4, form13.
        :param cik: Central Index Key as integer. Could be list of P1_CIK or
            just one identifier. Can't be passed together with `cusip`.
        :param cusip: Committee on Uniform Securities Identification Procedures
            number. Could be list or just one identifier. Can't be passed
            together with `cik`.
        :param start_datetime: Get data where filing date is >= start_date. Date
            format is "YYYY-MM-DDTHH-MI-SS". None means the entire available date range.
        :param end_datetime: Get data where filing date is <= end_date. Date format
            is "YYYY-MM-DDTHH-MI-SS". None means the entire available date range.
        :param date_mode: Define whether dates are
            interpreted as publication dates or knowledge dates
        :param output_type: Output format: 'dict' or 'dataframes'.
        :return: Dict with a data tables.
        """
        # Validate the arguments.
        both_ids_passed = cik is not None and cusip is not None
        phdbg.dassert(
            not both_ids_passed,
            msg="You cannot pass CIK and CUSIP parameters at the same time.",
        )
        phdbg.dassert(
            form_type in ("form13", "form4"),
            msg="The form_type parameter should be form13 or form4.",
        )
        phdbg.dassert(
            output_type in ("dict", "dataframes"),
            msg="The output_type parameter should be a dict or dataframes.",
        )
        # Build the request.
        params: Dict[str, Any] = {}
        params = self._set_optional_params(
            params,
            start_datetime=start_datetime,
            end_datetime=end_datetime,
            cik=cik,
            cusip=cusip,
            date_mode=date_mode,
        )
        route = self._api_routes["PAYLOAD"]
        url = f"{self.base_url}{route}/{form_type}"
        # Accumulate the chunks of the response, key by key.
        compound_data: peconf.SERVER_RESPONSE_TYPE = {}
        chunks = self._payload_form_cik_cusip_generator(
            method="GET", url=url, headers=self.headers, params=params
        )
        for chunk in chunks:
            for key in chunk:
                if key not in compound_data:
                    compound_data[key] = chunk[key]
                else:
                    compound_data[key] += chunk[key]
        return self._process_form_4_13_10_output(
            compound_data, output_type=output_type
        )