Python find_duplicates_from_dict_ticdat Exemples, ticdat.utils.find_duplicates_from_dict_ticdat Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : jsontd.py Projet : nandi6uc/ticdat

    def find_duplicates(self, json_file_path, from_pandas=False):
        """
        Report how many times each duplicated primary key appears in a json file.

        :param json_file_path: Path to a json file. The file should encode a
                               dictionary keyed by table name.

        :param from_pandas: boolean. When truthy, the file is read through the
                            json reader of a PanDatFactory built from this
                            factory's full schema, and every row of every table
                            is converted to a plain tuple. When falsy, the file
                            is read with self._create_jdict.

        :return: A dictionary keyed by the names of the tables that have primary keys.
                 Each value is itself a dictionary, keyed by the primary key values
                 encountered in that table, whose values are the number of records
                 in the json entry carrying that primary key.
                 Counts smaller than 2 are dropped, since they aren't duplicates.
                 An empty dictionary is returned when the duplicate search
                 yields a falsy result.
        """
        factory = self.tic_dat_factory
        _standard_verify(factory)
        if not from_pandas:
            rows_by_table = self._create_jdict(json_file_path)
        else:
            # Imported lazily so the pandas-backed reader is only loaded on request.
            from ticdat import PanDatFactory
            full_schema = factory.schema(include_ancillary_info=True)
            pan_factory = PanDatFactory.create_from_full_schema(full_schema)
            pan_dat = pan_factory.json.create_pan_dat(json_file_path)
            rows_by_table = {}
            for table in pan_factory.all_tables:
                frame = getattr(pan_dat, table)
                # index=False keeps the frame index out of the row tuples.
                rows_by_table[table] = [tuple(record) for record in
                                        frame.itertuples(index=False)]
        duplicates = find_duplicates_from_dict_ticdat(factory, rows_by_table)
        if duplicates:
            return duplicates
        return {}

Exemple #2

0

Afficher le fichier

def read_lingo_text(tdf,results_text):
    """
    Build a TicDat object from Lingo .ldt strings
    :param tdf: A TicDatFactory defining the schema
    :param results_text: A dictionary mapping table names to the string that
                         holds that table's rows: one row per line, fields
                         separated by whitespace. The first occurrence of
                         tdf.lingo_prepend is stripped from each table name
                         before the TicDat object is built.
    :return: A TicDat object consistent with tdf
    """
    for table_text in results_text.values():
        verify(stringish(table_text), "text needs to be a string")

    def _parse_token(token):
        # Fields that parse as numbers become floats; everything else stays a string.
        try:
            return float(token)
        except ValueError:
            return token

    dict_with_lists = defaultdict(list)
    for table_name in results_text:
        stripped_lines = results_text[table_name].strip().split("\n")
        dict_with_lists[table_name] = [
            [_parse_token(token) for token in one_line.strip().split()]
            for one_line in stripped_lines
        ]

    # Deliberately an assert: as the message says, running with asserts
    # disabled lets duplicate rows silently overwrite each other.
    duplicates = find_duplicates_from_dict_ticdat(tdf, dict_with_lists)
    assert not duplicates, \
            "duplicates were found - if asserts are disabled, duplicate rows will overwrite"

    tables = {}
    for prepended_name, rows in dict_with_lists.items():
        tables[prepended_name.replace(tdf.lingo_prepend, "", 1)] = rows
    return tdf.TicDat(**tables)

Exemple #3

0

Afficher le fichier

Fichier : jsontd.py Projet : snelson3/opalytics-ticdat

 def find_duplicates(self, json_file_path):
     """
     Report how many times each duplicated primary key appears in a json file.
     :param json_file_path: Path to a json file. The file should encode a
                            dictionary keyed by table name.
     :return: A dictionary keyed by the names of the tables that have primary keys.
              Each value is itself a dictionary, keyed by the primary key values
              encountered in that table, whose values are the number of records
              in the json entry carrying that primary key.
              Counts smaller than 2 are dropped, since they aren't duplicates.
              An empty dictionary is returned when the duplicate search
              yields a falsy result.
     """
     factory = self.tic_dat_factory
     _standard_verify(factory)
     duplicates = find_duplicates_from_dict_ticdat(
         factory, self._create_jdict(json_file_path))
     if not duplicates:
         return {}
     return duplicates

Exemple #4

0

Afficher le fichier

def read_opl_text(tdf,text, commaseperator = True):
    """
    Read an OPL .dat string

    The text is scanned one character at a time by a small state machine that
    collects, for every table name found, a list of rows (each row a list of
    fields). Unquoted fields are converted to float; quoted fields are kept as
    strings.

    :param tdf: A TicDatFactory defining the schema
    :param text: A string consistent with the OPL .dat format
    :param commaseperator: If falsy, a whitespace, '{' or ';' character met
                           while a number or a just-closed quoted field is
                           pending is treated as a ',' separator instead of
                           being skipped.
    :return: A TicDat object consistent with tdf
    """
    verify(stringish(text), "text needs to be a string")
    # probably want to verify something about the ticdat factory, look at the wiki
    dict_with_lists = defaultdict(list)
    # Parser states:
    #   NONE      - outside any table; characters accumulate into table_name
    #   TABLE     - after '=' or after a row was stored; waiting for the next row
    #   ROW       - inside '<...>'; waiting for the next field
    #   ROWSTRING - inside a quoted string opened directly in TABLE mode
    #               (stored as a single-field row when the quote closes)
    #   ROWNUM    - an unquoted value opened directly in TABLE mode
    #   FIELD     - a quoted string inside a row has just been closed
    #   STRING    - inside a quoted string that belongs to a row
    #   NUMBER    - accumulating an unquoted value inside a row
    NONE, TABLE, ROW, ROWSTRING, ROWNUM, FIELD, STRING,  NUMBER = 1, 2, 3, 4, 5, 6, 7, 8
    mode = NONE
    field = ''       # characters of the field currently being read
    table_name = ''  # name of the table currently being read
    row = []         # fields collected so far for the current row

    def to_number(st, pos):
        # Convert an unquoted field to float; on failure report the offending
        # text and character position through verify.
        # NOTE(review): presumably verify raises when its first argument is
        # falsy; otherwise this falls through and returns None - confirm.
        try:
            return float(st)
        except ValueError:
            verify(False,
                   "Badly formatted string - Field '%s' is not a valid number. Character position [%s]." % (st, pos))

    for i,c in enumerate(text):
        # Outside of quoted strings, whitespace, '{' and ';' carry no data:
        # they are skipped, unless commaseperator is falsy and a value is
        # pending, in which case they act as a ',' and are handled below.
        if mode not in [STRING, ROWSTRING] and (c.isspace() or c == '{' or c == ';'):
            if mode in [NUMBER, ROWNUM, FIELD] and not commaseperator:
                c = ','
            else:
                continue
        if mode in [STRING, ROWSTRING]:
            if c == '"':
                # A quote preceded by a backslash is an escaped quote: the
                # backslash already appended to field is replaced by the quote.
                # NOTE(review): only the single preceding character is checked,
                # so an escaped backslash followed by a closing quote would
                # also be treated as an escaped quote - confirm intent.
                if text[i-1] == '\\':
                    field = field[:-1] + '"'
                else:
                    if mode is ROWSTRING:
                        # Closing quote of a string opened directly inside the
                        # table: store it as a one-field row.
                        row.append(field)
                        field = ''
                        # Compare against the first stored row of this table,
                        # or against the row itself when the table is empty.
                        verify(len(row) == len((dict_with_lists[table_name] or [row])[0]),
                               "Inconsistent row lengths found for table %s" % table_name)
                        dict_with_lists[table_name].append(row)
                        row = []
                        mode = TABLE
                    else:
                        # Closing quote inside a row: the field is appended to
                        # the row later, on ',' or '>'.
                        mode = FIELD
            else:
                field += c
        elif c == '=':
            # '=' ends the table name and opens the table.
            verify(mode is NONE, "Badly formatted string, unrecognized '='. Character position [%s]"%i)
            verify(len(table_name) > 0, "Badly formatted string, table name can't be blank. Character position [%s]"%i)
            verify(table_name not in dict_with_lists.keys(), "Can't have duplicate table name. [Character position [%s]"%i)
            # Register the table now so that an empty table still gets an entry.
            dict_with_lists[table_name] = []
            mode = TABLE
        elif c == '<':
            # '<' opens a row.
            verify(mode is TABLE, "Badly formatted string, unrecognized '<'. Character position [%s]"%i)
            mode = ROW

        elif c == ',':
            verify(mode in [ROW, FIELD, NUMBER, ROWNUM, TABLE], "Badly formatted string, unrecognized ','. \
                                                                    Character position [%s]"%i)
            # A comma between rows carries no data.
            if mode is TABLE:
                continue
            if mode is ROWNUM:
                # The comma ends an unquoted value opened directly inside the
                # table: store it as a one-field row.
                field = to_number(field,i)
                row.append(field)
                field = ''
                verify(len(row) == len((dict_with_lists[table_name] or [row])[0]),
                       "Inconsistent row lengths found for table %s" % table_name)
                dict_with_lists[table_name].append(row)
                row = []
                mode = TABLE
            else:
                # The comma ends a field inside a row; only unquoted (NUMBER)
                # fields are converted to float.
                if mode is NUMBER:
                    field = to_number(field,i)
                row.append(field)
                field = ''
                mode = ROW

        elif c == '"':
            # Opening quote (closing quotes are handled in the first branch).
            verify(mode in [ROW, TABLE], "Badly formatted string, unrecognized '\"'. Character position [%s]"%i)
            if mode is ROW:
                mode = STRING
            if mode is TABLE:
                mode = ROWSTRING

        elif c == '}':
            # '}' closes the table, flushing a pending ROWNUM value first.
            verify(mode in [TABLE, ROWNUM], "Badly formatted string, unrecognized '}'. Character position [%s]"%i)
            if mode is ROWNUM:
                field = to_number(field,i)
                row.append(field)
                field = ''
                verify(len(row) == len((dict_with_lists[table_name] or [row])[0]),
                       "Inconsistent row lengths found for table %s" % table_name)
                dict_with_lists[table_name].append(row)
            row = []
            table_name = ''
            mode = NONE

        elif c == '>':
            # '>' closes the row, flushing the pending field first.
            verify(mode in [ROW, FIELD, NUMBER], "Badly formatted string, unrecognized '>'. \
                                                                    Character position [%s]"%i)
            if mode is NUMBER:
                field = to_number(field,i)
                mode = FIELD
            if mode is FIELD:
                row.append(field)
                field = ''
            verify(len(row) == len((dict_with_lists[table_name] or [row])[0]),
                   "Inconsistent row lengths found for table %s"%table_name)
            dict_with_lists[table_name].append(row)
            row = []
            mode = TABLE
        else:
            # Any other character is part of a table name or an unquoted value.
            verify(mode in [NONE, ROW, ROWNUM, FIELD, NUMBER], "Badly formatted string, \
                                                                    unrecognized '%s'. Character position [%s]"%(c,i))
            if mode is NONE:
                table_name += c
            # NOTE(review): TABLE is not in the list accepted by the verify
            # just above, so if verify raises on failure this branch (the only
            # place that enters ROWNUM) looks unreachable - confirm.
            elif mode is TABLE:
                mode = ROWNUM
                field += c
            else:
                mode = NUMBER
                field += c
    # Deliberately an assert: as the message says, running with asserts
    # disabled lets duplicate rows silently overwrite each other.
    assert not find_duplicates_from_dict_ticdat(tdf, dict_with_lists), \
            "duplicates were found - if asserts are disabled, duplicate rows will overwrite"

    # The first occurrence of tdf.opl_prepend is removed from each table name
    # before the rows are handed to the TicDat constructor.
    return tdf.TicDat(**{k.replace(tdf.opl_prepend,"",1):v for k,v in dict_with_lists.items()})