コード例 #1
0
def build_table(fn, headers):
    """Build a Table object from a .tsv.gz file.

    One column is created for every name in *headers* (in file order,
    duplicates ignored), followed by any additional columns that
    ``DB_TRANSFORM`` declares for this table, sorted by name.  Per-column
    settings (``type``, ``length``, ``index``) are read from
    ``DB_TRANSFORM``; columns without an entry default to an un-indexed
    ``sqlalchemy.UnicodeText``.

    :param fn: the .tsv.gz file
    :type fn: str
    :param headers: headers in the file
    :type headers: list
    :returns: the table, attached to the ``metadata`` object in scope
    :rtype: :class:`sqlalchemy.Table`
    """
    logging.debug('building table for file %s', fn)
    table_name = fn.replace(TSV_EXT, '').replace('.', '_')
    table_map = DB_TRANSFORM.get(table_name) or {}

    # Keep the column order deterministic: iterating a plain set would give
    # an arbitrary order that can change from one run to the next.
    column_names = []
    seen = set()
    for header in headers:
        if header not in seen:
            seen.add(header)
            column_names.append(header)
    # Columns that only exist in the configuration (not in the file).
    column_names.extend(sorted(set(table_map) - seen))

    columns = []
    for header in column_names:
        col_info = table_map.get(header) or {}
        col_type = col_info.get('type') or sqlalchemy.UnicodeText
        # A length only makes sense for String; instantiate it with that
        # length instead of passing the bare class.
        if 'length' in col_info and col_type is sqlalchemy.String:
            col_type = sqlalchemy.String(length=col_info['length'])
        columns.append(sqlalchemy.Column(
            name=header,
            type_=col_type,
            index=col_info.get('index', False),
        ))
    return sqlalchemy.Table(table_name, metadata, *columns)
コード例 #2
0
def generate_content(fd, headers, table):
    """Generate blocks of rows to be written to the database.

    Every line read from *fd* is decoded as UTF-8, split on tabs and turned
    into a dict keyed by *headers*.  Fields holding the null marker (a
    backslash followed by ``N``) become ``None``.  Lines whose number of
    fields differs from ``len(headers)`` are skipped.  The ``transform``
    callables configured in ``DB_TRANSFORM`` for this table are applied to
    their columns, and soundex columns are added for the ``title_basics``,
    ``title_akas`` and ``name_basics`` tables.

    :param fd: a binary file object for the .tsv.gz file; iterating it must
               give ``bytes`` lines, since each line is decoded here
    :type fd: file object
    :param headers: headers in the file
    :type headers: list
    :param table: the table that will be populated
    :type table: :class:`sqlalchemy.Table`
    :returns: a generator of blocks of rows to insert; a block is emitted
              once it holds at least ``BLOCK_SIZE`` rows, and any remaining
              rows are emitted as a final, shorter block
    :rtype: generator of list of dict
    """
    table_name = table.name
    headers_len = len(headers)
    # Conversion callables, keyed by column name, configured for this table.
    data_transf = {
        column: conf['transform']
        for column, conf in DB_TRANSFORM.get(table_name, {}).items()
        if 'transform' in conf
    }
    data = []
    for line in fd:
        # Remove only the line terminator.  A bare strip() would also eat
        # leading/trailing tabs, so a row whose first or last field is empty
        # would lose a column and be dropped by the length check below.
        s_line = line.decode('utf-8').rstrip('\r\n').split('\t')
        if len(s_line) != headers_len:
            # Row does not match the header layout: skip it.
            continue
        info = dict(zip(headers, [x if x != r'\N' else None for x in s_line]))
        for key, transf in data_transf.items():
            if key in info:
                info[key] = transf(info[key])
        if table_name == 'title_basics':
            info['t_soundex'] = title_soundex(info['primaryTitle'])
        elif table_name == 'title_akas':
            info['t_soundex'] = title_soundex(info['title'])
        elif table_name == 'name_basics':
            (info['ns_soundex'],
             info['sn_soundex'],
             info['s_soundex']) = name_soundexes(info['primaryName'])
        data.append(info)
        if len(data) >= BLOCK_SIZE:
            yield data
            # Start a fresh list: the consumer keeps the one just yielded.
            data = []
    if data:
        yield data