def build_table(fn, headers):
    """Build a Table object from a .tsv.gz file.

    The table name is derived from the file name by removing ``TSV_EXT``
    and replacing dots with underscores.  One column is created for every
    header of the file, plus one for every column listed for that table
    in ``DB_TRANSFORM`` that is not already a header.  Columns without an
    explicit type in ``DB_TRANSFORM`` default to ``UnicodeText``.

    :param fn: the .tsv.gz file
    :type fn: str
    :param headers: headers in the file
    :type headers: list
    :returns: the table, registered on the module-level ``metadata``
    :rtype: :class:`sqlalchemy.Table`
    """
    logging.debug('building table for file %s', fn)
    table_name = fn.replace(TSV_EXT, '').replace('.', '_')
    table_map = DB_TRANSFORM.get(table_name) or {}

    # De-duplicate while keeping a stable order (file headers first, then
    # the extra columns from DB_TRANSFORM).  Iterating a plain set here
    # would make the column order of the table vary from run to run.
    seen = set()
    all_headers = []
    for header in list(headers) + list(table_map):
        if header not in seen:
            seen.add(header)
            all_headers.append(header)

    columns = []
    for header in all_headers:
        col_info = table_map.get(header) or {}
        col_type = col_info.get('type') or sqlalchemy.UnicodeText
        # A length only makes sense for String; instantiate the type so
        # the length is carried into the column definition.
        if 'length' in col_info and col_type is sqlalchemy.String:
            col_type = sqlalchemy.String(length=col_info['length'])
        col_args = {
            'name': header,
            'type_': col_type,
            'index': col_info.get('index', False)
        }
        columns.append(sqlalchemy.Column(**col_args))
    return sqlalchemy.Table(table_name, metadata, *columns)
def generate_content(fd, headers, table):
    """Generate blocks of rows to be written to the database.

    Each line is decoded as UTF-8 and split on tabs; fields equal to
    ``\\N`` become ``None``.  Lines whose number of fields differs from
    the number of headers are skipped.  Transform functions configured
    in ``DB_TRANSFORM`` for the table are applied to the matching
    columns, and soundex columns are added for the ``title_basics``,
    ``title_akas`` and ``name_basics`` tables.

    :param fd: an iterable yielding the lines of the (decompressed)
               .tsv.gz file as UTF-8 encoded bytes
    :type fd: binary file-like object
    :param headers: headers in the file
    :type headers: list
    :param table: the table that will be populated
    :type table: :class:`sqlalchemy.Table`
    :returns: blocks of rows to insert (each row is a dict); a block is
              yielded once it holds ``BLOCK_SIZE`` rows, and the final
              block may be shorter
    :rtype: list
    """
    table_name = table.name
    headers_len = len(headers)
    data_transf = {
        column: conf['transform']
        for column, conf in DB_TRANSFORM.get(table_name, {}).items()
        if 'transform' in conf
    }

    data = []
    for line in fd:
        # Strip only the line terminator.  A bare strip() would also eat
        # leading/trailing tabs, so a row whose first or last field is
        # empty would lose a column and be discarded by the check below.
        s_line = line.decode('utf-8').rstrip('\r\n').split('\t')
        if len(s_line) != headers_len:
            # Malformed line: wrong number of fields.
            continue
        info = dict(zip(headers,
                        [x if x != r'\N' else None for x in s_line]))
        for key, tranf in data_transf.items():
            if key in info:
                info[key] = tranf(info[key])
        if table_name == 'title_basics':
            info['t_soundex'] = title_soundex(info['primaryTitle'])
        elif table_name == 'title_akas':
            info['t_soundex'] = title_soundex(info['title'])
        elif table_name == 'name_basics':
            (info['ns_soundex'],
             info['sn_soundex'],
             info['s_soundex']) = name_soundexes(info['primaryName'])
        data.append(info)
        if len(data) >= BLOCK_SIZE:
            yield data
            data = []
    # Flush the last, possibly partial, block.
    if data:
        yield data