def load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync):
    """Bulk-load every nonbib column file into Postgres, then build joined rows.

    For each type listed in ``nonbib.NonBib.all_types`` the matching column
    file is streamed into the table ``<sql_sync.schema>.<type>`` through the
    cursor's ``copy_from``.  Once every table has been loaded,
    ``sql_sync.create_joined_rows`` is called to create the unified row view.

    Parameters:
        config: mapping that provides ``DATA_PATH`` and, for each type, an
            entry keyed by the upper-cased type name; the two are
            concatenated to form the column file path.
        nonbib_db_engine: engine object exposing ``raw_connection()``; the
            returned connection is used for the COPY operations.
        nonbib_db_conn: connection handed to ``sql_sync.create_joined_rows``.
        sql_sync: object providing ``schema`` and ``create_joined_rows``.

    The cursor and the raw connection are closed even when a load fails, so
    an error while processing one file does not leak the connection.
    """
    # Types read with OnlyTrueFileReader instead of StandardFileReader.
    only_true_types = ('refereed', 'pub_openaccess', 'private',
                       'ocrabstract', 'nonarticle')

    raw_conn = nonbib_db_engine.raw_connection()
    try:
        cur = raw_conn.cursor()
        try:
            for t in nonbib.NonBib.all_types:
                table_name = sql_sync.schema + '.' + t
                logger.info('processing {}'.format(table_name))

                if t == 'datalinks':
                    # datalinks is spread over several files (the config
                    # entry is a list), so a dedicated loader handles it.
                    datalinks.load_column_files_datalinks_table(
                        config, table_name, t, raw_conn, cur)
                    continue

                filename = config['DATA_PATH'] + config[t.upper()]
                if t == 'canonical':
                    r = reader.BibcodeFileReader(filename)
                elif t in only_true_types:
                    r = reader.OnlyTrueFileReader(filename)
                else:
                    r = reader.StandardFileReader(t, filename)

                # Guard kept from the original code; a falsy reader is
                # skipped without loading or committing anything.
                if r:
                    cur.copy_from(r, table_name)
                    raw_conn.commit()
        finally:
            cur.close()
    finally:
        raw_conn.close()

    sql_sync.create_joined_rows(nonbib_db_conn)
def standard_reader_test(self, file_type, spot_checks, data_dir='data1/'):
    """Verify that StandardFileReader produces the expected SQL values.

    Reads the test column file for ``file_type`` through
    ``reader.StandardFileReader`` and checks every returned line.  Unless
    the type is multi-value, each line must hold exactly a bibcode and a
    value, and the value must be wrapped in ``{`` ... ``}``.

    Parameters:
        file_type: name of the column type; its upper-cased form is the
            config key holding the file name.
        spot_checks: list of (bibcode, value) pairs to verify.  It might
            include the first and last bibcodes in the file and other
            interesting or edge cases.  Every pair must be found.
        data_dir: sub-directory of ``TEST_DATA_PATH`` holding the file.

    For types not listed in ``multi_line`` the number of lines returned by
    the reader must equal the number of lines in the file.
    """
    filename = (self.config['TEST_DATA_PATH'] + data_dir +
                self.config[file_type.upper()])
    # Count the raw lines with a context manager so the handle is closed.
    with open(filename) as handle:
        lines_in_file = sum(1 for _ in handle)

    # Types excluded from the final line-count comparison.
    multi_line = ('simbad', 'grants', 'citation', 'reference', 'reader')
    # The trailing comma matters: without it this is a plain string and
    # ``in`` becomes a substring test.
    multi_value = ('relevance',)
    is_multi_value = file_type in multi_value

    bibcode_count = 0
    spot_checks_found = []
    r = reader.StandardFileReader(file_type, filename)
    try:
        line = r.read()
        while line:
            bibcode_count += 1
            parts = line.split('\t')
            bibcode = parts[0].strip()
            if is_multi_value:
                # Everything from offset 20 on is the value; presumably the
                # bibcode and its separator fill the first 20 characters --
                # TODO confirm against the reader's output format.
                value = line[20:]
            else:
                # Check the column count before indexing parts[1] so a
                # malformed line fails with this message, not IndexError.
                self.assertEqual(
                    2, len(parts),
                    '{} lines should only include bibcode and value array {}'.
                    format(file_type, line))
                value = parts[1].strip()
                # Slices rather than indexing so an empty value fails the
                # assertion instead of raising IndexError.
                self.assertEqual('{', value[:1],
                                 'invalid sql array {}'.format(value))
                self.assertEqual('}', value[-1:],
                                 'invalid sql array {}'.format(value))

            # we spot check a couple fields
            for spot_check in spot_checks:
                spot_bibcode = spot_check[0]
                if bibcode == spot_bibcode:
                    spot_checks_found.append(spot_bibcode)
                    spot_value = spot_check[1]
                    self.assertEqual(
                        spot_value, value,
                        'bad {} value for bibcode {}, expected {}, received {}'
                        .format(file_type, bibcode, spot_value, value))
            line = r.read()
    finally:
        # Close the reader even when one of the assertions above fails.
        r.close()

    self.assertEqual(
        len(spot_checks), len(spot_checks_found),
        'for {} did not find all spot checks'.format(file_type))
    if file_type not in multi_line:
        self.assertEqual(
            lines_in_file, bibcode_count,
            '{} standard reader returned wrong number of lines'.format(
                file_type))
def test_bad_bibcode(self):
    """A bad bibcode in the input file should be skipped, the rest processed.

    The download file under ``dataInvalid/`` is expected to contain one bad
    bibcode, so the reader should return one line fewer than the file holds.
    """
    file_type = 'download'
    filename = (self.config['TEST_DATA_PATH'] + 'dataInvalid/' +
                self.config[file_type.upper()])
    # Count the raw lines with a context manager so the handle is closed.
    with open(filename) as handle:
        lines_in_file = sum(1 for _ in handle)

    bibcode_count = 0
    r = reader.StandardFileReader(file_type, filename)
    try:
        line = r.read()
        while line:
            bibcode_count += 1
            line = r.read()
    finally:
        # Release the reader, matching how standard_reader_test closes it.
        r.close()

    self.assertEqual(bibcode_count, lines_in_file - 1,
                     'bad bibcode in file not skipped')