Example #1
0
def load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync):
    """Bulk-load every nonbib column file into its Postgres table.

    For each type in ``nonbib.NonBib.all_types`` the matching column file is
    wrapped in a reader object and streamed into ``<schema>.<type>`` with the
    cursor's ``copy_from``; each table is committed as soon as it is loaded.
    The ``datalinks`` type is delegated to
    ``datalinks.load_column_files_datalinks_table`` because its data is
    spread over several files listed in the config.

    After all tables have been loaded, ``sql_sync.create_joined_rows`` is
    called to build the unified row view.

    Args:
        config: mapping providing ``DATA_PATH`` and, for every type, an
            upper-cased key holding that type's file name.
        nonbib_db_engine: engine object exposing ``raw_connection()``.
        nonbib_db_conn: connection handed to ``sql_sync.create_joined_rows``.
        sql_sync: object exposing ``schema`` and ``create_joined_rows``.

    Raises:
        Any exception raised while reading or copying is propagated to the
        caller; the cursor and raw connection are closed first.
    """
    raw_conn = nonbib_db_engine.raw_connection()
    try:
        cur = raw_conn.cursor()
        try:
            for t in nonbib.NonBib.all_types:
                table_name = sql_sync.schema + '.' + t
                logger.info('processing {}'.format(table_name))
                # here we have to read multiple files, information in config
                # file is in a list
                if t == 'datalinks':
                    datalinks.load_column_files_datalinks_table(
                        config, table_name, t, raw_conn, cur)
                    continue

                filename = config['DATA_PATH'] + config[t.upper()]
                if t == 'canonical':
                    r = reader.BibcodeFileReader(filename)
                elif t in ('refereed', 'pub_openaccess', 'private',
                           'ocrabstract', 'nonarticle'):
                    r = reader.OnlyTrueFileReader(filename)
                else:
                    r = reader.StandardFileReader(t, filename)
                if r:
                    cur.copy_from(r, table_name)
                    raw_conn.commit()
        finally:
            # release the cursor even when a load step fails
            cur.close()
    finally:
        # always hand the raw connection back, success or failure
        raw_conn.close()
    sql_sync.create_joined_rows(nonbib_db_conn)
    def standard_reader_test(self, file_type, spot_checks, data_dir='data1/'):
        """Verify that the standard reader emits the expected sql values.

        The test file for ``file_type`` is located via
        ``self.config['TEST_DATA_PATH'] + data_dir + self.config[FILE_TYPE]``
        and read line by line through ``reader.StandardFileReader``.

        For every type except ``relevance`` each returned line must be
        ``bibcode<TAB>{...}``: exactly two tab-separated parts, with the
        second one wrapped in curly braces (a sql array literal).

        Args:
            file_type: name of the column file type to read.
            spot_checks: list of (bibcode, value) pairs to verify.  It might
                include the first and last bibcodes in the file and other
                interesting or edge cases.  Every pair must be found.
            data_dir: sub-directory of the test data path to read from.
        """
        filename = self.config['TEST_DATA_PATH'] + data_dir + self.config[
            file_type.upper()]
        # count the raw lines, closing the file handle when done
        with open(filename) as f:
            lines_in_file = sum(1 for _ in f)
        # types for which the reader's line count is not compared with the
        # raw file's line count
        multi_line = ('simbad', 'grants', 'citation', 'reference', 'reader')
        # trailing comma matters: without it this is a plain string and the
        # membership tests below become substring tests
        multi_value = ('relevance',)
        bibcode_count = 0
        spot_checks_found = []
        r = reader.StandardFileReader(file_type, filename)
        try:
            line = r.read()
            while line:
                bibcode_count += 1
                parts = line.split('\t')
                bibcode = parts[0].strip()
                if file_type in multi_value:
                    # everything after the first 20 characters is the value
                    value = line[20:]
                else:
                    # check the shape before indexing so that a malformed
                    # line fails with a readable message, not an IndexError
                    self.assertEqual(
                        2, len(parts),
                        '{} lines should only include bibcode and value array {}'.
                        format(file_type, line))
                    value = parts[1].strip()
                    # slices instead of indexes: an empty value fails the
                    # assertion rather than raising
                    self.assertEqual('{', value[:1],
                                     'invalid sql array {}'.format(value))
                    self.assertEqual('}', value[-1:],
                                     'invalid sql array {}'.format(value))
                # we spot check a couple fields
                for spot_check in spot_checks:
                    spot_bibcode = spot_check[0]
                    if bibcode == spot_bibcode:
                        spot_checks_found.append(spot_bibcode)
                        spot_value = spot_check[1]
                        self.assertEqual(
                            spot_value, value,
                            'bad {} value for bibcode {}, expected {}, received {}'
                            .format(file_type, bibcode, spot_value, value))
                line = r.read()
        finally:
            # close the reader even when an assertion above fails
            r.close()
        self.assertEqual(
            len(spot_checks), len(spot_checks_found),
            'for {} did not find all spot checks'.format(file_type))
        if file_type not in multi_line:
            self.assertEqual(
                lines_in_file, bibcode_count,
                '{} standard reader returned wrong number of lines'.format(
                    file_type))
    def test_bad_bibcode(self):
        """A bad bibcode in the input file should be skipped.

        The ``dataInvalid/`` download file contains one bad bibcode.  The
        reader is expected to skip that line and process the rest of the
        file, so it must return exactly one line fewer than the file holds.
        """
        file_type = 'download'
        filename = self.config[
            'TEST_DATA_PATH'] + 'dataInvalid/' + self.config[file_type.upper()]
        # count the raw lines, closing the file handle when done
        with open(filename) as f:
            lines_in_file = sum(1 for _ in f)
        r = reader.StandardFileReader(file_type, filename)
        bibcode_count = 0
        try:
            line = r.read()
            while line:
                bibcode_count += 1
                line = r.read()
        finally:
            # close the reader, as standard_reader_test does
            r.close()
        # one line in the file carries the bad bibcode and must be dropped
        self.assertEqual(bibcode_count, lines_in_file - 1,
                         'bad bibcode in file not skipped')