Example #1
0
 def test_full_outer_join(self):
     # Full outer join of tab1 and tab2 on column 0 of each table.
     # Rows without a partner on the other side are padded with empty
     # strings; matching rows carry the columns of both tables.
     expected = [
         ['id', 'name', 'i_work_here', 'id', 'age', 'i_work_here'],
         [u'1', u'Chicago Reader', u'first', u'1', u'first', u'0'],
         [u'1', u'Chicago Reader', u'first', u'1', u'second', u'0'],
         # This row is one value longer than the header
         # (presumably a ragged fixture row -- confirm against setUp).
         [u'2', u'Chicago Sun-Times', u'only', u'2', u'only', u'0', u'0'],
         [u'3', u'Chicago Tribune', u'only', u'', u'', u''],
         [u'1', u'Chicago Reader', u'second', u'1', u'first', u'0'],
         [u'1', u'Chicago Reader', u'second', u'1', u'second', u'0'],
         [u'', u'', u'', u'4', u'only', u'0'],
     ]

     actual = join.full_outer_join(self.tab1, 0, self.tab2, 0)

     self.assertEqual(actual, expected)
Example #2
0
 def test_full_outer_join(self):
     # Expected result of a full outer join on the first column of both
     # fixture tables: header first, then the joined data rows.
     header = ['id', 'name', 'i_work_here', 'id', 'age', 'i_work_here']
     rows = [
         [u'1', u'Chicago Reader', u'first', u'1', u'first', u'0'],
         [u'1', u'Chicago Reader', u'first', u'1', u'second', u'0'],
         # Seven values although the header has six columns.
         [u'2', u'Chicago Sun-Times', u'only', u'2', u'only', u'0', u'0'],
         # Left-only row: the right-hand columns are empty strings.
         [u'3', u'Chicago Tribune', u'only', u'', u'', u''],
         [u'1', u'Chicago Reader', u'second', u'1', u'first', u'0'],
         [u'1', u'Chicago Reader', u'second', u'1', u'second', u'0'],
         # Right-only row: the left-hand columns are empty strings.
         [u'', u'', u'', u'4', u'only', u'0'],
     ]

     joined = join.full_outer_join(self.tab1, 0, self.tab2, 0)

     self.assertEqual(joined, [header] + rows)
Example #3
0
 def test_full_outer_join_no_duplicate_column(self):
     # Allow a longer diff in the failure message than the default.
     self.maxDiff = 1000

     # With no_duplicate_id_column=True the expected output has a single
     # 'id' column; the right-only row (id 4) carries its key there.
     expected = [
         ['id', 'name', 'i_work_here', 'age', 'i_work_here'],
         [u'1', u'Chicago Reader', u'first', u'first', u'0'],
         [u'1', u'Chicago Reader', u'first', u'second', u'0'],
         # One value longer than the header.
         [u'2', u'Chicago Sun-Times', u'only', u'only', u'0', u'0'],
         [u'3', u'Chicago Tribune', u'only', u'', u''],
         [u'1', u'Chicago Reader', u'second', u'first', u'0'],
         [u'1', u'Chicago Reader', u'second', u'second', u'0'],
         [u'4', u'', u'', u'only', u'0'],
     ]

     actual = join.full_outer_join(
         self.tab1, 0, self.tab2, 0, no_duplicate_id_column=True)

     self.assertEqual(actual, expected)
Example #4
0
    def main(self):
        """
        Join the input CSV files and write the combined rows to the output.

        Every path in ``self.args.input_paths`` is opened and read completely
        into memory as a list of rows. When join columns are given, the first
        row of each table is handed to ``match_column_identifier`` to resolve
        that table's join column.

        The join type is selected from the parsed arguments, in this order:

        * ``left_join``: ``join.left_outer_join``, folding left to right.
        * ``right_join``: ``join.right_outer_join``, folding right to left.
        * ``outer_join``: ``join.full_outer_join``, folding left to right.
        * only ``columns``: ``join.inner_join``, folding left to right.
        * no ``columns``: ``join.sequential_join``.

        Invalid argument combinations (fewer than two files, a number of join
        column names that is neither one nor the number of files, an outer
        join without column names, or both a left and a right join) are
        reported through ``self.argparser.error``.
        """
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if len(self.input_files) < 2:
            self.argparser.error('You must specify at least two files to join.')

        if self.args.columns:
            join_column_names = self._parse_join_column_names(self.args.columns)

            # A single column name is applied to every input file.
            if len(join_column_names) == 1:
                join_column_names = join_column_names * len(self.input_files)

            if len(join_column_names) != len(self.input_files):
                self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.')

        if (self.args.left_join or self.args.right_join or self.args.outer_join) and not self.args.columns:
            self.argparser.error('You must provide join column names when performing an outer join.')

        if self.args.left_join and self.args.right_join:
            self.argparser.error('It is not valid to specify both a left and a right join.')

        tables = []

        # Read every input fully, and make sure all handles are closed even
        # when parsing one of the files raises part-way through.
        try:
            for f in self.input_files:
                tables.append(list(CSVKitReader(f, **self.reader_kwargs)))
        finally:
            for f in self.input_files:
                f.close()

        join_column_ids = []

        if self.args.columns:
            # Resolve each table's join column against its own first row.
            for i, t in enumerate(tables):
                join_column_ids.append(match_column_identifier(t[0], join_column_names[i]))

        if self.args.left_join:
            # Left outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
        elif self.args.right_join:
            # Right outer join: start from the last table and fold the
            # remaining tables in from right to left.
            # NOTE(review): the accumulated table is always addressed with
            # join_column_ids[-1]; whether that index is still the join column
            # after the first join depends on the column order produced by
            # join.right_outer_join -- confirm for more than two files.
            jointab = tables[-1]

            remaining_tables = tables[:-1]
            remaining_tables.reverse()

            for i, t in enumerate(remaining_tables):
                jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1])
        elif self.args.outer_join:
            # Full outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
        elif self.args.columns:
            # Inner join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
        else:
            # Sequential join
            jointab = tables[0]

            for t in tables[1:]:
                jointab = join.sequential_join(jointab, t)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for row in jointab:
            output.writerow(row)
Example #5
0
    def main(self):
        """
        Read all input files, join them, and write the result as CSV.

        The files named in ``self.args.input_paths`` are opened, loaded into
        memory as lists of rows and closed again. If ``self.args.columns`` is
        set, each table's join column is resolved by passing its first row and
        the corresponding column name to ``match_column_identifier``.

        Which join runs depends on the parsed arguments, checked in order:
        ``left_join``, ``right_join``, ``outer_join``, then an inner join when
        only ``columns`` is given, otherwise a sequential join.

        Argument errors are reported via ``self.argparser.error``: fewer than
        two files, a join-column count that is neither one nor the number of
        files, an outer join without join columns, and a left join combined
        with a right join.
        """
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if len(self.input_files) < 2:
            self.argparser.error(
                'You must specify at least two files to join.')

        if self.args.columns:
            join_column_names = self._parse_join_column_names(
                self.args.columns)

            # One name on its own is reused for every input file.
            if len(join_column_names) == 1:
                join_column_names = join_column_names * len(self.input_files)

            if len(join_column_names) != len(self.input_files):
                self.argparser.error(
                    'The number of join column names must match the number of files, or be a single column name that exists in all files.'
                )

        if (self.args.left_join or self.args.right_join
                or self.args.outer_join) and not self.args.columns:
            self.argparser.error(
                'You must provide join column names when performing an outer join.'
            )

        if self.args.left_join and self.args.right_join:
            self.argparser.error(
                'It is not valid to specify both a left and a right join.')

        tables = []

        # Load every table; the finally clause guarantees that no input
        # handle stays open when a reader fails in the middle of the loop.
        try:
            for f in self.input_files:
                tables.append(list(CSVKitReader(f, **self.reader_kwargs)))
        finally:
            for f in self.input_files:
                f.close()

        join_column_ids = []

        if self.args.columns:
            # Look up each join column in the first row of its own table.
            for i, t in enumerate(tables):
                join_column_ids.append(
                    match_column_identifier(t[0], join_column_names[i]))

        if self.args.left_join:
            # Left outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.left_outer_join(jointab, join_column_ids[0], t,
                                               join_column_ids[i + 1])
        elif self.args.right_join:
            # Right outer join, folding from the last table towards the first.
            # NOTE(review): join_column_ids[-1] is reused for the accumulated
            # table on every pass; verify it still points at the join column
            # once join.right_outer_join has combined the columns of two
            # tables (only matters with more than two files).
            jointab = tables[-1]

            remaining_tables = tables[:-1]
            remaining_tables.reverse()

            for i, t in enumerate(remaining_tables):
                jointab = join.right_outer_join(t, join_column_ids[-(i + 2)],
                                                jointab, join_column_ids[-1])
        elif self.args.outer_join:
            # Full outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.full_outer_join(jointab, join_column_ids[0], t,
                                               join_column_ids[i + 1])
        elif self.args.columns:
            # Inner join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.inner_join(jointab, join_column_ids[0], t,
                                          join_column_ids[i + 1])
        else:
            # Sequential join
            jointab = tables[0]

            for t in tables[1:]:
                jointab = join.sequential_join(jointab, t)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for row in jointab:
            output.writerow(row)