def test_merge_header_definitions_no_conflicting_headers(self):
    """Merge two headers whose FORMAT ids differ and expect both kept.

    `file1` declares FORMAT `NS` and `file2` declares FORMAT `DP`; the
    merged definitions are expected to list each id with its own file.
    """
    ns_header_lines = [
        '##FORMAT=<ID=NS,Number=1,Type=Float,Description="Number samples">\n',
        '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample1 Sample2\n'
    ]
    dp_header_lines = [
        '##FORMAT=<ID=DP,Number=2,Type=Float,Description="Total Depth">\n',
        '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample3\n'
    ]
    first_header = self._get_vcf_header_from_reader(
        vcf.Reader(fsock=iter(ns_header_lines)), 'file1')
    second_header = self._get_vcf_header_from_reader(
        vcf.Reader(fsock=iter(dp_header_lines)), 'file2')

    # Each FORMAT id maps to a single definition from a single file.
    expected = VcfHeaderDefinitions()
    expected._formats = {
        'NS': {Definition(1, 'Float'): ['file1']},
        'DP': {Definition(2, 'Float'): ['file2']},
    }

    pipeline = TestPipeline()
    header_pcoll = pipeline | Create([first_header, second_header])
    merged_definitions = (
        header_pcoll
        | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    assert_that(merged_definitions, equal_to([expected]))
    pipeline.run()
Example #2
0
  def test_merge_header_definitions_save_five_copies(self):
    """Merge seven headers and check how many file names are recorded.

    Six headers ('file1'..'file6') carry the same Float definition of INFO
    `NS`; the expectation lists only 'file1'..'file5' for it. A seventh
    header ('file7') carries an Integer definition of the same id.
    """
    float_lines = [
        '##INFO=<ID=NS,Number=1,Type=Float,Description="Number samples">\n',
        '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample1 Sample2\n'
    ]
    integer_lines = [
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
        '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample3\n'
    ]
    float_reader = vcf.Reader(fsock=iter(float_lines))
    integer_reader = vcf.Reader(fsock=iter(integer_lines))

    # The same reader is reused for every Float header; only the file name
    # attached to the resulting header differs.
    headers = [
        self._get_vcf_header_from_reader(float_reader, file_name)
        for file_name in ['file1', 'file2', 'file3', 'file4', 'file5', 'file6']
    ]
    headers.append(self._get_vcf_header_from_reader(integer_reader, 'file7'))

    expected = VcfHeaderDefinitions()
    expected._infos = {
        'NS': {
            Definition(1, 'Float'):
                ['file1', 'file2', 'file3', 'file4', 'file5'],
            Definition(1, 'Integer'): ['file7'],
        }
    }

    pipeline = TestPipeline()
    header_pcoll = pipeline | Create(headers)
    merged_definitions = (
        header_pcoll
        | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    assert_that(merged_definitions, equal_to([expected]))
    pipeline.run()
    def test_report_conflicted_and_inferred_headers(self):
        """Expect a conflicts section followed by an inferred-headers section.

        INFO `NS` has two definitions (Float from 'file1', Integer from
        'file2') and is resolved to Float; FORMAT `DP` is passed as an
        inferred header.
        """
        delimiter = preprocess_reporter._DELIMITER

        header_definitions = VcfHeaderDefinitions()
        header_definitions._infos = {
            'NS': {
                Definition(1, 'Float'): ['file1'],
                Definition(1, 'Integer'): ['file2'],
            }
        }

        ns_info = createInfo('NS', 1, 'Float', 'Number samples', None, None)
        dp_format = createFormat('DP', 2, 'Float', 'Total Depth')
        # The same formats mapping backs both the resolved and inferred headers.
        formats = OrderedDict([('DP', dp_format)])
        resolved_headers = VcfHeader(
            infos=OrderedDict([('NS', ns_info)]), formats=formats)
        inferred_headers = VcfHeader(formats=formats)

        conflict_section = [
            preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
            preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
            delimiter.join(
                ['NS', 'INFO', 'num=1 type=Float', 'file1',
                 'num=1 type=Float\n']),
            delimiter.join([' ', ' ', 'num=1 type=Integer', 'file2', ' \n']),
            '\n',
        ]
        inferred_section = [
            preprocess_reporter._InconsistencyType.INFERRED_HEADERS + '\n',
            preprocess_reporter._HeaderLine.INFERRED_FIELD_HEADER + '\n',
            delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']),
            '\n',
        ]
        expected = conflict_section + inferred_section

        self._generate_report_and_assert_contents_equal(
            expected, header_definitions, resolved_headers, inferred_headers)
    def test_report_multiple_files(self):
        """Expect one report row per file when a definition has several files.

        The Float definition of INFO `NS` comes from 'file1' and 'file2',
        the Integer definition from 'file3'; the resolved header is Float.
        """
        delimiter = preprocess_reporter._DELIMITER

        header_definitions = VcfHeaderDefinitions()
        header_definitions._infos = {
            'NS': {
                Definition(1, 'Float'): ['file1', 'file2'],
                Definition(1, 'Integer'): ['file3'],
            }
        }

        ns_info = createInfo('NS', 1, 'Float', 'Number samples', None, None)
        resolved_headers = VcfHeader(infos=OrderedDict([('NS', ns_info)]))

        # Rows after the first leave the already-printed columns blank (' ').
        first_row = delimiter.join(
            ['NS', 'INFO', 'num=1 type=Float', 'file1', 'num=1 type=Float\n'])
        second_file_row = delimiter.join([' ', ' ', ' ', 'file2', ' \n'])
        integer_row = delimiter.join(
            [' ', ' ', 'num=1 type=Integer', 'file3', ' \n'])
        expected = [
            preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
            preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
            first_row,
            second_file_row,
            integer_row,
            '\n',
        ]

        self._generate_report_and_assert_contents_equal(
            expected, header_definitions, resolved_headers)
  def test_create_empty_header_defitions(self):
    """`VcfHeaderDefinitions()` with no header exposes empty infos/formats."""
    empty_definitions = VcfHeaderDefinitions()

    self.assertDictEqual(empty_definitions.infos, {})
    self.assertDictEqual(empty_definitions.formats, {})
 def test_type_check(self):
   """`merge` raises NotImplementedError when either argument is None."""
   merger = DefinitionsMerger()
   definitions = VcfHeaderDefinitions()
   # Exercise None in the second position first, then in the first position.
   for first, second in ((definitions, None), (None, definitions)):
     with self.assertRaises(NotImplementedError):
       merger.merge(first, second)
    def test_create_definitions_multi(self):
        """Build definitions from a header with two INFO and two FORMAT ids.

        `NS` and `DP` appear under both INFO and FORMAT with different
        Number values, so the test also checks the two maps stay separate.
        """
        header_lines = [
            '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
            '##INFO=<ID=DP,Number=2,Type=Float,Description="Number samples">\n',
            '##FORMAT=<ID=NS,Number=3,Type=Integer,Description="Number samples">\n',
            '##FORMAT=<ID=DP,Number=4,Type=Float,Description="Number samples">\n',
            '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample1 Sample2\n'
        ]
        header = self._get_vcf_header_from_reader(
            vcf.Reader(fsock=iter(header_lines)), 'file1')

        definitions = VcfHeaderDefinitions(header)

        self.assertDictEqual(
            definitions.infos,
            {
                'NS': {Definition(1, 'Integer'): ['file1']},
                'DP': {Definition(2, 'Float'): ['file1']},
            })
        self.assertDictEqual(
            definitions.formats,
            {
                'NS': {Definition(3, 'Integer'): ['file1']},
                'DP': {Definition(4, 'Float'): ['file1']},
            })
Example #8
0
  def test_merge_header_definitions_one_header(self):
    """Merging a single header yields that header's definitions unchanged."""
    header_lines = [
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
        '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample1 Sample2\n'
    ]
    single_header = self._get_vcf_header_from_reader(
        vcf.Reader(fsock=iter(header_lines)), 'file1')

    expected = VcfHeaderDefinitions()
    expected._infos = {'NS': {Definition(1, 'Integer'): ['file1']}}

    pipeline = TestPipeline()
    header_pcoll = pipeline | Create([single_header])
    merged_definitions = (
        header_pcoll
        | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    assert_that(merged_definitions, equal_to([expected]))
    pipeline.run()
    def test_report_no_conflicts(self):
        """Expect the no-conflicts message when each id has one definition.

        INFO `NS` and FORMAT `NS` each have exactly one definition, so the
        report is expected to contain only the "no conflicts" line even
        though resolved headers are supplied.
        """
        header_definitions = VcfHeaderDefinitions()
        header_definitions._infos = {
            'NS': {Definition(1, 'Float'): ['file1']}
        }
        header_definitions._formats = {
            'NS': {Definition(1, 'Float'): ['file2']}
        }

        ns_info = Info('NS', 1, 'Integer', 'Number samples', None, None)
        ns_format = Format('NS', 1, 'Float', 'Number samples')
        resolved_headers = VcfHeader(
            infos=OrderedDict([('NS', ns_info)]),
            formats=OrderedDict([('NS', ns_format)]))

        self._generate_report_and_assert_contents_equal(
            ['No Header Conflicts Found.\n', '\n'],
            header_definitions,
            resolved_headers)
    def test_report_no_resolved_headers(self):
        """Expect 'Not resolved.' in the report when no resolution is given.

        INFO `NS` has conflicting Float/Integer definitions and the report
        is generated without resolved headers.
        """
        delimiter = preprocess_reporter._DELIMITER

        header_definitions = VcfHeaderDefinitions()
        header_definitions._infos = {
            'NS': {
                Definition(1, 'Float'): ['file1'],
                Definition(1, 'Integer'): ['file2'],
            }
        }

        float_row = delimiter.join(
            ['NS', 'INFO', 'num=1 type=Float', 'file1', 'Not resolved.\n'])
        integer_row = delimiter.join(
            [' ', ' ', 'num=1 type=Integer', 'file2', ' \n'])
        expected = [
            preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
            preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
            float_row,
            integer_row,
            '\n',
        ]

        self._generate_report_and_assert_contents_equal(
            expected, header_definitions)
 def test_report_no_inconsistencies(self):
     """Expect all three 'No ... Found.' messages for empty inputs.

     Empty definitions, an empty inferred header and an empty malformed
     record list are passed to the report helper.
     """
     no_findings_report = []
     for message in ('No Header Conflicts Found.\n',
                     'No Inferred Headers Found.\n',
                     'No Malformed Records Found.\n'):
         # Each section message is followed by a blank separator line.
         no_findings_report.extend([message, '\n'])

     self._generate_report_and_assert_contents_equal(
         no_findings_report,
         VcfHeaderDefinitions(),
         inferred_headers=VcfHeader(),
         malformed_records=[])
  def test_create_definitions_with_info(self):
    """A header with one INFO line yields one info entry and no formats."""
    header_lines = [
        '##INFO=<ID=NS,Number=1,Type=Float,Description="Number samples">\n',
        '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample1 Sample2\n'
    ]

    definitions = VcfHeaderDefinitions(
        self._get_header_from_lines(header_lines, 'file1'))

    self.assertDictEqual(
        definitions.infos, {'NS': {Definition(1, 'Float'): ['file1']}})
    self.assertDictEqual(definitions.formats, {})
    def test_report_inferred_headers_only(self):
        """Expect the no-conflicts message plus an inferred FORMAT `DP` row.

        The definitions are empty and only an inferred header is supplied.
        """
        delimiter = preprocess_reporter._DELIMITER

        dp_format = Format('DP', 2, 'Float', 'Total Depth')
        inferred_headers = VcfHeader(formats=OrderedDict([('DP', dp_format)]))

        expected = ['No Header Conflicts Found.\n', '\n']
        expected += [
            preprocess_reporter._InconsistencyType.INFERRED_HEADERS + '\n',
            preprocess_reporter._HeaderLine.INFERRED_FIELD_HEADER + '\n',
            delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n']),
            '\n',
        ]

        self._generate_report_and_assert_contents_equal(
            expected,
            VcfHeaderDefinitions(),
            inferred_headers=inferred_headers)
 def test_report_malformed_records(self):
     """Expect a malformed-records section listing file, record and error.

     Definitions and inferred headers are empty, so the first two sections
     are expected to be the 'No ... Found.' messages.
     """
     delimiter = preprocess_reporter._DELIMITER
     bad_record = vcfio.MalformedVcfRecord(
         'file1', 'rs6054257  G  A  29  PASS', 'Invalid literal')

     clean_sections = [
         'No Header Conflicts Found.\n', '\n',
         'No Inferred Headers Found.\n', '\n',
     ]
     malformed_section = [
         preprocess_reporter._InconsistencyType.MALFORMED_RECORDS + '\n',
         preprocess_reporter._HeaderLine.MALFORMED_RECORDS_HEADER + '\n',
         delimiter.join(
             ['file1', 'rs6054257  G  A  29  PASS', 'Invalid literal\n']),
         '\n',
     ]

     self._generate_report_and_assert_contents_equal(
         clean_sections + malformed_section,
         VcfHeaderDefinitions(),
         inferred_headers=VcfHeader(),
         malformed_records=[bad_record])
 def _create_definitions_from_lines(self, lines, file_name):
   """Return `VcfHeaderDefinitions` for the header built from `lines`.

   The header is obtained via `self._get_header_from_lines(lines, file_name)`.
   """
   return VcfHeaderDefinitions(self._get_header_from_lines(lines, file_name))
 def _create_definitions_from_lines(self, lines, file_name):
     """Return `VcfHeaderDefinitions` for the header parsed from `lines`.

     `lines` is fed to `vcf.Reader` through an iterator, and the resulting
     reader is converted with `self._get_vcf_header_from_reader`.
     """
     reader = vcf.Reader(fsock=iter(lines))
     return VcfHeaderDefinitions(
         self._get_vcf_header_from_reader(reader, file_name))