def test_partition_with_max_labels_discards_labels(self):
        """Rows beyond max_labels are left out of both partitions.

        Five labelled rows are supplied with max_labels=3 and
        training_fraction=0.67. The expected outcome is that the first two
        rows reach the training output, the third reaches the testing
        output, and the last two appear in neither.
        """
        label_csv = """
input,name,qty,range_end,unit,comment
1 cup foo,foo,1.0,0.0,cup,
2 drops foz,foz,2.0,0.0,drop,
3 ml faa,faa,3.0,0.0,ml,
4 cloves bar,bar,4.0,0.0,cloves,
5 oz baz,baz,5.0,0.0,oz,
""".strip()
        expected_training = """
input,name,qty,range_end,unit,comment
1 cup foo,foo,1.0,0.0,cup,
2 drops foz,foz,2.0,0.0,drop,
""".strip()
        expected_testing = """
input,name,qty,range_end,unit,comment
3 ml faa,faa,3.0,0.0,ml,
""".strip()
        mock_label_reader = labelled_data.Reader(io.StringIO(label_csv))

        partitioner.split_labels(
            mock_label_reader,
            self.mock_training_writer,
            self.mock_testing_writer,
            training_fraction=0.67,
            max_labels=3,
        )

        # Surrounding whitespace is stripped so a trailing newline written
        # by the writer does not affect the comparison.
        actual_training = self.mock_training_file.getvalue().strip()
        self.assertMultiLineEqual(expected_training, actual_training)
        actual_testing = self.mock_testing_file.getvalue().strip()
        self.assertMultiLineEqual(expected_testing, actual_testing)
    def test_raises_error_when_csv_does_not_have_required_columns(self):
        with self.assertRaises(labelled_data.InvalidHeaderError):
            mock_file = io.StringIO("""
index,input,UNEXPECTED_COLUMN,qty,range_end,unit,comment
77,3 bananas,bananas,3.0,0.0,,
""".strip())
            next(labelled_data.Reader(mock_file))
Example #3
0
def main(args):
    """Partition a labelled-data file into training and testing files.

    Args:
        args: Parsed command-line options. The attributes read here are
            `label_path` (input file), `training_path` and `testing_path`
            (output files), `training_fraction` and `max_labels` (both
            forwarded unchanged to `partitioner.split_labels`).
    """
    # The files are opened in text mode: the label writers are compared
    # against text in the unit tests, so a binary ('wb') handle would reject
    # their output on Python 3. newline='' keeps the writers' own line
    # endings untouched, matching what binary mode used to guarantee.
    with open(args.label_path, encoding='utf-8') as label_file, \
            open(args.training_path, 'w', encoding='utf-8',
                 newline='') as training_file, \
            open(args.testing_path, 'w', encoding='utf-8',
                 newline='') as testing_file:
        label_reader = labelled_data.Reader(label_file)
        training_writer = labelled_data.Writer(training_file)
        testing_writer = labelled_data.Writer(testing_file)

        partitioner.split_labels(label_reader, training_writer, testing_writer,
                                 args.training_fraction, args.max_labels)
 def test_reads_file_with_utf8_encoding(self):
     """Non-ASCII characters in a label file come back unchanged."""
     # io.StringIO holds already-decoded text, so the n-with-tilde is
     # written as the single code point U+00F1. Spelling it as the UTF-8
     # byte escapes '\xc3\xb1' inside a str would yield two unrelated
     # characters and the test would never exercise the intended letter.
     mock_file = io.StringIO(
         ('index,input,name,qty,range_end,unit,comment\n'
          '1,2 jalape\u00f1os,jalape\u00f1os,2.0,0.0,,,\n'))
     reader = labelled_data.Reader(mock_file)
     self.assertEqual([{
         'input': '2 jalape\u00f1os',
         'name': 'jalape\u00f1os',
         'qty': 2.0,
         'unit': '',
         'range_end': 0.0,
         'comment': '',
     }], list(reader))
Example #5
0
 def run(self):
     """
     Emits training data in the CRF++ format for the ingredient tagging
     task.

     Every labelled row read from `self.opts.data_path` (opened as UTF-8)
     is translated and written to standard output as UTF-8 bytes, followed
     by a single newline byte.
     """
     with open(self.opts.data_path, encoding='utf-8') as data_file:
         for labelled_row in labelled_data.Reader(data_file):
             # The encoded bytes go straight to the underlying binary
             # stream; handing bytes to print() would show them in their
             # `b"string"` form instead of the raw text.
             byte_sink = sys.stdout.buffer
             encoded_row = translator.translate_row(labelled_row).encode(
                 'utf-8')
             byte_sink.write(encoded_row)
             byte_sink.write(b'\n')
Example #6
0
    def test_interprets_empty_range_end_as_zero(self):
        mock_file = io.BytesIO("""
index,input,name,qty,range_end,unit,comment
77,3 bananas,bananas,3.0,,,
""".strip())
        reader = labelled_data.Reader(mock_file)
        self.assertEqual({
            'input': '3 bananas',
            'qty': 3.0,
            'unit': '',
            'name': 'bananas',
            'comment': '',
            'range_end': 0.0,
        }, next(reader))
 def test_reads_file_with_utf8_encoding(self):
     """Non-ASCII characters in a label file come back unchanged."""
     # io.StringIO holds already-decoded text, so the n-with-tilde is
     # written as the single code point U+00F1. Spelling it as the UTF-8
     # byte escapes "\xc3\xb1" inside a str would yield two unrelated
     # characters and the test would never exercise the intended letter.
     mock_file = io.StringIO(
         "index,input,name,qty,range_end,unit,comment\n"
         "1,2 jalape\u00f1os,jalape\u00f1os,2.0,0.0,,,\n")
     reader = labelled_data.Reader(mock_file)
     self.assertEqual(
         [{
             "input": u"2 jalape\u00f1os",
             "name": u"jalape\u00f1os",
             "qty": 2.0,
             "unit": u"",
             "range_end": 0.0,
             "comment": u"",
         }],
         list(reader),
     )
    def test_interprets_empty_range_end_as_zero(self):
        mock_file = io.StringIO("""\
index,input,name,qty,range_end,unit,comment
77,3 bananas,bananas,3.0,,,
""")
        reader = labelled_data.Reader(mock_file)
        self.assertEqual(
            {
                "input": u"3 bananas",
                "qty": 3.0,
                "unit": u"",
                "name": u"bananas",
                "comment": u"",
                "range_end": 0.0,
            },
            next(reader),
        )
    def test_reads_valid_label_file(self):
        mock_file = io.StringIO("""\
index,input,name,qty,range_end,unit,comment
63,4 to 6 large cloves garlic,garlic,4.0,6.0,clove,
77,3 bananas,bananas,3.0,0.0,,
106,"2 1/2 pounds bell peppers (about 6 peppers in assorted colors), cut into 2-inch chunks",bell peppers,2.5,0.0,pound,"(about 6 peppers in assorted colors), cut into 2-inch chunks"
""")
        reader = labelled_data.Reader(mock_file)
        self.assertEqual(
            [
                {
                    "input": u"4 to 6 large cloves garlic",
                    "qty": 4.0,
                    "unit": u"clove",
                    "name": u"garlic",
                    "range_end": 6.0,
                    "comment": u"",
                },
                {
                    "input": u"3 bananas",
                    "qty": 3.0,
                    "unit": u"",
                    "name": u"bananas",
                    "comment": u"",
                    "range_end": 0.0,
                },
                {
                    "input": (u"2 1/2 pounds bell peppers (about 6 peppers in "
                              u"assorted colors), cut into 2-inch chunks"),
                    "qty":
                    2.5,
                    "unit":
                    u"pound",
                    "name":
                    u"bell peppers",
                    "range_end":
                    0.0,
                    "comment":
                    (u"(about 6 peppers in assorted colors), cut into "
                     u"2-inch chunks"),
                },
            ],
            [r for r in reader],
        )
    def test_reads_valid_label_file(self):
        mock_file = io.StringIO("""
index,input,name,qty,range_end,unit,comment
63,4 to 6 large cloves garlic,garlic,4.0,6.0,clove,
77,3 bananas,bananas,3.0,0.0,,
106,"2 1/2 pounds bell peppers (about 6 peppers in assorted colors), cut into 2-inch chunks",bell peppers,2.5,0.0,pound,"(about 6 peppers in assorted colors), cut into 2-inch chunks"
""".strip())
        reader = labelled_data.Reader(mock_file)
        self.assertEqual([{
            'input': '4 to 6 large cloves garlic',
            'qty': 4.0,
            'unit': 'clove',
            'name': 'garlic',
            'range_end': 6.0,
            'comment': '',
        }, {
            'input': '3 bananas',
            'qty': 3.0,
            'unit': '',
            'name': 'bananas',
            'comment': '',
            'range_end': 0.0,
        }, {
            'input': ('2 1/2 pounds bell peppers (about 6 peppers in '
                      'assorted colors), cut into 2-inch chunks'),
            'qty':
            2.5,
            'unit':
            'pound',
            'name':
            'bell peppers',
            'range_end':
            0.0,
            'comment': ('(about 6 peppers in assorted colors), cut into '
                        '2-inch chunks'),
        }], [r for r in reader])
    def test_partition_20_percent_training(self):
        mock_label_reader = labelled_data.Reader(
            io.StringIO(
                """\
input,name,qty,range_end,unit,comment
1 cup foo,foo,1.0,0.0,cup,
2 drops foz,foz,2.0,0.0,drop,
3 ml faa,faa,3.0,0.0,ml,
4 cloves bar,bar,4.0,0.0,cloves,
5 oz baz,baz,5.0,0.0,oz,\
"""
            )
        )
        partitioner.split_labels(
            mock_label_reader,
            self.mock_training_writer,
            self.mock_testing_writer,
            training_fraction=0.2,
        )
        self.assertMultiLineEqual(
            """\
input,name,qty,range_end,unit,comment
1 cup foo,foo,1.0,0.0,cup,\
""",
            self.mock_training_file.getvalue().strip(),
        )
        self.assertMultiLineEqual(
            """\
input,name,qty,range_end,unit,comment
2 drops foz,foz,2.0,0.0,drop,
3 ml faa,faa,3.0,0.0,ml,
4 cloves bar,bar,4.0,0.0,cloves,
5 oz baz,baz,5.0,0.0,oz,\
""",
            self.mock_testing_file.getvalue().strip(),
        )