def extract_data(self):
        """Parse all patent files and split the valid ones into train and test lists.

        The files come from ``get_files`` called on the ``patents`` directory
        under ``self.working_dir`` with the ``".XML"`` argument (presumably an
        extension filter -- confirm against ``get_files``). Every file is
        parsed with an ``Extractor``; patents accepted by
        ``self.is_patent_valid`` go to the test list with probability 1/10 and
        to the train list otherwise. Both lists are finally handed to
        ``self.save_list`` together with ``self.test_destination`` and
        ``self.train_destination`` respectively.

        An exception raised while handling a single file is logged, together
        with the running counters, and processing continues with the next file.
        """
        self.logger.info("extracting data")
        extractor = Extractor(self.train_destination)
        patents = get_files(join(self.working_dir, "patents"), ".XML")
        train_patent_list = []
        test_patent_list = []
        num_of_valid_patents = 0
        num_of_unvalid_patents = 0

        for patent in patents:
            self.logger.info("extracting " + patent)
            try:
                parsed_patent = extractor.parse(patent)
                if not self.is_patent_valid(parsed_patent):
                    num_of_unvalid_patents += 1
                    continue

                num_of_valid_patents += 1
                # Report progress once per 1000 valid patents. The check used
                # to look at the length of the *test* list, which made it fire
                # on every valid patent for as long as that list was empty (or
                # stayed at a multiple of 1000).
                if num_of_valid_patents % 1000 == 0:
                    self.logger.info("train_patent_list has length %d" % (len(train_patent_list)))
                if randint(1, 10) == 10:  # 10% chance
                    test_patent_list.append(parsed_patent)
                else:
                    train_patent_list.append(parsed_patent)
            except Exception as e:
                # Deliberate best-effort: one broken file must not stop the run.
                # ``message`` is missing on Python 3 exceptions and often empty
                # on Python 2, so fall back to the exception's string form.
                self.logger.error(getattr(e, "message", None) or str(e))
                self.logger.error("Number of valid patents was %d, number of unvalid patents was %d" % (num_of_valid_patents, num_of_unvalid_patents))

        self.save_list(test_patent_list, self.test_destination)
        self.save_list(train_patent_list, self.train_destination)
        self.logger.info("Final number of valid patents was %d, number of unvalid patents was %d" % (num_of_valid_patents, num_of_unvalid_patents))
        # The test list only ever grows by appends above, so its length is the
        # number of test examples; no separate counter is needed.
        self.logger.info("Total number of test examples is %d" % (len(test_patent_list)))
    def extract_data(self):
        """Parse all patent files and split the valid ones into train and test lists.

        The files come from ``get_files`` called on the ``patents`` directory
        under ``self.working_dir`` with the ``".XML"`` argument (presumably an
        extension filter -- confirm against ``get_files``). Every file is
        parsed with an ``Extractor``; patents accepted by
        ``self.is_patent_valid`` go to the test list with probability 1/10 and
        to the train list otherwise. Both lists are finally handed to
        ``self.save_list`` together with ``self.test_destination`` and
        ``self.train_destination`` respectively.

        An exception raised while handling a single file is logged, together
        with the running counters, and processing continues with the next file.
        """
        self.logger.info("extracting data")
        extractor = Extractor(self.train_destination)
        patents = get_files(join(self.working_dir, "patents"), ".XML")
        train_patent_list = []
        test_patent_list = []
        num_of_valid_patents = 0
        num_of_unvalid_patents = 0

        for patent in patents:
            self.logger.info("extracting " + patent)
            try:
                parsed_patent = extractor.parse(patent)
                if not self.is_patent_valid(parsed_patent):
                    num_of_unvalid_patents += 1
                    continue

                num_of_valid_patents += 1
                # Report progress once per 1000 valid patents. The check used
                # to look at the length of the *test* list, which made it fire
                # on every valid patent for as long as that list was empty (or
                # stayed at a multiple of 1000).
                if num_of_valid_patents % 1000 == 0:
                    self.logger.info("train_patent_list has length %d" %
                                     (len(train_patent_list)))
                if randint(1, 10) == 10:  # 10% chance
                    test_patent_list.append(parsed_patent)
                else:
                    train_patent_list.append(parsed_patent)
            except Exception as e:
                # Deliberate best-effort: one broken file must not stop the run.
                # ``message`` is missing on Python 3 exceptions and often empty
                # on Python 2, so fall back to the exception's string form.
                self.logger.error(getattr(e, "message", None) or str(e))
                self.logger.error(
                    "Number of valid patents was %d, number of unvalid patents was %d"
                    % (num_of_valid_patents, num_of_unvalid_patents))

        self.save_list(test_patent_list, self.test_destination)
        self.save_list(train_patent_list, self.train_destination)
        self.logger.info(
            "Final number of valid patents was %d, number of unvalid patents was %d"
            % (num_of_valid_patents, num_of_unvalid_patents))
        # The test list only ever grows by appends above, so its length is the
        # number of test examples; no separate counter is needed.
        self.logger.info("Total number of test examples is %d" %
                         (len(test_patent_list)))
class TestExtractor(unittest.TestCase):
    def setUp(self):
        self.extractor = Extractor("test_data")

    def tearDown(self):
        pass

    def test_should_load_json_file(self):
        self.assertIsNotNone(self.extractor.structure["us-patent-grant-v44-2013-05-16.dtd"]["documentID"])

    def test_xpaths(self):
        inputfile = resource_filename("patent_parsing_tools.tests", "US08613112-20131224.XML")

        tree = ET.parse(inputfile)
        root = tree.getroot()

        dtdStructure = self.extractor.structure[tree.docinfo.internalDTD.system_url]
        patent = self.extractor.parse(inputfile)

        self.assertEqual(patent.documentID, root.findall(dtdStructure["documentID"])[0].text)
        self.assertEqual(patent.title, root.findall(dtdStructure["inventionTitle"])[0].text)
        self.assertEqual(patent.date, root.findall(dtdStructure["date"])[0].text)
        self.assertIsNotNone(patent.abstract)
        self.assertIsNotNone(patent.description)
        self.assertIsNotNone(patent.claims)

    def test_xml_structures(self):
        inputfiles = ["US08613112-20131224.XML",
                     "US08927386-20150106.XML"]
        for inputfile in inputfiles:
            patent = self.extractor.parse(resource_filename("patent_parsing_tools.tests", inputfile))
            self.assertIsNotNone(patent.documentID)
            self.assertIsNotNone(patent.title)
            self.assertIsNotNone(patent.date)
            self.assertIsNotNone(patent.abstract)
            self.assertIsNotNone(patent.description)
            self.assertIsNotNone(patent.claims)

    def test_exception_not_supported_xml_structure(self):
        inputfile = resource_filename("patent_parsing_tools.tests", "US08613112-noDTDFile.XML")
        self.assertRaises(NotSupportedDTDConfiguration, self.extractor.parse, inputfile)

    def test_exception_not_implemented_dtd_structure(self):
        inputfile = resource_filename("patent_parsing_tools.tests", "US08613112-notSupportedDTD.XML")
        self.assertRaises(NotSupportedDTDConfiguration, self.extractor.parse, inputfile)

    def test_no_exception_when_lack_of_node(self):
        inputfile = resource_filename("patent_parsing_tools.tests", "US08613112-lackofnode.XML")
        self.extractor.parse(inputfile)

    def test_throw_exception_and_go_through(self):
        inputfile = resource_filename("patent_parsing_tools.tests", "US08613112-noDTDFile.XML")
        try:
            self.extractor.parse(resource_filename("patent_parsing_tools.tests", "US08613112-noDTDFile.XML"))
        except NotSupportedDTDConfiguration as r:
            print "Catched first Exception with message: \"" + r.message + "\""

        try:
            self.extractor.parse(resource_filename("patent_parsing_tools.tests", "US08613112-notSupportedDTD.XML"))
        except NotSupportedDTDConfiguration as r:
            print "Catched second Exception with message: \"" + r.message + "\""
 def setUp(self):
     """Create the ``Extractor`` instance stored on ``self.extractor``.

     NOTE(review): the meaning of the ``"test_data"`` argument is not visible
     here -- confirm against the ``Extractor`` constructor.
     """
     self.extractor = Extractor("test_data")