Example #1
0
 def test_forbidden_name_characters(self):
     """Characters that are not allowed inside a name must be rejected.

     Each listed character is appended to the doctype name, and parsing
     the resulting declaration is expected to raise ``XMLError``.
     """
     forbidden = ("\u00d7", "\u00f7", "\u037e", "\u2030")
     for character in forbidden:
         declaration = f"<!DOCTYPE root{character}>"
         with self.subTest(character):
             document = Document("")
             with self.assertRaises(XMLError):
                 document._Document__parse_doctype_declaration(
                     declaration)
Example #2
0
 def test_forbidden_name_start_characters(self):
     """Characters that are not allowed to begin a name must be rejected.

     Each listed character is placed directly in front of the doctype
     name, and parsing the declaration is expected to raise ``XMLError``.
     """
     forbidden_starts = ("-", ".", "0", "7", "\u0305")
     for character in forbidden_starts:
         declaration = f"<!DOCTYPE {character}root>"
         with self.subTest(character):
             document = Document("")
             with self.assertRaises(XMLError):
                 document._Document__parse_doctype_declaration(
                     declaration)
Example #3
0
 def test_no_text_in_internal_subset(self):
     """Free text inside the internal subset must raise ``XMLError``.

     The entity declaration is kept well-formed so that the free text on
     the following line is the only content expected to trigger the error.
     """
     document = Document("")
     with self.assertRaises(XMLError):
         # The declaration line used to end in a stray "'" which is itself
         # illegal text in the subset and could satisfy the assertion
         # before the line under test was ever reached.
         document._Document__parse_doctype_declaration("""<!DOCTYPE root [
         <!ENTITY % entity 'blah, blahblah'>
         some disallowed text
         ]>""")
Example #4
0
 def test_internal_parameter_entity_definition(self):
     """An internal parameter entity is stored with its literal value."""
     declaration = "<!DOCTYPE root [<!ENTITY % entity 'blah'>]>"
     document = Document("")
     document._Document__parse_doctype_declaration(declaration)

     entity = document.parameter_entities["entity"]
     self.assertEqual("blah", entity.expansion_text)
     self.assertEqual(Entity.Type.PARAMETER, entity.type)
     self.assertEqual(False, entity.external)
Example #5
0
def parse(xml: str) -> Document:
    """Build a ``Document`` from *xml*, parse it and return it.

    Line endings are normalised first: every CR LF pair and every
    remaining lone CR is replaced by a single LF.
    """
    # CR LF must be collapsed before lone CRs, otherwise each pair would
    # end up as two line feeds.
    normalised = xml.replace("\u000d\u000a", "\u000a").replace(
        "\u000d", "\u000a")

    document = Document(normalised)
    document.parse()
    return document
Example #6
0
    def test_with_xmldeclaration(self):
        """A full XML declaration followed by a root element is parsed."""
        source = "<?xml version='1.0' encoding='utf-8' standalone='yes'?>  <root></root>"
        document = Document(source)
        document.parse()

        # Fields taken from the XML declaration.
        self.assertEqual("1.0", document.version)
        self.assertEqual("utf-8", document.encoding)
        self.assertEqual(True, document.standalone)

        # The root element itself.
        root = document.root
        self.assertIsInstance(root, Element)
        self.assertEqual(root.name, "root")
Example #7
0
 def test_external_parameter_entity_definition(self):
     """External parameter entities keep their identifiers and flags.

     Both the ``SYSTEM`` and the ``PUBLIC`` form of the external ID are
     exercised.
     """
     with self.subTest("SYSTEM"):
         document = Document("")
         document._Document__parse_doctype_declaration(
             "<!DOCTYPE root [<!ENTITY % entity SYSTEM 'uri'>]>")

         entity = document.parameter_entities["entity"]
         self.assertEqual("uri", entity.system_URI)
         self.assertEqual(Entity.Type.PARAMETER, entity.type)
         self.assertEqual(True, entity.external)
         self.assertEqual(True, entity.parsed)
     with self.subTest("PUBLIC"):
         document = Document("")
         document._Document__parse_doctype_declaration(
             "<!DOCTYPE root [<!ENTITY % entity PUBLIC 'uri1' 'uri2'>]>")

         entity = document.parameter_entities["entity"]
         self.assertEqual("uri1", entity.public_URI)
         self.assertEqual("uri2", entity.system_URI)
         self.assertEqual(Entity.Type.PARAMETER, entity.type)
         self.assertEqual(True, entity.external)
         self.assertEqual(True, entity.parsed)
Example #8
0
 def test_unparsed_entity_definition(self):
     """General entities declared with ``NDATA`` are recorded as unparsed.

     Both the ``SYSTEM`` and the ``PUBLIC`` form of the external ID are
     exercised; in each case the notation name must be kept.
     """
     with self.subTest("SYSTEM"):
         document = Document("")
         document._Document__parse_doctype_declaration(
             "<!DOCTYPE root [<!ENTITY entity SYSTEM 'uri' NDATA notation>]>"
         )

         entity = document.general_entities["entity"]
         self.assertEqual("uri", entity.system_URI)
         self.assertEqual(Entity.Type.GENERAL, entity.type)
         self.assertEqual(True, entity.external)
         self.assertEqual(False, entity.parsed)
         self.assertEqual("notation", entity.notation)
     with self.subTest("PUBLIC"):
         document = Document("")
         document._Document__parse_doctype_declaration(
             "<!DOCTYPE root [<!ENTITY entity PUBLIC 'uri1' 'uri2' NDATA notation>]>"
         )

         entity = document.general_entities["entity"]
         self.assertEqual("uri1", entity.public_URI)
         self.assertEqual("uri2", entity.system_URI)
         self.assertEqual(Entity.Type.GENERAL, entity.type)
         self.assertEqual(True, entity.external)
         self.assertEqual(False, entity.parsed)
         self.assertEqual("notation", entity.notation)
Example #9
0
    def test_with_empty_dtd(self):
        """A document carrying an empty internal DTD subset is parsed."""
        source = """
        <?xml version='1.0' encoding='utf-8' standalone='yes'?>  
        <!DOCTYPE root []>
        
        <root></root>
        """
        document = Document(source)
        document.parse()

        # Fields taken from the XML declaration.
        self.assertEqual("1.0", document.version)
        self.assertEqual("utf-8", document.encoding)
        self.assertEqual(True, document.standalone)

        # The root element itself.
        root = document.root
        self.assertIsInstance(root, Element)
        self.assertEqual(root.name, "root")
Example #10
0
 def test_root_name_parsing(self):
     """The doctype name is stored in ``dtd_name``.

     Checked with and without an (empty) internal subset.
     """
     cases = (
         ("With no subset", "<!DOCTYPE root>"),
         ("With empty internal subset", "<!DOCTYPE root []>"),
     )
     for label, declaration in cases:
         with self.subTest(label):
             document = Document("")
             document._Document__parse_doctype_declaration(declaration)
             self.assertEqual("root", document.dtd_name)
Example #11
0
def build_document(abstract, result):
    """Create a "document_number" ``Document`` for a single search result.

    The reception number and the description link are both obtained from
    *result* through the module's accessor helpers; the county is read
    from *abstract*.
    """
    reception = access_result_reception_number(result)
    link = access_result_button(abstract, result)
    # Keyword arguments are collected after both helpers have run, which
    # keeps the original evaluation order.
    details = {
        "county": abstract.county,
        "description_link": link,
        "number_results": 1,
    }
    return Document("document_number", reception, **details)
Example #12
0
def prepare_name_search():
    """Ask the user to confirm the search name and wrap it in a ``Document``.

    The starting value is ``title_strip(search_name)``; when that is an
    empty string a new name is requested straight away. The user is then
    asked to confirm the name, and a new one is requested (after clearing
    the terminal) each time the confirmation comes back as ``False``.
    """
    name = title_strip(search_name)
    if name == '':
        name = request_new_name()

    while True:
        answer = request_yes_or_no(f'The current name to be searched is "{name}", is this correct?')
        # Only an explicit False triggers another round.
        if answer is not False:
            break
        clear_terminal()
        name = request_new_name()

    clear_terminal()
    return Document(type="name", value=name)
Example #13
0
def index(files: List[str], empty_words: List[str]):
    """Index every file in *files* into the module-level collections.

    For each file a ``Document`` is appended to ``documents``. The file's
    text is run through ``tokenizar`` and then ``sacar_palabras_vacias``
    (together with *empty_words*). Every resulting token is linked to the
    document through its ``Term`` in ``terms_doc_dic``, creating the
    ``Term`` on first sight.

    Args:
        files: Paths of the files to index.
        empty_words: Word list handed to ``sacar_palabras_vacias``.
    """
    print("indexing...")
    progress_bar = tqdm(total=len(files), unit="file")
    try:
        for file_name in files:
            doc = Document(file_name, {}, len(documents),
                           os.path.getsize(file_name))
            documents.append(doc)
            with open(file_name, encoding="utf-8", errors="ignore") as file:
                tokens = tokenizar(file.read())
            tokens = sacar_palabras_vacias(tokens, empty_words)
            for token in tokens:
                term = terms_doc_dic.get(token)
                if term is None:
                    # Build the Term only for unseen tokens; passing it as
                    # the default of dict.get() allocated a throwaway Term
                    # and set for every single token occurrence.
                    term = Term(token, set(), len(terms_doc_dic))
                    terms_doc_dic[token] = term
                doc.has_term(term)
                term.found_in(doc)
            progress_bar.update(1)
    finally:
        # Close the bar even when a file cannot be read or tokenised.
        progress_bar.close()
Example #14
0
    def test_without_xmldeclaration(self):
        """A bare root element parses without any XML declaration.

        Both the open/close tag pair and the self-closing form are checked.
        """
        cases = (
            ("Open tag", "<root></root>"),
            ("Closed tag", "<root/>"),
        )
        for label, source in cases:
            with self.subTest(label):
                document = Document(source)
                document.parse()

                root = document.root
                self.assertIsInstance(root, Element)
                self.assertEqual(root.name, "root")
Example #15
0
    def test_with_fluff(self):
        """Comments and processing instructions around the root are accepted.

        The three processing instructions in the source must all be
        collected in ``processing_instructions``.
        """
        source = """
        <?xml version='1.0' encoding='utf-8' standalone='yes'?>  
        <!DOCTYPE root []>
        
        <?Target A rogue processing instruction?>
        <!-- And even a comment! -->
        <root></root>
        <!-- Another comment -->
        <?Target And more PIs?>
        <?Target?>
        """
        document = Document(source)
        document.parse()

        # Fields taken from the XML declaration.
        self.assertEqual("1.0", document.version)
        self.assertEqual("utf-8", document.encoding)
        self.assertEqual(True, document.standalone)

        # The root element and the surrounding processing instructions.
        root = document.root
        self.assertIsInstance(root, Element)
        self.assertEqual(root.name, "root")
        self.assertEqual(3, len(document.processing_instructions))
Example #16
0
 def test_parameter_entity_expansion(self):
     """Parameter entity references are expanded where they are used.

     * "In DTD": the entity's text is itself an entity declaration, and
       referencing it in the subset must define that general entity.
     * "In general entity": a reference inside an entity value must be
       replaced by the parameter entity's text.
     """
     with self.subTest("In DTD"):
         document = Document("")
         document._Document__parse_doctype_declaration("""<!DOCTYPE root [
         <!ENTITY % entity '<!ENTITY entity2 "SOME TEXT">'>
         %entity;
         ]>""")

         declared = document.general_entities["entity2"]
         self.assertEqual("SOME TEXT", declared.expansion_text)
     with self.subTest("In general entity"):
         document = Document("")
         document._Document__parse_doctype_declaration("""<!DOCTYPE root [
         <!ENTITY % entity 'SOME TEXT'>
         <!ENTITY entity2 'SOME %entity; OTHER TEXT'>
         ]>""")

         parameter = document.parameter_entities["entity"]
         self.assertEqual("SOME TEXT", parameter.expansion_text)
         general = document.general_entities["entity2"]
         self.assertEqual("SOME SOME TEXT OTHER TEXT",
                          general.expansion_text)
Example #17
0
 def test_no_superfluous_characters(self):
     """Stray text outside the root element must raise ``XMLError``.

     The same text is inserted at each position around the declaration,
     the doctype and the root element.
     """
     cases = (
         ("Before xml declaration",
          "some text<?xml version='1.0'?><root></root>"),
         ("Between xml declaration and dtd",
          "<?xml version='1.0'?>some text<!DOCTYPE root []><root></root>"),
         ("Between dtd and root element",
          "<?xml version='1.0'?><!DOCTYPE root []>some text<root></root>"),
         ("After root element",
          "<?xml version='1.0'?><!DOCTYPE root []><root></root>some text"),
     )
     for label, source in cases:
         with self.subTest(label):
             document = Document(source)
             with self.assertRaises(XMLError):
                 document.parse()
Example #18
0
 def test_only_one_root_element(self):
     """A second top-level element must raise ``XMLError``."""
     source = "<?xml version='1.0' encoding='utf-8' standalone='yes'?>  <root1></root1> <root2></root2>"
     document = Document(source)
     with self.assertRaises(XMLError):
         document.parse()
Example #19
0
def transform_result_list(result_list):
    """Wrap every entry of *result_list* in a "document_number" ``Document``.

    Returns a new list holding one ``Document`` per entry, in input order.
    """
    return [
        Document(type="document_number", value=entry)
        for entry in result_list
    ]
Example #20
0
 def test_parameter_reference_within_markup(self):
     """A parameter entity reference inside a declaration raises ``XMLError``.

     Each subtest declares a parameter entity and then references it from
     within an ELEMENT, NOTATION or ATTLIST declaration of the internal
     subset. The entity declarations are kept well-formed so that the
     reference is the only construct expected to trigger the error; they
     used to end in a stray "'" which is illegal text in the subset and
     could satisfy the assertion on its own.
     """
     with self.subTest("ELEMENT declaration"):
         document = Document("")
         with self.assertRaises(XMLError):
             document._Document__parse_doctype_declaration(
                 """<!DOCTYPE root [
             <!ENTITY % entity 'blah, blahblah'>
             <!ELEMENT root (%entity;)>
             ]>""")
     with self.subTest("NOTATION declaration"):
         document = Document("")
         with self.assertRaises(XMLError):
             document._Document__parse_doctype_declaration(
                 """<!DOCTYPE root [
             <!ENTITY % entity 'SYSTEM \"blahblah\"'>
             <!NOTATION blah %entity;>
             ]>""")
     with self.subTest("ATTLIST declaration"):
         document = Document("")
         with self.assertRaises(XMLError):
             document._Document__parse_doctype_declaration(
                 """<!DOCTYPE root [
             <!ENTITY % entity 'CDATA #REQUIRED'>
             <!ATTLIST root %entity;>
             ]>""")
Example #21
0
 def test_name_collisions(self):
     """Check how entities that share a name are resolved.

     A general and a parameter entity with the same name must both be
     kept. When the same kind of entity is declared twice, the value of
     the first declaration is the one expected.
     """
     with self.subTest(
             "General and parameter entities occupy different namespaces"):
         document = Document("")
         document._Document__parse_doctype_declaration("""<!DOCTYPE root [
         <!ENTITY % entity 'SOME TEXT'>
         <!ENTITY entity 'SOME OTHER TEXT'>
         ]>""")

         parameter = document.parameter_entities["entity"]
         self.assertEqual("SOME TEXT", parameter.expansion_text)
         general = document.general_entities["entity"]
         self.assertEqual("SOME OTHER TEXT", general.expansion_text)
     with self.subTest("General entities"):
         document = Document("")
         document._Document__parse_doctype_declaration("""<!DOCTYPE root [
         <!ENTITY entity 'SOME TEXT'>
         <!ENTITY entity 'SOME OTHER TEXT'>
         ]>""")

         general = document.general_entities["entity"]
         self.assertEqual("SOME TEXT", general.expansion_text)
     with self.subTest("Parameter entities"):
         document = Document("")
         document._Document__parse_doctype_declaration("""<!DOCTYPE root [
         <!ENTITY % entity 'SOME TEXT'>
         <!ENTITY % entity 'SOME OTHER TEXT'>
         ]>""")

         parameter = document.parameter_entities["entity"]
         self.assertEqual("SOME TEXT", parameter.expansion_text)
Example #22
0
 def test_encoding_parsing(self):
     """Version and encoding are both read from the XML declaration."""
     declaration = "<?xml version='1.0' encoding='utf-8'?>"
     document = Document("")
     document._Document__parse_xml_declaration(declaration)

     self.assertEqual("1.0", document.version)
     self.assertEqual("utf-8", document.encoding)
Example #23
0
 def test_version_parsing(self):
     """The version is read from the declaration; "2.0" is rejected."""
     # (label, declaration, expected version). An expected value of None
     # means that parsing the declaration must raise XMLError.
     cases = (
         ("With space", "<?xml version = '1.0' ?>", "1.0"),
         ("Without space", "<?xml version='1.0'?>", "1.0"),
         ("Invalid version 2.0", "<?xml version='2.0'?>", None),
         ("Valid version 1.1", "<?xml version='1.1'?>", "1.1"),
     )
     for label, declaration, expected in cases:
         with self.subTest(label):
             document = Document("")
             if expected is None:
                 with self.assertRaises(XMLError):
                     document._Document__parse_xml_declaration(
                         declaration)
             else:
                 document._Document__parse_xml_declaration(declaration)
                 self.assertEqual(expected, document.version)
Example #24
0
 def test_missing_whitespace(self):
     """Missing separators in the XML declaration must raise ``XMLError``.

     Each case drops the whitespace in front of one of the keywords.
     """
     cases = (
         ("Before version",
          "<?xmlversion='1.0'?>"),
         ("After version -> encoding",
          "<?xml version='1.0'encoding='utf-8'?>"),
         ("After version -> standalone",
          "<?xml version='1.0'standalone='no'?>"),
         ("After encoding -> standalone",
          "<?xml version='1.0' encoding='utf-8'standalone='no'?>"),
     )
     for label, declaration in cases:
         with self.subTest(label):
             document = Document("")
             with self.assertRaises(XMLError):
                 document._Document__parse_xml_declaration(declaration)
Example #25
0
 def test_standalone_parsing(self):
     """``standalone`` is read as a boolean; other values are rejected.

     'no' is expected to give ``False`` and 'yes' ``True``; the value
     'maybe' must raise ``XMLError``.
     """
     with self.subTest("With encoding"):
         declaration = "<?xml version='1.0' encoding='utf-8' standalone='no'?>"
         document = Document("")
         document._Document__parse_xml_declaration(declaration)

         self.assertEqual("1.0", document.version)
         self.assertEqual("utf-8", document.encoding)
         self.assertEqual(False, document.standalone)
     with self.subTest("Without encoding"):
         declaration = "<?xml version='1.0' standalone='yes'?>"
         document = Document("")
         document._Document__parse_xml_declaration(declaration)

         self.assertEqual("1.0", document.version)
         self.assertEqual(True, document.standalone)
     with self.subTest("Invalid value"):
         declaration = "<?xml version='1.0' standalone='maybe'?>"
         document = Document("")
         with self.assertRaises(XMLError):
             document._Document__parse_xml_declaration(declaration)
Example #26
0
 def test_external_declaration_parsing(self):
     """The doctype's external ID is stored on the document.

     Covers the ``SYSTEM`` form, the ``PUBLIC`` form and a ``SYSTEM`` ID
     followed by an empty internal subset.
     """
     with self.subTest("SYSTEM"):
         declaration = "<!DOCTYPE root SYSTEM 'uri'>"
         document = Document("")
         document._Document__parse_doctype_declaration(declaration)

         self.assertEqual("root", document.dtd_name)
         self.assertEqual("uri", document.external_system_uri)
     with self.subTest("PUBLIC"):
         declaration = "<!DOCTYPE root PUBLIC 'uri1' 'uri2'>"
         document = Document("")
         document._Document__parse_doctype_declaration(declaration)

         self.assertEqual("root", document.dtd_name)
         self.assertEqual("uri1", document.external_public_uri)
         self.assertEqual("uri2", document.external_system_uri)
     with self.subTest("SYSTEM with internal subset"):
         declaration = "<!DOCTYPE root SYSTEM 'uri' []>"
         document = Document("")
         document._Document__parse_doctype_declaration(declaration)

         self.assertEqual("root", document.dtd_name)
         self.assertEqual("uri", document.external_system_uri)
Example #27
0
 def test_invalid_keyword(self):
     """Misspelled keywords in the XML declaration must raise ``XMLError``.

     Every declaration is otherwise well-formed, so the misspelled keyword
     is the only thing expected to trigger the error. The "encodin" case
     used to lack the closing quote of its value, which could raise the
     error for a different reason.
     """
     cases = (
         ("Versionn", "<?xml versionn='1.0'?>"),
         ("Versio", "<?xml versio='1.0'?>"),
         ("encodin", "<?xml version='1.0' encodin='utf-8'?>"),
         ("standalne", "<?xml version='1.0' standalne='no'?>"),
     )
     for label, declaration in cases:
         with self.subTest(label):
             document = Document("")
             with self.assertRaises(XMLError):
                 document._Document__parse_xml_declaration(declaration)
Example #28
0
 def test_misordered_xml_declaration(self):
     """Out-of-order keywords in the declaration must raise ``XMLError``."""
     cases = (
         ("Encoding -> Version",
          "<?xml encoding='utf-8' version='1.0'?>"),
         ("Version -> Standalone -> Encoding",
          "<?xml version='1.0' standalone='no' encoding='utf-8'?>"),
         ("Standalone -> Version",
          "<?xml standalone='no' version='1.0'?>"),
     )
     for label, declaration in cases:
         with self.subTest(label):
             document = Document("")
             with self.assertRaises(XMLError):
                 document._Document__parse_xml_declaration(declaration)
Example #29
0
    y_train = np.concatenate([y_train1, y_train2])

else:
    data = pd.read_csv(dataset[args.dataset.lower()] + "train.csv")
    X = np.array([unidecode(x) for x in data['x'].values])
    y = np.array([int(i) - 1 for i in data['y'].values])

    ids = range(0, len(X))
    train, test = train_test_split(ids, test_size=0.1, random_state=42)
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

# Load the `vocabulary` column of the Hatebase CSV as an array of values.
hatebase = pd.read_csv('./data/hatebase/hatebase.csv').vocabulary.values

# Count features over word n-grams of length 1 to 5, tokenised with the
# Treebank tokenizer; each raw text first goes through Document().preprocess.
vectorizer = CountVectorizer(tokenizer=TreebankWordTokenizer().tokenize,
                             ngram_range=(1, 5),
                             preprocessor=Document().preprocess)
# L1-penalised logistic regression, with C selected by grid search.
classifier = GridSearchCV(
    LogisticRegression(penalty="l1", dual=False),
    [{
        'C': [0.0001, 0.001, 0.1, 1, 10, 100]
    }]  # candidate values of C to try
)

# Fit the vectorizer and the classifier on the training split.
# NOTE: this rebinds X to the feature matrix returned by the vectorizer.
X = vectorizer.fit_transform(X_train)
classifier.fit(X, y_train)

# Coefficients of the best estimator found by the grid search.
coeffs = classifier.best_estimator_.coef_
# NOTE(review): under Python 3 encode() yields bytes rather than str —
# confirm the np.str_ conversion below produces the intended feature names.
fn = [i.encode("utf-8") for i in vectorizer.get_feature_names()]

# Keep only the entries whose coefficient is non-zero: nonzero()[1] holds the
# second-axis indices of those entries (presumably the feature axis — verify).
sfn = np.array(fn, dtype=np.str_)[coeffs.nonzero()[1]]
sfv = np.array(coeffs[coeffs.nonzero()], dtype=np.float32)
Example #30
0
 def test_missing_version(self):
     """A declaration without ``version`` must raise ``XMLError``."""
     cases = (
         ("Encoding", "<?xml encoding='utf-8'?>"),
         ("Standalone", "<?xml standalone='no'?>"),
         ("Both", "<?xml encoding='utf-8' standalone='yes'?>"),
     )
     for label, declaration in cases:
         with self.subTest(label):
             document = Document("")
             with self.assertRaises(XMLError):
                 document._Document__parse_xml_declaration(declaration)