Beispiel #1
0
    def convert_document(cls, xml, version='0.4'):
        ''' Converts a document into the representation transmitted to
        Recognize.

        :param xml: the document, either as ``XMLContent`` instance or as
            a value accepted by the ``XMLContent`` constructor
        :param version: version string; for versions >= 0.5 the stripped
            result of ``get_xml_document(xml_version=2013)`` is returned
            instead of a dictionary
        :returns: the converted document
        :rtype: dict for versions < 0.5 (or unparsable versions),
            otherwise the stripped ``get_xml_document`` output
            (presumably a string)

        .. note::
            Only the first three characters of ``version`` are evaluated
            (e.g. ``'0.10'`` is read as ``0.1``). A version that cannot
            be parsed is logged and handled like a version < 0.5.
            The dictionary export adds the titles to the sentences,
            keeps non-sentences (``ignore_non_sentence=False``) and
            skips features and relations.
        '''
        if not isinstance(xml, XMLContent):
            xml = XMLContent(xml)

        # Only the version parsing is guarded: errors raised while
        # serializing the document must not be reported as version
        # problems nor be silently replaced by the dictionary fallback.
        try:
            use_xml_format = float(version[0:3]) >= 0.5
        except (TypeError, ValueError):
            # lazy logging arguments: formatting cannot fail in here,
            # even for odd ``version`` values such as tuples
            LOGGER.warning('Could not parse version: %s', version)
            use_xml_format = False

        if use_xml_format:
            return xml.get_xml_document(xml_version=2013).strip()

        return xml.as_dict(mapping=cls.ATTRIBUTE_MAPPING,
                           ignore_features=True,
                           ignore_relations=True,
                           ignore_non_sentence=False,
                           add_titles_to_sentences=True)
    def test_as_dict(self):
        ''' tests exporting the document as dict '''

        xml_content = '''<wl:page xmlns:wl="http://www.weblyzard.com/wl/2005" content_id="495692737" lang="en" nilsimsa="5bb001c8a610a105b1120bb9c4889d33c62b19e1493245cc2f252a83e270646b" title="Keystone report leaves environmental, energy, safety debates far from settled" source_id="12830" jonas_type="http" description="WASHINGTON &amp;mdash; The State Department minimized the climate change impact of building the Keystone XL pipeline in its final environmental review issued on Friday, a key finding as President Barack Obama decides whether to approve the controversial project. Olivier Douliery | Abaca Press/MCT Activists engage in civil disobedience Wednesday, February 13, 2013 at the White House in Washington, D.C., in hopes of pressuring President Barack Obama to reject the Keystone XL oil sands pipeline. http://media.mcclatchydc.com/smedia/2014/01/31/17/06/SoIRM.La.91.jpg &quot; style=&quot;border-left:2px solid #dddddd; padding-left:5px;max-width:100%;&quot;&gt; More News Read more Politics However, the review leaves the..." feed_url="http://rss.wn.com/english/keyword/" original_request_url="http://article.wn.com/view/2014/02/01/Keystone_report_leaves_environmental_energy_safety_debates_f_1/" content_type="text/html">
   <wl:sentence pos_tags="None" sem_orient="0.0" significance="12951.7567942" md5sum="0c8cb136073a20a932f2d6748204ce9b" pos="NNP CD ( NN ) : DT NNP NNP POS JJ JJ NN IN DT NN NN IN DT JJ NN NNS TO DT NNP NNP NNP VBZ VBN PRP VBP IN DT JJ CC JJ NN IN NNP NNP VBZ DT NN IN DT NN ." token="0,4 5,7 8,9 9,18 18,19 20,22 23,26 27,32 33,43 43,45 46,51 52,65 66,76 77,79 80,83 84,92 93,101 102,106 107,110 111,119 120,123 124,129 130,132 133,136 137,141 142,146 147,152 153,155 156,158 159,161 162,166 167,169 170,173 174,187 188,191 192,201 202,208 209,211 212,221 222,227 228,239 240,243 244,256 257,259 260,263 264,272 272,273"><![CDATA[Dec. 23 (Bloomberg) -- The State Department's final environmental assessment of the Keystone pipeline from the Canadian tar sands to the U.S. Gulf Coast is c. We look at the environmental and political impact if President Obama greenlights the construction of the pipeline.]]></wl:sentence>
   <wl:sentence pos_tags="None" sem_orient="0.0" significance="0.0" md5sum="cdc2b1edeec27081819ca4f50e067240" pos="NNP NNP VBZ VBN IN NNS : NNS ." token="0,6 7,15 16,18 19,25 26,28 29,35 35,36 37,42 42,43"><![CDATA[Shihab Rattansi is joined by guests: clima.]]></wl:sentence>
   </wl:page>'''

        expected_result = {'id': 495692737, 'lang': 'en',
                           'sentence': [{'id': '0c8cb136073a20a932f2d6748204ce9b',
                                         'token': '0,4 5,7 8,9 9,18 18,19 20,22 23,26 27,32 33,43 43,45 46,51 52,65 66,76 77,79 80,83 84,92 93,101 102,106 107,110 111,119 120,123 124,129 130,132 133,136 137,141 142,146 147,152 153,155 156,158 159,161 162,166 167,169 170,173 174,187 188,191 192,201 202,208 209,211 212,221 222,227 228,239 240,243 244,256 257,259 260,263 264,272 272,273',
                                         'value': '''Dec. 23 (Bloomberg) -- The State Department's final environmental assessment of the Keystone pipeline from the Canadian tar sands to the U.S. Gulf Coast is c. We look at the environmental and political impact if President Obama greenlights the construction of the pipeline.''',
                                         'pos': 'NNP CD ( NN ) : DT NNP NNP POS JJ JJ NN IN DT NN NN IN DT JJ NN NNS TO DT NNP NNP NNP VBZ VBN PRP VBP IN DT JJ CC JJ NN IN NNP NNP VBZ DT NN IN DT NN .'},
                                        {'id': 'cdc2b1edeec27081819ca4f50e067240',
                                         'token': '0,6 7,15 16,18 19,25 26,28 29,35 35,36 37,42 42,43',
                                         'value': 'Shihab Rattansi is joined by guests: clima.',
                                         'pos': 'NNP NNP VBZ VBN IN NNS : NNS .'}]}

        xml_obj = XMLContent(xml_content)

        attr_mapping = {'content_id': 'id',
                        'lang': 'lang',
                        'sentences': 'sentence',
                        'sentences_map': {'pos': 'pos',
                                          'token': 'token',
                                          'md5sum': 'id',
                                          'value': 'value'}}

        result = xml_obj.as_dict(mapping=attr_mapping)

        print('result: ')
        pprint(result)

        print('expected result')
        pprint(expected_result)
        assert result == expected_result

        # add the titles
        result2 = xml_obj.as_dict(mapping=attr_mapping,
                                  add_titles_to_sentences=True)
        assert len(result2['sentence']) == 3

        # ignore non-sentences (without pos tags)
        result3 = xml_obj.as_dict(mapping=attr_mapping,
                                  ignore_non_sentence=True,
                                  add_titles_to_sentences=True)
        assert len(result3['sentence']) == 2
Beispiel #3
0
    def test_as_dict(self):
        ''' tests exporting the document as dict '''

        xml_content = '''<wl:page xmlns:wl="http://www.weblyzard.com/wl/2005" content_id="495692737" lang="en" nilsimsa="5bb001c8a610a105b1120bb9c4889d33c62b19e1493245cc2f252a83e270646b" title="Keystone report leaves environmental, energy, safety debates far from settled" source_id="12830" jonas_type="http" description="WASHINGTON &amp;mdash; The State Department minimized the climate change impact of building the Keystone XL pipeline in its final environmental review issued on Friday, a key finding as President Barack Obama decides whether to approve the controversial project. Olivier Douliery | Abaca Press/MCT Activists engage in civil disobedience Wednesday, February 13, 2013 at the White House in Washington, D.C., in hopes of pressuring President Barack Obama to reject the Keystone XL oil sands pipeline. http://media.mcclatchydc.com/smedia/2014/01/31/17/06/SoIRM.La.91.jpg &quot; style=&quot;border-left:2px solid #dddddd; padding-left:5px;max-width:100%;&quot;&gt; More News Read more Politics However, the review leaves the..." feed_url="http://rss.wn.com/english/keyword/" original_request_url="http://article.wn.com/view/2014/02/01/Keystone_report_leaves_environmental_energy_safety_debates_f_1/" content_type="text/html">
   <wl:sentence pos_tags="None" sem_orient="0.0" significance="12951.7567942" md5sum="0c8cb136073a20a932f2d6748204ce9b" pos="NNP CD ( NN ) : DT NNP NNP POS JJ JJ NN IN DT NN NN IN DT JJ NN NNS TO DT NNP NNP NNP VBZ VBN PRP VBP IN DT JJ CC JJ NN IN NNP NNP VBZ DT NN IN DT NN ." token="0,4 5,7 8,9 9,18 18,19 20,22 23,26 27,32 33,43 43,45 46,51 52,65 66,76 77,79 80,83 84,92 93,101 102,106 107,110 111,119 120,123 124,129 130,132 133,136 137,141 142,146 147,152 153,155 156,158 159,161 162,166 167,169 170,173 174,187 188,191 192,201 202,208 209,211 212,221 222,227 228,239 240,243 244,256 257,259 260,263 264,272 272,273"><![CDATA[Dec. 23 (Bloomberg) -- The State Department's final environmental assessment of the Keystone pipeline from the Canadian tar sands to the U.S. Gulf Coast is c. We look at the environmental and political impact if President Obama greenlights the construction of the pipeline.]]></wl:sentence>
   <wl:sentence pos_tags="None" sem_orient="0.0" significance="0.0" md5sum="cdc2b1edeec27081819ca4f50e067240" pos="NNP NNP VBZ VBN IN NNS : NNS ." token="0,6 7,15 16,18 19,25 26,28 29,35 35,36 37,42 42,43"><![CDATA[Shihab Rattansi is joined by guests: clima.]]></wl:sentence>
   </wl:page>'''

        expected_result = {'id': 495692737, 'lang': 'en',
                           'sentence': [{'id': '0c8cb136073a20a932f2d6748204ce9b',
                                         'token': '0,4 5,7 8,9 9,18 18,19 20,22 23,26 27,32 33,43 43,45 46,51 52,65 66,76 77,79 80,83 84,92 93,101 102,106 107,110 111,119 120,123 124,129 130,132 133,136 137,141 142,146 147,152 153,155 156,158 159,161 162,166 167,169 170,173 174,187 188,191 192,201 202,208 209,211 212,221 222,227 228,239 240,243 244,256 257,259 260,263 264,272 272,273',
                                         'value': '''Dec. 23 (Bloomberg) -- The State Department's final environmental assessment of the Keystone pipeline from the Canadian tar sands to the U.S. Gulf Coast is c. We look at the environmental and political impact if President Obama greenlights the construction of the pipeline.''',
                                         'pos': 'NNP CD ( NN ) : DT NNP NNP POS JJ JJ NN IN DT NN NN IN DT JJ NN NNS TO DT NNP NNP NNP VBZ VBN PRP VBP IN DT JJ CC JJ NN IN NNP NNP VBZ DT NN IN DT NN .'},
                                        {'id': 'cdc2b1edeec27081819ca4f50e067240',
                                         'token': '0,6 7,15 16,18 19,25 26,28 29,35 35,36 37,42 42,43',
                                         'value': 'Shihab Rattansi is joined by guests: clima.',
                                         'pos': 'NNP NNP VBZ VBN IN NNS : NNS .'}]}

        xml_obj = XMLContent(xml_content)

        attr_mapping = {'content_id': 'id',
                        'lang': 'lang',
                        'sentences': 'sentence',
                        'sentences_map': {'pos': 'pos',
                                          'token': 'token',
                                          'md5sum': 'id',
                                          'value': 'value'}}

        result = xml_obj.as_dict(mapping=attr_mapping)

        print('result: ')
        pprint(result)

        print('expected result')
        pprint(expected_result)
        assert result == expected_result

        # add the titles
        result2 = xml_obj.as_dict(mapping=attr_mapping,
                                  add_titles_to_sentences=True)
        assert len(result2['sentence']) == 3

        # ignore non-sentences (without pos tags)
        result3 = xml_obj.as_dict(mapping=attr_mapping,
                                  ignore_non_sentence=True,
                                  add_titles_to_sentences=True)
        assert len(result3['sentence']) == 2
    def convert_document(cls, xml):
        ''' Returns the dictionary representation of the given document.

        Dictionaries are handed back unchanged. Any other input is wrapped
        into an ``XMLContent`` object (unless it already is one) and
        exported with ``cls.ATTRIBUTE_MAPPING``, ignoring non-sentences
        and adding the titles to the sentences.

        :param xml: the document as dict, ``XMLContent`` or a value
            accepted by the ``XMLContent`` constructor
        :returns: converted document
        :rtype: dict
        '''
        if isinstance(xml, dict):
            # dictionaries are passed through untouched
            return xml

        document = xml if isinstance(xml, XMLContent) else XMLContent(xml)
        return document.as_dict(mapping=cls.ATTRIBUTE_MAPPING,
                                ignore_non_sentence=True,
                                add_titles_to_sentences=True)
Beispiel #5
0
    def convert_document(cls, xml):
        ''' Provides the document as dictionary with the parameters used
        by this class (non-sentences ignored, titles added to the
        sentences).

        :param xml: the document; a dict is returned as is, everything
            that is not an ``XMLContent`` object is passed to the
            ``XMLContent`` constructor first
        :returns: converted document
        :rtype: dict
        '''
        # nothing to convert for dictionaries
        if isinstance(xml, dict):
            return xml

        if isinstance(xml, XMLContent):
            content = xml
        else:
            content = XMLContent(xml)

        return content.as_dict(
            mapping=cls.ATTRIBUTE_MAPPING,
            ignore_non_sentence=True,
            add_titles_to_sentences=True)
Beispiel #6
0
 def test_dictionary_export(self):
     ''' the default dictionary export of ``self.xml_content2`` must
         not be empty '''
     exported = XMLContent(self.xml_content2).as_dict()
     assert len(exported) > 0
 def test_dictionary_export(self):
     ''' exports ``self.xml_content2`` with the default settings and
         checks that the resulting dictionary has entries '''
     document = XMLContent(self.xml_content2)
     as_mapping = document.as_dict()
     assert len(as_mapping) > 0