Example #1
    def test_json_from_string(self):
        data = Json2xml.fromstring(
            '{"login":"******","id":1,"avatar_url":"https://avatars0.githubusercontent.com/u/1?v=4"}'
        ).data
        data_object = Json2xml(data)
        xml_output = data_object.json2xml()
        dict_from_xml = xmltodict.parse(xml_output)
        # Since the output is valid XML, xmltodict can parse it and return
        # the converted elements from under the root <all> tag.
        self.assertIsInstance(dict_from_xml['all'], OrderedDict)
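For context, here is a minimal standalone round trip of the same idea; the import paths for the legacy Json2xml API are an assumption and may need adjusting for your installed version:

from collections import OrderedDict

import xmltodict
from json2xml import Json2xml  # assumed legacy import path

# JSON string -> Python data -> XML string -> dict, mirroring the test above
data = Json2xml.fromstring('{"id": 1, "login": "demo"}').data
xml_output = Json2xml(data).json2xml()
parsed = xmltodict.parse(xml_output)
assert isinstance(parsed['all'], OrderedDict)  # root element is <all> in this legacy API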
Example #2
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Utility to convert JSON to valid XML.')
    parser.add_argument('--url', dest='url', action='store')
    parser.add_argument('--file', dest='file', action='store')
    parser.add_argument('--data', dest='data', action='store')
    args = parser.parse_args(argv)  # pass argv through so callers can inject arguments

    if args.url:
        data = Json2xml.fromurl(args.url)
        print(Json2xml.json2xml(data))

    if args.file:
        data = Json2xml.fromjsonfile(args.file)
        print(Json2xml.json2xml(data))

    if args.data:
        data = Json2xml.fromstring(args.data)
        print(Json2xml.json2xml(data))
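A sketch of how this entry point might be wired up (the module name json2xml_cli.py and the guard below are illustrative, not from the source):

import argparse  # required by main() above

if __name__ == '__main__':
    # e.g. python json2xml_cli.py --data '{"id": 1}'
    main()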
Example #3
    def toXml(self):
        # Round-trip through JSON: serialize this object, then convert the
        # resulting JSON string to XML.
        data = Json2xml.fromstring(self.toJson()).data
        dataConverter = Json2xml(data)
        return dataConverter.json2xml()
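A minimal host class showing how toXml composes with a matching toJson method (the Person class, its fields, and the import path are illustrative assumptions, not from the source):

import json

from json2xml import Json2xml  # assumed legacy import path

class Person:
    def __init__(self, name, age):
        self.name = name
        self.age = age

    def toJson(self):
        return json.dumps({'name': self.name, 'age': self.age})

    def toXml(self):
        data = Json2xml.fromstring(self.toJson()).data
        return Json2xml(data).json2xml()

print(Person('Ada', 36).toXml())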
Example #4
    def parse(self, response):
        linkitem = LinkItem()
        linkitem['url'] = response.url
        linkitem['response'] = response.status
        linkitem['parsable'] = any(d in response.url for d in parsable_domain_list)

        yield linkitem

        try:

            rawhtml = response.xpath('//html').extract()[0]
            article = DP(html=rawhtml, url=response.url)
            article.get_domaininfo()
            article.inspect_date()
            url_retrieved = []
            url_validate = re.compile(r'^https?')
            # logging.info(article.date_flag)
            # logging.info(article.has_more)

            if article.date_flag:
                article.inspect_article()
                article.clean_data()

            if article.content_flag:
                articleitem = ArticleItem()
                instanceitem = InstanceItem()
                linkritem = LinkRItem()

                articleitem['author'] = article.author
                articleitem['url'] = response.url
                articleitem['title'] = article.title
                articleitem['datetime'] = article.unixtime
                articleitem['domain'] = article.domain

                yield articleitem

                # main article as an instance
                instanceitem['author'] = article.author
                instanceitem['url'] = response.url
                instanceitem['datetime'] = article.datetime
                instanceitem['unixtime'] = article.unixtime
                instanceitem['type'] = 'Article'
                instanceitem['text_body'] = article.content
                instanceitem['text_body_html'] = article.content_html
                instanceitem['likes'] = article.likes
                instanceitem['links_contained'] = []
                instanceitem['relevance'] = article.content_flag
                instanceitem['gen_time'] = time.time()
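                # Follow every absolute link in the article body, recording the
                # page-to-page edge as a LinkRItem before scheduling the crawl.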
                for link in article.links:
                    if url_validate.search(str(link['href'])) is not None:
                        instanceitem['links_contained'].append(link['href'])
                        linkritem['link_from'] = response.url
                        linkritem['link_to'] = link['href']
                        linkritem['gen_time'] = instanceitem['gen_time']
                        yield linkritem
                        url_retrieved.append(str(link['href']))
                        yield scrapy.Request(str(link['href']), callback=self.parse)

                instanceitem['links_contained'] = ','.join(instanceitem['links_contained'])

                yield instanceitem

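            # Pages flagged has_more carry a comment thread; sites listed in
            # json2xml_list embed their comments as JSON, the rest expose HTML.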
            if article.has_more:
                instance = IP(url=response.url)
                if instance.domain in json2xml_list:
                    instance.get_instanceinfo_json()
                    # logging.info(instance.json_xpath)

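                    # Convert the embedded JSON payload to XML so the comments
                    # can be selected with BeautifulSoup like regular markup.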
                    json_data = Json2xml.fromstring(response.xpath(instance.json_xpath).extract_first()).data
                    json_object = Json2xml(json_data).json2xml()

                    instance_iter = BeautifulSoup(json_object, 'lxml').select(instance.instance_selector)
                    # logging.info(len(instance_iter))
                    for i in instance_iter:
                        instanceitem['author'] = i.find(instance.author_selector).get_text()
                        instanceitem['url'] = response.url
                        instanceitem['datetime'] = i.find_all(instance.datetime_selector)[-1].get_text()
                        instanceitem['unixtime'] = time.mktime(dateparser.parse(instanceitem['datetime']).timetuple())
                        instanceitem['type'] = 'Comment'
                        instanceitem['text_body_html'] = ''
                        instanceitem['text_body'] = i.find_all(instance.content_selector)[-1].get_text()
                        instanceitem['likes'] = ''
                        instanceitem['id'] = i.find_all('url')[-1].get_text()
                        instanceitem['reply_to'] = ''
                        instanceitem['links_contained'] = re.findall(r'(https?://[^\s]+)', instanceitem['text_body'])
                        instanceitem['relevance'] = article.content_flag
                        instanceitem['gen_time'] = time.time()
                        for link in instanceitem['links_contained']:
                            if url_validate.search(str(link)) is not None:
                                linkritem['link_from'] = response.url
                                linkritem['link_to'] = str(link)
                                linkritem['gen_time'] = instanceitem['gen_time']
                                yield linkritem
                                url_retrieved.append(str(link))
                                yield scrapy.Request(str(link), callback=self.parse)

                        instanceitem['links_contained'] = ','.join(instanceitem['links_contained'])

                        if instanceitem['text_body'] is not None:
                            yield instanceitem

                else:
                    instance.get_instanceinfo()

                    instance_iter = response.xpath(instance.instance_xpath)
                    for i in instance_iter:
                        instanceitem['author'] = i.xpath(instance.author_xpath).extract_first()
                        instanceitem['url'] = response.url
                        instanceitem['datetime'] = i.xpath(instance.datetime_xpath).extract_first()
                        instanceitem['unixtime'] = time.mktime(dateparser.parse(instanceitem['datetime']).timetuple())
                        instanceitem['type'] = 'Comment'
                        instanceitem['text_body_html'] = i.xpath(instance.content_html_xpath).extract_first()
                        instanceitem['likes'] = i.xpath(instance.likes_xpath).extract_first()
                        instanceitem['id'] = i.xpath(instance.id_xpath).extract_first()
                        instanceitem['reply_to'] = i.xpath(instance.reply_to_xpath).extract_first()
                        instanceitem['links_contained'] = i.xpath(instance.links_contained_xpath).extract()
                        instanceitem['relevance'] = article.content_flag
                        instanceitem['gen_time'] = time.time()
                        for link in instanceitem['links_contained']:
                            if url_validate.search(str(link)) is not None:
                                linkritem['link_from'] = response.url
                                linkritem['link_to'] = str(link)
                                linkritem['gen_time'] = instanceitem['gen_time']
                                yield linkritem
                                url_retrieved.append(str(link))
                                yield scrapy.Request(str(link), callback=self.parse)

                        instanceitem['links_contained'] = ','.join(instanceitem['links_contained'])

                        if instanceitem['text_body_html'] is not None:
                            instanceitem['text_body'] = BeautifulSoup(instanceitem['text_body_html'],
                                                                      'lxml').get_text().strip()
                            yield instanceitem

        except Exception as e:
            # Don't abort the crawl on a single bad page, but log the failure
            # instead of silently swallowing it.
            self.logger.warning('parse failed for %s: %s', response.url, e)
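For completeness, a sketch of the item classes the spider assumes (field names are inferred from the assignments in parse(); the real items.py may define more, and the surrounding module would also need scrapy, re, time, dateparser, BeautifulSoup, and the project's DP/IP helpers imported):

# Hypothetical items.py inferred from the spider above.
import scrapy

class LinkItem(scrapy.Item):
    url = scrapy.Field()
    response = scrapy.Field()
    parsable = scrapy.Field()

class LinkRItem(scrapy.Item):
    link_from = scrapy.Field()
    link_to = scrapy.Field()
    gen_time = scrapy.Field()

class ArticleItem(scrapy.Item):
    author = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    datetime = scrapy.Field()
    domain = scrapy.Field()

class InstanceItem(scrapy.Item):
    author = scrapy.Field()
    url = scrapy.Field()
    datetime = scrapy.Field()
    unixtime = scrapy.Field()
    type = scrapy.Field()
    text_body = scrapy.Field()
    text_body_html = scrapy.Field()
    likes = scrapy.Field()
    links_contained = scrapy.Field()
    relevance = scrapy.Field()
    gen_time = scrapy.Field()
    id = scrapy.Field()
    reply_to = scrapy.Field()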