def test_json_from_string(self):
    data = Json2xml.fromstring(
        '{"login":"******","id":1,"avatar_url":"https://avatars0.githubusercontent.com/u/1?v=4"}'
    ).data
    data_object = Json2xml(data)
    xml_output = data_object.json2xml()
    dict_from_xml = xmltodict.parse(xml_output)
    print('keys', dict_from_xml)
    # Since the output is valid XML, xmltodict can parse it and return
    # the elements nested under the root "all" tag.
    self.assertTrue(type(dict_from_xml['all']) == OrderedDict)
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Utility to convert json to valid xml.')
    parser.add_argument('--url', dest='url', action='store')
    parser.add_argument('--file', dest='file', action='store')
    parser.add_argument('--data', dest='data', action='store')
    args = parser.parse_args()

    if args.url:
        url = args.url
        data = Json2xml.fromurl(url)
        print(Json2xml.json2xml(data))
    if args.file:
        file = args.file
        data = Json2xml.fromjsonfile(file)
        print(Json2xml.json2xml(data))
    if args.data:
        str_data = args.data
        data = Json2xml.fromstring(str_data)
        print(Json2xml.json2xml(data))
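# A possible command-line invocation of main() above. The script name
# "json2xml" and the flag values are illustrative assumptions, not taken
# from this repository:
#
#   python json2xml.py --file example.json
#   python json2xml.py --data '{"id": 1, "name": "example"}'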
def toXml(self):
    data = Json2xml.fromstring(self.toJson()).data
    dataConverter = Json2xml(data)
    return dataConverter.json2xml()
def parse(self, response):
    linkitem = LinkItem()
    linkitem['url'] = response.url
    linkitem['response'] = response.status
    linkitem['parsable'] = any(d in response.url for d in parsable_domain_list)
    yield linkitem

    try:
        rawhtml = response.xpath('//html').extract()[0]
        article = DP(html=rawhtml, url=response.url)
        article.get_domaininfo()
        article.inspect_date()
        url_retrieved = []
        url_validate = re.compile(r'^https?')
        # logging.info(article.date_flag)
        # logging.info(article.has_more)
        if article.date_flag:
            article.inspect_article()
            article.clean_data()
            # Items shared by the article branch and the comment branch below.
            instanceitem = InstanceItem()
            linkritem = LinkRItem()
            if article.content_flag:
                articleitem = ArticleItem()
                articleitem['author'] = article.author
                articleitem['url'] = response.url
                articleitem['title'] = article.title
                articleitem['datetime'] = article.unixtime
                articleitem['domain'] = article.domain
                yield articleitem

                # The main article itself is stored as an instance.
                instanceitem['author'] = article.author
                instanceitem['url'] = response.url
                instanceitem['datetime'] = article.datetime
                instanceitem['unixtime'] = article.unixtime
                instanceitem['type'] = 'Article'
                instanceitem['text_body'] = article.content
                instanceitem['text_body_html'] = article.content_html
                instanceitem['likes'] = article.likes
                instanceitem['links_contained'] = []
                instanceitem['relevance'] = article.content_flag
                instanceitem['gen_time'] = time.time()
                for link in article.links:
                    if url_validate.search(str(link['href'])) is not None:
                        instanceitem['links_contained'].append(link['href'])
                        linkritem['link_from'] = response.url
                        linkritem['link_to'] = link['href']
                        linkritem['gen_time'] = instanceitem['gen_time']
                        yield linkritem
                        url_retrieved.append(str(link['href']))
                        yield scrapy.Request(str(link['href']), callback=self.parse)
                instanceitem['links_contained'] = ','.join(instanceitem['links_contained'])
                yield instanceitem

            if article.has_more:
                instance = IP(url=response.url)
                if instance.domain in json2xml_list:
                    # Comments are embedded as JSON: convert them to XML so they
                    # can be walked with BeautifulSoup selectors.
                    instance.get_instanceinfo_json()
                    # logging.info(instance.json_xpath)
                    json_data = Json2xml.fromstring(
                        response.xpath(instance.json_xpath).extract_first()).data
                    json_object = Json2xml(json_data).json2xml()
                    instance_iter = BeautifulSoup(json_object, 'lxml').select(instance.instance_selector)
                    # logging.info(len(instance_iter))
                    for i in instance_iter:
                        instanceitem['author'] = i.find(instance.author_selector).get_text()
                        instanceitem['url'] = response.url
                        instanceitem['datetime'] = i.find_all(instance.datetime_selector)[-1].get_text()
                        instanceitem['unixtime'] = time.mktime(
                            dateparser.parse(instanceitem['datetime']).timetuple())
                        instanceitem['type'] = 'Comment'
                        instanceitem['text_body_html'] = ''
                        instanceitem['text_body'] = i.find_all(instance.content_selector)[-1].get_text()
                        instanceitem['likes'] = ''
                        instanceitem['id'] = i.find_all('url')[-1].get_text()
                        instanceitem['reply_to'] = ''
                        instanceitem['links_contained'] = re.findall(r'(https?://[^\s]+)', instanceitem['text_body'])
                        instanceitem['relevance'] = article.content_flag
                        instanceitem['gen_time'] = time.time()
                        for link in instanceitem['links_contained']:
                            if url_validate.search(str(link)) is not None:
                                linkritem['link_from'] = response.url
                                linkritem['link_to'] = str(link)
                                linkritem['gen_time'] = instanceitem['gen_time']
                                yield linkritem
                                url_retrieved.append(str(link))
                                yield scrapy.Request(str(link), callback=self.parse)
                        instanceitem['links_contained'] = ','.join(instanceitem['links_contained'])
                        if instanceitem['text_body'] is not None:
                            yield instanceitem
                else:
                    # Comments are plain markup: walk them with XPath selectors.
                    instance.get_instanceinfo()
                    instance_iter = response.xpath(instance.instance_xpath)
                    for i in instance_iter:
                        instanceitem['author'] = i.xpath(instance.author_xpath).extract_first()
                        instanceitem['url'] = response.url
                        instanceitem['datetime'] = i.xpath(instance.datetime_xpath).extract_first()
                        instanceitem['unixtime'] = time.mktime(
                            dateparser.parse(instanceitem['datetime']).timetuple())
                        instanceitem['type'] = 'Comment'
                        instanceitem['text_body_html'] = i.xpath(instance.content_html_xpath).extract_first()
                        instanceitem['likes'] = i.xpath(instance.likes_xpath).extract_first()
                        instanceitem['id'] = i.xpath(instance.id_xpath).extract_first()
                        instanceitem['reply_to'] = i.xpath(instance.reply_to_xpath).extract_first()
                        instanceitem['links_contained'] = i.xpath(instance.links_contained_xpath).extract()
                        instanceitem['relevance'] = article.content_flag
                        instanceitem['gen_time'] = time.time()
                        for link in instanceitem['links_contained']:
                            if url_validate.search(str(link)) is not None:
                                linkritem['link_from'] = response.url
                                linkritem['link_to'] = str(link)
                                linkritem['gen_time'] = instanceitem['gen_time']
                                yield linkritem
                                url_retrieved.append(str(link))
                                yield scrapy.Request(str(link), callback=self.parse)
                        instanceitem['links_contained'] = ','.join(instanceitem['links_contained'])
                        if instanceitem['text_body_html'] is not None:
                            instanceitem['text_body'] = BeautifulSoup(
                                instanceitem['text_body_html'], 'lxml').get_text().strip()
                            yield instanceitem
        # if not len(url_retrieved) == 0:
        #     url_retrieved = list(set(url_retrieved))
        #     urlfile = open('urls.txt', 'a')
        #     for link in url_retrieved:
        #         urlfile.write("{}\n".format(link))
        #         yield scrapy.Request(link, callback=self.parse)
    except Exception:
        # Swallow per-page failures so one bad response does not stop the crawl.
        pass