def main():
    """Parse the JSON file named on the command line and walk its parse
    tree with an XMLEmitter listener, printing the tree before and after.

    Reads the input path from sys.argv[1]; raises IndexError if no
    argument is given.
    """
    ais = ANTLRFileStream(sys.argv[1])
    lexer = JSONLexer(ais)
    tokens = CommonTokenStream(lexer)
    parser = JSONParser(tokens)
    parser.setBuildParseTree(True)
    tree = parser.json()
    # Was a Python-2 `print tree` statement; the rest of the file uses
    # print() calls, and the old form is a SyntaxError under Python 3.
    print(tree)
    walker = ParseTreeWalker()
    converter = XMLEmitter()
    # The emitter is a listener; walking mutates/emits via the converter.
    walker.walk(converter, tree)
    print(tree)
def do_harvest(query, iterations):
    """Harvest book records from DNB search-result pages for *query*,
    build a keyword co-occurrence graph, and return its serialization.

    Pages are fetched until no result links are found or the stop
    condition below is reached. Returns parser.tostring(graph).
    """
    book_data = {}
    current_position = 0
    query_string = QUERY_FORMAT_STRING.format(query)
    graph = Graph()
    parser = JSONParser.JSONParser()
    # keyword -> graph node, so each distinct keyword maps to one node
    nodes = {}

    # NOTE(review): this compares the decreasing iteration budget against
    # the growing node count — confirm this combined stop rule is intended
    # (a plain `iterations > 0` countdown may have been meant).
    while iterations > len(nodes):
        page = requests.get(query_string)
        tree = html.fromstring(page.content)
        links = tree.xpath('//table[@id="searchresult"]//a/@href')
        if not links:
            break
        for link in links:
            book_info_response = requests.get(BASE_URL_DNB + link)
            # NOTE(review): book_data is shared across links — if a book
            # lacks a field, the previous book's value may be reused;
            # confirm the helpers always overwrite their key.
            get_data_from_book_info(book_data, book_info_response, "Titel")
            get_data_from_book_info(book_data, book_info_response, "Person(en)")
            get_data_from_book_info_link(book_data, book_info_response, "Schlagwörter")
            if book_data['Schlagwörter']:
                for v in book_data.values():
                    print(v)
                # Bug fix: previously a fresh node was created on EVERY
                # occurrence of a keyword, orphaning earlier duplicates and
                # leaving edges attached to stale nodes. Reuse the node.
                for s in book_data['Schlagwörter']:
                    if s not in nodes:
                        nodes[s] = graph.add_node(s)
                # Star topology: connect the book's first keyword to each
                # of its other keywords, labelling edges with the title.
                s1 = book_data['Schlagwörter'][0]
                for s in book_data['Schlagwörter']:
                    if s != s1:
                        edge = graph.add_edge(nodes[s1], nodes[s])
                        edge['label'] = book_data['Titel']
        # Advance to the next result page for the same query.
        query_string = QUERY_FORMAT_STRING_2.format(query, str(current_position))
        current_position += len(links)
        iterations -= 1
    return parser.tostring(graph)