def main():
    if len(sys.argv) != 5:
        print("Usage: %s xgappfile source_folder gate_output_folder target_folder" % (sys.argv[0]))
        print("Source folder should only contain plaintext profile data for processing")
        return

    _, xgappfile, source_folder, gate_output_folder, target_folder = sys.argv
    xgapp_canonical = os.path.realpath(xgappfile)
    src_canonical = os.path.realpath(source_folder)
    target_canonical = os.path.realpath(target_folder)

    # Create the GATE output folder if it does not already exist
    try:
        os.makedirs(gate_output_folder)
    except OSError:
        pass

    # Annotate the plaintext profiles with GATE
    gateannotator.run_gate_annotator(xgapp_canonical, src_canonical, gate_output_folder)

    # Convert each annotated file, mirroring the folder structure into the target folder
    for infile, outfile in util.traverse_mirror(gate_output_folder, target_folder):
        print("*** Converting %s to %s" % (infile, outfile))
        xml = ie.convert_file(infile)
        with open(outfile, "w") as f:
            f.write(xml)
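
# Usage sketch (illustrative): assuming this module is run directly as a script.
# The script name and paths below are hypothetical; the original file may already
# define an equivalent entry point.
#
#   python gate_convert.py annotator.xgapp profiles_txt/ gate_output/ converted/
#
if __name__ == "__main__":
    main()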

def main():
    parser = argparse.ArgumentParser(
        description='Searcher, scraper and RDF converter for EuroNext.'
    )
    subparser = parser.add_subparsers(help='commands', dest='command')

    # Search command
    search_command = subparser.add_parser('search', help='Search EuroNext website')
    search_subcommands = search_command.add_subparsers(help='Search commands', dest='search_subcommand')
    search_command.add_argument('-o', dest='output',
                                help='Write search results to file, which can be used as input to the scrape command')
    search_command.add_argument('--max-results', default=None, type=int, dest='maxresults',
                                help='Maximum results from search')

    keyword_command = search_subcommands.add_parser('keyword', help='Search EuroNext website by keyword')
    keyword_command.add_argument('keyword', help='Keyword to search by')

    icb_command = search_subcommands.add_parser('icb', help='Search EuroNext website by ICB code')
    icb_command.add_argument('icb', help='ICB code to search by (e.g. 7000 will find all matching 7XXX)')

    def add_pickle_argument(command):
        command.add_argument('--pickle', action='store_true', default=False,
                             help='Output as pickled objects. Can be converted to RDF using the '
                                  'rdfconvert command. Used to allow changes to the RDF format '
                                  'without having to write converters for RDF output files')

    def add_extract_profiles_argument(command):
        command.add_argument('--extract-profiles', dest='extract_profiles',
                             help='Extract cp:profile as text files into the given folder, '
                                  'which can then be processed with GATE.')

    # Scrape commands
    scrapeone_command = subparser.add_parser('scrapeone', help='Scrape a page from EuroNext given ISIN and MIC')
    scrapeone_command.add_argument('isin', help='ISIN number of company')
    scrapeone_command.add_argument('mic', help='ISO-10383 MIC for company (in URL of source URL)')
    scrapeone_command.add_argument('outputfile', help='Path to a writable output file')
    add_pickle_argument(scrapeone_command)

    scrape_command = subparser.add_parser('scrape', help='Scrape from a file')
    scrape_command.add_argument('inputfile',
                                help='Path to file containing space-separated ISINs and MICs, one per line.'
                                     " Can be generated with the 'search' command.")
    scrape_command.add_argument('outputdir', help='Path to a writeable output directory')
    add_pickle_argument(scrape_command)

    # rdfconvert command
    rdfconvert_command = subparser.add_parser('rdfconvert', help='Convert pickled objects to RDF')
    rdfconvert_command.add_argument('inputpath', help='Source file or folder (if --batch)')
    rdfconvert_command.add_argument('outputpath', help='Destination file or folder (if --batch)')
    rdfconvert_command.add_argument('--batch', action='store_true', default=False,
                                    help='Convert all .pickle files recursively in "inputpath"')

    # extractprofiles command
    extract_profiles_command = subparser.add_parser('extractprofiles',
                                                    help='Extract cp:profile as text files into the given folder, '
                                                         'which can then be processed with GATE')
    extract_profiles_command.add_argument('inputdir', help='Directory containing cp:graphs')
    extract_profiles_command.add_argument('outputdir', help='Output directory')

    args = parser.parse_args()

    if args.command == 'search':
        if hasattr(args, 'keyword'):
            search(keyword=args.keyword, outputfile=args.output, maxresults=args.maxresults)
        elif hasattr(args, 'icb'):
            search(icb=args.icb, outputfile=args.output, maxresults=args.maxresults)
    elif args.command == 'scrapeone':
        scrape(args.isin, args.mic, args.outputfile, args.pickle)
    elif args.command == 'scrape':
        with open(args.inputfile) as f:
            isins_mics = util.read_space_delimited_file(f)
        for isin_mic in isins_mics:
            extension = 'pickle' if args.pickle else 'n3'
            timestamp = int(time.time() * 1000)
            outputfile = "%s/%s-%s-%s.%s" % (args.outputdir, isin_mic[0], isin_mic[1], timestamp, extension)
            print("Scraping %s, %s to %s" % (isin_mic[0], isin_mic[1], outputfile))
            try:
                scrape(isin_mic[0], isin_mic[1], outputfile, args.pickle, timestamp=timestamp)
            except Exception as e:
                logger.exception("Failed to scrape %s" % isin_mic)
    elif args.command == 'rdfconvert':
        if args.batch:
            files = list(util.traverse_mirror(args.inputpath, args.outputpath, '.pickle', '.n3'))
        else:
            files = [(args.inputpath, args.outputpath)]
        for inputfile, outputfile in files:
            print("Converting %s to %s" % (inputfile, outputfile))
            with open(inputfile, 'rb') as f:
                scraped = pickle.load(f)
            rdfconvert(scraped, outputfile)
    elif args.command == 'extractprofiles':
        for directory, file in util.traverse(args.inputdir, '.n3'):
            inputfile = directory + os.sep + file
            g = Graph()
            g.parse(inputfile, format='n3')
            for cp_id, _, profile in g.triples((None, NS['cp']['profile'], None)):
                # Output into a language-specific folder
                outdir = args.outputdir + os.sep + profile.language
                if not os.path.exists(outdir):
                    os.makedirs(outdir)
                profile_id = cp_id.split('#')[1]
                outputfile = "%s/%s.txt" % (outdir, profile_id)
                print(inputfile, "->", outputfile)
                with open(outputfile, "w+") as f:
                    # Replace HTML tags so they do not get passed as input to GATE
                    profile_clean = re.sub(r'<[^>]+>', '\n', profile)
                    f.write(profile_clean)

def main():
    parser = argparse.ArgumentParser(
        description='Searcher, scraper and RDF converter for Deutsche Borse.'
    )
    subparser = parser.add_subparsers(help='commands', dest='command')

    # Search command
    search_command = subparser.add_parser('search', help='Search Deutsche Borse index constituents')
    search_command.add_argument('isin', help='ISIN of the Deutsche Borse index')
    search_command.add_argument('-o', dest='output_file', help='Output file for results')

    def add_pickle_argument(command):
        command.add_argument('--pickle', action='store_true', default=False,
                             help='Output as pickled objects. Can be converted to RDF using the '
                                  'rdfconvert command. Used to allow changes to the RDF format '
                                  'without having to write converters for RDF output files')

    # Scrape commands
    scrapeone_command = subparser.add_parser('scrapeone', help='Scrape a page given ISIN')
    scrapeone_command.add_argument('isin', help='ISIN number of company')
    scrapeone_command.add_argument('outputfile', help='Path to a writable output file')
    add_pickle_argument(scrapeone_command)

    scrape_command = subparser.add_parser('scrape', help='Scrape from a file')
    scrape_command.add_argument('inputfile',
                                help='Path to file containing ISINs, one per line.'
                                     " Can be generated with the 'search' command.")
    scrape_command.add_argument('outputdir', help='Path to a writeable output directory')
    add_pickle_argument(scrape_command)

    # rdfconvert command
    rdfconvert_command = subparser.add_parser('rdfconvert', help='Convert pickled objects to RDF')
    rdfconvert_command.add_argument('inputpath', help='Source file or folder (if --batch)')
    rdfconvert_command.add_argument('outputpath', help='Destination file or folder (if --batch)')
    rdfconvert_command.add_argument('--batch', action='store_true', default=False,
                                    help='Convert all .pickle files recursively in "inputpath"')

    args = parser.parse_args()

    if args.command == 'search':
        search_index_constituents(args.isin, output_file=args.output_file)
    elif args.command == 'scrapeone':
        scrape(args.isin, args.outputfile, args.pickle)
    elif args.command == 'scrape':
        extension = 'pickle' if args.pickle else 'n3'
        with open(args.inputfile) as f:
            isins = [l[0] for l in util.read_space_delimited_file(f)]
        print(len(isins), " ISINs found")
        for i, isin in enumerate(isins):
            timestamp = int(time.time() * 1000)
            outputfile = "%s/%s-%s.%s" % (args.outputdir, isin, timestamp, extension)
            print("%d. Scraping %s to %s" % (i + 1, isin, outputfile))
            try:
                scrape(isin, outputfile, args.pickle, timestamp=timestamp)
            except Exception as e:
                logger.exception("Failed to scrape %s: %s", isin, str(e))
            # Pause between requests to avoid hammering the server
            time.sleep(1)
    elif args.command == 'rdfconvert':
        if args.batch:
            files = list(util.traverse_mirror(args.inputpath, args.outputpath, '.pickle', '.n3'))
        else:
            files = [(args.inputpath, args.outputpath)]
        for inputfile, outputfile in files:
            print("Converting %s to %s" % (inputfile, outputfile))
            with open(inputfile, 'rb') as f:
                scraped = pickle.load(f)
            rdfconvert(scraped, outputfile)
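
# Usage sketch (illustrative): example invocations assuming this module is run
# directly as a script. The script name 'deutsche_boerse.py', the index ISIN and
# the paths are hypothetical.
#
#   python deutsche_boerse.py search DE0008469008 -o dax_isins.txt
#   python deutsche_boerse.py scrape dax_isins.txt scraped/ --pickle
#   python deutsche_boerse.py rdfconvert scraped/ rdf/ --batch
#
if __name__ == "__main__":
    main()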