def _get_kindcode_and_startpage(country, patnum, kindcode, target_folder):
    """
    Get kindcode and startpage of a given patent.

    Iterates through a list of possible kindcodes; for every kindcode, the
    first page of the patent document is tried to be retrieved. A successful
    download gives us the correct kindcode, for which the start page of the
    description is then retrieved.

    - country: Country code
    - patnum: Patent number
    - kindcode: Kindcode (e.g. "A1", "B", ...); if given, it is tried first
    - target_folder: Folder that the first page is downloaded to

    Returns a tuple (kindcode, startpage):
    - kindcode: the last value returned by download.get_pdf_page, or the
      argument passed in if every attempt raised before returning.
    - startpage: meta['DESCRIPTION'] for the confirmed kindcode, or None if
      no meta data could be retrieved.
    """
    startpage = None

    # The caller's kindcode (if any) goes first, followed by the defaults.
    # Duplicates are skipped so that no kindcode is probed twice.
    candidates = [kindcode] if kindcode else []
    for default_code in ('A1', 'A2', 'A3', 'A4', 'B1'):
        if default_code not in candidates:
            candidates.append(default_code)
    print(candidates)

    for code in candidates:
        try:
            kindcode = download.get_pdf_page(country, patnum, code, 1,
                                             target_folder)
            if not kindcode:
                # Nothing was downloaded for this kindcode; try the next one.
                continue
            meta = download.get_meta_data(country, patnum, 'DESCRIPTION',
                                          kindcode, skip=True)
            print(meta)
            if meta:
                startpage = meta['DESCRIPTION']
            # The first page was downloaded, so this kindcode is the one.
            break
        except Exception as exc:
            # Best effort: report the failure and move on to the next
            # candidate. KeyboardInterrupt/SystemExit are deliberately not
            # caught so the loop can still be aborted.
            print("Unexpected error: %s" % (exc,))
            continue

    print("%s %s" % (kindcode, startpage))
    return kindcode, startpage
def download_searchreports(filename):
    """
    Parses the input file (please refer to documentation for valid input
    format) and tries to retrieve the search report for every patent found.

    As handled here, every non-blank line holds one or two fields separated
    by ',' or ';'. Each field is a patent identifier whose country code and
    number are separated by '.' or '-'.

    - The first field names the patent whose search report is downloaded
      into config.searchreport_dir, unless a matching PDF already exists.
    - The second field, if present, is collected (without duplicates) and
      written to 'whitelist.txt' in the current working directory.

    Patents for which no meta data is available are appended to
    'searchreports_not_downloadable.log' in config.searchreport_dir.

    - filename: Path of the input file

    Returns None. If the input file cannot be opened, a message is printed
    and nothing else is done.
    """
    # Read the input first so that no other file is opened (or left open)
    # when the input is missing.
    try:
        with open(filename) as infile:
            lines = infile.readlines()
    except IOError:
        print("Input file " + filename + " could not be opened.")
        return

    whitelist = []
    log_path = config.searchreport_dir + "searchreports_not_downloadable.log"

    with open(log_path, "a") as error_file:
        for line in lines:
            stripped = line.strip()
            # Lines read from a file keep their newline, so the emptiness
            # check has to be done on the stripped text.
            if not stripped:
                continue

            fields = [field.strip()
                      for field in stripped.replace(';', ',').split(',')]

            # Optional second field: "<country>.<number>" whitelist entry.
            if len(fields) > 1:
                wl_parts = fields[1].replace('-', '.').split('.')
                if len(wl_parts) > 1:
                    wl = wl_parts[0] + '.' + wl_parts[1]
                    if wl not in whitelist:
                        whitelist.append(wl)

            id_parts = fields[0].replace('-', '.').split('.')
            if len(id_parts) < 2:
                # Without both country and number there is nothing to
                # download; report the line instead of aborting the run.
                print("Skipping malformed line: " + stripped)
                continue
            country, patnum = id_parts[0], id_parts[1]
            name = country + "." + patnum
            print("Downloading search report for " + name)

            # Has the search report already been downloaded?
            pdfs = glob.glob(config.searchreport_dir + name + "*.pdf")
            if not pdfs:
                meta = download.get_meta_data(country, patnum)
                if meta:
                    download.get_searchreport_pdf(
                        meta, config.searchreport_dir + name)
                else:
                    error_file.write(name + "\n")

    with open('whitelist.txt', 'w') as whitelist_file:
        whitelist_file.write('\n'.join(whitelist))