예제 #1
0
def _get_kindcode_and_startpage(country, patnum, kindcode, target_folder):
    """
    Get kindcode and startpage of a given patent.
    Iterates through a list of possible kindcodes; for every kindocde, the
    first page of the patent document is tried to be retrieved. A successful download
    gives us the correct kindcode, for which the start page of the description is then retrieved.
    - country: Country code
    - patnum: Patent number
    - kindocde: Kindcode (e.g. "A1", "B", ...)
    - target_folder: Folder that the first page is downloaded to
    """
    startpage = None
    possible_kindcodes = []
    if kindcode: possible_kindcodes = [kindcode]

    possible_kindcodes += ['A1', 'A2', 'A3', 'A4', 'B1']
    print possible_kindcodes
    for code in possible_kindcodes:
        try:
            kindcode = download.get_pdf_page(country, patnum, code, 1, target_folder)
            if kindcode:
                meta = download.get_meta_data(country, patnum, 'DESCRIPTION', kindcode, skip=True)
                print meta
                if meta:
                    startpage = meta['DESCRIPTION']
                    break
        except:
            e =  "Unexpected error:", sys.exc_info()[0]
            continue
    print kindcode, startpage        
    return kindcode, startpage
예제 #2
0
def _split_patent_id(field):
    """Split "CC.NUMBER[...]" or "CC-NUMBER[...]" into (country, patnum); return None if malformed."""
    parts = field.strip().replace('-', '.').split('.')
    if len(parts) < 2 or not parts[0] or not parts[1]:
        return None
    return parts[0], parts[1]


def download_searchreports(filename):
    """ Parses the input file (please refer to documentation for valid input
    format) and tries to retrieve the search report for every patent found.

    As read by this function, every non-blank line consists of fields
    separated by ',' or ';':
    - field 1: the patent whose search report is wanted, written as
      "CC.NUMBER" or "CC-NUMBER" (anything after a further '.'/'-' is ignored)
    - field 2 (optional): a patent id in the same notation that is collected
      into the whitelist

    Side effects:
    - a search report is downloaded into config.searchreport_dir unless a
      matching "<CC>.<NUMBER>*.pdf" already exists there
    - patents for which no meta data could be retrieved are appended to
      "searchreports_not_downloadable.log" in config.searchreport_dir
    - the de-duplicated whitelist ids are written to 'whitelist.txt' in the
      current working directory, one per line, in order of first appearance

    - filename: Path of the input file

    Returns None. If the input file cannot be opened, a message is printed
    and nothing else is touched (no log or whitelist file is created).
    """
    # Read the input first so that an unreadable file leaves no side effects.
    try:
        with open(filename) as input_file:
            lines = input_file.readlines()
    except IOError:
        print("Input file " + filename + " could not be opened.")
        return

    whitelist = []
    whitelisted = set()  # O(1) duplicate check; the list keeps the order
    error_log_path = config.searchreport_dir + "searchreports_not_downloadable.log"

    with open(error_log_path, "a") as error_file:
        for line in lines:
            # readlines() keeps the line terminator, so strip before testing
            # for blank lines.
            line = line.strip()
            if not line:
                continue
            fields = line.replace(';', ',').split(',')

            # The optional second field feeds the whitelist.
            if len(fields) > 1:
                wl_id = _split_patent_id(fields[1])
                if wl_id is not None:
                    entry = wl_id[0] + '.' + wl_id[1]
                    if entry not in whitelisted:
                        whitelisted.add(entry)
                        whitelist.append(entry)

            patent = _split_patent_id(fields[0])
            if patent is None:
                # Keep processing the remaining lines instead of aborting.
                print("Skipping malformed line: " + line)
                continue
            country, patnum = patent
            name = country + "." + patnum
            print("Downloading search report for " + name)

            # Has the search report already been downloaded?
            if glob.glob(config.searchreport_dir + name + "*.pdf"):
                continue

            meta = download.get_meta_data(country, patnum)
            if meta:
                download.get_searchreport_pdf(meta, config.searchreport_dir + name)
            else:
                error_file.write(name + "\n")

    with open('whitelist.txt', 'w') as whitelist_file:
        whitelist_file.write('\n'.join(whitelist))