result = read_url(url) pdfout = open(pdf_location, 'wg') pdfout.write(result) pdfout.close() print "Sleeping %s seconds" % (SCRAPE_DELAY_TIME) sleep(SCRAPE_DELAY_TIME) except urllib2.HTTPError: print "MISSING %s" % url continue # get pdfinfo data pdf_data = None try: pdf_data = pdfinfo(pdf_location) except subprocess.CalledProcessError: # if the file is totally broken sometimes we'll get this--just continue. pass print pdf_data # Data should look like: {'Tagged': 'no', 'Producer': 'ReportBuilder', 'Creator': '', 'Encrypted': 'no', 'Author': '', 'File size': '20931 bytes', 'Optimized': 'no', 'PDF version': '1.3', 'Title': '', 'Page size': '612 x 792.003 pts (letter)', 'CreationDate': 'Wed Jun 4 06:24:34 2014', 'Pages': '2'} txt_location = TXT_DIR + fcc_id + ".txt" row['txt_location'] = txt_location if not os.path.isfile(txt_location): print "converting to text" # assumes we can execute pdflayout cmd = "pdftotext -layout %s %s" % (pdf_location, txt_location) print "Running cmd: " + cmd # use the less problematic older style of shell execution; assumes access to pdftotext from whatever is getting called. os.system(cmd)
# Build the manifest CSV: one row per file found under PDF_DIR, combining the
# file's location with whatever metadata pdfinfo() reports for it.
# NOTE(review): fh is not closed in the code visible here -- confirm that it is
# closed (or flushed) further down before anything reads the manifest.
fh = open(MANIFEST_LOCATION, 'w')

# remote filepath is empty here.
fieldnames = [
    'id', 'local_filepath', 'remote_filepath', 'Tagged', 'Producer',
    'Creator', 'Encrypted', 'Author', 'Filesize', 'Optimized', 'PDFversion',
    'Title', 'Pagesize', 'CreationDate', 'Pages',
]
fh.write(",".join(fieldnames) + "\n")
# restval='' leaves missing columns blank; extrasaction='ignore' drops any
# metadata key that is not one of the fieldnames above.
dictwriter = csv.DictWriter(fh, fieldnames=fieldnames, restval='', extrasaction='ignore')

# Running id for the whole walk. enumerate(files) restarts in every directory
# that os.walk yields, which produced duplicate ids as soon as PDF_DIR had
# subdirectories.
manifest_id = 0
for d, _, files in os.walk(PDF_DIR):
    for this_file in files:
        files_found += 1
        # avoid id number of zero
        manifest_id += 1

        # Join against the directory currently being walked rather than
        # PDF_DIR so files in subdirectories get a path that exists. For the
        # top-level directory d is PDF_DIR itself, so those paths are unchanged.
        file_path = d + "/" + this_file
        row = {'local_filepath': file_path, 'id': manifest_id}

        # get pdfinfo data
        pdf_data = None
        try:
            pdf_data = pdfinfo(file_path)
        except subprocess.CalledProcessError:
            # Sometimes this happens on weird pdfs. Just note it and keep going.
            # Report the path: pdf_data is still None here, so printing it
            # (as before) never identified the offending file.
            print("WARNING: couldn't run pdfinfo on %s" % (file_path))

        result_dict = dict(row)
        if pdf_data:
            # pdfinfo-style keys such as 'File size', 'PDF version' and
            # 'Page size' contain spaces, while the manifest columns are
            # spelled without them; unnormalised, those values were silently
            # discarded by extrasaction='ignore'. Stripping spaces is a no-op
            # for keys that already match.
            # NOTE(review): assumes pdfinfo() returns a dict with string keys
            # -- confirm against the helper.
            for key, value in pdf_data.items():
                result_dict[key.replace(' ', '')] = value
        dictwriter.writerow(result_dict)