Esempio n. 1
0
                    result = read_url(url)
                    pdfout = open(pdf_location, 'wg')
                    pdfout.write(result)
                    pdfout.close()
                    
                    print "Sleeping %s seconds" % (SCRAPE_DELAY_TIME)
                    sleep(SCRAPE_DELAY_TIME)

                except urllib2.HTTPError:
                    print "MISSING %s" % url
                    continue
            
            # get pdfinfo data
            pdf_data = None
            try:
                pdf_data = pdfinfo(pdf_location)
            except subprocess.CalledProcessError:
                # if the file is totally broken sometimes we'll get this--just continue.
                pass
            print pdf_data
            # Data should look like: {'Tagged': 'no', 'Producer': 'ReportBuilder', 'Creator': '', 'Encrypted': 'no', 'Author': '', 'File size': '20931 bytes', 'Optimized': 'no', 'PDF version': '1.3', 'Title': '', 'Page size': '612 x 792.003 pts (letter)', 'CreationDate': 'Wed Jun  4 06:24:34 2014', 'Pages': '2'}
            txt_location = TXT_DIR + fcc_id + ".txt"
            row['txt_location'] = txt_location
            if not os.path.isfile(txt_location):
                print "converting to text"
                # assumes we can execute pdflayout
                cmd = "pdftotext -layout %s %s" % (pdf_location, txt_location)
                print "Running cmd: " + cmd
                # use the less problematic older style of shell execution; assumes access to pdftotext from whatever is getting called.
                os.system(cmd)
            
Esempio n. 2
0
                    result = read_url(url)
                    pdfout = open(pdf_location, 'wg')
                    pdfout.write(result)
                    pdfout.close()

                    print "Sleeping %s seconds" % (SCRAPE_DELAY_TIME)
                    sleep(SCRAPE_DELAY_TIME)

                except urllib2.HTTPError:
                    print "MISSING %s" % url
                    continue

            # get pdfinfo data
            pdf_data = None
            try:
                pdf_data = pdfinfo(pdf_location)
            except subprocess.CalledProcessError:
                # if the file is totally broken sometimes we'll get this--just continue.
                pass
            print pdf_data
            # Data should look like: {'Tagged': 'no', 'Producer': 'ReportBuilder', 'Creator': '', 'Encrypted': 'no', 'Author': '', 'File size': '20931 bytes', 'Optimized': 'no', 'PDF version': '1.3', 'Title': '', 'Page size': '612 x 792.003 pts (letter)', 'CreationDate': 'Wed Jun  4 06:24:34 2014', 'Pages': '2'}
            txt_location = TXT_DIR + fcc_id + ".txt"
            row['txt_location'] = txt_location
            if not os.path.isfile(txt_location):
                print "converting to text"
                # assumes we can execute pdflayout
                cmd = "pdftotext -layout %s %s" % (pdf_location, txt_location)
                print "Running cmd: " + cmd
                # use the less problematic older style of shell execution; assumes access to pdftotext from whatever is getting called.
                os.system(cmd)
fh = open(MANIFEST_LOCATION, 'w')
# remote filepath is empty here. 
fieldnames = ['id', 'local_filepath', 'remote_filepath', 'Tagged', 'Producer', 'Creator', 'Encrypted', 'Author', 'Filesize', 'Optimized', 'PDFversion', 'Title', 'Pagesize', 'CreationDate', 'Pages']
fh.write(",".join(fieldnames) + "\n")
dictwriter = csv.DictWriter(fh, fieldnames=fieldnames, restval='', extrasaction='ignore')


for d, _, files in os.walk(PDF_DIR):

    for i, this_file in enumerate(files):
        files_found += 1
        file_path = PDF_DIR + "/" + this_file
        
        # avoid id number of zero
        row = {'local_filepath':file_path, 'id':i+1}
        
        # get pdfinfo data
        pdf_data = None
        try:
            pdf_data = pdfinfo(file_path)
        except subprocess.CalledProcessError:
            # Sometimes this happens on weird pdfs. Just note it and keep going.
            print "WARNING: couldn't run pdfinfo on %s" % (pdf_data)
            
        if pdf_data:
            result_dict = dict(row.items() + pdf_data.items())
        else:
            result_dict = row
        
        dictwriter.writerow(result_dict)