Esempio n. 1
0
    def pdf_thread(self, url):
        """Download the PDF at *url*, save it to disk and pass it to ``self.pdf``.

        The file name is the last ``/``-separated segment of *url* reduced to
        a whitelist of filesystem-safe characters.  The download is written to
        ``self.document_folder + <name>``; that path is also stored in
        ``self.pdf_path``.  Along the way a report row is built as four
        ``[header, value]`` pairs: line number, URL, file name (or the error
        text when the download failed) and file path.

        If the saved file cannot be opened, a single row is appended to the
        CSV report at ``self.report_folder + self.report_name`` and the method
        returns.  Otherwise the open file and the row are handed to
        ``self.pdf``; any exception raised there is printed, not propagated.

        Args:
            url: Address of the PDF to fetch.

        Returns:
            None.
        """
        pdf_name = ''
        # Kept in a local so the path reported below always belongs to this
        # call, even when the failure happens before ``self.pdf_path`` is set.
        pdf_path = ''
        line_number = str(self.line_count)
        url_cell = url if url else 'NULL'

        # Save the PDF to disk.
        try:
            # Keep only whitelisted characters.  The whitelist must be used as
            # a character set: compiled as a regex pattern it never matches,
            # so nothing would be stripped from the name.
            valid_chars = frozenset(
                "-_.() " + string.ascii_letters + string.digits)
            pdf_name = ''.join(
                ch for ch in url.split("/")[-1] if ch in valid_chars)
            pdf_path = self.document_folder + pdf_name
            self.pdf_path = pdf_path
            # A timeout keeps a stalled server from blocking this worker
            # forever; the resulting error is recorded like any other.
            r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'},
                             timeout=60)
            with open(pdf_path, 'wb') as code:
                code.write(r.content)
            csv_row = [
                [self.csv_header[0], line_number],
                [self.csv_header[1], url_cell],
                [self.csv_header[2], pdf_name if pdf_name else 'NULL'],
                [self.csv_header[3], pdf_path if pdf_path else 'NULL'],
            ]
            # replace(microsecond=0) drops the fractional part reliably;
            # slicing the string loses the seconds when microsecond == 0.
            print(' >>>> PDF START:[' + url + '] ' + line_number + ' ' +
                  str(datetime.datetime.now().replace(microsecond=0)))
        except Exception as e:
            # Deliberately broad: whatever went wrong with the download is
            # recorded in the report row instead of killing the worker.
            csv_row = [
                [self.csv_header[0], line_number],
                [self.csv_header[1], url_cell],
                [self.csv_header[2], str(e)],
                [self.csv_header[3], pdf_path if pdf_path else 'NULL'],
            ]
            print(e)

        my_file = self.document_folder + pdf_name
        try:
            fp = open(my_file, 'rb')
        except (OSError, ValueError):
            print('     PDF LOAD FAILED !!! ' + line_number + ' :  ' + pdf_path)
            csv_row[3] = [
                self.csv_header[3],
                ('PDF FAILED TO OPEN:' + pdf_path) if pdf_path else 'NULL'
            ]
            # Write results: the four leading cells followed by 22 empty
            # cells for the columns that were never filled in.
            row = [cell[1] for cell in csv_row]
            row[4:4] = [''] * 22
            report_path = self.report_folder + self.report_name
            # newline='' lets the csv module control line endings itself.
            with open(report_path, 'a', encoding='utf8',
                      newline='') as csv_file:
                writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
                writer.writerow(row)
            return

        # Close the handle once processing is done, whatever the outcome.
        with fp:
            try:
                self.pdf(fp, csv_row)
            except Exception as e:
                print('PDF FAIL')
                print(e)