def test_all():
    with pytest.raises(pdfx.exceptions.FileNotFoundError):
        pdfx.PDFx("asd")
    with pytest.raises(pdfx.exceptions.DownloadError):
        pdfx.PDFx("http://invalid.com/404.pdf")
    with pytest.raises(pdfx.exceptions.PDFInvalidError):
        pdfx.PDFx(os.path.join(curdir, "pdfs/invalid.pdf"))
    pdf = pdfx.PDFx(os.path.join(curdir, "pdfs/valid.pdf"))
    urls = pdf.get_references(reftype="pdf")
    assert len(urls) == 18
def test_all():
    with pytest.raises(pdfx.exceptions.FileNotFoundError):
        pdfx.PDFx("asd")
    with pytest.raises(pdfx.exceptions.DownloadError):
        pdfx.PDFx("http://invalid.com/404.pdf")
    with pytest.raises(pdfx.exceptions.PDFInvalidError):
        pdfx.PDFx(os.path.join(curdir, "pdfs/invalid.pdf"))
    pdf = pdfx.PDFx(os.path.join(curdir, "pdfs/valid.pdf"))
    pdf.analyze_text()
    urls = pdf.get_urls(pdf_only=True)
    assert len(urls) == 17
def get_files(folder_id):
    files_metadata = get_folder_contents(folder_id)
    if len(files_metadata) == 1:
        # Download the pdf containing the URL to the actual files
        files_id = files_metadata[0]['id']
        actually_download_file(files_id, 'readme.pdf')
        pdf = pdfx.PDFx('readme.pdf')
        links_list = pdf.get_references_as_dict()['url']
        csvs_link = [i for i in links_list if i.startswith('https://bit.ly/')]
        print("Today's source files are stored in: " + csvs_link[0])
        return_folder_id(csvs_link[0])
    else:
        # Download the desired csv's
        wanted_list = [
            'Case Information',
            'DOH Data Collect - Daily Report',
            'Testing Aggregates',
        ]
        for item in files_metadata:
            filename = item['name']
            for wanted in wanted_list:
                if wanted in filename:
                    new_name = datetime.strptime(
                        filename[21:29], '%Y%m%d'
                    ).strftime('%Y-%m-%d_') + wanted + '.csv'
                    actually_download_file(item['id'], new_name)
        print('Your raw data download only took {0:0.1f} seconds. '
              'See you tomorrow!'.format(time() - start_timer))
def extract_courses_from_transcript(userId, pdf_file):
    file_name = '{}.pdf'.format(userId)
    pdf_file.save(file_name)
    # Round-trip the file through Pdf to normalize it before extraction
    pdf = Pdf.open(file_name)
    pdf.save(file_name)
    pdf2 = pdfx.PDFx(file_name)
    text = pdf2.get_text()
    course_matches = re.findall(
        r'(?<=Course)([\S\s]*?)(?=Description|Term GPA)', text)
    all_courses = []
    for match in course_matches:
        # Each match lists all course names followed by all course codes
        data = match.split()
        course_names = data[:len(data) // 2]
        course_codes = data[len(data) // 2:]
        for name, code in zip(course_names, course_codes):
            all_courses.append('{}{}'.format(name, code))
    os.remove(file_name)
    return all_courses
import base64
import json
import urllib.parse
import urllib.request

import pdfx


def bing_search(query, search_type):
    # search_type: Web, Image, News, Video
    key = 'Jn9NxuJ85uhnJm9LIJyW5bXt+1xK6ysiTWjwNwmk2sM'
    query = urllib.parse.quote(query)
    # Create the credentials for HTTP Basic authentication
    user_agent = ('Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; '
                  'Trident/4.0; FDM; .NET CLR 2.0.50727; InfoPath.2; '
                  '.NET CLR 1.1.4322)')
    credentials = base64.b64encode((':%s' % key).encode()).decode()
    auth = 'Basic %s' % credentials
    url = ('https://api.datamarket.azure.com/Data.ashx/Bing/Search/'
           + search_type + '?Query=%27' + query + '%27&$top=5&$format=json')
    request = urllib.request.Request(url)
    request.add_header('Authorization', auth)
    request.add_header('User-Agent', user_agent)
    response = urllib.request.urlopen(request)
    json_result = json.loads(response.read())
    result_list = json_result['d']['results']
    # Save the text of each result to its own numbered file
    for count, result in enumerate(result_list):
        print(result['Url'])
        pdf = pdfx.PDFx(result['Url'])
        with open(str(count + 1) + '.text', 'w',
                  encoding='utf8', errors='replace') as f:
            f.write(pdf.get_text())
    return result_list
def run(self):
    global PDFX_INSTANCES
    for uri in self.pdf_uris:
        def signal_item_extract_page(curpage):
            self.signal_item_extract_page.emit(uri, curpage)

        print("Opening %s..." % uri)
        self.signal_item_start.emit(uri)
        try:
            PDFX_INSTANCES[uri] = pdfx.PDFx(uri, signal_item_extract_page)
        except pdfx.exceptions.FileNotFoundError:
            print("File not found")
            self.signal_item_error.emit(uri, ERROR_FILE_NOT_FOUND)
            continue
        except pdfx.exceptions.DownloadError:
            print("Download error")
            self.signal_item_error.emit(uri, ERROR_DOWNLOAD)
            continue
        except pdfx.exceptions.PDFInvalidError:
            print("PDF invalid error")
            self.signal_item_error.emit(uri, ERROR_PDF_INVALID)
            continue
        self.signal_item_finished.emit(uri)
    self.signal_finished.emit()
def _getData(self):
    # Ensure we only do work when we need to
    if self.data:
        return
    element_tree = etree.XML(etree.tostring(self._element_tree))
    arxiv_id = element_tree.xpath("//*[local-name()='id']/text()")[0]
    self.arxiv_id = arxiv_id
    arxiv_abstract = element_tree.xpath(
        "//*[local-name()='abstract']/text()")[0]
    arxiv_pdf_link = self.get_pdf_link_from_arxiv_id(arxiv_id)
    arxiv_pdf_contents = pdfx.PDFx(arxiv_pdf_link)
    arxiv_pdf_contents_text = arxiv_pdf_contents.get_text()
    arxiv_pdf_contents_references = arxiv_pdf_contents.get_references_as_dict()
    arxiv_pdf_contents_metadata = arxiv_pdf_contents.get_metadata()
    _data = {
        "id": arxiv_id,
        "abstract": arxiv_abstract,
        "text": arxiv_pdf_contents_text,
        "references": arxiv_pdf_contents_references,
        "metadata": arxiv_pdf_contents_metadata,
        "recovery_id": self.thread_id,
    }
    self.data = True
    print('thread about to queue data')
    self.queue.put(_data)
    print("THREAD CLEAN-UP: " + self.arxiv_id)
def download(report, username, password):
    session = create_olat_session(username, password)
    if report is None:
        click.echo("Please select the exported report pdf")
        root = tk.Tk()
        root.withdraw()
        report = filedialog.askopenfilename()
    click.echo("Reading report from %s" % report)
    # Extract the links from the pdf
    pdf = pdfx.PDFx(report)
    references_dict = pdf.get_references_as_dict()
    data = list(references_dict.values())
    merged_data = [j for i in data for j in i]
    filtered_data = filter(check_file_url, merged_data)
    if not os.path.exists('downloads'):
        os.makedirs('downloads')
    for url in filtered_data:
        file_name = url.rsplit('/', 1)[1]
        if os.path.isfile('downloads/' + file_name):
            click.echo("Skipping %s - file exists on disk" % url)
        else:
            get_olat_file(session, url, file_name)
def extract_text(file_path):
    """Parse text from a PDF, returning (text, page_count)."""
    text = None
    page_count = 0
    try:
        pdf_meta = pdfx.PDFx(file_path)
        meta = pdf_meta.get_metadata()
        page_count = meta["Pages"]
        # Split the text into sections at blank lines
        buf = []
        grafs = []
        for line in pdf_meta.get_text().split("\n"):
            line = line.strip()
            buf.append(line)
            if len(line) < 1:
                section = " ".join(buf).strip().replace("- ", "") + "\n"
                grafs.append(section)
                buf = []
        text = "\n".join(grafs)
    except Exception:
        print(f"ERROR parsing {file_path}")
        traceback.print_exc()
    finally:
        return text, page_count
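# A minimal usage sketch for extract_text above; 'report.pdf' is a
# hypothetical local path.
text, page_count = extract_text('report.pdf')
print('%d pages extracted' % page_count)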
def extract_pdfx(filename):
    import pdfx
    pdf = pdfx.PDFx(filename)
    metadata = pdf.get_metadata()
    references_list = pdf.get_references()
    references_dict = pdf.get_references_as_dict()
    text = pdf.get_text()
    print(text)
def extract_urls_from_pdf(file_url):
    """Load a pdf file, scrape its content, and return a list of valid urls."""
    urls = []
    pdf = pdfx.PDFx(file_url)
    references = pdf.get_references_as_dict()
    for url in references.get('url', []):
        if url.startswith(('http', 'www')):
            urls.append(url)
    return urls
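# A minimal usage sketch for extract_urls_from_pdf above; 'paper.pdf' is a
# hypothetical path (pdfx.PDFx also accepts a URL to a PDF).
for url in extract_urls_from_pdf('paper.pdf'):
    print(url)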
def extract_metadata(filename):
    pdf = pdfx.PDFx(filename)
    try:
        doi = pdf.get_metadata()['dc']['identifier']
        print(doi)
        return doi
    except KeyError:
        print("Filename {} has no DOI".format(filename))
        return None
def main(filename, outdir):
    """Extract the urls from a PDF file and download them to outdir."""
    try:
        pdf_extractor = pdfx.PDFx(uri=filename)
    except pdfx.exceptions.FileNotFoundError:
        print("File not found: {}".format(filename), file=sys.stderr)
    else:
        pdf_text = pdf_extractor.get_text()
        urls = get_url_from_pdf(pdf_text=pdf_text)
        download_urls(urls, dest=outdir)
def process_file(self, originalPdfPath):
    print("\n ---- Preparing to process %s ---- "
          % self._get_file_name_from_path(originalPdfPath))
    self.pdf = pdfx.PDFx(originalPdfPath)
    # Download the referenced files to the working directory
    self.pdf.download_pdfs(self.working_folder)
    # Loop through the downloaded pdfs, concatenating
    # them with the original
    self._merge_downloaded_files_with_original(originalPdfPath)
def downloader(url_set, pdf_path):
    print('Downloading pdfs to new SAR_PDFs folder')
    for url in url_set:
        print(url)
        # Only handle urls that end in .pdf (case-insensitive)
        if url.lower().endswith('.pdf'):
            try:
                url = url.replace(' ', '%20', 1)
                pdf = pdfx.PDFx(url)
                pdf.download_pdfs(pdf_path)
                print(url)
            except Exception:
                pass
def pdf_references(pdf_url, return_type="dict"):
    pdf = pdfx.PDFx(pdf_url)
    if return_type is None or return_type == "list":
        return pdf.get_references()
    elif return_type == "dict":
        return pdf.get_references_as_dict()
    else:
        print("The given return type, '%s', is not available... "
              "Returning a dictionary instead..." % return_type)
        return pdf_references(pdf_url, return_type="dict")
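# A minimal usage sketch for pdf_references above; the URL is hypothetical.
# As in the other examples here, each reference object exposes the raw
# link via its .ref attribute.
refs = pdf_references('https://example.com/paper.pdf', return_type='list')
for ref in refs:
    print(ref.ref)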
def extractURLsFromPDFs(papers):
    links_in_papers = {}
    for i, paper in enumerate(papers, 1):
        sys.stderr.write(str(i) + " extracted: " + paper + "\n")
        try:
            pdf = pdfx.PDFx(paper)
            set_of_urls = pdf.get_references()
            links_in_papers[paper] = [e.ref for e in set_of_urls]
        except UnicodeDecodeError:
            sys.stderr.write("This file has a UnicodeDecodeError!\n")
    return links_in_papers
def downloadRegistration(self):
    # Downloads the pdf referenced in 'result.json'
    period = self.getCurrentPeriod()
    data = self.data
    if not os.path.exists(OUTPUT_PATH):
        os.mkdir(OUTPUT_PATH)
    for item in data.body:
        if period in item['text']:
            # Fix the '%09' url bug
            url = PdfReader.fixURL(item['url'])
            # Derive the filename from the url
            file_name = url.rsplit('/', 1)[-1].replace('.pdf', '')
            # Download the pdf unless it is already on disk
            if not os.path.isfile(f'{DOWNLOAD_PATH}{file_name}.pdf'):
                pdf = pdfx.PDFx(url)
                pdf.download_pdfs(DOWNLOAD_PATH)
            return file_name
def post(self, request):
    ufile = request.FILES.get('file')
    if ufile:
        file_type = ufile.name.split('.')[-1]
        if file_type == 'pdf':
            fs = FileSystemStorage()
            filename = fs.save(ufile.name, ufile)
            pdf = pdfx.PDFx(filename)
            urls = re.findall(URL_PATTERN, pdf.get_text())
            filestorage = FilesStorage.objects.create(filename=filename)
            for url in urls:
                f, _ = FoundLinks.objects.get_or_create(url=url)
                f.filename.add(filestorage)
            fs.delete(ufile.name)
            return HttpResponse('Ok')
    return HttpResponseForbidden()
def pdf_extract(dirs):
    """Take the path to the file and the filename as a tuple, and save the
    text and references extracted from the PDF file to txt_path.

    dirs = ("pdf_data/", "filename.pdf")
    """
    print("extracting")
    paths, filename = dirs
    file_ = filename.replace(".pdf", ".txt")
    file_json = filename.replace(".pdf", ".json")
    if file_ in have:
        print("file already extracted!!")
    else:
        print("reading pdf file", filename)
        cmd_text_extractor = "pdfx %s -t -o %s" % (
            os.path.join(paths, filename), txt_path + file_)
        pdf = pdfx.PDFx(os.path.join(paths, filename))
        references_dict = pdf.get_references_as_dict()
        print("extracted references of:", file_)
        os.system(cmd_text_extractor)
        print("extracted pdf file:", file_)
        with open(ref_path + file_json, 'w') as fp:
            json.dump(references_dict, fp)
        print("saved references json to:", file_json)
        time.sleep(0.01)
# pdf-meta-extractor.py
import json

import pdfx

user_pdf = input("Please copy and paste the pdf here (ensure that the file "
                 "to be analyzed is placed in the same folder as this "
                 "python file!): \t")
pdf = pdfx.PDFx(str(user_pdf))
print('Analyzing PDF...')
meta = pdf.get_metadata()
url = pdf.get_references_as_dict()

with open('pdf-metadata.txt', 'w') as pdf_data:
    pdf_data.write('\nPDF Metadata \t')
    pdf_data.write(json.dumps(meta))
    pdf_data.write('\n')
    # Write each reference type together with its references
    for reftype, refs in url.items():
        pdf_data.write('%s: %s\n' % (reftype, ', '.join(refs)))
# %% Import the module for pdf files
import pdfx

# %% Read the pdf and create a dictionary and a list
# Read the pdf file
pdf = pdfx.PDFx(
    "//ZEUS/mmb/molecular_ecology/mollab_team/Sequencing/ngs_sequencing/project_administration/NIOZ330_NIOZ331/NIOZ330_331_raw_data_report_HN00171340.pdf"
)
# Get the urls from the pdf file as a dictionary
links_dict = pdf.get_references_as_dict()
# Convert the dictionary to a list; because the values are themselves
# lists, the urls live in links_list[0]
links_list = list(links_dict.values())
# Check what will be printed to the file
for element in links_list[0]:
    print(element)

# %% Transfer the list to a txt file
with open("download_links.txt", "w") as txt_file:
    # Write the list to the txt file, one url per line
    for element in links_list[0]:
        txt_file.write(element + '\n')
def main():
    parser = argparse.ArgumentParser(
        description="Get infos and links from a PDF, and optionally "
        "download all referenced PDFs.\nSee "
        "http://www.metachris.com/pdfx for more information.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="")
    parser.add_argument("pdf", help="Filename or URL of a PDF file")
    parser.add_argument(
        "-d", "--download-pdfs", metavar="OUTPUT_DIRECTORY",
        help="Download all referenced PDFs into specified directory")
    parser.add_argument("-j", "--json", action='store_true',
                        help="Output infos as json (instead of plain text)")
    parser.add_argument("-v", "--verbose", action="count", default=0,
                        help="Print all urls (instead of only PDF urls)")
    parser.add_argument("--debug", action='store_true',
                        help="Output debug infos")
    parser.add_argument("--version", action="version",
                        version="%(prog)s (version {version})".format(
                            version=pdfx.__version__))
    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(levelname)s - %(module)s - %(message)s')

    try:
        pdf = pdfx.PDFx(args.pdf)
    except pdfx.exceptions.FileNotFoundError as e:
        exit_with_error(ERROR_FILE_NOT_FOUND, str(e))
    except pdfx.exceptions.DownloadError as e:
        exit_with_error(ERROR_DOWNLOAD, str(e))
    except pdfx.exceptions.PDFInvalidError as e:
        exit_with_error(ERROR_PDF_INVALID, str(e))

    # Print the metadata
    if not args.json:
        print("Document infos:")
        for k, v in sorted(pdf.get_metadata().items()):
            if v:
                print("- %s = %s" % (k, str(v).strip("/")))

    # Analyze the PDF text
    try:
        pdf.analyze_text()
    except pdfx.exceptions.PDFExtractionError as e:
        exit_with_error(ERROR_COULD_NOT_EXTRACT_PDF, str(e))

    if not args.json:
        if args.verbose == 0:
            urls = pdf.get_urls(pdf_only=True)
            print("\n%s PDF URLs:" % len(urls))
        else:
            urls = pdf.get_urls(pdf_only=False)
            print("\n%s URLs:" % len(urls))
        for url in urls:
            print("- %s" % url)

    try:
        if args.download_pdfs:
            if not args.json:
                print("\nDownloading %s pdfs to '%s'..."
                      % (len(pdf.urls_pdf), args.download_pdfs))
            pdf.download_pdfs(args.download_pdfs)
            print("All done!")
    except Exception as e:
        exit_with_error(ERROR_DOWNLOAD, str(e))

    if args.json:
        print(json.dumps(pdf.summary, indent=2))
import pdfx
import spacy
import pandas as pd

pdf = pdfx.PDFx("Path of the pdf file")
text = pdf.get_text()

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Build a token-level dataframe
cols = ("text", "lemma", "POS", "explain", "stopword")
rows = []
for t in doc:
    row = [t.text, t.lemma_, t.pos_, spacy.explain(t.pos_), t.is_stop]
    rows.append(row)
df = pd.DataFrame(rows, columns=cols)

# Print the named entities
for ent in doc.ents:
    print(ent.text, ent.label_)
def test_two_pdfs():
    # See https://github.com/metachris/pdfx/issues/14
    pdfx.PDFx(os.path.join(curdir, "pdfs/i14doc1.pdf"))
    pdf_2 = pdfx.PDFx(os.path.join(curdir, "pdfs/i14doc2.pdf"))
    assert len(pdf_2.get_references()) == 2
import sys

import pdfx

if len(sys.argv) - 1 < 1:
    print('I need the pdf file as a parameter')
    exit(1)

for pdffile in sys.argv[1:]:
    print('Processing', pdffile)
    pdf = pdfx.PDFx(pdffile)
    metadata = pdf.get_metadata()
    references_list = pdf.get_references()
    references_dict = pdf.get_references_as_dict()

    print('Listing the references')
    for reference in references_list:
        print(reference)

    print('Listing the references dict')
    for reference in references_dict:
        print('Reference:', reference)
        counter = 1
        for listurls in references_dict[reference]:
            print(counter, '>', listurls)
            counter += 1
def extract_urls_from_pdf(filename: str) -> t.Set[str]:
    pdf = pdfx.PDFx(filename)
    text = pdf.get_text()
    return extract_urls_from_text(text)
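# extract_urls_from_text is not shown in the example above; this is a
# minimal sketch of such a helper, assuming a simple regex over the raw
# text is acceptable.
import re
import typing as t

def extract_urls_from_text(text: str) -> t.Set[str]:
    # Collect every http/https link found in the text
    return set(re.findall(r'https?://\S+', text))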
import numpy as np
import pdfx

pdfs = ["2016_all.pdf", "2017_1.pdf", "2017_2.pdf", "2017_3.pdf"]
urls = []
for pdf in pdfs:
    pdfObj = pdfx.PDFx('miccai/' + pdf)
    references_list = pdfObj.get_references()
    urls.extend(
        str(x.ref) for x in references_list if "springer" not in str(x.ref))

np.save("miccaiUrls.npy", urls)
def extractPage(string):
    return re.split(r'Página \d de \d', string)

def removeHoras(string):
    return re.sub(r'(\d+ h \d+ min)', '', string)

def removeExcessLines(string):
    return re.sub(r'(\n){2,}', '\n', string)

for prof in profs:
    try:
        pdf = pdfx.PDFx('Docentes/' + prof.name + '/2018 - 2.pdf')
    except Exception as e:
        print(e)
        print(f'{prof.name} has no information')
        continue
    pages = extractPage(pdf.get_text())
    pages[0] = removeHoras(pages[0])
    pages[0] = removeExcessLines(pages[0])
    print(pages[0])
    print('================================')
def extractLink(path):
    pdf = pdfx.PDFx(path)
    urls = pdf.get_references_as_dict()
    if 'url' in urls:
        return urls['url']
    return []
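# A minimal usage sketch for extractLink above; 'report.pdf' is a
# hypothetical path.
links = extractLink('report.pdf')
print('%d url references found' % len(links))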