Example #1
def test_all():
    with pytest.raises(pdfx.exceptions.FileNotFoundError):
        pdfx.PDFx("asd")

    with pytest.raises(pdfx.exceptions.DownloadError):
        pdfx.PDFx("http://invalid.com/404.pdf")

    with pytest.raises(pdfx.exceptions.PDFInvalidError):
        pdfx.PDFx(os.path.join(curdir, "pdfs/invalid.pdf"))

    pdf = pdfx.PDFx(os.path.join(curdir, "pdfs/valid.pdf"))
    urls = pdf.get_references(reftype="pdf")
    assert len(urls) == 18
Example #2
def test_all():
    with pytest.raises(pdfx.exceptions.FileNotFoundError):
        pdfx.PDFx("asd")

    with pytest.raises(pdfx.exceptions.DownloadError):
        pdfx.PDFx("http://invalid.com/404.pdf")

    with pytest.raises(pdfx.exceptions.PDFInvalidError):
        pdfx.PDFx(os.path.join(curdir, "pdfs/invalid.pdf"))

    pdf = pdfx.PDFx(os.path.join(curdir, "pdfs/valid.pdf"))
    pdf.analyze_text()
    urls = pdf.get_urls(pdf_only=True)
    assert len(urls) == 17
Example #3
def get_files(folder_id):
    files_metadata = get_folder_contents(folder_id)

    if len(files_metadata) == 1:  # dl the pdf containing the URL to actual files
        files_id = files_metadata[0]['id']
        actually_download_file(files_id, 'readme.pdf')
        pdf = pdfx.PDFx('readme.pdf')
        links_list = pdf.get_references_as_dict()['url']
        csvs_link = [i for i in links_list if i.startswith('https://bit.ly/')]
        print("Today's source files are stored in: " + csvs_link[0])
        return_folder_id(csvs_link[0])

    else:  # dl desired csv's
        wanted_list = [
            'Case Information', 'DOH Data Collect - Daily Report',
            'Testing Aggregates'
        ]
        for item in files_metadata:
            filename = item['name']
            for wanted in wanted_list:
                if wanted in filename:
                    new_name = datetime.strptime(
                        filename[21:29],
                        '%Y%m%d').strftime('%Y-%m-%d_') + wanted + '.csv'
                    actually_download_file(item['id'], new_name)
        print(
            'Your raw data download only took {0:0.1f} seconds. See you tomorrow!'
            .format(time() - start_timer))
Example #4
def extract_courses_from_transcript(userId, pdf_file):
    file_name = '{}.pdf'.format(userId)

    pdf_file.save(file_name)

    pdf = Pdf.open(file_name)
    pdf.save(file_name)

    pdf2 = pdfx.PDFx(file_name)
    text = pdf2.get_text()

    course_matches = re.findall(r'(?<=Course)([\S\s]*?)(?=Description|Term GPA)', text)

    all_courses = []

    for i in range(len(course_matches)):
        data = course_matches[i].split()
        course_names = data[:len(data)//2]
        course_codes = data[len(data)//2:]
        courses = []
        for j in range(len(data)//2):
            courses.append('{}{}'.format(course_names[j], course_codes[j]))
        all_courses.extend(courses)
    
    os.remove(file_name)

    return all_courses
Example #5
def bing_search(query, search_type):

    #search_type: Web, Image, News, Video
    key = 'Jn9NxuJ85uhnJm9LIJyW5bXt+1xK6ysiTWjwNwmk2sM'
    query = urllib.quote(query)
    # create credential for authentication
    user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; FDM; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 1.1.4322)'
    credentials = (':%s' % key).encode('base64')[:-1]
    auth = 'Basic %s' % credentials
    url = 'https://api.datamarket.azure.com/Data.ashx/Bing/Search/' + search_type + '?Query=%27' + query + '%27&$top=5&$format=json'
    request = urllib2.Request(url)
    request.add_header('Authorization', auth)
    request.add_header('User-Agent', user_agent)
    request_opener = urllib2.build_opener()
    response = request_opener.open(request)
    response_data = response.read()
    json_result = json.loads(response_data)
    result_list = json_result['d']['results']
    #print result_list

    count = 0
    for result in result_list:
        print(result['Url'])
        pdf = pdfx.PDFx(result['Url'])
        pdf_text = pdf.get_text().encode('utf8', 'replace')
        count += 1
        # write each result's extracted text to its own numbered file
        with open(str(count) + ".text", 'w') as f:
            f.write(pdf_text)
        #print(pdf.get_text())

    return result_list
Example #6
    def run(self):
        global PDFX_INSTANCES
        for uri in self.pdf_uris:

            def signal_item_extract_page(curpage):
                self.signal_item_extract_page.emit(uri, curpage)

            print("Opening %s..." % uri)
            self.signal_item_start.emit(uri)

            try:
                PDFX_INSTANCES[uri] = pdfx.PDFx(uri, signal_item_extract_page)
            except pdfx.exceptions.FileNotFoundError as e:
                print("File not found")
                self.signal_item_error.emit(uri, ERROR_FILE_NOT_FOUND)
                continue
            except pdfx.exceptions.DownloadError as e:
                print("Download error")
                self.signal_item_error.emit(uri, ERROR_DOWNLOAD)
                continue
            except pdfx.exceptions.PDFInvalidError as e:
                print("PDF invalid error")
                self.signal_item_error.emit(uri, ERROR_PDF_INVALID)
                continue

            # sleep(3)
            self.signal_item_finished.emit(uri)
        self.signal_finished.emit()
Example #7
    def _getData(self):
        # ensure we only do work when we need to
        if self.data:
            return

        element_tree = etree.XML(etree.tostring(self._element_tree))
        arxiv_id = element_tree.xpath("//*[local-name()='id']/text()")[0]
        self.arxiv_id = arxiv_id
        arxiv_abstract = element_tree.xpath(
            "//*[local-name()='abstract']/text()")[0]

        arxiv_pdf_link = self.get_pdf_link_from_arxiv_id(arxiv_id)
        arxiv_pdf_contents = pdfx.PDFx(arxiv_pdf_link)

        arxiv_pdf_contents_text = arxiv_pdf_contents.get_text()
        arxiv_pdf_contents_references = arxiv_pdf_contents.get_references_as_dict()
        arxiv_pdf_contents_metadata = arxiv_pdf_contents.get_metadata()

        _data = {
            "id": arxiv_id,
            "abstract": arxiv_abstract,
            "text": arxiv_pdf_contents_text,
            "references": arxiv_pdf_contents_references,
            "metadata": arxiv_pdf_contents_metadata,
            "recovery_id": self.thread_id
        }

        self.data = True
        print('thread about to queue data')
        self.queue.put(_data)
        print("THREAD CLEAN-UP: " + self.arxiv_id)
Example #8
def download(report, username, password):

    session = create_olat_session(username, password)

    if report is None:
        click.echo("Please select exported report pdf")

        root = tk.Tk()
        root.withdraw()

        report = filedialog.askopenfilename()

    click.echo("Reading report from %s" % report)

    # extract links from pdf
    pdf = pdfx.PDFx(report)
    references_dict = pdf.get_references_as_dict()

    data = [value for (key, value) in (references_dict.items())]
    merged_data = [j for i in data for j in i]
    filtered_data = filter(check_file_url, merged_data)

    if not os.path.exists('downloads'):
        os.makedirs('downloads')

    for url in filtered_data:
        file_name = url.rsplit('/', 1)[1]

        if os.path.isfile(os.path.join('downloads', file_name)):
            click.echo("Skipping %s - file exists on disk" % url)
        else:
            get_olat_file(session, url, file_name)
Example #9
def extract_text (file_path):
    """                                                                                                                      
    parse text from PDF                                                                                                      
    """
    text = None
    page_count = 0

    try:
        pdf_meta = pdfx.PDFx(file_path)
        meta = pdf_meta.get_metadata()
        page_count = meta["Pages"]

        # split into sections                                                                                                   
        buf = []
        grafs = []

        for line in pdf_meta.get_text().split("\n"):
            line = line.strip()
            buf.append(line)

            if len(line) < 1:
                section = " ".join(buf).strip().replace("- ", "") + "\n"
                grafs.append(section)
                buf = []

        text = "\n".join(grafs)
    except Exception:
        print(f"ERROR parsing {file_path}")
        traceback.print_exc()
    finally:
        return text, page_count
Example #10
def extract_pdfx(filename):
    import pdfx
    pdf = pdfx.PDFx(filename)
    metadata = pdf.get_metadata()
    references_list = pdf.get_references()
    references_dict = pdf.get_references_as_dict()
    text = pdf.get_text()
    print(text)
Example #11
def extract_urls_from_pdf(file_url):
    '''Load a pdf file, scrape its content, and return a list of valid urls.'''
    urls = []
    pdf = pdfx.PDFx(file_url)
    references = pdf.get_references_as_dict()
    for url in references['url']:
        if any([url.startswith('http'), url.startswith('www')]):
            urls.append(url)
    return urls
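A minimal usage sketch for extract_urls_from_pdf above, assuming pdfx is installed; the file name is a placeholder, not part of the original example:

# Hypothetical call; 'sample.pdf' is a placeholder path.
for link in extract_urls_from_pdf('sample.pdf'):
    print(link)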
Example #12
def extract_metadata(filename):
    pdf = pdfx.PDFx(filename)
    # print(pdf.get_metadata())
    try:
        doi = pdf.get_metadata()['dc']['identifier']
        print(doi)
        return doi
    except KeyError:
        print("Filename {} has no DOI")
        return None
Example #13
def main(filename, outdir):
    """"""
    try:
        pdf_extractor = pdfx.PDFx(uri=filename)
    except pdfx.exceptions.FileNotFoundError:
        print("File not found: {}".format(filename), file=sys.stderr)
    else:
        pdf_text = pdf_extractor.get_text()
        urls = get_url_from_pdf(pdf_text=pdf_text)
        download_urls(urls, dest=outdir)
Example #14
    def process_file(self, originalPdfPath):
        print("\n ---- Preparing to process %s ---- " % self._get_file_name_from_path(originalPdfPath))

        self.pdf = pdfx.PDFx(originalPdfPath)

        # Download the files to the working directory
        self.pdf.download_pdfs(self.working_folder)

        # Loop through the downloaded pdfs, concatenating
        # them with the original
        self._merge_downloaded_files_with_original(originalPdfPath)
Example #15
def downloader(url_set, pdf_path):
    print('Downloading pdfs to new SAR_PDFs folder')
    for url in url_set:
        print(url)
        if url[-4:] in ('.pdf', '.PDF'):
            try:
                url = url.replace(' ', '%20', 1)
                pdf = pdfx.PDFx(url)
                pdf.download_pdfs(pdf_path)
                print(url)
            except Exception:
                pass
Example #16
def pdf_references(pdf_url, return_type="dict"):
    pdf = pdfx.PDFx(pdf_url)
    if return_type is None or return_type == "list":
        reference_list = pdf.get_references()
        return reference_list
    elif return_type == "dict":
        reference_dict = pdf.get_references_as_dict()
        return reference_dict
    else:
        print(
            "The given return type, '%s', is not available... Returning a dictionary instead..."
            % return_type)
        return pdf_references(pdf_url, return_type="dict")
Example #17
def extractURLsFromPDFs(papers):
    links_in_papers = {}
    i = 1
    for paper in papers:
        sys.stderr.write(str(i) + " extracted: " + paper + "\n")
        i += 1
        try:
            pdf = pdfx.PDFx(paper)
            set_of_urls = pdf.get_references()
            list_of_urls = []
            for e in set_of_urls:
                list_of_urls.append(e.ref)
            links_in_papers[paper] = list_of_urls
        except UnicodeDecodeError:
            sys.stderr.write("This file has a UnicodeDecodeError!")
    return links_in_papers
Example #18
    def downloadRegistration(self):
        # Downloads pdf from 'result.json'
        period = self.getCurrentPeriod()
        data = self.data
        if not os.path.exists(OUTPUT_PATH):
            os.mkdir(OUTPUT_PATH)

        for item in data.body:
            if period in item['text']:
                # fix url bug '%09'
                url = PdfReader.fixURL(item['url'])
                # sets filename
                file_name = url.split('/')
                file_name = file_name.pop()
                file_name = file_name.replace('.pdf', '')
                # download pdf
                if not os.path.isfile(f'{DOWNLOAD_PATH}{file_name}.pdf'):
                    pdf = pdfx.PDFx(url)
                    pdf.download_pdfs(DOWNLOAD_PATH)

                return file_name
Example #19
File: views.py Project: myar/test
    def post(self, request):
        ufile = request.FILES.get('file')

        if ufile:
            file_type = ufile.name.split('.')[-1]
            if file_type == 'pdf':
                fs = FileSystemStorage()
                filename = fs.save(ufile.name, ufile)
                pdf = pdfx.PDFx(filename)
                urls = re.findall(URL_PATTERN, pdf.get_text())
                filestorage = FilesStorage.objects.create(filename=filename)
                if urls:
                    for url in urls:
                        try:
                            f = FoundLinks.objects.get(url=url)
                        except FoundLinks.DoesNotExist:
                            f = FoundLinks.objects.create(url=url)
                        f.filename.add(filestorage)

                fs.delete(ufile.name)
                return HttpResponse('Ok')
        return HttpResponseForbidden()
Example #20
def pdf_extract(dirs):
    '''Takes the path and filename of a PDF file as a tuple, e.g.
    dirs = ("pdf_data/", "filename.pdf"), and saves the extracted text and
    references from the PDF file to txt_path / ref_path.'''
    print("extracting")
    paths, filename = dirs
    file_ = filename.replace(".pdf", ".txt")
    file_json = filename.replace(".pdf", ".json")
    if file_ in have:
        print("file already extracted!!")
    else:
        print("read pdf file", filename)
        cmd_text_extractor = "pdfx %s -t -o %s" % (os.path.join(
            paths, filename), txt_path + file_)
        pdf = pdfx.PDFx(os.path.join(paths, filename))
        references_dict = pdf.get_references_as_dict()
        print("extrated reference of:", file_)
        os.system(cmd_text_extractor)
        print("extracted pdf_file:", file_)
        with open(ref_path + file_json, 'w') as fp:
            json.dump(references_dict, fp)
        print("save json to reference:", file_json)
        time.sleep(0.01)
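A hypothetical call for pdf_extract above, assuming the have, txt_path and ref_path globals used inside the function are already defined in the surrounding script; the folder and file name are placeholders:

# Hypothetical usage; "pdf_data/" and "paper.pdf" are placeholder values.
pdf_extract(("pdf_data/", "paper.pdf"))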
Example #21
#pdf-meta-extractor.py

import pdfx
import json
import os


user_pdf = input("Please copy and paste the pdf here (ensure that the file to be analyzed is placed in the same folder as this python file!): \t")
pdf = pdfx.PDFx(str(user_pdf))
print('Analyzing PDF...')
meta = pdf.get_metadata()
url = pdf.get_references_as_dict()


with open('pdf-metadata.txt', 'w') as pdf_data:
    pdf_data.write('\nPDF Metadata \t')
    pdf_data.write(json.dumps(meta))
    pdf_data.write('\n')
    # iterating the references dict writes each reference-type key
    for reference in url:
        pdf_data.write(reference)
Example #22
#%% import module for pdf files
import pdfx

#%% Read pdf and create dictionary and list
# Read pdf file
pdf = pdfx.PDFx(
    "//ZEUS/mmb/molecular_ecology/mollab_team/Sequencing/ngs_sequencing/project_administration/NIOZ330_NIOZ331/NIOZ330_331_raw_data_report_HN00171340.pdf"
)

# Get urls from pdf file as a dictionary
links_dict = pdf.get_references_as_dict()

# Convert dictionary to list
links_list = list(links_dict.values())

# Check what is printed to the file.
# Because get_references_as_dict() returns a dict of lists, links_list is a
# list of lists; use links_list[0] to work with the first list.
for element in links_list[0]:
    print(element)

#%% Transfer list to txt file
# Create a new txt file and write each link on its own line
with open("download_links.txt", "w") as txt_file:
    for element in links_list[0]:
        txt_file.write(element + '\n')
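Since get_references_as_dict() keys references by type, the snippet above only writes the first list; a small alternative sketch, assuming the same pdf object and links_dict as above, that flattens every reference list before writing (the output file name is a placeholder):

# Flatten all reference lists (one per reference type) into a single list.
all_links = [link for ref_list in links_dict.values() for link in ref_list]
with open("download_links_all.txt", "w") as f:
    for link in all_links:
        f.write(str(link) + '\n')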
Example #23
def main():
    parser = argparse.ArgumentParser(
        description="Get infos and links from a PDF, and optionally"
        "download all referenced PDFs.\nSee "
        "http://www.metachris.com/pdfx for more information.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="")
    parser.add_argument("pdf", help="Filename or URL of a PDF file")
    parser.add_argument(
        "-d",
        "--download-pdfs",
        metavar="OUTPUT_DIRECTORY",
        help="Download all referenced PDFs into specified directory")
    parser.add_argument("-j",
                        "--json",
                        action='store_true',
                        help="Output infos as json (instead of plain text)")
    parser.add_argument("-v",
                        "--verbose",
                        action="count",
                        default=0,
                        help="Print all urls (instead of only PDF urls)")
    parser.add_argument("--debug",
                        action='store_true',
                        help="Output debug infos")

    parser.add_argument("--version",
                        action="version",
                        version="%(prog)s (version {version})".format(
                            version=pdfx.__version__))

    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(levelname)s - %(module)s - %(message)s')

    try:
        pdf = pdfx.PDFx(args.pdf)
    except pdfx.exceptions.FileNotFoundError as e:
        exit_with_error(ERROR_FILE_NOT_FOUND, str(e))
    except pdfx.exceptions.DownloadError as e:
        exit_with_error(ERROR_DOWNLOAD, str(e))
    except pdfx.exceptions.PDFInvalidError as e:
        exit_with_error(ERROR_PDF_INVALID, str(e))

    # Print Metadata
    if not args.json:
        print("Document infos:")
        for k, v in sorted(pdf.get_metadata().items()):
            if v:
                print("- %s = %s" % (k, str(v).strip("/")))

    # Analyze PDF Text
    try:
        pdf.analyze_text()
    except pdfx.exceptions.PDFExtractionError as e:
        exit_with_error(ERROR_COULD_NOT_EXTRACT_PDF, str(e))

    if not args.json:
        if args.verbose == 0:
            urls = pdf.get_urls(pdf_only=True)
            print("\n%s PDF URLs:" % len(urls))
        else:
            urls = pdf.get_urls(pdf_only=False)
            print("\n%s URLs:" % len(urls))
        for url in urls:
            print("- %s" % url)

    try:
        if args.download_pdfs:
            if not args.json:
                print("\nDownloading %s pdfs to '%s'..." %
                      (len(pdf.urls_pdf, args.download_pdfs)))
            pdf.download_pdfs(args.download_pdfs)
            print("All done!")
    except Exception as e:
        exit_with_error(ERROR_DOWNLOAD, str(e))

    if args.json:
        print(json.dumps(pdf.summary, indent=2))
Example #24
import pdfx
import spacy
import pandas as pd

pdf = pdfx.PDFx("Path of the pdf file")
text = pdf.get_text()

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

cols = ("text", "lemma", "POS", "explain", "stopword")
rows = []

for t in doc:
    row = [t.text, t.lemma_, t.pos_, spacy.explain(t.pos_), t.is_stop]
    rows.append(row)

df = pd.DataFrame(rows, columns=cols)

for ent in doc.ents:
    print(ent.text, ent.label_)
Example #25
def test_two_pdfs():
    # See https://github.com/metachris/pdfx/issues/14
    pdfx.PDFx(os.path.join(curdir, "pdfs/i14doc1.pdf"))
    pdf_2 = pdfx.PDFx(os.path.join(curdir, "pdfs/i14doc2.pdf"))
    assert len(pdf_2.get_references()) == 2
Example #26
import sys
import pdfx

if len(sys.argv) - 1 < 1:
    print('I need the pdf file as a parameter')
    exit(1)

for pdffile in sys.argv[1:]:
    print('Processing', pdffile)

pdf = pdfx.PDFx(pdffile)
metadata = pdf.get_metadata()
references_list = pdf.get_references()
references_dict = pdf.get_references_as_dict()

#print(pdf.summary)

print('Listing references')
for reference in references_list:
    print(reference)

print('Listing references dict')
for reference in references_dict:
    print('Reference: ', reference)
    counter = 1
    for listurls in references_dict[reference]:
        print(counter, '>', listurls)
        counter += 1
Example #27
def extract_urls_from_pdf(filename: str) -> t.Set[str]:
    pdf = pdfx.PDFx(filename)
    text = pdf.get_text()
    return extract_urls_from_text(text)
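extract_urls_from_text is not shown in this example; a minimal regex-based sketch of what such a helper might look like (the pattern is an assumption, not the project's original code):

import re
import typing as t

def extract_urls_from_text(text: str) -> t.Set[str]:
    # Rough sketch: collect anything that looks like an http(s) URL.
    return set(re.findall(r'https?://\S+', text))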
Example #28
import numpy as np
import pdfx

pdfs = ["2016_all.pdf","2017_1.pdf","2017_2.pdf","2017_3.pdf"]
urls = []
for pdf in pdfs:
    pdfObj = pdfx.PDFx('miccai/' + pdf)
    references_list = pdfObj.get_references()
    urls.extend([str(x.ref) for x in references_list if "springer" not in str(x.ref)])

np.save("miccaiUrls.npy", urls)
Example #29

def extractPage(string):
    return re.split(r'Página \d de \d', string)


def removeHoras(string):
    return re.sub(r'(\d+ h \d+ min)', '', string)


def removeExcessLines(string):
    return re.sub(r'(\n){2,}', '\n', string)


for prof in profs:
    try:
        pdf = pdfx.PDFx('Docentes/' + prof.name + '/2018 - 2.pdf')
    except Exception as e:
        print(e)
        print(f'{prof.name} has no information')
        continue

    # print(pdf.get_text())
    pages = extractPage(pdf.get_text())
    pages[0] = removeHoras(pages[0])
    pages[0] = removeExcessLines(pages[0])
    print(pages[0])
    print('================================')

Example #30
def extractLink(path):
    pdf = pdfx.PDFx(path)
    urls = pdf.get_references_as_dict()
    if 'url' in list(urls.keys()):
        return urls['url']
    return []