Beispiel #1
0
        pdf = None
        if has_pdf(p):
            pdf = has_pdf(p)

            title = get_title(p)
            if has_pdf(p1):
                title2 = get_title(p1)
                title = title + " " + title2

                p2 = page_lines[i + 2]
                j = 2
                p1 = p2

        if not pdf:
            continue

        while len(p1.text.strip()) <= 3:
            p1 = page_lines[i + j]
            j = j + 1
        try:
            pages = extract_pages(left)
        except:
            pass
        pdf = urljoin(URL, pdf)
        authors = remove_parenthesised(p1.get_text())
        authors = namify(authors)
        title = unspace(title)
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
    ])

    # Write one output row per entry of `page_lines`: the entry itself
    # supplies the title and page range, and the next entry with more than
    # three characters of text supplies the author list.
    # The last three entries are never used as a title line.
    for i in range(len(page_lines) - 3):

        j = 1
        p = page_lines[i]
        p1 = page_lines[i + 1]

        # Flattened text of the entry; not read again inside this loop.
        raw_text = p.get_text().replace("\n", " ")

        # No PDF link is resolved in this loop, so the column stays empty.
        pdf = None
        title = p.text

        # Advance past entries whose stripped text is at most 3 characters.
        # NOTE(review): nothing bounds `i + j`, so a long run of short
        # entries near the end of `page_lines` raises IndexError.
        while len(p1.text.strip()) <= 3:
            p1 = page_lines[i + j]
            j = j + 1
        try:
            pages = extract_pages(p)
        except Exception:
            # Best effort: when extraction fails, `pages` keeps whatever
            # value it held before this iteration.
            pass
        authors = p1.text
        # The last comma-separated field is dropped once before and once
        # after parenthesised text is removed.
        authors = ",".join(authors.split(",")[:-1])
        authors = remove_parenthesised(authors)
        authors = ",".join(authors.split(",")[:-1])
        authors = namify(authors)

        title = unspace(title)
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
Beispiel #3
0
# Defaults for the per-row fields, so each name exists before the first row
# is assembled.
pages = None
pdf = None
presentation = None
title = []
pages_in_pdf = None
# The csv module requires files to be opened with newline=''; otherwise the
# writer's "\r\n" row terminator is translated again on platforms that use
# "\r\n" line endings, producing "\r\r\n".
with open(FILE_NAME + ".tsv", 'w', newline='') as f:
    # Tab-separated output; fields are quoted only when they need it.
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    # Header row naming the output columns.
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "pages in pdf", "Notes"
    ])

    for i in range(len(page_lines)):
        p = page_lines[i]
        authors = None
        title = p

        pages = extract_pages(title)
        title = remove_page(title)
        title = unspace(title).strip(".")
        title = unspace(title)
        print(pages_in_pdf)
        pages_in_pdf = int(pages) + 4
        line = [
            title, authors, pdf, presentation, vol, pages, pages_in_pdf, track
        ]
        writer.writerow(line)
        raw_text = unspace(p.get_text())
        pdf = None
        title = None
        if has_pdf(p):
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)
        else:
            continue
        presentation = None

        authors = p.text.split(":")[0]
        authors = namify(authors)
        if len(authors) <= 2 and title == None:
            continue

        try:
            title = get_title(p)
        except:
            continue
        try:
            title = unspace(title)
            #title = remove_parenthesised(title)
            pages = extract_pages(p.text.split(";")[1])
            #itle= title.split("pp.")[0].replace("…","")
        except:
            pass

        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
        raw_text = unspace(p.get_text())
        pdf = None
        title = None
        if has_pdf(p):
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)
        else:
            pass
        presentation = None
        try:
            authors = p.text.split("--")[1]
        except:
            continue
        try:
            pages = extract_pages(authors)
        except:
            pass
        authors = remove_parenthesised(authors)
        authors = remove_page(authors)

        authors = namify(authors)
        if len(authors) <= 2 and title == None:
            continue

        try:
            title = p.text.split("--")[0]

        except:
            continue
        try:
Beispiel #6
0
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "Pages_in_pdf", "Notes"
    ])

    # Each row is built from two consecutive entries of `page_lines`: the
    # current one holds the authors followed by the double-quoted title, and
    # the following one is passed to extract_pages for the page number.
    # The final entry has no successor, so the loop stops one short of the
    # end instead of raising IndexError on `page_lines[i + 1]`.
    for i in range(len(page_lines) - 1):
        p = page_lines[i]
        p1 = page_lines[i + 1]
        # Split on double quotes: text before the first quote is the author
        # part, text between the first pair of quotes is the title.
        authors_title = p.split("\"")

        try:
            title = unspace(authors_title[1])
            authors = authors_title[0]
        except Exception:
            # No quoted title on this entry (or it could not be cleaned):
            # skip it.
            continue
        authors = unspace(authors)
        authors = namify(authors)

        pages = extract_pages(p1)
        print(pages)
        try:
            # Second extraction pass over the already-extracted value; if it
            # fails, the result of the first pass is kept.
            pages = extract_pages(pages)
        except Exception:
            pass
        # NOTE(review): the +5 is presumably the offset between printed page
        # numbers and PDF page indices - confirm. int() raises if `pages` is
        # not numeric.
        pages_in_pdf = str(int(pages) + 5)
        # Titles shorter than four characters are not written.
        if len(title) < 4:
            continue
        line = [
            title, authors, pdf, presentation, vol, pages, pages_in_pdf, track
        ]
        writer.writerow(line)