Example #1
0
# Export the entries of ``page_lines`` to "<FILE_NAME>.tsv".
#
# This section relies on names that are not defined here and must already be
# in scope: FILE_NAME, URL, vol, track, right, page_lines, csv, urljoin and
# the helpers extract_pages, remove_page, unspace, has_pdf,
# remove_parenthesised, namify and get_title.
#
# For every entry a first row is always written; a second row is written
# only when the entry has a PDF link and none of the guards below skips it.

# Loop-carried state. ``pdf`` and ``presentation`` are read by the first row
# of an entry *before* the loop body assigns them, so that row carries the
# values left over from the previous entry (None for the very first one).
pages = None
pdf = None
presentation = None
title = []
pages_in_pdf = None

# newline="" is required by the csv module so the writer controls line
# endings itself; without it rows gain an extra "\r" on Windows.
with open(FILE_NAME + ".tsv", "w", newline="") as f:
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "pages in pdf", "Notes"
    ])

    for p in page_lines:
        # --- First row: the whole entry is treated as the title. ---
        authors = None
        title = p

        pages = extract_pages(title)
        title = remove_page(title)
        title = unspace(title).strip(".")
        title = unspace(title)
        # Debug output: pages_in_pdf still holds the previous entry's value
        # here (None on the first iteration).
        print(pages_in_pdf)
        # NOTE(review): the +4 offset is presumably the number of
        # front-matter pages in the PDF -- confirm.
        pages_in_pdf = int(pages) + 4
        line = [
            title, authors, pdf, presentation, vol, pages, pages_in_pdf, track
        ]
        writer.writerow(line)

        # --- Second row: only for entries that link to a PDF. ---
        pdf_href = has_pdf(p)
        if not pdf_href:
            continue
        pdf = urljoin(URL, pdf_href)
        if "abstract" in p:
            # NOTE(review): track is never reset in this section, so once
            # set it also applies to every later entry -- confirm intent.
            track = "abstract, full paper not available"
        presentation = None

        # Authors are taken from the text after the last ":".
        # NOTE(review): this value only feeds the length guard below; it is
        # replaced by ``right`` further down.
        authors = p.text.split(":")[-1]
        authors = remove_parenthesised(authors)
        authors = authors.replace("- presentation", "")
        authors = remove_page(authors)
        authors = namify(authors)
        if len(authors) <= 2 and title is None:
            continue

        # Title is everything before the last ":"; entries whose title
        # cannot be derived or cleaned are skipped.
        # NOTE(review): this title is replaced by get_title(p) below.
        try:
            title = ":".join(p.text.split(":")[:-1])
        except Exception:
            continue
        try:
            title = unspace(title)
            title = title.split("pp.")[0].replace("…", "")
        except Exception:
            continue

        # has_pdf is queried a second time here.
        # NOTE(review): redundant if has_pdf is deterministic.
        pdf_href = has_pdf(p)
        if not pdf_href:
            continue
        pdf = urljoin(URL, pdf_href)
        title = get_title(p)
        presentation = None

        authors = right
        authors = remove_parenthesised(authors)
        print(authors)
        if len(authors) <= 3 and title is None:
            continue
        try:
            pages = extract_pages(authors)
        except Exception:
            # Best effort: keep the page value extracted for the first row.
            pass
        authors = remove_page(authors).replace(";", "")
        authors = namify(authors)

        try:
            title = unspace(title)
        except Exception:
            # Best effort: keep the title as returned by get_title.
            pass

        # The row must match the 8-column header; pages_in_pdf was missing
        # here, which shifted ``track`` into the "pages in pdf" column.
        # NOTE(review): pages_in_pdf is the value computed for the first
        # row and is not recomputed if ``pages`` was re-extracted above.
        line = [
            title, authors, pdf, presentation, vol, pages, pages_in_pdf, track
        ]
        writer.writerow(line)