# Write one TSV row per entry of ``page_lines`` to ``FILE_NAME + ".tsv"``.
#
# NOTE(review): this section arrived with its line structure collapsed. The
# nesting below is reconstructed from syntax alone: the ``continue``
# statements and the uses of ``p`` and ``writer`` are only valid inside the
# ``for`` loop and the ``with`` block. Confirm against the intended layout.
pages = None
pdf = None
presentation = None
title = []
pages_in_pdf = None

# newline="" is what the csv module asks for on files it writes to, so the
# text layer does not translate the writer's row terminators a second time.
with open(FILE_NAME + ".tsv", "w", newline="") as f:
    writer = csv.writer(
        f,
        delimiter="\t",
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
    )
    writer.writerow([
        "Title",
        "Authors",
        "Pdf",
        "Presentation",
        "Volume_Name",
        "Pages",
        "pages in pdf",
        "Notes",
    ])

    for p in page_lines:
        # --- Row built from the raw entry -------------------------------
        authors = None
        title = p
        pages = extract_pages(title)
        title = remove_page(title)
        title = unspace(title).strip(".")
        title = unspace(title)

        # NOTE(review): the meaning of the +4 offset is not visible in this
        # section, and int() raises if extract_pages() yields a non-numeric
        # value or None; confirm what extract_pages() can return.
        pages_in_pdf = int(pages) + 4
        # Printed after the assignment so the value shown belongs to this
        # row rather than to the previous iteration.
        print(pages_in_pdf)

        # NOTE(review): ``pdf``, ``presentation`` and ``track`` hold whatever
        # the previous iteration left in them (they are updated further down,
        # after this row is written). ``vol`` and ``track`` are not assigned
        # anywhere before this point in the visible code; presumably they are
        # defined earlier in the file.
        line = [
            title,
            authors,
            pdf,
            presentation,
            vol,
            pages,
            pages_in_pdf,
            track,
        ]
        writer.writerow(line)

        # --- State updates that follow the write ------------------------
        pdf_href = has_pdf(p)
        if not pdf_href:
            continue
        pdf = urljoin(URL, pdf_href)

        # NOTE(review): ``p`` is also read through ``p.text`` below; if it is
        # not a plain string, this membership test does not search its text.
        if "abstract" in p:
            track = "abstract, full paper not available"
        presentation = None

        # Everything after the last ":" is treated as the author list.
        authors = p.text.split(":")[-1]
        authors = remove_parenthesised(authors)
        authors = authors.replace("- presentation", "")
        authors = remove_page(authors)
        authors = namify(authors)
        if len(authors) <= 2 and title is None:
            continue

        # Everything before the last ":" is treated as the title.
        try:
            title = ":".join(p.text.split(":")[:-1])
        except Exception:
            continue
        try:
            title = unspace(title)
            title = title.split("pp.")[0].replace("…", "")
        except Exception:
            # Best effort: keep the title as it is when clean-up fails.
            pass

        continue

        # NOTE(review): everything below is unreachable because of the
        # unconditional ``continue`` above. It is kept because it may be
        # logic that was switched off on purpose. ``right`` is not assigned
        # anywhere in the visible code, and the row built here has 7 fields
        # while the header written above has 8 columns.
        pdf_href = has_pdf(p)
        if not pdf_href:
            continue
        pdf = urljoin(URL, pdf_href)
        title = get_title(p)

        presentation = None
        authors = right
        authors = remove_parenthesised(authors)
        print(authors)
        if len(authors) <= 3 and title is None:
            continue
        try:
            pages = extract_pages(authors)
        except Exception:
            # Best effort: keep the previous ``pages`` value.
            pass
        authors = remove_page(authors).replace(";", "")
        authors = namify(authors)
        try:
            title = unspace(title)
        except Exception:
            # Best effort: keep the title as it is when clean-up fails.
            pass
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)