# NOTE(review): whitespace-collapsed fragment of a loop body (it uses `continue` and the
# index `i`, but the loop header is not visible here). Line breaks and indentation were
# lost, so where each `if`/`while`/`try` body ends cannot be told from this line alone.
# What the visible statements do: take a PDF link from has_pdf(p) (called twice), build a
# title from get_title(p) and possibly get_title(p1), advance p1 through page_lines[i + j]
# while its stripped text has at most 3 characters, then write the row
# [title, authors, pdf, presentation, vol, pages, track].
# TODO(review): `left` is passed to extract_pages but is not assigned anywhere in this
# fragment; the bare `except: pass` would hide a NameError if it is undefined — confirm.
# TODO(review): no bounds check on page_lines[i + 2] / page_lines[i + j] is visible here.
pdf = None if has_pdf(p): pdf = has_pdf(p) title = get_title(p) if has_pdf(p1): title2 = get_title(p1) title = title + " " + title2 p2 = page_lines[i + 2] j = 2 p1 = p2 if not pdf: continue while len(p1.text.strip()) <= 3: p1 = page_lines[i + j] j = j + 1 try: pages = extract_pages(left) except: pass pdf = urljoin(URL, pdf) authors = remove_parenthesised(p1.get_text()) authors = namify(authors) title = unspace(title) line = [title, authors, pdf, presentation, vol, pages, track] writer.writerow(line)
# NOTE(review): whitespace-collapsed fragment. The leading `])` closes a call opened
# before this line; line breaks and indentation were lost, and on this physical line
# everything after the first `#` is a comment to Python, so the original layout must be
# restored before this can run.
# What the visible statements do: loop i over range(len(page_lines) - 3), use p.text as the
# title, advance p1 through page_lines[i + j] while its stripped text has at most 3
# characters, take the authors from p1.text, and write the row
# [title, authors, pdf, presentation, vol, pages, track]. pdf stays None because the
# urljoin call is commented out.
# TODO(review): the last comma-separated field of `authors` is dropped twice (once before
# and once after remove_parenthesised) — confirm that both cuts are intended.
# TODO(review): raw_text is assigned but not used in the visible fragment.
]) for i in range(len(page_lines) - 3): j = 1 p = page_lines[i] p1 = page_lines[i + 1] raw_text = p.get_text().replace("\n", " ") pdf = None title = p.text while len(p1.text.strip()) <= 3: p1 = page_lines[i + j] j = j + 1 try: pages = extract_pages(p) except: pass #pdf = urljoin(URL, pdf) authors = p1.text authors = ",".join(authors.split(",")[:-1]) authors = remove_parenthesised(authors) authors = ",".join(authors.split(",")[:-1]) authors = namify(authors) title = unspace(title) line = [title, authors, pdf, presentation, vol, pages, track] writer.writerow(line)
# Write one TSV row per entry of page_lines: the cleaned title, the page value
# returned by extract_pages for the entry, and that value shifted by 4.
# NOTE(review): reconstructed from a whitespace-collapsed line; the nesting was
# inferred from the syntax (everything from the loop header through
# writer.writerow(line) is taken to be the loop body) — confirm against the
# original layout.

# Defaults for the row fields; pdf, presentation and authors are never filled
# in by this block and are written as empty cells.
pages = None
pdf = None
presentation = None
authors = None
title = []
pages_in_pdf = None

# newline="" lets the csv writer control line endings itself (see the csv
# module documentation); utf-8 keeps non-ASCII titles writable regardless of
# the platform's default encoding.
with open(FILE_NAME + ".tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(
        f, delimiter="\t", quotechar='"', quoting=csv.QUOTE_MINIMAL
    )
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "pages in pdf", "Notes"
    ])
    for p in page_lines:
        # The page value is extracted from the raw entry before the entry is
        # cleaned up into a title.
        pages = extract_pages(p)
        title = remove_page(p)
        title = unspace(title).strip(".")
        # Second unspace pass runs after the surrounding dots are stripped.
        title = unspace(title)
        # presumably the +4 maps a printed page number to its position in the
        # PDF — TODO confirm the offset for this volume.
        pages_in_pdf = int(pages) + 4
        # Trace the value for the current row (previously printed before it
        # was computed, which showed the previous row's value).
        print(pages_in_pdf)
        line = [
            title, authors, pdf, presentation, vol, pages, pages_in_pdf, track
        ]
        writer.writerow(line)
# NOTE(review): whitespace-collapsed fragment of a loop body (it uses `continue`, but the
# loop header is not visible here). Line breaks and indentation were lost, and on this
# physical line everything after the first `#` is a comment to Python, so the original
# layout must be restored before this can run.
# What the visible statements do: resolve the link from has_pdf(p) against URL, take the
# authors from the text before the first ":" of p.text, take the title from get_title(p),
# try to read pages from the part of p.text after the first ";", and write the row
# [title, authors, pdf, presentation, vol, pages, track].
# TODO(review): `title` is set to None above and not reassigned before the
# `len(authors) <= 2 and title == None` check, so the second operand is always true there.
# TODO(review): raw_text is assigned but not used in the visible fragment.
raw_text = unspace(p.get_text()) pdf = None title = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) else: continue presentation = None authors = p.text.split(":")[0] authors = namify(authors) if len(authors) <= 2 and title == None: continue try: title = get_title(p) except: continue try: title = unspace(title) #title = remove_parenthesised(title) pages = extract_pages(p.text.split(";")[1]) #itle= title.split("pp.")[0].replace("…","") except: pass line = [title, authors, pdf, presentation, vol, pages, track] writer.writerow(line)
# NOTE(review): whitespace-collapsed fragment of a loop body (it uses `continue`, but the
# loop header is not visible here), and it is cut off after a dangling `try:`. Line breaks
# and indentation were lost, and on this physical line everything after the first `#` is a
# comment to Python, so the original layout must be restored before this can run.
# What the visible statements do: resolve the link from has_pdf(p) against URL when there
# is one, take the authors from the text after the first "--" of p.text (skipping the
# entry when there is no "--"), try to read pages from that same text, clean the authors
# with remove_parenthesised / remove_page / namify, and take the title from the text
# before the first "--".
# TODO(review): `title` is set to None above and not reassigned before the
# `len(authors) <= 2 and title == None` check, so the second operand is always true there.
# TODO(review): raw_text is assigned but not used in the visible fragment.
raw_text = unspace(p.get_text()) pdf = None title = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) else: pass presentation = None try: authors = p.text.split("--")[1] except: continue try: pages = extract_pages(authors) except: pass authors = remove_parenthesised(authors) authors = remove_page(authors) authors = namify(authors) if len(authors) <= 2 and title == None: continue try: title = p.text.split("--")[0] except: continue try:
# NOTE(review): whitespace-collapsed fragment. It begins inside the header-row list of a
# call whose opening is not visible here; line breaks and indentation were lost, so the
# original layout must be restored before this can run.
# What the visible statements do: for each index i, split entry p on the double-quote
# character, use the part before the first quote as the authors and the part between the
# first two quotes as the title (entries without a quoted part are skipped), read pages
# from the following entry p1, and write the row
# [title, authors, pdf, presentation, vol, pages, pages_in_pdf, track]. Rows whose title is
# shorter than 4 characters are skipped.
# TODO(review): i runs over range(len(page_lines)) while p1 = page_lines[i + 1] is read
# before any visible `try:`, so the last index looks like it raises IndexError — confirm.
# TODO(review): extract_pages is applied a second time to its own result inside a bare
# try/except; pdf, presentation, vol and track are not assigned in this fragment.
"Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "Pages_in_pdf", "Notes" ]) for i in range(len(page_lines)): p = page_lines[i] p1 = page_lines[i + 1] authors_title = p.split("\"") try: title = unspace(authors_title[1]) authors = authors_title[0] except: continue authors = unspace(authors) authors = namify(authors) pages = extract_pages(p1) print(pages) try: pages = extract_pages(pages) except: pass pages_in_pdf = str(int(pages) + 5) if len(title) < 4: continue line = [ title, authors, pdf, presentation, vol, pages, pages_in_pdf, track ] writer.writerow(line)