pdf = urljoin(URL, pdf) #print(pdf) presentation = None if "presentation" in p.text: presentation = has_pdf(p) presentation = urljoin(URL, presentation) try: # Remove the first text before ":" if pdf: title = get_title(p) except: continue authors = p.text.split(":")[0] authors = namify(authors) if len(authors) <= 2 and title == None: continue if title == None: try: title = p.text.split(":")[1] except: pass try: title = unspace(title) title = remove_parenthesised(title) except: pass line = [title, authors, pdf, presentation, vol, track] writer.writerow(line)
# Write one TSV row per entry whose text has the form "<title> -- <authors>".
for p in page_lines:
    # Resolve the entry's PDF link against the page URL. Join only when a
    # link was found so that entries without one keep pdf = None.
    pdf_link = has_pdf(p)
    pdf = urljoin(URL, pdf_link) if pdf_link else None
    presentation = None

    try:
        # The title is the text before the first "--".
        fields = p.text.split("--")
        title = unspace(fields[0])
    except Exception:
        # Best effort: skip entries whose text cannot be read or normalised.
        continue

    # The authors, when present, are the field right after the title.
    authors = fields[1] if len(fields) > 1 else None
    if authors:
        authors = namify(remove_parenthesised(authors))

    if not authors and title is None:
        continue

    line = [title, authors, pdf, presentation, vol, track]
    writer.writerow(line)
if "abstract" in p.text: track = "abstract" else: pass presentation = None if "presentation" in p.text: presentation=pdf pdf=None authors = p.text.split(":")[0] if title==None or authors==None: continue authors = namify(authors) if len(title) <= 3: continue try: title = unspace(unspace(title)) except: pass #title= remove_page(title) print("page removed"+title) title= remove_parenthesised(title,exclude="(") line = [title, authors, pdf, presentation, vol, pages, track] writer.writerow(line)
pdf = None if has_pdf(p): pdf = has_pdf(p) title = get_title(p) if has_pdf(p1): title2 = get_title(p1) title = title + " " + title2 p2 = page_lines[i + 2] j = 2 p1 = p2 if not pdf: continue while len(p1.text.strip()) <= 3: p1 = page_lines[i + j] j = j + 1 try: pages = extract_pages(left) except: pass pdf = urljoin(URL, pdf) authors = remove_parenthesised(p1.get_text()) authors = namify(authors) title = unspace(title) line = [title, authors, pdf, presentation, vol, pages, track] writer.writerow(line)
# Dump the scraped entries to "<FILE_NAME>.tsv" as tab-separated rows.
# newline="" lets the csv writer control line endings itself, as the csv
# module documentation requires for files handed to csv.writer.
with open(FILE_NAME + ".tsv", "w", newline="") as f:
    writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    # Walk consecutive pairs: the title and link are read from p, the
    # authors from the element that follows it (p1).
    for i in range(len(page_lines) - 1):
        p = page_lines[i]
        p1 = page_lines[i + 1]

        track = None
        if "abstract" in p.text:
            track = "abstract, full text not available"

        # Look the link up once; it both selects the branch and is the value kept.
        pdf_link = has_pdf(p)
        if "[" in p.text:
            # Bracketed entries take their title from the cleaned-up text
            # and keep whatever has_pdf returned, even if it is empty.
            title = remove_parenthesised(p.text)
        elif pdf_link:
            title = get_title(p)
        else:
            # Neither a bracketed entry nor a linked one: nothing to record.
            continue
        pdf = pdf_link

        #print(p1)
        authors = namify(p1.text)
        print(authors)
except: continue p2 = page_lines[i+2] j = 2 p1 = p2 else: continue if authors==None: continue title=p1.text track=None if "abstract" in title: track= "abstract" title=remove_parenthesised(title) while len(p1.text.strip()) <= 3: p1 = page_lines[i+j] j = j+1 try: pages = extract_pages(p) except: pass pdf=has_pdf(p1) pdf = urljoin(URL, pdf) print(authors) authors=remove_parenthesised(authors) authors = namify(authors) title = unspace(title)
pdf = None if has_pdf(p) and "abstract" not in p.text: pdf = has_pdf(p) print(pdf) title = get_title(p) if has_pdf(p1): title2 = get_title(p1) title = title + " " + title2 p2 = page_lines[i + 2] j = 2 p1 = p2 if not pdf: if "abstract" in p.text: title = remove_parenthesised(p.text) pdf = has_pdf(p) else: continue while len(p1.text.strip()) <= 3: p1 = page_lines[i + j] j = j + 1 try: pages = extract_pages(p) except: pass pdf = urljoin(URL, pdf) if pdf == URL: pdf = None authors = remove_parenthesised(p1.get_text())
"Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "Notes" ]) for i in range(len(page_lines) - 1): j = 1 p = page_lines[i] p1 = page_lines[i + 1] raw_text = p.get_text().replace("\n", " ") print(raw_text) pdf = None if has_pdf(p1): pdf = has_pdf(p1) print(pdf) title = get_title(p1) if not pdf: continue try: pages = extract_pages(p1) except: pass pdf = urljoin(URL, pdf) authors = ",".join(remove_parenthesised(p.get_text()).split(",")[:-1]) authors = namify(authors) title = unspace(title) line = [title, authors, pdf, presentation, vol, pages, track] writer.writerow(line)
delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow( ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"]) for i in range(len(page_lines) - 1): track = None p = page_lines[i] p1 = page_lines[i + 1] if "abstract" in p.text: track = "abstract, full text not available" if has_pdf(p): pdf = has_pdf(p) title = get_title(p) else: continue #print(p1) authors = namify(remove_parenthesised(p.b.text)) pdf = urljoin(URL, pdf) if pdf == URL: pdf = None title = unspace(title) line = [title, authors, pdf, presentation, vol, track] writer.writerow(line)
if has_bold(p): print("New track name:", track) track = has_bold(p) continue raw_text = unspace(p.get_text()) pdf = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) presentation = None text = unspace(p.get_text()) try: title = text.split(":")[0] title = unspace(remove_parenthesised(title)) authors = text.split(":")[1].split(",")[0] authors = remove_parenthesised(authors) except: pass #authors = namify(authors) line = [title, authors, pdf, presentation, vol, track] writer.writerow(line)
]) for i in range(len(page_lines)): j = 1 p = page_lines[i] p1 = page_lines[i + 1] raw_text = p.get_text().replace("\n", " ") pdf = None authors = None authors = p.text if authors == None: continue title = remove_parenthesised(p1.text) track = None pdf = has_pdf(p1) pdf = urljoin(URL, pdf) if pdf == URL: pdf = None print(authors) authors = remove_parenthesised(authors) authors = namify(authors) title = unspace(title) line = [title, authors, pdf, presentation, vol, pages, track] writer.writerow(line)