# Header row of the TSV output; ``writer`` is created outside this fragment.
writer.writerow(
    ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

# Walk the scraped page elements and collect, per element, the track heading,
# the PDF link, the presentation link and the title.
# NOTE(review): the original text had lost its newlines/indentation; the
# nesting below is reconstructed from the statement order -- confirm against
# the upstream script, in particular that both ``urljoin`` calls belong inside
# their ``if`` branches.
for p in page_lines:
    # A truthy result from is_centered() is stored as the current track and
    # the element itself is not processed any further.
    centered = is_centered(p)
    if centered:
        track = centered
        # Report the track *after* assigning it: the original printed first,
        # so the "New track name" message showed the previous value.
        print("New track name:", track)
        continue

    # Not used in the visible part of the loop; kept because the loop body
    # may continue past this fragment.
    raw_text = unspace(p.get_text())

    pdf = None
    title = None
    link = has_pdf(p)  # looked up once instead of twice
    if link:
        pdf = urljoin(URL, link)

    presentation = None
    if "presentation" in p.text:
        # NOTE(review): this reads the same has_pdf(p) link as ``pdf`` above
        # -- confirm that is intended.
        presentation = urljoin(URL, has_pdf(p))

    try:
        # Original comment: remove the first text before ":"
        # (presumably what get_title() does -- TODO confirm).
        if pdf:
            title = get_title(p)
    except Exception:
        # Best effort: skip elements whose title cannot be extracted.
        # ``Exception`` (not a bare except) so Ctrl-C still stops the run.
        continue
vol = None with open(FILE_NAME + ".tsv", 'w') as f: writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow( ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"]) for p in page_lines: #get the track raw_text = unspace(p.get_text()) pdf = None authors = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) presentation = None try: # Remove the first text before ":" title = p.text.split("--")[0] title = unspace(title) except: continue try: authors = p.text.split("--")[1] except:
# Walk consecutive pairs of scraped elements: ``p`` is the current element and
# ``p1`` the one right after it (read for the presentation link and authors).
# NOTE(review): the original text had lost its newlines/indentation; the
# nesting below is reconstructed from the statement order -- confirm against
# the upstream script.
for i in range(len(page_lines) - 1):
    p = page_lines[i]
    p1 = page_lines[i + 1]

    # A truthy result from is_subtitle() becomes the current track and is
    # written out as a one-cell row of its own.
    subtitle = is_subtitle(p)
    if subtitle:
        track = subtitle
        # Print after assigning: the original printed the previous value.
        print("assigne", track)
        writer.writerow([track])
        continue

    # Not used in the visible part of the loop; kept because the loop body
    # may continue past this fragment.
    raw_text = p.get_text().replace("\n", " ")

    # Take the PDF link only from elements that are not presentation lines.
    pdf = None
    link = has_pdf(p)  # looked up once instead of twice
    if link and "presentation" not in p.text:
        pdf = link
        print(pdf)

    presentation = None
    if "presentation" in p1.text:
        presentation = has_pdf(p1)

    if not pdf:
        continue

    l_1 = p1.get_text()
    # Authors are whatever precedes the first run of digits on the next
    # element.  Raw string: "\d" in a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    authors = re.split(r"\d+", l_1)[0]
    print(authors)
    authors = namify(authors)
    title = get_title(p)
p1 = page_lines[i + 1] splitter = ":" line = unspace(p.text) sliced_line = line.split(splitter) try: left = splitter.join(sliced_line[:-1]) right = sliced_line[-1] except: print(sliced_line) continue raw_text = p.get_text().replace("\n", " ") pdf = None if has_pdf(p): pdf = has_pdf(p) title = get_title(p) if has_pdf(p1): title2 = get_title(p1) title = title + " " + title2 p2 = page_lines[i + 2] j = 2 p1 = p2 if not pdf: continue while len(p1.text.strip()) <= 3:
["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"]) for i in range(len(page_lines) - 2): j = 1 p = page_lines[i] p1 = page_lines[i + 1] if is_centered(p1): track = is_centered(p1) #writer.writerow([vol]) continue if has_bold(p1): authors = remove_parenthesised(p1.get_text()) authors = namify(authors) print(authors) else: continue if has_pdf: pdf = has_pdf(p) title = p.text # while len(p1.text.strip()) <= 3: # p1 = page_lines[i+j] # j = j+1 pdf = urljoin(URL, pdf) title = unspace(title) line = [title, authors, pdf, presentation, vol, track] writer.writerow(line)
j = 2 p1 = p2 else: continue if authors==None: continue title=p1.text track=None if "abstract" in title: track= "abstract" title=remove_parenthesised(title) while len(p1.text.strip()) <= 3: p1 = page_lines[i+j] j = j+1 try: pages = extract_pages(p) except: pass pdf=has_pdf(p1) pdf = urljoin(URL, pdf) print(authors) authors=remove_parenthesised(authors) authors = namify(authors) title = unspace(title) line = [title, authors, pdf, presentation, vol,pages, track] writer.writerow(line)
# Write the scraped entries to "<FILE_NAME>.tsv".
# newline="" is what the csv module requires for files handed to csv.writer
# (otherwise extra "\r" characters appear on platforms that translate line
# endings); an explicit UTF-8 encoding keeps non-ASCII text such as the en
# dash used below writable regardless of the platform default.
# NOTE(review): the original text had lost its newlines/indentation; the
# nesting below is reconstructed from the statement order -- confirm against
# the upstream script.
with open(FILE_NAME + ".tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter='\t', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for i in range(len(page_lines) - 1):
        # Per-row state.  ``presentation`` is reset here so that a link found
        # for an earlier row is not carried over to rows that have none.
        track = None
        presentation = None
        p = page_lines[i]
        p1 = page_lines[i + 1]

        if "abstract" in p1.text:
            track = "abstract, full text not available"

        if "presentation" in p1.text:
            # NOTE(review): the meaning of has_pdf's second argument is not
            # visible here -- presumably it selects a link other than the
            # first one; confirm.
            presentation = urljoin(URL, has_pdf(p1, 1))

        # Best effort: an element whose link lookup fails is skipped.
        # ``Exception`` (not a bare except) so Ctrl-C still stops the run.
        try:
            pdf = has_pdf(p1)
        except Exception:
            continue
        if pdf is None:
            continue

        # Authors are the text before the first en dash on the next element.
        authors = namify(p1.text.split("–")[0])

        # A link that resolves to the bare base URL counts as "no PDF".
        pdf = urljoin(URL, pdf)
        if pdf == URL:
            pdf = None
quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow([ "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "Notes" ]) for i in range(len(page_lines) - 2): j = 1 p = page_lines[i] p1 = page_lines[i + 1] raw_text = p.get_text().replace("\n", " ") print(raw_text) pdf = None if has_pdf(p) and "abstract" not in p.text: pdf = has_pdf(p) print(pdf) title = get_title(p) if has_pdf(p1): title2 = get_title(p1) title = title + " " + title2 p2 = page_lines[i + 2] j = 2 p1 = p2 if not pdf: if "abstract" in p.text: title = remove_parenthesised(p.text) pdf = has_pdf(p)
j = 1 p = page_lines[i] p1 = page_lines[i + 1] p0 = page_lines[i - 1] if is_subtitle(p): track = is_subtitle(p) track = unspace(track) #writer.writerow([vol]) continue authors = None if finish_by_digit(p): authors = p.text else: continue title = p0.get_text() pdf = None pdf = has_pdf(p0) if pdf: title = get_title(p0) pdf = urljoin(URL, pdf) if pdf == URL: pdf = None authors = namify(authors) title = unspace(remove_digit(unspace(title))) print(repr(title)) line = [title, authors, pdf, presentation, vol, track] writer.writerow(line)