nlp = spacy.load("en_core_web_sm") track = None vol = None with open(FILE_NAME + ".tsv", 'w') as f: writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow( ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"]) for p in page_lines: authors = None #get the track if has_bold(p): print("New track name:", track) track = has_bold(p) continue raw_text = unspace(p.get_text()) pdf = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) presentation = None try:
# Walk `page_lines` and build one [title, authors, pdf, presentation, volume,
# track] record per entry, under the header row written to "<FILE_NAME>.tsv".
# NOTE(review): `track`, `vol` and `presentation` are read below but are not
# assigned on every path inside this block; they are expected to be bound by
# the code that runs before it -- confirm.
# newline='' lets the csv writer control line endings itself, as the csv
# module documentation requires for files handed to csv.writer.
with open(FILE_NAME + ".tsv", 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])
    for p in page_lines:
        # A truthy is_centered() result becomes the current track (it ends up
        # in the last column of the following records); such an element
        # produces no record of its own.
        centered = is_centered(p)
        if centered:
            track = centered
            continue

        # Elements for which has_bold() is falsy are skipped entirely.
        if not has_bold(p):
            continue

        # Authors are taken from the text of the element's first <b> tag.
        authors = namify(remove_parenthesised(p.b.get_text()))
        print(authors)

        # Fix: the original tested `if has_pdf:` (the function object, always
        # truthy) and then joined unconditionally, so an entry without a link
        # was recorded with the bare page URL as its PDF. Call the helper
        # once and only join when it actually returned something.
        # presumably has_pdf() returns the link's href or a falsy value --
        # verify against its definition.
        pdf = has_pdf(p)
        pdf = urljoin(URL, pdf) if pdf else None

        title = get_title(p)
        if title:
            title = unspace(title)

        line = [title, authors, pdf, presentation, vol, track]
quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow( ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"]) for i in range(len(page_lines) - 2): j = 1 p = page_lines[i] p1 = page_lines[i + 1] if is_centered(p1): track = is_centered(p1) #writer.writerow([vol]) continue if has_bold(p1): authors = remove_parenthesised(p1.get_text()) authors = namify(authors) print(authors) else: continue if has_pdf: pdf = has_pdf(p) title = p.text # while len(p1.text.strip()) <= 3: # p1 = page_lines[i+j] # j = j+1 pdf = urljoin(URL, pdf) title = unspace(title)
"Volume_Name", "Pages", "Abstract", "Notes"]) for i in range(len(page_lines)-2): title=[] j = 1 p = page_lines[i] pj = page_lines[i+j] if has_pp(p): pages=extract_pages(p) else: continue while has_bold(pj): title+=pj.text j=j+1 pj = page_lines[i+j] title=unspace("".join(title)) print(title) authors=namify(pj.text) while "Abstract" not in pj.text: j=j+1 pj = page_lines[i+j] j=j+1 pj = page_lines[i+j] abstract=pj.text