# Write one CSV row per element of ``page_lines``.
#
# The element text is split on "--": the part before the first "--" is the
# title, the part after it (when present) is the author list.  ``URL``,
# ``vol``, ``track`` and ``writer`` are not assigned in this loop and must be
# defined before it.
for p in page_lines:
    # Resolve the element's link (if any) against the base URL.  The helper is
    # called once and its result reused.
    href = has_pdf(p)
    pdf = urljoin(URL, href) if href else None
    presentation = None

    try:
        # The title is everything before the first "--".
        title = unspace(p.text.split("--")[0])
    except Exception:
        # Best effort: skip elements whose title cannot be extracted, but do
        # not swallow KeyboardInterrupt / SystemExit as a bare ``except`` would.
        continue

    try:
        # The authors are the segment right after the first "--".
        authors = p.text.split("--")[1]
    except IndexError:
        # No "--" separator in this element: no author information.
        authors = None

    if authors:
        authors = namify(remove_parenthesised(authors))

    # NOTE(review): ``title`` comes from unspace(), so this guard only fires
    # if that helper can return None -- confirm.
    if not authors and title is None:
        continue

    line = [title, authors, pdf, presentation, vol, track]
    writer.writerow(line)
p1 = page_lines[i+1] if "abstract" in p1.text: track="abstract, full text not available" if "presentation" in p1.text: presentation = has_pdf(p1,1) presentation = urljoin(URL, presentation) try: pdf = has_pdf(p1) if pdf==None: continue except: continue #print(p1) authors = namify(p1.text.split("–")[0]) pdf = urljoin(URL, pdf) if pdf==URL: pdf=None try: title = p.text title = title.split(":")[1] except: title = p.text title = unspace(title) title= remove_parenthesised(title) line = [title, authors, pdf, presentation, vol, track] writer.writerow(line)
raw_text = unspace(p.get_text()) pdf = None #Skip presention line if has_pdf(p) and "presentation" not in p.text: pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) presentation = None if "presentation" in p.text: presentation = has_pdf(p) presentation = urljoin(URL, presentation) try: title = re.split(":", p.get_text())[1] title= re.split("--", title)[0] title = namify(title) except: continue print(has_italic(p)) if not has_italic(p): continue authors=has_italic(p) authors=namify(authors) #authors=re.split(":", p.get_text())[0] line = [title, authors, pdf,presentation, track, raw_text] writer.writerow(line)
track=None p = page_lines[i] p1 = page_lines[i+1] if "abstract" in p.text: track="abstract, full text not available" if "[" in p.text: title = remove_parenthesised(p.text) pdf=has_pdf(p) elif has_pdf(p): pdf=has_pdf(p) title=get_title(p) else: continue #print(p1) authors = namify(p1.text) print(authors) pdf = urljoin(URL, pdf) if pdf==URL: pdf=None title = unspace(title) line = [title, authors, pdf, presentation, vol, track] writer.writerow(line)
# Loop-body excerpt: the enclosing loop (which provides ``i``) is outside this
# view.  ``pdf``, ``presentation``, ``vol``, ``track`` and ``writer`` are not
# assigned here and must be defined before this point.
#
# Starting from element ``i``, walks forward through ``page_lines`` with the
# offset ``j`` to collect the title, the authors and the abstract of one entry.
# NOTE(review): none of the ``page_lines[i+j]`` lookups is bounds-checked in
# this excerpt; running past the end would raise IndexError -- confirm the
# input always contains the expected lines.
title=[]
j = 1
p = page_lines[i]
pj = page_lines[i+j]
# Only elements for which has_pp() is truthy start an entry; others are skipped.
if has_pp(p):
    pages=extract_pages(p)
else:
    continue
# Title: text of the consecutive following elements for which has_bold() is
# truthy.  ``+=`` with a string extends the list character by character; the
# join below reassembles the characters into one string.
while has_bold(pj):
    title+=pj.text
    j=j+1
    pj = page_lines[i+j]
title=unspace("".join(title))
print(title)
# ``pj`` is now the first element after the bold run; its text is the authors.
authors=namify(pj.text)
# Advance to the element whose text contains "Abstract" ...
while "Abstract" not in pj.text:
    j=j+1
    pj = page_lines[i+j]
# ... and take the text of the element right after it as the abstract.
j=j+1
pj = page_lines[i+j]
abstract=pj.text
line = [title, authors, pdf, presentation, vol, pages, abstract, track]
writer.writerow(line)
import sys
sys.path.append('..')
from common import namify, unspace

# Read every line of the local "names" file, then print the namify() form of
# each line, one per output line.
with open("names","r") as handle:
    names = handle.readlines()

for raw_name in names:
    print(namify(raw_name))
delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
# The line above is the tail of a call that opens before this excerpt; it
# passes a tab delimiter, '"' as quote character and minimal quoting.

# Header row: six columns, matching the six values written for each entry
# below (the ``track`` value goes into the "Notes" column).
writer.writerow(
    ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])
# Visit every element except the last one.  ``URL``, ``presentation`` and
# ``vol`` are not assigned in this loop and must be defined before it.
for i in range(len(page_lines) - 1):
    track = None
    p = page_lines[i]
    # ``p1`` is assigned but not read anywhere in the visible loop body.
    p1 = page_lines[i + 1]
    if "abstract" in p.text:
        track = "abstract, full text not available"
    # Only elements for which has_pdf() is truthy produce a row.
    if has_pdf(p):
        pdf = has_pdf(p)
        title = get_title(p)
    else:
        continue
    # Authors come from ``p.b.text`` -- presumably the element's first bold
    # tag; confirm against the parser in use.
    authors = namify(remove_parenthesised(p.b.text))
    pdf = urljoin(URL, pdf)
    # A link that resolves to the bare base URL counts as "no PDF".
    if pdf == URL:
        pdf = None
    title = unspace(title)
    line = [title, authors, pdf, presentation, vol, track]
    writer.writerow(line)
#!/usr/bin/env python3
"""
Reformat a list of names as "Last name, First name [Middle name]".

This script takes as input a string containing a list of names and outputs
the list of names in the form "Last name, First name [Middle name]".

Example usage:
    python Namify.py 'John B. Smith and Pablo Ramirez-Gonzalez and Bob Doe'
    >>>Smith, John B. and Ramirez-Gonzalez, Pablo and Doe, Bob

Author: Marie Dubremetz
"""
from nameparser import HumanName
import sys
import re
from common import namify

if __name__ == "__main__":
    # Exit with a usage message (status 1) instead of an IndexError traceback
    # when the name list is missing from the command line.
    if len(sys.argv) < 2:
        sys.exit("usage: python Namify.py '<list of names>'")
    line = sys.argv[1]
    print(namify(line))