delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow( ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"]) for p in page_lines: authors = None #get the track if has_bold(p): print("New track name:", track) track = has_bold(p) continue raw_text = unspace(p.get_text()) pdf = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) presentation = None try: # Remove the first text before ":" title = get_title(p) except: continue text = unspace(p.get_text()) if text[1] == ".":
nlp = spacy.load("en_core_web_sm") track = None vol=None pages=None with open(FILE_NAME+".tsv", 'w') as f: writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow(["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "Notes"]) for p in page_lines: raw_text = unspace(p.get_text()) pdf = None title=None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) else: continue presentation = None authors = p.text.split(":")[0] authors = namify(authors) if len(authors) <= 2 and title == None:
# Export one TSV row per entry of `page_lines`: the title and page number are
# derived from the raw line text, every other column is a fixed default.
#
# Relies on names defined outside this block: FILE_NAME, page_lines, vol,
# track, and the helpers extract_pages / remove_page / unspace.

# Defaults for the columns this block never fills in from the input.
pages = None
pdf = None
presentation = None
title = []
pages_in_pdf = None

# newline='' is what the csv module asks for on the underlying file object
# (otherwise line endings may be translated a second time); the explicit
# encoding keeps the output independent of the platform's locale.
with open(FILE_NAME + ".tsv", 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "pages in pdf", "Notes"
    ])
    for p in page_lines:
        authors = None  # no author information is extracted here

        # The page number is read from the raw line before it is stripped
        # out of the title text.
        pages = extract_pages(p)
        title = unspace(remove_page(p)).strip(".")
        title = unspace(title)

        # NOTE(review): the +4 offset presumably accounts for front-matter
        # pages in the PDF — confirm. int() raises if `pages` is not numeric.
        pages_in_pdf = int(pages) + 4
        # Progress output: report the value for the current row (printing
        # before the assignment showed the previous row's value instead).
        print(pages_in_pdf)

        # The last column is headed "Notes" but carries the `track` value.
        line = [
            title, authors, pdf, presentation, vol, pages, pages_in_pdf, track
        ]
        writer.writerow(line)
writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow([ "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "pages in pdf", "Notes" ]) for i in range(len(page_lines)): p = page_lines[i] p1 = page_lines[i + 1] authors_pages = p1.split(";") try: pages = unspace(authors_pages[1]) except: print(p1) continue authors = authors_pages[0] title = unspace(p) authors = unspace(authors) authors = namify(authors) vol = """Vol. 1: MT Researchers’ Track""" pages_in_pdf = str(int(pages) + 6) if len(title) < 4: continue line = [
pages = None pdf = None presentation = None title = [] pages_in_pdf = None with open(FILE_NAME+".tsv", 'w') as f: writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow(["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "pages in pdf", "Notes"]) for i in range(len(page_lines)): p = page_lines[i] p1 = page_lines[i+1] p2 = page_lines[i+2] title=unspace(p1) authors=unspace(p2) pages=unspace(p) try: pages=extract_pages(p) except: print(p1) continue title=unspace(title) authors = unspace(authors) authors = namify(authors) vol=unspace("""Vol.2: Commercial MT Users and Translators Track
writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow([ "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "Notes" ]) for i in range(len(page_lines) - 2): j = 1 p = page_lines[i] p1 = page_lines[i + 1] splitter = ":" line = unspace(p.text) sliced_line = line.split(splitter) try: left = splitter.join(sliced_line[:-1]) right = sliced_line[-1] except: print(sliced_line) continue raw_text = p.get_text().replace("\n", " ") pdf = None if has_pdf(p): pdf = has_pdf(p) title = get_title(p)
# Write the header row, then one TSV row per line of `page_lines` that
# contains a double-quoted segment.
#
# Relies on names defined outside this block: f (the open output file),
# page_lines, pdf, presentation, vol, track, and the helpers unspace,
# remove_parenthesised, remove_page, namify and extract_pages.
writer = csv.writer(f,
                    delimiter='\t',
                    quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)
writer.writerow([
    "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "Notes"
])
for string_line in page_lines:
    title_authors_pages = string_line.split("\"")
    try:
        # Text between the first and second double quote.
        quoted = title_authors_pages[1]
    except IndexError:
        # No double quote on this line: it is kept as the current track
        # value for the rows that follow, and no row is written for it.
        track = unspace(string_line)
        continue

    # Authors: the quoted segment with parenthesised text and the page
    # reference removed, whitespace-normalised, then passed through namify.
    authors = remove_parenthesised(quoted)
    authors = remove_page(authors)
    authors = unspace(authors)
    authors = namify(authors)

    # NOTE(review): the title is the text *before* the first quote and the
    # authors come from inside the quotes — confirm this matches the input.
    title = title_authors_pages[0]
    # The page number is extracted from the same quoted segment.
    pages = extract_pages(quoted)

    # Skip rows whose title is shorter than four characters.
    if len(title) < 4:
        continue

    # The last column is headed "Notes" but carries the `track` value.
    line = [title, authors, pdf, presentation, vol, pages, track]
    writer.writerow(line)
with open(FILE_NAME + ".tsv", 'w') as f: writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow( ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"]) for i in range(len(page_lines) - 2): j = 1 p = page_lines[i] p1 = page_lines[i + 1] p0 = page_lines[i - 1] if is_subtitle(p): track = is_subtitle(p) track = unspace(track) #writer.writerow([vol]) continue authors = None if finish_by_digit(p): authors = p.text else: continue title = p0.get_text() pdf = None pdf = has_pdf(p0) if pdf: title = get_title(p0) pdf = urljoin(URL, pdf)
continue raw_text = p.get_text().replace("\n", " ") pdf = None if has_pdf(p): pdf = has_pdf(p) print(pdf) title = get_title(p) if has_pdf(p1): title2 = get_title(p1) title = title + " " + title2 p2 = page_lines[i + 2] j = 2 p1 = p2 if not pdf: continue while len(p1.text.strip()) <= 3: p1 = page_lines[i + j] j = j + 1 pdf = urljoin(URL, pdf) authors = remove_parenthesised(p0.get_text()) authors = unspace(authors).strip("\"") title = title.strip("\"") title = unspace(title) line = [title, authors, pdf, presentation, vol, track] writer.writerow(line)
writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow([ "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "pages in pdf", "Notes" ]) for i in range(len(page_lines)): p = page_lines[i] title_authors = p.split("\"") try: title = unspace(title_authors[0]) authors = title_authors[1].split(";")[0] except: continue authors = unspace(authors) authors = namify(authors) pages = extract_pages(p) print(pages) try: pages = extract_pages(pages) except: pass pages_in_pdf = str(int(pages) + 4) if len(title) < 4:
"Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "Notes" ]) for p in page_lines: track = None pdf = None title = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) title = get_title(p) if "abstract" in p.text: track = "abstract" else: pass authors = None try: title = unspace(unspace(title)) title = remove_parenthesised(title) except: pass presentation = None line = [title, authors, pdf, presentation, vol, pages, track] writer.writerow(line)
# Create FILE_NAME + ".tsv" and write one row per line of `page_lines` that
# contains a double-quoted segment.
#
# Relies on names defined outside this block: FILE_NAME, page_lines, pdf,
# presentation, vol, track, and the helpers unspace, remove_parenthesised,
# remove_page, namify and extract_pages.
#
# newline='' is what the csv module asks for on the underlying file object
# (otherwise line endings may be translated a second time); the explicit
# encoding keeps the output independent of the platform's locale.
with open(FILE_NAME + ".tsv", 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "Notes"
    ])
    for string_line in page_lines:
        title_authors_pages = string_line.split("\"")
        try:
            # Text between the first and second double quote.
            quoted = title_authors_pages[1]
        except IndexError:
            # No double quote on this line: it is kept as the current track
            # value for the rows that follow, and no row is written for it.
            track = unspace(string_line)
            continue

        # Authors: the quoted segment with parenthesised text and the page
        # reference removed, then passed through namify.
        authors = remove_parenthesised(quoted)
        authors = remove_page(authors)
        authors = namify(authors)

        # NOTE(review): the title is the text *before* the first quote and
        # the authors come from inside the quotes — confirm this matches
        # the input format.
        title = title_authors_pages[0]
        # The page number is extracted from the same quoted segment.
        pages = extract_pages(quoted)

        # Skip rows whose title is shorter than four characters.
        if len(title) < 4:
            continue

        # The last column is headed "Notes" but carries the `track` value.
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow([ "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "Pages_in_pdf", "Notes" ]) for i in range(len(page_lines)): p = page_lines[i] p1 = page_lines[i + 1] authors_title = p.split("\"") try: title = unspace(authors_title[1]) authors = authors_title[0] except: continue authors = unspace(authors) authors = namify(authors) pages = extract_pages(p1) print(pages) try: pages = extract_pages(pages) except: pass pages_in_pdf = str(int(pages) + 5) if len(title) < 4: continue
pdf=None presentation=None with open(FILE_NAME+".tsv", 'w') as f: writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow(["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "Abstract", "Notes"]) for i in range(len(page_lines)-2): j = 1 p = page_lines[i] p1 = page_lines[i+1] if is_subtitle(p): track = is_subtitle(p) track = unspace(track) continue authors=None if has_pp(p): pages=extract_pages(p) print(pages) else: continue title_authors = p.get_text().split(":")[1].split("–") title = unspace(title_authors[0]) authors = title_authors[1] authors = remove_parenthesised(authors) authors = namify(authors) abstract = p1.text
with open(FILE_NAME+".tsv", 'w') as f: writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow(["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages", "Notes"]) for p in page_lines: track = None try: pages = extract_pages(p) except: pass splitter = ":" line = unspace(p.text) sliced_line = line.split(splitter) try: left = sliced_line[0] right = ":".join(sliced_line[1:]) except: print(sliced_line) continue raw_text = unspace(p.get_text()) pdf = None title = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf)
quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow( ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"]) for i in range(len(page_lines) - 1): p = page_lines[i] p1 = page_lines[i + 1] if has_pdf(p): pdf = has_pdf(p) title = get_title(p) title = remove_digit(title) else: continue #print(p1) authors = remove_digit(p1.text) authors = unspace(authors) authors = authors.split(",")[0] authors = authors.strip(".") authors = namify(authors) print(authors) pdf = urljoin(URL, pdf) title = unspace(title) line = [title, authors, pdf, presentation, vol, track] writer.writerow(line)
]) for i in range(len(page_lines) - 3): j = 1 p = page_lines[i] p1 = page_lines[i + 1] raw_text = p.get_text().replace("\n", " ") pdf = None title = p.text while len(p1.text.strip()) <= 3: p1 = page_lines[i + j] j = j + 1 try: pages = extract_pages(p) except: pass #pdf = urljoin(URL, pdf) authors = p1.text authors = ",".join(authors.split(",")[:-1]) authors = remove_parenthesised(authors) authors = ",".join(authors.split(",")[:-1]) authors = namify(authors) title = unspace(title) line = [title, authors, pdf, presentation, vol, pages, track] writer.writerow(line)
title=[] j = 1 p = page_lines[i] pj = page_lines[i+j] if has_pp(p): pages=extract_pages(p) else: continue while has_bold(pj): title+=pj.text j=j+1 pj = page_lines[i+j] title=unspace("".join(title)) print(title) authors=namify(pj.text) while "Abstract" not in pj.text: j=j+1 pj = page_lines[i+j] j=j+1 pj = page_lines[i+j] abstract=pj.text line = [title, authors, pdf, presentation, vol, pages, abstract, track] writer.writerow(line)
nlp = spacy.load("en_core_web_sm") track = None vol = None with open(FILE_NAME + ".tsv", 'w') as f: writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow( ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"]) for p in page_lines: #get the track raw_text = unspace(p.get_text()) pdf = None authors = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) presentation = None try: # Remove the first text before ":" title = p.text.split("--")[0] title = unspace(title) except: continue try:
delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) writer.writerow( ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"]) for p in page_lines: authors = None #get the track if has_bold(p): print("New track name:", track) track = has_bold(p) continue raw_text = unspace(p.get_text()) pdf = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) presentation = None text = unspace(p.get_text()) try: title = text.split(":")[0] title = unspace(remove_parenthesised(title))