pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)
        title=get_title(p)
        presentation = None
        if "presentation" in p.text:
            presentation = has_pdf(p)
            presentation = urljoin(URL, presentation)
        title_authors=p.get_text().split('–')
        if not pdf:
            try:
                title=title_authors.split[0]
        
            except:
                pass
        authors=has_italic(p)
        if not authors:
            try:
                authors=title_authors[1].split('[')[0]

                
            except:
                pass
        if authors:    
            authors=namify(authors)
        if not authors and not title and not pdf:
            continue
        line = [title, authors, pdf,presentation, track, raw_text]
        print(line)
        writer.writerow(line)
        raw_text = unspace(p.get_text())
        pdf = None
        #Skip presention line
        if has_pdf(p) and "presentation" not in p.text:
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)

        presentation = None
        if "presentation" in p.text:
            presentation = has_pdf(p)
            presentation = urljoin(URL, presentation)
        try:
             title = re.split(":", p.get_text())[1]
             title= re.split("--", title)[0]
             title = namify(title)
        except:
            continue
        
        
        print(has_italic(p))
        if not has_italic(p):
            continue
        authors=has_italic(p)
        authors=namify(authors)
        #authors=re.split(":", p.get_text())[0]
        
        line = [title, authors, pdf,presentation, track, raw_text]
        writer.writerow(line)
            pdf = has_pdf(p)
            print(pdf)
            title = get_title(p)
            if has_pdf(p1):
                title2 = get_title(p1)
                title = title + " " + title2

                p2 = page_lines[i + 2]
                j = 2
                p1 = p2

        if not pdf:
            continue

        while len(p1.text.strip()) <= 3:
            p1 = page_lines[i + j]
            j = j + 1
        try:
            pages = extract_pages(p)
        except:
            pass
        pdf = urljoin(URL, pdf)

        authors = remove_parenthesised(p1.get_text())
        authors = namify(authors)
        if has_italic(p1) == None:
            authors = None
        title = unspace(title)
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
        raw_text = unspace(p.get_text())
        pdf = None

        if has_pdf(p) and "presentation" not in p.text:
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)

        presentation = None
        if "presentation" in p.text:
            presentation = has_pdf(p)
            presentation = urljoin(URL, presentation)
        try:
            # Remove the first text before ":"
            title = re.split(":", p.get_text())[1:]
            title = ":".join(title)

            title = re.split("--", title)[0]
            title = unspace(title)
        except:
            continue

        if not has_italic(p):
            continue
        authors = has_italic(p)
        authors = namify(authors)

        line = [title, authors, pdf, presentation, track, raw_text]
        writer.writerow(line)
Beispiel #5
0
            print(pdf)
            title = get_title(p)
            if has_pdf(p1):
                title2 = get_title(p1)
                title = title + " " + title2

                p2 = page_lines[i + 2]
                j = 2
                p1 = p2

        if not pdf:
            continue

        while len(p1.text.strip()) <= 3:
            p1 = page_lines[i + j]
            j = j + 1
        try:
            pages = extract_pages(p)
        except:
            pass
        pdf = urljoin(URL, pdf)
        if has_italic(p1):
            authors = p1.get_text().split(",")[0]
        else:
            continue
        authors = namify(authors)

        title = unspace(title)
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
 # Collect the fields of one entry from paragraph `p`, with `p1` as the
 # fallback source for authors and pages.
 raw_text = unspace(p.get_text())
 pdf = None
 authors = None
 # Entries for which has_pdf(p) returns nothing are skipped.
 if has_pdf(p):
     pdf = has_pdf(p)
     pdf = urljoin(URL, pdf)
     #print(pdf)
 else:
     continue
 presentation = None
 try:
     # Skip the entry when get_title raises on this paragraph.
     title = get_title(p)
 except:
     continue
 # Prefer has_italic(p) for the authors; otherwise use has_italic(p1).
 if has_italic(p):
     authors = has_italic(p)
 else:
     authors = has_italic(p1)
 if authors:
     authors = remove_parenthesised(authors)
     authors = namify(authors)
 # Skip only when there are neither authors nor a title.
 if not authors and title == None:
     continue
 # Pages: try `p` first, then `p1`.
 # NOTE(review): if both calls raise, `pages` is not assigned here and keeps
 # any earlier value -- verify where it is initialised.
 try:
     pages = extract_pages(p)
 except:
     try:
         pages = extract_pages(p1)
     except:
         pass
Beispiel #7
0
            track = has_bold(p)

            continue

        raw_text = unspace(p.get_text())
        pdf = None
        authors = None
        if has_pdf(p):
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)

        presentation = None
        try:
            # Remove the first text before ":"
            title = has_italic(p)
            title = unspace(title)
        except:
            continue
        try:
            authors = p.text.split("–")[1]

        except:
            pass
        if authors:
            authors = remove_parenthesised(authors)
            authors = namify(authors)
        if not authors and title == None:
            continue
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)