writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for p in page_lines:
        # A line for which is_centered() is truthy only updates the current
        # track; it is not processed any further.
        centered = is_centered(p)
        if centered:
            track = centered
            # Report the track that was just read. Printing before the
            # assignment showed the previous track instead (and needed
            # `track` to exist already on the first heading).
            print("New track name:", track)
            continue

        raw_text = unspace(p.get_text())
        title = None

        # Resolve the link reported by has_pdf() against the base URL;
        # pdf stays None when there is no link.
        link = has_pdf(p)
        pdf = urljoin(URL, link) if link else None

        # Lines mentioning "presentation" reuse the same link as the
        # presentation URL.
        presentation = None
        if "presentation" in p.text:
            presentation = urljoin(URL, link)

        # A title is only extracted for lines that carry a pdf link; the line
        # is skipped when get_title() fails on it.
        if pdf:
            try:
                title = get_title(p)
            except Exception:
                continue
Exemple #2
0
vol = None
# newline='' hands line-ending control to the csv module; without it every
# row gains an extra "\r" on platforms that translate "\n" on write.
with open(FILE_NAME + ".tsv", 'w', newline='') as f:
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for p in page_lines:
        raw_text = unspace(p.get_text())
        authors = None

        # Resolve the link reported by has_pdf() against the base URL;
        # pdf stays None when there is no link.
        link = has_pdf(p)
        pdf = urljoin(URL, link) if link else None

        presentation = None
        try:
            # The title is whatever precedes the first "--" separator.
            title = p.text.split("--")[0]
            title = unspace(title)
        except Exception:
            # Skip lines whose text cannot be split or normalised.
            continue
        try:
            authors = p.text.split("--")[1]

        except:
Exemple #3
0
    for i in range(len(page_lines) - 1):
        # Each line is examined together with the line that follows it.
        p = page_lines[i]
        p1 = page_lines[i + 1]

        # A subtitle line sets the current track, is written out as a
        # one-cell row, and is not processed any further.
        subtitle = is_subtitle(p)
        if subtitle:
            track = subtitle
            # Print after assigning so the message shows the new track
            # rather than the previous one.
            print("assigne", track)
            writer.writerow([track])
            continue

        raw_text = p.get_text().replace("\n", " ")

        # Skip presentation lines: their link is not the paper's pdf.
        pdf = None
        link = has_pdf(p)
        if link and "presentation" not in p.text:
            pdf = link
            print(pdf)

        # The presentation link, if any, is read from the following line.
        presentation = None
        if "presentation" in p1.text:
            presentation = has_pdf(p1)

        if not pdf:
            continue

        # Authors are the part of the following line before the first run of
        # digits. The pattern is a raw string so "\d" is not treated as an
        # (invalid) string escape.
        l_1 = p1.get_text()
        authors = re.split(r"\d+", l_1)[0]
        print(authors)
        authors = namify(authors)
        title = get_title(p)
Exemple #4
0
        # Fragment of a loop body over page_lines (the loop header is not
        # visible here); p1 is the line following the current one.
        p1 = page_lines[i + 1]

        # Split the unspace()-processed line text on ":". `left` is everything
        # before the last colon (re-joined with ":"), `right` is what follows.
        splitter = ":"
        line = unspace(p.text)
        sliced_line = line.split(splitter)
        try:
            left = splitter.join(sliced_line[:-1])
            right = sliced_line[-1]
        except:
            # NOTE(review): if `line` is a str, split() always returns at least
            # one element, so this branch looks unreachable -- confirm.
            print(sliced_line)
            continue

        raw_text = p.get_text().replace("\n", " ")

        pdf = None
        if has_pdf(p):
            pdf = has_pdf(p)

            title = get_title(p)
            # When has_pdf() is also truthy for the next line, that line's
            # title is appended to this one and p1 moves to the line after it.
            if has_pdf(p1):
                title2 = get_title(p1)
                title = title + " " + title2

                p2 = page_lines[i + 2]
                j = 2
                p1 = p2

        # Lines for which has_pdf(p) was falsy are skipped.
        if not pdf:
            continue

        while len(p1.text.strip()) <= 3:
Exemple #5
0
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for i in range(len(page_lines) - 2):
        # j is the look-ahead offset that the blank-line skipping (currently
        # disabled in this loop) would use.
        j = 1
        # p is read as the title line and p1, the line after it, as the
        # author line.
        p = page_lines[i]
        p1 = page_lines[i + 1]

        # A centered following line only updates the current track.
        centered = is_centered(p1)
        if centered:
            track = centered
            continue

        # Only pairs whose second line has_bold() produce a row.
        if not has_bold(p1):
            continue
        authors = remove_parenthesised(p1.get_text())
        authors = namify(authors)
        print(authors)

        # The original tested `if has_pdf:` (the function object, which is
        # always true), so a line without a link ended up with the bare base
        # URL as its pdf. Test the call result and leave pdf empty instead.
        link = has_pdf(p)
        pdf = urljoin(URL, link) if link else None

        title = unspace(p.text)
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)
                j = 2
                p1 = p2

        else:
            continue

        if authors==None:
            continue
        title=p1.text
        track=None
        if "abstract" in title:
            track= "abstract"
        title=remove_parenthesised(title)

        while len(p1.text.strip()) <= 3:
            p1 = page_lines[i+j]
            j = j+1
        try:
            pages = extract_pages(p)
        except:
            pass
        pdf=has_pdf(p1)
        pdf = urljoin(URL, pdf)
        print(authors)
        authors=remove_parenthesised(authors)
        authors = namify(authors)

        title = unspace(title)
        line = [title, authors, pdf, presentation, vol,pages, track]
        writer.writerow(line)
Exemple #7
0
# newline='' hands line-ending control to the csv module; without it every
# row gains an extra "\r" on platforms that translate "\n" on write.
with open(FILE_NAME+".tsv", 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["Title", "Authors", "Pdf", "Presentation",
                     "Volume_Name", "Notes"])

    for i in range(len(page_lines)-1):
        # Per-row state is reset on every iteration. `presentation` was
        # previously left over from whichever earlier line last set it.
        track = None
        presentation = None

        p = page_lines[i]
        p1 = page_lines[i+1]
        if "abstract" in p1.text:
            track = "abstract, full text not available"

        # The presentation link is requested from has_pdf() with index 1.
        if "presentation" in p1.text:
            presentation = has_pdf(p1, 1)
            presentation = urljoin(URL, presentation)

        # Lines where has_pdf() fails or returns None are skipped.
        try:
            pdf = has_pdf(p1)
        except Exception:
            continue
        if pdf is None:
            continue

        # Authors are the text before the first en dash on the line.
        authors = namify(p1.text.split("–")[0])

        # A link that resolves to the base URL itself is treated as no pdf.
        pdf = urljoin(URL, pdf)
        if pdf == URL:
            pdf = None
Exemple #8
0
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "Notes"
    ])

    # Walk the lines with a one-line look-ahead (p, p1). The range stops two
    # short of the end, which keeps page_lines[i + 2] below in range.
    for i in range(len(page_lines) - 2):
        j = 1
        p = page_lines[i]
        p1 = page_lines[i + 1]

        # Flatten the line text onto one line and echo it.
        raw_text = p.get_text().replace("\n", " ")
        print(raw_text)
        pdf = None
        # has_pdf(p) truthy on a line that does not mention "abstract":
        # take its result as the pdf and read the title from the same line.
        if has_pdf(p) and "abstract" not in p.text:
            pdf = has_pdf(p)
            print(pdf)
            title = get_title(p)
            # When has_pdf() is also truthy for the next line, that line's
            # title is appended to this one and p1 moves to the line after it.
            if has_pdf(p1):
                title2 = get_title(p1)
                title = title + " " + title2

                p2 = page_lines[i + 2]
                j = 2
                p1 = p2

        # No pdf taken above: lines mentioning "abstract" get their title from
        # remove_parenthesised(p.text) and pdf from has_pdf(p).
        if not pdf:
            if "abstract" in p.text:
                title = remove_parenthesised(p.text)
                pdf = has_pdf(p)
        # p is the current line, p1 the next one and p0 the previous one.
        # NOTE(review): if page_lines is a list and i can be 0, then
        # page_lines[i - 1] is the LAST element -- confirm against the loop.
        j = 1
        p = page_lines[i]
        p1 = page_lines[i + 1]
        p0 = page_lines[i - 1]
        # A subtitle line only updates the current track (after unspace()).
        if is_subtitle(p):
            track = is_subtitle(p)
            track = unspace(track)
            #writer.writerow([vol])
            continue
        # Only lines for which finish_by_digit(p) is truthy are used as author
        # lines; every other line is skipped.
        authors = None
        if finish_by_digit(p):
            authors = p.text
        else:
            continue
        # The title comes from the previous line: its plain text by default,
        # or get_title(p0) when has_pdf(p0) is truthy.
        title = p0.get_text()

        pdf = None
        pdf = has_pdf(p0)
        if pdf:
            title = get_title(p0)

        # A pdf that comes back from urljoin equal to the base URL is reset
        # to None.
        pdf = urljoin(URL, pdf)
        if pdf == URL:
            pdf = None
        authors = namify(authors)

        title = unspace(remove_digit(unspace(title)))
        print(repr(title))
        # NOTE(review): this row has six values, while the header row visible
        # above this loop lists seven columns (including "Pages") -- confirm
        # both belong to the same writer.
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)