# Load the small English spaCy pipeline once, before walking the page.
nlp = spacy.load("en_core_web_sm")
# Most recent track heading seen so far; stays None until a bold line appears.
track = None
vol = None
# newline='' lets the csv module control line endings itself; without it the
# writer's "\r\n" terminator is translated again on platforms that rewrite
# "\n", which produces blank rows.
with open(FILE_NAME + ".tsv", 'w', newline='') as f:
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for p in page_lines:
        authors = None
        # A line for which has_bold() returns something truthy is treated as
        # a track heading: keep that value as the current track and move on.
        bold = has_bold(p)
        if bold:
            track = bold
            # Report only after the assignment so the log shows the new
            # track rather than the previous one.
            print("New track name:", track)

            continue

        raw_text = unspace(p.get_text())
        pdf = None

        # has_pdf() is called once; its truthy result is used as the link
        # and resolved against the page URL.
        link = has_pdf(p)
        if link:
            pdf = urljoin(URL, link)

        presentation = None
        try:
Beispiel #2
0
# newline='' lets the csv module control line endings itself (see the csv
# module documentation); otherwise some platforms emit blank rows.
with open(FILE_NAME + ".tsv", 'w', newline='') as f:
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for p in page_lines:

        # A line for which is_centered() returns something truthy updates the
        # current track and carries no paper of its own.
        centered = is_centered(p)
        if centered:
            track = centered
            continue

        # Only lines with bold text are processed further; the bold element
        # supplies the author string.
        if not has_bold(p):
            continue
        authors = namify(remove_parenthesised(p.b.get_text()))
        print(authors)

        # The original tested `if has_pdf:` (the function object, always
        # true). Call it, and only build an absolute URL when a link was
        # actually found; joining a missing link would otherwise yield the
        # bare page URL as the "Pdf" value.
        pdf = has_pdf(p)
        if pdf:
            pdf = urljoin(URL, pdf)
        else:
            pdf = None

        # The title is extracted for every paper line, as before.
        title = get_title(p)
        if title:
            title = unspace(title)

        # presentation and vol are not assigned in this excerpt.
        # NOTE(review): presumably set earlier in the script - confirm.
        line = [title, authors, pdf, presentation, vol, track]
Beispiel #3
0
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    # Header row of the TSV output.
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    # Walk the page as (line, following line) pairs: p supplies the title and
    # the PDF link, p1 supplies the authors.
    for i in range(len(page_lines) - 2):
        # Look-ahead offset; unused in the visible part of the loop but kept
        # because the rest of the body is outside this excerpt.
        j = 1
        p = page_lines[i]
        p1 = page_lines[i + 1]

        # A following line for which is_centered() is truthy updates the
        # current track; no record is built for this pair.
        centered = is_centered(p1)
        if centered:
            track = centered
            continue

        # Without bold text on the following line there are no authors, so
        # the pair is skipped.
        if not has_bold(p1):
            continue
        authors = namify(remove_parenthesised(p1.get_text()))
        print(authors)

        # The original tested `if has_pdf:` (the function object, always
        # true). Call it, and only build an absolute URL when a link was
        # actually found; joining a missing link would otherwise yield the
        # bare page URL as the "Pdf" value.
        pdf = has_pdf(p)
        pdf = urljoin(URL, pdf) if pdf else None

        title = unspace(p.text)
                     "Volume_Name", "Pages", "Abstract", "Notes"])
                     
    # Scan the page for records. A record starts on a line accepted by
    # has_pp(); the lines after it are read for title, authors and abstract.
    # NOTE(review): the look-ahead index i+j below is never bounds-checked.
    # If page_lines is a plain list, a record near the end of the page, or
    # one with no line containing "Abstract", would raise IndexError - confirm.
    for i in range(len(page_lines)-2):
        # Collected as a list; `+=` with a string appends its characters one
        # by one, and they are joined back into a single string further down.
        title=[]
        # Offset of the look-ahead line relative to i.
        j = 1
        p = page_lines[i]

        pj = page_lines[i+j]

        # Lines rejected by has_pp() are skipped entirely.
        if has_pp(p):
            pages=extract_pages(p)
            
        else:
            continue

        # Consecutive lines accepted by has_bold() make up the title.
        while has_bold(pj):
            title+=pj.text
            j=j+1
            pj = page_lines[i+j]
        
        title=unspace("".join(title))
        print(title)
        # The first line after the title run is taken as the author line.
        authors=namify(pj.text)
        # Advance to the line containing "Abstract" ...
        while "Abstract" not in pj.text:
            j=j+1
            pj = page_lines[i+j]
        # ... and take the text of the line right after it as the abstract.
        j=j+1
        pj = page_lines[i+j]
        abstract=pj.text