Ejemplo n.º 1
0
    # Write one row per element of page_lines. The text of each element is
    # split on "--": the part before it is the title, the part after it the
    # author list.
    for p in page_lines:
        raw_text = unspace(p.get_text())
        authors = None
        presentation = None

        # Look the link up once; a truthy result is resolved against URL.
        link = has_pdf(p)
        pdf = urljoin(URL, link) if link else None

        try:
            # The title is everything before the first "--" separator.
            title = unspace(p.text.split("--")[0])
        except Exception:
            # Best effort: skip a line whose title cannot be extracted, but
            # let KeyboardInterrupt / SystemExit propagate.
            continue

        try:
            authors = p.text.split("--")[1]
        except IndexError:
            # No "--" separator on this line, so there is no author part.
            pass

        if authors:
            authors = namify(remove_parenthesised(authors))

        # NOTE(review): unspace's return type is not visible here; if it always
        # returns a string this guard never fires -- confirm whether an
        # empty-title check (`not title`) was intended.
        if not authors and title is None:
            continue

        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)
Ejemplo n.º 2
0
        # Body of a loop over page_lines whose header is outside this excerpt.
        # The line following the current one supplies the note, the links and
        # the authors; the current line `p` supplies the title.
        p1 = page_lines[i+1]
        if "abstract" in p1.text:
            track="abstract, full text not available"

        if "presentation" in p1.text:
            # has_pdf is called with a second argument of 1 here; presumably it
            # selects the presentation link instead of the paper link -- TODO confirm.
            presentation = has_pdf(p1,1)
            presentation = urljoin(URL, presentation)

        # Skip the entry when no link can be obtained from the following line
        # (has_pdf returned None or raised).
        try:
            pdf = has_pdf(p1)
            if pdf==None:
                continue
        except:
            continue        
        #print(p1)
        # Authors: the text before the first en dash of the following line.
        authors = namify(p1.text.split("–")[0])
       

        pdf = urljoin(URL, pdf)
        # A join result equal to URL itself is treated as "no link".
        if pdf==URL:
            pdf=None
        # Title: the second ":"-separated field of the current line, falling
        # back to the whole line text when that field does not exist.
        try:    
            title = p.text
            title = title.split(":")[1]
        except:
            title = p.text
        title = unspace(title)
        title= remove_parenthesised(title)
        # vol is not assigned in this excerpt; it comes from the enclosing scope.
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)
        # Body of a loop over page_lines whose header is outside this excerpt.
        # Writes one row per line that has a ":"-separated title and italic text.
        raw_text = unspace(p.get_text())
        pdf = None
        # Lines mentioning "presentation" do not provide the paper link.
        if has_pdf(p) and "presentation" not in p.text:
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)

        presentation = None
        if "presentation" in p.text:
            presentation = has_pdf(p)
            presentation = urljoin(URL, presentation)
        # Title: the second ":"-separated field of the line, cut at the first
        # "--". Lines for which this fails are skipped.
        # NOTE(review): namify is applied to the title here, while below it is
        # applied to the authors -- confirm this is intended.
        try:
             title = re.split(":", p.get_text())[1]
             title= re.split("--", title)[0]
             title = namify(title)
        except:
            continue
        
        
        # Debug output of the has_italic lookup.
        print(has_italic(p))
        # Authors come from has_italic(p); a falsy result skips the line.
        if not has_italic(p):
            continue
        authors=has_italic(p)
        authors=namify(authors)
        #authors=re.split(":", p.get_text())[0]
        
        # track is not assigned in this excerpt; it comes from the enclosing scope.
        line = [title, authors, pdf,presentation, track, raw_text]
        writer.writerow(line)
Ejemplo n.º 4
0
        # Body of a loop over indices of page_lines whose header is outside
        # this excerpt. The current line gives title and link, the following
        # line gives the authors.
        track=None

        p = page_lines[i]
        p1 = page_lines[i+1]
        if "abstract" in p.text:
            track="abstract, full text not available"

        # Two kinds of line produce a row: one containing "[" (title is the
        # line text passed through remove_parenthesised) and one for which
        # has_pdf is truthy (title comes from get_title). Anything else is skipped.
        if "[" in p.text:
            title = remove_parenthesised(p.text)
            pdf=has_pdf(p)
        elif has_pdf(p):
            pdf=has_pdf(p)
            title=get_title(p)

        else:
            continue        
        #print(p1)




        authors = namify(p1.text)
        # Debug output of the parsed authors.
        print(authors)

        pdf = urljoin(URL, pdf)
        # A join result equal to URL itself is treated as "no link".
        if pdf==URL:
            pdf=None
        title = unspace(title)
        # presentation and vol are not assigned in this excerpt; they come
        # from the enclosing scope.
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)
        # Body of a loop over indices of page_lines whose header is outside
        # this excerpt. Starting from a line for which has_pp is truthy, it
        # walks forward with offset j to collect title, authors and abstract.
        title=[]
        j = 1
        p = page_lines[i]

        pj = page_lines[i+j]

        # Only lines accepted by has_pp start an entry; others are skipped.
        if has_pp(p):
            pages=extract_pages(p)
            
        else:
            continue

        # Consecutive lines accepted by has_bold form the title. `+=` on the
        # list extends it with the elements of pj.text (its characters, if it
        # is a str); they are joined back into one string below.
        # No bounds check: page_lines[i+j] raises IndexError past the end.
        while has_bold(pj):
            title+=pj.text
            j=j+1
            pj = page_lines[i+j]
        
        title=unspace("".join(title))
        # Debug output of the assembled title.
        print(title)
        # The first line after the title lines supplies the authors.
        authors=namify(pj.text)
        # Advance to the line containing "Abstract"; the line after it is
        # taken as the abstract text.
        while "Abstract" not in pj.text:
            j=j+1
            pj = page_lines[i+j]
        j=j+1
        pj = page_lines[i+j]
        abstract=pj.text


        # pdf, presentation, vol and track are not assigned in this excerpt;
        # they come from the enclosing scope.
        line = [title, authors, pdf, presentation, vol, pages, abstract, track]
        writer.writerow(line)
import sys
sys.path.append('..')
from common import namify, unspace

# Print the namified form of every line of the file "names".
with open("names", "r") as f:
    for raw_line in f:
        print(namify(raw_line))
Ejemplo n.º 7
0
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    # Header row; the columns follow the order of the rows written below.
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    # NOTE(review): the range stops one short, so the final element of
    # page_lines is never examined -- confirm that this is intended.
    for i in range(len(page_lines) - 1):
        p = page_lines[i]

        track = None
        if "abstract" in p.text:
            track = "abstract, full text not available"

        # Look the link up once; lines without one produce no row.
        link = has_pdf(p)
        if not link:
            continue
        title = get_title(p)

        # Authors are read from p.b (presumably the line's first <b> element
        # -- TODO confirm); this raises AttributeError when p.b is None.
        authors = namify(remove_parenthesised(p.b.text))

        pdf = urljoin(URL, link)
        # A join result equal to URL itself is treated as "no link".
        if pdf == URL:
            pdf = None
        title = unspace(title)
        # presentation and vol are not assigned in this excerpt; they come
        # from the enclosing scope.
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)
Ejemplo n.º 8
0
#!/usr/bin/env python3
"""
This script takes as input a string containing a list of names and outputs the names in the form "Last name, First name [Middle name]".


Example usage:

python Namify.py 'John B. Smith and Pablo Ramirez-Gonzalez and Bob Doe'
>>>Smith, John B. and Ramirez-Gonzalez, Pablo and Doe, Bob


Author: Marie Dubremetz
"""
from nameparser import HumanName
import sys
import re
from common import namify

if __name__ == "__main__":
    # The name list is expected as the first command-line argument; without
    # it, exit with a usage message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: python Namify.py 'John B. Smith and Pablo Ramirez-Gonzalez'")
    print(namify(sys.argv[1]))