import os
import pickle

import numpy as np
import requests
import dask.bag as db
from bs4 import BeautifulSoup

# text_proc, convert, convert_pdf_to_txt and the crawler object C are
# assumed to be provided elsewhere in the project.


def mess_length(word_length, bcm):
    # Re-score a fixed-length slice of the saved "big complex mess" corpus.
    with open('bcm.p', 'rb') as f:
        big_complex_mess = pickle.load(f)
    reduced = big_complex_mess[0:word_length]
    urlDat = {}
    pmegmess = text_proc(reduced, urlDat)
    return word_length, pmegmess
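
# A minimal usage sketch (assumption): bcm.p must already exist, written by
# get_bmarks() below, and text_proc is assumed to return a dict of scores
# keyed like the 'standard' field used later in this listing.
for n in (100, 500, 1000):
    length, scores = mess_length(n, None)  # the bcm argument is unused
    print(length, scores.get('standard'))
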
def check_self_contained(file_name):
    # Score a local benchmark file from the corpus directory.
    royal = '../BenchmarkCorpus/' + str(file_name)
    with open(royal) as klpdr:
        strText = klpdr.read()
    urlDat = {'link': 'local_resource_royal'}
    klpdfr = text_proc(strText, urlDat, WORD_LIM=100)
    return klpdfr
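
# Hypothetical call, assuming ../BenchmarkCorpus/royal.txt exists as the
# other snippets in this listing suggest.
royal_scores = check_self_contained('royal.txt')
print(royal_scores)
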
def convert_and_score(self, f):
    urlDat = {}
    b = os.path.getsize(f)  # file size; recorded but not currently used
    with open(f, 'rb') as fh:
        link_tuple = pickle.load(fh)
    se_b, page_rank, link, category, buff_ = link_tuple
    if buff_ is not None:
        urlDat = {'link': link, 'page_rank': page_rank, 'se': se_b,
                  'query': category, 'file': f}
        urlDat = text_proc(buff_, urlDat, WORD_LIM=self.mwl)
    return urlDat
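
# The method above unpickles a 5-tuple per file. A sketch of that layout,
# inferred only from the unpacking above; the field values are made up.
example_tuple = ('google', 1, 'https://example.org', 'science',
                 'some scraped text')  # (se, page_rank, link, query, buffer)
with open('example_link.p', 'wb') as f:
    pickle.dump(example_tuple, f)
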
def get_bmarks():
    xkcd_self_sufficient = 'http://splasho.com/upgoer5/library.php'
    high_standard = (
        'https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMjc3MjUvZWxpZmUtMjc3MjUtdjIucGRm/elife-27725-v2.pdf?_hash=WA%2Fey48HnQ4FpVd6bc0xCTZPXjE5ralhFP2TaMBMp1c%3D'
    )
    the_science_of_writing = (
        'https://cseweb.ucsd.edu/~swanson/papers/science-of-writing.pdf')
    # Note: this page is so obfuscated that even the English-language
    # classifier rejects it.
    pmeg = 'http://www.elsewhere.org/pomo/'
    this_manuscript = 'https://www.overleaf.com/read/dqkttvmqjvhn'
    this_readme = 'https://github.com/russelljjarvis/ScienceAccessibility'
    links = [
        xkcd_self_sufficient, high_standard, the_science_of_writing,
        this_manuscript, this_readme
    ]
    urlDats = list(map(process, links))

    # Grab this constantly changing page ten times so we can take mean scores.
    pmegs = []
    for i in range(10):
        p = process(pmeg)
        if p is not None:
            pmegs.append(p)
    if pmegs:
        urlDats.append(pmegs[0])  # already processed; append the dict as-is
    urlDat = {}
    big_complex_mess = ''
    for p in pmegs:
        for s in p['tokens']:
            big_complex_mess += s + ' '
    bcm = ''
    for p in pmegs[0:2]:
        for s in p['tokens']:
            bcm += s + ' '

    pmegmess_2 = text_proc(bcm, urlDat)
    with open('bcm.p', 'wb') as f:
        pickle.dump(big_complex_mess, f)

    # Attach mean scores over the repeated fetches to the last entry.
    urlDats[-1]['standard'] = np.mean([p['standard'] for p in pmegs])
    urlDats[-1]['sp'] = np.mean([p['sp'] for p in pmegs])
    urlDats[-1]['gf'] = np.mean([p['gf'] for p in pmegs])
    with open('benchmarks.p', 'wb') as f:
        pickle.dump(urlDats, f)

    return urlDats
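
# Once get_bmarks() has run, the pickled benchmarks can be reloaded and
# compared; a minimal sketch.
with open('benchmarks.p', 'rb') as f:
    saved = pickle.load(f)
for d in saved:
    if d is not None:
        print(d['link'], d.get('standard'))
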
def process(link):
    urlDat = {}
    urlDat['link'] = link
    urlDat['page_rank'] = 'benchmark'
    # HTML pages go through the crawler C and convert(); PDFs are streamed
    # and extracted.
    if 'pdf' not in link:
        content = C.open(link).content
        buffer = convert(content, urlDat['link'])
    else:
        pdf_file = requests.get(link, stream=True)
        buffer = convert_pdf_to_txt(pdf_file)

    urlDat = text_proc(buffer, urlDat)
    return urlDat
def get_bmarks():
    xkcd_self_sufficient = 'http://splasho.com/upgoer5/library.php'
    high_standard = (
        'https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMjc3MjUvZWxpZmUtMjc3MjUtdjIucGRm/elife-27725-v2.pdf?_hash=WA%2Fey48HnQ4FpVd6bc0xCTZPXjE5ralhFP2TaMBMp1c%3D'
    )
    the_science_of_writing = (
        'https://cseweb.ucsd.edu/~swanson/papers/science-of-writing.pdf')
    # Note: this page is so obfuscated that even the English-language
    # classifier rejects it.
    pmeg = 'http://www.elsewhere.org/pomo/'
    links = [xkcd_self_sufficient, high_standard, the_science_of_writing, pmeg]
    royal = '../BenchmarkCorpus/royal.txt'
    klpd = '../BenchmarkCorpus/planning_document.txt'
    with open(klpd) as klpdf:
        strText = klpdf.read()
    urlDat = {'link': 'local_resource'}
    klpdfp = text_proc(strText, urlDat, WORD_LIM=100)

    # Fetch and score the web links in parallel with a dask bag.
    grid = db.from_sequence(links, npartitions=8)
    urlDats = list(db.map(process, grid).compute())
    urlDats.append(klpdfp)
    print(urlDats)

    with open(royal) as klpdr:
        strText = klpdr.read()
    urlDat = {'link': 'local_resource_royal'}
    klpdfr = text_proc(strText, urlDat, WORD_LIM=100)
    print(klpdfr)
    urlDats.append(klpdfr)

    with open('benchmarks.p', 'wb') as f:
        pickle.dump(urlDats, f)
    return urlDats
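
# The parallel fetch above uses dask.bag; the same pattern in isolation,
# with failed fetches (which the robust process() at the end of this
# listing signals by returning None) filtered out.
links = ['http://splasho.com/upgoer5/library.php']  # any URL list works
bag = db.from_sequence(links, npartitions=8)
cleaned = [d for d in bag.map(process).compute() if d is not None]
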
def convert_and_score(f):
    urlDat = {}
    b = os.path.getsize(f)  # file size; recorded but not currently used
    with open(f, 'rb') as fh:
        link_tuple = pickle.load(fh)
    se_b, page_rank, link, category, buffer = link_tuple
    if buffer is not None:
        urlDat = {
            'link': link,
            'page_rank': page_rank,
            'se': se_b,
            'query': category,
            'file': f
        }
        urlDat = text_proc(buffer, urlDat)
        print(urlDat)

    return urlDat
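
# A sketch of scoring every pickled link tuple in a directory; the
# pickles/ path is hypothetical.
import glob
results = [convert_and_score(f) for f in glob.glob('pickles/*.p')]
scored = [r for r in results if r]  # empty dicts mean the buffer was None
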
def get_greg_nicholas():
    urlDat = {}
    urlDat['link'] = 'nicholas'
    urlDat['page_rank'] = 'nicholas'
    with open('local_text.txt', 'r') as file1:
        new_str = file1.read()
    urlDat = text_proc(new_str, urlDat)
    print(urlDat)
    # Add the result to the saved benchmarks.
    with open('benchmarks.p', 'rb') as f:
        urlDats = pickle.load(f)
    urlDats.append(urlDat)
    with open('benchmarks.p', 'wb') as f:
        pickle.dump(urlDats, f)

    return urlDat
def process(link):
    urlDat = {}
    urlDat['link'] = link
    urlDat['page_rank'] = 'benchmark'

    try:
        if 'pdf' not in link:
            content = C.open(link).content

            soup = BeautifulSoup(content, 'html.parser')
            for script in soup(['script', 'style']):
                script.extract()  # rip out scripts and styles
            text = soup.get_text()
            # Organize the text: break into lines, strip leading and
            # trailing space, break multi-headlines into a line each, and
            # drop blank lines.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines
                      for phrase in line.split('  '))
            text = '\n'.join(chunk for chunk in chunks if chunk)
            bufferd = str(text)
        else:
            pdf_file = requests.get(link, stream=True)
            bufferd = convert_pdf_to_txt(pdf_file)

        urlDat = text_proc(bufferd, urlDat)

    except Exception as e:
        print('failed to process', link, e)
        urlDat = None
    return urlDat
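
# convert_pdf_to_txt is assumed to wrap a PDF text extractor; a minimal
# sketch of such a helper using pdfminer.six on a streamed requests
# response. This is an assumption, not necessarily the project's version.
import io
from pdfminer.high_level import extract_text

def convert_pdf_to_txt(response):
    # extract_text accepts any binary file-like object.
    return extract_text(io.BytesIO(response.content))
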
#authors['markram'] = MARKRAM
#authors['emarder'] = EMARDER
authors['bhen'] = BHENDERSON
authors['pg'] = PMCGURRIN

with open('authors.p', 'wb') as f:
    pickle.dump(authors, f)

try:
    assert os.path.isfile('other_standards.p')
    with open('other_standards.p', 'rb') as f:
        other_s = pickle.load(f)

except (AssertionError, FileNotFoundError, pickle.UnpicklingError):
    hs = process(high_standard)
    urlDat = {'link': high_standard}
    hss = text_proc(hs, urlDat)

    benchmark = process(xkcd_self_sufficient)
    urlDat = {'link': xkcd_self_sufficient}
    bench = text_proc(benchmark, urlDat)
    other_s = [hss, benchmark, bench]
    with open('other_standards.p', 'wb') as f:
        pickle.dump(other_s, f)


def get_ind_author(author_link_scholar_link_list):
    more = [
        author_results['markram'], author_results['emarder'], authors['bhen']
    ]
    names = [str('bhen'), str('pg')]
    latest = []
    latest.extend(authors['bhen'])