def mess_length(word_length, bcm):
    # Re-score a truncated sample of the pickled 'big complex mess' benchmark text.
    with open('bcm.p', 'rb') as f:
        big_complex_mess = pickle.load(f)
    reduced = big_complex_mess[0:word_length]
    urlDat = {}
    pmegmess = text_proc(reduced, urlDat)
    return word_length, pmegmess

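# Hedged usage sketch (not in the original module): sweep truncation lengths
# over the pickled sample to see how the text_proc readability score varies
# with input size. Assumes get_bmarks() has already written 'bcm.p'; the
# sizes below are illustrative, and the unused bcm argument is passed as None.
def demo_mess_length(sizes=(100, 500, 1000)):
    for n in sizes:
        length, scored = mess_length(n, None)
        if scored is not None:
            print(length, scored.get('standard'))
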
def check_self_contained(file_name):
    # Score a local benchmark-corpus file with the same pipeline as the URLs.
    royal = '../BenchmarkCorpus/' + str(file_name)
    with open(royal) as klpdr:
        strText = klpdr.read()
    urlDat = {'link': 'local_resource_royal'}
    klpdfr = text_proc(strText, urlDat, WORD_LIM=100)
    return klpdfr

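# Hedged usage sketch: 'royal.txt' is the benchmark-corpus file referenced
# elsewhere in this module; any plain-text file under ../BenchmarkCorpus/
# should score the same way.
def demo_check_self_contained():
    scored = check_self_contained('royal.txt')
    if scored is not None:
        print(scored.get('standard'))
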
def convert_and_score(self, f):
    # Load a pickled (search engine, rank, link, query, text) tuple and score it.
    urlDat = {}
    b = os.path.getsize(f)  # file size; currently unused
    link_tuple = pickle.load(open(f, 'rb'))
    se_b, page_rank, link, category, buff_ = link_tuple
    if buff_ is not None:
        urlDat = {
            'link': link,
            'page_rank': page_rank,
            'se': se_b,
            'query': category,
            'file': f,
        }
        urlDat = text_proc(buff_, urlDat, WORD_LIM=self.mwl)
    return urlDat

def get_bmarks():
    xkcd_self_sufficient = 'http://splasho.com/upgoer5/library.php'
    high_standard = (
        'https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMjc3MjUvZWxpZmUtMjc3MjUtdjIucGRm/elife-27725-v2.pdf?_hash=WA%2Fey48HnQ4FpVd6bc0xCTZPXjE5ralhFP2TaMBMp1c%3D'
    )
    the_science_of_writing = 'https://cseweb.ucsd.edu/~swanson/papers/science-of-writing.pdf'
    # Note: this is so obfuscated that even the English-language classifier rejects it.
    pmeg = 'http://www.elsewhere.org/pomo/'
    this_manuscript = 'https://www.overleaf.com/read/dqkttvmqjvhn'
    this_readme = 'https://github.com/russelljjarvis/ScienceAccessibility'
    links = [
        xkcd_self_sufficient,
        high_standard,
        the_science_of_writing,
        this_manuscript,
        this_readme,
    ]
    urlDats = list(map(process, links))
    # Grab this constantly changing page ten times so we can use mean scores.
    pmegs = []
    for _ in range(10):
        p = process(pmeg)
        if p is not None:
            pmegs.append(p)
    if pmegs and pmegs[0] is not None:
        urlDats.append(pmegs[0])  # pmegs[0] is already a processed urlDat dict
    big_complex_mess = ''
    urlDat = {}
    for p in pmegs:
        if p is not None:
            for s in p['tokens']:
                big_complex_mess += s + ' '
    bcm = ''
    for p in pmegs[0:2]:
        if p is not None:
            for s in p['tokens']:
                bcm += s + ' '
    pmegmess_2 = text_proc(bcm, urlDat)
    with open('bcm.p', 'wb') as f:
        pickle.dump(big_complex_mess, f)
    # Replace the single-sample scores with means over the repeated samples.
    urlDats[-1]['standard'] = np.mean([p['standard'] for p in pmegs])
    urlDats[-1]['sp'] = np.mean([p['sp'] for p in pmegs])
    urlDats[-1]['gf'] = np.mean([p['gf'] for p in pmegs])
    with open('benchmarks.p', 'wb') as f:
        pickle.dump(urlDats, f)
    return urlDats

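# Hedged usage sketch: once get_bmarks() has written 'benchmarks.p', the
# stored entries can be reloaded and inspected without re-crawling. The
# 'standard', 'sp', and 'gf' keys are the scores text_proc attaches above.
def demo_inspect_benchmarks():
    with open('benchmarks.p', 'rb') as f:
        urlDats = pickle.load(f)
    for d in urlDats:
        if d is not None:
            print(d.get('link'), d.get('standard'))
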
def process(link):
    urlDat = {}
    urlDat['link'] = link
    urlDat['page_rank'] = 'benchmark'
    if 'pdf' not in link:
        content = C.open(link).content
        buffer = convert(content, urlDat['link'])
    else:
        pdf_file = requests.get(link, stream=True)
        buffer = convert_pdf_to_txt(pdf_file)
    urlDat = text_proc(buffer, urlDat)
    return urlDat

def get_bmarks():
    xkcd_self_sufficient = 'http://splasho.com/upgoer5/library.php'
    high_standard = (
        'https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMjc3MjUvZWxpZmUtMjc3MjUtdjIucGRm/elife-27725-v2.pdf?_hash=WA%2Fey48HnQ4FpVd6bc0xCTZPXjE5ralhFP2TaMBMp1c%3D'
    )
    the_science_of_writing = 'https://cseweb.ucsd.edu/~swanson/papers/science-of-writing.pdf'
    # Note: this is so obfuscated that even the English-language classifier rejects it.
    pmeg = 'http://www.elsewhere.org/pomo/'
    links = [xkcd_self_sufficient, high_standard, the_science_of_writing, pmeg]
    royal = '../BenchmarkCorpus/royal.txt'
    klpd = '../BenchmarkCorpus/planning_document.txt'
    with open(klpd) as klpdf:
        strText = klpdf.read()
    urlDat = {'link': 'local_resource'}
    klpdfp = text_proc(strText, urlDat, WORD_LIM=100)
    with open(royal) as klpdr:
        strText = klpdr.read()
    urlDat = {'link': 'local_resource_royal'}
    klpdfr = text_proc(strText, urlDat, WORD_LIM=100)
    print(klpdfr)
    # Score the web links in parallel with a dask bag.
    grid = db.from_sequence(links, npartitions=8)
    urlDats = list(db.map(process, grid).compute())
    urlDats.append(klpdfp)
    print(urlDats)
    with open('benchmarks.p', 'wb') as f:
        pickle.dump(urlDats, f)
    return urlDats

def convert_and_score(f):
    # Module-level variant of the method above, without the word-limit parameter.
    urlDat = {}
    b = os.path.getsize(f)  # file size; currently unused
    link_tuple = pickle.load(open(f, 'rb'))
    se_b, page_rank, link, category, buffer = link_tuple
    if buffer is not None:
        urlDat = {
            'link': link,
            'page_rank': page_rank,
            'se': se_b,
            'query': category,
            'file': f,
        }
        urlDat = text_proc(buffer, urlDat)
    print(urlDat)
    return urlDat

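# Hedged usage sketch: map convert_and_score over a directory of pickled
# link tuples. The glob pattern and directory name are hypothetical; the
# real pipeline stores one pickled (se, rank, link, query, text) tuple per
# crawled page.
import glob

def demo_score_pickles(pattern='results/*.p'):
    scored = []
    for f in glob.glob(pattern):
        d = convert_and_score(f)
        if d:
            scored.append(d)
    return scored
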
def get_greg_nicholas():
    urlDat = {}
    urlDat['link'] = 'nicholas'
    urlDat['page_rank'] = 'nicholas'
    with open('local_text.txt', 'r') as file1:
        new_str = file1.read()
    urlDat = text_proc(new_str, urlDat)
    print(urlDat)
    # Add the new entry to the stored benchmarks.
    with open('benchmarks.p', 'rb') as f:
        urlDats = pickle.load(f)
    urlDats.append(urlDat)
    with open('benchmarks.p', 'wb') as f:
        pickle.dump(urlDats, f)
    return urlDat

def process(link):
    urlDat = {}
    urlDat['link'] = link
    urlDat['page_rank'] = 'benchmark'
    try:
        if 'pdf' not in link:
            content = C.open(link).content
            soup = BeautifulSoup(content, 'html.parser')
            # Rip out script and style blocks before extracting text.
            for script in soup(['script', 'style']):
                script.extract()
            text = soup.get_text()
            # Break into lines and remove leading and trailing space on each.
            lines = (line.strip() for line in text.splitlines())
            # Break multi-headlines into a line each.
            chunks = (phrase.strip() for line in lines for phrase in line.split('  '))
            # Drop blank lines.
            text = '\n'.join(chunk for chunk in chunks if chunk)
            bufferd = str(text)
        else:
            pdf_file = requests.get(link, stream=True)
            bufferd = convert_pdf_to_txt(pdf_file)
        urlDat = text_proc(bufferd, urlDat)
    except Exception:
        print('bummer dude: failed on', link)
        urlDat = None
    return urlDat

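# Hedged, self-contained sketch of the HTML-cleaning step used in process()
# above, runnable without the crawler (C) or text_proc. The sample HTML is
# made up for illustration.
from bs4 import BeautifulSoup

def demo_clean_html():
    html = '<html><body><script>x = 1;</script><h1>Title</h1><p> some  body text </p></body></html>'
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(['script', 'style']):
        tag.extract()
    lines = (line.strip() for line in soup.get_text().splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split('  '))
    print('\n'.join(chunk for chunk in chunks if chunk))
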
# authors['markram'] = MARKRAM
# authors['emarder'] = EMARDER
authors['bhen'] = BHENDERSON
authors['pg'] = PMCGURRIN
with open('authors.p', 'wb') as f:
    pickle.dump(authors, f)

try:
    assert os.path.isfile('other_standards.p')
    other_s = pickle.load(open('other_standards.p', 'rb'))
except Exception:
    hs = process(high_standard)
    urlDat = {'link': high_standard}
    hss = text_proc(hs, urlDat)
    benchmark = process(xkcd_self_sufficient)
    urlDat = {'link': xkcd_self_sufficient}
    bench = text_proc(benchmark, urlDat)
    # pickle.dump returns None, so build the list first, then write it out.
    other_s = [hss, benchmark, bench]
    pickle.dump(other_s, open('other_standards.p', 'wb'))

def get_ind_author(author_link_scholar_link_list):
    more = [
        author_results['markram'],
        author_results['emarder'],
        authors['bhen'],
    ]
    names = ['bhen', 'pg']
    latest = []
    latest.extend(authors['bhen'])