from pyliterature import Pyliterature

# Example: collect the article URLs listed in a BibTeX file, skip the ones
# already present in the keyword database, and load the remaining articles.

# read url from bib file
urls = []
# The context manager closes the file even if parsing fails part-way.
with open('example.bib') as bib_file:
    for line in bib_file:
        # Only look at "<field> = {<value>}" entries whose field name
        # mentions "url"; a title or note that merely contains the letters
        # "url" must not be mistaken for a link.
        field, equals, value = line.partition('=')
        if not equals or 'url' not in field.lower():
            continue
        # partition() splits on the first '=' only, so query strings such
        # as "?id=1" stay part of the URL.
        _, brace, rest = value.partition('{')
        if not brace:
            continue  # value is not brace-delimited; nothing to extract
        url = rest.split('}')[0].strip()
        if url:
            urls.append(url)

# ----------------------------------------
# read old database file
keyword = 'catalytic'
liter = Pyliterature()
liter.read_database(keyword)

# find new url not in the database (order of the bib file is preserved)
urls_new = [url for url in urls if url not in liter.url_list]

# -----------------------------------
# load text from new url html
for url in urls_new:
    liter.url = url
    print(url + '\n\n')
    liter.parser()

# parse key sentences from text
from pyliterature import Pyliterature

# Example: read article URLs from 'urls.dat' (one per line) and print, for
# each article, the sentences that contain the keyword.

urls = []
# The context manager guarantees the file is closed after reading.
with open('urls.dat') as url_file:
    for line in url_file:
        # Drop the trailing newline (and stray spaces) so it does not end
        # up inside the URL, and ignore blank lines entirely.
        url = line.strip()
        if url:
            urls.append(url)

keyword = 'DFT'
liter = Pyliterature()

for url in urls:
    liter.url = url
    print(url + '\n\n')
    liter.parser()
    # Second pass with the url cleared and the keyword set.
    # NOTE(review): presumably parser() then works on the text loaded by
    # the first pass instead of fetching again - confirm in Pyliterature.
    liter.url = None
    liter.keyword = keyword
    # print(liter.text)
    liter.parser()
    print('===================================================')
    for keysent in liter.keysents:
        print(keysent)
        print('\n')
from pyliterature import Pyliterature

# Example: process a single article and print its text followed by every
# sentence that contains the keyword.

TEXT_BANNER = '=================article text=================================='
KEYSENT_BANNER = '=================key sentences=================================='

url = 'http://www.nature.com/nature/journal/v541/n7635/full/nature20782.html'
keyword = 'DFT'

# load text from url html and parse the sentences including keyword
liter = Pyliterature(url, keyword)
liter.parser()

print(TEXT_BANNER)
print(liter.text)

print(KEYSENT_BANNER)
for sentence in liter.keysents:
    # One sentence followed by two empty lines, as a single print call.
    print(sentence, end='\n\n\n')
from pyliterature import Pyliterature

# Example: run the same keyword search over a fixed list of articles and
# print the matching sentences for each one.

SEPARATOR = '==================================================='

urls = [
    'http://science.sciencemag.org/content/355/6320/49.full',
    'http://www.nature.com/nature/journal/v541/n7635/full/nature20782.html',
    'http://www.sciencedirect.com/science/article/pii/S1751616116301138',
    'http://pubs.acs.org/doi/full/10.1021/acscatal.6b02960',
]
keyword = 'DFT'
liter = Pyliterature()

for article_url in urls:
    # URL followed by two empty lines.
    print(article_url, end='\n\n\n')

    # First pass: parser() is called with only the url set.
    liter.url = article_url
    liter.parser()

    # Second pass: url cleared, keyword set.
    # NOTE(review): presumably this makes parser() work on the text from
    # the first pass rather than fetching again - confirm in Pyliterature.
    liter.url = None
    liter.keyword = keyword
    # print(liter.text)
    liter.parser()

    print(SEPARATOR)
    for sentence in liter.keysents:
        print(sentence, end='\n\n\n')
from pyliterature import Pyliterature

# Loading the HTML every time is very slow, so it is better to save the
# text we already have into a database and read it from there next time.

# ----------------------------------------
# read old database file
keyword = 'DFT'
liter = Pyliterature()
liter.read_database(keyword)

# show the urls that the database already knows about
for known_url in liter.url_list:
    print(known_url)

urls = [
    'http://science.sciencemag.org/content/355/6320/49.full',
    'http://www.nature.com/nature/journal/v541/n7635/full/nature20782.html',
    'http://www.sciencedirect.com/science/article/pii/S1751616116301138',
    'http://pubs.acs.org/doi/full/10.1021/acscatal.6b02960',
]

# find new url not in the database
urls_new = [candidate for candidate in urls if candidate not in liter.url_list]

# -----------------------------------
# load text from new url html
for url in urls_new:
    liter.url = url
    # URL followed by two empty lines.
    print(url, end='\n\n\n')
from pyliterature import Pyliterature

# Example: process a single ScienceDirect article and print its text
# followed by every sentence that contains the keyword.

url = 'http://www.sciencedirect.com/science/article/pii/S1751616116301138'
keyword = 'CALPHAD'

# load text from url html and parse the sentences including keyword
liter = Pyliterature(url, keyword)
liter.parser()

# Section 1: the full article text.
text_header = '=================article text=================================='
print(text_header)
print(liter.text)

# Section 2: the key sentences, each followed by two empty lines.
keysent_header = '=================key sentences=================================='
print(keysent_header)
for hit in liter.keysents:
    print(hit, end='\n\n\n')