def calculate_carbon(pub):
    ''' Calculate the carbon emissions from travel

    :publication (article) from database
    '''
    author_count = pub['author count']

    pa_print.tprint('\nCalculating carbon footprint...')

    for author in range(author_count):
        if pub['author location info'][author] != 'N/A':
            distance = geodesic(pub['author location info'][author][2],
                                pub['conference location info'][0][2]).km
            pub['author distances'].append(distance)

            # * Calculate CO2 emissions, more details here: https://github.com/milankl/CarbonFootprintAGU
            carbon = 0.0  # kgCO2e
            if distance < 400:  # bus / train / car at 60gCO2e / km / person
                carbon = distance * 2 * 0.06
            elif distance < 1500:  # short flight at 200gCO2e / km / person
                carbon = distance * 2 * 0.2
            elif distance < 8000:  # long flight at 250gCO2e / km / person
                carbon = distance * 2 * 0.25
            else:  # super long flight at 300gCO2e / km / person
                carbon = distance * 2 * 0.3

            pub['author footprints'].append(carbon / 1000)
            pa_print.tprint(
                f'✓ - CO2 emissions for author {int(author + 1)}: {(carbon / 1000):.3f} tCO2e'
            )
        else:
            pub['author distances'].append('N/A')
            pub['author footprints'].append('N/A')
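
# Illustrative sketch (not called by the pipeline): a minimal pub dict with only the keys
# calculate_carbon() reads and appends to, using hypothetical coordinates for one author
# and the conference venue. A trip of this length falls into the short-flight tier
# (200 gCO2e / km / person, doubled for the return leg).
def _example_calculate_carbon():
    pub = {
        'author count': 1,
        'author location info': [('Berlin, Germany', ('Germany', 'Europe'), (52.52, 13.40))],
        'conference location info': [('Oslo, Norway', ('Norway', 'Europe'), (59.91, 10.75))],
        'author distances': [],
        'author footprints': [],
    }
    calculate_carbon(pub)
    return pub['author footprints'][0]  # footprint in tCO2e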
def doc_check(doc, pub, type):
    ''' Check for common decoding errors (does not catch all) # ! more intelligent method?

    :document from text extraction (miner) or xml extraction (grobid)
    :publication (article) from database
    :type of doc (either 'text' or 'grobid')
    '''
    errored = False

    alphas = re.compile('[^a-zA-Z]')
    doc_alphas = alphas.sub('', doc)
    if len(doc) > 2 * len(doc_alphas):  # more symbols than 2x letters
        pub[f'{type} non alpha'] = 'X'
        pa_print.tprint('\nFile was not decoded well - non-alpha')
        errored = True

    # When a font cannot be decoded, (cid:#) is returned - remove these
    # (note: passing re.M as the third argument of sub() would be read as the count limit)
    cids = re.compile(r'\(cid:[0-9]+\)')
    doc_cidless = cids.sub('', doc)
    if len(doc) > 2 * len(doc_cidless):  # if most of content was undecodable, skip
        pub[f'{type} poor decoding'] = 'X'
        pa_print.tprint('\nFile was not decoded well - cid: present')
        errored = True

    return errored
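
# Illustrative sketch (hypothetical input, not part of the pipeline): a string where most
# glyphs came back as (cid:#) tokens should trip both decoding checks above.
def _example_doc_check():
    garbled = '(cid:84)(cid:104)(cid:101) ' * 50 + 'NIME'
    pub = {'text non alpha': '', 'text poor decoding': ''}
    return doc_check(garbled, pub, 'text')  # expected: True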
def request_location(author_info, args, pub):
    ''' Extracts location from author blocks or universities and queries OpenCageGeocode

    :publication from bibtex file
    '''
    author_count = pub['author count']

    # Conference location lookup
    cnf_query = pub['address']
    query_type = 'conference'
    query_location(cnf_query, query_type, pub)  # *** creates unneeded columns ***

    # Author location lookup
    for author in range(author_count):  # length of usable locations
        query_type = 'author'

        # Assign one query (in order of priority)
        # 1) If there is a university address from grobid
        if pub['grobid author unis'][author] != 'N/A':  # uni address
            location_query = ', '.join(pub['grobid author unis'][author])  # (uni name, country)
            query_origin = 'grobid uni'

        # 2) If grobid was used to add address (while 'location' is api derived)
        elif pub['grobid addresses'][author] != 'N/A':
            location_query = pub['grobid addresses'][author]
            query_origin = 'grobid address'

        # 3) If there's a uni address from text block
        elif pub['text author unis'][author] != 'N/A':
            location_query = ', '.join(pub['text author unis'][author])  # (uni name, country)
            query_origin = 'text uni'

        # 4) Else, scrape from raw author block (which may or may not have email)
        elif author < len(author_info) and author_info[author] != 'N/A':
            # check if author_info contains author 'i' and is non-empty
            auth_block = author_info[author]
            cut_line = -1 if '@' in auth_block else 0  # one line above if email present
            info_lines = auth_block.split('\n')
            location_query = ' '.join(info_lines[cut_line - 1:cut_line])
            if len([char for char in location_query if char.isdigit()]) > 8:  # look for telephone number
                location_query = ' '.join(info_lines[cut_line - 2:cut_line - 1])  # take line higher if telephone
            query_origin = 'raw author block'

        else:
            location_query = 'N/A'
            query_origin = 'No query'
            pa_print.tprint("\nCouldn't find a location to use!")

        pa_print.tprint(f'\nLooking for: {location_query}')
        pub['author loc queries'].append(location_query)
        pub['author query origins'].append(query_origin)

        query_location(location_query, query_type, pub)
def download_xml(xml_path, pub):
    pa_print.tprint('\nLocal PubPub XML not found - downloading...')
    url = pub['url']
    r = requests.get(url, allow_redirects=True)
    # Find the JATS XML asset URL embedded in the PubPub page source
    url = re.search(r'"jats":\{"url":"(.*?\.xml)"', r.text).group(1)
    r = requests.get(url, allow_redirects=True)
    open(xml_path, 'wb').write(r.content)
def load_unidomains(path):
    ''' Loads unidomain file from json or downloads if not found

    :path of unidomains.json file
    '''
    if not os.path.isfile(path):  # if not, download
        pa_print.tprint('\nDownloading unidomains database...')
        r = requests.get(unidomains_url, allow_redirects=True)
        open(path, 'wb').write(r.content)

    with open(path, 'rb') as fp:
        unidomains = orjson.loads(fp.read())

    return unidomains
def load_bibtex(path):
    ''' Loads BibTeX file into object or downloads if not found

    :path of BibTeX file
    '''
    if not os.path.isfile(path):  # if not, download
        pa_print.tprint('\nDownloading bibtex database...')
        r = requests.get(bibtex_url, allow_redirects=True)
        open(path, 'wb').write(r.content)

    with open(path) as bib_file:
        parser = bibtexparser.bparser.BibTexParser()
        parser.customization = bibtexparser.customization.convert_to_unicode
        bib_db = bibtexparser.load(bib_file, parser=parser)
        bib_db = bib_db.entries

    return bib_db
def import_config(filepath):
    ''' Imports a custom configuration for filter words and years

    :filepath the file path
    '''
    user_config = pd.read_csv(filepath, header=0, delimiter=',')
    user_config = user_config.fillna('')

    keywords = []
    ignore_words = []
    merge_words = []
    selected_years = []

    for config_tuple in user_config.itertuples(index=False):
        if config_tuple[0] == 'keywords':  # single list
            for i in config_tuple[1:]:
                keywords.append(i)
        elif config_tuple[0] == 'ignore':  # single list
            for i in config_tuple[1:]:
                ignore_words.append(i)
        elif config_tuple[0] == 'merge':  # list of lists
            merge_group = list(filter(None, config_tuple[1:]))
            merge_words.append(merge_group)
        elif config_tuple[0] == 'years':  # single list
            year_num = [i for i in config_tuple if i != '']
            if len(year_num) == 2:  # a single year was given
                selected_years.append(str(int(config_tuple[1])))
            else:  # a start and end year were given - expand the span
                year_span = int(config_tuple[2]) - int(config_tuple[1])
                for i in range(year_span + 1):
                    selected_years.append(str(int(config_tuple[1]) + i))

    keywords = list(filter(None, keywords))
    ignore_words = list(filter(None, ignore_words))

    pa_print.tprint('\nParameters from custom.csv:')
    if selected_years:
        pa_print.tprint(f'Selected years: {selected_years}')
    if keywords:
        pa_print.tprint(f'Search words: {keywords}')
    if ignore_words:
        pa_print.tprint(f'Ignored words: {ignore_words}')
    if merge_words:
        pa_print.tprint(f'Merged words: {merge_words}')

    return (keywords, ignore_words, merge_words, selected_years)
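
# Illustrative sketch (hypothetical file content and path, not part of the pipeline): the
# row types import_config() understands. The header names are placeholders - only the first
# cell of each row is matched against 'keywords', 'ignore', 'merge' or 'years'.
def _example_import_config(tmp_path='./custom_example.csv'):
    with open(tmp_path, 'w') as f:
        f.write('type,value1,value2,value3\n'
                'keywords,gesture,haptic,\n'
                'ignore,copyright,page,\n'
                'merge,controller,interface,device\n'
                'years,2010,2015,\n')
    # A 'years' row with one value selects a single year; with two it expands to the
    # full span (here 2010-2015 inclusive).
    return import_config(tmp_path)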
def trim_headfoot(doc, pub=None):
    ''' Trim the header and footer from extracted text (unused and inferior to Grobid service)

    :document from text extraction (miner) or xml extraction (grobid)
    '''
    # Remove until abstract or introduction
    pdf_trimmed = abst_regex.split(doc, 1)
    if len(pdf_trimmed) == 1:
        pdf_trimmed = intro_regex.split(pdf_trimmed[0], 1)  # if no abstract, use 'introduction'
        if len(pdf_trimmed) == 1:
            pdf_trimmed = pdf_trimmed[0]
            if pub is not None:
                pub['header fail'] = 'X'
            pa_print.tprint('Could not split header during parsing!')
        else:
            pdf_trimmed = pdf_trimmed[1]
            # pa_print.tprint('Split header at intro')
    else:
        pdf_trimmed = pdf_trimmed[1]
        # pa_print.tprint('Split header at abstract')

    # Remove after references or acknowledgements
    pdf_slimmed = ackn_regex.split(pdf_trimmed, 1)
    if len(pdf_slimmed) == 1:
        pdf_slimmed = ref_regex.split(pdf_slimmed[0], 1)
        if len(pdf_slimmed) == 1:
            if pub is not None:
                pub['footer fail'] = 'X'
            pa_print.tprint('Could not split footer during parsing!')
        else:
            pdf_slimmed = pdf_slimmed[0]
            # pa_print.tprint('Split footer at references')
    else:
        pdf_slimmed = pdf_slimmed[0]
        # pa_print.tprint('Split footer at acknowledgements')

    return pdf_slimmed
def extract_text(pub):
    ''' Extracts text content from pdf using pdfminer.six, downloads pdf if non-existent

    :publication (article) from database
    '''
    pdf_fn = pub['url'].split('/')[-1]
    pdf_path = pdf_src + pdf_fn

    # Allows for override of corrupted pdfs
    if os.path.isfile(pdf_path):
        pass
    else:  # doesn't exist - download
        download_pdf(pdf_path, pub)

    # Page count for those without
    if pub['page count'] == 'N/A':
        pdf = open(pdf_path, 'rb')
        check = False
        while True:  # try once, repair with pikepdf on failure and retry
            try:
                parser = PDFParser(pdf)
                document = PDFDocument(parser)
            except Exception as e:
                if check is True:
                    raise PSSyntaxError(
                        f'{pdf_path} appears to be malformed and qpdf cannot repair it.'
                    )
                pa_print.tprint(str(e))
                pa_print.tprint(f'Attempting to repair {pdf_path}')
                pike = pikepdf.Pdf.open(pdf_path, allow_overwriting_input=True)
                pike.save(pdf_path)
                check = True
                continue
            break
        pub['page count'] = resolve1(document.catalog['Pages'])['Count']

    fn = pdf_fn.split('.')[0]
    miner_text_file = f'{text_src}miner/miner_{fn}.txt'

    # Read miner text if exists
    if os.path.isfile(miner_text_file):
        with open(miner_text_file, 'r') as f:
            doc = f.read()
        return doc
    else:  # if not, make them
        pa_print.tprint(f'\nExtracting: {pdf_fn}')
        laparams = LAParams()
        setattr(laparams, 'all_texts', True)
        doc = extract_pdf(pdf_path, laparams=laparams)
        with open(miner_text_file, 'w') as f:
            f.write(doc)
        return doc
def request_scholar(pub, args):
    ''' Queries citations from Semantic Scholar

    :publication from bibtex file
    '''
    try:
        with open('./cache/json/scholar_cache.json', 'rb') as fp:
            scholar_cache = orjson.loads(fp.read())
    except FileNotFoundError:
        pa_print.tprint('\nCreating new Semantic Scholar cache!')
        scholar_cache = {}

    semantic_scholar_data = {
        "queryString": [],
        "page": 1,
        "pageSize": 1,
        "sort": "relevance",
        "authors": [],
        "coAuthors": [],
        "venues": [],
        "yearFilter": None,
        "requireViewablePdf": False,
        "publicationTypes": [],
        "externalContentTypes": []
    }

    # Fix names for searching
    regextitle = re.compile(r'[^a-zA-Z0-9 ]')
    regexname = re.compile(r'[^a-zA-Z- ]')

    author_last_list = []
    for _, (_, last) in enumerate(pub['author names']):
        last = last.split('-')[-1]
        author_last_list.append(last)

    title = unidecode.unidecode(pub['title'])
    if args.nime:
        if title == 'Now':  # title is too short and returns other papers - force the full author name
            author_last_list[0] = 'GarthPaine'

    pub['citation count'] = 'N/A'
    pub['key citation count'] = 'N/A'

    # Make query title, name and year lists
    query_title = list(
        dict.fromkeys([
            title,
            regextitle.sub('', title),
            ' '.join([w for w in title.split() if len(w) > 1])
        ]))

    if len(author_last_list) > 1:
        query_name = [' '.join(author_last_list), author_last_list[0], '']
    else:
        query_name = [author_last_list[0], '']

    query_year = ['', pub['year']]

    # Save query to be used for cache
    full_query = f"{title} {' '.join(author_last_list)} {pub['year']}"
    pub['scholar query'] = full_query

    if full_query not in scholar_cache or args.citations:
        pa_print.tprint('\nQuerying Semantic Scholar...')

        for temp in list(itertools.product(query_title, query_name, query_year)):
            # Generate new query from combination
            temp_title, temp_author, temp_year = temp[0], temp[1], temp[2]
            scholar_query = f'{temp_title} {temp_author} {temp_year}'
            semantic_scholar_data['queryString'] = scholar_query

            # Try query
            pa_print.tprint(f"Trying query: '{scholar_query}'")
            try:
                query_result = scholar_api(semantic_scholar_data)
            except Exception as e:
                query_result = {'results': {}}
                err_info = 'x - While querying Semantic Scholar an exception of type {0} occurred.\nArguments:\n{1!r}.'
                err_msg = err_info.format(type(e).__name__, e.args)
                pa_print.tprint(err_msg)

            if 'error' not in query_result.keys():
                if bool(query_result['results']) and \
                        bool(query_result['results'][0]['scorecardStats']) and \
                        len(query_result['results'][0]['authors']) <= (len(author_last_list) + 1):
                    result_author = ' '.join(
                        [t[0]['name'] for t in query_result['results'][0]['authors']])
                    result_author = regexname.sub('', unidecode.unidecode(result_author)).lower()
                    query_author = regexname.sub('', author_last_list[0].lower().split(' ')[-1])

                    if result_author.find(query_author) != -1:
                        pub['scholar query'] = scholar_query
                        pub['citation count'] = query_result['results'][0]['scorecardStats'][0]['citationCount']
                        pub['key citation count'] = query_result['results'][0]['scorecardStats'][0]['keyCitationCount']
                        scholar_cache[full_query] = query_result['results'][0]['scorecardStats']
                        pa_print.tprint(f"✓ - Paper has been cited {pub['citation count']} times")
                        break

        if pub['citation count'] == 'N/A':
            pa_print.tprint('x - Cannot find citations for paper in Semantic Scholar')
            scholar_cache[full_query] = 'N/A'

        with open('./cache/json/scholar_cache.json', 'wb') as fp:
            fp.write(orjson.dumps(scholar_cache))

    else:
        if scholar_cache[full_query] != 'N/A':
            pub['citation count'] = scholar_cache[full_query][0]['citationCount']
            pub['key citation count'] = scholar_cache[full_query][0]['keyCitationCount']
        else:
            pub['citation count'] = 'N/A'
            pub['key citation count'] = 'N/A'
        pa_print.tprint(f"\no - Retrieved from cache: {pub['citation count']} citations")

    # Average citations per year of age
    if pub['citation count'] != 'N/A':
        pub['yearly citations'] = int(pub['citation count']) / pub['age']
    else:
        pub['yearly citations'] = 'N/A'
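
# Illustrative sketch (hypothetical title and authors, standalone): how the fallback query
# combinations above are generated - the title, author and year lists are crossed with
# itertools.product(), moving from the most specific query to the loosest.
def _example_scholar_queries():
    import itertools
    query_title = ['A NIME Example', 'NIME Example']
    query_name = ['Smith Jones', 'Smith', '']
    query_year = ['', '2012']
    return [f'{t} {a} {y}' for t, a, y in itertools.product(query_title, query_name, query_year)]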
def request_uni(unidomains, author_info, args, pub):
    ''' Extract university from email handle

    :publication from bibtex file
    '''
    pub_matches = 0
    grob_matches = 0
    text_matches = 0
    author_count = pub['author count']

    # Internal functions for lookup in unidomains.json
    def lookup_uni(handle, email_type, pub):
        nonlocal pub_matches
        for uni in unidomains:
            if handle in uni['domains']:
                pub[f'{email_type} author unis'].append((uni['name'], uni['country']))
                pub_matches += 1
                return True
        return False

    def handle_check(email, email_type, pub):
        handle = email.split("@")[-1].strip()

        # Look for handle in json, split once by dot and retry if not found
        uni_match = lookup_uni(handle, email_type, pub)
        while uni_match is False and handle.count('.') > 1:
            handle = handle.split('.', 1)[-1]
            uni_match = lookup_uni(handle, email_type, pub)

    # 1) Using grobid derived emails to choose handle
    email_type = 'grobid'
    for author in range(author_count):
        email = pub['grobid emails'][author]
        if email != 'N/A':  # check for valid email
            handle_check(email, email_type, pub)
    grob_matches = pub_matches

    # 2) Using scraped author info block from header if not enough emails
    if len(author_info) > 0 and (grob_matches < author_count):
        email_type = 'text'
        for author in author_info:  # ! could be more authors than exist
            info_emails = email_regex.findall(author)  # look for '@handle.tld' in block
            for email in info_emails:  # case: multiple emails within an author block # ! (will overwrite)
                if email != 'N/A':
                    handle_check(email, email_type, pub)

    # Fill in missing unis with 'N/A'
    # ! author block not linked in order with authors
    for type, author in [(type, author) for type in ['grobid', 'text']
                         for author in range(author_count)]:
        try:
            pub[f'{type} author unis'][author]
        except IndexError:
            pub[f'{type} author unis'].append('N/A')

    text_matches = pub_matches - grob_matches
    pub_matches = max(text_matches, grob_matches)
    pa_print.tprint(f'o - Found {pub_matches} uni\'s from email handles\n')
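
# Illustrative sketch (hypothetical email domain, not part of the pipeline): the
# handle-shortening strategy used by handle_check() above - an unmatched subdomain is
# stripped one label at a time until a known domain is hit or only 'domain.tld' remains.
def _example_handle_shortening(handle='student.cs.example.edu',
                               known=frozenset({'example.edu'})):
    tried = [handle]
    while handle not in known and handle.count('.') > 1:
        handle = handle.split('.', 1)[-1]
        tried.append(handle)
    return tried  # ['student.cs.example.edu', 'cs.example.edu', 'example.edu']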
def query_location(location_query, query_type, pub):
    # 'query_type' is now only used to print status

    # Load cache
    try:
        with open('./cache/json/location_cache.json', 'rb') as fp:
            location_cache = orjson.loads(fp.read())
    except FileNotFoundError:
        pa_print.tprint('\nCreating new location cache!')
        location_cache = {'N/A': 'N/A'}

    # Not cached
    if location_query not in location_cache:
        try:
            # location = geolocator.geocode(location_query, language="en") # Nominatim fallback
            # OpenCageGeocode: 2,500 req/day, 1 req/s - https://github.com/OpenCageData/python-opencage-geocoder
            location = geocoder.geocode(location_query,
                                        language='en',
                                        limit=1,
                                        no_annotations=1,
                                        no_record=1)[0]

            # Format result
            geometry = location['geometry']  # lat/long
            components = location['components']  # fine loc info
            location_info = (location['formatted'],
                             (components['country'], components['continent']),
                             (geometry['lat'], geometry['lng']),
                             location['confidence'])  # 1 (>25km) to 10 (<0.25km)

            location_cache[location_query] = location_info
            pub[f'{query_type} location info'].append(location_info[:3])  # all location info in one column
            pub[f'{query_type} location confidence'].append(location_info[3])  # confidence in separate column

            pa_print.tprint(f'✓ - Parsed {query_type} location: {location_info[0]}')
            time.sleep(1 + random.random())

        except Exception:  # API fails
            location_cache[location_query] = 'N/A'
            pub[f'{query_type} location info'].append('N/A')
            pub[f'{query_type} location confidence'].append('N/A')
            pa_print.tprint(f'x - Could not parse {query_type} location: {location_query}')

        # Save changes to cache
        with open('./cache/json/location_cache.json', 'wb') as fp:
            fp.write(orjson.dumps(location_cache))

    # Cached
    else:
        if location_cache[location_query] != 'N/A' and location_query != 'N/A':
            location_info = location_cache[location_query]
            pub[f'{query_type} location info'].append(location_info[:3])
            pub[f'{query_type} location confidence'].append(location_info[3])
            pa_print.tprint(f'o - Cached {query_type} location: {location_info[0]}')
        else:
            location_info = 'N/A'
            pub[f'{query_type} location info'].append('N/A')
            pub[f'{query_type} location confidence'].append('N/A')
            pa_print.tprint(f'o - Null {query_type} location: {location_info}')
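
# Illustrative note (hypothetical values): each location_cache entry maps the raw query
# string to the tuple built above, e.g.
#   location_cache['Oslo, Norway'] = ('Oslo, Norway',
#                                     ('Norway', 'Europe'),
#                                     (59.91, 10.75),
#                                     9)  # OpenCage confidence, 1 (>25km) to 10 (<0.25km)
# so repeated queries across runs never hit the OpenCageGeocode API again.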
def extract_author_info(doc, pub):
    ''' Searches through pdf text for author block using regex (no Grobid needed)

    :document from text extraction (miner) or xml extraction (grobid)
    :publication (article) from database
    '''
    pa_print.tprint('\nExtracting authors from paper...')

    author_info = []
    author_count = pub['author count']

    # * Method 1 - Look for block with email tail (bibtex not needed, more robust)
    author_info = auth_regex.findall(doc)[:author_count]  # grab only up to total authors

    if len(author_info) != 0:
        pa_print.tprint('✓ - Found by block')

    # * Method 2 - Look for block starting with author name (bibtex needed)
    else:
        for author in range(author_count):  # only look up to i authors
            author_first = pub['author names'][author][0]
            author_last = pub['author names'][author][1]
            pa_print.tprint(f'\nLooking for: {author_first} {author_last}')

            author_first = author_first.replace('\\', '')  # fixes issues with regex
            author_last = author_last.replace('\\', '')
            name_regex = r'(?:^.*' + author_first + r'.+' + author_last + r'.*$)(?:\s^[\S |].+$)*'
            author_search = re.search(name_regex, doc, re.M)
            try:
                author_info.append(author_search.group(0))
                pa_print.tprint('✓ - Found by name')
            except AttributeError:  # no match
                pa_print.tprint('x - No match by name')

    pa_print.tprint(f'\n✓ - Found {len(author_info)} author(s) in paper of {author_count} total')

    # If there were a different number of authors from text block
    if len(author_info) < author_count:
        pub['author block mismatch'] = 'Too few'
    elif len(author_info) > author_count:
        pub['author block mismatch'] = 'Too many'

    # Add 'N/A' for missing authors
    # ! Note: Author block will not correspond in order to authors
    authors_missed = author_count - len(author_info)
    pub['author block missed'] = authors_missed
    for author in range(authors_missed):
        author_info.append('N/A')

    # Add for visibility with csv - # ! but may not be the best idea if processing afterwards
    pub['author infos'] = '\n\n'.join(author_info)

    return author_info
def extract_grobid(pub, bib_db, iterator):
    ''' Parse xml files output from Grobid service (3rd party utility needed to generate files)

    :publication (article) from database
    '''
    def elem_text(elem, fill='N/A'):  # to get element text w/o error
        if elem:
            return elem.getText(separator=' ', strip=True)
        else:
            return fill

    if 'pubpub' in pub['url']:
        xml_name = f"nime{pub['year']}_{pub['article-number']}.xml"
    else:
        xml_name = pub['url'].split('/')[-1].split('.')[0] + '.tei.xml'

    xml_path = xml_src + xml_name

    if os.path.exists(xml_path):
        with open(xml_path, 'r') as tei:
            soup = BeautifulSoup(tei, 'lxml')

        if soup.analytic is None:
            pa_print.tprint(f'\n{xml_name} is empty!')
            return

        pa_print.tprint(f'\nParsing through grobid XML of {xml_name}')

        grob_names, grob_emails, grob_orgs, grob_addrs = [], [], [], []

        # Begin with parsing author info
        authors = soup.analytic.find_all('author')

        for author in authors:
            persname = author.persname
            if persname:
                firstname = elem_text(persname.find("forename", type="first"), '')
                middlename = elem_text(persname.find("forename", type="middle"), '')
                surname = elem_text(persname.surname, '')  # *** should this be find? ***
                name = (firstname, middlename, surname)
                grob_names.append(name)
            grob_emails.append(elem_text(author.email))

        # There's an issue where affils can be within an <author> alongside an author or independently
        # authors = [author for author in authors if not author.affiliation]
        affils = [author for author in authors if author.affiliation]
        for affil in affils:
            grob_orgs.append(elem_text(affil.orgname))
            grob_addrs.append(elem_text(affil.address))

        grob_info = [grob_names, grob_emails, grob_orgs, grob_addrs]

        # Fill in missing data with 'N/A'
        author_count = pub['author count']
        for author in range(author_count):
            for info in grob_info:
                try:
                    info[author]
                except IndexError:
                    info.append('N/A')

        # Add info to df - merge everything!
        pub['grobid author names'].extend(grob_names)  # to check who appeared in grobid info
        pub['grobid emails'].extend(grob_emails)
        pub['grobid organisations'].extend(grob_orgs)
        pub['grobid addresses'].extend(grob_addrs)

        # Extract meaningful text using grobid tags (within p tags) and save to txt
        grob_text_file = f"{text_src}grobid/grob_{xml_name.split('.')[0]}.txt"
        if os.path.isfile(grob_text_file):  # check if txt already exists
            with open(grob_text_file, 'r') as f:
                grob_text = f.read()
        else:
            # ! This needs to be a little more sophisticated
            # PubPub tei's have expansive body
            # \n and spaces need to be addressed
            grob_text = []
            grob_body = soup.body.find_all('p')
            for p in grob_body:
                p = re.sub(r'\s+', ' ', elem_text(p)).strip()
                grob_text.append(p)
            grob_text = str(grob_text)
            with open(grob_text_file, 'w') as f:
                f.write(grob_text)

        return grob_text

    else:  # No XML - populate
        pa_print.tprint('\nGrobid XML does not exist for paper!')
        if 'pubpub' in pub['url']:
            check_xml(bib_db, jats=True)
        else:
            check_xml(bib_db)
        iterator.clear()
        iterator.refresh()
def download_pdf(pdf_path, pub):
    pa_print.tprint('\nLocal PDF not found - downloading...')
    url = pub['url']
    r = requests.get(url, allow_redirects=True)
    open(pdf_path, 'wb').write(r.content)
# * Load database for email handle to uni matching
unidomains = load_unidomains(unidomains_path)

# * Load and extract BibTeX
bib_db = load_bibtex(bibtex_path)
bib_db = extract_bibtex(bib_db, args)

# * Loop here for Grobid/PDF population
if args.grobid:
    check_xml(bib_db, True)

# * Parse data through pdfs
print('\nExtracting and parsing publication data...')

iterator = tqdm(bib_db)
for _, pub in enumerate(iterator):
    pa_print.tprint(f"\n--- Now on: {pub['title']} ---")

    # Extract text from pdf if not PubPub
    if 'pubpub' not in pub['url']:
        doc = extract_text(pub)
        errored = doc_quality(doc, pub, 'text')  # check for errors

        # Only extract header meta-data if not errored
        if not errored:
            author_info = extract_author_info(doc, pub)
        else:
            author_info = []
    else:
        author_info = []

    # Extract doc from Grobid