def calculate_carbon(pub):
    ''' Calculate the carbon emissions from travel

    :publication (article) from database
    '''
    author_count = pub['author count']

    pa_print.tprint('\nCalculating carbon footprint...')
    for author in range(author_count):
        if pub['author location info'][author] != 'N/A':
            distance = geodesic(pub['author location info'][author][2],
                                pub['conference location info'][0][2]).km
            pub['author distances'].append(distance)

            # * Calculate CO2 emissions, more details here: https://github.com/milankl/CarbonFootprintAGU
            carbon = 0.0  # kgCO2e

            if distance < 400:  # bus / train / car at 60gCO2e / km / person
                carbon = distance * 2 * 0.06
            elif distance < 1500:  # short flight at 200gCO2e / km / person
                carbon = distance * 2 * 0.2
            elif distance < 8000:  # long flight at 250gCO2e / km / person
                carbon = distance * 2 * 0.25
            else:  # super long flight at 300gCO2e / km / person
                carbon = distance * 2 * 0.3

            pub['author footprints'].append(carbon / 1000)
            pa_print.tprint(
                f'✓ - CO2 emissions for author {int(author + 1)}: {(carbon / 1000):.3f} tCO2e'
            )
        else:
            pub['author distances'].append('N/A')
            pub['author footprints'].append('N/A')
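
# A minimal standalone sketch of the emission tiers used above, for a single
# round trip, so the thresholds are easy to sanity-check in isolation. The
# factors (kgCO2e per km per person) mirror the comments above; the helper
# name round_trip_carbon is illustrative and not part of the module.
def round_trip_carbon(distance_km):
    if distance_km < 400:  # bus / train / car
        factor = 0.06
    elif distance_km < 1500:  # short flight
        factor = 0.2
    elif distance_km < 8000:  # long flight
        factor = 0.25
    else:  # super long flight
        factor = 0.3
    return distance_km * 2 * factor  # kgCO2e for a return journey

# e.g. round_trip_carbon(1000) == 400.0 kgCO2e, i.e. 0.4 tCO2e
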
def doc_quality(doc, pub, type):
    ''' Check for common decoding errors (does not catch all) # ! more intelligent method?

    :document from text extraction (miner) or xml extraction (grobid)
    :publication (article) from database
    :type of doc (either 'text' or 'grobid')
    '''
    errored = False

    alphas = re.compile('[^a-zA-Z]')
    doc_alphas = alphas.sub('', doc)
    if len(doc) > 2 * len(doc_alphas):  # non-letter characters outnumber letters
        pub[f'{type} non alpha'] = 'X'
        pa_print.tprint('\nFile was not decoded well - non-alpha')
        errored = True

    cids = re.compile(r'\(cid:[0-9]+\)')
    doc_cidless = cids.sub('', doc)  # when a font cannot be decoded, (cid:#) is returned - remove these
    if len(doc) > 2 * len(doc_cidless):  # if most of the content was undecodable, skip
        pub[f'{type} poor decoding'] = 'X'
        pa_print.tprint('\nFile was not decoded well - cid: present')
        errored = True

    return errored
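
# Hedged usage sketch of the cid heuristic above: when pdfminer cannot decode
# a font it emits (cid:#) tokens, and a document made up mostly of such tokens
# gets flagged. The sample string is illustrative only.
import re

_sample = '(cid:72)(cid:101)(cid:108)(cid:108)(cid:111) world'
_cidless = re.compile(r'\(cid:[0-9]+\)').sub('', _sample)
assert len(_sample) > 2 * len(_cidless)  # would be flagged as poorly decoded
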
def request_location(author_info, args, pub):
    ''' Extracts location from author blocks or universities and queries OpenCageGeocode

    :publication from bibtex file
    '''
    author_count = pub['author count']

    # Conference location lookup
    cnf_query = pub['address']
    query_type = 'conference'
    query_location(cnf_query, query_type,
                   pub)  # *** creates unneeded columns ***

    # Author location lookup
    for author in range(author_count):  # length of usable locations
        query_type = 'author'

        # Assign one query (in order of priority)
        # 1) If there is a university address from grobid
        if pub['grobid author unis'][author] != 'N/A':  # uni address
            location_query = ', '.join(
                pub['grobid author unis'][author])  # (uni name, country)
            query_origin = 'grobid uni'

        # 2) If grobid was used to add address (while 'location' is api derived)
        elif pub['grobid addresses'][author] != 'N/A':
            location_query = pub['grobid addresses'][author]
            query_origin = 'grobid address'

        # 3) If there's a uni address from text block
        elif pub['text author unis'][author] != 'N/A':
            location_query = ', '.join(
                pub['text author unis'][author])  # (uni name, country)
            query_origin = 'text uni'

        # 4) Else, scrape from raw author block (which may or may not have an email)
        elif author < len(author_info) and author_info[author] != 'N/A':  # check author_info has an entry for author 'i' and it is non-empty
            auth_block = author_info[author]
            cut_line = -1 if '@' in auth_block else 0  # one line above if email present
            info_lines = auth_block.split('\n')
            location_query = ' '.join(info_lines[cut_line - 1:cut_line or None])
            if len([char for char in location_query if char.isdigit()]) > 8:  # looks like a telephone number
                location_query = ' '.join(info_lines[cut_line - 2:cut_line - 1])  # take the line above instead
            query_origin = 'raw author block'

        else:
            location_query = 'N/A'
            query_origin = 'No query'
            pa_print.tprint("\nCouldn't find a location to use!")

        pa_print.tprint(f'\nLooking for: {location_query}')
        pub['author loc queries'].append(location_query)
        pub['author query origins'].append(query_origin)
        query_location(location_query, query_type, pub)
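
# Hedged sketch of the raw-author-block fallback above: take the line just
# above the e-mail line (it usually holds the affiliation), falling back to
# the last line when no e-mail is present. The sample block is illustrative.
_block = 'Ada Lovelace\nDept. of Music\nUniversity of Somewhere\nada@somewhere.edu'
_lines = _block.split('\n')
_cut = -1 if '@' in _block else 0
_query = ' '.join(_lines[_cut - 1:_cut or None])
# _query -> 'University of Somewhere'
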
def download_xml(xml_path, pub):
    pa_print.tprint('\nLocal PubPub XML not found - downloading...')
    url = pub['url']
    r = requests.get(url, allow_redirects=True)
    url = re.search(r'jats&quot;,&quot;url&quot;:&quot;(.*?\.xml)', r.text).group(1)

    r = requests.get(url, allow_redirects=True)
    with open(xml_path, 'wb') as fp:
        fp.write(r.content)
def load_unidomains(path):
    ''' Loads unidomains file from json or downloads it if not found

    :path of unidomains.json file
    '''
    if not os.path.isfile(path):  # if not present, download
        pa_print.tprint('\nDownloading unidomains database...')
        r = requests.get(unidomains_url, allow_redirects=True)
        with open(path, 'wb') as fp:
            fp.write(r.content)

    with open(path, 'rb') as fp:
        unidomains = orjson.loads(fp.read())

    return unidomains
def load_bibtex(path):
    ''' Loads BibTeX file into object or downloads if not found

    :path of BibTeX file
    '''
    if not os.path.isfile(path):  # if not present, download
        pa_print.tprint('\nDownloading bibtex database...')
        r = requests.get(bibtex_url, allow_redirects=True)
        with open(path, 'wb') as fp:
            fp.write(r.content)

    with open(path) as bib_file:
        parser = bibtexparser.bparser.BibTexParser()
        parser.customization = bibtexparser.customization.convert_to_unicode
        bib_db = bibtexparser.load(bib_file, parser=parser)
        bib_db = bib_db.entries

    return bib_db
def import_config(filepath):
    ''' Imports a custom configuration for filter words and years

    :filepath the file path
    '''
    user_config = pd.read_csv(filepath, header=0, delimiter=',')
    user_config = user_config.fillna('')

    keywords = []
    ignore_words = []
    merge_words = []
    selected_years = []

    for config_tuple in user_config.itertuples(index=False):
        if config_tuple[0] == 'keywords':  # single list
            for i in config_tuple[1:]:
                keywords.append(i)
        elif config_tuple[0] == 'ignore':  # single list
            for i in config_tuple[1:]:
                ignore_words.append(i)
        elif config_tuple[0] == 'merge':  # list of lists
            merge_group = list(filter(None, config_tuple[1:]))
            merge_words.append(merge_group)
        elif config_tuple[0] == 'years':  # either a single year or a start/end pair
            year_num = [i for i in config_tuple if i != '']  # includes the 'years' label itself
            if len(year_num) == 2:  # only one year given
                selected_years.append(str(int(config_tuple[1])))
            else:  # a start and end year - expand the span
                year_span = int(config_tuple[2]) - int(config_tuple[1])
                for i in range(year_span + 1):
                    selected_years.append(str(int(config_tuple[1]) + i))

    keywords = list(filter(None, keywords))
    ignore_words = list(filter(None, ignore_words))

    pa_print.tprint('\nParameters from custom.csv:')
    if selected_years:
        pa_print.tprint(f'Selected years: {selected_years}')
    if keywords:
        pa_print.tprint(f'Search words: {keywords}')
    if ignore_words:
        pa_print.tprint(f'Ignored words: {ignore_words}')
    if merge_words:
        pa_print.tprint(f'Merged words: {merge_words}')

    return (keywords, ignore_words, merge_words, selected_years)
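
# Hedged sketch of the custom.csv layout that import_config() expects.
# pd.read_csv is called with header=0, so the first row is treated as a
# header; every following row starts with a directive name followed by its
# values. The concrete words and years below are illustrative only.
#
#   config,value1,value2
#   keywords,gesture,sensor
#   ignore,music,instrument
#   merge,violin,fiddle
#   years,2015,2020
#
# A 'years' row with two values is expanded into every year of the span
# (['2015', ..., '2020']); a row with a single year selects just that year.
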
def trim_headfoot(doc, pub=None):
    ''' Trim the header and footer from extracted text (unused and inferior to Grobid service)

    :document from text extraction (miner) or xml extraction (grobid)
    '''
    # Function for trimming header and footer
    # Remove until abstract or introduction
    pdf_trimmed = abst_regex.split(doc, 1)
    if len(pdf_trimmed) == 1:
        pdf_trimmed = intro_regex.split(
            pdf_trimmed[0], 1)  # if no abstract, use 'introduction'
        if len(pdf_trimmed) == 1:
            pdf_trimmed = pdf_trimmed[0]
            if pub is not None: pub['header fail'] = 'X'
            pa_print.tprint('Could not split header during parsing!')
        else:
            pdf_trimmed = pdf_trimmed[1]
            # pa_print.tprint('Split header at intro')
    else:
        pdf_trimmed = pdf_trimmed[1]
        # pa_print.tprint('Split header at abstract')
    # return pdf_trimmed

    # Remove after references or acknowledgements
    pdf_slimmed = ackn_regex.split(pdf_trimmed, 1)
    if len(pdf_slimmed) == 1:
        pdf_slimmed = ref_regex.split(pdf_slimmed[0], 1)
        if len(pdf_slimmed) == 1:
            if pub is not None: pub['footer fail'] = 'X'
            pa_print.tprint('Could not split footer during parsing!')
        else:
            pdf_slimmed = pdf_slimmed[0]
            # pa_print.tprint('Split footer at references')
    else:
        pdf_slimmed = pdf_slimmed[0]
        # pa_print.tprint('Split footer at acknowledgements')

    return pdf_slimmed
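
# The regexes used above (abst_regex, intro_regex, ackn_regex, ref_regex) are
# module-level globals not shown in this snippet. A hedged sketch of plausible
# definitions, purely so the splitting logic reads in isolation (the real
# patterns may differ):
#
#   abst_regex = re.compile(r'^\s*abstract\s*$', re.I | re.M)
#   intro_regex = re.compile(r'^\s*(?:1\.?\s*)?introduction\s*$', re.I | re.M)
#   ackn_regex = re.compile(r'^\s*acknowledge?ments?\s*$', re.I | re.M)
#   ref_regex = re.compile(r'^\s*(?:\d+\.?\s*)?references\s*$', re.I | re.M)
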
def extract_text(pub):
    '''Extracts text content from pdf using pdfminer.six, downloads the pdf if non-existent

    :publication (article) from database
    '''
    pdf_fn = pub['url'].split('/')[-1]
    pdf_path = pdf_src + pdf_fn

    # Allows for override of corrupted pdfs
    if not os.path.isfile(pdf_path):  # doesn't exist - download
        download_pdf(pdf_path, pub)

    # Page count for those without
    if pub['page count'] == 'N/A':
        pdf = open(pdf_path, 'rb')
        check = False
        while True:  # retry at most once after attempting a repair
            try:
                parser = PDFParser(pdf)
                document = PDFDocument(parser)
            except Exception as e:
                if check is True:
                    raise PSSyntaxError(
                        f'{pdf_path} appears to be malformed and qpdf cannot repair it.'
                    )
                pa_print.tprint(str(e))
                pa_print.tprint(f'Attempting to repair {pdf_path}')
                pike = pikepdf.Pdf.open(pdf_path, allow_overwriting_input=True)
                pike.save(pdf_path)
                check = True
                continue
            break

        pub['page count'] = resolve1(document.catalog['Pages'])['Count']

    fn = pdf_fn.split('.')[0]
    miner_text_file = f'{text_src}miner/miner_{fn}.txt'

    # Read miner text if exists
    if os.path.isfile(miner_text_file):
        with open(miner_text_file, 'r') as f:
            doc = f.read()
            return doc

    else:  # if not, make them
        pa_print.tprint(f'\nExtracting: {pdf_fn}')

        laparams = LAParams()
        laparams.all_texts = True  # also run layout analysis on text inside figures
        doc = extract_pdf(pdf_path, laparams=laparams)

        with open(miner_text_file, 'w') as f:
            f.write(doc)

        return doc
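
# Hedged note: LAParams and extract_pdf above are assumed to come from
# pdfminer.six, e.g.
#   from pdfminer.layout import LAParams
#   from pdfminer.high_level import extract_text as extract_pdf
# with all_texts=True so that layout analysis also covers text inside figures.
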
def request_scholar(pub, args):
    ''' Queries citations from Semantic Scholar

    :publication from bibtex file
    '''
    try:
        with open('./cache/json/scholar_cache.json', 'rb') as fp:
            scholar_cache = orjson.loads(fp.read())
    except FileNotFoundError:
        pa_print.tprint('\nCreating new Semantic Scholar cache!')
        scholar_cache = {}

    semantic_scholar_data = {
        "queryString": [],
        "page": 1,
        "pageSize": 1,
        "sort": "relevance",
        "authors": [],
        "coAuthors": [],
        "venues": [],
        "yearFilter": None,
        "requireViewablePdf": False,
        "publicationTypes": [],
        "externalContentTypes": []
    }

    # Fix names for searching
    regextitle = re.compile(r'[^a-zA-Z0-9 ]')
    regexname = re.compile(r'[^a-zA-Z- ]')
    author_last_list = []

    for _, last in pub['author names']:
        last = last.split('-')[-1]
        author_last_list.append(last)

    title = unidecode.unidecode(pub['title'])

    if args.nime:
        if title == 'Now':  # title is too short and returns unrelated papers - filter by forcing full author name
            author_last_list[0] = 'GarthPaine'

    pub['citation count'] = 'N/A'
    pub['key citation count'] = 'N/A'

    # Make query title, name and year lists
    query_title = list(
        dict.fromkeys([
            title,
            regextitle.sub('', title),
            ' '.join([w for w in title.split() if len(w) > 1])
        ]))
    if len(author_last_list) > 1:
        query_name = [' '.join(author_last_list), author_last_list[0], '']
    else:
        query_name = [author_last_list[0], '']
    query_year = ['', pub['year']]

    # Save query to be used for cache
    full_query = f"{title} {' '.join(author_last_list)} {pub['year']}"
    pub['scholar query'] = full_query

    if full_query not in scholar_cache or args.citations:
        pa_print.tprint(f'\nQuerying Semantic Scholar...')
        for temp in list(itertools.product(query_title, query_name,
                                           query_year)):

            # Generate new query from combination
            temp_title, temp_author, temp_year = temp[0], temp[1], temp[2]
            scholar_query = f'{temp_title} {temp_author} {temp_year}'
            semantic_scholar_data['queryString'] = scholar_query

            # Try query
            pa_print.tprint(f"Trying query: '{scholar_query}'")
            try:
                query_result = scholar_api(semantic_scholar_data)

            except Exception as e:
                query_result = {'results': {}}
                err_info = 'x - While querying Semantic Scholar an exception of type {0} occurred.\nArguments:\n{1!r}.'
                err_msg = err_info.format(type(e).__name__, e.args)
                pa_print.tprint(err_msg)

            if 'error' not in query_result:
                if bool(query_result['results']) and \
                bool(query_result['results'][0]['scorecardStats']) and \
                len(query_result['results'][0]['authors']) <= (len(author_last_list) + 1):
                    result_author = ' '.join([
                        t[0]['name']
                        for t in query_result['results'][0]['authors']
                    ])
                    result_author = regexname.sub(
                        '', unidecode.unidecode(result_author)).lower()
                    query_author = regexname.sub(
                        '', author_last_list[0].lower().split(' ')[-1])
                    if result_author.find(query_author) != -1:
                        pub['scholar query'] = scholar_query
                        pub['citation count'] = query_result['results'][0][
                            'scorecardStats'][0]['citationCount']
                        pub['key citation count'] = query_result['results'][0][
                            'scorecardStats'][0]['keyCitationCount']
                        scholar_cache[full_query] = query_result['results'][0][
                            'scorecardStats']
                        pa_print.tprint(
                            f"✓ - Paper has been cited {pub['citation count']} times"
                        )
                        break

        if pub['citation count'] == 'N/A':
            pa_print.tprint(
                'x - Cannot find citations for paper in Semantic Scholar')
            scholar_cache[full_query] = 'N/A'

        with open('./cache/json/scholar_cache.json', 'wb') as fp:
            fp.write(orjson.dumps(scholar_cache))

    else:
        if scholar_cache[full_query] != 'N/A':
            pub['citation count'] = scholar_cache[full_query][0][
                'citationCount']
            pub['key citation count'] = scholar_cache[full_query][0][
                'keyCitationCount']
        else:
            pub['citation count'] = 'N/A'
            pub['key citation count'] = 'N/A'

        pa_print.tprint(
            f"\no - Retrieved from cache: {pub['citation count']} citations")

    # Average citations per year of age
    if pub['citation count'] != 'N/A':
        pub['yearly citations'] = int(pub['citation count']) / pub['age']
    else:
        pub['yearly citations'] = 'N/A'
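
# Hedged sketch of the query fan-out above: itertools.product combines every
# title variant, author-name variant and year variant into candidate query
# strings, which are tried in order until one returns a confident match.
# The sample values are illustrative only.
import itertools

_titles = ['A Hypothetical NIME Paper!', 'A Hypothetical NIME Paper']  # raw and cleaned variants
_names = ['Doe Smith', 'Doe', '']
_years = ['', '2019']
_queries = [f'{t} {n} {y}' for t, n, y in itertools.product(_titles, _names, _years)]
# len(_queries) == 2 * 3 * 2 == 12 candidate query strings
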
def request_uni(unidomains, author_info, args, pub):
    ''' Extract university from email handle

    :publication from bibtex file
    '''
    pub_matches = 0
    grob_matches = 0
    text_matches = 0

    author_count = pub['author count']

    # Internal functions for lookup in unidomains.json
    def lookup_uni(handle, email_type, pub):
        nonlocal pub_matches
        for uni in unidomains:
            if handle in uni['domains']:
                pub[f'{email_type} author unis'].append((uni['name'], uni['country']))
                pub_matches += 1
                return True
        return False

    def handle_check(email, email_type, pub):
        handle = email.split('@')[-1].strip()

        # Look for handle in json, split once by dot and retry if not found
        uni_match = lookup_uni(handle, email_type, pub)
        while not uni_match and handle.count('.') > 1:
            handle = handle.split('.', 1)[-1]
            uni_match = lookup_uni(handle, email_type, pub)

    # 1) Using grobid derived emails to choose handle
    email_type = 'grobid'
    for author in range(author_count):
        email = pub['grobid emails'][author]
        if email != 'N/A':  # check for valid email
            handle_check(email, email_type, pub)

    grob_matches = pub_matches

    # 2) Using scraped author info block from header if not enough emails
    if len(author_info) > 0 and (grob_matches < author_count):
        email_type = 'text'
        for author in author_info:  # ! could be more authors than exist
            info_emails = email_regex.findall(author)  # look for '@handle.tld' in block
            for email in info_emails:  # case: multiple emails within an author block #! (will overwrite)
                if email != 'N/A':
                    handle_check(email, email_type, pub)

    # Fill in missing unis with 'N/A' # ! author block not linked in order with authors
    for type, author in [(type, author) for type in ['grobid', 'text']
                         for author in range(author_count)]:
        try:
            pub[f'{type} author unis'][author]
        except IndexError:
            pub[f'{type} author unis'].append('N/A')

    text_matches = pub_matches - grob_matches
    pub_matches = max(text_matches, grob_matches)

    pa_print.tprint(f'o - Found {pub_matches} unis from email handles\n')
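
# Hedged sketch of the shape of one unidomains entry that lookup_uni() scans
# (the real list is downloaded from unidomains_url); only the fields read by
# the code are shown and the values are illustrative.
_example_uni = {
    'name': 'Example University',
    'country': 'Exampleland',
    'domains': ['example.edu'],
}
# handle_check('jane@music.example.edu', ...) first tries 'music.example.edu',
# finds no match, then splits off the first subdomain and matches 'example.edu'.
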
def query_location(location_query, query_type, pub):  # query_type is now only used to print status
    # Load cache
    try:
        with open('./cache/json/location_cache.json', 'rb') as fp:
            location_cache = orjson.loads(fp.read())
    except FileNotFoundError:
        pa_print.tprint('\nCreating new location cache!')
        location_cache = {'N/A': 'N/A'}

    # Not cached
    if location_query not in location_cache:
        try:
            # location = geolocator.geocode(location_query, language="en") # Nominatim fallback
            # OpenCageGeocode: 2,500 req/day, 1 req/s - https://github.com/OpenCageData/python-opencage-geocoder
            location = geocoder.geocode(location_query,
                                        language='en',
                                        limit=1,
                                        no_annotations=1,
                                        no_record=1)[0]

            # Format result
            geometry = location['geometry']  # lat/long
            components = location['components']  # fine loc info
            location_info = (
                location['formatted'],
                (components['country'], components['continent']),
                (geometry['lat'], geometry['lng']),
                location['confidence'],  # 1 (>25 km) to 10 (<0.25 km)
            )

            location_cache[location_query] = location_info
            pub[f'{query_type} location info'].append(
                location_info[:3])  # add all location into one column
            pub[f'{query_type} location confidence'].append(
                location_info[3])  # confidence in separate column
            pa_print.tprint(
                f'✓ - Parsed {query_type} location: {location_info[0]}')
            time.sleep(1 + random.random())

        except Exception:  # API call failed
            location_cache[location_query] = 'N/A'
            pub[f'{query_type} location info'].append('N/A')
            pub[f'{query_type} location confidence'].append('N/A')
            pa_print.tprint(
                f'x - Could not parse {query_type} location: {location_query}')

        # Save changes to cache
        with open('./cache/json/location_cache.json', 'wb') as fp:
            fp.write(orjson.dumps(location_cache))

    # Cached
    else:
        if location_cache[location_query] != 'N/A' and location_query != 'N/A':
            location_info = location_cache[location_query]
            pub[f'{query_type} location info'].append(location_info[:3])
            pub[f'{query_type} location confidence'].append(location_info[3])
            pa_print.tprint(
                f'o - Cached {query_type} location: {location_info[0]}')

        else:
            location_info = 'N/A'
            pub[f'{query_type} location info'].append('N/A')
            pub[f'{query_type} location confidence'].append('N/A')
            pa_print.tprint(f'o - Null {query_type} location: {location_info}')
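
# Hedged sketch of the tuple cached per query above, built from the OpenCage
# result fields the code reads ('formatted', 'components', 'geometry',
# 'confidence'); the concrete values are illustrative.
_example_location_info = (
    'Oslo, Norway',        # formatted address
    ('Norway', 'Europe'),  # (country, continent)
    (59.9133, 10.7389),    # (lat, lng)
    7,                     # confidence: 1 (>25 km) to 10 (<0.25 km)
)
# Elements [:3] go into the '... location info' column, element [3] into
# '... location confidence'.
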
def extract_author_info(doc, pub):
    ''' Searches through pdf text for author block using regex (no Grobid needed)

    :document from text extraction (miner) or xml extraction (grobid)
    :publication (article) from database
    '''
    pa_print.tprint('\nExtracting authors from paper...')

    author_info = []
    author_count = pub['author count']

    # * Method 1 - Look for block with email tail (bibtex not needed, more robust)
    author_info = auth_regex.findall(
        doc)[:author_count]  # grab only up to total authors

    if len(author_info) != 0:
        pa_print.tprint(f'✓ - Found by block')

    # * Method 2 - Look for block starting with author name (bibtex needed)
    else:
        for author in range(author_count):  # only look up to i authors
            author_first = pub['author names'][author][0]
            author_last = pub['author names'][author][1]
            pa_print.tprint(f'\nLooking for: {author_first} {author_last}')

            author_first = author_first.replace('\\',
                                                '')  # fixes issues with regex
            author_last = author_last.replace('\\', '')

            name_regex = r'(?:^.*' + author_first + r'.+' + author_last + r'.*$)(?:\s^[\S |].+$)*'
            author_search = re.search(name_regex, doc, re.M)
            try:
                author_info.append(author_search.group(0))
                pa_print.tprint('✓ - Found by name')
            except AttributeError:  # re.search returned None - no match
                pa_print.tprint('x - No match by name')

    pa_print.tprint(
        f'\n✓ - Found {len(author_info)} author(s) in paper of {author_count} total'
    )

    # If the number of author blocks found differs from the author count
    if len(author_info) < author_count:
        pub['author block mismatch'] = 'Too few'
    elif len(author_info) > author_count:
        pub['author block mismatch'] = 'Too many'

    # Add 'N/A' for missing authors # ! Note: Author block will not correspond in order to authors
    authors_missed = author_count - len(author_info)
    pub['author block missed'] = authors_missed
    for author in range(authors_missed):
        author_info.append('N/A')

    # Add for visibility with csv - # ! but may not be the best idea if processing afterwards
    pub['author infos'] = '\n\n'.join(author_info)

    return author_info
def extract_grobid(pub, bib_db, iterator):
    '''Parse xml files output from Grobid service (3rd party utility needed to generate files)

    :publication (article) from database
    '''
    def elem_text(elem, fill='N/A'):  # to get element text w/o error
        if elem:
            return elem.getText(separator=' ', strip=True)
        else:
            return fill

    if 'pubpub' in pub['url']:
        xml_name = f"nime{pub['year']}_{pub['article-number']}.xml"
    else:
        xml_name = pub['url'].split('/')[-1].split('.')[0] + '.tei.xml'

    xml_path = xml_src + xml_name

    if os.path.exists(xml_path):
        with open(xml_path, 'r') as tei:
            soup = BeautifulSoup(tei, 'lxml')

        if soup.analytic is None:
            pa_print.tprint(f'\n{xml_name} is empty!')
            return

        pa_print.tprint(f'\nParsing through grobid XML of {xml_name}')

        grob_names, grob_emails, grob_orgs, grob_addrs = [], [], [], []

        # Begin with parsing author info
        authors = soup.analytic.find_all('author')

        for author in authors:
            persname = author.persname
            if persname:
                firstname = elem_text(persname.find("forename", type="first"), '')
                middlename = elem_text(persname.find("forename", type="middle"), '')
                surname = elem_text(persname.surname, '')  # *** should this be find? ***
                name = (firstname, middlename, surname)
                grob_names.append(name)

            grob_emails.append(elem_text(author.email))

        # There's an issue where affils can be within an <author> alongside an author or independently
        # authors = [author for author in authors if not author.affiliation]
        affils = [author for author in authors if author.affiliation]
        for affil in affils:
            grob_orgs.append(elem_text(affil.orgname))
            grob_addrs.append(elem_text(affil.address))

        grob_info = [grob_names, grob_emails, grob_orgs, grob_addrs]

        # Fill in missing data with 'N/A'
        author_count = pub['author count']
        for author in range(author_count):
            for info in grob_info:
                try:
                    info[author]
                except IndexError:
                    info.append('N/A')

        # Add info to df - merge everything!
        pub['grobid author names'].extend(
            grob_names)  # to check who appeared in grobid info
        pub['grobid emails'].extend(grob_emails)
        pub['grobid organisations'].extend(grob_orgs)
        pub['grobid addresses'].extend(grob_addrs)

        # Extract meaningful text using grobid tags (within p tags) and save to txt
        grob_text_file = f"{text_src}grobid/grob_{xml_name.split('.')[0]}.txt"
        if os.path.isfile(grob_text_file):  # check if txt already exists
            with open(grob_text_file, 'r') as f:
                grob_text = f.read()
        else:
            # ! This needs to be a little more sophisticated
            # PubPub tei's have expansive body
            # /n and spaces need to be addressed
            grob_text = []
            grob_body = soup.body.find_all('p')
            for p in grob_body:
                p = re.sub(r'\s+', ' ', elem_text(p)).strip()
                grob_text.append(p)
            grob_text = str(grob_text)
            with open(grob_text_file, 'w') as f:
                f.write(grob_text)

        return grob_text

    else:  # No XML - populate
        pa_print.tprint('\nGrobid XML does not exist for paper!')
        if 'pubpub' in pub['url']:
            check_xml(bib_db, jats=True)
        else:
            check_xml(bib_db)
        iterator.clear()
        iterator.refresh()
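
# Hedged sketch of the Grobid TEI author markup the parsing loop above expects
# (BeautifulSoup with the lxml parser lowercases tag names, hence the
# lowercase find calls); the content is illustrative only.
#
#   <author>
#     <persName>
#       <forename type="first">Ada</forename>
#       <forename type="middle">K.</forename>
#       <surname>Lovelace</surname>
#     </persName>
#     <email>ada@somewhere.edu</email>
#     <affiliation>
#       <orgName>University of Somewhere</orgName>
#       <address>Somewhere, Country</address>
#     </affiliation>
#   </author>
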
def download_pdf(pdf_path, pub):
    pa_print.tprint('\nLocal PDF not found - downloading...')
    url = pub['url']
    r = requests.get(url, allow_redirects=True)
    with open(pdf_path, 'wb') as fp:
        fp.write(r.content)
    # * Load database for email handle to uni matching
    unidomains = load_unidomains(unidomains_path)

    # * Load and extract BibTeX
    bib_db = load_bibtex(bibtex_path)
    bib_db = extract_bibtex(bib_db, args)

    # * Loop here for Grobid/PDF population
    if args.grobid:
        check_xml(bib_db, True)

    # * Parse data through pdfs
    print('\nExtracting and parsing publication data...')
    iterator = tqdm(bib_db)
    for pub in iterator:
        pa_print.tprint(f"\n--- Now on: {pub['title']} ---")

        # Extract text from pdf if not PubPub
        if 'pubpub' not in pub['url']:
            doc = extract_text(pub)
            errored = doc_quality(doc, pub, 'text')  # check for errors

            # Only extract header meta-data if not errored
            if not errored:
                author_info = extract_author_info(doc, pub)
            else:
                author_info = []
        else:
            author_info = []

        # Extract doc from Grobid