Example #1
    def _parse_body_text(self, doc):
        """ Returns the body_text of a document as a <class 'list'> of <class 'dict'>.
        This should be a list of objects of some kind. Seems to be usually something like
        {'section_heading':  <class 'str'>,
         'text': <class 'str'>
         }

         """
        body_text = None

        if self.parse_full_text:
            # Fetch the raw PDF bytes from the scraper's GridFS bucket.
            paper_fs = gridfs.GridFS(self.db,
                                     collection='Scraper_share_osf_io_fs')
            pdf_file = paper_fs.get(doc['PDF_gridfs_id'])

            try:
                paragraphs = extract_paragraphs_pdf(BytesIO(pdf_file.read()))
            except Exception as e:
                print('Failed to extract PDF %s(%r) (%r)' %
                      (doc['Doi'], doc['PDF_gridfs_id'], e))
                traceback.print_exc()
                paragraphs = []

            body_text = [{
                'section_heading': None,
                'text': x
            } for x in paragraphs]

        return body_text
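A minimal standalone sketch of the same fetch-and-extract pattern (the database name and the metadata collection are assumptions; the GridFS bucket and field names come from the method above, and extract_paragraphs_pdf is assumed to be in scope, as it is throughout these snippets):

from io import BytesIO

import gridfs
from pymongo import MongoClient

db = MongoClient()['covid19']  # database name is an assumption
fs = gridfs.GridFS(db, collection='Scraper_share_osf_io_fs')

# Hypothetical metadata collection holding the GridFS ids.
doc = db['Scraper_share_osf_io'].find_one({'PDF_gridfs_id': {'$exists': True}})
if doc is not None:
    pdf_bytes = fs.get(doc['PDF_gridfs_id']).read()
    paragraphs = extract_paragraphs_pdf(BytesIO(pdf_bytes))
    body_text = [{'section_heading': None, 'text': p} for p in paragraphs]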
Example #2
def parse_biorxiv_doc(doc, db):
    parsed_doc = dict()
    parsed_doc['title'] = clean_title(doc['Title'])
    parsed_doc['doi'] = doc['Doi']
    parsed_doc['origin'] = "Scraper_chemrxiv_org"
    parsed_doc['link'] = doc['Link']
    parsed_doc['journal'] = doc['Journal']
    parsed_doc['publication_date'] = doc['Publication_Date']
    parsed_doc['authors'] = doc["Authors"]
    # Collapse internal whitespace in each abstract fragment before joining.
    parsed_doc['abstract'] = ' '.join(
        map(lambda x: re.sub(r'\s+', ' ', x), doc['Abstract']))
    parsed_doc['has_year'] = True
    parsed_doc['has_month'] = True
    parsed_doc['has_day'] = True

    paper_fs = gridfs.GridFS(db, collection='Scraper_chemrxiv_org_fs')
    pdf_file = paper_fs.get(doc['PDF_gridfs_id'])

    try:
        paragraphs = extract_paragraphs_pdf(BytesIO(pdf_file.read()))
    except Exception as e:
        print('Failed to extract PDF %s(%r) (%r)' %
              (doc['Doi'], doc['PDF_gridfs_id'], e))
        traceback.print_exc()
        paragraphs = []

    parsed_doc['body_text'] = [{
        'section_heading': None,
        'text': x
    } for x in paragraphs]

    return parsed_doc
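A hedged usage sketch for parse_biorxiv_doc (the database name and the query are assumptions; the metadata collection name mirrors the GridFS bucket used inside the function):

from pymongo import MongoClient

db = MongoClient()['covid19']  # database name is an assumption
raw = db['Scraper_chemrxiv_org'].find_one({'PDF_gridfs_id': {'$exists': True}})
if raw is not None:
    parsed = parse_biorxiv_doc(raw, db)
    print(parsed['doi'], len(parsed['body_text']), 'paragraphs')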
def parse_synopsis_doc(doc, db):
    parsed_doc = dict()
    parsed_doc['title'] = clean_title(doc['Title'])
    parsed_doc['link'] = doc['Link']
    parsed_doc['synopsis_link'] = doc['Synopsis_Link']
    parsed_doc['origin'] = "Scraper_public_health_ontario"
    parsed_doc['journal_string'] = doc['Journal_String'].strip(' \t\r.')
    parsed_doc['authors'] = doc["Authors"]
    parsed_doc['abstract'] = find_abstract(doc.get('Abstract'))

    paper_fs = gridfs.GridFS(db, collection='Scraper_publichealthontario_fs')
    pdf_file = paper_fs.get(doc['PDF_gridfs_id'])

    # with open('example.pdf', 'wb') as f:
    #     f.write(pdf_file.read())
    #     pdf_file.seek(0)

    try:
        paragraphs = extract_paragraphs_pdf(BytesIO(pdf_file.read()),
                                            return_dicts=True,
                                            only_printable=True)
    except Exception as e:
        print('Failed to extract PDF %s(%r) (%r)' %
              (doc['Doi'], doc['PDF_gridfs_id'], e))
        traceback.print_exc()
        paragraphs = []

    sections = {}
    last_sec = None
    for p in paragraphs:
        # Heuristic: section headings are set in a tall (> 18 pt high) but
        # narrow (< 230 pt wide) bounding box.
        height = p['bbox'][3] - p['bbox'][1]
        width = p['bbox'][2] - p['bbox'][0]
        is_heading = height > 18 and width < 230
        if is_heading:
            last_sec = p['text'].lower()
            sections[last_sec] = []
        elif last_sec is not None:
            sections[last_sec].append(p)

    parsed_doc['synopsis'] = {
        'summary': sections.get('one-minute summary', None),
        'additional_info': sections.get('additional information', None),
        'pho_reviewer_comments': sections.get('pho reviewers comments', None),
    }
    if all(x is None for x in parsed_doc['synopsis'].values()):
        parsed_doc['synopsis'] = None

    return parsed_doc
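To make the bounding-box heuristic concrete, here is a self-contained toy run with hand-made paragraph dicts (real ones come from extract_paragraphs_pdf with return_dicts=True; bbox is assumed to be (x0, y0, x1, y1) in points):

paragraphs = [
    # Tall (20 pt) and narrow (200 pt): classified as a heading.
    {'text': 'One-Minute Summary', 'bbox': (50, 700, 250, 720)},
    # Tall but wide (500 pt): a body paragraph, attached to the last heading.
    {'text': 'Case counts continue to rise.', 'bbox': (50, 640, 550, 690)},
]

sections, last_sec = {}, None
for p in paragraphs:
    height = p['bbox'][3] - p['bbox'][1]
    width = p['bbox'][2] - p['bbox'][0]
    if height > 18 and width < 230:
        last_sec = p['text'].lower()
        sections[last_sec] = []
    elif last_sec is not None:
        sections[last_sec].append(p)

assert list(sections) == ['one-minute summary']
assert len(sections['one-minute summary']) == 1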
def handle_doc(file_obj):
    collection, fs = auth_db()

    # Re-check the stored document: skip it if the current parser version
    # already processed it after its last upload.
    doc = collection.find_one({'_id': file_obj['_id']})
    if 'pdf_extraction_version' in doc and \
            doc['pdf_extraction_version'] == parser_version and \
            'parsed_date' in doc and \
            doc['parsed_date'] > doc['uploadDate']:
        return None, None

    pdf_file = fs.find_one(file_obj['_id'])
    data = BytesIO(pdf_file.read())
    try:
        paragraphs = extract_paragraphs_pdf(data,
                                            laparams=laparams,
                                            return_dicts=True)
        collection.update_one({'_id': file_obj['_id']}, {
            '$set': {
                'pdf_extraction_success': True,
                'pdf_extraction_plist': paragraphs,
                'pdf_extraction_exec': None,
                'pdf_extraction_version': parser_version,
                'parsed_date': datetime.datetime.now(),
            }
        })
        exc = None
    except Exception as e:
        paragraphs = None
        traceback.print_exc()
        exc = (f'Failed to extract PDF {file_obj["filename"]} {e}'
               + traceback.format_exc())
        collection.update_one({'_id': file_obj['_id']}, {
            '$set': {
                'pdf_extraction_success': False,
                'pdf_extraction_plist': None,
                'pdf_extraction_exec': exc,
                'pdf_extraction_version': parser_version,
                'parsed_date': datetime.datetime.now(),
            }
        })

    return paragraphs, exc
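A sketch of a driver loop over the GridFS files collection (auth_db() is the same helper handle_doc already calls; the empty filter and the projection are assumptions):

collection, fs = auth_db()

# handle_doc re-checks each document itself, so it is safe to iterate
# over everything and let it skip already-parsed files.
for file_obj in collection.find({}, projection=['_id', 'filename']):
    paragraphs, exc = handle_doc(file_obj)
    if exc is not None:
        print('extraction failed for', file_obj['filename'])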
def try_parse_pdf_hierarchy(pdf_file):
    try:
        paragraphs = extract_paragraphs_pdf(BytesIO(pdf_file.read()))
    except Exception as e:
        # `doc` is not in scope in this function, so only the exception
        # itself can be reported.
        print('Failed to extract PDF (%r)' % e)
        traceback.print_exc()
        paragraphs = []

    headings = r'\n' \
               r'(?:abstract|backgrounds?|introduction|methods?|' \
               r'results?|discussions?|conclusions?|acknowledgements?|' \
               r'references?)'
    continuing_section = fr'(?:.(?!{headings}))'
    sections = fr"""
    (?:
    (?:^|\n)\s*
    (?:
        abstract\s+(?P<abstract>{continuing_section}+)|
        backgrounds?\s+(?P<background>{continuing_section}+)|
        introduction\s+(?P<introduction>{continuing_section}+)|
        methods?\s+(?P<method>{continuing_section}+)|
        results?\s+(?P<result>{continuing_section}+)|
        discussions?\s+(?P<discussion>{continuing_section}+)|
        conclusions?\s+(?P<conclusion>{continuing_section}+)|
        acknowledgements?\s+(?P<acknowledgement>{continuing_section}+)|
        references?\s+(?P<reference>{continuing_section}+)
    )
    )+
    """
    sections = re.compile(sections, re.VERBOSE | re.DOTALL | re.IGNORECASE)

    body_text = '\n'.join(paragraphs)
    parsed_content = {'body': body_text}
    for match in re.finditer(sections, body_text):
        for name, value in match.groupdict().items():
            if value is not None:
                parsed_content[name] = value

    # print(body_text)

    return parsed_content
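To show what the section pattern captures, here is the same construction trimmed to three headings and run over a fabricated plain-text body:

import re

headings = r'\n(?:abstract|methods?|results?)'
continuing_section = fr'(?:.(?!{headings}))'
pattern = re.compile(fr"""
    (?:(?:^|\n)\s*
    (?:
        abstract\s+(?P<abstract>{continuing_section}+)|
        methods?\s+(?P<method>{continuing_section}+)|
        results?\s+(?P<result>{continuing_section}+)
    ))+
    """, re.VERBOSE | re.DOTALL | re.IGNORECASE)

text = 'abstract We study X.\nmethods We did Y.\nresults Z improved.'
for m in pattern.finditer(text):
    print({k: v for k, v in m.groupdict().items() if v is not None})
# {'abstract': 'We study X'}
# {'method': 'We did Y'}
# {'result': 'Z improved.'}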