def read_bioc_json(self, j): """Read a document from a JSON-formatted BioC representation. Currently this is specific to the PubMed corpus it was used on.""" self.id = 'pmc-' + j['id'] self.url = j['infons']['xref'] st = SentTokenizer() for i, passage in enumerate(j['passages']): if i == 0: lines = passage['text'].splitlines()[:3] # Guess that the author is the second line and the book # title and year are the third line. self.authors = lines[1].split(', ') self.book = re.sub(r' \(.+', '', lines[2]) m = re.match(r'.*\(([0-9]+)\)', lines[2]) if m: self.year = int(m.group(1)) for annotation in passage['annotations']: if annotation['infons']['value'] == 'article-title': a = annotation['locations'][0]['offset'] b = a + annotation['locations'][0]['length'] self.title = passage['text'][a:b - 1] elif annotation['infons']['value'] == 'abstract': a = annotation['locations'][0]['offset'] - 1 b = a + annotation['locations'][0]['length'] sec = {} sec['heading'] = 'Abstract' sec['text'] = st.tokenize(passage['text'][a:b]) if sec['text'][0] != 'null': self.sections.append(sec) else: sys.sterr.write('Unexpected infon value %s.\n' % (anntoation['infons']['value']))
def read_bioc_json(self, j): """Read a document from a string containing a JSON-formatted BioC representation. Currently this is specific to the PubMed corpus it was used on.""" self.id = 'pmc-' + j['id'] self.url = j['infons']['xref'] st = SentTokenizer() for i, passage in enumerate(j['passages']): if i == 0: lines = passage['text'].splitlines()[:3] # Guess that the author is the second line and the book # title and year are the third line. self.authors = lines[1].split(', ') self.book = re.sub(r' \(.+', '', lines[2]) m = re.match(r'.*\(([0-9]+)\)', lines[2]) if m: self.year = int(m.group(1)) for annotation in passage['annotations']: if annotation['infons']['value'] == 'article-title': a = annotation['locations'][0]['offset'] b = a + annotation['locations'][0]['length'] self.title = passage['text'][a:b-1] elif annotation['infons']['value'] == 'abstract': a = annotation['locations'][0]['offset'] - 1 b = a + annotation['locations'][0]['length'] sec = {} sec['heading'] = 'Abstract' sec['text'] = st.tokenize(passage['text'][a:b]) if sec['text'][0] != 'null': self.sections.append(sec) else: sys.sterr.write('Unexpected infon value %s.\n' % (anntoation['infons']['value']))
def __init__(self, fname=None, form=None):
    if fname and not form:
        # Guess the input format from the file name.
        if 'json' in fname:
            form = 'json'
        elif 'xml' in fname:
            form = 'sd'
        elif 'txt' in fname:
            form = 'text'

    j = {'info': {}}
    if fname and form == 'json':
        try:
            j = json.load(io.open(fname, 'r', encoding='utf-8'))
        except Exception as e:
            print('Error reading JSON document:', fname, file=sys.stderr)
            print(e, file=sys.stderr)
            sys.exit(1)

    if 'id' in j['info']:
        self.id = j['info']['id']
    elif fname:
        # Fall back to the file name, minus its extension, as the ID.
        self.id = re.sub(r'\.(json|xml|txt)$', '', os.path.basename(fname))
    else:
        self.id = ''
    self.authors = [x.strip() for x in j['info'].get('authors', [])]
    self.title = title_case(j['info'].get('title', ''))
    self.book = title_case(j['info'].get('book', ''))
    self.year = j['info'].get('year', '')
    self.url = j['info'].get('url', '')
    self.references = set(j.get('references', []))
    self.sections = j.get('sections', [])
    self.roles = {}
    self.corpus = None

    if fname and form == 'text':
        st = SentTokenizer()
        self.sections = [{'text': st.tokenize(
            io.open(fname, 'r', encoding='utf-8').read())}]
    elif fname and form == 'sd':
        self.read_sd(fname)
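# Usage sketch (illustrative; `Doc` is an assumed name for the enclosing
# class, and the file names are hypothetical). When `form` is omitted, the
# constructor guesses the format from the file name.
def _example_init():
    d1 = Doc('paper.json')                   # internal JSON format
    d2 = Doc('S0012345678901234-full.xml')   # ScienceDirect XML, via read_sd()
    d3 = Doc('notes.txt', form='text')       # plain text, sentence-tokenized
    return d1, d2, d3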
def read_sd(self, f, fref=None): """Read document contents from a ScienceDirect XML file.""" def get_para_sents(p): if p.find('list'): # Really this needs to be split into the paragraph text # before and after the list, but BeautifulSoup is a pain, and # this is good enough. l = p.find('list').replace_with(' ... ') sents = [ re.sub(r'\s+', ' ', x) for x in st.tokenize(p.get_text()) ] for para in l.find_all(['para', 'simple_para']): sents.extend([ re.sub(r'\s+', ' ', x) for x in st.tokenize(para.get_text()) ]) return sents return [re.sub(r'\s+', ' ', x) for x in st.tokenize(p.get_text())] if '-ref.xml' in f: return xml = io.open(f, 'r', encoding='utf-8').read() xml = ftfy.fix_text(xml, uncurl_quotes=False, fix_entities=False) xml = strtr(xml, {'e´': 'é', 'e`': 'è'}) xml = re.sub("([</])(dc|prism|ce|sb|xocs):", r"\1", xml) soup = BeautifulSoup(xml, 'lxml') try: pii = re.sub('[()-.]', '', soup.find('pii').string) except: print('No PII found for', f) return self.id = 'sd-' + pii.lower() self.authors = [] try: for author in soup('creator'): x = author.string.strip() name = re.sub('^.*, ', '', x) + ' ' + re.sub(',.*$', '', x) self.authors.append(name) except: pass if not self.authors and soup.editor: self.authors = [ x.get_text() + ' (ed.)' for x in soup.editor('authors') ] if soup.title: self.title = soup.title.string.strip() if soup.publicationname: self.book = soup.publicationname.string.strip() self.url = 'http://www.sciencedirect.com/science/article/pii/' + pii if soup.coverdate: # Dates are in format YYYY-MM-DD self.year = int(re.sub('-.*', '', soup.coverdate.string)) st = SentTokenizer() if soup.abstract: sec = { 'heading': 'Abstract', 'text': st.tokenize(soup.find('abstract-sec').get_text()) } self.sections.append(sec) sec_id = '' sec = {'text': []} sec_last = {'text': []} for p in soup.find_all(['para', 'simple-para']): if p.find_parents('outline'): continue elif p.find('list') and p.find('list').find('section-title'): continue elif p.find_parents('para'): continue elif p.find_parents('floats'): # Lest these show up at the start and be treated as an # abstract. sec_last['text'] += get_para_sents(p) continue if p.parent.name in ['section', 'biography']: p_sec_id = p.parent.get('id', '') if p_sec_id != sec_id: if sec['text']: self.sections.append(sec) sec = {'text': []} sec_id = p_sec_id heading = p.parent.find('section-title') if heading and heading.string: sec['heading'] = heading.string.strip() elif p.parent.name == 'biography': sec['heading'] = 'Biography' sec['text'] += get_para_sents(p) if sec['text']: self.sections.append(sec) if sec_last['text']: self.sections.append(sec_last) if soup.rawtext and len(self.sections) < 3: self.sections.append( {'text': st.tokenize(soup.rawtext.get_text())}) if len(self.text()) < 200: print(' ! Skip:', self.title, self.id + '. Missing text.') return if not fref: fref = f.replace('-full.xml', '-ref.xml') if os.path.exists(fref): reftext = io.open(fref, 'r', encoding='utf-8').read() self.references = set([ x.replace('PII:', 'sd-').lower() for x in re.findall('PII:[^<]+', reftext) ])
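# Usage sketch (illustrative; `Doc` is an assumed class name and the file
# name is hypothetical). read_sd() ignores '-ref.xml' files passed directly;
# for a '-full.xml' file it also looks for the companion '-ref.xml' in the
# same directory to populate `references`.
def _example_read_sd():
    doc = Doc()
    doc.read_sd('S0012345678901234-full.xml')
    print(doc.title, doc.book, doc.year)
    print(sorted(doc.references))  # 'sd-...' IDs from the -ref.xml, if present
    return doc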