Example #1
    def read_bioc_json(self, j):
        """Read a document from a JSON-formatted BioC representation.
        Currently this is specific to the PubMed corpus it was used on."""

        self.id = 'pmc-' + j['id']
        self.url = j['infons']['xref']

        st = SentTokenizer()
        for i, passage in enumerate(j['passages']):
            if i == 0:
                lines = passage['text'].splitlines()[:3]
                # Guess that the author is the second line and the book
                # title and year are the third line.
                self.authors = lines[1].split(', ')
                self.book = re.sub(r' \(.+', '', lines[2])
                m = re.match(r'.*\(([0-9]+)\)', lines[2])
                if m:
                    self.year = int(m.group(1))
            for annotation in passage['annotations']:
                if annotation['infons']['value'] == 'article-title':
                    a = annotation['locations'][0]['offset']
                    b = a + annotation['locations'][0]['length']
                    self.title = passage['text'][a:b - 1]
                elif annotation['infons']['value'] == 'abstract':
                    a = annotation['locations'][0]['offset'] - 1
                    b = a + annotation['locations'][0]['length']
                    sec = {}
                    sec['heading'] = 'Abstract'
                    sec['text'] = st.tokenize(passage['text'][a:b])
                    if sec['text'][0] != 'null':
                        self.sections.append(sec)
                else:
                    sys.stderr.write('Unexpected infon value %s.\n' %
                                     (annotation['infons']['value']))
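
For reference, a minimal sketch of the BioC-style JSON shape this reader
expects. The field names are taken from the code above; the values are
invented placeholders, not real corpus data.

    # Hypothetical input for read_bioc_json; only the keys are grounded in
    # the code above. Note the title slice drops the final character
    # (text[a:b - 1]), so 'length' here is one past the title's end.
    bioc_doc = {
        'id': '123456',
        'infons': {'xref': 'http://example.org/pmc/123456'},
        'passages': [{
            'text': 'Chapter Title\nSmith, J., Doe, A.\nSome Book (1999)',
            'annotations': [{
                'infons': {'value': 'article-title'},
                'locations': [{'offset': 0, 'length': 14}],
            }],
        }],
    }
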
Example #2
    def read_bioc_json(self, j):
        """Read a document from a string containing a JSON-formatted BioC
        representation. Currently this is specific to the PubMed corpus it
        was used on."""

        self.id = 'pmc-' + j['id']
        self.url = j['infons']['xref']

        st = SentTokenizer()
        for i, passage in enumerate(j['passages']):
            if i == 0:
                lines = passage['text'].splitlines()[:3]
                # Guess that the author is the second line and the book
                # title and year are the third line.
                self.authors = lines[1].split(', ')
                self.book = re.sub(r' \(.+', '', lines[2])
                m = re.match(r'.*\(([0-9]+)\)', lines[2])
                if m:
                    self.year = int(m.group(1))
            for annotation in passage['annotations']:
                if annotation['infons']['value'] == 'article-title':
                    a = annotation['locations'][0]['offset']
                    b = a + annotation['locations'][0]['length']
                    self.title = passage['text'][a:b-1]
                elif annotation['infons']['value'] == 'abstract':
                    a = annotation['locations'][0]['offset'] - 1
                    b = a + annotation['locations'][0]['length']
                    sec = {}
                    sec['heading'] = 'Abstract'
                    sec['text'] = st.tokenize(passage['text'][a:b])
                    if sec['text'][0] != 'null':
                        self.sections.append(sec)
                else:
                    sys.stderr.write('Unexpected infon value %s.\n' %
                                     (annotation['infons']['value']))
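
A tiny self-contained illustration of the offset/length slicing used for
the 'article-title' annotation above; the sample text and location are
invented, and the b - 1 mirrors the apparent off-by-one in the corpus
offsets.

    # Hypothetical annotation location; the keys match the BioC code above.
    text = 'A Sample Title. The abstract follows here.'
    loc = {'offset': 0, 'length': 16}
    a = loc['offset']
    b = a + loc['length']
    print(text[a:b - 1])  # prints: A Sample Title.
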
Example #3
    def __init__(self, fname=None, format=None):
        if fname and not format:
            if 'json' in fname:
                format = 'json'
            elif 'xml' in fname:
                format = 'sd'
            elif 'txt' in fname:
                format = 'text'

        j = {'info': {}}
        if fname and format == 'json':
            try:
                j = json.load(io.open(fname, 'r', encoding='utf-8'))
            except Exception as e:
                print('Error reading JSON document:', fname, file=sys.stderr)
                print(e, file=sys.stderr)

        self.id = j['info'].get('id', '')
        self.authors = [x.strip() for x in j['info'].get('authors', [])]
        self.title = title_case(j['info'].get('title', ''))
        self.book = title_case(j['info'].get('book', ''))
        self.year = j['info'].get('year', '')
        self.url = j['info'].get('url', '')
        self.references = set(j.get('references', []))
        self.sections = j.get('sections', [])
        self.roles = {}

        if fname and format == 'text':
            st = SentTokenizer()
            self.sections = [{'text': st.tokenize(
                io.open(fname, 'r', encoding='utf-8').read())}]
        elif fname and format == 'sd':
            self.read_sd(fname)
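
The constructor's format sniff is just a substring test on the filename. A
standalone sketch of the same logic (the function name is mine, not part of
the original code):

    def guess_format(fname):
        """Mirror the constructor's guesses: json -> JSON, xml -> sd
        (ScienceDirect), txt -> plain text."""
        if 'json' in fname:
            return 'json'
        elif 'xml' in fname:
            return 'sd'
        elif 'txt' in fname:
            return 'text'
        return None

    print(guess_format('paper-full.xml'))  # prints: sd
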
Example #4
    def __init__(self, fname=None, form=None):
        if fname and not form:
            if 'json' in fname:
                form = 'json'
            elif 'xml' in fname:
                form = 'sd'
            elif 'txt' in fname:
                form = 'text'

        j = {'info': {}}
        if fname and form == 'json':
            try:
                j = json.load(io.open(fname, 'r', encoding='utf-8'))
            except Exception as e:
                print('Error reading JSON document:', fname, file=sys.stderr)
                print(e, file=sys.stderr)
                sys.exit(1)

        if 'id' in j['info']:
            self.id = j['info']['id']
        elif fname:
            basename = os.path.basename(fname)
            basename = re.sub(r'\.(json|xml|txt)$', '', basename)
            self.id = basename
        else:
            self.id = ''
        self.authors = [x.strip() for x in j['info'].get('authors', [])]
        self.title = title_case(j['info'].get('title', ''))
        self.book = title_case(j['info'].get('book', ''))
        self.year = j['info'].get('year', '')
        self.url = j['info'].get('url', '')
        self.references = set(j.get('references', []))
        self.sections = j.get('sections', [])
        self.roles = {}
        self.corpus = None

        if fname and form == 'text':
            st = SentTokenizer()
            self.sections = [{'text': st.tokenize(
                io.open(fname, 'r', encoding='utf-8').read())}]
        elif fname and form == 'sd':
            self.read_sd(fname)
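
Putting the .get() calls together, a sketch of the native JSON layout the
constructor consumes; the keys come from the code above, and the values are
invented examples:

    # Hypothetical document file; only the key names are grounded in the
    # constructor above.
    doc_json = {
        'info': {
            'id': 'sd-s0000000000000000',
            'authors': ['Jane Smith', 'Alex Doe'],
            'title': 'An Example Title',
            'book': 'An Example Journal',
            'year': 1999,
            'url': 'http://example.org/article',
        },
        'references': ['sd-s0000000000000001'],
        'sections': [{'heading': 'Abstract', 'text': ['One sentence.']}],
    }
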
Example #5
    def read_sd(self, f, fref=None):
        """Read document contents from a ScienceDirect XML file."""
        def get_para_sents(p):
            if p.find('list'):
                # Really this needs to be split into the paragraph text
                # before and after the list, but BeautifulSoup is a pain, and
                # this is good enough.
                lst = p.find('list').replace_with(' ... ')
                sents = [
                    re.sub(r'\s+', ' ', x) for x in st.tokenize(p.get_text())
                ]
                for para in lst.find_all(['para', 'simple-para']):
                    sents.extend([
                        re.sub(r'\s+', ' ', x)
                        for x in st.tokenize(para.get_text())
                    ])
                return sents
            return [re.sub(r'\s+', ' ', x) for x in st.tokenize(p.get_text())]

        if '-ref.xml' in f:
            return

        xml = io.open(f, 'r', encoding='utf-8').read()
        xml = ftfy.fix_text(xml, uncurl_quotes=False, fix_entities=False)
        xml = strtr(xml, {'e´': 'é', 'e`': 'è'})
        xml = re.sub("([</])(dc|prism|ce|sb|xocs):", r"\1", xml)
        soup = BeautifulSoup(xml, 'lxml')

        try:
            pii = re.sub(r'[()\-.]', '', soup.find('pii').string)
        except (AttributeError, TypeError):
            print('No PII found for', f)
            return

        self.id = 'sd-' + pii.lower()
        self.authors = []
        try:
            for author in soup('creator'):
                x = author.string.strip()
                name = re.sub('^.*, ', '', x) + ' ' + re.sub(',.*$', '', x)
                self.authors.append(name)
        except AttributeError:
            pass

        if not self.authors and soup.editor:
            self.authors = [
                x.get_text() + ' (ed.)' for x in soup.editor('authors')
            ]

        if soup.title:
            self.title = soup.title.string.strip()
        if soup.publicationname:
            self.book = soup.publicationname.string.strip()
        self.url = 'http://www.sciencedirect.com/science/article/pii/' + pii
        if soup.coverdate:
            # Dates are in format YYYY-MM-DD
            self.year = int(re.sub('-.*', '', soup.coverdate.string))

        st = SentTokenizer()
        if soup.abstract:
            sec = {
                'heading': 'Abstract',
                'text': st.tokenize(soup.find('abstract-sec').get_text())
            }
            self.sections.append(sec)

        sec_id = ''
        sec = {'text': []}
        sec_last = {'text': []}
        for p in soup.find_all(['para', 'simple-para']):
            if p.find_parents('outline'):
                continue
            elif p.find('list') and p.find('list').find('section-title'):
                continue
            elif p.find_parents('para'):
                continue
            elif p.find_parents('floats'):
                # Lest these show up at the start and be treated as an
                # abstract.
                sec_last['text'] += get_para_sents(p)
                continue
            if p.parent.name in ['section', 'biography']:
                p_sec_id = p.parent.get('id', '')
                if p_sec_id != sec_id:
                    if sec['text']:
                        self.sections.append(sec)
                    sec = {'text': []}
                    sec_id = p_sec_id
                    heading = p.parent.find('section-title')
                    if heading and heading.string:
                        sec['heading'] = heading.string.strip()
                    elif p.parent.name == 'biography':
                        sec['heading'] = 'Biography'
            sec['text'] += get_para_sents(p)
        if sec['text']:
            self.sections.append(sec)
        if sec_last['text']:
            self.sections.append(sec_last)

        if soup.rawtext and len(self.sections) < 3:
            self.sections.append(
                {'text': st.tokenize(soup.rawtext.get_text())})

        if len(self.text()) < 200:
            print(' ! Skip:', self.title, self.id + '. Missing text.')
            return

        if not fref:
            fref = f.replace('-full.xml', '-ref.xml')

        if os.path.exists(fref):
            reftext = io.open(fref, 'r', encoding='utf-8').read()
            self.references = set([
                x.replace('PII:', 'sd-').lower()
                for x in re.findall('PII:[^<]+', reftext)
            ])
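
The creator handling above flips 'Last, First' into 'First Last' with two
substitutions; a quick standalone check (the sample name is invented):

    import re

    x = 'Curie, Marie'
    name = re.sub('^.*, ', '', x) + ' ' + re.sub(',.*$', '', x)
    print(name)  # prints: Marie Curie
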
Example #6
    def read_sd(self, f, fref=None):
        """Read document contents from a ScienceDirect XML file."""

        def get_para_sents(p):
            if p.find('list'):
                # Really this needs to be split into the paragraph text
                # before and after the list, but BeautifulSoup is a pain, and
                # this is good enough.
                lst = p.find('list').replace_with(' ... ')
                sents = [re.sub(r'\s+', ' ', x) for x in
                         st.tokenize(p.get_text())]
                for para in lst.find_all(['para', 'simple-para']):
                    sents.extend([re.sub(r'\s+', ' ', x) for x in
                                  st.tokenize(para.get_text())])
                return sents
            return [re.sub(r'\s+', ' ', x) for x in
                    st.tokenize(p.get_text())]

        if '-ref.xml' in f:
            return

        xml = io.open(f, 'r', encoding='utf-8').read()
        xml = ftfy.fix_text(xml, uncurl_quotes=False,
                            fix_entities=False)
        xml = strtr(xml, {'e´': 'é', 'e`': 'è'})
        xml = re.sub("([</])(dc|prism|ce|sb|xocs):", r"\1", xml)
        soup = BeautifulSoup(xml, 'lxml')

        try:
            pii = re.sub(r'[()\-.]', '', soup.find('pii').string)
        except (AttributeError, TypeError):
            print('No PII found for', f)
            return

        self.id = 'sd-' + pii.lower()
        self.authors = []
        try:
            for author in soup('creator'):
                x = author.string.strip()
                name = re.sub('^.*, ', '', x) + ' ' + re.sub(',.*$', '', x)
                self.authors.append(name)
        except AttributeError:
            pass

        if not self.authors and soup.editor:
            self.authors = [x.get_text() + ' (ed.)' for x in
                            soup.editor('authors')]

        if soup.title:
            self.title = soup.title.string.strip()
        if soup.publicationname:
            self.book = soup.publicationname.string.strip()
        self.url = 'http://www.sciencedirect.com/science/article/pii/' + pii
        if soup.coverdate:
            # Dates are in format YYYY-MM-DD
            self.year = int(re.sub('-.*', '', soup.coverdate.string))

        st = SentTokenizer()
        if soup.abstract:
            sec = {'heading': 'Abstract',
                   'text': st.tokenize(soup.find('abstract-sec').get_text())}
            self.sections.append(sec)

        sec_id = ''
        sec = {'text': []}
        sec_last = {'text': []}
        for p in soup.find_all(['para', 'simple-para']):
            if p.find_parents('outline'):
                continue
            elif p.find('list') and p.find('list').find('section-title'):
                continue
            elif p.find_parents('para'):
                continue
            elif p.find_parents('floats'):
                # Lest these show up at the start and be treated as an
                # abstract.
                sec_last['text'] += get_para_sents(p)
                continue
            if p.parent.name in ['section', 'biography']:
                p_sec_id = p.parent.get('id', '')
                if p_sec_id != sec_id:
                    if sec['text']:
                        self.sections.append(sec)
                    sec = {'text': []}
                    sec_id = p_sec_id
                    heading = p.parent.find('section-title')
                    if heading and heading.string:
                        sec['heading'] = heading.string.strip()
                    elif p.parent.name == 'biography':
                        sec['heading'] = 'Biography'
            sec['text'] += get_para_sents(p)
        if sec['text']:
            self.sections.append(sec)
        if sec_last['text']:
            self.sections.append(sec_last)

        if soup.rawtext and len(self.sections) < 3:
            self.sections.append({'text': st.tokenize(soup.rawtext.get_text())})

        if len(self.text()) < 200:
            print(' ! Skip:', self.title, self.id + '. Missing text.')
            return

        if not fref:
            fref = f.replace('-full.xml', '-ref.xml')

        if os.path.exists(fref):
            reftext = io.open(fref, 'r', encoding='utf-8').read()
            self.references = set([x.replace('PII:', 'sd-').lower() for x in
                                   re.findall('PII:[^<]+', reftext)])
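
Rather than configuring lxml namespaces, read_sd strips the namespace
prefixes (dc, prism, ce, sb, xocs) with a regex before parsing. A minimal
demonstration under that assumption (the XML fragment is invented; requires
bs4 and lxml):

    import re
    from bs4 import BeautifulSoup

    xml = '<ce:para id="p1">Body text.</ce:para>'
    xml = re.sub('([</])(dc|prism|ce|sb|xocs):', r'\1', xml)
    soup = BeautifulSoup(xml, 'lxml')
    print([p.name for p in soup.find_all('para')])  # prints: ['para']
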