Example No. 1
from os import getenv

from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.tokenize.punkt import PunktSentenceTokenizer


class CoreNLP:
    def __init__(self):
        # The CoreNLP server URL is taken from the CORENLP_SERVER environment variable.
        self.parser = CoreNLPDependencyParser(url=self.corenlp_server())
        self.sentence_tokenizer = PunktSentenceTokenizer()

    @staticmethod
    def corenlp_server():
        return getenv('CORENLP_SERVER')

    def dep_parse(self, text: str, conll_version=10) -> str:
        """Run the CoreNLP dependency parser (depparse, lemma) on ``text``
        and return the result in CoNLL format."""
        def get_conll(t):
            # raw_parse yields one DependencyGraph per input; unpack the single parse.
            deps, = self.parser.raw_parse(t)
            return deps.to_conll(conll_version)  # xrenner requires conll10

        # Split the text into sentences with Punkt, parse each one,
        # and join the per-sentence CoNLL blocks.
        sentences = self.sentence_tokenizer.sentences_from_text(text)
        return '\n'.join(map(get_conll, sentences))
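A minimal usage sketch, assuming a CoreNLP server with the depparse and lemma annotators is already running and reachable at the URL in CORENLP_SERVER; the address below is only illustrative:

import os

os.environ.setdefault('CORENLP_SERVER', 'http://localhost:9000')  # illustrative address, not from the original snippet

nlp = CoreNLP()
print(nlp.dep_parse("The cat sat on the mat. It purred."))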
Example No. 2
import re

from bs4 import BeautifulSoup
from nltk.tokenize.punkt import PunktSentenceTokenizer

# Assumed definition: the original snippet uses a precompiled regex named
# ``replace`` that is not shown; a whitespace-collapsing pattern is used here.
replace = re.compile(r'\s+')


def sentence_split(input_text):
    # Wrap the document in a single root element so the XML parser accepts it.
    input_text = "<root>" + input_text + "</root>"

    soup = BeautifulSoup(input_text, "xml")
    paragraphs = []
    for doc in soup.find('root').findAll('DOC'):
        if doc['type'] == 'story':
            # Story documents: collect the headlines and the <P> blocks inside <TEXT>.
            headlines = doc('HEADLINE')
            for h in headlines:
                paragraphs.append(h.contents[0])
            p_blocks = doc.find('TEXT').findAll('P')
            for p in p_blocks:
                paragraphs.append(p.contents[0])
        elif doc['type'] == 'multi':
            # Multi documents: take the raw <TEXT> contents as a single paragraph.
            paragraphs.append(doc.find('TEXT').contents[0])

    # Split each paragraph into sentences and normalize internal whitespace.
    sentences = []
    punkt = PunktSentenceTokenizer()
    for parag in paragraphs:
        for sent in punkt.sentences_from_text(parag, realign_boundaries=True):
            sentences.append(replace.sub(' ', sent).strip())
    return sentences
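A minimal sketch of calling sentence_split on a small DOC-style fragment of the kind the function expects; the markup below is made up for illustration:

sample = '''
<DOC id="d1" type="story">
  <HEADLINE>An example headline</HEADLINE>
  <TEXT>
    <P>First paragraph. It contains two sentences.</P>
    <P>Second paragraph in the same story.</P>
  </TEXT>
</DOC>
'''

for sentence in sentence_split(sample):
    print(sentence)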
Example No. 3
import pickle
import string
from os.path import dirname, join

from nltk.tokenize.punkt import PunktSentenceTokenizer


def sent_tokenize(text):
    # Load pre-trained Punkt parameters shipped next to this module.
    model_path = join(dirname(__file__), 'sent_tokenize_model_v1.0.pkl')
    with open(model_path, 'rb') as fs:
        punkt_param = pickle.load(fs)

    # Drop the learned sentence starters and register extra abbreviations so that
    # periods following them are not treated as sentence boundaries.
    punkt_param.sent_starters = {}
    abbrev_types = [
        'g.m.t', 'e.g', 'dr', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc',
        'tp', 'ts', 'ths', 'th', 'vs', 'tp', 'k.l', 'a.w.a.k.e', 't', 'a.i',
        '</i', 'g.w', 'ass', 'u.n.c.l.e', 't.e.s.t', 'ths', 'd.c', 've…', 'ts',
        'f.t', 'b.b', 'z.e', 's.g', 'm.p', 'g.u.y', 'l.c', 'g.i', 'j.f', 'r.r',
        'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q', 'b…', 'ph', 'j.k',
        'e.l', 'o.t', 's.a'
    ]
    # Single letters (initials) are also registered as abbreviations.
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)

    tokenizer = PunktSentenceTokenizer(punkt_param)
    sentences = tokenizer.sentences_from_text(text)
    return sentences
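A minimal usage sketch, assuming the sent_tokenize_model_v1.0.pkl file is present next to the module as the loader expects; the input text is illustrative:

text = "Dr. Jones arrived at 9 a.m. Mr. Smith was already there."

for sentence in sent_tokenize(text):
    print(sentence)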