def process_paragraph(sent, without_output=False):
    """Annotate *sent* and join its cleaned sentences with the raw ones.

    The text is parsed twice: once as-is and once with newlines flattened
    to spaces; the two views are aligned by ``join_clean_raw``.  When
    *without_output* is true, the last element of each joined entry is
    dropped before returning.
    """
    raw_doc = nlp(sent)
    clean_doc = nlp(sent.replace('\n', ' '))
    formatted = doc_to_format(raw_doc)
    joined = join_clean_raw(clean_doc, formatted)
    if without_output:
        joined = [entry[:-1] for entry in joined]
    return joined
def parse_sentence(sent):
    """Tokenize *sent* with spaCy and return one annotation dict per token.

    Each dict carries the surface form, a running id, the lemma, and the
    coarse POS tag expanded through ``render_pos``.
    """
    doc = nlp(sent.replace('\n', ' '))
    return [
        {
            'word': token.text,
            'id': idx,
            'word_properties': {
                'lemma': token.lemma_,
            },
            'pos_tag': {
                'tag': token.pos_,
                **render_pos(token.pos_),
            },
        }
        for idx, token in enumerate(doc)
    ]
def test_03():
    """Print the textual-function parse of every sample sentence."""
    for raw in sents:
        parsed = FuncaoTextualSentenceParser(raw, nlp(raw))
        print(parsed)
        print()
def annotate(self, pars):
    """Annotate every paragraph in *pars* and classify the whole text.

    Each paragraph is flattened, sentence-split with spaCy, and every
    sentence is annotated via ``self.stj``.  The short labels collected
    across all sentences feed the overall text classification.

    :param pars: iterable of raw paragraph strings.
    :return: dict with 'text_classification' and per-paragraph
        'paragraphs' (sentence id, original text, annotation).
    """
    collected_labels = []
    annotated_paragraphs = []
    for raw_par in pars:
        flattened = raw_par.replace('\n', ' ').strip()
        current = []
        for sent_id, sentence in enumerate(nlp(flattened).sents):
            short_labels, annotation = self.stj.annotate(sentence)
            collected_labels += short_labels
            current.append({
                'sent_id': sent_id,
                'original': sentence.text,
                'annotation': annotation,
            })
        annotated_paragraphs.append(current)
    transformed = ClasificacaoTextual.seq_transform(collected_labels)
    classification = ClasificacaoTextual.classify(transformed)
    return {
        'text_classification': classification,
        'paragraphs': annotated_paragraphs,
    }
def test_02():
    """Parse the first sample sentence and print its textual function."""
    parsed_docs = [nlp(s) for s in sents]
    first = 0
    result = FuncaoTextualSentenceParser(sents[first], parsed_docs[first])
    print(result)
def test_03():
    """Run the connector parser on a single hand-picked sentence."""
    samples = ['Entendeu-se por isso a culpa.']
    doc = nlp(samples[0])
    parser = ParseConnectors()
    # A. connectors
    connectors = parser.parse(doc)
    print('[connectors]\n', connectors)
def test_ssj():
    """Annotate the first four sample sentences and dump labels/mappings."""
    for raw in sents[:4]:
        doc = nlp(raw)
        annotator = SentenceToJson()
        labels, mapping = annotator.annotate(doc)
        print(labels)
        print(mapping)
        # visual separator between sentences
        print('======')
def parse_sentences_list(sents_l):
    """Split a raw text blob into sentences and parse each one.

    NOTE: despite the name, *sents_l* is a single string (the original
    carried a "not a list" comment), which is newline-flattened and
    sentence-split by spaCy before each sentence goes through
    ``parse_sentence``.
    """
    doc = nlp(sents_l.replace('\n', ' '))
    texts = [span.text for span in doc.sents]
    return [
        {
            'sent_id': idx,
            'original': sentence_text,
            'parsed': parse_sentence(sentence_text),
        }
        for idx, sentence_text in enumerate(texts)
    ]
def test_01():
    """Run the causality parser over every sample sentence and print hits."""
    docs = [nlp(s) for s in sents]
    parser = CausalityParser()
    for doc in docs:
        events = parser.parse_causality(doc)
        print('sent:', doc)
        for event in events:
            print(event)
        print()
def __init__(self, paragraph):
    """Prepare line and token views of *paragraph* for later analysis.

    The paragraph is flattened to a single line, split into sentences
    with spaCy, and each sentence is tokenized into (text, POS) pairs.

    :param paragraph: raw paragraph text (may contain hard line breaks).
    """
    self.last_symbol_line = "!,.:;?)}]"   # punctuation allowed to end a line
    self.first_symbol_line = "({["        # punctuation allowed to open a line
    # Prepositions / contractions of "por" (plus "para").
    # BUGFIX: the original list contained 'pelo' twice and was missing
    # the masculine plural 'pelos' (pelo/pela/pelos/pelas).
    self.prep_ll = ['para', 'por', 'pelo', 'pela', 'pelos', 'pelas']
    self.consonants = 'bcdfghjklmnpqrstvwxyz'
    # (a) and (b): flatten hard line breaks inside the paragraph
    self.paragraph = paragraph.replace('\n', ' ')
    # (c): lazy sentence segmentation (the eager list variant was
    # commented out in the original)
    self.lines = nlp(self.paragraph).sents
    # (d): per-sentence (token text, POS tag) pairs
    self.tokenized_lines = []
    for line in self.lines:
        self.tokenized_lines.append(
            [(token.text, token.pos_) for token in line]
        )
def annotate(self, pars):
    """Compute the textual function per paragraph and for the whole text.

    :param pars: iterable of raw paragraph strings.
    :return: dict with the whole-text 'tabela' and the per-paragraph
        textual-function results under 'paragraphs'.
    """
    per_paragraph = []
    every_sentence = []
    for par_id, raw_par in enumerate(pars):
        flattened = raw_par.replace('\n', ' ').strip()
        sent_docs = [nlp(s.text.strip()) for s in nlp(flattened).sents]
        every_sentence += sent_docs
        parsed = FuncaoTextual(sent_docs).parse()
        # read as in the original; the final table below overrides this
        table = parsed['table']
        per_paragraph.append({
            'id_par': par_id,
            'funcao_textual': parsed['textual_function'],
        })
    whole_text = FuncaoTextual(every_sentence).parse()
    table = whole_text['table']
    return {'tabela': table, 'paragraphs': per_paragraph}
def test_01():
    """Exercise the connector, verb, and denomination parsers on all samples."""
    connector_parser = ParseConnectors()
    verb_parser = ParseVerbs()
    denom_parser = ParseDenominacao()
    for raw in sents:
        doc = nlp(raw)
        # A. connectors
        print(connector_parser.parse(doc))
        # B. Verbs
        print(verb_parser.parse(doc))
        # C. Denominacao
        print(denom_parser.parse(doc))
def vocabulary_of_paragraph(text, exclude_list=None):
    """Extract a deduplicated, upper-cased content-word vocabulary from *text*.

    Only NOUN / VERB / ADJ / PROPN tokens are kept; Portuguese stopwords
    and any word in *exclude_list* (compared case-insensitively) are
    dropped.  First-occurrence order is preserved.

    :param text: paragraph to analyze.
    :param exclude_list: optional extra words to exclude.
    :return: list of unique upper-cased vocabulary words.
    """
    # BUGFIX: the original used a mutable default argument (exclude_list=[]),
    # which is shared across calls.
    excluded = {x.lower() for x in (exclude_list or [])}
    doc = nlp(text)
    candidates = [
        token.orth_.upper()
        for token in doc
        if token.pos_ in ('NOUN', 'VERB', 'ADJ', 'PROPN')
    ]
    kept = [
        word for word in candidates
        if word.lower() not in stopwords_pt and word.lower() not in excluded
    ]
    # Order-preserving dedup (replaces the original O(n^2) scan).
    return list(dict.fromkeys(kept))
def test_02():
    """Run the three parsers over two fixed sentences and print each result."""
    samples = ['Por isso ele não foi à escola.',
               'Entendeu-se por isso a culpa.']
    connector_parser = ParseConnectors()
    verb_parser = ParseVerbs()
    denom_parser = ParseDenominacao()
    for raw in samples:
        doc = nlp(raw)
        print(doc)
        # A. connectors
        print('[connectors]\n', connector_parser.parse(doc))
        # B. Verbs
        print('[verbs]\n', verb_parser.parse(doc))
        # C. Denominacao
        print('[den]\n', denom_parser.parse(doc))
def test_01():
    """Parse the module-level sample *text* and print its textual function."""
    docs = [nlp(text)]
    result = FuncaoTextual(docs).parse()
    print(result)