import spacy
from spacy.lang.pl import Polish


def sentence_alignment_from_one_paragraph(en_para, po_para):
    """Segment an English and a Polish paragraph into sentences and align them."""
    en_sent = []
    po_sent = []
    align_en = []
    align_po = []
    en_count = 0
    po_count = 0
    count = 0

    # English sentence segmenter (statistical model)
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(en_para.strip())
    for sent in doc.sents:
        en_count += 1
        en_sent.append(sent.text)

    # Polish sentence segmenter: just the language with no model, plus a
    # rule-based sentencizer (spaCy v2-style API; in v3 use nlp.add_pipe("sentencizer"))
    nlp = Polish()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(po_para.strip())
    for sent in doc.sents:
        po_count += 1
        po_sent.append(sent.text)

    # align() is the external sentence aligner used by this project,
    # assumed to be imported elsewhere
    for a, b in align(en_sent, po_sent):
        count += 1
        align_en.append(a.split())
        align_po.append(b.split())

    print('aligned:', count)
    return align_en, align_po
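A minimal usage sketch for the aligner above; the paragraph pair is illustrative only, and it assumes that `en_core_web_sm` is installed and that `align` is provided by the project's sentence-alignment module.

en_para = "The cat sat on the mat. It was warm."
po_para = "Kot siedział na macie. Było ciepło."

# Parallel lists of tokenized English and Polish sentences
align_en, align_po = sentence_alignment_from_one_paragraph(en_para, po_para)
for en_tokens, po_tokens in zip(align_en, align_po):
    print(en_tokens, '|||', po_tokens)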
def __init__(self, texts_file, tags_file, clean_data=True, remove_stopwords=False, is_train=True):
    self.args = Parser().get_sections(['GENERAL', 'RNN', 'FLAIR'])
    self.max_sent_length = int(self.args['max_sent_length'])
    self.batch_size = int(self.args['batch_size'])
    self.emb_size = int(self.args['emb_size'])
    self.clean_data = clean_data
    self.remove_stopwords = remove_stopwords
    self.is_train = is_train
    self.nlp = Polish()
    self.df = self.build_dataframe(texts_file, tags_file)
    self.unk_emb = self.get_random_emb(self.emb_size)
    self.word2idx, self.idx2word = self.build_dict()
    if self.is_train:
        self.embeddings = self.get_embeddings(self.args['emb_path'])
def _getMeaningfulWords(self, query, language):
    # Blank spaCy pipelines; assumes `from spacy.lang.pl import Polish` and
    # `from spacy.lang.en import English` at module level
    if language == 'pl':
        nlp = Polish()
    elif language == 'en':
        nlp = English()
    else:
        raise ValueError(f'unsupported language {language}')
    query = self._lematize(query, language)
    token_list = [token.text for token in nlp(query)]
    filtered_query = []
    # Keep only tokens whose lexemes are not marked as stop words
    for word in token_list:
        lexeme = nlp.vocab[word]
        if not lexeme.is_stop:
            filtered_query.append(word)
    return filtered_query
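The stop-word filtering step can be tried in isolation; a short sketch assuming only spaCy is installed (a blank `Polish` pipeline needs no pretrained model) and using an illustrative query string.

from spacy.lang.pl import Polish

nlp = Polish()  # blank pipeline: tokenizer and vocab only, no statistical model
doc = nlp("To jest bardzo prosty przykład zapytania")
# Drop tokens whose lexemes the Polish vocab marks as stop words
meaningful = [token.text for token in doc if not nlp.vocab[token.text].is_stop]
print(meaningful)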
from longest_common_subseq import lcs, diff
from spacy.lang.pl import Polish
from spacy.tokenizer import Tokenizer
from random import random

print("----------STRING EDITION VISUALIZED--------")

# edit_distance, get_operations and print_operations are assumed to be
# defined (or imported) elsewhere in this project
str_in_arr = ["los", "Łódź", "kwintesencja", "ATGAATCTTACCGCCTCG"]
str_out_arr = ["kloc", "Lodz", "quintessence", "ATGAGGCTCTGGCCCTG"]
for str_in, str_out in zip(str_in_arr, str_out_arr):
    print("\nEDITING", str_in, "INTO", str_out + "\n")
    arr = edit_distance(str_in, str_out)
    print_operations(str_in, str_out, get_operations(arr))

with open("romeo-i-julia-700.txt", "r") as f:
    text = f.read()

# Tokenize the text with spaCy's rule-based Polish tokenizer
tokenizer = Tokenizer(Polish().vocab)
tokens = tokenizer(text)

# Build two noisy copies of the token stream by dropping ~3% of tokens at random
tokenized1 = []
tokenized2 = []
for token in tokens:
    if random() >= 0.03:
        tokenized1.append(token)
    if random() >= 0.03:
        tokenized2.append(token)

with open("tokenized1.txt", "w") as f:
    for token in tokenized1:
        f.write(token.text_with_ws)

with open("tokenized2.txt", "w") as f:
    for token in tokenized2:
        f.write(token.text_with_ws)
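For reference, a minimal Levenshtein-style sketch of the dynamic-programming table that `edit_distance` is assumed to compute; the project's own `edit_distance`, `get_operations`, and `print_operations` may differ in details.

def edit_distance_sketch(a, b):
    # dp[i][j] = minimum number of edits turning a[:i] into b[:j]
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(len(a) + 1):
        dp[i][0] = i
    for j in range(len(b) + 1):
        dp[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,          # delete a[i - 1]
                           dp[i][j - 1] + 1,          # insert b[j - 1]
                           dp[i - 1][j - 1] + cost)   # substitute or keep
    return dp

print(edit_distance_sketch("los", "kloc")[-1][-1])  # -> 2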
    # Tail of a tag-assignment helper (the enclosing def is not part of this
    # excerpt): copy NER tags onto each token, then return the sentences
    for sent in sentences:
        for token in sent:
            token['ner'] = tags[i]
            i += 1
    return sentences


def required_files_exist(dir):
    # All four NKJP annotation layers must exist for a document folder;
    # the *_xml file names and path_prefix/corpus_path are module-level constants
    required_files = [segmentation_xml, text_xml, named_xml, morphosyntax_xml]
    for file in required_files:
        if not os.path.isfile(os.path.join(path_prefix, corpus_path, dir, file)):
            return False
    return True


# Assumes `import os` and `from spacy.lang.pl import Polish` at the top of the script
nlp = Polish()
doc_id = 0
corpus = []
# Map NKJP named-entity categories to spaCy entity labels
NE_njkp_to_spacy = {'persName': 'PERSON', 'placeName': 'LOC', 'orgName': 'ORG',
                    'date': 'DATE', 'time': 'TIME', 'geogName': 'LOC'}

for f in os.listdir(os.path.join(path_prefix, corpus_path)):
    doc_json = {}
    current_folder = f
    if not os.path.isdir(os.path.join(path_prefix, corpus_path, current_folder)):
        continue  # skip non-directories (assumed completion; the excerpt is cut off here)
from spacy.lang.pl import Polish
from spacy.tokenizer import Tokenizer


def file_to_tokens(path):
    """Read a text file and return its tokens as strings, using spaCy's Polish tokenizer."""
    tokenizer = Tokenizer(Polish().vocab)
    with open(path, 'r') as file:
        text = file.read()
    tokens = tokenizer(text)
    return list(map(str, tokens))
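A quick, hypothetical way to exercise `file_to_tokens`; the file name below is only an example.

if __name__ == '__main__':
    tokens = file_to_tokens('romeo-i-julia-700.txt')  # any UTF-8 Polish text file
    print(len(tokens), 'tokens; first ten:', tokens[:10])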
import nltk
import nltk.stem
import pandas as pd
from nltk.corpus import stopwords
from spacy.lang.pl import Polish
from spacy.lang.pl.examples import sentences
from spacy.lang.pl.stop_words import STOP_WORDS
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

parser = Polish()
# NLTK ships no Polish stopword list, so spaCy's Polish STOP_WORDS are used here instead
stops = set(STOP_WORDS)
# `words` is assumed to be a token list built earlier in the original script
words = [word for word in words if word not in stops]
s = nltk.stem.WordNetLemmatizer()


class Topic:
    def __init__(self):
        print('init')

    @staticmethod
    def preapare_data():
        # Convert the raw chatbot database into a CSV file
        with open('/home/hyperscypion/Desktop/database.chatbot', 'r') as file:
            read = file.read()
        read = read.splitlines()
        for text in read:
            text = text.replace(',', '').replace('|', ',').replace('.', '')
            text += '\n'
            with open('/home/hyperscypion/Desktop/database.csv', 'a') as fout:
                fout.writelines(text)