Code Example #1
File: align.py  Project: zzcoolj/rosetta
import spacy
from spacy.lang.pl import Polish

# `align` is assumed to be the project's own sentence-alignment helper,
# imported elsewhere in align.py.

def sentence_alignment_from_one_paragraph(en_para, po_para):
    en_sent = []
    po_sent = []
    align_en = []
    align_po = []
    en_count = 0
    po_count = 0
    count = 0

    # English sentence segmentation with the statistical English model
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(en_para.strip())
    for sent in doc.sents:
        en_count += 1
        en_sent.append(sent.text)
        # print('*******'+sent.text)

    # Polish sentence segmentation: the blank language class plus a
    # rule-based sentencizer (no statistical Polish model is loaded)
    nlp = Polish()
    sentencizer = nlp.create_pipe("sentencizer")  # spaCy v2 API
    nlp.add_pipe(sentencizer)
    doc = nlp(po_para.strip())
    for sent in doc.sents:
        po_count += 1
        po_sent.append(sent.text)
        # print('-------'+sent.text)

    for a, b in align(en_sent, po_sent):
        count += 1
        # print('----->', a, '|||', b, '<------')
        align_en.append(a.split())
        align_po.append(b.split())
    # print('en sent count', en_count)
    # print('po sent count', po_count)
    print('aligned:', count)

    return align_en, align_po
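
The sentencizer above is attached via the spaCy v2 create_pipe/add_pipe(component) calls. Under spaCy v3 the same rule-based splitter is added by name; a minimal sketch of just the Polish sentence-splitting step, assuming spaCy v3:

from spacy.lang.pl import Polish

# Blank Polish pipeline with the rule-based sentencizer (spaCy v3 API)
nlp = Polish()
nlp.add_pipe("sentencizer")

doc = nlp("To jest pierwsze zdanie. A to jest drugie.")
for sent in doc.sents:
    print(sent.text)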
Code Example #2
    def __init__(self, texts_file, tags_file, clean_data=True, remove_stopwords=False, is_train=True):
        self.args = Parser().get_sections(['GENERAL', 'RNN', 'FLAIR'])
        self.max_sent_length = int(self.args['max_sent_length'])
        self.batch_size = int(self.args['batch_size'])
        self.emb_size = int(self.args['emb_size'])
        self.clean_data = clean_data
        self.remove_stopwords = remove_stopwords
        self.is_train = is_train

        self.nlp = Polish()  # blank Polish pipeline (no statistical model)
        self.df = self.build_dataframe(texts_file, tags_file)
        self.unk_emb = self.get_random_emb(self.emb_size)
        self.word2idx, self.idx2word = self.build_dict()
        if self.is_train:
            self.embeddings = self.get_embeddings(self.args['emb_path'])
Code Example #3
    def _getMeaningfulWords(self, query, language):
        # Polish and English here are the blank language classes from
        # spacy.lang.pl and spacy.lang.en
        if language == 'pl':
            nlp = Polish()
        elif language == 'en':
            nlp = English()
        else:
            raise ValueError(f'unsupported language {language}')

        query = self._lematize(query, language)

        # Tokenize with the blank pipeline and drop the language's stop words
        token_list = [token.text for token in nlp(query)]
        filtered_query = []
        for word in token_list:
            lexeme = nlp.vocab[word]
            if not lexeme.is_stop:
                filtered_query.append(word)
        return filtered_query
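
A minimal standalone sketch of the same stop-word filtering outside the class, assuming a blank Polish pipeline and a hypothetical query string:

from spacy.lang.pl import Polish

nlp = Polish()
query = "to jest przykładowe zapytanie o pogodę w Krakowie"  # hypothetical query
# Keep only tokens that are not on spaCy's Polish stop-word list
meaningful = [token.text for token in nlp(query) if not token.is_stop]
print(meaningful)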
Code Example #4
File: tests.py  Project: Atheam/text_algorithms
from longest_common_subseq import lcs, diff
# edit_distance, print_operations and get_operations are expected to come
# from elsewhere in the project; their import is not shown in this snippet
from spacy.lang.pl import Polish
from spacy.tokenizer import Tokenizer
from random import random

print("----------STRING EDITION VISUALIZED--------")
str_in_arr = ["los", "Łódź", "kwintesencja", "ATGAATCTTACCGCCTCG"]
str_out_arr = ["kloc", "Lodz", "quintessence", "ATGAGGCTCTGGCCCTG"]
for str_in, str_out in zip(str_in_arr, str_out_arr):
    print("\nEDITING", str_in, "INTO", str_out + "\n")
    arr = edit_distance(str_in, str_out)
    print_operations(str_in, str_out, get_operations(arr))

with open("romeo-i-julia-700.txt", "r") as f:
    text = f.read()
    tokenizer = Tokenizer(Polish().vocab)
    tokens = tokenizer(text)
    tokenized1 = []
    tokenized2 = []
    # Randomly drop about 3% of the tokens to produce two slightly
    # different versions of the text for the diff test
    for token in tokens:
        if random() >= 0.03:
            tokenized1.append(token)
        if random() >= 0.03:
            tokenized2.append(token)
    with open("tokenized1.txt", "w") as f:
        for token in tokenized1:
            f.write(token.text_with_ws)
    with open("tokenized2.txt", "w") as f:
        for token in tokenized2:
            f.write(token.text_with_ws)
Code Example #5
    for sent in sentences:
        for token in sent:
            token['ner'] = tags[i]
            i += 1

    return sentences

def required_files_exist(dir):
    required_files = [segmentation_xml, text_xml, named_xml, morphosyntax_xml]
    for file in required_files:
        if not os.path.isfile(os.path.join(path_prefix, corpus_path, dir, file)):
            return False

    return True

nlp = Polish()  # blank Polish pipeline (tokenizer only)
doc_id = 0
corpus = []

# Map NKJP-style named-entity categories to spaCy entity labels
NE_njkp_to_spacy = {'persName': 'PERSON',
                    'placeName': 'LOC',
                    'orgName': 'ORG',
                    'date': 'DATE',
                    'time': 'TIME',
                    'geogName': 'LOC'}

for f in os.listdir(os.path.join(path_prefix, corpus_path)):
    doc_json = {}
    current_folder = f

    if not os.path.isdir(os.path.join(path_prefix, corpus_path, current_folder)):
Code Example #6
from spacy.lang.pl import Polish
from spacy.tokenizer import Tokenizer


def file_to_tokens(path):
    # Rule-based tokenizer built on the blank Polish vocab (no model needed)
    tokenizer = Tokenizer(Polish().vocab)
    with open(path, 'r') as file:
        text = file.read()
        tokens = tokenizer(text)
    return list(map(str, tokens))
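
A usage sketch, assuming a hypothetical plain-text file przyklad.txt:

# przyklad.txt is a hypothetical file name; any UTF-8 Polish text file works
tokens = file_to_tokens("przyklad.txt")
print(tokens[:10])  # first ten tokens as plain strings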
Code Example #7
import nltk
import nltk.stem
import pandas as pd
from nltk.corpus import stopwords
from spacy.lang.pl import Polish
from spacy.lang.pl.examples import sentences
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

parser = Polish()  # blank Polish pipeline
# Note: NLTK's stopwords corpus does not include a Polish list, so the call
# below raises an error; `words` is also not defined anywhere in this snippet
stops = set(nltk.corpus.stopwords.words('polish'))
words = [word for word in words if word not in stops]

s = nltk.stem.WordNetLemmatizer()  # WordNet lemmatization covers English only


class Topic:
    def __init__(self):
        print('init')

    @staticmethod
    def preapare_data():
        with open('/home/hyperscypion/Desktop/database.chatbot', 'r') as file:
            read = file.read()
            read = read.splitlines()
            for text in read:
                # Drop commas, turn '|' separators into commas, strip periods
                text = text.replace(',', '').replace('|', ',').replace('.', '')
                text += '\n'
                with open('/home/hyperscypion/Desktop/database.csv',
                          'a') as fout:
                    fout.writelines(text)
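
Since NLTK ships no Polish stop-word list, the stop-word handling above cannot work as written. A minimal sketch of the same filtering using spaCy's own Polish stop words instead, with a hypothetical input sentence:

from spacy.lang.pl import Polish
from spacy.lang.pl.stop_words import STOP_WORDS

nlp = Polish()
# Hypothetical input; in the snippet above `words` is never defined
words = [token.text for token in nlp("To jest bardzo ważny dokument o pogodzie")]
# Filter against spaCy's Polish stop-word list instead of NLTK's
filtered = [w for w in words if w.lower() not in STOP_WORDS]
print(filtered)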