Code Example #1
File: align.py  Project: zzcoolj/rosetta
import spacy
from spacy.lang.pl import Polish

# `align` is assumed to be the project's own sentence-alignment helper,
# imported elsewhere in align.py.

def sentence_alignment_from_one_paragraph(en_para, po_para):
    en_sent = []
    po_sent = []
    align_en = []
    align_po = []
    en_count = 0
    po_count = 0
    count = 0

    # English sentence segmentation with the statistical English model
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(en_para.strip())
    for sent in doc.sents:
        en_count += 1
        en_sent.append(sent.text)
        # print('*******'+sent.text)

    # Polish sentence segmentation: the blank language class plus a
    # rule-based sentencizer (no statistical Polish model is loaded)
    nlp = Polish()
    sentencizer = nlp.create_pipe("sentencizer")  # spaCy v2 API
    nlp.add_pipe(sentencizer)
    doc = nlp(po_para.strip())
    for sent in doc.sents:
        po_count += 1
        po_sent.append(sent.text)
        # print('-------'+sent.text)

    for a, b in align(en_sent, po_sent):
        count += 1
        # print('----->', a, '|||', b, '<------')
        align_en.append(a.split())
        align_po.append(b.split())
    # print('en sent count', en_count)
    # print('po sent count', po_count)
    print('aligned:', count)

    return align_en, align_po
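
The sentencizer above is attached via the spaCy v2 create_pipe/add_pipe(component) calls. Under spaCy v3 the same rule-based splitter is added by name; a minimal sketch of just the Polish sentence-splitting step, assuming spaCy v3:

from spacy.lang.pl import Polish

# Blank Polish pipeline with the rule-based sentencizer (spaCy v3 API)
nlp = Polish()
nlp.add_pipe("sentencizer")

doc = nlp("To jest pierwsze zdanie. A to jest drugie.")
for sent in doc.sents:
    print(sent.text)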
Code Example #2
    def __init__(self, texts_file, tags_file, clean_data=True, remove_stopwords=False, is_train=True):
        self.args = Parser().get_sections(['GENERAL', 'RNN', 'FLAIR'])
        self.max_sent_length = int(self.args['max_sent_length'])
        self.batch_size = int(self.args['batch_size'])
        self.emb_size = int(self.args['emb_size'])
        self.clean_data = clean_data
        self.remove_stopwords = remove_stopwords
        self.is_train = is_train

        self.nlp = Polish()  # blank Polish pipeline (no statistical model)
        self.df = self.build_dataframe(texts_file, tags_file)
        self.unk_emb = self.get_random_emb(self.emb_size)
        self.word2idx, self.idx2word = self.build_dict()
        if self.is_train:
            self.embeddings = self.get_embeddings(self.args['emb_path'])
Code Example #3
    def _getMeaningfulWords(self, query, language):
        # Polish and English here are the blank language classes from
        # spacy.lang.pl and spacy.lang.en
        if language == 'pl':
            nlp = Polish()
        elif language == 'en':
            nlp = English()
        else:
            raise ValueError(f'unsupported language {language}')

        query = self._lematize(query, language)

        # Tokenize with the blank pipeline and drop the language's stop words
        token_list = [token.text for token in nlp(query)]
        filtered_query = []
        for word in token_list:
            lexeme = nlp.vocab[word]
            if not lexeme.is_stop:
                filtered_query.append(word)
        return filtered_query
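
A minimal standalone sketch of the same stop-word filtering outside the class, assuming a blank Polish pipeline and a hypothetical query string:

from spacy.lang.pl import Polish

nlp = Polish()
query = "to jest przykładowe zapytanie o pogodę w Krakowie"  # hypothetical query
# Keep only tokens that are not on spaCy's Polish stop-word list
meaningful = [token.text for token in nlp(query) if not token.is_stop]
print(meaningful)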
Code Example #4
File: tests.py  Project: Atheam/text_algorithms
from longest_common_subseq import lcs, diff
# edit_distance, print_operations and get_operations are expected to come
# from elsewhere in the project; their import is not shown in this snippet
from spacy.lang.pl import Polish
from spacy.tokenizer import Tokenizer
from random import random

print("----------STRING EDITION VISUALIZED--------")
str_in_arr = ["los", "Łódź", "kwintesencja", "ATGAATCTTACCGCCTCG"]
str_out_arr = ["kloc", "Lodz", "quintessence", "ATGAGGCTCTGGCCCTG"]
for str_in, str_out in zip(str_in_arr, str_out_arr):
    print("\nEDITING", str_in, "INTO", str_out + "\n")
    arr = edit_distance(str_in, str_out)
    print_operations(str_in, str_out, get_operations(arr))

with open("romeo-i-julia-700.txt", "r") as f:
    text = f.read()
    tokenizer = Tokenizer(Polish().vocab)
    tokens = tokenizer(text)
    tokenized1 = []
    tokenized2 = []
    # Randomly drop about 3% of the tokens to produce two slightly
    # different versions of the text for the diff test
    for token in tokens:
        if random() >= 0.03:
            tokenized1.append(token)
        if random() >= 0.03:
            tokenized2.append(token)
    with open("tokenized1.txt", "w") as f:
        for token in tokenized1:
            f.write(token.text_with_ws)
    with open("tokenized2.txt", "w") as f:
        for token in tokenized2:
            f.write(token.text_with_ws)
Code Example #5
    for sent in sentences:
        for token in sent:
            token['ner'] = tags[i]
            i += 1

    return sentences

def required_files_exist(dir):
    required_files = [segmentation_xml, text_xml, named_xml, morphosyntax_xml]
    for file in required_files:
        if not os.path.isfile(os.path.join(path_prefix, corpus_path, dir, file)):
            return False

    return True

nlp = Polish()  # blank Polish pipeline (tokenizer only)
doc_id = 0
corpus = []

# Map NKJP-style named-entity categories to spaCy entity labels
NE_njkp_to_spacy = {'persName': 'PERSON',
                    'placeName': 'LOC',
                    'orgName': 'ORG',
                    'date': 'DATE',
                    'time': 'TIME',
                    'geogName': 'LOC'}

for f in os.listdir(os.path.join(path_prefix, corpus_path)):
    doc_json = {}
    current_folder = f

    if not os.path.isdir(os.path.join(path_prefix, corpus_path, current_folder)):
Code Example #6
from spacy.lang.pl import Polish
from spacy.tokenizer import Tokenizer


def file_to_tokens(path):
    # Rule-based tokenizer built on the blank Polish vocab (no model needed)
    tokenizer = Tokenizer(Polish().vocab)
    with open(path, 'r') as file:
        text = file.read()
        tokens = tokenizer(text)
    return list(map(str, tokens))
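
A usage sketch, assuming a hypothetical plain-text file przyklad.txt:

# przyklad.txt is a hypothetical file name; any UTF-8 Polish text file works
tokens = file_to_tokens("przyklad.txt")
print(tokens[:10])  # first ten tokens as plain strings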
Code Example #7
import nltk
import nltk.stem
import pandas as pd
from nltk.corpus import stopwords
from spacy.lang.pl import Polish
from spacy.lang.pl.examples import sentences
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

parser = Polish()  # blank Polish pipeline
# Note: NLTK's stopwords corpus does not include a Polish list, so the call
# below raises an error; `words` is also not defined anywhere in this snippet
stops = set(nltk.corpus.stopwords.words('polish'))
words = [word for word in words if word not in stops]

s = nltk.stem.WordNetLemmatizer()  # WordNet lemmatization covers English only


class Topic:
    def __init__(self):
        print('init')

    @staticmethod
    def preapare_data():
        with open('/home/hyperscypion/Desktop/database.chatbot', 'r') as file:
            read = file.read()
            read = read.splitlines()
            for text in read:
                # Drop commas, turn '|' separators into commas, strip periods
                text = text.replace(',', '').replace('|', ',').replace('.', '')
                text += '\n'
                with open('/home/hyperscypion/Desktop/database.csv',
                          'a') as fout:
                    fout.writelines(text)
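
Since NLTK ships no Polish stop-word list, the stop-word handling above cannot work as written. A minimal sketch of the same filtering using spaCy's own Polish stop words instead, with a hypothetical input sentence:

from spacy.lang.pl import Polish
from spacy.lang.pl.stop_words import STOP_WORDS

nlp = Polish()
# Hypothetical input; in the snippet above `words` is never defined
words = [token.text for token in nlp("To jest bardzo ważny dokument o pogodzie")]
# Filter against spaCy's Polish stop-word list instead of NLTK's
filtered = [w for w in words if w.lower() not in STOP_WORDS]
print(filtered)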