Example no. 1
def replace_proper_nouns_in_line(line, to_replace):
    # Tokenize, POS-tag, and rewrite every proper noun (NNP): the token keeps
    # its first two characters and the replacement string is appended.
    toktok = ToktokTokenizer()
    tokenized = toktok.tokenize(line)
    tagged_sent = pos_tag(tokenized)
    for i in range(len(tokenized)):
        if tagged_sent[i][1] == "NNP":
            tokenized[i] = tokenized[i][:2] + to_replace

    return tokenized
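A minimal usage sketch for the snippet above (not from the original source); it assumes `pos_tag` is imported from `nltk`, `ToktokTokenizer` from `nltk.tokenize`, and that the tagger model data is installed. The tagged output shown is illustrative.

tokens = replace_proper_nouns_in_line("Alice visited Paris yesterday", "<NAME>")
# Each token tagged NNP keeps its first two characters and gets "<NAME>" appended,
# so the result might look like:
# ['Al<NAME>', 'visited', 'Pa<NAME>', 'yesterday']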
Example no. 2
def normalize_sentences(sentences: List[str]) -> List[str]:
    new_sentences = []
    tokenizer = ToktokTokenizer()
    morph = pymorphy2.MorphAnalyzer()
    for line in sentences:
        line = line.lower()
        line = ''.join(c if c in RUSSIAN_ALPHABET else ' ' for c in line)
        line = ' '.join(morph.parse(word)[0].normal_form for word in tokenizer.tokenize(line))
        new_sentences.append(line)
    return new_sentences
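A small driving sketch for the function above (assumed, not from the original): `RUSSIAN_ALPHABET` is not defined in the excerpt, so a plausible set of lowercase Cyrillic letters is used, and the usual imports (`List` from `typing`, `pymorphy2`, `ToktokTokenizer`) are presumed available.

RUSSIAN_ALPHABET = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')  # assumption: lowercase Cyrillic letters
print(normalize_sentences(['Коты спали на крышах.']))
# Expected (approximately): ['кот спать на крыша'] -- pymorphy2 lemmas of each token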
Example no. 3
    def __init__(self, synsets: List[List[str]], tokenizer=None):
        self.tokenizer = tokenizer or ToktokTokenizer()

        list_synsets = list()
        max_word_token_list_len = 0
        for synset in synsets:
            tokenized_synset = []
            for word in synset:
                tokenized_word = self.tokenizer.tokenize(word)
                max_word_token_list_len = max(max_word_token_list_len,
                                              len(tokenized_word))
                tokenized_synset.append(tokenized_word)
            list_synsets.append(tokenized_synset)

        dict_synsets = dict()
        for synset in list_synsets:
            tag = synset[0]
            for word in synset[1:]:
                dict_synsets[tuple(word)] = tuple(tag)

        self.synsets = dict_synsets
        self.max_word_token_list_len = max_word_token_list_len
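An illustrative construction for the `__init__` above; the enclosing class name is not shown in the excerpt, so `SynsetIndex` below is a hypothetical stand-in.

index = SynsetIndex([["animal", "cat", "dog"], ["vehicle", "sports car"]])
# The first entry of each synset serves as its tag: index.synsets maps
# ('cat',) -> ('animal',), ('dog',) -> ('animal',) and ('sports', 'car') -> ('vehicle',),
# while index.max_word_token_list_len is 2 because "sports car" tokenizes into two tokens.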
Example no. 4
from nltk import ToktokTokenizer
from readtxt import readtxt
from pattern.nl import attributive, predicative, parse

ConceptsFile = '/home/sander/Studie/Stage/fuzzy-octo-tribble/Entities/Concepts_UMLS.txt'
tokenize = ToktokTokenizer().tokenize

def IndexLastComma(Tokens):
    assert ',' in Tokens
    i = len(Tokens) - 1
    while Tokens[i] != ',':
        i -= 1
    return i
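# A quick illustrative call (not from the original source):
# IndexLastComma(['heart', 'attack', ',', 'acute', ',', 'severe'])  ->  4
# i.e. the index of the last ',' token; the assert fails if the list has no comma.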

def Variants(Concept):

    TC = tokenize(Concept)

    # Lowercase all tokens except for capitalized abbreviations  
    TokenizedConcept = [] 
    for Token in TC:
        if Token.upper() == Token:
            TokenizedConcept.append(Token)
        else:
            TokenizedConcept.append(Token.lower())    

    Variants = [TokenizedConcept]
    
    # First variant removes specification between brackets.
    # But is this always sound???
Example no. 5
 def __init__(self, tokenizer=None):
     self.tokenizer = tokenizer or ToktokTokenizer()
Example no. 6
def spellfix(s: str, corpora: Corpora, fix_threshold: float) -> str:
    tokenizer = ToktokTokenizer()
    return ' '.join(
        spellfix_word(word, corpora, fix_threshold)
        for word in tokenizer.tokenize(s))
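A hedged usage sketch; `Corpora` and `spellfix_word` are defined elsewhere in the original project, so they are only assumed here.

# corpora = Corpora(...)  # built elsewhere in the project
# fixed = spellfix('Ths sentnce has typos', corpora, fix_threshold=0.8)
# Each Toktok token is passed through spellfix_word and the results are re-joined
# with single spaces, so punctuation ends up space-separated from its word.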
Example no. 7
 def __init__(self, tokenizer=None):
     tokenizer = tokenizer or ToktokTokenizer()
     super(CoreNLPChunkingExtractor, self).__init__(tokenizer)
     self.parser = CoreNLPParser()
Example no. 8
def _tokenize_text(s):
    tokenizer = ToktokTokenizer()
    return tokenizer.tokenize(s)
Example no. 9
# Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    # Strip everything that is not a letter or whitespace; digits are kept
    # as well when remove_digits is False.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, '', text)
    return text
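# Illustrative call (not from the original source); assumes `import re` at module top:
# remove_special_characters("Well... this works, right?")  ->  'Well this works right'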


# Stemming the text
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
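# Illustrative call (not from the original source); assumes `import nltk` at module top:
# simple_stemmer("The runners were running quickly")
# -> roughly 'the runner were run quickli' (Porter stems each token and lowercases it)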


# Tokenization of text
tokenizer = ToktokTokenizer()

# set stopwords to english
stop_words = set(stopwords.words('english'))


# removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [
            token for token in tokens if token not in stop_words
        ]
    else:
        filtered_tokens = [
            token for token in tokens if token.lower() not in stop_words
        ]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
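A short illustrative call for the stopword filter above (not from the original source); it assumes the stopword list has been fetched via `nltk.download('stopwords')`.

print(remove_stopwords("The cats are sleeping on the roof"))
# -> 'cats sleeping roof': with is_lower_case=False the tokens are lowercased only
#    for the stopword lookup, so the surviving tokens keep their original casing.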
Example no. 10
 def update_with_sentence(self, s: str):
     tokenizer = ToktokTokenizer()
     self.update_with_list(tokenizer.tokenize(s))