from typing import Callable, List, Optional, Tuple

import numpy as np
from nltk import tokenize


def lix(corpus: Corpus, _: str, callback: Callable) -> Optional[Tuple[np.ndarray, List[str]]]:
    """
    Readability index LIX
    https://en.wikipedia.org/wiki/Lix_(readability_test)
    """
    # Corpus and preprocess_only_words come from the enclosing project module
    corpus = preprocess_only_words(corpus)
    tokenizer = tokenize.PunktSentenceTokenizer()

    def lix_index(document, tokens):
        callback()
        # if the text is a single sentence, scores will be high
        sentences = len(tokenizer.tokenize(document))
        words = len(tokens)
        long_words = len([token for token in tokens if len(token) > 6])
        try:
            return words / sentences + (long_words * 100 / words)
        except ZeroDivisionError:
            return 0

    return (
        np.c_[[
            lix_index(d, tokens)
            for d, tokens in zip(corpus.documents, corpus.tokens)
        ]],
        ["LIX index"],
    )
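# A minimal, self-contained sketch of the same LIX formula on a plain
# string, using only nltk. The sample text and the naive whitespace word
# split are assumptions for illustration; the function above gets its
# word tokens from the Corpus instead.
def lix_sketch(text: str) -> float:
    tokenizer = tokenize.PunktSentenceTokenizer()
    sentences = tokenizer.tokenize(text)            # Punkt sentence split
    words = text.split()                            # naive word split (assumption)
    long_words = [w for w in words if len(w) > 6]   # LIX counts words over 6 chars
    if not sentences or not words:
        return 0.0
    return len(words) / len(sentences) + 100 * len(long_words) / len(words)

# lix_sketch("Short text. It is easy.")  ->  5 words / 2 sentences + 0 = 2.5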
class PunktSentenceTokenizer(BaseTokenizer):
    """ Tokenize by sentence.
    This example. Another example. → (This example.), (Another example.)
    """
    tokenizer = tokenize.PunktSentenceTokenizer()
    name = '句子'  # localized display name ("Sentence")

    @wait_nltk_data
    def __init__(self):
        super().__init__()
class PunktSentenceTokenizer(BaseTokenizer):
    """ Split by full-stop, keeping entire sentences. """
    tokenizer = tokenize.PunktSentenceTokenizer()
    name = 'Sentence'

    @wait_nltk_data
    def __init__(self):
        super().__init__()
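# Usage sketch for the wrapped tokenizer: constructed with no training
# text, PunktSentenceTokenizer falls back to its default boundary
# heuristics, which already handle simple prose like the example below.
punkt = tokenize.PunktSentenceTokenizer()
print(punkt.tokenize("This example. Another example."))
# ['This example.', 'Another example.']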
def tokenization(sample_text):
    # Train a custom Punkt tokenizer on `train_text`; both `train_text`
    # and `filter` are defined elsewhere in the source module (this
    # `filter` shadows the builtin and is presumably a text-cleaning helper).
    cust_tokenizer = tokenize.PunktSentenceTokenizer(train_text)
    text = filter(sample_text)
    # Then we can actually tokenize, using:
    return cust_tokenizer.tokenize(text)  # tokenize into sentences
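# Sketch of how a tokenizer like `cust_tokenizer` gets its domain
# knowledge: passing a training text to the constructor runs Punkt's
# unsupervised learner over it, so frequent abbreviations ("Dr.",
# "Prof.") stop being treated as sentence ends. The training corpus
# below is a hypothetical stand-in; real use needs far more text.
domain_corpus = "Dr. Smith ran the assay. Dr. Smith then met Prof. Jones. " * 50
trained_tokenizer = tokenize.PunktSentenceTokenizer(domain_corpus)
print(trained_tokenizer.tokenize("Dr. Smith arrived. The test began."))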
class PunktSentenceTokenizer(BaseTokenizer):
    """ Split by full-stop, keeping entire sentences. """
    tokenizer = tokenize.PunktSentenceTokenizer()
    name = 'Sentence'
import re
from typing import List, Tuple, Dict, Set, Pattern

from nltk import tokenize

from bionorm.common.models import SpeciesMention
from bionorm.normalizers.gene.GNormPlus.models import GNormPaper, GNormSpeciesAnnotation, SpeciesAnnotationPlacement, GNormGeneMention, \
    GNormPassage

HUMAN_ID = '9606'

TaxonomyFrequency = Dict[str, float]
HumanViruses = Set[str]
GeneWithoutSpPrefix = Set[str]
PrefixMap = Dict[str, Pattern[str]]

SENTENCE_TOKENIZER = tokenize.PunktSentenceTokenizer()


def assign_species(paper: GNormPaper, taxonomy_frequency: TaxonomyFrequency, human_viruses: HumanViruses,
                   gene_without_sp_prefix: GeneWithoutSpPrefix, prefix_map: PrefixMap):
    species_to_num_hash: Dict[str, float] = {}
    for passage in paper.passages:  # type: GNormPassage
        for species in passage.species:  # type: SpeciesMention
            if species.id is None:
                continue
            ID = species.id
            weight = 1.0
            if passage.name == 'title':
                # species mentioned in the title count double
                weight = 2.0
            if ID in species_to_num_hash:
                # accumulate weighted mention counts per species ID
                species_to_num_hash[ID] += weight
            else:
                species_to_num_hash[ID] = weight
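# Downstream sketch (an assumption, not shown in this snippet): a
# module-level SENTENCE_TOKENIZER like the one above is typically used
# with span_tokenize, which yields (start, end) character offsets so
# mention offsets can be mapped to their containing sentence.
text = "BRCA1 is well studied in humans. Brca1 is the mouse ortholog."
for start, end in SENTENCE_TOKENIZER.span_tokenize(text):
    print(start, end, text[start:end])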