Python NNSplit Examples, nnsplit.NNSplit Python Examples

Example #1

0

Show file

File: vader_classification.py Project: leeyuankang/text_mining_project

def perform_vader_classification(review_id, review):
    """Score one review sentence by sentence and return the result as a DataFrame.

    The review is split into sentences with the English NNSplit model. Each
    sentence is rebuilt from its token texts, lower-cased, scored with
    ``get_polarity`` and labelled with ``det_sentiment``. Sentences for which
    ``number_words`` reports fewer than 8 words are dropped after scoring.

    Parameters
    ----------
    review_id
        Identifier stored in the ``review_id`` column of every row.
    review : str
        Raw review text.

    Returns
    -------
    pandas.DataFrame
        Columns ``review_id``, ``sentence``, ``sen_lvl_polarity``,
        ``sen_lvl_sentiment``, ``review_lvl_polarity`` and
        ``review_lvl_sentiment``. The review-level polarity is the mean of the
        kept sentences' polarities, repeated on every row.
    """
    paragraphs = NNSplit("en").split([review])

    lowered_sentences = []
    for sentence in paragraphs[0]:
        # Every token text is followed by one space, so each rebuilt sentence
        # ends with a trailing space.
        rebuilt = "".join(token.text + " " for token in sentence)
        lowered_sentences.append(rebuilt.lower())

    data = pd.DataFrame(lowered_sentences, columns=["sentence"])
    data['review_id'] = review_id
    data['sen_lvl_polarity'] = data['sentence'].apply(get_polarity)
    data['sen_lvl_sentiment'] = data['sen_lvl_polarity'].apply(det_sentiment)

    # Keep only sentences with at least 8 words, then fix the column order.
    long_enough = data['sentence'].apply(number_words) >= 8
    data = data.loc[long_enough].reindex(
        columns=['review_id', 'sentence', 'sen_lvl_polarity', 'sen_lvl_sentiment'])

    # Review-level figures are derived from the kept sentences only.
    data['review_lvl_polarity'] = data['sen_lvl_polarity'].mean()
    data['review_lvl_sentiment'] = data['review_lvl_polarity'].apply(det_sentiment)

    return data

Example #2

0

Show file

def segment_setences(words, lang="en"):
    """Group timed words into sentences using NNSplit.

    The word texts are joined with single spaces and split into sentences by
    the NNSplit model for ``lang``. Each sentence is then mapped back onto a
    contiguous run of ``words`` so it can carry the timing of its first and
    last word.

    Parameters
    ----------
    words : sequence of dict
        Each entry must provide the keys ``"text"``, ``"start"`` and ``"end"``.
    lang : str
        Model name passed to ``NNSplit.load``.

    Returns
    -------
    list of dict
        One ``{"start", "end", "text"}`` dict per sentence, in order.
    """
    content = " ".join(word["text"] for word in words)

    sentences = []

    # Index of the first word that has not been assigned to a sentence yet.
    left = 0

    splits = NNSplit.load(lang).split([content])

    for tokens2d in tqdm(splits):
        for tokens in tokens2d:
            if left >= len(words):
                # Every word is already consumed, so no timing is left to
                # attach to further sentences (previously an IndexError).
                break

            text = "".join(str(token) for token in tokens).strip()

            # First guess: the sentence spans as many words as it has tokens,
            # clamped to the available words and never before ``left``.
            right = max(left, min(len(words), left + len(tokens)) - 1)

            # Walk back to the word the sentence actually ends with. The scan
            # stops at ``left`` so a failed match can no longer rewind into
            # words that belong to earlier sentences.
            while right > left and not text.endswith(words[right]["text"]):
                right -= 1

            sentences.append({
                "start": words[left]["start"],
                "end": words[right]["end"],
                "text": text
            })

            left = right + 1

    return sentences

Example #3

0

Show file

 def __init__(self, keyword, channel, contents_id):
     """Open the MySQL engine, load the English NNSplit model and store the arguments."""
     # NOTE(review): host, user and password are hard-coded in source;
     # consider moving them to configuration.
     url_template = "mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4"
     connection_url = url_template.format(
         'root', 'robot369', '1.221.75.76', 3306, 'datacast2')
     self.engine = create_engine(connection_url)

     self.keyword, self.channel = keyword, channel
     self.splitter = NNSplit.load("en")
     self.contents_id = contents_id

Example #4

0

Show file

File: test.py Project: s3bk/nltk-test

def ner_nnsplit():
    """Return a function that prints every NNSplit sentence of the given paragraphs.

    The model is loaded once from a fixed local path. The returned callable
    takes a list of paragraphs and prints each sentence prefixed with ``>``.
    """
    from nnsplit import NNSplit

    model_path = "/home/sebk/Rust/nnsplit/models/en/model"
    splitter = NNSplit(model_path)

    def ner(paras):
        # Flatten paragraphs -> sentences lazily, keeping the original order.
        sentences = (sent for para in splitter.split(paras) for sent in para)
        for sent in sentences:
            print(">", sent)

    return ner

Example #5

0

Show file

File: AspectM.py Project: alhassanaraouf/IO-Analytix

def SplitingText(text):
    """
    Split text into sentences.

    The standalone word "but" is replaced by a space before splitting.
    The replacement is case-sensitive, as before.

    Parameters
    ----------
    text : string
        Text to split.

    Returns
    -------
    normalized : list
        Result of ``normalize`` applied to the NNSplit output for ``text``.
    """
    import re

    splitter = NNSplit("en")
    # Match "but" only as a whole word. A plain str.replace also cut it out of
    # words such as "button" or "attribute" and corrupted the text.
    text = re.sub(r"\bbut\b", " ", text)
    split_text = splitter.split([text])
    normalized_split_text = normalize(split_text)
    return normalized_split_text

Example #6

0

Show file

def evaluate(subtitle_path, spacy_model, nnsplit_path, punctuation):
    """Compare NNSplit with two spaCy-based splitters and print the scores.

    A 100,000-item subset of the OpenSubtitles dataset is evaluated under
    several (remove-punctuation probability, lower-case-start probability)
    setups. For every setup and splitter, the mean of the values returned by
    ``Evaluator.evaluate`` is collected. The resulting table is printed both
    as a DataFrame and as markdown.

    Parameters
    ----------
    subtitle_path
        Passed to ``OpenSubtitlesDataset``.
    spacy_model
        Passed to both ``SpacyInterface`` instances.
    nnsplit_path
        Model path passed to ``NNSplit``.
    punctuation
        Passed to ``Evaluator``.
    """
    # nnsplit must be installed to evaluate
    from nnsplit import NNSplit

    print("Evaluating..")

    full_dataset = OpenSubtitlesDataset(subtitle_path, 1_000_000)
    dataset = data.Subset(full_dataset, np.arange(100_000))

    nnsplit_model = NNSplit(nnsplit_path, use_cuda=True, batch_size=2**7)
    targets = {
        "NNSplit": NNSplitInterface(nnsplit_model),
        "Spacy (Tagger)": SpacyInterface(spacy_model, use_sentencizer=False),
        "Spacy (Sentencizer)": SpacyInterface(spacy_model, use_sentencizer=True),
    }

    # setup name -> (remove_punct_prob, lower_start_prob)
    eval_setups = {
        "Clean": (0.0, 0.0),
        "Partial punctuation": (0.5, 0.0),
        "Partial case": (0.0, 0.5),
        "Partial punctuation and case": (0.5, 0.5),
        "No punctuation and case": (1.0, 1.0),
    }

    scores = {}
    predictions = {}

    for eval_name, probabilities in eval_setups.items():
        remove_punct_prob, lower_start_prob = probabilities

        per_target = {}
        scores[eval_name] = per_target
        evaluator = Evaluator(dataset, remove_punct_prob, lower_start_prob,
                              punctuation)

        for target_name, interface in targets.items():
            correct = evaluator.evaluate(interface.split)
            predictions[f"{eval_name}_{target_name}"] = {
                "samples": evaluator.texts,
                "correct": correct,
            }
            per_target[target_name] = correct.mean()

    # Transpose so that setups become rows and splitters become columns.
    table = pd.DataFrame.from_dict(scores).T
    print(table)
    print(table.to_markdown())

Example #7

0

Show file

File: tokenizers.py Project: ASAPP-H/clip2

def load_tokenizer(type_: str):
    """Load and return the tokenizer object for ``type_``.

    Returns ``None`` for tokenizers that are plain functions called on demand
    (``"nltk"``, ``"wboag"``, ``"syntok"``).

    Raises
    ------
    ValueError
        If ``type_`` is not a known tokenizer type.
    """
    print(f"Loading tokenizer: {type_}")

    if type_ in SPACY_MODELS:
        spacy.prefer_gpu()
        return spacy.load(type_)

    # nltk, wboag (mimic_utils) and syntok tokenization functions are called
    # on demand, so there is nothing to load.
    if type_ in ("nltk", "wboag", "syntok"):
        return None

    if type_ in ("stanfordnlp_pretrained", "stanfordnlp_whitespace"):
        download_stanfordnlp_models()
        # The two variants differ only in whether the input is pre-tokenized.
        return stanfordnlp.Pipeline(
            processors="tokenize",
            lang="en",
            tokenize_pretokenized=(type_ == "stanfordnlp_whitespace"),
            models_dir=STANFORDNLP_DIR,
        )

    if type_ == "nnsplit":
        return NNSplit("en")

    if type_ == "deepsegment":
        return DeepSegment("en")

    raise ValueError(f"Unknown tokenizer type: {type_}")

Example #8

0

Show file

# =============================================================================
# Moses: Used in cc_net https://github.com/luismsgomes/mosestokenizer
# =============================================================================
# Disabled scratch experiment (the `if False` guard means it never runs):
# Moses sentence splitter configured for 'de'.
# NOTE(review): `data` is not defined in the visible code; confirm where it
# comes from before enabling this branch.
if False:
    from mosestokenizer import *
    splitsents = MosesSentenceSplitter('de')
    splitsents([data])

# =============================================================================
# https://github.com/bminixhofer/nnsplit
# =============================================================================
# Disabled scratch experiment: NNSplit with the "de" model applied to `data`.
if False:
    from nnsplit import NNSplit

    splitter = NNSplit("de")

    res = splitter.split([data])

# =============================================================================
# More advanced: Deepsegment: Does not support German
# =============================================================================
# Disabled scratch experiment: DeepSegment.
# NOTE(review): the banner says German is not supported, yet 'de' is passed
# below; confirm the intent.
if False:
    from deepsegment import DeepSegment
    # The default language is 'en'
    segmenter = DeepSegment('de')

    # Reads data/start.txt into `data`; the segment() call below does not use it.
    with open('data/start.txt', 'r') as myfile:
        data = myfile.read()

    segmenter.segment('I am Batman i live in gotham')

Example #9

0

Show file

File: vader_classification.py Project: seanchai96/text_mining

def get_polarity(sentence):
    """Return the ``'compound'`` polarity score of *sentence*.

    A fresh ``SentimentIntensityAnalyzer`` is created on every call.
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(sentence)
    return scores['compound']


def number_words(sentence):
    """Count the runs of word characters (``\\w+``) in ``str(sentence)``."""
    word_matches = re.finditer(r'\w+', str(sentence))
    return sum(1 for _ in word_matches)


# Replace with new input
new_review = "You When I booked with your company on line you showed me pictures of a room I thought I was getting and paying for and then when we arrived that s room was booked and the staff told me we could only book the villa suite theough them directly Which was completely false advertising After being there we realised that you have grouped lots of rooms on the photos together leaving me the consumer confused and extreamly disgruntled especially as its my my wife s 40th birthday present Please make your website more clear through pricing and photos as again I didn t really know what I was paying for and how much it had wnded up being Your photos told me I was getting something I wasn t Not happy and won t be using you again "

splitter = NNSplit("en")
sent = splitter.split([new_review])

# Rebuild every sentence of the single input text from its token texts.
# Each token is followed by one space, so a trailing space is kept.
sent_list = [
    "".join(token.text + " " for token in sentence)
    for sentence in sent[0]
]

sent_list_lower = [text.lower() for text in sent_list]

# stop_list = stopwords.words('english')
# sent_list_lower_no_stopword_list = [[word for word in sent.split() if not word in stop_list] for sent in sent_list_lower]
# sent_list_lower_no_stopword = []
# for sent in sent_list_lower_no_stopword_list:
#     new_sent = ' '.join(sent)

Example #10

0

Show file

File: create_vector_script.py Project: TTDS-Dream-Team/Preprocessing

from nnsplit import NNSplit
from sentence_transformers import SentenceTransformer
import numpy as np
import h5py
from tqdm.auto import tqdm
import zlib
import pymongo
from mongo_proxy import MongoProxy
import json
from bson import ObjectId
import time
from threading import Thread, Lock
import gc
from guppy import hpy

# Load the English NNSplit model once at import time, with CUDA enabled.
splitter = NNSplit.load("en", use_cuda=True)

# Module-level threading lock. What it guards is not visible in this excerpt;
# presumably state shared between worker threads (TODO confirm).
lock = Lock()


class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``ObjectId`` values as their string form."""

    def default(self, o):
        """Return ``str(o)`` for an ``ObjectId``; defer anything else to the base class."""
        if not isinstance(o, ObjectId):
            return super().default(o)
        return str(o)


# NOTE(review): the database password is hard-coded in source; consider
# loading it from the environment or a secrets store instead.
db_pwd = "LTEG2pfoDiKfH29M"
client = MongoProxy(
    MongoClient(
        f"mongodb+srv://cdminix:{db_pwd}@cluster0.pdjrf.mongodb.net/Reviews_Data?retryWrites=true&w=majority"

Example #11

0

Show file

File: benchmark.py Project: marlon-br/nnsplit

 def nnsplit_init(**kwargs):
     """Wrap the model at ``../models/de/model.onnx`` in an ``NNSplitInterface``.

     All keyword arguments are forwarded to ``NNSplit``.
     """
     model = NNSplit("../models/de/model.onnx", **kwargs)
     return NNSplitInterface(model)

Example #12

0

Show file

    return DEFAULT_LANGUAGE_MODEL.split(s)


c = 'wethepeopleoftheunitedstatesinordertoformamoreperfectunionestablishjusticeinsuredomestictranquilityprovideforthecommondefencepromotethegeneralwelfareandsecuretheblessingsoflibertytoourselvesandourposteritydoordainandestablishthisconstitutionfortheunitedstatesofamerica'
d = 'WeholdthesetruthstobeselfevidentthatallmenarecreatedequalthattheyareendowedbytheirCreatorwithcertainunalienableRightsthatamongtheseareLifeLibertyandthepursuitofHappinessThattosecuretheserightsGovernmentsareinstitutedamongMenderivingtheirjustpowersfromtheconsentofthegovernedThatwheneveranyFormofGovernmentbecomesdestructiveoftheseendsitistheRightofthePeopletoalterortoabolishitandtoinstitutenewGovernmentlayingitsfoundationonsuchprinciplesandorganizingitspowersinsuchformastothemshallseemmostlikelytoeffecttheirSafetyandHappinessPrudenceindeedwilldictatethatGovernmentslongestablishedshouldnotbechangedforlightandtransientcausesandaccordinglyallexperiencehathshewnthatmankindaremoredisposedtosufferwhileevilsaresufferablethantorightthemselvesbyabolishingtheformstowhichtheyareaccustomedButwhenalongtrainofabusesandusurpationspursuinginvariablythesameObjectevincesadesigntoreducethemunderabsoluteDespotismitistheirrightitistheirdutytothrowoffsuchGovernmentandtoprovidenewGuardsfortheirfuturesecuritSuchhasbeenthepatientsufferanceoftheseColoniesandsuchisnowthenecessitywhichconstrainsthemtoaltertheirformerSystemsofGovernmentThehistoryofthepresentKingofGreatBritainisahistoryofrepeatedinjuriesandusurpationsallhavingindirectobjecttheestablishmentofanabsoluteTyrannyovertheseStatesToprovethisletFactsbesubmittedtoacandidworld'
r = 'HowdymynameisBrittanyPitcherandiamanelectricalengineeringmajorfromspringtxbutmostimportantlyiamtheloudestandproudestmemberofthefightingtexasaggieclassoftwentytwentyoneawhoop'
z = 'hellomynameisbrittanypitcherandmyfavoritecolorismarooniaminseniordesignrightnowthisiswhyiamworkingonthisprojectitismeanttohelpthosewhoarehardofhearingordeaftoovercomelanguagebarrierswiththeirpeersiamexcitedforittobefinishedandtodeterminghowwellitworks'

# Run each run-together sample (c, d, r and z) through split() and re-join the
# returned pieces with single spaces.
c = " ".join(split(c))
d = " ".join(split(d))
r = " ".join(split(r))
z = " ".join(split(z))

#try to split sentences

from nnsplit import NNSplit
splitter = NNSplit.load("en")

# NOTE(review): `res` is not defined in the visible code; confirm where it
# comes from.
splits = splitter.split([res])[0]

# i counts how many sentences are still to come after the current one.
i = len(splits) - 1
for sentence in splits:
    print(sentence, end='')
    if i <= 0:
        # Last sentence: plain full stop.
        print('.')
        continue
    # Earlier sentences: backspace first, then the full stop.
    print("\b.")
    i -= 1