from nltk.tokenize.punkt import PunktSentenceTokenizer


def tokenSentence(s):
    """Split the text s into sentences, one per line."""
    # Train a Punkt model on the text itself, then use it to tokenize.
    tokenizer = PunktSentenceTokenizer()
    tokenizer.train(s)
    sentences = tokenizer.tokenize(s)
    return '\n'.join(sentences)
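A minimal usage sketch for the function above; the sample string is invented:

sample = "Dr. Brown arrived at 9 a.m. on Monday. The meeting started late."
# Prints the detected sentences, one per line.
print(tokenSentence(sample))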
Example #2
import codecs
import pickle

from nltk.tokenize.punkt import PunktSentenceTokenizer


def train_sentence_splitter(lang):
    """
    Train an NLTK punkt tokenizer for sentence splitting.
    http://www.nltk.org
    """
    # Read in the training corpus
    plain_file = "%s.plain" % (lang)
    with codecs.open(plain_file, "r", "utf-8") as plain:
        text = plain.read()

    # Train tokenizer
    tokenizer = PunktSentenceTokenizer()
    tokenizer.train(text)

    # Dump the pickled tokenizer
    pickle_file = "%s.pickle" % (lang)
    with open(pickle_file, "wb") as out:
        pickle.dump(tokenizer, out)
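A sketch of how the pickled tokenizer could be loaded back and applied to new text; the language code "en" and the sample text here are hypothetical:

import pickle

# "en.pickle" would have been produced by train_sentence_splitter("en")
with open("en.pickle", "rb") as f:
    tokenizer = pickle.load(f)

print(tokenizer.tokenize("First sentence. Second one follows."))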
Example #4
import pickle

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer


def tokenizer(text):
    """Tokenize text into sentences, keeping only the first occurrence of each."""
    # Load a previously pickled abbreviation list so those abbreviations are
    # not mistaken for sentence boundaries.
    punkt_param = PunktParameters()
    with open("abbrev_list.pkl", "rb") as fp:
        abbrev_list = pickle.load(fp)
    punkt_param.abbrev_types = set(abbrev_list)

    tokenizer = PunktSentenceTokenizer(punkt_param)
    tokenizer.train(text)

    all_sentences = tokenizer.tokenize(text)

    # De-duplicate while preserving the original order.
    seen = set()
    sentences = []
    for sentence in all_sentences:
        if sentence not in seen:
            seen.add(sentence)
            sentences.append(sentence)

    return sentences
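The example assumes an existing "abbrev_list.pkl"; a sketch of one way such a file could be produced (the abbreviation strings are only illustrative):

import pickle

# Punkt stores abbreviation types lowercased and without the trailing period,
# as in the list used in Example #10 below.
abbrev_list = ["dr", "mr", "mrs", "prof", "e.g", "i.e", "etc"]
with open("abbrev_list.pkl", "wb") as fp:
    pickle.dump(abbrev_list, fp)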
Example #5
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
text = "There are more options under Mobile Networks in 2.3.4 that should do what you're asking, so you will see it in RC2. Thanks for the submission!"
tokenizer = PunktSentenceTokenizer()
tokenizer.train(text)
print(tokenizer.tokenize(text))


Example #6
from nltk.tokenize.punkt import PunktSentenceTokenizer


def tokenize_corpus(corpus):
    """Train a Punkt model on the corpus and return it split into sentences, one per line."""
    tokenizer = PunktSentenceTokenizer()
    tokenizer.train(corpus)
    sentences = tokenizer.tokenize(corpus)
    return '\n'.join(sentences)
Example #7
from nltk.tokenize.punkt import PunktSentenceTokenizer


def sentence_segmentation(document):
    """Split a document into a list of sentences."""
    sentence_tokenizer = PunktSentenceTokenizer()
    sentence_tokenizer.train(document)
    # The abbreviations learned during training are available via
    # sentence_tokenizer._params.abbrev_types.
    segmented_sentence = sentence_tokenizer.tokenize(document)
    return segmented_sentence
import settings
import pickle
import nlp_util as nlp
from nltk.tokenize.punkt import PunktSentenceTokenizer

with open(settings.PATH_DATAOBJECTS + 'gs_articles.pickle', 'rb') as f:
    articles = pickle.load(f)

tokenizer = PunktSentenceTokenizer()

# Derive Punkt parameters from the cleaned text of every article
for article in articles:
    tokenizer.train(nlp.get_clean_article(article.to_string()))

# pickle.dump(articles, open(settings.PATH_DATAOBJECTS + 'gs_tokenizer.pickle', 'wb'))

# Split each cleaned article into sentences
X = []
for article in articles:
    X.append(tokenizer.tokenize(nlp.get_clean_article(article.to_string())))
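A sketch of an alternative to the training loop above, using PunktTrainer from nltk.tokenize.punkt to accumulate statistics across all articles before building the tokenizer (same articles and nlp objects as above):

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

trainer = PunktTrainer()
for article in articles:
    # Defer the final statistics pass until every article has been seen.
    trainer.train(nlp.get_clean_article(article.to_string()), finalize=False)
trainer.finalize_training()

# Build the sentence tokenizer from the accumulated parameters.
tokenizer = PunktSentenceTokenizer(trainer.get_params())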
from nltk.tokenize.punkt import PunktSentenceTokenizer


def performSentenceSegmentation(file_content):
    # Train the Punkt model on the given text (unsupervised learning),
    # then use it to split the text into sentences.
    tokenizer = PunktSentenceTokenizer()
    tokenizer.train(file_content)
    sentence_segmentation = tokenizer.tokenize(file_content)
    return sentence_segmentation
Example #10
import codecs
import pickle
import string

import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer


def train(filename, stem=True):
    """
    Given a file to use as unsupervised data, train a TfidfVectorizer and a
    punkt sentence tokenizer and output both as pickles in the data directory.
    """
    text = codecs.open(filename, "rb", "utf8").read()

    abbreviations = [
            "u.s.a", "fig", "gov", "sen", "jus", "jdg", "rep", "pres",
            "mr", "mrs", "ms", "h.r", "s.", "h.b", "s.b", "u.k", "u.n",
            "u.s.s.r", "u.s",
    ]

    print("TRAINING SENTENCE TOKENIZER...")
    pst = PunktSentenceTokenizer()
    pst.train(text.replace("\n\n", " "))
    # add extra abbreviations
    pst._params.abbrev_types.update(abbreviations)    
    print("TRAINED ABBREVIATIONS: \n{}".format(pst._params.abbrev_types))
    
    # lemmatization of the training text (enabled by the stem flag)
    if stem:
        wnl = WordNetLemmatizer()
        print("WORD TOKENIZING TEXT")
        tokens = nltk.word_tokenize(text)
        
        # pos tagging
        print("POS TAGGING TEXT...")
        tagged_tokens = pos_tag(tokens)

        print("STEMMING TRAINING TEXT...")
        for i, tok in enumerate(tagged_tokens):
            position = None
            if tok[1] in ("NN", "NNS", "NNPS"):
                position = wordnet.NOUN
            elif "JJ" in tok[1]:
                position = wordnet.ADJ
            elif "VB" in tok[1]:
                position = wordnet.VERB
            elif "RB" in tok[1]:
                position = wordnet.ADV

            if position:
                tokens[i] = wnl.lemmatize(tok[0], position)

            if i % 1000000 == 0:
                print("TOKEN: {}".format(i))

        text = "".join([("" if tok in string.punctuation else " ")+tok 
                for tok in tokens])
        text = text.strip() 
    
    print("TRAINING VECTORIZER...")
    tfv = TfidfVectorizer()
    tfv.fit(pst.tokenize(text))

    # export trained tokenizer + vectorizer
    print("EXPORTING TRAINED TOKENIZER + VECTORIZER...")
    if stem:
        punkt_out_filename = "data/punkt_stem.pk"
        tfidf_out_filename = "data/tfidf_stem.pk"
    else:
        punkt_out_filename = "data/punkt.pk"
        tfidf_out_filename = "data/tfidf.pk"

    with open(punkt_out_filename, "wb") as pst_out:
        pickle.dump(pst, pst_out)
    with open(tfidf_out_filename, "wb") as tfv_out:
        pickle.dump(tfv, tfv_out)

    print("EXPORTING COMPLETED")
    return
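A rough sketch of how the exported pickles might be consumed afterwards; the paths match the non-stemmed filenames above, and the sample sentence is invented:

import pickle

with open("data/punkt.pk", "rb") as f:
    pst = pickle.load(f)
with open("data/tfidf.pk", "rb") as f:
    tfv = pickle.load(f)

sentences = pst.tokenize("Sen. Smith introduced the bill. It passed in the fall.")
vectors = tfv.transform(sentences)  # one tf-idf row per sentence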