Example #1
    def train_sentence_tokenizer(self: object, text: str):
        """
        Train a sentence tokenizer.
        """
        language_punkt_vars = PunktLanguageVars

        # Set punctuation
        if self.punctuation:
            if self.strict:
                language_punkt_vars.sent_end_chars = (self.punctuation +
                                                      self.strict_punctuation)
            else:
                language_punkt_vars.sent_end_chars = self.punctuation

        # Train the Punkt model on the supplied text
        trainer = PunktTrainer(text, lang_vars=language_punkt_vars())
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True

        tokenizer = PunktSentenceTokenizer(trainer.get_params())

        if self.abbreviations:
            for abbreviation in self.abbreviations:
                tokenizer._params.abbrev_types.add(abbreviation)

        return tokenizer
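A minimal usage sketch for the method above, assuming `nlp` is an instance of the surrounding class with `punctuation`, `strict`, and `abbreviations` already configured; the instance name and sample strings are placeholders, not part of the original example.

# Hypothetical usage of train_sentence_tokenizer(); `nlp` and the texts are invented.
training_text = "Dr. Watson arrived at 10 a.m. He met Mr. Holmes at the station."
tokenizer = nlp.train_sentence_tokenizer(training_text)
print(tokenizer.tokenize("Mrs. Hudson knocked. Dr. Watson answered the door."))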
Example #2
 def constructor():
     trainer = PunktTrainer()
     trainer.INCLUDE_ALL_COLLOCS = True
     trainer.INCLUDE_ABBREV_COLLOCS = True
     trainer.train_tokens(self.words())
     params = trainer.get_params()
     return PunktSentenceTokenizer(params)
Example #3
def train(src, tgt):
    with open(src, 'r', encoding='utf-8') as infile, \
            open(tgt, 'wb') as sent_tokenizer:
        contents = infile.read()
        language_punkt_vars = PunktLanguageVars
        # language_punkt_vars.sent_end_chars=tuple(args.end_chars)
        print("# Training sent tokenizer")
        trainer = PunktTrainer(contents, lang_vars=language_punkt_vars())
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True
        params = trainer.get_params()
        tokenizer = PunktSentenceTokenizer(params)
        # Register domain-specific abbreviations so they do not end sentences
        tokenizer._params.abbrev_types.update({
            'brgy', 'sen', 'supt', 'rep', 'dr', 'col', 'sec',
            'mt', 'asst', 'mr', 'c/insp', 'sta', 'sto',
        })
        pickle.dump(tokenizer, sent_tokenizer)
Example #4
 def constructor():
     trainer = PunktTrainer()
     trainer.INCLUDE_ALL_COLLOCS = True
     trainer.INCLUDE_ABBREV_COLLOCS = True
     trainer.train_tokens(self.words())
     params = trainer.get_params()
     return PunktSentenceTokenizer(params)
Example #5
def rank_sentences(text, sentence_scores, title="", n=7):

    final_sentences = []

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())

    # Boost sentences that resemble the title, if a title was given
    if title:
        for s in sentence_scores:
            sentence_scores[s] *= (1 + similarity_score(title, s))

    sc = sentence_scores.copy()
    sc = OrderedDict(sorted(sc.items(), key=lambda t: t[1], reverse=True))
    ordered_sents = dict(islice(sc.items(), n))

    proper_sentences = sent_tokenizer.tokenize(text)

    for s in proper_sentences:
        if s.lower() in ordered_sents:
            final_sentences.append(s)

    return final_sentences
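The `similarity_score` helper used above is not shown in this example; a plausible stand-in is a simple Jaccard similarity over lowercased word sets (an assumption, not the original implementation).

def similarity_score(title, sentence):
    # Hypothetical helper: Jaccard overlap between the title's and the
    # sentence's word sets; the project's real implementation may differ.
    title_words = set(title.lower().split())
    sentence_words = set(sentence.lower().split())
    if not title_words or not sentence_words:
        return 0.0
    return len(title_words & sentence_words) / len(title_words | sentence_words)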
Example #6
    def train_sentence_tokenizer(self: object, text: str):
        """
        Train a sentence tokenizer.
        """
        language_punkt_vars = PunktLanguageVars

        # Set punctuation
        if self.punctuation:
            if self.strict:
                language_punkt_vars.sent_end_chars = self.punctuation + self.strict_punctuation
            else:
                language_punkt_vars.sent_end_chars = self.punctuation

        # Train the Punkt model on the supplied text
        trainer = PunktTrainer(text, lang_vars=language_punkt_vars())
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True

        tokenizer = PunktSentenceTokenizer(trainer.get_params())

        if self.abbreviations:
            for abbreviation in self.abbreviations:
                tokenizer._params.abbrev_types.add(abbreviation)

        return tokenizer
Example #7
def get_tokenizer(training_text):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(training_text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    tokenizer._params.abbrev_types.update(ABBREVIATIONS)

    return tokenizer
Example #8
    def trainSentenceTokenizer(self):
        text = ""
        for file_id in gutenberg.fileids():
            text += gutenberg.raw(file_id)

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train(text)
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        tokenizer._params.abbrev_types.add('dr')
        tokenizer._params.abbrev_types.add('fig')
        return tokenizer
Example #9
def get_sentence_tokenizer(language):
    """
    Return the sentence tokenizer callable.
    """

    pickle_path = 'sentence_tokenizer.pickle'

    try:
        input_file = open(pickle_path, 'rb')
        sentence_tokenizer = load(input_file)
        input_file.close()
    except FileNotFoundError:

        data_file_paths = []

        sentences = []

        try:
            # Get the paths to each file the bot will be trained with
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=language.ENGLISH_NAME.lower()
            ))
        except LookupError:
            # Fall back to English sentence splitting rules if a language is not supported
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=languages.ENG.ENGLISH_NAME.lower()
            ))

        data_file_paths.extend(corpus_files)

        for corpus, _categories, _file_path in load_corpus(*data_file_paths):
            for conversation in corpus:
                for text in conversation:
                    sentences.append(text.upper())
                    sentences.append(text.lower())

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train('\n'.join(sentences))

        sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())

        # Pickle the sentence tokenizer for future use
        output_file = open(pickle_path, 'wb')
        dump(sentence_tokenizer, output_file, -1)
        output_file.close()

    return sentence_tokenizer
Example #10
def trainSentenceTokenizer():
    """
    Trains a custom sentence tokenizer using Punkt.
    At the moment it performs worse than the plain English one (most likely
    due to limited training data).
    """
    collection = database["crawled-data"]

    text = ""
    for record in collection.find({ABSTRACT_DOCUMENT: {"$ne": None}}):
        text += record[ABSTRACT_DOCUMENT] + " "

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(text)

    model = nltk.PunktSentenceTokenizer(trainer.get_params())
    with open("latvianPunkt2.pickle", mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
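A short sketch of how the pickled model written by this function might be loaded and used later; the file name comes from the example, while the sample text is invented.

import pickle

# Load the tokenizer trained by trainSentenceTokenizer() and split a sample text.
with open("latvianPunkt2.pickle", mode="rb") as fin:
    tokenizer = pickle.load(fin)
print(tokenizer.tokenize("First sentence of an abstract. A second sentence follows."))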
Example #11
def score_sentences(text, word_scores, unique):

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_score = {}

    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())
    sentences = sent_tokenizer.tokenize(text.lower())

    for s in sentences:
        words = clean_text(s)
        sent_score[s] = 0

        for w in words:
            w = lemmatizer.lemmatize(w)
            if w in unique:
                sent_score[s] += word_scores[w]

    return sent_score
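The `clean_text` function and `lemmatizer` object are defined elsewhere in that project; a minimal sketch of what they might look like, using NLTK's WordNetLemmatizer and a simple regex word filter (assumptions, not the original code).

import re
from nltk.stem import WordNetLemmatizer

# Hypothetical stand-ins for the module-level helpers used in score_sentences().
lemmatizer = WordNetLemmatizer()

def clean_text(sentence):
    # Keep lowercase alphabetic tokens only; the original cleaning may differ.
    return re.findall(r"[a-z]+", sentence.lower())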
Example #12
    def get_tokenizer(self, xml, abbrevWordList, spentSplitList):
        #class BulletPointLangVars(PunktLanguageVars):
            #sent_end_chars = ('?', '!')
            #for i in range(len(spentSplitList)):
            #    sent_end_chars = sent_end_chars + tuple(spentSplitList[i])

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        train_data = 'sss'
        trainer.train(train_data)
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        #tokenizer = PunktSentenceTokenizer(trainer.get_params(), lang_vars = BulletPointLangVars())

        # Add sentence-splitting exceptions (abbreviation list)
        rule['ABBREV_WORDS'].extend(abbrevWordList)

        for i in rule['ABBREV_WORDS']:
            tokenizer._params.abbrev_types.add(i)
        return tokenizer
Example #13
    def get_V(self, topics_file_name, other_file):
        if other_file == True:
            path = topics_file_name
        else:
            path = 'OpinosisDataset1.0_0/topics/{}'.format(topics_file_name)
        with open(path, encoding="utf8", errors='ignore') as topic_file:
            text = topic_file.read()

        # get the X_train_counts and X_train_tf
        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train(text)
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        X = tokenizer.tokenize(text)
        bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                            token_pattern=r'\b\w+\b',
                                            min_df=1)
        X_train_counts = bigram_vectorizer.fit_transform(X)

        tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
        X_train_tf = tf_transformer.transform(X_train_counts)
        return X_train_counts, X_train_tf, tokenizer, bigram_vectorizer
Example #14
def create_sentences(text_file, min_sentence_len):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True

    with open(text_file, "r") as input_file:
        paragraphs = input_file.read()

    trainer.train(paragraphs)

    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # print(tokenizer._params.abbrev_types)

    sentences = []

    for line in open(text_file, "r+").readlines():
        sentences_tmp = tokenizer.tokenize(line)
        for sentence in sentences_tmp:
            sentences.append(sentence)

    with open("dataset/sentences.txt", "a") as out_file:
        for sentence in sentences:
            if len(sentence) > min_sentence_len:
                out_file.write(sentence + "\n\n")
Example #15
'''Does the same thing as split_sent.py, but expects the file to be uncompressed and the columns to be src_url, tgt_url, src_line, tgt_line, adq_score, dom.'''
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
import nltk.data, sys, gzip

train = False
if train:
    with gzip.open("en_corp", 'rt', encoding='utf-8') as encorp, gzip.open(
            "de_corp", 'rt', encoding='utf-8') as decorp:
        text_en = encorp.read()
        text_de = decorp.read()

    trainer_en = PunktTrainer()
    trainer_en.INCLUDE_ALL_COLLOCS = True
    trainer_en.train(text_en)

    trainer_de = PunktTrainer()
    trainer_de.INCLUDE_ALL_COLLOCS = True
    trainer_de.train(text_de)

    tokenizer_en = PunktSentenceTokenizer(trainer_en.get_params())
    tokenizer_de = PunktSentenceTokenizer(trainer_de.get_params())
else:
    #tokenizer_en=PunktSentenceTokenizer()
    #tokenizer_de=PunktSentenceTokenizer()
    #nltk.download('punkt')
    tokenizer_en = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')

mismatch = 0
with open(sys.argv[1]) as filtered:
    for line in filtered:
Example #16
from bs4 import BeautifulSoup
from nltk.corpus import gutenberg

text = ""

for file_id in gutenberg.fileids():
    text += gutenberg.raw(file_id)
print(len(text))
soup = BeautifulSoup(open("D:\\YK Python\\xmltodict\\LUMNLRB3.BL23899175.xml").read(), 'html.parser')
from pprint import pprint

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

trainer = PunktTrainer()

trainer.INCLUDE_ALL_COLLOCS = True

trainer.train(text)

tokenizer = PunktSentenceTokenizer(trainer.get_params())

sentences = soup.get_text(' ')

sentence_list = tokenizer.tokenize(sentences)

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')

db = client['nlp']

coll = db['Keywords_list']
"""allows the disabling of the abbreviation penalty heuristic, which
exponentially disadvantages words that are found at times without a
final period."""

trainer.ABBREV_BACKOFF = 5
"""upper cut-off for Mikheev's(2002) abbreviation detection algorithm"""

trainer.COLLOCATION = 7.88
"""minimal log-likelihood value that two tokens need to be considered
as a collocation"""

trainer.SENT_STARTER = 30
"""minimal log-likelihood value that a token requires to be considered
as a frequent sentence starter"""

trainer.INCLUDE_ALL_COLLOCS = False
"""this includes as potential collocations all word pairs where the first
word ends in a period. It may be useful in corpora where there is a lot
of variation that makes abbreviations like Mr difficult to identify."""

trainer.INCLUDE_ABBREV_COLLOCS = False
"""this includes as potential collocations all word pairs where the first
word is an abbreviation. Such collocations override the orthographic
heuristic, but not the sentence starter heuristic. This is overridden by
INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
and ordinals are considered."""
""""""

trainer.MIN_COLLOC_FREQ = 1
"""this sets a minimum bound on the number of times a bigram needs to
appear before it can be considered a collocation, in addition to log
likelihood statistics."""
Example #18
import fileinput
import pickle
import sys
from os.path import basename
from nltk.tokenize.punkt import PunktTrainer

__author__ = 'Florian Leitner'
__version__ = '1.0'

if len(sys.argv) == 2 and sys.argv[1] in ('-h', '--help'):
    print('usage: {} < TEXT > MODEL'.format(basename(sys.argv[0])))
    sys.exit(1)

trainer = PunktTrainer()
# configuration
trainer.ABBREV = 0.3  # cut-off value whether a ‘token’ is an abbreviation
trainer.ABBREV_CUTOFF = 5  # upper cut-off for Mikheev’s (2002) abbreviation detection algorithm
trainer.COLLOCATION = 7.88  # minimal log-likelihood value that two tokens need to be considered as a collocation
trainer.IGNORE_ABBREV_PENALTY = False  # disables the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period
trainer.INCLUDE_ABBREV_COLLOCS = True  # include as potential collocations all word pairs where the first word is an abbreviation - such collocations override the orthographic heuristic, but not the sentence starter heuristic
trainer.INCLUDE_ALL_COLLOCS = False  # this includes as potential collocations all word pairs where the first word ends in a period - it may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify
trainer.MIN_COLLOC_FREQ = 3  # minimum bound on the number of times a bigram needs to appear before it can be considered a collocation - useful when INCLUDE_*_COLLOCS are used
trainer.SENT_STARTER = 30  # minimal log-likelihood value that a token requires to be considered as a frequent sentence starter

for line in fileinput.input():
    trainer.train(line)
    #print(line)

#trainer.freq_threshold()
trainer.finalize_training()
params = trainer.get_params()
pickle.dump(params, sys.stdout.buffer)
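Because this script pickles the PunktParameters rather than a ready tokenizer, a consumer has to rebuild the tokenizer from them; a small sketch of that step, with an invented model file name.

import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer

# Load the parameters produced by the training script and wrap them in a tokenizer.
with open("punkt.model", "rb") as model_file:  # "punkt.model" is a placeholder path
    params = pickle.load(model_file)
tokenizer = PunktSentenceTokenizer(params)
print(tokenizer.tokenize("Prof. Doe reported the result. It was confirmed later."))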
Example #19
"""allows the disabling of the abbreviation penalty heuristic, which
exponentially disadvantages words that are found at times without a
final period."""

trainer.ABBREV_BACKOFF = 5
"""upper cut-off for Mikheev's(2002) abbreviation detection algorithm"""

trainer.COLLOCATION = 7.88
"""minimal log-likelihood value that two tokens need to be considered
as a collocation"""

trainer.SENT_STARTER = 30
"""minimal log-likelihood value that a token requires to be considered
as a frequent sentence starter"""

trainer.INCLUDE_ALL_COLLOCS = False
"""this includes as potential collocations all word pairs where the first
word ends in a period. It may be useful in corpora where there is a lot
of variation that makes abbreviations like Mr difficult to identify."""

trainer.INCLUDE_ABBREV_COLLOCS = False
"""this includes as potential collocations all word pairs where the first
word is an abbreviation. Such collocations override the orthographic
heuristic, but not the sentence starter heuristic. This is overridden by
INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
and ordinals are considered."""
""""""

trainer.MIN_COLLOC_FREQ = 1
"""this sets a minimum bound on the number of times a bigram needs to
appear before it can be considered a collocation, in addition to log
likelihood statistics."""