import re
from string import punctuation as punc  # assumed source of `punc`

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer


def summarize_pdf(article_text):
    # Train a Punkt model on the article itself, then split it into sentences.
    trainer = PunktTrainer()
    trainer.train(article_text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    sentence_list = tokenizer.tokenize(article_text)

    sentence_lists = []  # lowercased sentences, whitespace-joined
    sent_list = []       # per-sentence lists of cleaned, lemmatized words
    clean_sent = []      # cleaned, lemmatized sentences as strings

    word_tokenizer = TreebankWordTokenizer()
    lemmatizer = WordNetLemmatizer()

    for sent in sentence_list:
        words = [w.lower() for w in word_tokenizer.tokenize(sent)]
        sentence_lists.append(" ".join(words))

        clean_words = []
        for word, tag in pos_tag(words):
            # Map the Penn Treebank tag to a WordNet POS for the lemmatizer.
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            elif tag.startswith('RB'):
                pos = 'r'
            else:
                pos = 'a'
            w = lemmatizer.lemmatize(word, pos)
            # Keep tokens that are not punctuation and not purely digits.
            if w not in punc and re.search(r"[^\d]", w):
                clean_words.append(w)
        clean_sent.append(' '.join(clean_words))
        sent_list.append(clean_words)

    return sent_list, clean_sent, sentence_lists, sentence_list
Example #2
File: utils.py Project: cltk/cltk
    def train_sentence_tokenizer(self: object, text: str):
        """
        Train sentence tokenizer.
        """
        language_punkt_vars = PunktLanguageVars

        # Set punctuation
        if self.punctuation:
            if self.strict:
                language_punkt_vars.sent_end_chars = self.punctuation + self.strict_punctuation
            else:
                language_punkt_vars.sent_end_chars = self.punctuation

        # Set abbreviations
        # Pass lang_vars by keyword; PunktTrainer's second positional
        # parameter is `verbose`, not `lang_vars`.
        trainer = PunktTrainer(text, lang_vars=language_punkt_vars())
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True

        tokenizer = PunktSentenceTokenizer(trainer.get_params())

        if self.abbreviations:
            for abbreviation in self.abbreviations:
                tokenizer._params.abbrev_types.add(abbreviation)

        return tokenizer
def train(src, tgt):
    with open(src, 'r', encoding='utf-8') as infile, \
            open(tgt, 'wb') as sent_tokenizer:
        contents = infile.read()
        language_punkt_vars = PunktLanguageVars
        # language_punkt_vars.sent_end_chars=tuple(args.end_chars)
        print("# Training sent tokenizer")
        # Pass lang_vars by keyword (the second positional parameter is `verbose`).
        trainer = PunktTrainer(contents, lang_vars=language_punkt_vars())
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True
        params = trainer.get_params()
        tokenizer = PunktSentenceTokenizer(params)
        # Abbreviations the model should never treat as sentence ends.
        tokenizer._params.abbrev_types.update({
            'brgy', 'sen', 'supt', 'rep', 'dr', 'col', 'sec', 'mt',
            'asst', 'mr', 'c/insp', 'sta', 'sto',
        })
        pickle.dump(tokenizer, sent_tokenizer)
Example #5
class Punket_tokenizer:
    def __init__(self):
        self.modelfile = 'punket_tokenizer.pk'

        if os.path.exists(self.modelfile):
            self.tokenizer = self.punkt_tokenize_load()

        else:
            self.trainer = PunktTrainer()
            text = ""
            for file_id in gutenberg.fileids():
                text += gutenberg.raw(file_id)
            self.trainer.INCLUDE_ALL_COLLOCS = True
            self.trainer.train(text)
            self.tokenizer = PunktSentenceTokenizer(self.trainer.get_params())

            # Honorifics and common abbreviations Punkt tends to miss.
            self.tokenizer._params.abbrev_types.update(
                {'dr', 'mr', 'mrs', 'miss', 'ms', 'no'})

            # Month abbreviations.
            self.tokenizer._params.abbrev_types.update(
                {'jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'aug', 'sep', 'oct', 'nov', 'dec'})

            with open(self.modelfile, mode='wb') as fout:
                pickle.dump(self.tokenizer,
                            fout,
                            protocol=pickle.HIGHEST_PROTOCOL)

    def punkt_tokenize_load(self):
        with open(self.modelfile, mode='rb') as fin:
            punket_tokenizer = pickle.load(fin)

        return punket_tokenizer

    def puket_tokenizer_add_rule(self, word):
        self.tokenizer._params.abbrev_types.add(word)

    def punket_sentence_tokenizer(self, sentences):
        return self.tokenizer.tokenize(sentences)
def score(trainer: PunktTrainer, typ: str) -> float:
    # Count how many periods & nonperiods are in the
    # candidate.
    num_periods = typ.count('.') + 1
    num_nonperiods = len(typ) - num_periods + 1

    # Let <a> be the candidate without the period, and <b>
    # be the period.  Find a log likelihood ratio that
    # indicates whether <ab> occurs as a single unit (high
    # value of ll), or as two independent units <a> and
    # <b> (low value of ll).
    count_with_period = trainer._type_fdist[typ + '.']
    count_without_period = trainer._type_fdist[typ]
    ll = trainer._dunning_log_likelihood(
        count_with_period + count_without_period,
        trainer._num_period_toks,
        count_with_period,
        trainer._type_fdist.N(),
    )

    # Apply three scaling factors to 'tweak' the basic log
    # likelihood ratio:
    #   F_length: long word -> less likely to be an abbrev
    #   F_periods: more periods -> more likely to be an abbrev
    #   F_penalty: penalize occurrences w/o a period
    f_length = math.exp(-num_nonperiods)
    f_periods = num_periods
    f_penalty = int(trainer.IGNORE_ABBREV_PENALTY) or math.pow(
        num_nonperiods, -count_without_period)
    score = ll * f_length * f_periods * f_penalty
    return score
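A minimal usage sketch for score: it reads the trainer's internal frequency counts, so it should be called on a PunktTrainer that has already seen text (the training string and candidate types below are purely illustrative):

# Illustrative only: train briefly, then score candidate abbreviation types
# (types are lowercase and given without the trailing period).
trainer = PunktTrainer()
trainer.train("Dr. Smith et al. wrote this report. See fig. 3 for details.",
              finalize=False)
for candidate in ("dr", "fig", "report"):
    print(candidate, score(trainer, candidate))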
Example #7
def train_punkt(ctx, input, output, abbr, colloc):
    """Train Punkt sentence splitter using sentences in input."""
    click.echo('chemdataextractor.tokenize.train_punkt')
    import pickle
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
    punkt = PunktTrainer()
    # Set these to true to include collocations more leniently, then increase MIN_COLLOC_FREQ to restrict again
    # punkt.INCLUDE_ALL_COLLOCS = False
    # punkt.INCLUDE_ABBREV_COLLOCS = False
    # punkt.MIN_COLLOC_FREQ = 1
    # Don't train on titles. They may contain abbreviations, but basically never have actual sentence boundaries.
    for fin in input:
        click.echo('Training on %s' % fin.name)
        sentences = fin.read()  #.replace('.\n', '. \n\n')
        punkt.train(sentences, finalize=False, verbose=True)
    punkt.finalize_training(verbose=True)
    if abbr:
        abbreviations = abbr.read().strip().split('\n')
        click.echo('Manually adding abbreviations: %s' % abbreviations)
        punkt._params.abbrev_types.update(abbreviations)
    if colloc:
        collocations = [
            tuple(l.split('. ', 1)) for l in colloc.read().strip().split('\n')
        ]
        click.echo('Manually adding collocs: %s' % collocations)
        punkt._params.collocations.update(collocations)
    model = PunktSentenceTokenizer(punkt.get_params())
    pickle.dump(model, output, protocol=pickle.HIGHEST_PROTOCOL)
Example #8
File: tiger.py Project: ooz/Confopy
    def constructor():
        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True
        trainer.train_tokens(self.words())
        params = trainer.get_params()
        return PunktSentenceTokenizer(params)
Example #9
def get_nltk_sent_tokenizer(container, lang):

    assert lang in ["zh", "en"], "Unknown language."

    trainer = PunktTrainer()
    if isinstance(container, Container):
        article_paths = container.get_all_article_paths(
            root_dir="../processed_data/crawler/nejm/articles/", ext=lang)
    elif isinstance(container, list):
        print("{} Articles.".format(len(container)))
        article_paths = container
    else:
        raise ValueError("Cannot parse container with class {}".\
         format(container.__class__))

    missing_count = 0
    for path in article_paths:
        try:
            article = get_article_as_lowercase_string(path)
            trainer.train(text=article, finalize=False)
        except FileNotFoundError:
            print("{} not found.".format(path))
            missing_count += 1
    print("{} articles not found.".format(missing_count))

    trainer.finalize_training()
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    return tokenizer
def train_from_file(training_file):
    """Make a ruleset from a file."""
    language_punkt_vars = PunktLanguageVars
    language_punkt_vars.sent_end_chars = ('.', '?', ':')
    language_punkt_vars.internal_punctuation = (',', ';')
    with open(training_file) as opened_training_file:
        train_data = opened_training_file.read()
    trainer = PunktTrainer(train_data, lang_vars=language_punkt_vars())
    with open('latin.pickle', 'wb') as open_pickle_file:
        pickle.dump(trainer, open_pickle_file)
    def __init__(self, vocab_size=None):
        self.vocab_size = vocab_size

        # Tokenization tools and other private attributes.
        self._sent_tokenizer = PunktSentenceTokenizer()
        self._sent_trainer = PunktTrainer()
        self._word_tokenizer = WordTokenizer()
        self._index_to_dfreq = None
        self._is_finalized = False

        # Number of times a given word has been seen across entire corpus.
        self.word_to_freq = OrderedCounter()
        # Number of docs that contained word w.
        self.word_to_dfreq = OrderedCounter()
        # Number of documents trained on so far.
        self.num_docs = 0

        # Dicts that will be filled when fitting documents.
        # word_index: w => i (index into vocabulary)
        # index_docs: i => doc_counts (doc_freq for word with index i).
        self.word_to_index = OrderedDict()
Example #13
def rank_sentences(text, sentence_scores, title="", n=7):

    final_sentences = []

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())

    # Boost sentences similar to the title, if a title was given.
    if title != "":
        for s in sentence_scores:
            sentence_scores[s] *= (1 + similarity_score(title, s))

    sc = sentence_scores.copy()
    sc = OrderedDict(sorted(sc.items(), key=lambda t: t[1], reverse=True))
    ordered_sents = dict(islice(sc.items(), n))

    proper_sentences = sent_tokenizer.tokenize(text)

    for s in proper_sentences:
        if s.lower() in ordered_sents:
            final_sentences.append(s)

    return final_sentences
Example #14
def main():
    opts, args = getopt.getopt(sys.argv[1:], 'l:', [])

    lang = None
    for o, a in opts:
        if o == '-l':
            lang = a

    if lang is None:
        print("Must pass -l language on the command line!", file=sys.stderr)
        sys.exit(1)
    if lang == 'en':
        print("Don't train for -l en!  We are using the pre-trained punkt tokenizer from NLTK.", file=sys.stderr)
        sys.exit(1)

    lang_vars = MyPunktLanguageVars()
    trainer = PunktTrainer(lang_vars=lang_vars)
    train(trainer, lang)
    trainer.finalize_training(verbose=True)

    tokenizer = PunktSentenceTokenizer(trainer.get_params(), lang_vars=lang_vars)
    with open('LingwoNLP/punkt-' + lang + '.pickle', 'wb') as out:
        pickle.dump(tokenizer, out)
Example #15
def get_tokenizer(training_text):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(training_text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    tokenizer._params.abbrev_types.update(ABBREVIATIONS)

    return tokenizer
Example #17
def train_latin_from_file():
    """Open a training set file and write a Latin pickle trainer"""
    training_file = 'training_sentences.txt'
    training_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_latin/', training_file)
    with open(training_path, 'r') as f:
        train_data = f.read()
    language_vars = PunktLanguageVars
    language_vars.sent_end_chars = ('.', '?', ';', ':')
    language_vars.internal_punctuation = ','
    trainer = PunktTrainer(train_data, lang_vars=language_vars())
    pickle_name = 'latin.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_latin/', pickle_name)
    with open(pickle_path, 'wb') as f:
        pickle.dump(trainer, f)
Example #18
def train_greek_from_file():
    language_punkt_vars = PunktLanguageVars
    language_punkt_vars.sent_end_chars = ('.', ';',)
    language_punkt_vars.internal_punctuation = (',', '·')
    training_file = 'training_sentences.txt'
    training_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_greek/', training_file)
    with open(training_path) as f:
        train_data = f.read()
    # Build trainer; pass lang_vars by keyword.
    trainer = PunktTrainer(train_data, lang_vars=language_punkt_vars())
    pickle_name = 'greek.pickle'
    pickle_path = os.path.join(cltk_data, 'compiled', 'sentence_tokens_greek/', pickle_name)
    with open(pickle_path, 'wb') as f:
        pickle.dump(trainer, f)
def train_tokenizer(trainfile, abbreviationfile, modelfile):
    k = 0
    skipped_ = 0
    custom_ = 0

    punkt = PunktTrainer()
    input_ = codecs.open(trainfile, encoding='utf-8')

    for sentence in input_:
        k += 1
        if k % 100 == 0:
            print('trained from sentences :' + str(k))
        try:
            punkt.train(sentence, finalize=False, verbose=False)
        except:
            skipped_ += 1

    input_.close()

    if abbreviationfile != '':
        abbreviations_ = codecs.open(abbreviationfile, encoding='utf-8')
        for abbr in abbreviations_:
            try:
                punkt.train('Start ' + abbr + '. End.', finalize=False, verbose=False)
                custom_ += 1
            except:
                pass
        abbreviations_.close()

    punkt.finalize_training(verbose=False)

    model = PunktSentenceTokenizer(punkt.get_params())
    model_output = codecs.open(modelfile, mode='wb')
    pickle.dump(model, model_output, protocol=pickle.HIGHEST_PROTOCOL)
    model_output.close()

    print('')
    print(str(skipped_) + ' sentences skipped')
    print(str(custom_) + ' custom abbreviations added')
    def trainSentenceTokenizer(self):
        text = ""
        for file_id in gutenberg.fileids():
            text += gutenberg.raw(file_id)

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train(text)
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        tokenizer._params.abbrev_types.add('dr')
        tokenizer._params.abbrev_types.add('fig')
        return tokenizer
Example #21
def train_punktsent(trainfile, modelfile):
    """ 
  Trains an unsupervised NLTK punkt SENTENCE tokenizer. 
  *trainfile* is the filename for the input file. s
  *modelfile* is the filename for the model output file.
  """
    punkt = PunktTrainer()
    try:
        with codecs.open(trainfile, 'r', 'utf8') as fin:
            punkt.train(fin.read(), finalize=False, verbose=False)
    except KeyboardInterrupt:
        print 'KeyboardInterrupt: Stopping the reading of the dump early!'
    ##HACK: Adds abbreviations from rb_tokenizer.
    abbrv_sent = " ".join([i.strip() for i in \
                           codecs.open('abbrev.lex','r','utf8').readlines()])
    abbrv_sent = "Start" + abbrv_sent + "End."
    punkt.train(abbrv_sent, finalize=False, verbose=False)
    # Finalize and outputs trained model.
    punkt.finalize_training(verbose=True)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
    return model
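A hedged usage sketch for train_punktsent; the file names are illustrative, and the function additionally expects an abbrev.lex file in the working directory:

# Illustrative call: train on a UTF-8 text dump, write the pickled model,
# and use the returned tokenizer directly.
model = train_punktsent('wiki_dump.txt', 'punkt_model.pickle')
print(model.tokenize("Mr. Smith arrived at 9 a.m. He left at noon."))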
Example #22
def get_sentence_tokenizer(language):
    """
    Return the sentence tokenizer callable.
    """

    pickle_path = 'sentence_tokenizer.pickle'

    try:
        input_file = open(pickle_path, 'rb')
        sentence_tokenizer = load(input_file)
        input_file.close()
    except FileNotFoundError:

        data_file_paths = []

        sentences = []

        try:
            # Get the paths to each file the bot will be trained with
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=language.ENGLISH_NAME.lower()
            ))
        except LookupError:
            # Fall back to English sentence splitting rules if a language is not supported
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=languages.ENG.ENGLISH_NAME.lower()
            ))

        data_file_paths.extend(corpus_files)

        for corpus, _categories, _file_path in load_corpus(*data_file_paths):
            for conversation in corpus:
                for text in conversation:
                    sentences.append(text.upper())
                    sentences.append(text.lower())

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train('\n'.join(sentences))

        sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())

        # Pickle the sentence tokenizer for future use
        output_file = open(pickle_path, 'wb')
        dump(sentence_tokenizer, output_file, -1)
        output_file.close()

    return sentence_tokenizer
def trainSentenceTokenizer():
    """
    Trains a custom sentence tokenizer using Punkt.
    At the moment it performs worse than the plain English one (most likely
    because there is not that much data).
    """
    collection = database["crawled-data"]

    text = ""
    for record in collection.find({ABSTRACT_DOCUMENT: {"$ne": None}}):
        text += record[ABSTRACT_DOCUMENT] + " "

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(text)

    model = nltk.PunktSentenceTokenizer(trainer.get_params())
    with open("latvianPunkt2.pickle", mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
Example #25
    def get_tokenizer(self, xml, abbrevWordList, spentSplitList):
        #class BulletPointLangVars(PunktLanguageVars):
            #sent_end_chars = ('?', '!')
            #for i in range(len(spentSplitList)):
            #    sent_end_chars = sent_end_chars + tuple(spentSplitList[i])

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        train_data = 'sss'
        trainer.train(train_data)
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        #tokenizer = PunktSentenceTokenizer(trainer.get_params(), lang_vars = BulletPointLangVars())

        # Add sentence-splitting exceptions.
        rule['ABBREV_WORDS'].extend(abbrevWordList)

        for i in rule['ABBREV_WORDS']:
            tokenizer._params.abbrev_types.add(i)
        # Note: do not rebuild the tokenizer here; a fresh
        # PunktSentenceTokenizer(trainer.get_params()) would discard the
        # abbreviation types added above.
        return tokenizer
Example #26
def score_sentences(text, word_scores, unique):

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_score = {}

    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())
    sentences = sent_tokenizer.tokenize(text.lower())

    for s in sentences:
        words = clean_text(s)
        sent_score[s] = 0

        for w in words:
            w = lemmatizer.lemmatize(w)
            if w in unique:
                sent_score[s] += word_scores[w]

    return sent_score
def build_sentence_model(text, extra_abbrevs=None):
    """
    Build a sentence model from text with optional
    extra abbreviations to include.
    :param text:
    :param extra_abbrevs:
    :return:
    """

    # Setup Punkt trainer
    punkt_trainer = PunktTrainer()
    punkt_trainer.train(text, verbose=False, finalize=False)
    punkt_trainer.finalize_training(verbose=False)

    # Extract parameters from trainer
    punkt_params = punkt_trainer.get_params()

    # Add any extras if passed
    if extra_abbrevs is not None:
        for abbrev in extra_abbrevs:
            punkt_params.abbrev_types.add(abbrev.strip(".").lower())

    # Return model instantiated with new parameters
    return PunktSentenceTokenizer(punkt_params)
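A minimal usage sketch for build_sentence_model; the sample text and abbreviation list are illustrative:

# Illustrative usage: extra abbreviations keep "fig." and "eq." from being
# treated as sentence boundaries.
sample = "See fig. 2 and eq. 3 for details. The fitted model works well."
sentence_tokenizer = build_sentence_model(sample, extra_abbrevs=["fig.", "eq."])
print(sentence_tokenizer.tokenize(sample))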
    def get_V(self, topics_file_name, other_file):
        if other_file == True:
            path = topics_file_name
        else:
            path = 'OpinosisDataset1.0_0/topics/{}'.format(topics_file_name)
        with open(path, encoding="utf8", errors='ignore') as f:
            text = f.read()

        # get the X_train_counts and X_train_tf
        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train(text)
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        X = tokenizer.tokenize(text)
        bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                            token_pattern=r'\b\w+\b',
                                            min_df=1)
        X_train_counts = bigram_vectorizer.fit_transform(X)

        tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
        X_train_tf = tf_transformer.transform(X_train_counts)
        return X_train_counts, X_train_tf, tokenizer, bigram_vectorizer
Example #29
def create_sentences(text_file, min_sentence_len):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True

    with open(text_file, "r") as input_file:
        paragraphs = input_file.read()

    trainer.train(paragraphs)

    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # print(tokenizer._params.abbrev_types)

    sentences = []

    # Re-read the file line by line and close it when done.
    with open(text_file, "r") as input_file:
        for line in input_file:
            sentences.extend(tokenizer.tokenize(line))

    with open("dataset/sentences.txt", "a") as out_file:
        for sentence in sentences:
            if len(sentence) > min_sentence_len:
                out_file.write(sentence + "\n\n")
Example #30
import fileinput
import pickle
import sys

from os.path import basename
from nltk.tokenize.punkt import PunktTrainer

__author__ = 'Florian Leitner'
__version__ = '1.0'

if len(sys.argv) == 2 and sys.argv[1] in ('-h', '--help'):
    print('usage: {} < TEXT > MODEL'.format(basename(sys.argv[0])))
    sys.exit(1)

trainer = PunktTrainer()
# configuration
trainer.ABBREV = 0.3  # cut-off value whether a ‘token’ is an abbreviation
trainer.ABBREV_CUTOFF = 5  # upper cut-off for Mikheev’s (2002) abbreviation detection algorithm
trainer.COLLOCATION = 7.88  # minimal log-likelihood value that two tokens need to be considered as a collocation
trainer.IGNORE_ABBREV_PENALTY = False  # disables the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period
trainer.INCLUDE_ABBREV_COLLOCS = True  # include as potential collocations all word pairs where the first word is an abbreviation - such collocations override the orthographic heuristic, but not the sentence starter heuristic
trainer.INCLUDE_ALL_COLLOCS = False  # this includes as potential collocations all word pairs where the first word ends in a period - it may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify
trainer.MIN_COLLOC_FREQ = 3  # minimum bound on the number of times a bigram needs to appear before it can be considered a collocation - useful when INCLUDE_*_COLLOCS are used
trainer.SENT_STARTER = 30  # minimal log-likelihood value that a token requires to be considered as a frequent sentence starter

for line in fileinput.input():
    trainer.train(line)
    #print(line)

#trainer.freq_threshold()
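The excerpt above stops after the training loop; the usage line implies the pickled model goes to stdout (`> MODEL`). A minimal sketch of that final step, offered as an assumption rather than the original script's actual ending:

from nltk.tokenize.punkt import PunktSentenceTokenizer

# Assumed ending (not part of the excerpt above): finalize the statistics,
# build a tokenizer from the learned parameters, and pickle it to stdout so
# the `> MODEL` redirection captures it.
trainer.finalize_training(verbose=True)
tokenizer = PunktSentenceTokenizer(trainer.get_params())
pickle.dump(tokenizer, sys.stdout.buffer, protocol=pickle.HIGHEST_PROTOCOL)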
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
import pickle
from pymongo import MongoClient
from progressbar import ProgressBar


client = MongoClient()
db = client.legislation
bills = db.bills

trainer = PunktTrainer()

# # set custom parameters
extra_collocations = {(u'sec', u'##number##')}
extra_sentence_starters = {u'(##number##)'}
# extra_abbreviations = {u'U.S.C', u'usc'}

trainer.ABBREV = 0.3
"""cut-off value whether a 'token' is an abbreviation"""

trainer.IGNORE_ABBREV_PENALTY = False
"""allows the disabling of the abbreviation penalty heuristic, which
exponentially disadvantages words that are found at times without a
final period."""

trainer.ABBREV_BACKOFF = 5
"""upper cut-off for Mikheev's(2002) abbreviation detection algorithm"""

trainer.COLLOCATION = 7.88
"""minimal log-likelihood value that two tokens need to be considered
as a collocation"""
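The snippet is cut off before training happens or the manual collocations and sentence starters are applied; a hedged sketch of how they could be folded in, following the params-update pattern from Example #7 (the 'text' field name and the output path are assumptions):

# Assumed continuation, not part of the original snippet.
for bill in bills.find():
    trainer.train(bill['text'], finalize=False)  # 'text' field name assumed
trainer.finalize_training(verbose=True)

params = trainer.get_params()
params.collocations.update(extra_collocations)
params.sent_starters.update(extra_sentence_starters)

tokenizer = PunktSentenceTokenizer(params)
with open('legislation_punkt.pickle', 'wb') as fout:  # path assumed
    pickle.dump(tokenizer, fout, protocol=pickle.HIGHEST_PROTOCOL)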
# coding: utf-8
import codecs
from sys import argv, exit

from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

if len(argv) != 3:
    print "Usage: %s <TRAINING_CORPUS> <SENTENCES_TO_SPLIT>" % __file__
    exit(1)

training = ''.join(codecs.open(argv[1], 'rb', 'utf-8').readlines())
trainer = PunktTrainer()
trainer.train(training, verbose=True)
tokenizer = PunktSentenceTokenizer(trainer.get_params(), verbose=True)
text = ''.join(codecs.open(argv[2], 'rb', 'utf-8').readlines())
sentences = tokenizer.tokenize(text)
codecs.open('split', 'wb', 'utf-8').writelines([s + '\n' for s in sentences])
Example #33
from pprint import pprint
import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

# print(dir(gutenberg))
# print(gutenberg.fileids())

text = ""
# for file_id in gutenberg.fileids():
#     text += gutenberg.raw(file_id)

with open('eminescu.txt', 'r') as file:
    text = file.read()

with open('hogas.txt', 'r', encoding='utf8') as file:
    text += file.read()

with open('bucuresti.txt', 'r', encoding='utf8') as file:
    text += file.read()

with open('pesteri.txt', 'r', encoding='utf8') as file:
    text += file.read()

# print(len(text))

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)

with open('model.txt', 'wb') as file:
    pickle.dump(trainer, file)
Example #34
from bs4 import BeautifulSoup
text = ""
from nltk.corpus import gutenberg

for file_id in gutenberg.fileids():
    text += gutenberg.raw(file_id)
print(len(text))
soup = BeautifulSoup(open("D:\\YK Python\\xmltodict\\LUMNLRB3.BL23899175.xml").read(), 'html.parser')
from pprint import pprint

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

trainer = PunktTrainer()

trainer.INCLUDE_ALL_COLLOCS = True

trainer.train(text)

tokenizer = PunktSentenceTokenizer(trainer.get_params())

sentences = soup.get_text(' ')

sentence_list = tokenizer.tokenize(sentences)

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')

db = client['nlp']

coll = db['Keywords_list']
from nltk.tokenize.punkt import PunktTrainer
import pickle

PUNCTUATION = (
    ';',
    '.',
    '!',
    '?',
)
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True

with open('./corpus.txt', 'r') as fs:
    text = fs.read()

trainer.train(text, verbose=True)
params = trainer.get_params()
with open('./egs/punkt_tokenize/vi.pkl', 'wb') as fs:
    pickle.dump(params, fs)


if __name__ == "__main__":
    MODE = sys.argv[1]

    if MODE == "train":
        n = int(sys.argv[2])

        print("reading...")
        text = get_text(n)

        print("training...")
        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.ABBREV = 0.3
        trainer.train(text, verbose=True)
        del text

        print("building tokenizer...")
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        abbrevs = tokenizer._params.abbrev_types
        print(sorted(abbrevs))
        print("%i abbreviations" % len(abbrevs))

        target_abbrevs = [
            "i.e", "e.g", "prof", "dr", "m.sc", "no", "nos", "mr", "mrs", "ms",
            "seq", "o.r.s"
        ]
Example #37
'''Does the same thing as split_sent.py, but expects the file to be uncompressed and that the columns are src_url, tgt_url, src_line, tgt_line, adq_score, dom'''
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
import nltk.data, sys, gzip

train = False
if train:
    with gzip.open("en_corp", 'rt', encoding='utf-8') as encorp, gzip.open(
            "de_corp", 'rt', encoding='utf-8') as decorp:
        text_en = encorp.read()
        text_de = decorp.read()

    trainer_en = PunktTrainer()
    trainer_en.INCLUDE_ALL_COLLOCS = True
    trainer_en.train(text_en)

    trainer_de = PunktTrainer()
    trainer_de.INCLUDE_ALL_COLLOCS = True
    trainer_de.train(text_de)

    tokenizer_en = PunktSentenceTokenizer(trainer_en.get_params())
    tokenizer_de = PunktSentenceTokenizer(trainer_de.get_params())
else:
    #tokenizer_en=PunktSentenceTokenizer()
    #tokenizer_de=PunktSentenceTokenizer()
    #nltk.download('punkt')
    tokenizer_en = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')

mismatch = 0
with open(sys.argv[1]) as filtered:
    for line in filtered:
                outer += 1
                trainer.train("\n".join(lines), finalize=False)
                count = 0
                lines = []
            count += 1

            if preprocess:
                line = preprocess(line)

            lines.append(line)


# open model
vars = PunktLanguageVars
vars.sent_end_chars = (u".", u"?", u"!", u")", u"\"", u"'", u":", u"|", u"»", u"]")
trainer = PunktTrainer(lang_vars=vars())

train_with_file(gull_fn, trainer)

for fn in glob(os.path.join(avis_path, '*.s')):
    train_with_file(fn, trainer, preprocess=lambda x: avis_pat.match(x.strip()).group(1))

params = trainer.get_params()
punkt = PunktSentenceTokenizer(params)
with open('punkt-norwegian-open.pickle', 'wb') as out:
    cPickle.dump(punkt, out)

# full model
vars = PunktLanguageVars
vars.sent_end_chars = (u".", u"?", u"!", u")", u"\"", u"'", u":", u"|", u"»", u"]")
trainer = PunktTrainer(lang_vars=vars())