Example #1
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import PunktWordTokenizer


class TextCleaner(object):
	"""
	Takes in an iterable / sequence of multi-sentence text.

	Returns cleaned text as requested.

	Author Note: 
	The goal is to have standardized text cleaning utilities that I can use for 
	any text application with multi-language support.
	
	"""

	def __init__(self, language='english'):
		self.language = language
		self.tokenizer = nltk.data.load('tokenizers/punkt/' + language + '.pickle')
		self.punkt_word_tokenizer = PunktWordTokenizer()
	
	def sentence_tokenize(self, text):
		self.sentences = self.tokenizer.tokenize(text)
		return self.sentences




	def remove_stop(self, sentence):
		# Tokenize the sentence, then drop stopwords for the configured language.
		words = self.punkt_word_tokenizer.tokenize(sentence)
		stop_words = set(stopwords.words(self.language))
		return [word for word in words if word.lower() not in stop_words]
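
# Hedged usage sketch (assumes the NLTK 'punkt' and 'stopwords' corpora have been downloaded,
# e.g. via nltk.download('punkt') and nltk.download('stopwords')):
cleaner = TextCleaner(language='english')
sentences = cleaner.sentence_tokenize("This is the first sentence. Here is another one.")
print(cleaner.remove_stop(sentences[0]))  # e.g. ['first', 'sentence.']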
Example #2
 def __init__(self, lang, vocab_dir, corpus_dir, window_size, output_dir):
   self._lang = 0
   self._vocab_dir = vocab_dir
   self._corpus_dir = corpus_dir
   self._window_size = window_size
   self._output_dir = output_dir
   self._stemmer = Snowball()
   self._sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
   self._word_tokenizer = PunktWordTokenizer()
   self._cooccur = defaultdict()
   self._wordcount = defaultdict()
   self._vocab = set()
   self._doc_num = 0
 def __init__(self, word_tokenizer='Treebank', wtokenizer=None):
     if word_tokenizer not in ['Treebank', 'PunktWord', 'WordPunct', '']:
         msg = 'word_tokenizer "{word_tokenizer}" should be Treebank, PunktWord, WordPunct or empty'
         raise ValueError(msg.format(word_tokenizer=word_tokenizer))
     if word_tokenizer == 'Treebank':
         from nltk.tokenize import TreebankWordTokenizer
         self.tokenizer = TreebankWordTokenizer().tokenize
     elif word_tokenizer == 'PunktWord':
          # PunktWordTokenizer splits on punctuation, but keeps it with the word. => ['this', "'s", 'a', 'test']
         from nltk.tokenize import PunktWordTokenizer
         self.tokenizer = PunktWordTokenizer().tokenize
     elif word_tokenizer == 'WordPunct':
          # WordPunctTokenizer splits all punctuation into separate tokens. => ['This', "'", 's', 'a', 'test']
         from nltk.tokenize import WordPunctTokenizer
         self.tokenizer = WordPunctTokenizer().tokenize
     else:
         if wtokenizer is None:
             self.tokenizer = None
         else:
             if not callable(wtokenizer):
                 msg = 'wtokenizer should be callable'
                 warnings.warn(msg)
                 self.tokenizer = None
             else:
                 self.tokenizer = wtokenizer
Example #4
def get_sentiment(sentiment_db, txt):
    """
        Returns a tuple with the valence and arousal strength \
        based on the input text. Returns null in case it cannot \
        be computed.

            :param sentiment_db: the file ANEW
            :param txt: the sentence to be analysed.

    """
    words = PunktWordTokenizer().tokenize(txt)
    try:
        sentiments = [sentiment_db.get(STEMMER.stem(word), None) for word in words]
        sentiments = [s for s in sentiments if s is not None]
    except IndexError:
        sentiments = None
    if sentiments:
        valences = [s['valence'] for s in sentiments if s is not None]
        arousals = [s['arousal'] for s in sentiments if s is not None]
        valence = float(sum(valences))/len(valences)
        arousal = float(sum(arousals))/len(arousals)
        return valence, arousal
    else:
        return None
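
# Hedged usage sketch (not from the original project): sentiment_db is normally built from
# the ANEW lexicon, so a tiny hand-made stand-in is used here, and STEMMER is assumed to be
# an NLTK stemmer such as PorterStemmer.
from nltk.stem.porter import PorterStemmer
STEMMER = PorterStemmer()
toy_sentiment_db = {
    STEMMER.stem('happy'): {'valence': 8.0, 'arousal': 6.5},
    STEMMER.stem('sunshine'): {'valence': 6.0, 'arousal': 5.5},
}
print(get_sentiment(toy_sentiment_db, "happy sunshine"))  # expected: (7.0, 6.0)
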
class Cleaner:
    """better than a Polish maid"""
    def __init__(self, input_file, basedir):
        self.original_data = self.open_gold_data(input_file)
        self.basedir = basedir
        self.data_dict = {}
        self.tokenizer = PunktWordTokenizer()

    def open_and_parse_xml_file(self, file_name):
        with open(file_name, "r") as f_in:
            return parse_parser_xml_results(f_in.read())

    def update_cache(self, file_name):
        self.data_dict[file_name] = self.open_and_parse_xml_file(
            os.path.join(self.basedir, file_name + ".raw.xml"))

    def open_gold_data(self, gold_file):
        original_data = []
        with open(gold_file, "r") as f_in:
            for line in f_in:
                line = line.rstrip().split()
                if line == []:
                    continue
                if len(line) == 11:
                    line.extend(["", "", ""])
                else:
                    line.extend(["", ""])
                original_data.append(FeatureRow(*line))
        return original_data

    def get_correct_offset(self, token, sentence, offset_begin, offset_end):
        token_list = self.tokenizer.tokenize(" ".join(token.split("_")))
        if len(token_list) > offset_end - offset_begin:
            offset_end = len(token_list) + offset_begin

        if token_list == sentence[offset_begin:offset_end]:
            return (offset_begin, offset_end)
        while token_list != sentence[offset_begin:offset_end]:
            offset_begin += 1
            offset_end += 1
            if offset_end >= len(sentence):
                raise IndexError("{:d} invalid index, token={:s}".format(
                    offset_end, token))
        return (offset_begin, offset_end)

    def build_new_data(self):
        for fr in self.original_data:
            curr_article = fr.article
            curr_referent = (fr.token_ref, fr.sentence_ref,
                             fr.offset_begin_ref, fr.offset_end_ref)
            try:
                nlp_data = self.data_dict[curr_article]
            except KeyError:
                self.update_cache(curr_article)
                nlp_data = self.data_dict[curr_article]
            new_offsets = self.get_correct_offset(
                fr.token, nlp_data["sentences"][int(fr.sentence)]["text"],
                int(fr.offset_begin), int(fr.offset_end))
            if new_offsets != (fr.offset_begin, fr.offset_end):
                print fr.token, new_offsets
# Assumed imports for this standalone snippet: NLTK's sentence tokenizer and stopword list,
# plus the standard-library punctuation set.
from nltk.tokenize import PunktWordTokenizer, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation


def tokenize(t):
    tokenizer = PunktWordTokenizer()
    sentences = sent_tokenize(t)
    words = []
    refined_words = []
    for sentence in sentences:
        word = tokenizer.tokenize(sentence)
        for i in word:
            words.append(i.lower())

#Removal of stopwords and punctuations
#stopwords = open('stop-words-it-en.txt','r').read().split('\r\n')
    for word in words:
        if word not in stopwords.words('french') and word not in punctuation:
            refined_words.append(word)
    return refined_words
class Cleaner:
    """better than a Polish maid"""

    def __init__(self, input_file, basedir):
        self.original_data = self.open_gold_data(input_file)
        self.basedir = basedir
        self.data_dict = {}
        self.tokenizer = PunktWordTokenizer()


    def open_and_parse_xml_file(self, file_name):
        with open(file_name, "r") as f_in:
            return parse_parser_xml_results(f_in.read())

    def update_cache(self, file_name):
        self.data_dict[file_name] = self.open_and_parse_xml_file(os.path.join(self.basedir, file_name + ".raw.xml"))

    def open_gold_data(self, gold_file):
        original_data = []
        with open(gold_file, "r") as f_in:
            for line in f_in:
                line = line.rstrip().split()
                if line == []:
                    continue
                if len(line) == 11:
                    line.extend(["", "", ""])
                else:
                    line.extend(["", ""])
                original_data.append(FeatureRow(*line))
        return original_data

    def get_correct_offset(self, token, sentence, offset_begin, offset_end):
        token_list = self.tokenizer.tokenize(" ".join(token.split("_")))
        if len(token_list) > offset_end-offset_begin:
            offset_end = len(token_list) + offset_begin

        if token_list == sentence[offset_begin:offset_end]:
            return (offset_begin, offset_end)
        while token_list != sentence[offset_begin:offset_end]:
            offset_begin += 1
            offset_end += 1
            if offset_end >= len(sentence):
                raise IndexError("{:d} invalid index, token={:s}".format(offset_end, token))
        return (offset_begin, offset_end)


    def build_new_data(self):
        for fr in self.original_data:
            curr_article = fr.article
            curr_referent = (fr.token_ref, fr.sentence_ref, fr.offset_begin_ref, fr.offset_end_ref)
            try:
                nlp_data = self.data_dict[curr_article]
            except KeyError:
                self.update_cache(curr_article)
                nlp_data = self.data_dict[curr_article]
            new_offsets = self.get_correct_offset(fr.token, nlp_data["sentences"][int(fr.sentence)]["text"], int(fr.offset_begin), int(fr.offset_end))
            if new_offsets != (fr.offset_begin, fr.offset_end):
                print fr.token, new_offsets
Example #8
    def testPunktTokenizerContraction(self):
        tokenizer = IndexedTokenizer(PunktWordTokenizer())
        string = " You'll see a large white question mark."
        tokens = tokenizer.tokenize(string)
        self.assertEqual([t.text for t in tokens],
                         ['You', "'ll", 'see', 'a', 'large', 'white', 'question', 'mark', '.'])

        for i, token in enumerate(tokens):
            self.assertEqual(string[tokens[i].start:tokens[i].end], token.text)
Example #9
    def testPunktTokenizer(self):
        tokenizer = IndexedTokenizer(PunktWordTokenizer())
        string = " Facing the long wall in front of you, your destination will be the first door to your left (36-880)."
        tokens = tokenizer.tokenize(string)
        self.assertEqual([t.text for t in tokens],
                         ['Facing', 'the', 'long', 'wall', 'in', 'front', 'of', 'you', ',', 'your', 'destination', 'will', 'be', 'the', 'first', 'door', 'to', 'your', 'left', '(', '36-880', ')', '.'])

        for i, token in enumerate(tokens):
            self.assertEqual(string[tokens[i].start:tokens[i].end], token.text)
Example #10
    def testPunktTokenizerContraction(self):
        tokenizer = IndexedTokenizer(PunktWordTokenizer())
        string = " You'll see a large white question mark."
        indexes, tokens = tokenizer.tokenize(string)
        self.assertEqual(tokens, [
            'You', "'ll", 'see', 'a', 'large', 'white', 'question', 'mark', '.'
        ])

        for i, token in enumerate(tokens):
            self.assertEqual(string[indexes[i]:indexes[i] + len(token)], token)
Example #11
def tokenizeString(string, lower=True, tokenizer="wordpunct"):
    if tokenizer == "wordpunct":
        tokenized = WordPunctTokenizer().tokenize(string)
    elif tokenizer == "punktword":
        tokenized = PunktWordTokenizer().tokenize(string)
    else:
        raise ValueError('tokenizer should be "wordpunct" or "punktword"')
    if lower:
        tokenized = [w.lower() for w in tokenized]
    return tokenized
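
# Hedged usage sketch, relying on the tokenizer behaviour documented elsewhere on this page:
print(tokenizeString("Can't stop", tokenizer="punktword"))  # expected: ['can', "'t", 'stop']
print(tokenizeString("Can't stop", tokenizer="wordpunct"))  # expected: ['can', "'", 't', 'stop']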
Example #12
def count_word_ngrams(n, processed_string):
    """
    Counts all word ngrams in processed_string
    and creates a dictionary of those ngram counts
    called ngram_counts_dict.
    """
    pwt = PunktWordTokenizer()
    processed_string = pwt.tokenize(processed_string)
    ngram_counts_dict = defaultdict(int)

    # Slide an n-word window over the token list and count every ngram occurrence.
    for i in range(len(processed_string) - n + 1):
        ngram = ' '.join(processed_string[i:i + n])
        ngram_counts_dict[ngram] += 1

    return ngram_counts_dict
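
# Hedged usage sketch: with n=2 the call below should yield bigram counts along the lines of
# {'to be': 2, 'be or': 1, 'or not': 1, 'not to': 1}.
print(count_word_ngrams(2, "to be or not to be"))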
Example #13
    def testPunktTokenizerNiceView(self):
        tokenizer = IndexedTokenizer(PunktWordTokenizer())
        string = "you should have  a    nice   view ."
        indexes, tokens = tokenizer.tokenize(string)
        self.assertEqual(tokens,
                         ['you', "should", 'have', 'a', 'nice', 'view', '.'])
        self.assertEqual(indexes, [0, 4, 11, 17, 22, 29, 34])

        for i, token in enumerate(tokens):
            self.assertEqual(string[indexes[i]:indexes[i] + len(token)], token)
Example #14
    def testPunktTokenizerNiceView(self):
        tokenizer = IndexedTokenizer(PunktWordTokenizer())
        string = "you should have  a    nice   view ."
        tokens = tokenizer.tokenize(string)
        self.assertEqual([t.text for t in tokens],
                         ['you', "should", 'have', 'a', 'nice', 'view', '.'])
        self.assertEqual([t.start for t in tokens],
                         [0,      4,       11,     17,   22,     29,     34])

        for i, token in enumerate(tokens):
            self.assertEqual(string[tokens[i].start:tokens[i].end], token.text)
Example #15
def tokenize_text_into_words(text, word_tokenizer='Treebank'):
    if word_tokenizer == 'Treebank':
        from nltk.tokenize import TreebankWordTokenizer
        tokenizer = TreebankWordTokenizer()
    elif word_tokenizer == 'PunktWord':
        # PunktWordTokenizer splits on punctuation, but keeps it with the word. => ['this', "'s", 'a', 'test']
        from nltk.tokenize import PunktWordTokenizer
        tokenizer = PunktWordTokenizer()
    elif word_tokenizer == 'WordPunct':
        # WordPunctTokenizer splits all punctuation into separate tokens. => ['This', "'", 's', 'a', 'test']
        from nltk.tokenize import WordPunctTokenizer
        tokenizer = WordPunctTokenizer()
    else:
        return -1
    return tokenizer.tokenize(text)
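
# Hedged comparison of the three supported tokenizer names on the same sentence:
print(tokenize_text_into_words("This's a test", 'Treebank'))   # e.g. ['This', "'s", 'a', 'test']
print(tokenize_text_into_words("This's a test", 'PunktWord'))  # e.g. ['This', "'s", 'a', 'test']
print(tokenize_text_into_words("This's a test", 'WordPunct'))  # e.g. ['This', "'", 's', 'a', 'test']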
Example #16
def orphanedWords(sessions, withStopwords=True):
    orphanCount = 0
    count = 0.0
    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    for session, instructionIdx, instruction, sdcs in sessions.sdcs():
        indexes, tokens = tokenizer.tokenize(instruction)
        for i, token in enumerate(tokens):
            idx = indexes[i]
            word = extractWord(token)

            if not(word is None):
                range = TextStandoff(instruction, (idx, idx + len(word)))
                if not containedInAny(sdcs, range):
                    if withStopwords or not word.lower() in stopwords:
                        orphanCount += 1
            count += 1

    orphanFraction = orphanCount/count
    print orphanCount, "orphans in", count,
    print "words. (%.2f%%)" % (orphanFraction * 100)
    def _create_word_tokenizer(name):
        """
        Here you can add supported word tokenizers.

        Note that it must implement the span_tokenize method.
        """
        if name == 'WordPunctTokenizer':
            from nltk.tokenize import WordPunctTokenizer
            return WordPunctTokenizer()

        elif name == 'PunktWordTokenizer':
            from nltk.tokenize import PunktWordTokenizer
            return PunktWordTokenizer()

        elif name.startswith('RegexTokenizer'):
            # name is a Python expression for constructing a RegexTokenizer,
            # eg. "RegexTokenizer(r'\w+|[^\W\S]+')\n".
            # Strip off the class name and parse the argument.
            arg = ast.literal_eval(name[len('RegexTokenizer'):])
            return RegexTokenizer(arg)

        else:
            raise ValueError('Unknown word tokenizer: {}'.format(name))
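
# Standalone, hedged illustration of the span_tokenize requirement mentioned in the
# docstring above: span_tokenize yields (start, end) character offsets into the input.
from nltk.tokenize import WordPunctTokenizer
print(list(WordPunctTokenizer().span_tokenize("Can't stop")))
# expected: [(0, 3), (3, 4), (4, 5), (6, 10)]
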
text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."

# Sentence tokenization
from nltk.tokenize import sent_tokenize

sent_tokenize_list = sent_tokenize(text)
print "\nSentence tokenizer:"
print sent_tokenize_list

# Create a new word tokenizer
from nltk.tokenize import word_tokenize

print "\nWord tokenizer:"
print word_tokenize(text)

# Create a new punkt word tokenizer
from nltk.tokenize import PunktWordTokenizer

punkt_word_tokenizer = PunktWordTokenizer()
print "\nPunkt word tokenizer:"
print punkt_word_tokenizer.tokenize(text)

# Create a new WordPunct tokenizer
from nltk.tokenize import WordPunctTokenizer

word_punct_tokenizer = WordPunctTokenizer()
print "\nWord punct tokenizer:"
print word_punct_tokenizer.tokenize(text)

from __future__ import unicode_literals
from nltk.tokenize import PunktWordTokenizer as WordTokenizer
import random
import pprint
import scipy.sparse
import time
import itertools
import sys
import pickle
import helper
import constants


tokenizer = WordTokenizer()
int2tags = constants.int2tags
tags2int = constants.tags2int
int2citationFeilds = ['Authors', 'Date', 'Title', 'Source']
generic = ["city", "centre", "county", "street", "road", "and", "in", "town", "village"]


def filterArticles(articles):
    relevant_articles = {}
    correct = [0] * (len(int2tags) -1 )
    gold_num = [0] * (len(int2tags)-1)
    filtered_correct = [0] * (len(int2tags) -1 )
    filtered_gold_num = [0] * (len(int2tags)-1)
    helper.load_constants()
    print "Num incidents", len(incidents)
    print "Num unfilitered articles", len(articles)
    for incident_id in incidents.keys():
        incident = incidents[incident_id]
def my_token(s):
    my_tokenizer = PunktWordTokenizer()
    return my_tokenizer.tokenize(s)
Example #21
def compare_runs(model, runfile1, runfile2):

    ofile1 = cPickle.load(open(runfile1, 'r'))
    ofile2 = cPickle.load(open(runfile2, 'r'))
    print "cls", ofile1.__class__
    diffCount = 0
    leniantDiffCount = 0
    attempt = 0

    subject_to_num_o1_o2_flips = {}
    subject_to_num_o2_o1_flips = {}

    route_to_num_o1_o2_flips = {}
    route_to_num_o2_o1_flips = {}

    o1_o2 = []
    o2_o1 = []

    total_past_flip = 0.0
    total_past = 0.0

    total_through_flip = 0.0
    total_through = 0.0
    total_through_no_flip = 0.0
    total_flips = 0.0

    total_length = 0.0
    total_length_flip = 0.0

    total_to = 0.0
    total_to_flip = 0.0

    tokenizer = IndexedTokenizer(PunktWordTokenizer())
    for i, (corr1,
            corr2) in enumerate(zip(ofile1['correct'], ofile2['correct'])):

        route = ofile1["regions"][i]
        assert route == ofile2["regions"][i]
        subject = ofile1["subjects"][i]
        assert subject == ofile2["subjects"][i]
        subject_to_num_o1_o2_flips.setdefault(subject, 0)
        subject_to_num_o2_o1_flips.setdefault(subject, 0)
        route_to_num_o1_o2_flips.setdefault(route, 0)
        route_to_num_o2_o1_flips.setdefault(route, 0)

        sentence = ofile1["sentences"][i]
        assert sentence == ofile2["sentences"][i]

        indexes, tokens = tokenizer.tokenize(sentence)

        num_through = len(
            [x for x in tokens if x.lower() in ("through", "thru")])

        num_past = len([x for x in tokens if x.lower() in ("past", "pass")])

        num_to = len([x for x in tokens if x.lower() in ("to", "into")])
        total_past += num_past
        total_through += num_through
        total_to += num_to
        total_length += len(tokens)

        if subject == "Subject 06":
            total_through_flip += num_through
            total_past_flip += num_past
            total_to_flip += num_to
            total_length_flip += len(tokens)
            total_flips += 1

        if corr1 != corr2:
            print
            print "*******************************************************"

            print "subject", subject
            print "region", route
            print "difference", i, corr1, corr2
            correctnessFlip = any(corr1) != any(corr2)
            if correctnessFlip:
                print "***correctness flip!!!"
                if any(corr1):
                    subject_to_num_o2_o1_flips[subject] += 1
                    route_to_num_o2_o1_flips[route] += 1
                    o2_o1.append((i, sentence))
                else:

                    assert any(corr2)
                    subject_to_num_o1_o2_flips[subject] += 1
                    route_to_num_o1_o2_flips[route] += 1
                    o1_o2.append((i, sentence))

            else:
                print "No correctness flip."
                total_through_no_flip += num_through
            assert ofile1["sentences"][i] == ofile2["sentences"][i]

            print sentence
            sloc = ofile1["start_regions"][i]
            sloc = (sloc[0][0], sloc[1][0])
            eloc = ofile1["end_regions"][i]
            eloc = (eloc[0][0], eloc[1][0])
            print "sloc", sloc
            print "eloc", eloc

            iElocTopo = model.loc_to_idx(eloc)
            iSlocTopo = model.loc_to_idx(sloc)
            print "ieloc", iElocTopo
            print corr1
            print corr2
            slocs = [path[0] for path in ofile1["path"][i] if path != None]
            print "islocs", [model.vpts_for_topo(iSlocTopo)]
            print "ielocs", [model.vpts_for_topo(iElocTopo)]
            #if any(corr2) and not any(corr1): # one is right, and 2 is wrong
            if any(corr1) != any(corr2):  # any differences
                for o, (p1, p2) in enumerate(
                        zip(ofile1["path"][i], ofile2["path"][i])):
                    if corr1[o] != corr2[o]:
                        if p1 != None or p2 != None:
                            if attempt == 2 and False:

                                print "correct", corr1[o], corr2[o]
                                sloc1 = model.vpt_to_num[p1[0]]
                                sloc2 = model.vpt_to_num[p2[0]]
                                print "sloc1", sloc1
                                print "sloc2", sloc2
                                assert sloc1 == sloc2
                                #gui1.runSentence(sentence, sloc1)
                                print "execing"
                                #import basewindow
                                #app = basewindow.makeApp()
                                #retval = app.exec_()

                            attempt += 1

                leniantDiffCount += 1
            diffCount += 1
    print "differences", diffCount

    print "flips o1->o2"
    print "\n\n".join([str(x) for x in o1_o2])
    print "***************************"
    print "flips o2->o1"
    print "\n\n".join([str(x) for x in o2_o1])

    print "o1->o2", route_to_num_o1_o2_flips
    print "o2->o1", route_to_num_o2_o1_flips

    print "average number of through", total_through / len(ofile1['correct'])
    print "average number of through, subject 6", total_through_flip / total_flips
    print
    print "average number of past", total_past / len(ofile1['correct'])
    print "average number of past, subject 6", total_past_flip / total_flips
    print
    print "average number of to", total_to / len(ofile1['correct'])
    print "average number of to, subject 6", total_to_flip / total_flips
    print
    print "average length", total_length / len(ofile1['correct'])
    print "average length, subject 6", total_length_flip / total_flips
 def __init__(self, input_file, basedir):
     self.original_data = self.open_gold_data(input_file)
     self.basedir = basedir
     self.data_dict = {}
     self.tokenizer = PunktWordTokenizer()
Example #23
class corpusParser():


  def __init__(self, lang, vocab_dir, corpus_dir, window_size, output_dir):
    self._lang = 0
    self._vocab_dir = vocab_dir
    self._corpus_dir = corpus_dir
    self._window_size = window_size
    self._output_dir = output_dir
    self._stemmer = Snowball()
    self._sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    self._word_tokenizer = PunktWordTokenizer()
    self._cooccur = defaultdict()
    self._wordcount = defaultdict()
    self._vocab = set()
    self._doc_num = 0
    #self._vocab_word_index = defaultdict()
    #self._vocab_index_word = defaultdict()


  def loadVocab(self):
    vocabfile = open(self._vocab_dir, 'r')
    vocab_word_index = defaultdict()
    vocab_index_word = []
    #index = -1
    for line in vocabfile:
      line = line.strip()
      words = line.split('\t')
      word = words[1]
      self._vocab.add(word)
      #index += 1
      #self._vocab_word_index[word] = index
      #self._vocab_index_word.append(word)

    vocabfile.close()

    # Initialize wordcount and cooccur
    for word in self._vocab:
      self._wordcount[word] = 0
      self._cooccur[word] = defaultdict()
      for word2 in self._vocab:
        if word2 > word:
          self._cooccur[word][word2] = 0
    

  def parseDoc(self, doc_raw):
    tokens = []
    for sent in self._sent_tokenizer.tokenize(doc_raw):
      for token in self._word_tokenizer.tokenize(sent):
        tokens.append(self._stemmer(self._lang, token))

    tokens_len = len(tokens)
    for index1 in range(0, tokens_len):
      w1 = tokens[index1]
      if w1 in self._vocab:
        self._wordcount[w1] += 1

        if self._window_size == -1:
          index_end = tokens_len
        else:
          index_end = min(tokens_len, index1 + self._window_size)

        for index2 in range(index1 + 1, index_end):
          w2 = tokens[index2]
          if w2 in self._vocab:
            if w1 < w2:
              self._cooccur[w1][w2] += 1
            elif w1 > w2:
              self._cooccur[w2][w1] += 1


  def parseCorpus20news(self):

    print "Loading vocab"
    self.loadVocab()
    
    doc_count = 0

    print "Parsing corpus"
    data_folders = [self._corpus_dir + "/train", self._corpus_dir + "/test"]
    print data_folders
    for data_folder in data_folders:
      for folder in glob("%s/*^tgz" % data_folder):
        for ff in glob("%s/*" % folder):
          doc_count += 1
          infile = open(ff, 'r')
          doc_raw = ""
          for line in infile:
            line = line.strip().lower()
            doc_raw += " " + line
          self.parseDoc(doc_raw)
          infile.close()
          if doc_count % 1000 == 0:
            print "Finish parsing", doc_count, "documents!"

    self._doc_num = doc_count
    print "Total number of docunments: ", doc_count
    print "writing results!"
    self.writeResult()


  def parseCorpusNyt(self):

    print "Loading vocab"
    self.loadVocab()
    
    doc_count = 0

    print "Parsing corpus"

    years = ["1987", "1988", "1989", "1990", "1991", "1992", "1993", "1994", "1995", "1996"]
    print years

    for year in years:
      folder_year = self._corpus_dir + "/" + year
      for month in glob("%s/[0-9][0-9]" % folder_year):
        for day in glob("%s" % month):
          for ff in glob("%s/*" % day):
            doc_count += 1
            infile = open(ff, 'r')
            doc_raw = ""
            for line in infile:
              line = line.strip().lower()
              doc_raw += " " + line
            self.parseDoc(doc_raw)
            infile.close()
            if doc_count % 1000 == 0:
              print "Finish parsing", doc_count, "documents!"

    self._doc_num = doc_count
    print "Total number of docunments: ", doc_count
    print "writing results!"
    self.writeResult()


  def parseCorpusWiki(self):
    print "Loading vocab"
    self.loadVocab()

    print "Parsing corpus"
    doc_count = 0
    file_count = 0
    for folder in glob("%s/*" % self._corpus_dir):
      for ff in glob("%s/*" % folder):
        infile = open(ff, 'r')
        file_count += 1
        if file_count % 100 == 0:
          print "Finish parsing", file_count, "files or ", doc_count, "documents!"

        for line in infile:
          line = line.strip().lower()

          if line.startswith("<doc"):
            doc_count += 1
            doc_flag = True
            doc_raw = ""
          elif line.startswith("</doc>"):
            doc_flag = False
            ### processing doc
            self.parseDoc(doc_raw)
          else:
            assert doc_flag == True
            doc_raw += " " + line
        infile.close()

    self._doc_num = doc_count
    print "Total number of docunments: ", doc_count
    self.writeResult()


  def writeResult(self):
    # write wordcount
    outputfile = self._output_dir + "/wordcount.txt"
    outfile = open(outputfile, 'w')
    for word in self._wordcount.keys():
      tmp = word + "\t" + str(self._wordcount[word]) + "\n"
      outfile.write(tmp)
    outfile.close()

    # write coccurance:
    outputfile = self._output_dir + "/cooccurance.txt"
    outfile = open(outputfile, 'w')
    for w1 in self._cooccur.keys():
      for w2 in self._cooccur[w1].keys():
        if self._cooccur[w1][w2] != 0:
          tmp = w1 + "\t" + w2 + "\t" + str(self._cooccur[w1][w2]) + "\n"
          outfile.write(tmp)
    outfile.close()
Example #24
'''
Created on 06/05/2013

@author: Rodrigo
'''

from nltk.tokenize import word_tokenize, PunktWordTokenizer, RegexpTokenizer
print word_tokenize("Hello word!")

print word_tokenize("We can't do this")

tokenizer = PunktWordTokenizer()
print tokenizer.tokenize("We can't do this")

tokenizer = RegexpTokenizer("[\w']+")
print tokenizer.tokenize("We can't do this")

# Split instead of findall
tokenizer = RegexpTokenizer("\s+", gaps=True)
print tokenizer.tokenize("We can't do this")
Example #25
 def __tokenize(content):
     tokenizer = PunktWordTokenizer()
     return content.tokenize(tokenizer)
Example #26
len(word_tokens)        # Returns the number of words in the tokenized list of text
len(sentence_tokens)    # Returns the number of sentences in the tokenized list of text
word_unique = list(set(word_tokens))  # Eliminates duplicated words in the tokenized list

# Word tokenization details
# When tokenizing words, punctuation and contraction symbols receive special treatment:
nltk.word_tokenize('Hello World.')  # Returns ['Hello', 'World', '.']
nltk.word_tokenize("can't")         # Returns ['ca', "n't"]

# Word Tokenization alternatives

# PunktWordTokenizer
# Splits on punctuation, but keeps it with the word
from nltk.tokenize import PunktWordTokenizer        # Imports the tokenizer
tokenizer = PunktWordTokenizer()                    # Instantiates the tokenizer
tokenizer.tokenize("Can't is a contraction")        # Returns ['Can', "'t", 'is', 'a', 'contraction']

# WordPunctTokenizer
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Can't is a contraction")        # Returns ['Can', "'", 't', 'is', 'a', 'contraction']

# Tokenizing (sentences) in different languages (Spanish)
para = "Hola amigos. Gracias por ver este video. Saludos"       # Defines the text to tokenize
tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')   # Loads the spanish sentence tokenizer
print (tokenizer.tokenize(para))                                # Tokenizes the text

# Tokenize based on lines, spaces or tweets (special class)
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize
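
# Hedged mini-demo of the three tokenizers imported above (expected outputs shown as comments)
LineTokenizer().tokenize("first line\nsecond line")   # Returns ['first line', 'second line']
SpaceTokenizer().tokenize("splits only on spaces")    # Returns ['splits', 'only', 'on', 'spaces']
TweetTokenizer().tokenize("NLTK is #awesome :-)")     # Returns ['NLTK', 'is', '#awesome', ':-)']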
Example #27
# -*- coding: utf-8 -*-
from nltk.corpus import stopwords as _stopwords
from nltk.tokenize import PunktWordTokenizer
from curses.ascii import isascii
import unicodedata

language = "portuguese"
stopwords = [sw.decode('utf-8') for sw in _stopwords.words(language)]
punctuation = u'!(),-.:;?'
tkz = PunktWordTokenizer()

make_ascii = lambda text: \
    filter(isascii, unicodedata.normalize('NFD', text).encode('utf-8'))

def detokenize(words):
    text = "".join((" " if w not in punctuation else "") + w for w in words)
    return text

def make_slug(text):
    text = text.replace(u"/", u"")
    text = text.replace(u".", u"")
    words = [make_ascii(w.lower()) for w in tkz.tokenize(text) if (w not in stopwords) and (w not in punctuation)]
    return u"-".join(words)
Example #28
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from nltk.tokenize import PunktWordTokenizer

tokenizer = PunktWordTokenizer()
result = tokenizer.tokenize("Can't is a contraction.")
print(result)
#['Can', "'t", 'is', 'a', 'contraction.']
Example #29
#More on NLTK tokenizers
#English tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

#Spanish tokenizer
spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
texto = "Un buen reportaje puede ser tan fascinante e instructivo sobre el mundo real como un gran cuento o una magnífica novela. Si alguien lo pone en duda, le ruego que lea la crónica de Ioan Grillo Bring On the Wall que apareció en The New York Times el pasado 7 de mayo. Cuenta la historia del Flaco, un contrabandista mexicano que, desde que estaba en el colegio, a los 15 años, se ha pasado la vida contrabandeando drogas e inmigrantes ilegales a Estados Unidos. Aunque estuvo cinco años en la cárcel no se ha arrepentido del oficio que practica y menos ahora, cuando, dice, su ilícita profesión está más floreciente que nunca."
spanish_tokenizer.tokenize(texto)

#Different types of tokenizers (and how to call different methods on NLTK)
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("This's a test")

from nltk.tokenize import PunktWordTokenizer
punkt_word_tokenizer = PunktWordTokenizer()
punkt_word_tokenizer.tokenize("this's a test")

from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
word_punct_tokenizer.tokenize("this's a test")

#Stemming
#Using the porter algorithm
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem('maximum')
porter_stemmer.stem('presumably')
porter_stemmer.stem('multiply')
porter_stemmer.stem('provision')
porter_stemmer.stem('owed')
Example #30
from nltk.tokenize import PunktWordTokenizer
tokenizer = PunktWordTokenizer()
strExSentence = "Can't is a contraction."
lstWordPunkt = tokenizer.tokenize(strExSentence)
print(lstWordPunkt)

#OUTPUT is ['Can', "'t", 'is', 'a', 'contraction.']


from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
lstWordPunkt = tokenizer.tokenize(strExSentence)
print(lstWordPunkt)

#OUTPUT is  ['Can', "'", 't', 'is', 'a', 'contraction', '.']


def fnTest(strArgument):
    print(strArgument)


fnTest("mark")




    
Example #31
# Lookup for the stopword corpus
LANGUAGE_ID = {ENGLISH: "english", GERMAN: "german", CHINESE: "chinese", \
                   FRENCH: "french", SPANISH: "spanish", ARABIC: "arabic", \
                   DIXIE: "english"}

sent_tokenizer = {}
for ii in LANGUAGE_ID:
    try:
        sent_tokenizer[ii] = nltk.data.load('tokenizers/punkt/%s.pickle' % \
                                                LANGUAGE_ID[ii])
    except LookupError:
        print("Error loading sentence tokenizer for %s" % LANGUAGE_ID[ii])
        None

word_tokenizer = PunktWordTokenizer()


def write_proto(filename, proto):
    f = open(filename, "wb")
    f.write(proto.SerializeToString())
    f.close()


class DocumentReader:
    """
    Base class that represents a document
    """
    def __init__(self, raw, lang=ENGLISH):
        self.lang = lang
        self._raw = raw
 def __init__(self, input_file, basedir):
     self.original_data = self.open_gold_data(input_file)
     self.basedir = basedir
     self.data_dict = {}
     self.tokenizer = PunktWordTokenizer()
Example #33
# ngram freq calculator
# usage
#       python ngrams.py texto.txt ngram
#       python ngrams.py 4000_centros_investigacion.txt 2
import sys, nltk
from nltk.tokenize import PunktWordTokenizer

input_file = open(sys.argv[1])
s_ngrams = sys.argv[2]
input_ngrams = int(s_ngrams)
ngrams_in_text = []

for line in input_file:
    tokens = PunktWordTokenizer().tokenize(line)
    ngrams = nltk.ngrams(tokens, input_ngrams)
    ngrams_in_text += ngrams

# for line in input_file:
#     tokens = PunktWordTokenizer().tokenize(line)
#     bigrams  = nltk.bigrams(tokens)
#     bigrams_in_text += bigrams

frequency = nltk.FreqDist(ngrams_in_text)
for f in frequency:
    #print f
    ngram_str = ''
    for s in f:
        print s,
    print frequency[f]
Example #34
from __future__ import unicode_literals
from nltk.tokenize import PunktWordTokenizer as WordTokenizer
import random
import pprint
import scipy.sparse
import time
import itertools
import sys
import pickle
import helper
import constants


tokenizer = WordTokenizer()
int2tags = constants.int2slots
tags2int = constants.tags2int
int2citationFeilds = ['Authors', 'Date', 'Title', 'Source']
generic = ["city", "centre", "county", "street", "road", "and", "in", "town", "village"]


def filterArticles(articles):
    relevant_articles = {}
    correct = [0] * (len(int2tags) -1 )
    gold_num = [0] * (len(int2tags)-1)
    filtered_correct = [0] * (len(int2tags) -1 )
    filtered_gold_num = [0] * (len(int2tags)-1)
    helper.load_constants()
    print "Num incidents", len(incidents)
    print "Num unfilitered articles", len(articles)
    for incident_id in incidents.keys():
        incident = incidents[incident_id]
Example #35
text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."

# Sentence tokenization
from nltk.tokenize import sent_tokenize

sent_tokenize_list = sent_tokenize(text)
print("\nSentence tokenizer:")
print(sent_tokenize_list)

# Create a new word tokenizer
from nltk.tokenize import word_tokenize

print("\nWord tokenizer:")
print(word_tokenize(text))

# Create a new punkt word tokenizer
from nltk.tokenize import PunktWordTokenizer

punkt_word_tokenizer = PunktWordTokenizer()
print("\nPunkt word tokenizer:")
print(punkt_word_tokenizer.tokenize(text))

# Create a new WordPunct tokenizer
from nltk.tokenize import WordPunctTokenizer

word_punct_tokenizer = WordPunctTokenizer()
print("\nWord punct tokenizer:")
print(word_punct_tokenizer.tokenize(text))

Example #36
	def __init__(self, language='english'):

		self.tokenizer = nltk.data.load('tokenizers/punkt/' + language + '.pickle')
		self.punkt_word_tokenizer = PunktWordTokenizer()
words2 = word_tokenize("Hello World.")
words3 = penn_tokenizer.tokenize("Hello World.")
print words1
print words2
print words3

# <markdowncell>

# Or, since we have already broken para into sentences, we can create the word list by tokenizing each
#     sentence and creating a flatmap as shown here:

# <codecell>

from nltk.tokenize import PunktWordTokenizer
words3 = [word for sentence in sentences for word in word_tokenize(sentence)]
punkt_tokenizer = PunktWordTokenizer()
words4 = [word for sentence in sentences for word in punkt_tokenizer.tokenize(sentence)]
print words3 == words4
print words3
print words4

# <markdowncell>

# Notice that there are subtle differences in the output. The first example did not separate the '.' from the words World
#     and you, whereas the
#     second example did. Both accounted for the '.' after NLTK. I'm not sure why this is the case. Notice how both examples resulted in the splitting of
#     It's into two words. It seems that the TreebankWordTokenizer, for which word_tokenize is a wrapper, changes behavior
# when working on a whole paragraph versus individual sentences.
#
# But I don't want contractions split into separate words. Use a RegexpTokenizer as shown next:
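
# <codecell>

# A hedged sketch of the announced RegexpTokenizer step (mirroring the "[\w']+" example
# earlier on this page, and reusing the sentences list from above): the pattern keeps
# apostrophes inside tokens, so contractions such as "It's" stay whole.
from nltk.tokenize import RegexpTokenizer
regexp_tokenizer = RegexpTokenizer("[\w']+")
words5 = [word for sentence in sentences for word in regexp_tokenizer.tokenize(sentence)]
print(words5)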