Example #1
 def expandSet(kwd_set, root_elt):
     '''
     Expands a given set of keywords using the whole text and
     co-occurrence probabilities.
     @param kwd_set: Set<string>. The set of mentioned keywords
     @param root_elt: etree.Element. The root element of the document
     '''
     lines = [elt.text for elt in root_elt.findall(".//line")]
     stop_words = set(stopwords.words("english"))
     tokenizer = PunktWordTokenizer()
     all_pairs = []
     for line in lines:
         for kwd in kwd_set:
             if re.match(kwd, line):
                 tokens = filter(lambda x: x not in stop_words and
                                     x not in string.punctuation,
                                 tokenizer.tokenize(line))
                 for token in tokens:
                     all_pairs.append((kwd, token))
     top_pairs = [pair for pair, freq in Counter(all_pairs).iteritems()
                  if freq >= 2]
     for pair in top_pairs:
         if KeywordExpander.verbose and pair[1] not in kwd_set:
             print "Expanding kwd with : ", pair[1]
         kwd_set.add(pair[1]);
         
     return kwd_set
Example #2
def tokenize_title(text):
  text = re.sub('[.]','',text)
  text = PunktWordTokenizer().tokenize(text)
  for f in stopwords.words('english'):
    if f in text:
      text.remove(f)
  return text
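A minimal usage sketch for tokenize_title, assuming the example's imports (re, nltk.corpus.stopwords, PunktWordTokenizer) are in scope and an NLTK version that still ships PunktWordTokenizer; the title string is made up for illustration:

# Hypothetical driver for tokenize_title above.
title_tokens = tokenize_title("An Introduction to the Punkt Tokenizer in NLTK.")
print(title_tokens)
# Lower-case stopwords such as "to", "the" and "in" are dropped and periods are stripped;
# note the function does not lower-case its input, so the capitalized "An" survives.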
Example #3
def usingTitleAlgorithm(userinput):
    #nltk's english stopword list
    stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
     'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves',
     'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its',
     'itself', 'they', 'them', 'their', 'theirs', 'themselves',
     'what', 'which', 'who', 'whom', 'this', 'that', 'these',
     'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
     'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a',
     'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
     'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
     'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
     'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
     'why', 'how', 'all', 'any', 'both', 'each', 'few','more', 'most', 'other', 'some', 'such',
     'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 'don', 'should', 'now']
    url = str(userinput)
    title, htmlText = web_crawler(url)
    qry = PunktWordTokenizer().tokenize(title)  #tokenize title
    qry = [words for words in qry if words.lower() not in stop] #Run Query through stopwords
    totalText = PunktSentenceTokenizer().tokenize(htmlText)
    textList = []
    for i in totalText:
        i = PunktWordTokenizer().tokenize(i.strip('.'))
        textList.append(i)

    sentenceRanks = summarizer(qry, textList)
    finalResults = []
    for num in range(1,4):  #skipping the first sentence because it's just going to be the title
        ind = sentenceRanks[num][0]
        finalResults.append(' '.join(textList[ind]))
    finalResults.append(title)
    return finalResults
Example #4
    def __init__(self,
                 tok=False,
                 wtok=False,
                 stok=False,
                 pos=False,
                 stem=False,
                 pos_model=None,
                 abbrev_set=None,
                 stok_model=None):
        """@param abbrev_set: a set of frequent abbreviations."""
        if tok:
            wtok = True
            stok = True

        if wtok:
            self.wordTokenizer = PunktWordTokenizer()
            #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
            self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)
            # Bragantino,2006.In fix this shit
        if stok:
            if stok_model is not None:
                try:
                    self.senTokenizer = stok_model
                except LookupError:
                    sys.stderr.write("WARNING: tokenizer cannot be loaded")
                    sys.stderr.write(
                        "WARNING: using an untrained sen_tokenizer")
                    self.senTokenizer = PunktSentenceTokenizer()
            else:
                try:
                    self.senTokenizer = nltk.data.load(
                        'tokenizers/punkt/english.pickle')
                except LookupError:
                    sys.stderr.write(
                        "WARNING: english tokenizer cannot be loaded, nltk.data does not contain it!"
                    )
                    sys.stderr.write(
                        "WARNING: using an untrained sen_tokenizer")
                    self.senTokenizer = PunktSentenceTokenizer()

        self.abbrev_set = (set(abbrev_set)
                           if abbrev_set is not None else set())

        if pos:
            if pos_model is not None:
                self.posTagger = HunposTagger(pos_model, encoding="utf-8")
            else:
                self.posTagger = HunposTagger(os.path.join(
                    os.environ['HUNPOS'], 'english.model'),
                                              encoding="utf-8")
        if stem:
            self.stemmer = WordNetLemmatizer()
Example #5
def get_indices():
    text = """What the f**k did you just f*****g say about me, you little bitch? I'll have you know I graduated top of my class in the military, and I've been involved in numerous secret raids on enemies, and I have over 300 confirmed kills. I am trained in gorilla warfare and I'm the top sniper in the entire US army. You are nothing to me but just another target. I will wipe you the f**k out with precision the likes of which has never been seen before on this Earth, mark my f*****g words. You think you can get away with saying that shit to me over the Internet? Think again, f****r. As we speak I am contacting my secret network of spies across the USA and your IP is being traced right now so you better prepare for the storm, maggot. The storm that wipes out the pathetic little thing you call your life. You're f*****g dead, kid. I can be anywhere, anytime, and I can kill you in over seven hundred ways, and that's just with my bare hands. Not only am I extensively trained in unarmed combat, but I have access to the entire arsenal of the United States marines and I will use it to its full extent to wipe your miserable ass off the face of the continent, you little shit. If only you could have known what unholy retribution your little "clever" comment was about to bring down upon you, maybe you would have held your f*****g tongue. But you couldn't, you didn't, and now you're paying the price, you goddamn idiot. I will shit fury all over you and you will drown in it. You're f*****g dead, kid."""
    # text = "Hi I like call of duty."
    text_tokens = PunktWordTokenizer().tokenize(text)
    print(len(text_tokens))
    print(text_tokens.index("clever"))
    result = []
    for ii, word in enumerate(text_tokens):
        answer = raw_input("Replace '{0}'? (y/n): ".format(word))
        if answer == 'y':
            result.append(ii)

    print(result)
Example #6
def Document2Word2VecTrainingInputFormat(document):
	"""
		Given an input string of plain text sentences, first
		tokenizes the document into sentences, then tokenizes
		each sentence at the word level. Returns a list of lists where
		each inner list represents a sentence in the input and the contents are the individual words of the sentence.
	"""
	output = list()
	sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
	word_detector = PunktWordTokenizer()
	sentences = sent_detector.tokenize(document)
	for sent in sentences:
		output.append(word_detector.tokenize(sent))
	return output
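A short usage sketch, assuming nltk, PunktWordTokenizer and the punkt sentence model are available; the document string is invented:

# Hypothetical two-sentence document; the result is one token list per sentence.
doc = "Punkt splits the text into sentences. Each sentence is then word-tokenized."
sentence_tokens = Document2Word2VecTrainingInputFormat(doc)
print(len(sentence_tokens))   # expected: 2
print(sentence_tokens[0])     # tokens of the first sentence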
Example #7
def worker(args):

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    word_detector = PunktWordTokenizer()

    def split_sentences(txt):
        sents = sent_detector.tokenize(txt.strip(), realign_boundaries=True)
        for sent in sents:
            sstr = u' '.join(word for word in word_detector.tokenize(sent))
            tkns = filter(None, sstr.split(u' '))
            if len(tkns) > 0:
                yield u' '.join(tkns)

    chunk, ofile = args

    with codecs.open(ofile, 'w', 'utf-8') as of:
        for path in chunk:
            with gzip.open(path) as f:
                for txt in find_story(f):
                    if txt is None:
                        continue
                    else:
                        for sent in split_sentences(txt):
                            of.write(sent)
                            of.write(u'\n')
                            of.flush()
            print 'Completed', path
Example #8
class Translator:
    """docstring for Translator"""
    def __init__(self, *dictionaries):
        self.tokenizer = PunktWordTokenizer()
        self.dictionaries = dictionaries

    def translate(self, sentence):
        tokens = self.tokenizer.tokenize(sentence)

        def select_value(l):
            '''Should select the correct value'''
            #TODO: Implement this, right now has default behavior
            if isinstance(l, list):
                return l[0]
            else:
                return l

        def tr(word):
            for d in self.dictionaries:
                found = d[word]
                if found is not None:
                    return found
            else:
                return word

        return [select_value(tr(w)) for w in tokens]
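Note that tr looks words up with d[word] and compares the result to None, so the dictionaries passed to Translator are expected to return None for unknown words rather than raise KeyError. A minimal sketch with a hypothetical mapping:

from collections import defaultdict

# Hypothetical English-to-Spanish mapping; missing words come back as None.
en_es = defaultdict(lambda: None, {"hello": "hola", "world": ["mundo", "tierra"]})
translator = Translator(en_es)
print(translator.translate("hello world"))   # expected: ['hola', 'mundo']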
Example #9
    def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        count = 0
        print "Gathering sentences and removing stopwords"
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [
                word for word in PunktWordTokenizer().tokenize(line.lower())
                if word not in stopwords
            ]
            documents.append(document)
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

        f_sentences.close()

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        print len(documents), "documents read"
        print len(self.dictionary), "unique tokens"
Example #10
 def __init__(self, tok=False, wtok=False, stok=False, pos=False, stem=False,
              pos_model=None, abbrev_set=None):
     """@param abbrev_set: a set of frequent abbreviations."""
     if tok:
         wtok = True
         stok = True
         
     if wtok:
         self.wordTokenizer = PunktWordTokenizer()
         #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
         self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)
         # Bragantino,2006.In fix this shit
     if stok:
         try:
             self.senTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
         except LookupError:
             sys.stderr.write("WARNING: english tokenizer cannot be loaded, nltk.data does not contain it!")
             sys.stderr.write("WARNING: using an untrained sen_tokenizer")
             self.senTokenizer = PunktSentenceTokenizer()
     
     self.abbrev_set = (set(abbrev_set) if abbrev_set is not None else set())
     
     if pos:
         if pos_model is not None:
             self.posTagger = HunposTagger(pos_model, encoding="utf-8")
         else:
             self.posTagger = HunposTagger(os.path.join(os.environ['HUNPOS'], 'english.model'), encoding="utf-8")
     if stem:
         self.stemmer = WordNetLemmatizer()
Example #11
def get_non_nouns(sentence):
    s1 = PunktWordTokenizer().tokenize(sentence)
    s2 = nltk.pos_tag(s1)
    useful = ["JJ", "JJS", "JJS", "RB", "RBR", "RBS"]
    splitted = []
    # print s2
    for pairs in s2:
        if (pairs[1] in useful):
            splitted.append(pairs[0])
    return splitted
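    # NOTE: everything below this return statement is unreachable; it appears to be
    # a fragment of a separate aspect-classification routine pasted into the example.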

    sentence = sentence.split()

    lmtzr = WordNetLemmatizer()
    for word in sentence:
        word = lmtzr.lemmatize(word)
    power = {}
    for aspect in feature_list:
        power[aspect] = compute_for_aspect(aspect, sentence, ad_words)
    m_probability = 0
    m_aspect = "general"
    for aspect in feature_list:
        if power[aspect][0] > m_probability:
            m_aspect = aspect
            m_probability = power[aspect][0]
    print(m_aspect + "\n")

    return m_aspect
Example #12
def worker(args):

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    word_detector = PunktWordTokenizer()

    def split_sentences(txt):
        sents = sent_detector.tokenize(txt.strip(), realign_boundaries=True)
        for sent in sents:
            sstr = u' '.join(word for word in word_detector.tokenize(sent))
            tkns = filter(None, sstr.split(u' '))
            if len(tkns) > 0:
                yield u' '.join(tkns)

    tdir, txt_files, ofile = args

    with codecs.open(ofile, 'w', 'utf-8') as of:
        for fname in txt_files:
            txt_file = os.path.join(tdir, fname)

            with codecs.open(txt_file, 'r', 'utf-8') as f:
                text = u' '.join(f.readlines())

                for sent in split_sentences(text):
                    of.write(sent)
                    of.write(u'\n')
                    of.flush()
            print 'Completed', txt_file
Example #14
    def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        print "Gathering sentences and removing stopwords"
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # TODO: remove punctuation, commas, etc.
            # remove common words and tokenize
            document = [word for word in PunktWordTokenizer().tokenize(line.lower()) if word not in stopwords]
            documents.append(document)

            # TODO: avoid keeping all documents in memory
            #dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))
        f_sentences.close()

        """
        print "Removing tokens that appear only once"
        # remove words that appear only once
        # TODO: check what the cut-off frequency is in word2vec, and do the same
        all_tokens = sum(documents, [])
        tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
        documents = [[word for word in text if word not in tokens_once] for text in documents]
        """

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        print len(documents), "documents read"
        print len(self.dictionary), "unique tokens"
Example #15
def parse_ap(input_path, output_path):
    from nltk.corpus import stopwords
    stop = stopwords.words('english')

    from nltk.tokenize.punkt import PunktWordTokenizer
    tokenizer = PunktWordTokenizer()

    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()

    from string import ascii_lowercase

    doc_title = ""
    doc_content = []
    doc_count = 0

    input_file_stream = open(input_path, 'r')
    output_file_stream = open(output_path, 'w')
    for line in input_file_stream:
        line = line.strip().lower()

        if line == "<text>" or line == "</text>":
            continue

        if line == "<doc>":
            continue

        if line == "</doc>":
            output_file_stream.write("%s\t%s\n" %
                                     (doc_title, " ".join(doc_content)))
            doc_count += 1
            if doc_count % 1000 == 0:
                print("successfully parsed %d documents" % (doc_count))
            continue

        if line.startswith("<docno>"):
            line = line.lstrip("<docno>")
            line = line.rstrip("</docno>")
            doc_title = line.strip()
            continue

        #doc_content = [stemmer.stem(x) for x in tokenizer.tokenize(line) if (min(y in ascii_lowercase for y in x))];
        doc_tokens = [
            x for x in tokenizer.tokenize(line)
            if (min(y in ascii_lowercase for y in x))
        ]
        doc_content = [stemmer.stem(x) for x in doc_tokens if x not in stop]
Example #16
def parse_ap(input_path, output_path):
    from nltk.corpus import stopwords
    stop = stopwords.words('english')

    from nltk.tokenize.punkt import PunktWordTokenizer 
    tokenizer = PunktWordTokenizer()

    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer();
    
    from string import ascii_lowercase
    
    doc_title = "";
    doc_content = [];
    doc_count = 0;
      
    input_file_stream = open(input_path, 'r');
    output_file_stream = open(output_path, 'w');
    for line in input_file_stream:
        line = line.strip().lower();
        
        if line=="<text>" or line=="</text>":
            continue;
        
        if line=="<doc>":
            continue;
        
        if line=="</doc>":
            output_file_stream.write("%s\t%s\n" % (doc_title, " ".join(doc_content)));
            doc_count += 1;
            if doc_count%1000==0:
                print("successfully parsed %d documents" % (doc_count));
            continue;
         
        if line.startswith("<docno>"):
            line = line.lstrip("<docno>");
            line = line.rstrip("</docno>");
            doc_title = line.strip();
            continue;
            
        #doc_content = [stemmer.stem(x) for x in tokenizer.tokenize(line) if (min(y in ascii_lowercase for y in x))];
        doc_tokens = [x for x in tokenizer.tokenize(line) if (min(y in ascii_lowercase for y in x))];
        doc_content = [stemmer.stem(x) for x in doc_tokens if x not in stop];
Example #17
 def SentenceBreak(self, paragraphs):
     output = []
     for paragraph in paragraphs:
         wb_para = ' '.join(PunktWordTokenizer().tokenize(
             paragraph.strip()))
         if self.sbreak_model:
             output.extend(self.sbreak_model.tokenize(wb_para))
         else:
             output.append(wb_para)
     return output
Example #18
def get_non_nouns(sentence):
    s1 = PunktWordTokenizer().tokenize(sentence)
    s2 = nltk.pos_tag(s1)
    useful = ["JJ", "JJS", "JJS", "RB", "RBR", "RBS"]
    splitted = []
    # print s2
    for pairs in s2:
        if (pairs[1] in useful):
            splitted.append(pairs[0])
    return splitted
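A usage sketch, assuming nltk (with a POS-tagger model installed) and PunktWordTokenizer are available; the exact output depends on the tagger:

print(get_non_nouns("The remarkably fast runner easily won the very long race"))
# expected to contain adjectives/adverbs such as 'fast', 'easily', 'very', 'long'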
Example #19
def remove_unnecessary(sentence):
  s1 = PunktWordTokenizer().tokenize(sentence)
  s2 = nltk.pos_tag(s1)
  useful = ["JJ","JJS","JJS","NN","NNS","NNP","NNPS","RB","RBR","RBS"]
  splitted = []
  # print s2
  for pairs in s2:
    if(pairs[1] in useful):
      splitted.append(pairs[0])
  return splitted
Example #20
def usingSentenceIntersectionAlgorithm(userinput):
    url = userinput
    title, htmlText = web_crawler(url)
    qry = PunktWordTokenizer().tokenize(title)  #tokenize title
    
    totalText = PunktSentenceTokenizer().tokenize(htmlText)
    textList = []
    for i in totalText:
        i = PunktWordTokenizer().tokenize(i.strip('.'))
        textList.append(i)
    ranks = sentence_ranks(textList)
    ranks_sorted = sorted(ranks.items(), key = operator.itemgetter(1))
    ranks_sorted.reverse()
    
    finalResults = []
    for num in range(1,4):  #skipping the first sentence because it's just going to be the title
        finalResults.append(ranks_sorted[num][0])
    finalResults.append(title)
    
    return finalResults
Example #21
def tokenize_document(doc, is_ted, is_only_nouns):
    """
    For a given text string, extracts the text's tokens. The text is pre-processed and filtered, then the NLTK tokenizer
    is applied; if the corresponding flag is enabled, the tokens are POS-tagged and only the nouns are kept; finally the
    tokens are lemmatized.
    PARAMETERS:
       1. doc: The string text from which to extract the tokens
       2. is_ted: A flag saying whether to add the custom stopwords prepared for the TED talks corpus to the standard English stopwords
       3. is_only_nouns: A flag saying whether to keep only the tokens tagged as nouns
    RETURNS:
       A list of strings where each string is a token from the given text
    """
    res = []
    
    try: 
        # First pre-process and filter the given text
        doc2 = remove_punctuation_stopwords(doc, is_ted)
        # From the pre-processed and filtered text, apply the NLTK tokenizer
        tokens = PunktWordTokenizer().tokenize(' '.join(doc2))
        # If the flag is enabled, only keep the tokens tagged as nouns
        if is_only_nouns:
            tagged_tokens = nltk.pos_tag(tokens)
            tokens = []
            for token, tag in tagged_tokens:
                if (tag == 'NN') or (tag == 'NNP') or (tag == 'NNS'):
                    tokens.append(token)
        # Lemmatize the tokens using the NLTK lemmatizer
        for i in range(0,len(tokens)):
            lema = WordNetLemmatizer().lemmatize(tokens[i])
            # If the token was not lemmatized, then apply verb lemmatization
            if lema == tokens[i]:
                lema = WordNetLemmatizer().lemmatize(tokens[i], 'v')
            if (len(lema) > 1) and (not lema.isdigit()):
                # Append the lema to the result to be returned
                res.append(lema)
    except:
        print "tokenize_document"
        print ""
        traceback.print_exc()

    return res
Example #22
def stringTokenizer(s):
    try:
        punct = set(string.punctuation)
        s = ''.join(x for x in s if x not in punct)
        s = s.lower()
        s = unicode(s)
        print "tokenizing query"
        tokens = PunktWordTokenizer().tokenize(s)
        tokens = [w for w in tokens if not w in stopset]
        return tokens
    except:
        print "keyword tokenizing error"
Example #23
 def test_latin_stopwords(self):
     """Filter Latin stopwords"""
     from cltk.stop.classical_latin.stops import STOPS_LIST
     sentence = 'Quo usque tandem abutere, Catilina, patientia nostra?'
     lowered = sentence.lower()
     tokens = PunktWordTokenizer().tokenize(lowered)
     no_stops = [w for w in tokens if w not in STOPS_LIST]
     target_list = [
         'usque', 'tandem', 'abutere', ',', 'catilina', ',', 'patientia',
         'nostra', '?'
     ]
     self.assertEqual(no_stops, target_list)
Example #24
def neutralRemover(message):
    # Analyze the sentiment of a bite by comparing it to an array of "positively"
    # and "negatively" oriented words.
    buy, sell = 0, 0

    #Tokenize the message into individual words
    tokenizer = PunktWordTokenizer()

    #Assign a bullish or bearish sentiment to each word
    for word in tokenizer.tokenize(message):
        if word in pos:
            buy += 1
        if word in neg:
            sell += 1

    #Compare total bullish sentiment to total bearish sentiment
    if buy > sell:
        return 1

    if buy < sell:
        return -1
    return 0
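A hedged sketch of how neutralRemover might be driven; pos and neg are module-level word lists in the original, so tiny placeholder sets are defined here purely for illustration:

# Placeholder sentiment word lists standing in for the module-level pos/neg globals.
pos = {"gain", "beat", "upgrade"}
neg = {"loss", "miss", "downgrade"}

print(neutralRemover("Shares gain after the company beat estimates"))    # expected: 1
print(neutralRemover("Analysts downgrade the stock on a revenue miss"))  # expected: -1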
Example #25
def main():
    query_file = sys.argv[1]
    doc_file = sys.argv[2]
    new_file = output_name(doc_file)

    query = read_file(query_file).translate(None, string.punctuation)
    query_words = PunktWordTokenizer().tokenize(query)

    lookup = {}

    # Construct dictionary from words to synonyms in query
    for word in query_words:
        word = str.lower(word)
        if word in lookup:
            break
        synonyms = [
            lemma.name
            for lemma in sum([ss.lemmas for ss in wordnet.synsets(word)], [])
        ]
        lookup[word] = synonyms

    output = ""
    with open(doc_file) as f:
        # traverse the document looking to replace synonyms of words in the query
        for line in f:
            text = line.translate(None, string.punctuation)
            text_words = PunktWordTokenizer().tokenize(text)
            for (i, word) in enumerate(text_words):
                for key in lookup:
                    if str.lower(word) in lookup[key]:
                        text_words[i] = key
                        break
            output += " ".join(text_words) + "\n"

    if os.path.exists(new_file):
        os.remove(new_file)
    with open(new_file, 'w') as f:
        f.write(output)
    print 'done'
Example #26
def parser():
    words = None
    searchterm = request.vars.query
    type = request.vars.type

    # null searches
    if not (request.vars):
        return dict(wordlist=True, words=words)
    elif not (searchterm):
        redirect(URL(r=request, c="language", f="dictionary"))
    elif ' ' in searchterm:
        pass
    else:
        redirect(URL(r=request, c="language", f="dictionary",
                     vars={'query': searchterm, 'type': type}))

    typequery = request.vars.type
    # add reference to example sentences
    #wordlist=searchterm.split()
    #for word in wordlist:
    #    pass
    if typequery == "English":
        query = dblanguage.BundjalungExamples.English.like('%%%s%%' % searchterm)
    else:
        query = dblanguage.BundjalungExamples.Language.like('%%%s%%' % searchterm)
    words = dblanguage(query)
    try:
        words = words.select()
    except:
        redirect(URL(r=request, c="language", f="dictionary",
                     vars={'query': searchterm, 'type': type}))

    # else load dictionary
    wd = dictionary.AboriginalLanguageDictionary()
    ws = stemmer.AboriginalLanguageStemmer()

    if words:
        return dict(wordlist=False, words=words, query=searchterm)
    else:
        newwords = PunktWordTokenizer().tokenize(searchterm)
        words = []
        for word in newwords:
            words += [translate_word(word, typequery, ws, wd)]
        lang = []
        english = []
        pos = []
        for word in words:
            printed_word = print_word(word)
            lang.append(printed_word[0])
            english.append(printed_word[1])
            pos.append(printed_word[2])
        words = [lang, english, pos]
    return dict(wordlist=True, words=words, query=request.vars.query)
Example #27
def overlapcontext(synset, sentence):
    gloss = set(PunktWordTokenizer().tokenize(synset.definition()))
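    # NOTE: set.union() returns a new set rather than updating in place, so the loop
    # below does not actually add the example strings to gloss as written.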
    for i in synset.examples():
        gloss.union(i)
    gloss = gloss.difference(functionwords)
    if isinstance(sentence, str):
        sentence = set(sentence.split(" "))
    elif isinstance(sentence, list):
        sentence = set(sentence)
    elif isinstance(sentence, set):
        pass
    else:
        return
    sentence = sentence.difference(functionwords)
    return len(gloss.intersection(sentence))
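A hedged usage sketch, assuming everything lives in one module and an NLTK version where both PunktWordTokenizer and the Synset.definition() method used above are available. The example references a module-level functionwords set, so the English stopword list is assumed to stand in for it here:

from nltk.corpus import stopwords, wordnet

# Hypothetical stand-in for the module-level `functionwords` set.
functionwords = set(stopwords.words('english'))

bank = wordnet.synsets('bank')[0]
print(overlapcontext(bank, "He sat on the bank of the river and watched the water flow"))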
Example #28
 def test_greek_stopwords(self):
     """Filter Greek stopwords"""
     from cltk.stop.classical_greek.stops_unicode import STOPS_LIST
     sentence = """Ἅρπαγος δὲ καταστρεψάμενος Ἰωνίην ἐποιέετο στρατηίην
     ἐπὶ Κᾶρας καὶ Καυνίους καὶ Λυκίους, ἅμα ἀγόμενος καὶ Ἴωνας καὶ
     Αἰολέας."""
     lowered = sentence.lower()
     tokens = PunktWordTokenizer().tokenize(lowered)
     no_stops = [w for w in tokens if w not in STOPS_LIST]
     target_list = [
         'ἅρπαγος', 'καταστρεψάμενος', 'ἰωνίην', 'ἐποιέετο', 'στρατηίην',
         'κᾶρας', 'καυνίους', 'λυκίους', ',', 'ἅμα', 'ἀγόμενος', 'ἴωνας',
         'αἰολέας.'
     ]
     self.assertEqual(no_stops, target_list)
Example #29
 def split(self, sep=None, maxsplit=-1):
     if not sep and maxsplit == -1:
         tokens = PunktWordTokenizer().tokenize(self)
     else:
         tokens = str.split(self, sep, maxsplit)
     result = []            
     for (index, token) in enumerate(tokens):
         if self.iob_out():
             iob_tag = self.OUT
         elif self.iob_begins() and index == 0:
             iob_tag = self.BEGINS
         else:
             iob_tag = self.IN
         result.append(self.__class__(token, iob_tag=iob_tag))            
     return result
Example #30
def contentTokenizer(url):
    try:
        response = mechanize.urlopen(url)
        soup = BeautifulSoup(response.read())
        s = soup.findAll(text=True)
        print "tokenizing : " + url
        punct = set(string.punctuation)
        s = ''.join(x for x in s if x not in punct)
        s = s.lower()
        s = unicode(s)
        tokens = PunktWordTokenizer().tokenize(s)
        tokens = [w for w in tokens if not w in stopset]
        return tokens
    except:
        print "url tokenizing error"
Example #31
def processDataFile():
    file_noun_list = []

    data_file_address = "data_file_trimmed.txt"
    file_in = open(data_file_address, 'r')

    for line in file_in:
        s1 = PunktWordTokenizer().tokenize(line)
        s2 = nltk.pos_tag(s1)
        noun_list = getNouns(s2)
        for noun in noun_list:
            lmtzr = WordNetLemmatizer()
            word = lmtzr.lemmatize(noun)
            file_noun_list.append(word)
    return file_noun_list
Example #32
    def create_dictionary(self):
        """ This function creates the dictionary"""

        if self.input_dir is not False:
            assert ("No Input dir")
        if self.save_file is not False:
            assert ("No Save File")
        if self.files_list is not False:
            assert ("No files list file")

        wnl = WordNetLemmatizer()
        pt = PunktWordTokenizer()
        files = os.listdir(self.input_dir)
        token_dicts = {}
        article_files = []
        for f in files:
            with open(os.path.join(self.input_dir, f), "r") as g:

                #tokens = pt.tokenize(g.read())

                #stemmed = [wnl.lemmatize(token) for token in tokens]
                #stemmed_nostop = [w for w in stemmed if w not in stopwords.words('english')]

                # Now let's hold all of the data in lists
                token_dicts[f] = g.read()
                #article_files.append(f)

        # Now let's initialize the tfIDF vectorizer
        print "Starting Vectorizer"
        tfidf = TfidfVectorizer(max_features=1000)
        matrix_val = tfidf.fit_transform(token_dicts.values())
        print type(matrix_val)

        mat_file = self.save_file + ".npy"
        self.matrix_val = matrix_val

        # Now write out this object
        pickle.dump(tfidf, open(self.save_file, "w"))

        # File lists
        with open(self.files_list, "w") as f:
            f.write("\n".join(token_dicts.keys()))
        self.file_names = token_dicts.keys()

        pass
Example #33
def read_file(file_object):
    lines = file_object.readlines()
    for line in lines:
        print "#######LINE#######"
        print line

        text = PunktWordTokenizer().tokenize(line)
        #text = nltk.wordpunct_tokenize(line)
        print "#######TEXT#######"
        print text
        """
        STOP WORD
        """
        stopwords = nltk.corpus.stopwords.words('english')
        content = [w for w in text if w[0].lower() not in stopwords]
        print "#######STOP WORD#######"
        print content
        """
        POS TAGGING
        """
        tagged_sent = nltk.pos_tag(content)
        tagged_sent = [(word, simplify_wsj_tag(tag))
                       for word, tag in tagged_sent]
        print "#######POS#######"
        print tagged_sent
        """
        STEMMING
        """
        #tagged_sent = tuple(tagged_sent)
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stem_word = ""
        for wrd in tagged_sent:
            stem_word = stem_word + " " + stemmer.stem(wrd[0])
        print "#######STEMMING#######"
        print stem_word
        """
        LEMMATIZING
        """
        print tagged_sent
        lmtzr = WordNetLemmatizer()
        sent = ""
        for wrd in tagged_sent:
            sent = sent + " " + lmtzr.lemmatize(wrd[0])
        print "#######LEMMA#######"
        print sent
Example #34
def sentenceSentiment(option): #option = 0 or 1 
    from nltk.tokenize.punkt import PunktWordTokenizer
    urlneg = 'http://www.unc.edu/~ncaren/haphazard/negative.txt'
    urlpos = 'http://www.unc.edu/~ncaren/haphazard/positive.txt'
    urllib.urlretrieve(urlneg,dirWordLists + os.sep + 'negative.txt')
    urllib.urlretrieve(urlpos,dirWordLists + os.sep + 'positive.txt')
    neg_list = open(dirWordLists + os.sep + 'negative.txt').read()
    pos_list = open(dirWordLists + os.sep + 'positive.txt').read()
    neg_list = sorted(list(set(neg_list.split('\n'))))
    neg_list = filter(None, neg_list)
    pos_list = sorted(list(set(pos_list.split('\n'))))
    pos_list = filter(None, pos_list)
    sent_Dict_sentiment = sentExtract('trainingset')
    sent_Dict = sentExtract('trainingset')
    setLocation = pathToSets + os.sep + 'trainingset'
    file_list = os.listdir(setLocation)
    file_list.remove('.DS_Store')
    file_list.sort()
    conceptVectors = getConceptVectors()
    for i in file_list:
        for j in conceptVectors:
            for n in list(range(len(sent_Dict[i][j]))):
                pos_count = 0
                neg_count = 0
                words = PunktWordTokenizer().tokenize(sent_Dict[i][j][n])
                for word in words:
                    if word in pos_list:
                        pos_count+=1
                    elif word in neg_list:
                        neg_count+=1
                sent_Dict_sentiment[i][j][n] = (pos_count, neg_count)
    if option == 0: #calculate overall positive and negative scores for documents
        return(sent_Dict_sentiment)
    elif option == 1: #calculate positive and negative separately for each document
        sent_Dict_Sum_sentiment = {}
        for x in file_list:
            total_pos = 0
            total_neg = 0
            for y in conceptVectors:
                for z in list(range(len(sent_Dict_sentiment[x][y]))):
                    total_pos += sent_Dict_sentiment[x][y][z][0]
                    total_neg += sent_Dict_sentiment[x][y][z][1]
            sent_Dict_Sum_sentiment[x] = [total_pos,total_neg]
        return(sent_Dict_Sum_sentiment)
Example #35
def tokenize(text):
    text = ' '.join(text)
    tokens = PunktWordTokenizer().tokenize(text)

    # lemmatize words. try both noun and verb lemmatizations
    lmtzr = WordNetLemmatizer()
    for i in range(0, len(tokens)):
        if lang != 'english':
            if tokens[i] in lang_dictionary:
                tokens[i] = lang_dictionary[tokens[i]]
        else:
            res = lmtzr.lemmatize(tokens[i])
            if res == tokens[i]:
                tokens[i] = lmtzr.lemmatize(tokens[i], 'v')
            else:
                tokens[i] = res

    # don't return any single letters
    tokens = [t for t in tokens if len(t) > 1 and not t.isdigit()]
    return tokens
Example #36
def worker(args):

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    word_detector = PunktWordTokenizer()

    def split_sentences(txt):
        sents = sent_detector.tokenize(txt.strip(), realign_boundaries=True)

        tokenizer = RegexpTokenizer(r'\w+')

        for sent in sents:

            tkns = tokenizer.tokenize(sent)
            #in word_detector.tokenize(sent))

            #tkns = filter(None, sstr.split(u' '))
            if len(tkns) > 0:
                yield u' '.join(tkns)

    tdir, txt_files, ofile, dpl = args

    nfiles = len(txt_files)

    with codecs.open(ofile, 'w', 'utf-8') as of:
        for i, fname in enumerate(txt_files, 1):
            txt_file = os.path.join(tdir, fname)

            with codecs.open(txt_file, 'r', 'utf-8') as f:
                text = u' '.join(f.readlines())
                if dpl:
                    doc_str = u' '.join(sent for sent in split_sentences(text))
                    of.write(doc_str)
                    of.write(u'\n')
                    of.flush()
                else:
                    for sent in split_sentences(text):
                        of.write(sent)
                        of.write(u'\n')
                        of.flush()
            print u'{}/{}) Completed {} --> {}'.format(i, nfiles, txt_file,
                                                       ofile).encode('utf-8')
Example #37
def get_stock_info():
    sents = get_sentences()
    found_stocks = []
    stock_data = {}

    for sent in sents:
        words = PunktWordTokenizer().tokenize(sent)
        filt_words = [
            w for w in words
            if not (w.lower() in stopwords.words('english') or w in punc)
        ]
        found = stock_in_senctence(filt_words)

        #for every stock found append list of words
        if len(found) > 0:
            for s in found:
                if not s in stock_data:
                    stock_data[s] = []
                stock_data[s].extend(filt_words)

    return stock_data
Example #38
 def preprocess_txt(self, text, convertlower=True, nopunk=True, stopwords=True, lemmatize_doc=True, lemmatize_pos=True, stemmed=False):
   #convert to lower
   if convertlower:
     text = text.lower()
   # remove punctuation
   if nopunk:
     text = self.remove_punct(text)
   #tokenize text
   tokens = PunktWordTokenizer().tokenize(text)
   #remove extra whitespaces
   tokens = [token.strip() for token in tokens]
   if stopwords:
     tokens = self.removestopwords(tokens)
   #lemmatize
   if lemmatize_doc:
     tokens = self.lemmatize(tokens,lemmatize_pos)
   #stem
   if stemmed:
     porter = PorterStemmer()
     tokens = [ porter.stem(token) for token in tokens ]
   return tokens
Example #39
    def replace_indices(self, indices, text):
        """Replace the indices of the tokenized text"""
        text_tokens = PunktWordTokenizer().tokenize(text)
        pos_tags = nltk.pos_tag(text_tokens)
        print(len(pos_tags))
        print(pos_tags)
        print(len(text_tokens))

        # result = [token.replace(filter(str.isalnum, token), )]

        
        for index in indices:
            try:
                stripped_word = filter(str.isalnum, text_tokens[index])
                text_tokens[index] = text_tokens[index].replace(stripped_word,
                                                                self._old_theme.replace(self._new_theme,stripped_word, pos_tags[index][1]),
                                                                )
            except IndexError as detail:
                print("Index error with {}, {}".format(text_tokens[index], detail))
                continue
            
        return " ".join(text_tokens)
Example #40
 def _tokenize(self):
     tok=PunktWordTokenizer()
     #tok=TreebankWordTokenizer()
     split_whitespace=lambda: re.compile(r'(\s+)').split(re.sub(u"\."," .",self.text))
     return list(chain(*[s if s.isspace() else tok.tokenize(s) for s in split_whitespace()]))
Example #41
    parser.add_argument('--verbose', '-v', action='store_true')
    parser.add_argument('--speak', '-s', action='store_true')
    args = parser.parse_args()

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=logging_level)

    if args.model:
        logging.debug('loading model...')
        hmm = load_model(args.model)

    if args.corpus:
        logging.debug('loading corpus...')
        corpus = open(args.corpus, 'rb').read()
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        word_detector = PunktWordTokenizer()
        sentences = sent_detector.tokenize(corpus.strip())
        words = [cleanup_words(word_detector.tokenize(s)) for s in sentences]

        logging.debug('training model...')
        trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=range(8), symbols=symbols(words))
        hmm = trainer.train_unsupervised(sequences(words), max_iterations=5)

        logging.debug('saving model...')
        save_model(args.corpus + '.hmm', hmm)

    logging.debug('sampling model...')

    while(True):
        utterance = sample(hmm, random.randint(5, 15)) + '.'
        print utterance
Example #42
from __future__ import print_function
import sys
import re
import random
from nltk.tokenize.punkt import PunktWordTokenizer
from config import *

from process_clusters import get_another_word_in_cluster

tkn = PunktWordTokenizer()

if len(sys.argv) <= 1:
    sys.stderr.write('Please specify the file you want to Twitterize\n')
    sys.exit()


with open(sys.argv[1], 'r') as f:

    # hacky way of keeping track of whether we're in a quote or anchor tag
    # I assume it just toggles, no nesting
    in_open_quote = False
    in_open_anchor_tag = False
    
    previous_word_translated = False

    for line in f:
        line = line.replace('&#32;', ' ')\
                   .replace('>', '> ').replace('<', ' <')\
                   .replace(START_QUOTE, 'START_QUOTE ')\
                   .replace(END_QUOTE, ' END_QUOTE')\
                   .replace('.', ' .')
Example #43
 def __init__(self):
     self.tokenizer = PunktWordTokenizer()
Example #44
 def __init__(self, *dictionaries):
     self.tokenizer = PunktWordTokenizer()
     self.dictionaries = dictionaries
Example #45
class NltkTools:
    _abbrevPattern  = re.compile(r"([\w][\w]?[.]){2,}$", re.UNICODE)
    _datePattern    = re.compile(r"(^|\s)(?:[\d]{2}){1,2}[.]$", re.UNICODE)
    _cleanerPattern = re.compile("(\w\w)([.?,:;!])(\w)(\w)", re.UNICODE)

    def __init__(self, tok=False, wtok=False, stok=False, pos=False, stem=False,
                 pos_model=None, abbrev_set=None):
        """@param abbrev_set: a set of frequent abbreviations."""
        if tok:
            wtok = True
            stok = True
            
        if wtok:
            self.wordTokenizer = PunktWordTokenizer()
            #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
            self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)
            # Bragantino,2006.In fix this shit
        if stok:
            try:
                self.senTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            except LookupError:
                sys.stderr.write("WARNING: english tokenizer cannot be loaded, nltk.data does not contain it!")
                sys.stderr.write("WARNING: using an untrained sen_tokenizer")
                self.senTokenizer = PunktSentenceTokenizer()
        
        self.abbrev_set = (set(abbrev_set) if abbrev_set is not None else set())
        
        if pos:
            if pos_model is not None:
                self.posTagger = HunposTagger(pos_model, encoding="utf-8")
            else:
                self.posTagger = HunposTagger(os.path.join(os.environ['HUNPOS'], 'english.model'), encoding="utf-8")
        if stem:
            self.stemmer = WordNetLemmatizer()

    def tokenize(self, raw):
        """Runs sentence and then word tokenization. Does some abbreviation-
        detection to fix false sentence endings."""
        sentences = self.sen_tokenize(raw)
        tokens = [self.word_tokenize(sen) for sen in sentences]
        for i in reversed(xrange(len(tokens) - 1)):
            if ( self.is_abbrev(tokens[i][-1])
                 or NltkTools._abbrevPattern.match(tokens[i][-1]) is not None
                 and not NltkTools.starts_with_upper(tokens[i + 1][0])):
                tokens[i].extend(tokens[i + 1])
                tokens.pop(i + 1)
        return tokens
        

    def sen_tokenize(self, raw):
        """Tokenizes the raw text into sentences."""
        raw = NltkTools.cleanup_puncts(raw)
        return self.senTokenizer.tokenize(raw)

    def filter_long_sentences(self, raw, length=1024):
        """Filters "sentences" (non-whitespace character sequences longer than
        length) from the text."""
        # TODO: This looks nice but it is too generous with memory use
        return ' '.join(filter(lambda x: len(x) <= length, re.split(r"\s+", raw)))
        
    def sen_abbr_tokenize(self, raw):
        """Tokenizes the raw text into sentences, and tries to handle problems
        caused by abbreviations and such."""
        sentences = self.sen_tokenize(raw)
        for i in reversed(xrange(len(sentences) - 1)):
            if (NltkTools._abbrevPattern.search(sentences[i]) is not None
                    and not NltkTools.starts_with_upper(sentences[i + 1])):
                sentences[i] = ' '.join(sentences[i:i+2])
                sentences.pop(i + 1)
        return sentences

    @staticmethod
    def starts_with_upper(text):
        """Checks if the sentence starts with an upper case letter."""
        t = text.lstrip()
        return len(t) > 0 and t[0].isupper()
    
    @staticmethod
    def cleanup_puncts(raw):
        pos = 0
        cleaner = NltkTools._cleanerPattern.search(raw[pos:])
        while cleaner:
            if cleaner.group(2) == "." and not cleaner.group(3)[0].isupper():
                pos = cleaner.end()
            elif cleaner.group(1)[-1].isdigit() and cleaner.group(3)[0].isdigit():
                pos = cleaner.end()
            else:
                changed_part_string = cleaner.expand(r"\1\2 \3\4")
                raw = raw[:cleaner.start()] + changed_part_string + raw[cleaner.end():]
                pos = cleaner.end()
            cleaner = NltkTools._cleanerPattern.search(raw, pos)
        return raw
    
    def is_abbrev(self, tok):
        return tok in self.abbrev_set

    def word_tokenize(self, sen):
        """Tokenizes the sentence to words and splits the sentence ending
        punctuation mark from the last word and adds it as the last token."""
        tokens = self.wordTokenizer.tokenize(sen)
        if len(tokens) == 0:
            return []
        punktMatchObject = self.punktSplitter.match(tokens[-1])
        if punktMatchObject is not None and not self.is_abbrev(tokens[-1]):
            tokens = tokens[:-1] + list(punktMatchObject.groups())
        return tokens

    def pos_tag(self, sentokens):
        return self.posTagger.tag(sentokens)

    def stem(self, tokens):
        return ((tok, pos, self.stemmer.lemmatize(tok, penn_to_major_pos[pos])) for tok, pos in tokens)
        
    def tag_raw(self, raw_text):
        """Convenience method for tagging (a line of) raw text. The NltkTools
        instance must have been initialized with C{pos=True, stem=True, tok=True}.
        It is a generator: returns attribute array of one word at a time. The
        attributes are the word, the pos tag and the stem."""
        sens = self.tokenize(raw_text)
        pos_tagged = list(self.pos_tag(sen) for sen in sens)
        stemmed = list(self.stem(pos_tagged_sen) for pos_tagged_sen in pos_tagged)
        for sen in stemmed:
            for tok in sen:
                yield tok
            yield []
        return
Example #46
class Word_Tokenizer():
    def __init__(self):
        self.tokenizer = PunktWordTokenizer()
    
    def tokenize(self, sentence):
        return self.tokenizer.tokenize(sentence)
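A one-line usage sketch, assuming PunktWordTokenizer is importable from the installed NLTK version:

print(Word_Tokenizer().tokenize("This wrapper simply delegates to PunktWordTokenizer."))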
Example #47
def get_tokens(text, remove_stopwords=True):
    tokenizer = PunktWordTokenizer()
    return [term for term in tokenizer.tokenize(text.lower()) \
        if (len(term) > 1 or term.isalpha()) and \
        (term not in stopwords.words('english') or (not remove_stopwords))]
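A hedged usage sketch; it assumes the imports used by get_tokens (PunktWordTokenizer and nltk.corpus.stopwords) are in scope:

print(get_tokens("The tokens are lower-cased and English stopwords are dropped."))
print(get_tokens("Keep the stopwords this time.", remove_stopwords=False))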