Example No. 1
def demo():
    # split paragraph into sentences using Punkt
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(paragraphs)
    
    # split sentence into tokens (words + punctuation)
    s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    WordPunctTokenizer().tokenize(s)
    #['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    PunktWordTokenizer().tokenize(s)
    #['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    PunktWordTokenizer().span_tokenize(s)
    #[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),  (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
    
    #split the paragraph into sentence
    nltk.sent_tokenize(s)
    #split sentence into word and punct
    nltk.word_tokenize(s)
    
    # pos tagging
    nltk.pos_tag(nltk.word_tokenize(s))
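
The demo above assumes a `paragraphs` variable defined elsewhere and that the NLTK models are already installed; a minimal, self-contained sketch of the same calls (the sample paragraph and the download steps are illustrative assumptions):

import nltk
from nltk.tokenize import WordPunctTokenizer

# one-time model downloads (assumed to be needed on a fresh install)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

paragraph = "Good muffins cost $3.88 in New York. Please buy me two of them. Thanks."
print(nltk.sent_tokenize(paragraph))                 # sentence split
print(WordPunctTokenizer().tokenize(paragraph))      # word + punctuation split
print(nltk.pos_tag(nltk.word_tokenize(paragraph)))   # POS tags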



    
    
    
    
Example No. 2
def __get_extra_wiki_description(mesh_text, wiki_text, tfidf):
    mesh_sents = sent_tokenize(mesh_text)
    wiki_sents = sent_tokenize(wiki_text)
    mesh_tfidf_list = __sentences_to_tfidf_vecs(mesh_sents, tfidf)
    wiki_tfidf_list = __sentences_to_tfidf_vecs(wiki_sents, tfidf)

    extra_description = ''
    for i, wiki_tfidf_vec in enumerate(wiki_tfidf_list):
        have_similar = False
        for j, mesh_tfidf_vec in enumerate(mesh_tfidf_list):
            sim_val = tfidf.sim(wiki_tfidf_vec, mesh_tfidf_vec)
            if sim_val > 0.95:
                # print sim_val, 'SIMILAR:'
                # print mesh_sents[j]
                # print wiki_sents[i]
                have_similar = True
                break
        if not have_similar:
            extra_description += ' ' + wiki_sents[i]

    if len(extra_description) > 1:
        extra_description = extra_description[1:]
        if extra_description[-1].isalpha():
            extra_description += '.'
        elif extra_description[-1] == ':':
            extra_description = extra_description[:-1] + '.'
        return extra_description
    return ''
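
`__sentences_to_tfidf_vecs` and the `tfidf` object are project helpers that are not shown; a hedged sketch of the assumed interface, built on scikit-learn (names and details are placeholders, not the original implementation):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class SimpleTfidf(object):
    """Hypothetical stand-in for the `tfidf` argument used above."""
    def __init__(self, corpus_sentences):
        self.vectorizer = TfidfVectorizer().fit(corpus_sentences)

    def vectorize(self, sentence):
        return self.vectorizer.transform([sentence])

    def sim(self, vec_a, vec_b):
        # cosine similarity between two sparse tf-idf vectors
        return cosine_similarity(vec_a, vec_b)[0][0]

def __sentences_to_tfidf_vecs(sentences, tfidf):
    # one sparse tf-idf vector per sentence
    return [tfidf.vectorize(s) for s in sentences]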
Example No. 3
def postroot():
    if 'text' in request.forms:
        text = request.forms['text']
        sentences = sent_tokenize(text)
        result = " ".join(w+'/'+t for s in sent_tokenize(text)
                          for (w,t) in pos_tag(word_tokenize(s)))
    else:
        text = 'Type your text here'
        result = ''
    return template("""
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <title>My Part of Speech Tagger</title>
</head>
<body>
<h1>My Part of Speech Tagger</h1>
<p>Type or paste your text below</p>
<form method="post">
<textarea name="text" rows="10" cols="50">
{{text}}
</textarea>
<input type="submit"/>
</form>
<hr>
<p>The tagged text is</p>
<p>{{tagged}}</p>
</body>
</html>
    """, text=text, tagged=result)
Example No. 4
def cosineReadable(sentences):
	#FIRST CHECK - we need at least 3 sentences for this method to be worth it
	if (len(nltk.sent_tokenize(sentences)) <= 2):
		return sentences
	else:	#we have enough sentences to do a readability overhaul
		wordDimensions = [] #this gives every word an assigned dimension in the vector
		for sent in nltk.sent_tokenize(sentences):
			for word in nltk.word_tokenize(sent):
				if word not in wordDimensions: #no duplicates
					wordDimensions.append(word)

		sentlist = nltk.sent_tokenize(sentences)
		firstsent = sentlist[0]		
		sentenceVectors = [] #this will be a list of sentVectors for every sent in summary
		for i in range(0,len(sentlist)): #turn every sentence into a vector
			vec = makeSentVector(sentlist[i], wordDimensions)
			sentenceVectors.append(vec)
		sentScores = {} #dic keeps track of cosine distance scores for the sentences (in comparison to the first sentence)		
		firstSentVec = sentenceVectors[0]
		for x in range(1, len(sentlist)):
			sent = sentlist[x]
			val = spatial.distance.cosine(firstSentVec, sentenceVectors[x])
			sentScores[sent] = val
		
		sentScores = sorted(sentScores, reverse=True, key=sentScores.get)
		summary = str(sentlist[0])+"\n"
		for otherSent in sentScores:
			summary+=str(otherSent).strip()+"\n"
		summary = summary.strip()
		return summary
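
`makeSentVector` is assumed by `cosineReadable`; a plausible bag-of-words implementation over the shared `wordDimensions` axis (an assumption, not the original helper):

import nltk

def makeSentVector(sentence, wordDimensions):
    # one count per dimension word, in the order the dimensions were collected
    words = nltk.word_tokenize(sentence)
    return [words.count(dim) for dim in wordDimensions]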
Example No. 5
def get_summaries_and_articles(coll):
    '''
    INPUT: mongo collection object
    OUTPUT: list of summaries, list of articles

    Runs through the MongoDB and extracts all of the newser.com summaries
    with their corresponding articles.
    '''

    summary_list = []
    article_list = []

    for doc in list(coll.find()):
        if doc['full_text'] != ' ':
            summary_list.append(doc['summary'])
            article_list.append(doc['full_text'])

    for i in xrange(len(article_list)):
        text = ''
        for article in article_list[i]:
            text += article
        article_list[i] = text

    summary_test = np.unique([summary_list[i] for i in xrange(len(summary_list))
                              if article_list[i] != '' and
                              article_list[i] != ' ' and
                              len(sent_tokenize(article_list[i])) > 10])
    article_test = np.unique([article for article in article_list
                              if article != '' and
                              article != ' ' and
                              len(sent_tokenize(article)) > 10])

    return summary_test, article_test
Example No. 6
def print_summary(indexes, doc, extract_n, doc_index):

    if len(indexes) < extract_n:
        extract_n = len(indexes)

    reference = "reference/task" + str(doc_index) + "_englishReference" + str(doc_index) + ".txt"
    reference_output = io.open(reference, "w", encoding='utf8')
    tips = sent_tokenize(doc.tip)

    for tip in tips:
        reference_output.write(tip + "\n")
    reference_output.close()

    sentences = sent_tokenize(doc.review)
    
    #print ""
    ## print "sentences length: " + str(len(sentences))
    #print ""
    #print "indexes: " + str(indexes)
    #print ""
    
    system = "system/task" + str(doc_index) + "_englishSyssum" + str(doc_index) + ".txt"
    system_output = io.open(system, "w", encoding='utf8')    
    for i in range(0, extract_n):
        #print "index: " + str(indexes[i])
        system_output.write(sentences[indexes[i]] + "\n")

    system_output.close()
Example No. 7
def refineText(infp, outfp):
    stringlist = []
    textline = ""
    size = ""
    for line in infp:
        current = line.strip().replace('  ',' ')
        if current.startswith("<size>"):
            if current != size and size != "":
                for sentence in nltk.sent_tokenize(''.join(stringlist)):
                    for token in MyTokenizer().tokenize(sentence):
                        token = token.replace("“", "")
                        token = token.replace("”", "")
                        outfp.write(token+" ")
                outfp.write('\n')
                stringlist = []
                outfp.write('\n')
            stringlist.append(textline)
            size = current
        elif current == '':
            continue
        elif current[-1] == '-':
            textline = current[0:-1]
        else:
            textline = current+' '
    for sentence in nltk.sent_tokenize(''.join(stringlist)):
        for token in MyTokenizer().tokenize(sentence):
            token = token.replace("“", "")
            token = token.replace("”", "")
            outfp.write(token+" ")
    outfp.write('\n')
Example No. 8
def readD(txtdoc):
		
	#find basename
	import os, nltk
	base = os.path.basename(txtdoc)		
			#read file
	with open (txtdoc,"r") as myfile:
		text = myfile.readlines()
		
	#extract relevant text from dataset
				
	#write document
	f = open(base + ".ready", "w")         
		
			
	#counts loops
	a = 0
			
	#for every line
	for line in text:
							 
		if line.startswith("<bestanswer>"):
			
			cleansentence = line[12:-13].replace("&#xa;"," ").replace(";",".").replace("&lt;br /&gt;&#xa;","").replace("&#xa;"," ").replace("...",".").replace("<"," ").replace("&lt.br /&gt.","")
		#split line into sentences
			sentences = nltk.sent_tokenize(cleansentence)
				
			s = len(sentences)
			#write into document
			x=0
			while x < (s-1):
				f.write(sentences[x] + "\n")
				a +=1
				x+=1
			f.write(sentences[s-1])
					
			a +=1
			print( (str(a)), end='\r') 

		if line.startswith("<answer_item>"):
			
			cleansentence = line[13:-14].replace("&#xa;"," ").replace(";",".").replace("&lt;br /&gt;&#xa;","").replace("&#xa;"," ").replace("...",".").replace("<"," ").replace("&lt.br /&gt.","")
		#split line into sentences
			sentences = nltk.sent_tokenize(cleansentence)
				
			s = len(sentences)
			#write into document
			x=0
			while x < (s-1):
				f.write(sentences[x] + "\n")
				a +=1
				x+=1
			f.write(sentences[s-1])
					
			a +=1
			print( (str(a)), end='\r') 
				
	f.close()
        
Example No. 9
def print_instance(relations, finlist, is_train):
    arg1 =  reduce(lambda x,y: x+y, [nltk.word_tokenize(s) for s in nltk.sent_tokenize(finlist[0])])
    arg2 = reduce(lambda x,y: x+y, [nltk.word_tokenize(s) for s in nltk.sent_tokenize(finlist[1])])
    if len(relations)>1:
        return
    #if is_train:
    for relation in relations:
        fw.write(json.dumps({'Arg1':arg1,'Arg2':arg2,'Sense':relation})+'\n')
Example No. 10
def percentage_long_sent(text):
    long_sentence = 0
    sentence_all = len(nltk.sent_tokenize(text))
    sentence_list = nltk.sent_tokenize(text)
    for sentence in sentence_list:
        wordlist = nltk.word_tokenize(sentence)
        word_count = len(wordlist)
        if word_count >15:
            long_sentence += 1
    return long_sentence/sentence_all
def create_summary(text):
    text = re.sub(r'\s\s+', ' ', text)
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < 10:
        num = 3
    else:
        num = 2

    summarizer = SimpleSummarizer()
    return nltk.sent_tokenize(summarizer.summarize(text, num))
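
`SimpleSummarizer` comes from elsewhere in the project; a hedged sketch of a frequency-scored summarizer with the same `summarize(text, n)` shape (an assumption, not the original class):

import nltk
from nltk.corpus import stopwords

class SimpleSummarizer(object):
    def summarize(self, text, num_sentences):
        stop = set(stopwords.words('english'))
        sents = nltk.sent_tokenize(text)
        freq = nltk.FreqDist(w.lower() for w in nltk.word_tokenize(text)
                             if w.isalpha() and w.lower() not in stop)
        # score each sentence by the summed frequency of its content words
        scored = [(sum(freq[w.lower()] for w in nltk.word_tokenize(s)), i, s)
                  for i, s in enumerate(sents)]
        top = sorted(scored, reverse=True)[:num_sentences]
        # keep the selected sentences in their original order
        return " ".join(s for _score, _i, s in sorted(top, key=lambda t: t[1]))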
Example No. 12
def featurize():
    n = 100   # number of articles per topic
    employer = request.form['user_input']
    ftopic = df[df['company']==employer].head(n)
    text = list(ftopic['pros'].values)
    text = " ".join(text)
    text = re.sub('[^\w\s]+', ' ', text).replace('\n', ' ')
   # tokenize into words
    tokens = [word.lower() for sent in sent_tokenize(text) \
             for word in word_tokenize(sent)]
   # remove stopwords

   # some extra stop words not present in stopwords
    stop = stopwords.words('english')
    stop += ['said', 'would', 's', 'also', 'U', 'mr', 're', 'may', 'one', 'two', 'buy', 'much', \
            'take', 'might', 'say', 'new', 'year', 'many','etc', 'll', 've']
    stop.append(str(employer))  # add the employer name itself as a stopword

    tokens = [token for token in tokens if token not in stop]
   # keep only tokens with at least two characters
    tokens = [word for word in tokens if len(word) >= 2]
    string = " ".join(tokens)
    wordcloud = WordCloud(font_path='/Library/Fonts/Arial Rounded Bold.ttf').generate(string)
    plt.figure(figsize=(50,30))
    plt.imshow(wordcloud)
    plt.axis("off")
    name = 'static/' +str(employer) + '-pros.png'
    pic = plt.savefig(name, bbox_inches='tight',transparent = True)

    text2 = list(ftopic['cons'].values)
    text2 = " ".join(text2)
    text2 = re.sub('[^\w\s]+', ' ', text2).replace('\n', ' ')
   # tokenize into words
    tokens2 = [word.lower() for sent in sent_tokenize(text2) \
             for word in word_tokenize(sent)]
   # remove stopwords

   # some extra stop words not present in stopwords
    stop2 = stopwords.words('english')
    stop2 += ['said', 'would', 's', 'also', 'U', 'mr', 're', 'may', 'one', 'two', 'buy', 'much', \
            'take', 'might', 'say', 'new', 'year', 'many','etc', 'll', 've']
    stop2.append(str(employer))  # add the employer name itself as a stopword

    tokens2 = [token for token in tokens2 if token not in stop2]
   # keep only tokens with at least two characters
    tokens2 = [word for word in tokens2 if len(word) >= 2]
    string2 = " ".join(tokens2)
    wordcloud2 = WordCloud(font_path='/Library/Fonts/Arial Rounded Bold.ttf').generate(string2)
    plt.figure(figsize=(50,30))
    plt.imshow(wordcloud2)
    plt.axis("off")
    name2 = 'static/' +str(employer) + '-cons.png'
    pic2 = plt.savefig(name2, bbox_inches='tight',transparent = True)

    return render_template('template_wordcloud.html', pic_pro = name, pic_con=name2, employer=employer)
Example No. 13
def main():
    tagged = getTagged(corpusdir)
    featureSet = [(getFeatures(feature), tag) for (feature, tag) in tagged]
    trainSet = featureSet[:]
    testSet = featureSet[:100]
    classifier = nltk.NaiveBayesClassifier.train(trainSet)

    fileList = os.listdir(corpusdir)
    sentences = []
    visited = []
    for (stem, tag) in [(f[:-4], f[-3:]) for f in fileList]:
        if stem in visited:
            continue
        else:
            visited.append(stem)
        print stem

        f_pos, f_neg = open(corpusdir + "/" + stem + "_pos"), open(corpusdir + "/" + stem + "_neg")
        raw_pos, raw_neg = f_pos.read(), f_neg.read()
        sent_pos, sent_neg = sent_tokenize(raw_pos), sent_tokenize(raw_neg)
        f_pos.close()
        f_neg.close()

        falseNeg = falsePos = trueNeg = truePos = 0
        for sent in sent_pos:
            guess = classifier.classify(getFeatures(sent))
            if guess == "POS":
                truePos +=1
            else:
                falseNeg += 1

        for sent in sent_neg:
            guess = classifier.classify(getFeatures(sent))
            if guess == "NEG":
                trueNeg +=1
            else:
                falsePos += 1

        posTags = len(sent_pos)
        negTags = len(sent_neg)
        totTags = posTags + negTags

        #print "Total sentences: %i" % (totTag)
        #print "Total negative: %.2f%%" % (float(negTags) / totTag * Tag100)
        #print "Total positive: %.2f%%" % (float(posTags) / totTag * 100)
        #print "True negatives: %.2f%%" % (float(trueNeg) / negTags * 100)
        #print "True positives: %.2f%%" % (float(truePos) / posTags * 100)
        print "False negatives: %.2f%%" % (float(falseNeg) / posTags * 100)
        print "False positives: %.2f%%" % (float(falsePos) / negTags * 100)
        print ""


    print "Accuracy: %f" % nltk.classify.accuracy(classifier, testSet)
Example No. 14
 def _shuffle_text(self, text, times, label_func):
     from random import shuffle
     origin_sents = sent_tokenize(text)
     assert len(origin_sents) > 1
     sents = sent_tokenize(text)
     res = []
     for i in range(times):
         shuffle(sents)
         label = label_func(sents, origin_sents)
         res.append((' '.join(sents[:-1]), label))
     return res
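
`label_func` is supplied by the caller; a hedged usage sketch with a simple binary label that marks whether the shuffle changed the sentence order (the labeling scheme is an assumption):

from nltk import sent_tokenize

def order_changed(shuffled_sents, original_sents):
    return int(shuffled_sents != original_sents)

# e.g. obj._shuffle_text("First sentence. Second sentence. Third one.", times=3,
#                        label_func=order_changed)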
Example No. 15
	def content(self, title, text):
		""" Set title and text of the content needs to be parsed. """
		self._title = title
		self._text = text
		self._sepText = text.split('\n')
		self._tokens = nltk.word_tokenize(self._text) # not using regex for tokenization
		self._textSents = nltk.sent_tokenize(self._text)
		self._textSents = list(map(lambda x: x.strip(), self._textSents)) # strip all sentences
		self._sepTextSents = []
		for pp in self._sepText:
			self._sepTextSents.append(nltk.sent_tokenize(pp))
Example No. 16
def get_crowdd500_data(set_type):
  """
  Returns documents and keywords in either train or test sets of Crowd500 [Marujo2012]
  """
  path = 'data/500N-KPCrowd-v1.1/CorpusAndCrowdsourcingAnnotations/' + set_type + '/'
  files = [f[:-4] for f in os.listdir(path) if re.search('\.key',f)]

  documents = []
  all_keywords = []

  if set_type=='test':
    documents = pickle.load(open(path + 'scraped_testdata.pkl','rb')) # scraped webpages in test set
    skip_these = [3,7,14,19,26,27,32,33,43,45] # these webpages no longer exist, cannot find source text

  for file_idx in xrange(len(files)):
    if set_type=='train':

      # original text
      f = open(path + files[file_idx] + '.txt','r')
      text = f.read()
      f.close()  

      # encoding issues in Crowd500  
      try:
        text = text.encode('utf-8')
        sentences = nltk.sent_tokenize(text.lower())        
      except:
        text = text.decode('utf-8')
        sentences = nltk.sent_tokenize(text.lower())   
      
      documents.append(text)

      # keywords
      keywords = []
      with open(path + files[file_idx] + '.key','r') as f:
        for line in f:
          keywords.append(line.strip('\n'))            
      keywords = [remove_punctuation(k.lower()) for k in keywords]
      all_keywords.append(keywords)

    else:
      if file_idx not in skip_these:
        keywords = []
        with open(path + files[file_idx] + '.key','r') as f:
          for line in f:
            keywords.append(line.strip('\n'))            
        keywords = [remove_punctuation(k.lower()) for k in keywords]
        all_keywords.append(keywords)
  
  return {'documents':documents, 'keywords':all_keywords}
Example No. 17
def parse_and_tag(corpus):
    boring_tags=['CC','DT',',','IN','PRP','PRP$','VBZ','TO','POS',':','(',')','AT','.',"''"]
    if isinstance(corpus,basestring):
        with lorecorpus.open(corpus) as fin:
            sents=nltk.sent_tokenize(fin.read().strip())
    else:
        sents=nltk.sent_tokenize(corpus.raw().strip())
    tagged_text_unmerged=([nltk.pos_tag(nltk.word_tokenize(sent)) for sent in sents])
    #make a list of tuples, not a list of lists of tuples
    tagged_text=[item for sublist in tagged_text_unmerged for item in sublist]
    all_word_tuples=[(word[0].lower(), word[1]) for word in tagged_text if word[1] not in boring_tags]
    #turn the tuples into lists
    all_word_lists=[list(word) for word in all_word_tuples]
    all_words=[a for (a,b) in all_word_tuples]
    return all_word_lists
Example No. 18
def extractNames(li):
    finList = []
##  Loop through the list that has the HTML page content
    for a in li:
##  Tokenize the HTML text into smaller blocks of text
        for send in nltk.sent_tokenize(str(a)):
            smLi = []
##  Tokenize the smaller blocks of text in individual words and then add a Part-of-Speech(POS) tag
            for index, chunk in enumerate(nltk.pos_tag(nltk.word_tokenize(send))):
##  If the POS tag is NNP (proper noun)
                if 'NNP' in chunk[1]:
##  If each character in the word is alphanumeric and the word is more than 2 characters long
                    if(len(' '.join(e for e in chunk[0] if e.isalnum())) > 2):
##  Append the list with the index of the word, chunk that has the POS tag and the link
                        smLi.append([index, chunk, a[1]])
            finList.append(smLi)
    nameLi = []
    for f in finList:
        if len(f) > 0:
            strName = ''
            for index, i in enumerate(f):
##  If strName is blank, declare it with the current word in the list
                if strName == '':
                    strName = i[1][0]
##  If index+1 is not at the end of the list, continue
                if (index + 1) < len(f):
##  If the index is a consecutive index, add to the strName
                    if i[0] + 1 == f[index + 1][0]:
                        strName = strName + ' ' + f[index + 1][1][0]
##  If the index is not a consecutive index, append strName to the nameLi list with the article link and make the strName blank
                    else:
                        if ' ' in strName:
                            nameLi.append([strName, i[2]])
                        strName = ''
    return nameLi
Example No. 19
def editText(filename):
    global subject
    subject = get_subj(filename)
    f = open(filename, 'r')
    text = f.read()
    f.close()
    new_file_name = subject + 'B.txt'
    g = open(new_file_name, 'w')
    sents = sent_tokenize(text)
    for i in range(len(sents)):
        sents[i] = sents[i].replace(' he ', ' '+subject+' ')
        sents[i] = sents[i].replace(' he.', ' '+subject+'.')
        sents[i] = sents[i].replace(' he,', ' '+subject+',')
        sents[i] = sents[i].replace('He ', subject+' ')
        sents[i] = sents[i].replace(' him ', ' '+subject+' ')
        sents[i] = sents[i].replace(' him.', ' '+subject+'.')
        sents[i] = sents[i].replace(' him,', ' '+subject+',')
        sents[i] = sents[i].replace(' himself ', ' '+subject+' ')
        sents[i] = sents[i].replace(' himself.', ' '+subject+'.')
        sents[i] = sents[i].replace(' himself,', ' '+subject+',')
        sents[i] = sents[i].replace(' his ', ' '+subject+" ")
        sents[i] = sents[i].replace(' his.', ' '+subject+".")
        sents[i] = sents[i].replace(' his,', ' '+subject+",")
        sents[i] = sents[i].replace('His ', subject + ' ')
        sents[i] = simplify_sent(sents[i])
        g.write(sents[i] + ' ')
    g.close()
    return new_file_name
Example No. 20
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word
              for sent in nltk.sent_tokenize(' '.join(re.findall(r'[\w]+', text, re.UNICODE)),
                                             language='spanish')
              for word in nltk.word_tokenize(sent, language='spanish')]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.match('[a-zA-Z]', token):
            filtered_tokens.append(token.lower())
    return filtered_tokens

#strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text
# def strip_proppers_POS(text):
#     tagged = nltk.tag.pos_tag(text.split()) #use NLTK's part of speech tagger
#     non_propernouns = [word for word,pos in tagged if pos != 'NNP' and pos != 'NNPS']
#     return non_propernouns

# def wordFrequency(tokens, stopwords):
#     dictFreq = {}
#     for token in tokens:
#         if not token in stopwords:
#             dictFreq[token] = tokens.count(token)

#     ans = sorted(dictFreq, key=dictFreq.__getitem__, reverse=True)
#     ipdb.set_trace()
#     return
Example No. 21
def beginnerText(txt):
    sentences = sent_tokenize(txt)
    for sentence in sentences:
        sent = word_tokenize(sentence)
        sent = [word for word in sent if word.isalpha() and word not in ['amp']]
        speak(' '.join(sent))
        time.sleep(0.35)
def getThreads(subreddit,num_comments=10,max_threads=5000,max_comments=100,min_comments=10,verbose=False):
    comment_counter = 0
    already_done = [] #keep track of threads you've already seen (you can get them twice)
    subred = r.get_subreddit(subreddit) #get a subreddit
    comments = []
    questionComment = []
    for sub in subred.get_hot(limit=max_threads):
        if sub.id not in already_done and comment_counter < num_comments:
            already_done.append(sub.id)
            sub.replace_more_comments(limit=None, threshold=1)
            flat_comments = praw.helpers.flatten_tree(sub.comments)
            for comment in flat_comments:
                diff_comment = True
                for sentence in sent_tokenize(comment.body.encode('utf-8')):  
                    if '[deleted]' in sentence:
                        break            
                    comments.append(sentence)
                    if '?' in sentence and not diff_comment:
                        s = {}
                        s['Request'] = comments[-2]+' '+sentence
                        s['id'] = comment.id
                        s['score'] = comment.score
                        questionComment.append(s)          
                        comment_counter += 1
                        print 'Added question. Comment counter',comment_counter
                    diff_comment = False
                    if comment_counter>num_comments:
                        return [comments,questionComment]
    return [comments,questionComment]
Example No. 23
 def genTokens(self):
     text = self.ensureText()
     # set off w/space any entities that are butted up to preceding data
     text = re.sub(r'(?<!\s)(?P<entityref>%s)' % Tokenizer.entityRE, 
                   ' \g<entityref>',
                   text)
     # set off w/space any entities that are butted up to following data
     text = re.sub(r'(?P<entityref>%s)(?!\s)' % Tokenizer.entityRE,
                   '\g<entityref> ',
                   text)
     for (entities, segment) in self.genSegments(text):
         # print "SEGMENT: [%s %r]" % (entities,segment)
         segment = segment.strip()
         if entities:
             for entity in re.split(r'\s+',segment):
                 # print " ENTITY: [%s]" % entity;
                 yield entity
         else:
             sentences = nltk.sent_tokenize(segment)
             # correct for any embedded newlines (irrelevant?)
             sentences = [re.sub(r'[\n\t]+', ' ', sent).strip() for sent in sentences]
             # inexplicably, NLTK thinks big,red should be a single token
             sentences = [re.sub(r'\b,\b', ', ', sent) for sent in sentences]
             for sentence in sentences:
                 # print "  SENTENCE: [%s]" % sentence
                 for tok in nltk.word_tokenize(sentence):
                     # print "    TOK: [%s]" % tok
                     yield tok
Example No. 24
def num_words(filename):
  lengths = []
  with open(filename) as f:
    lines = f.read()
    for l in nltk.sent_tokenize(lines):
      lengths.append(len(nltk.word_tokenize(l)))
  return (sum(lengths) - 0.0) / len(lengths)
Example No. 25
def process_query(query_str):
    """
    Tokenize and stem the query words and compute the frequency of each word in the query list

    Arguments:
        query_str       string of query words

    Returns: 
        query_count     a dictionary with the stemmed words and the its frequency in the query
    """
    
    query_list = []
    sentences = nltk.sent_tokenize(query_str)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        for word in words:            
            normalized = text_processing.normalize(word)
            if normalized is not None:
                query_list.append(normalized)
            
    # count the frequency of each term
    query_count = Counter(query_list)

    # set the tf value for each term
    query_weight = {}
    for query_term, term_count in query_count.items():
        query_weight[query_term] = 1 + math.log10(term_count)

    return query_weight
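
`text_processing.normalize` is an external helper; a hedged sketch of what it presumably does, i.e. lowercasing, dropping stopwords and punctuation, and stemming (the exact behavior is an assumption):

import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

_stemmer = PorterStemmer()
_stopwords = set(stopwords.words('english'))

def normalize(word):
    """Return a stemmed, lowercased token, or None if the word should be skipped."""
    word = word.lower()
    if word in _stopwords or all(ch in string.punctuation for ch in word):
        return None
    return _stemmer.stem(word)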
Example No. 26
def get_xmen_text(soup):
    
    #en_stopwords = set(nltk.corpus.stopwords.words('english'))
    raw = nltk.clean_html(str(soup))
    raw_trunc = raw[:raw.rfind('References')]
    sents = nltk.sent_tokenize(raw_trunc)
    words = [nltk.word_tokenize(sent) for sent in sents]
    poss = [nltk.pos_tag(word) for word in words]
    #nes = [nltk.ne_chunk(pos, binary=True) for pos in poss]
    #for pos in poss: print pos
    poss_filter = [filter_insignificant(pos, tag_suffixes=['DT']) for pos in poss]
    print poss_filter
    nes = [nltk.ne_chunk(pos, binary=True) for pos in poss_filter]
    
    def sub_leaves(tree, node):
        return [t.leaves() for t in tree.subtrees (lambda s: s.node == node)]
    
    people = [sub_leaves(ne, 'NE') for ne in nes]
    people = [item for sublist in people
              for subsublist in sublist
              for subsubsublist in subsublist
              for item in subsubsublist
              if item not in ('NNP', 'NN', 'NNPS', 'JJ')]
    people = merge_people(people)
    fd = nltk.FreqDist(person for person in people if person!='Magneto')
    fd.plot(50)
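
`filter_insignificant` and `merge_people` are helpers defined elsewhere; a hedged sketch of `filter_insignificant`, which presumably drops (word, tag) pairs whose tag ends with one of the given suffixes (an assumption modeled on the common NLTK cookbook recipe):

def filter_insignificant(chunk, tag_suffixes=['DT', 'CC']):
    # keep only (word, tag) pairs whose tag does not end with a listed suffix
    return [(word, tag) for (word, tag) in chunk
            if not any(tag.endswith(suffix) for suffix in tag_suffixes)]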
Example No. 27
 def capitalize(self, text):
     """
     Capitalizes whole text.
     """
     return " ".join(
         self.capitalize_sentence(sent) for sent in sent_tokenize(text)
     )
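
`capitalize_sentence` is another method of the same class; a standalone, hedged sketch of the pair (the helper's exact behavior is an assumption):

from nltk import sent_tokenize

def capitalize_sentence(sentence):
    # upper-case only the first character, leave the rest untouched
    return sentence[:1].upper() + sentence[1:]

def capitalize(text):
    return " ".join(capitalize_sentence(sent) for sent in sent_tokenize(text))

print(capitalize("hello there. how are you?"))  # Hello there. How are you?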
Example No. 28
def poem_tokenizer(poem):
    lines = open(poem).readlines()
    tokens = []
    for l in lines:
        m = re.sub(r'[^0-9a-zA-Z\s\']', '', l) # cleans annoying punctuation, but keep apostrophes
        tokens.append([word for sent in nltk.sent_tokenize(m) for word in nltk.word_tokenize(sent)])
    return tokens
Example No. 29
File: eval.py  Project: nbir/544nlp
def test3():
	import nltk
	from nltk.corpus import conll2000
	from urllib import urlopen

	fname = 'data/dummy/webpages/Abby_Watkins/raw/002/index.html'
	doc = urlopen(fname).read()
	raw = nltk.clean_html(doc)

	decoded = raw.decode('utf-8', errors='ignore')
	raw = decoded.encode('utf-8')
	print raw

	sentences = nltk.sent_tokenize(raw)
	sentences = [s.replace('\n', '').replace('\r', '').strip() for s in sentences]
	sentences = [nltk.word_tokenize(s) for s in sentences]
	sentences = [nltk.pos_tag(s) for s in sentences]
	#porter = nltk.PorterStemmer()
	#sentences = [[(porter.stem(w[0]), w[1]) for w in s] for s in sentences]
	#sentences = [[w[0] for w in s] for s in sentences]
	#sentences = [['%s_%s' % w for w in s] for s in sentences]


	lexicon = []
	#for s in sentences:
		#print len(s)
		#for w in s:
		#	print w[0]
		#print ' '.join(w[0] for w in s)
		#print nltk.ne_chunk(s, binary=True)

		#lexicon.extend(s)
	fdist = nltk.FreqDist(lexicon)
Example No. 30
def preprocess_email(filename, puncts, stemmer):
    fin = open(filename, 'rb')
    text = fin.read()
    fin.close()
    # lowercase
    text = text.lower()
    # strip all HTML
    text = re.sub("<[^<>]+>", "", text)
    # Handle numbers
    text = re.sub("[0-9]+", "number", text)
    # Handle URLs
    text = re.sub("(http|https)://[^\s]*", "httpaddr", text)
    # Handle email addresses
    text = re.sub("[^\s]+@[^\s]+", "emailaddr", text)
    # Handle $ sign
    text = re.sub("[$]+", "dollar", text)
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # get rid of punctuation
            if word in puncts: continue
            # remove non-alpha chars
            word = re.sub("[^a-zA-Z0-9]", "", word)
            # stem the word
            word = stemmer.stem(word)
            # skip word if too short (currently a NOOP)
            if len(word) < 1: continue
            yield word
Example No. 31
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
#wordnet for using synsets

text = input()
sent = sent_tokenize(text)
tokens = []
for i in sent:
    token = word_tokenize(i)
    print(token)
    tokens.extend(token)  # collect tokens from every sentence, not just the last one

stop = set(stopwords.words("english"))

filteredSentence = []
for w in tokens:
    if w not in stop:
        filteredSentence.append(w)

print("filtered sentence: ", filteredSentence)

#create a database of all the symptoms and compare it with the tokens and stemmed tokens
#if the comparison % is more than 80, consider them; otherwise don't

#ask the user to verify the symptoms
#if the user adds something, filter the sentence and compare it against the database
#if the match is >80%, add it; otherwise google it and add the symptoms
Example No. 32
def DNN_lm():
    """
        http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial
        -part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/
    """

    vocabulary_size = 26
    unknown_token = "UNKNOWN_TOKEN"
    sentence_start_token = "SENTENCE_START"
    sentence_end_token = "SENTENCE_END"
    path = os.path.dirname(os.path.realpath(__file__))

    #print("Reading CSV file...")
    with open(path + '/data/reddit-comments-2015-08.csv', 'r') as f:
        reader = csv.reader(f, skipinitialspace=True)
        next(reader)
        # Split full comments into sentences
        sentences = itertools.chain(
            *[nltk.sent_tokenize(x[0].lower()) for x in reader])
        # Append SENTENCE_START and SENTENCE_END
        sentences = [
            "%s %s %s" % (sentence_start_token, x, sentence_end_token)
            for x in sentences
        ]
    #print("Parsed %d sentences." % (len(sentences)))

    # Tokenize the sentences into words
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    #print("Found %d unique words tokens." % len(word_freq.items()))

    # Get the most common words and build index_to_word
    # and word_to_index vectors
    vocab = word_freq.most_common(vocabulary_size - 1)
    index_to_word = [x[0] for x in vocab]
    index_to_word.append(unknown_token)
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

    #print("Using vocabulary size %d." % vocabulary_size)
    #print("The least frequent word in our vocabulary is '%s' \
    #      and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

    # Replace all words not in our vocabulary with the unknown token
    for i, sent in enumerate(tokenized_sentences):
        tokenized_sentences[i] = [
            w if w in word_to_index else unknown_token for w in sent
        ]

    print("\nExample sentence: '%s'" % sentences[0])
    print("\nExample sentence after Pre-processing: '%s'" %
          tokenized_sentences[0])

    # Create the training data
    X_train = np.asarray([[word_to_index[w] for w in sent[:-1]]
                          for sent in tokenized_sentences])
    y_train = np.asarray([[word_to_index[w] for w in sent[1:]]
                          for sent in tokenized_sentences])

    #print(X_train)
    #print(y_train)

    def gen_input(x):
        sentence = x
        d = np.zeros((len(sentence), vocabulary_size))
        d[np.arange(len(sentence)), sentence] = 1
        return d

    l1 = Layer(100, Tansig())
    l2 = Layer(vocabulary_size, Softmax())
    #l2 = Layer(vocabulary_size, Logsig())

    net = Network((l1, l2))
    net.connect(1, 2, D=0)
    net.connect(1, 1, D=1)

    net.load(vocabulary_size, 1, D=0)

    sentence10 = gen_input(X_train[10])
    o = net.forward(sentence10)
    print(o)

    training_input = [gen_input(x) for x in X_train[:2]]
    training_output = [gen_input(x) for x in y_train[:2]]

    net.train(training_input, training_output)
    o = net.forward(sentence10)
    print(o)
Example No. 33
def convert(files_raw_data):

    files_raw_data = files_raw_data.replace("\n", " ")
    files_raw_data = files_raw_data.lower()
    sentences = sent_tokenize(files_raw_data)
    return sentences
Example No. 34
def process_corpus(corpus_name):
    print("Corpus to examine: " + corpus_name)
    input_file = corpus_name + ".zip"
    corpus_contents = unzip_corpus(input_file)
    sentences = []
    words = []
    pos_results = open(corpus_name + "-pos.txt", 'w+')
    cur_sentence = []
    all_pos = []
    for entry in corpus_contents:
        sentences.append(nltk.sent_tokenize(entry))
    for story in sentences:
        for sent in story:
            word_sent = nltk.word_tokenize(sent)
            words.extend(word_sent)
            cur_sentence = nltk.pos_tag(word_sent)
            all_pos.extend(cur_sentence)
            for pair in cur_sentence:
                pos_results.write(pair[0] + "/" + pair[1])
                pos_results.write('\n')
        pos_results.write('\n')
    print("Number of words: " + str(len(words)))
    i = 0
    for word in words:
        words[i] = word.casefold()
        i += 1
    print("The vocabulary size is: " + str(len(set(words))))
    most_common = nltk.FreqDist(pos for (word, pos) in all_pos)
    freq_list = most_common.most_common()
    print("The most common part of speech is " + str(freq_list[0][0]) +
          " which occurs " + str(freq_list[0][1]) + " times.")
    print("")
    word_dist = nltk.FreqDist(word for word in words)
    word_freq = word_dist.most_common()
    freq_results = open(corpus_name + "-word-freq.txt", 'w+')
    for pair in word_freq:
        freq_results.write(str(pair))
        freq_results.write('\n')
    chart_freq = nltk.ConditionalFreqDist(
        (word.casefold(), tag) for (word, tag) in all_pos)
    con_freq = nltk.ConditionalFreqDist(
        (tag, word.casefold()) for (word, tag) in all_pos)
    copy = sys.stdout
    sys.stdout = open(corpus_name + "-pos-word-freq.txt", 'w+')
    chart_freq.tabulate()
    sys.stdout = copy
    common_words_by_pos = [
        con_freq['NN'].most_common()[0], con_freq['VBD'].most_common()[0],
        con_freq['JJ'].most_common()[0], con_freq['RB'].most_common()[0]
    ]
    text_words = nltk.Text(words)
    print("The most common Noun is " + common_words_by_pos[0][0] +
          ". Similar words include:")
    text_words.similar(common_words_by_pos[0][0])
    print("The most common Past Tense Verb is " + common_words_by_pos[1][0] +
          ". Similar words include:")
    text_words.similar(common_words_by_pos[1][0])
    print("The most common Adjective is " + common_words_by_pos[2][0] +
          ". Similar words include:")
    text_words.similar(common_words_by_pos[2][0])
    print("The most common Adverb is " + common_words_by_pos[3][0] +
          ". Similar words include:")
    text_words.similar(common_words_by_pos[3][0])
    print("")
    print("The found collocations are:")
    text_words.collocations()

    pass
    def train_splitter(data_generator):

        while True:
        
            # get the batch triplet
            query, pos_docs, neg_docs = next(data_generator)
            
            # tokenization
            query = tokenizer.texts_to_sequences(query)
            
            if queries_sw is not None:
                # rebuild each tokenized query without the stopwords (rebinding the loop
                # variable would not modify the lists in place)
                query = [[token for token in tokenized_query if token not in queries_sw]
                         for tokenized_query in query]
            
            new_pos_docs = []
            new_neg_docs = []
            
            new_pos_extra_features = []
            new_neg_extra_features = []
            
            # sentence splitting
            if mode==4:
                
                for b in range(len(pos_docs)):
                    new_pos_docs.append([])
                    new_neg_docs.append([])
                    
                    _temp_pos_docs = nltk.sent_tokenize(pos_docs[b]["text"])
                    _temp_pos_docs = tokenizer.texts_to_sequences(_temp_pos_docs)
                    
                    if docs_sw is not None:
                        # filter the stopwords out of every tokenized sentence
                        _temp_pos_docs = [[token for token in tokenized_doc if token not in docs_sw]
                                          for tokenized_doc in _temp_pos_docs]
                    
                    # skip batch with empty pos_docs
                    if all([ len(sentence)==0  for sentence in _temp_pos_docs]):
                        break  # try a new resampling; NOTE: this is a quick fix and should be redone,
                               # for obvious reasons
                    
                    _temp_neg_docs = nltk.sent_tokenize(neg_docs[b]["text"])
                    _temp_neg_docs = tokenizer.texts_to_sequences(_temp_neg_docs)
                    
                    if docs_sw is not None:
                        # filter the stopwords out of every tokenized sentence
                        _temp_neg_docs = [[token for token in tokenized_doc if token not in docs_sw]
                                          for tokenized_doc in _temp_neg_docs]
  
                    # compute extra features
                    #extra_features_pos_doc = compute_extra_features(query[b], _temp_pos_docs, idf_from_id_token)
                    #extra_features_neg_doc = compute_extra_features(query[b], _temp_neg_docs, idf_from_id_token)
                    
                    # add the bm25 score
                    #extra_features_pos_doc.append(pos_docs[b]["score"])
                    #extra_features_neg_doc.append(neg_docs[b]["score"])
                    
                    # add all the extra features
                    #new_pos_extra_features.append(extra_features_pos_doc)
                    #new_neg_extra_features.append(extra_features_neg_doc)
                    
                    # split by exact matching
                    for t_q in query[b]:
                        # entry for the query-term
                        new_pos_docs[-1].append([])
                        new_neg_docs[-1].append([])
                        
                        for pos_sent in _temp_pos_docs:
                            # exact match against the pos_document
                            for i,t_pd in enumerate(pos_sent):
                                if t_pd==t_q:
                                    new_pos_docs[-1][-1].append(pos_sent)
                                    break

                        for neg_sent in _temp_neg_docs:
                            for i,t_nd in enumerate(neg_sent):
                                if t_nd==t_q:
                                    new_neg_docs[-1][-1].append(neg_sent)
                                    break
            else:
                raise NotImplementedError("Missing implementation for mode " + str(mode))
            
            if len(new_pos_docs) == len(pos_docs): # if batch is correct
                yield query, new_pos_docs, new_pos_extra_features, new_neg_docs, new_neg_extra_features
Example No. 36

raw_document_text = 'Federer is married to former Women\'s Tennis Association '\
'player Mirka Vavrinec. He met her while both were competing for Switzerland in'\
'the 2000 Sydney Olympics. Couple of years later Vavrinec retired from the tour because of a'\
'foot injury.[35] They were married at Wenkenhof Villa in Riehen near Basel on'\
 '11 April 2009, surrounded by a small group of close friends and family.[36]'\
 'In July 2009, Mirka gave birth to identical twin girls, Myla Rose and Charlene'\
 ' Riva.[37] The Federers had another set of twins in 2014, this time boys whom'\
 '  they named Leo and Lennart,[38] called Lenny.[39]'

doc = Document.Document('RFWiki', '2016\/11\/22', raw_document_text)

json_str = json.dumps(doc, default=lambda o: o.__dict__)

tokens = nltk.sent_tokenize(doc.raw_text)

sentences = []

classes = ['money', 'percent', 'date', 'time']

index = 0
for sent in tokens:
    sent_object = Document.Sentence(sent, index, doc.document_id)
    index = index + 1
    tags1 = NERTagger1.tag(sent_object.raw_sentence.split())
    for i, t in enumerate(tags1):
        sent_object.words[GetASCIIString(t[0]) + '_' +
                          str(i)].ne_tag = GetASCIIString(t[1])
    tags2 = NERTagger2.tag(sent_object.raw_sentence.split())
    for i, t in enumerate(tags2):
Example No. 37
def allowed(word):
    return len(word) > 1 and set(word) <= accepted_chars


# load corpus into RAM
print('Loading corpus')
#txt = open(CorpusFName, 'rU').read()
txt = codecs.open(CorpusFName, encoding='utf-8').read()

# select word tokenizer
#tokenizer = TreebankWordTokenizerNoContract()

# split text into sentences
print('Splitting to sentences')
sents = nltk.sent_tokenize(txt, language=lang)

# init count
Counts = defaultdict(int)
lines = 0
nlines = len(sents)
for s in sents:
    tokens = nltk.word_tokenize(s)

    # drop tokens that are punctuation symbols only
    keep = []
    for t in tokens:
        if not set(t) <= punctiation:
            keep.append(t.lower())

    if verbose:
Example No. 38
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('popular', quiet=True)  # for downloading packages

# uncomment the following only the first time
#nltk.download('punkt') # first-time use only
#nltk.download('wordnet') # first-time use only

#Reading in the corpus
with open('chatbot.txt', 'r', encoding='utf8', errors='ignore') as fin:
    raw = fin.read().lower()

# Tokenisation
sent_tokens = nltk.sent_tokenize(raw)  # converts to list of sentences
word_tokens = nltk.word_tokenize(raw)  # converts to list of words

# Preprocessing
lemmer = WordNetLemmatizer()


def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]


remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)


def LemNormalize(text):
    return LemTokens(
def make_dictionaries(file_dir, m=2):
    from re import sub, findall
    from nltk import sent_tokenize
    import pandas as pd
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer

    # read in input file
    try:
        with open(file_dir, 'r') as f:
            text = f.read()
    except:
        print(
            "Unable to open input .txt file. Rerun with correct path to input .txt."
        )
        return

    # ---------------- data cleaning -----------------
    # split into sentences (one per item in series)
    df = pd.Series(sent_tokenize(text))
    del text

    # lowercase everything
    df = df.str.lower()

    # remove stray apostrophes and parenthesis
    df = df.apply(lambda x: sub(r"((?<=\s)'|'(?!\w))", '', x))
    df = df.apply(lambda x: sub('\"', '', x))
    df = df.apply(lambda x: sub(r"[\(\)\[\]]", '', x))
    df = df.apply(lambda x: sub(r"(?<=[a-zA-Z]),", ' commaplaceholder ', x))

    def sub_endline(x):
        endline = findall(r"\W*$", x)[0]

        if '\!' in endline:
            return sub(r"\W*$", ' eendline', x)
        elif '\?' in endline:
            return sub(r"\W*$", ' qendline', x)
        else:
            return sub(r"\W*$", ' pendline', x)

    df = df.apply(sub_endline)

    # ---------- create data structures ------------------
    data_structs = [None] * (m + 1)

    # probabilities for the starting word
    data_structs[0] = df.str.split(n=1).str[0].value_counts()
    data_structs[0] = data_structs[0] / data_structs[0].sum()

    # distributions for subsequent words
    for i in range(1, m + 1):
        vect = CountVectorizer(token_pattern=r"(?u)\b[^\s]+\b",
                               analyzer='word',
                               ngram_range=(i + 1, i + 1))
        vect.fit(df)

        # get occurrences out of vect
        pairs = pd.Series(np.asarray(
            vect.transform(df).sum(axis=0)).reshape(-1),
                          index=vect.get_feature_names(),
                          name='freq')

        pairs.index.name = 'tokens'
        pairs = pairs.reset_index()

        # expand to 2 columns (prompt, response)
        pairs = pd.concat([
            pairs['tokens'].str.rsplit(n=1, expand=True).rename(columns={
                0: 'prompt',
                1: 'response'
            }), pairs['freq']
        ],
                          axis=1)

        # undo endline/comma substitutions
        pairs['prompt'] = pairs['prompt'].apply(
            lambda x: sub(r"\s*commaplaceholder", ',', x))
        pairs['response'] = pairs['response'].apply(
            lambda x: sub(r"\s*commaplaceholder", ',', x)).replace(
                'pendline', '.').replace('qendline',
                                         '?').replace('eendline', '!')

        # store results in a dictionary
        doubles = {}
        for token, group in pairs.groupby('prompt'):
            doubles[token] = {
                'prob': (group['freq'] / group['freq'].sum()).values,
                'token': group['response'].values
            }

        data_structs[i] = doubles

    # export data
    import pickle

    with open('data.pkl', 'wb') as f:
        pickle.dump(data_structs, f)
    f.close()
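
A hedged usage sketch: build the dictionaries from a plain-text corpus and reload the pickle the function writes (the input file name here is illustrative):

make_dictionaries('corpus.txt', m=2)

import pickle
with open('data.pkl', 'rb') as f:
    start_probs, bigram_dict, trigram_dict = pickle.load(f)

print(start_probs.head())              # starting-word probabilities
print(list(bigram_dict.items())[:1])   # prompt -> {'prob': ..., 'token': ...}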
Example No. 40
# Read in the corpus
# with open('chatbot_de.txt','r', encoding='utf8', errors ='ignore') as bockwurst:
#     raw = bockwurst.read().lower()

with open("new.txt", 'r', encoding='utf8', errors='ignore') as tweet1file:
    raw = tweet1file.read().lower()

with open(os.path.join("json", "trump_data_file.txt"),
          'r',
          encoding='utf8',
          errors='ignore') as tweet2file:
    raw = tweet2file.read().lower()

# Tokenization
# sent_tokens converts the text into a list of sentences
sent_tokens = nltk.sent_tokenize(raw)
# word_tokens converts the text into a list of words (not used)
word_tokens = nltk.word_tokenize(raw)

# Preprocessing
lemmer = WordNetLemmatizer()


def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]


remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)


def LemNormalize(text):
Example No. 41
    "zipcode",
    "city",
    "neighbourhood_cleansed",
    "market",
    "smart_location",
    "room_type",  # =Private room
    "bedrooms",  # =1
    "beds",  # =1
    "price"
]

df = pd.read_csv(file_object, usecols=lambda x: x in VIBE_FIELDS)
sentences = []
for document in df['neighborhood_overview'][1:100]:
    try:
        sentences.extend(nltk.sent_tokenize(document))
    except:
        pass
file = open('sentence2label.csv', "w")
sentence2label = []
for sentence in sentences:
    print sentence
    try:
        labels = raw_input("Enter labels. ex. green,diverse  :")
    except ValueError:
        labels = None
    file.write(sentence + "," + labels + "\n")
#sentence2label = [(sentence,labels)]
#print sentence2labels

#store['airbnb_vibes_raw'] = df
Example No. 42
import nltk
from nltk.corpus import reuters
sentences = nltk.sent_tokenize(reuters.raw('test/21131')[:1000])
print("#sentences={0}\n\n".format(len(sentences)))
for sent in sentences:
    print(sent, '\n')
def ie_preprocess(document):
    document = ' '.join([i for i in document.split() if i not in stop])
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences
Example No. 44
def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
Example No. 45
nif=rdflib.Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
data=sys.argv[1]
for arg in sys.argv:
	lang = arg
count=0
for filename in os.listdir('Files/Input'+lang+'/'):
	if (count < int(data)):
		#print(filename)
		graph2=rdflib.Graph()
		graph2.parse('Files/Input'+lang+'/'+filename,format='nt')
		g=Graph()
		name=filename.split(".")[0]
		s=graph2.serialize(format="nt")
		for s,p,o in graph2:
			if type(o)==rdflib.term.Literal and nif.isString in p:
				sentences = nltk.sent_tokenize(o)
				for i in sentences:
					try:
						BI=o.index(i)
						EI=o.index(i)+len(i)
						g.add([rdflib.term.URIRef("http://dbpedia.org/resource/"+name+"?dbpv=2016-10&nif=sentence_"+str(BI)+"_"+str(EI)),RDF.type,nif.Sentence])
						g.add([rdflib.term.URIRef("http://dbpedia.org/resource/"+name+"?dbpv=2016-10&nif=sentence_"+str(BI)+"_"+str(EI)),nif.beginIndex,rdflib.term.Literal(str(BI))])
						g.add([rdflib.term.URIRef("http://dbpedia.org/resource/"+name+"?dbpv=2016-10&nif=sentence_"+str(BI)+"_"+str(EI)),nif.endIndex,rdflib.term.Literal(str(EI))])
						g.add([rdflib.term.URIRef("http://dbpedia.org/resource/"+name+"?dbpv=2016-10&nif=sentence_"+str(BI)+"_"+str(EI)),nif.anchorOf,rdflib.term.Literal(i)])
						g.add([rdflib.term.URIRef("http://dbpedia.org/resource/"+name+"?dbpv=2016-10&nif=sentence_"+str(BI)+"_"+str(EI)),nif.referenceContext,rdflib.term.URIRef("http://dbpedia.org/resource/"+name+"?dbpv=2016-10&nif=context")])     
					except:
						pass
		g.bind("nif",nif)        
		#print(g.serialize(format="turtle"))
		g.serialize(destination='Files/Sentence/'+filename,format="turtle")
		count=count+1
               history. Our production needed to move to the southern
               tip of this planet just to be able to find snow. Climate
               change is real, it is happening right now. It is the most
               urgent threat facing our entire species, and we need to work
               collectively together and stop procrastinating. We need to
               support leaders around the world who do not speak for the 
               big polluters, but who speak for all of humanity, for the
               indigenous people of the world, for the billions and 
               billions of underprivileged people out there who would be
               most affected by this. For our children’s children, and 
               for those people out there whose voices have been drowned
               out by the politics of greed. I thank you all for this 
               amazing award tonight. Let us not take this planet for 
               granted. I do not take tonight for granted. Thank you so very much."""

dataSet = nltk.sent_tokenize(paragraph)

for i in range(len(dataSet)):
    dataSet[i] = dataSet[i].lower()
    dataSet[i] = re.sub(r'\W', ' ', dataSet[i])
    dataSet[i] = re.sub(r'\s+', ' ', dataSet[i])

word2count = {}
for data in dataSet:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1
Example No. 47
# CHUNKGRAMS

import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

text = """
Apple's iPhone revenue for the holiday quarter fell 15% from the same period a year ago, the company said after the markets closed Tuesday.

CEO Tim Cook blamed sales decline on a mix of factors, including a slowdown in China, foreign exchange rates, a popular battery replacement program and reduced smartphone subsidies from carriers.
"""
keywords = set()
sentences = nltk.sent_tokenize(text)
try:
    for i in sentences:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
        #             print (tagged)
        # chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        # use pipeline "|" to use more pattern
        # use plus "+" to add more tags to the pattern
        chunkGram = r"""NE:  {<NNP>+<NNP>?|<NNP|NN>+<CC.*|NN.*>+<NNP>}
                    {<NNP>}"""
        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        NE = [
            " ".join(w for w, t in ele) for ele in chunked
            if isinstance(ele, nltk.Tree)
        ]
        for i in NE:
            keywords.add(i)
except Exception as e:
Example No. 48
def get_data():
    url = "http://letsrant.azurewebsites.net/api/values"
    reader = codecs.getreader("utf-8")
    obj = json.load(reader(urlopen(url)))

    # Data assigning
    a = []
    b = []
    place_nnp = 'None'
    issue = 'none'
    for i in range(0,len(obj)):
        data = obj[i]['Tweet']

    # Tokenizer 

        tokenizer = RegexpTokenizer(r'\w+')

        stopWords = set(stopwords.words('english'))
        words = tokenizer.tokenize(data)
        wordsFiltered = []
        
        for w in words:
            if w not in stopWords:
                wordsFiltered.append(w)

        tagged = nltk.pos_tag(wordsFiltered)

        # Sentiment analysis

        # def sent():
        def word_feats(words_):
            return dict([(wor, True) for wor in words_])

        positive_vocab = [ 'awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)' ]
        negative_vocab = [ 'bad', 'terrible','useless', 'hate', ':(' ]
        neutral_vocab = [ 'movie','the','sound','was','is','actors','did','know','words','not' ]

        positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
        negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
        neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]

        train_set = negative_features + positive_features + neutral_features

        classifier = NaiveBayesClassifier.train(train_set) 

        neg = 0
        pos = 0
        sentence = obj[i]['Tweet']
        sentence = sentence.lower()
        words_ = sentence.split(' ')
        for wor in words_:
            classResult = classifier.classify( word_feats(wor))
            if classResult == 'neg':
                neg = neg + 1
            if classResult == 'pos':
                pos = pos + 1

        pos = str(float(pos)/len(words_))
        neg = str(float(neg)/len(words_))

        # coordinates assigning

        coordinates = obj[i]['coordinates']

        # Noun extraction

        sentences = nltk.sent_tokenize(data)   
        
        d = []
        for sent in sentences:
            d = d + nltk.pos_tag(nltk.word_tokenize(sent))
        
        for word in d: 
            if 'NNS' in word[1]: 
                issue = word
            # if 'NNS' not in word[1]:
            #     issue = 'null'
            if 'NNP' in word[1]: 
                place_nnp = word
            # if 'NNP' not in word[1]:
            #     issue = 'null'

        tweetid = obj[i]['TweetID']
        place = obj[i]['PlaceName']

        if place not in obj[i]:
            
            placename = place_nnp
        else:
            placename = place

        # JSON return

        a = {'tweetid':tweetid, 'place':placename, 'issue':issue, 'sentpos':pos, 'sentneg':neg, 'coordinates':coordinates}

        # print (a)

        b.append(a)
    return b
Example No. 49
def sentence_tokenize(text):
    """tokenize text into sentences after simple normalization"""
    return sent_tokenize(prenormalize(text))
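
`prenormalize` is not shown; a hedged guess at a minimal normalizer consistent with the docstring (whitespace cleanup only; the real helper may do more):

import re
from nltk import sent_tokenize

def prenormalize(text):
    # collapse runs of whitespace so the sentence splitter sees clean input
    return re.sub(r'\s+', ' ', text).strip()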
Example No. 50
def prepare_text(text):
    sentences = nltk.sent_tokenize(text)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    sentences = [NPChunker.parse(sent) for sent in sentences]
    return sentences
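NPChunker is not defined in this excerpt; a minimal assumption of how it might be built with a simple noun-phrase grammar, plus an illustrative call (the grammar and sample sentence are not from the original):

import nltk

# assumed noun-phrase grammar; the original NPChunker may differ
grammar = r"NP: {<DT>?<JJ>*<NN.*>+}"
NPChunker = nltk.RegexpParser(grammar)

trees = prepare_text("The quick brown fox jumped over the lazy dog.")
for tree in trees:
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        print(" ".join(word for word, tag in subtree.leaves()))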
Ejemplo n.º 51
0
from timex import tag
from timex import ground
import datetime
from datetime import date
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
import sys
# Python 2 only: reload() and sys.setdefaultencoding() do not exist in Python 3
reload(sys)
sys.setdefaultencoding('utf-8')

# read and store diary content as nltk text
summary = open('d47sum.txt', 'r')
rawSum = summary.read()
rawSum = rawSum.encode('ascii', errors='ignore').decode('ascii')  # drop non-ASCII characters, keep a str
sentences = nltk.sent_tokenize(rawSum)

# tokenize and tag content
tokens = [nltk.word_tokenize(sent) for sent in sentences]
tagged = [nltk.pos_tag(sent) for sent in tokens]
chunked = nltk.ne_chunk_sents(tagged, binary=True)


# extract named entities from content
# returns an nltk.tree.Tree object which needs to be traversed
# the Tree is a list, chunks are subtrees, and non chunked words are regular strings
def extract_entity_names(t):
    entity_names = []

    if hasattr(t, 'label') and t.label():
        if t.label() == 'NE' or t.label() == 'NP':
            entity_names.append(' '.join(child[0] for child in t))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))

    return entity_names
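A short usage sketch (not in the excerpt) that collects the named entities from the chunked diary sentences above:

entity_names = []
for tree in chunked:
    entity_names.extend(extract_entity_names(tree))
print(sorted(set(entity_names)))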
Ejemplo n.º 52
0
import string
import warnings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk import WordNetLemmatizer

host = '127.0.0.1'
port = 1232

warnings.filterwarnings('ignore')
nltk.download('popular', quiet=True)

# open the corpus file, i.e. the raw data
with open('data/bot.txt', 'r', encoding='utf8', errors='ignore') as fin:
    raws = fin.read().lower()  # read the raw data from the file and lowercase it

tokenSent = nltk.sent_tokenize(raws)
tokenWord = nltk.word_tokenize(raws)

# preprocess the text (raw data)
lmr = WordNetLemmatizer()


def lmTokens(tokens):
    return [lmr.lemmatize(token) for token in tokens]


removePunctDictionary = dict(
    (ord(punct), None) for punct in string.punctuation)


def lmNormalize(text):
    # lowercase, strip punctuation, word-tokenize, then lemmatize
    return lmTokens(nltk.word_tokenize(text.lower().translate(removePunctDictionary)))
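The excerpt stops here; a minimal sketch of the TF-IDF/cosine-similarity response step this kind of bot typically uses (the function name, threshold, and fallback wording are assumptions, not the original code):

def respond(user_input):
    sents = tokenSent + [user_input.lower()]
    vec = TfidfVectorizer(tokenizer=lmNormalize, stop_words='english')
    tfidf = vec.fit_transform(sents)
    sims = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
    best = int(sims.argmax())
    if sims[best] == 0:
        return "Sorry, I don't understand."
    return tokenSent[best]

print(respond('hello'))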
Ejemplo n.º 53
0
        line = line.replace("Mr. Weasley", "Arthur")
        line = line.replace("Mrs.", "Mistress")
        line = line.replace("Mr.", "Mister")
        text += line
"""neuralcoref doesn't do posession correctly. insert ['s] to concatenate later to character's name."""
i = 0
word_list = text.split()
for word in word_list:
    if word in posessive_pronouns:
        word_list.insert(i + 1, "['s]")
    i += 1

text = " ".join(word_list)

text = removePunctFromQuotes(text)
tokens = nltk.sent_tokenize(text)

sentences = list(tokens)
"""anaphora resolution at single sentence level"""
for sentence in sentences:
    doc = nlp(sentence)
    doc._.has_coref
    line = returnPunctFromQuotes(doc._.coref_resolved)
    f1.write(f"{line} \n")

f1.close()
print("single sentence completed")

final_file = f"harrypotter{hp_num}_final.txt"
Ejemplo n.º 54
0
                  ])  # use modified lexicon
nrc.drop_duplicates('word', inplace=True)
nrc['value'] = nrc['positive'] - nrc['negative']

# Load data from Mongo
mongo = Mongo('facebook', 'comments')
docs = [doc for doc in mongo.collection.find()]
mongo.close()
mongo_ids = [doc.pop('_id', None)
             for doc in docs]  # exclude mongo generated ids
docs = d_to_df(docs)
docs['created_time'] = pd.to_datetime(docs['created_time'],
                                      format="%Y-%m-%dT%H:%M:%S+0000")
docs.set_index('created_time', inplace=True)
docs.drop_duplicates(['message', 'user.name', 'post_id'], inplace=True)
docs['n_sents'] = docs.message.apply(lambda x: len(sent_tokenize(x)))
docs['n_words'] = docs.message.apply(lambda x: len(tokenize.word_tokenize(x)))
docs = docs[docs['n_sents'] != 0].copy()

mongo = Mongo('facebook', 'posts')
posts = [doc for doc in mongo.collection.find()]
mongo.close()
mongo_ids = [post.pop('_id', None)
             for post in posts]  # exclude mongo generated ids
posts = d_to_df(posts)
posts['created_time'] = pd.to_datetime(posts['created_time'],
                                       format="%Y-%m-%dT%H:%M:%S+0000")
posts.set_index('created_time', inplace=True)

# Calculating post title and message sentiment
posts['article_title'].fillna('', inplace=True)
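The excerpt ends before the sentiment calculation itself; a minimal sketch of how the per-word NRC values built above might be applied to each comment (the scoring function is an assumption, not the original code):

nrc_values = dict(zip(nrc['word'], nrc['value']))

def message_sentiment(message):
    # mean NRC value over the tokens that appear in the lexicon
    tokens = [t.lower() for t in tokenize.word_tokenize(message)]
    scores = [nrc_values[t] for t in tokens if t in nrc_values]
    return sum(scores) / len(scores) if scores else 0.0

docs['sentiment'] = docs.message.apply(message_sentiment)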
Ejemplo n.º 55
0
def sentence_tokenizer(text):
    token_list = nltk.sent_tokenize(text, "english")
    return token_list
Ejemplo n.º 56
0
from keras.preprocessing.text import Tokenizer, one_hot
from sklearn.preprocessing import OneHotEncoder
import nltk
import numpy as np

np.random.seed(42)

BATCH_SIZE = 128
NUM_EPOCHS = 20

lines = []
fin = open("../data/alice_in_wonderland.txt", "rb")
for line in fin:
    line = line.strip().decode("ascii", "ignore")  # keep as str so " ".join below works
    if len(line) == 0:
        continue
    lines.append(line)
fin.close()

sents = nltk.sent_tokenize(" ".join(lines))

tokenizer = Tokenizer(num_words=5000)  # use top 5000 words only
tokenizer.fit_on_texts(sents)  # fits in place; the method returns None
vocab_size = len(tokenizer.word_index) + 1

w_lefts, w_centers, w_rights = [], [], []
for sent in sents:
    embedding = one_hot(sent, vocab_size)
    triples = list(nltk.trigrams(embedding))
    w_lefts.extend([x[0] for x in triples])
    w_centers.extend([x[1] for x in triples])
    w_rights.extend([x[2] for x in triples])

ohe = OneHotEncoder(n_values=vocab_size)
Xleft = ohe.fit_transform(np.array(w_lefts).reshape(-1, 1)).todense()
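The excerpt stops after the left context; presumably the centre and right words are one-hot encoded the same way (a sketch of the likely continuation, not the original code):

Xcenter = ohe.fit_transform(np.array(w_centers).reshape(-1, 1)).todense()
Xright = ohe.fit_transform(np.array(w_rights).reshape(-1, 1)).todense()
print(Xleft.shape, Xcenter.shape, Xright.shape)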
Ejemplo n.º 57
0
               strong not only as a military power but also as an economic power. Both must go hand-in-hand. 
               My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of 
               space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear material.
               I was lucky to have worked with all three of them closely and consider this the great opportunity of my life. 
               I see four milestones in my career"""
               
               
# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
wordnet=WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
corpus = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
    
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
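For comparison, the same cleaned corpus can also be vectorized with TF-IDF weights instead of raw counts (a short sketch, not part of the original):

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features = 1500)
X_tfidf = tfidf_vectorizer.fit_transform(corpus).toarray()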

Ejemplo n.º 58
0
import nltk

# Source text: in short, Barack Obama leaves office
news_content='''At noon on Friday, 55-year old Barack Obama became a federal retiree.
His pension payment will be $207,800 for the upcoming year, about half of his presidential salary.
Obama and every other former president also get seven months of "transition" services to help adjust to post-presidential life. The ex-Commander in Chief also gets lifetime Secret Service protection as well as allowances for things such as travel, office expenses, communications and health care coverage.
All those extra expenses can really add up. In 2015 they ranged from a bit over $200,000 for Jimmy Carter to $800,000 for George W. Bush, according to a government report. Carter doesn't get health insurance because you have to work for the federal government for five years to qualify.
'''

# Tokenize, POS-tag, run NER, score each sentence, then rank sentences by score
results=[]
for sent_no,sentence in enumerate(nltk.sent_tokenize(news_content)):
    no_of_tokens=len(nltk.word_tokenize(sentence))
    # Let's do POS tagging
    tagged=nltk.pos_tag(nltk.word_tokenize(sentence))
    # Count the no of Nouns in the sentence
    no_of_nouns=len([word for word,pos in tagged if pos in ["NN","NNP"] ])
    #Use NER to tag the named entities.
    ners=nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)), binary=False)
    no_of_ners= len([chunk for chunk in ners if hasattr(chunk, 'label')])
    score=(no_of_ners+no_of_nouns)/float(no_of_tokens)
    results.append((sent_no,no_of_tokens,no_of_ners, no_of_nouns,score,sentence))

# Print the sentences in order of importance
for sent in sorted(results,key=lambda x: x[4],reverse=True):
    print(sent[5])
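A short follow-on sketch (not in the original) that keeps only the two highest-scoring sentences, in document order, as the summary:

top_two = sorted(results, key=lambda x: x[4], reverse=True)[:2]
summary = " ".join(sent[5] for sent in sorted(top_two, key=lambda x: x[0]))
print(summary)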
Ejemplo n.º 59
0
from nltk import sent_tokenize
from urllib import request
import random

url = "https://www.gutenberg.org/files/61236/61236-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
sentence = sent_tokenize(raw)

# result = random.choice(sentence)
print(random.choice(sentence))
Ejemplo n.º 60
0
data = [x.strip() for x in data]

jar = 'stanford-ner.jar'
model = 'english.all.3class.distsim.crf.ser.gz'

st = StanfordNERTagger(model, jar, encoding='utf8')

#tokenized_sents = [[nltk.word_tokenize(str(sent)) for sent in nltk.sent_tokenize(str(line))] for line in lines]
#classified_text = st.tag_sents(tokenized_sents)

tokenized_sents = []
persons = []
entities = []

for line in data:
    sentences = nltk.sent_tokenize(line)  # split each line into sentences
    for sentence in sentences:
        # word-tokenize the sentence itself, not the str() of the whole list
        tokens = nltk.word_tokenize(sentence)
        tokenized_sents.append(tokens)
        for word, pos in nltk.pos_tag(tokens):
            if pos == 'NNS' and word not in entities:
                entities.append(word)

classified_text = st.tag_sents(tokenized_sents)

for item in classified_text:
    for x, y in item:
        if (y == 'PERSON'):
            if (x not in persons):
                persons.append(str(x))