Example #1
0
    def __init__(self):
        self.segmentor = ss.segmentor(1)
        grammar = '''NE : {<NNP|NNPS>*<DT>?<NNP|NNPS>+}
						 '''
        self.chunker = nltk.RegexpParser(grammar)

        self.phrases = None
        months = ['January','Jan.', 'February','Feb.','March','Mar.', 'April','Apr.', 'May', 'June', 'July','August', 'Aug.',\
         'Spetember','Sept.','October','Oct.', 'November','Nov.','December','Dec.' ]
        days = ['Sunday', 'Sun.', 'Tuesday', 'Tu.', 'Tue.', 'Tues.', 'Thursday','Th.', 'Thurs.', 'Thur.', 'Thu.', 'Monday',\
        'Mon.', 'Wednesday','Wed.', 'Friday','Fri.', 'Saturday','Sat.']
        self.exclude = set(stopwords.words('english') + months + days)
        self.stopwords = stopwords.words('english') + ['A', 'An']

        self.HTMLTextBlobber = usefulText.HTMLTextBlob()
Example #2
0
	def __init__(self):
		"""Set up the sentence segmenter, NP chunker and stop-word filters.

		Builds: a decision-tree sentence segmenter, an NLTK regexp chunker
		for proper-noun runs, exclusion sets of stopwords plus month/day
		names, and an HTML-to-text helper.
		"""
		self.segmentor = ss.segmentor(1)
		# Chunk runs of proper nouns, optionally joined by a determiner.
		grammar = '''NE : {<NNP|NNPS>*<DT>?<NNP|NNPS>+}
						 '''
		self.chunker = nltk.RegexpParser(grammar)

		self.phrases = None
		# Month/day names and common abbreviations are excluded from
		# candidate entities.
		# BUG FIX: the original misspelled 'September' as 'Spetember', so
		# 'September' was never filtered out.
		months = ['January', 'Jan.', 'February', 'Feb.', 'March', 'Mar.',
				  'April', 'Apr.', 'May', 'June', 'July', 'August', 'Aug.',
				  'September', 'Sept.', 'October', 'Oct.',
				  'November', 'Nov.', 'December', 'Dec.']
		days = ['Sunday', 'Sun.', 'Monday', 'Mon.', 'Tuesday', 'Tu.',
				'Tue.', 'Tues.', 'Wednesday', 'Wed.', 'Thursday', 'Th.',
				'Thurs.', 'Thur.', 'Thu.', 'Friday', 'Fri.',
				'Saturday', 'Sat.']
		self.exclude = set(stopwords.words('english') + months + days)
		# Capitalised articles also count as stopwords when scoring phrases.
		self.stopwords = stopwords.words('english') + ['A', 'An']

		self.HTMLTextBlobber = usefulText.HTMLTextBlob()
Example #3
0
import pymongo, nltk
from collections import defaultdict

import sys
sys.path.append('../version0.0/')
import segment_sentence as ss

# Connect to the article store; find() returns a lazy cursor.
p = pymongo.Connection().articles.collection_1
corpus_pointer = p.find()  # cursor object

## Query
# BUG FIX: the original read l[0] *before* l was defined, which raised a
# NameError.  Build the query result list first.
l = []
for i in p.find():
    try:
        if set([
                'ACQUIRED IMMUNE\nDEFICIENCY SYNDROME (AIDS)',
                'ACQUIRED IMMUNE', 'ACORN STAKES (HORSE RACE)', 'ADDENDA'
        ]).intersection(set(i['metadata']['indexing_service']['classifier'])):
            l.append({'id': i['_id'],
                      'year': i['metadata']['date']['year'],
                      'day_of_month': i['metadata']['date']['day_of_month'],
                      'month': i['metadata']['date']['month']})
    except KeyError:
        # Articles missing the indexing_service/classifier fields are skipped.
        pass

text = l[0]['metadata']['body']
segmentor = ss.segmentor(1)  # train by decision tree classifier
# NOTE(review): segment() appears to return a dict literal as a string;
# eval() on it is unsafe if the article text is untrusted -- consider
# having segment() return a dict (or JSON) instead.
sentences = eval(segmentor.segment(text, rtype=0))['sentences']

corpus = defaultdict(list)

## Getting tags
Example #4
0
## segment into sentences
import sys
sys.path.append('../version0.0/')
import segment_sentence as ss


def isProper(sentence):
	"""Return True when *sentence* looks like clean prose worth keeping.

	Rejects very short strings (5 characters or fewer) and anything
	containing a pipe character (table/markup debris).
	"""
	## add some more
	return len(sentence) > 5 and '|' not in sentence

segmentor = ss.segmentor(1)
# Drop the trailing five entries (boilerplate) and keep only clean sentences.
# NOTE(review): eval() on segment() output is unsafe for untrusted text.
sentences = eval(segmentor.segment(text, 0))['sentences'][:-5]
sentences = [sentence for sentence in sentences if isProper(sentence)]

#### textblob
from textblob import TextBlob
tb = TextBlob(text)
np = tb.noun_phrases
# Count noun phrases in which every word has at least 3 alphabetic chars.
n_counts = {}
for n in np:
	if all(len([ch for ch in word if ch.isalpha()]) > 2 for word in n.split()):
		n_counts[n] = n_counts.get(n, 0) + 1
# BUG FIX: the original sorted n_counts.iteritems() with
# key=lambda n: -n_counts[n]; each item is a (phrase, count) tuple, so
# indexing the dict with it raised KeyError.  Sort by the count itself.
# (.items() also works on Python 3, unlike the Python-2-only .iteritems().)
sorted_n = sorted(n_counts.items(), key=lambda kv: -kv[1])[:10]


Example #5
0
	def __init__(self, url):
		"""Fetch *url*, extract its visible text and prepare a segmenter.

		url -- address of the page to download and process.
		"""
		# BUG FIX: the original called requests.get() without passing the
		# url, which raises a TypeError.
		r = requests.get(url)
		html = r.content
		self.text = usefulText.extract_text(html).decode('utf-8', 'ignore')  ## Scope of improvement
		self.segmentor = ss.segmentor(1)
Example #6
0
	if request.method == "POST":
		para = get().decode('utf-8', 'ignore')
		return segmentor.segment(para, rtype =0)
	else:
		return "Only POST requests are accepted. No text found. Try Again...\n"

@app.route('/segment_text_return_sentences', methods=['GET', 'POST'])
def process_request2():
	"""Segment POSTed text and return the sentence list (rtype=1)."""
	if request.method == "POST":
		para = get().decode('utf-8', 'ignore')
		# BUG FIX: this return line mixed spaces and tabs (a TabError on
		# Python 3), and the else branch was commented out so GET requests
		# returned None.  Both restored, matching the sibling handlers.
		return segmentor.segment(para, rtype=1)
	else:
		return "Only POST requests are accepted. No text found. Try Again...\n"


@app.route('/extract_entity', methods=['GET', 'POST'])
def process_request4():
	"""Run entity extraction on POSTed text; reject other methods."""
	if request.method != "POST":
		return "Only POST requests are accepted. No text found. Try Again...\n"
	sentence = get().decode('utf-8', 'ignore')
	return entity_extractor.extract(sentence)


if __name__ == '__main__':
	# Build the shared service objects used by the route handlers above.
	key_extractor = key_score.keyword_extractor()
	segmentor = segment_sentence.segmentor(1)
	entity_extractor = extractor.entity_Extractor()
	app.debug = True
	# BUG FIX: this line started with a stray space before the tab, which
	# is a TabError on Python 3.  With debug on, the reloader runs the app
	# twice unless use_reloader=False is passed.
	app.run(host='0.0.0.0', port=8888)  # , use_reloader=False
 
Example #7
0
import pymongo, nltk
from collections import defaultdict

import sys
sys.path.append('../version0.0/')
import segment_sentence as ss

# Connect to the article store; find() returns a lazy cursor.
p = pymongo.Connection().articles.collection_1
corpus_pointer = p.find()  # cursor object

## Query
# BUG FIX: the original read l[0] *before* l was defined, which raised a
# NameError.  Build the query result list first.
l = []
for i in p.find():
	try:
		if set(['ACQUIRED IMMUNE\nDEFICIENCY SYNDROME (AIDS)', 'ACQUIRED IMMUNE', 'ACORN STAKES (HORSE RACE)', 'ADDENDA']).intersection(set(i['metadata']['indexing_service']['classifier'])):
			l.append({'id': i['_id'],
					  'year': i['metadata']['date']['year'],
					  'day_of_month': i['metadata']['date']['day_of_month'],
					  'month': i['metadata']['date']['month']})
	except KeyError:
		# Articles missing the indexing_service/classifier fields are skipped.
		pass

text = l[0]['metadata']['body']
segmentor = ss.segmentor(1)  # train by decision tree classifier
# NOTE(review): eval() on segment() output is unsafe for untrusted text.
sentences = eval(segmentor.segment(text, rtype=0))['sentences']

corpus = defaultdict(list)

## Getting tags
f= open('tags.txt', 'r')
f.readline()
f.readline()