def __init__(self):
    """Set up the sentence segmentor, NE chunker, and stop-word filters.

    Builds:
      - ``self.segmentor``: decision-tree sentence segmentor (``ss.segmentor(1)``).
      - ``self.chunker``: nltk RegexpParser that chunks runs of proper nouns,
        optionally joined by a determiner, as ``NE``.
      - ``self.exclude``: English stopwords plus month/day names and their
        abbreviations, used to drop calendar words from candidate entities.
      - ``self.stopwords``: English stopwords plus capitalized articles.
      - ``self.HTMLTextBlobber``: HTML-to-plain-text extractor.
    """
    self.segmentor = ss.segmentor(1)
    # Proper-noun sequences optionally linked by a determiner
    # (e.g. "University of DT Michigan"-style NNP <DT> NNP runs).
    grammar = '''NE : {<NNP|NNPS>*<DT>?<NNP|NNPS>+} '''
    self.chunker = nltk.RegexpParser(grammar)
    self.phrases = None
    # BUGFIX: 'Spetember' was misspelled, so "September" was never excluded.
    months = ['January', 'Jan.', 'February', 'Feb.', 'March', 'Mar.',
              'April', 'Apr.', 'May', 'June', 'July', 'August', 'Aug.',
              'September', 'Sept.', 'October', 'Oct.',
              'November', 'Nov.', 'December', 'Dec.']
    days = ['Sunday', 'Sun.', 'Tuesday', 'Tu.', 'Tue.', 'Tues.',
            'Thursday', 'Th.', 'Thurs.', 'Thur.', 'Thu.', 'Monday',
            'Mon.', 'Wednesday', 'Wed.', 'Friday', 'Fri.',
            'Saturday', 'Sat.']
    self.exclude = set(stopwords.words('english') + months + days)
    self.stopwords = stopwords.words('english') + ['A', 'An']
    self.HTMLTextBlobber = usefulText.HTMLTextBlob()
def __init__(self):
    """Initialize the extractor: segmentor, NE chunker, and word filters.

    Attributes set here:
      - ``self.segmentor``: decision-tree sentence segmentor (``ss.segmentor(1)``).
      - ``self.chunker``: nltk RegexpParser chunking proper-noun runs
        (optionally joined by a determiner) under the ``NE`` label.
      - ``self.exclude``: English stopwords + month/day names (with their
        abbreviations) excluded from entity candidates.
      - ``self.stopwords``: English stopwords + capitalized articles.
      - ``self.HTMLTextBlobber``: HTML-to-plain-text extractor.
    """
    self.segmentor = ss.segmentor(1)
    # NNP runs, optionally bridged by a determiner, are chunked as NE.
    grammar = '''NE : {<NNP|NNPS>*<DT>?<NNP|NNPS>+} '''
    self.chunker = nltk.RegexpParser(grammar)
    self.phrases = None
    # BUGFIX: 'Spetember' was misspelled, so "September" was never excluded.
    months = ['January', 'Jan.', 'February', 'Feb.', 'March', 'Mar.',
              'April', 'Apr.', 'May', 'June', 'July', 'August', 'Aug.',
              'September', 'Sept.', 'October', 'Oct.',
              'November', 'Nov.', 'December', 'Dec.']
    days = ['Sunday', 'Sun.', 'Tuesday', 'Tu.', 'Tue.', 'Tues.',
            'Thursday', 'Th.', 'Thurs.', 'Thur.', 'Thu.', 'Monday',
            'Mon.', 'Wednesday', 'Wed.', 'Friday', 'Fri.',
            'Saturday', 'Sat.']
    self.exclude = set(stopwords.words('english') + months + days)
    self.stopwords = stopwords.words('english') + ['A', 'An']
    self.HTMLTextBlobber = usefulText.HTMLTextBlob()
## Build a small corpus of NYT articles from MongoDB and segment one body text.
import pymongo, nltk
from collections import defaultdict
import ast
import sys
sys.path.append('../version0.0/')
import segment_sentence as ss

p = pymongo.Connection().articles.collection_1
corpus_pointer = p.find()  # cursor object

## Query: collect id/date for articles carrying any of these classifier tags.
WANTED_TAGS = set(['ACQUIRED IMMUNE\nDEFICIENCY SYNDROME (AIDS)',
                   'ACQUIRED IMMUNE',
                   'ACORN STAKES (HORSE RACE)',
                   'ADDENDA'])
l = []
for i in p.find():
    try:
        if WANTED_TAGS.intersection(set(i['metadata']['indexing_service']['classifier'])):
            l.append({'id': i['_id'],
                      'year': i['metadata']['date']['year'],
                      'day_of_month': i['metadata']['date']['day_of_month'],
                      'month': i['metadata']['date']['month']})
    except (KeyError, TypeError):
        # Best-effort: some documents lack the classifier or date fields.
        pass

# BUGFIX: the original read l[0] before the query above populated l; the
# query now runs first.
# NOTE(review): entries appended to l hold only id/date keys, so
# l[0]['metadata']['body'] looks stale — probably should re-fetch the full
# document by id. Preserved as-is; confirm against the original notebook.
text = l[0]['metadata']['body']
segmentor = ss.segmentor(1)  # trained with a decision-tree classifier
# SECURITY: ast.literal_eval instead of eval() on the segmentor's string output.
sentences = ast.literal_eval(segmentor.segment(text, rtype=0))['sentences']
corpus = defaultdict(list)
## Getting tags
## Segment article text into sentences, then count frequent noun phrases.
import sys


def isProper(sentence):
    """Return True if *sentence* looks like real prose.

    Rejects very short fragments (5 characters or fewer) and table-like
    lines containing a '|' separator.
    """
    if len(sentence) <= 5:
        return False
    if '|' in sentence:
        return False
    return True  ## add some more


if __name__ == '__main__':
    sys.path.append('../version0.0/')
    import segment_sentence as ss
    import ast
    from textblob import TextBlob

    segmentor = ss.segmentor(1)
    # SECURITY: ast.literal_eval instead of eval() on the segmentor's output;
    # the trailing 5 boilerplate "sentences" are dropped as before.
    sentences = ast.literal_eval(segmentor.segment(text, 0))['sentences'][:-5]
    sentences = [s for s in sentences if isProper(s)]

    #### textblob noun-phrase extraction
    tb = TextBlob(text)
    noun_phrases = tb.noun_phrases
    n_counts = {}
    for phrase in noun_phrases:
        # Keep phrases whose every word has more than 2 alphabetic characters.
        if all(len([ch for ch in word if ch.isalpha()]) > 2 for word in phrase.split()):
            n_counts[phrase] = n_counts.get(phrase, 0) + 1
    # BUGFIX: the original used n_counts.iteritems() (Python 2 only) with
    # key=lambda n: -n_counts[n], which indexes the dict with a
    # (phrase, count) tuple and raises KeyError. Sort by the count instead.
    sorted_n = sorted(n_counts.items(), key=lambda kv: -kv[1])[:10]
def __init__(self, url):
    """Fetch *url*, strip its HTML to plain text, and set up a segmentor.

    Args:
        url: Address of the page to download.
    """
    # BUGFIX: requests.get() was called without the url argument, which
    # raises TypeError at runtime.
    r = requests.get(url)
    html = r.content
    # Ignore undecodable bytes so pages with broken encodings don't crash us.
    self.text = usefulText.extract_text(html).decode('utf-8', 'ignore')
    ## Scope of improvement
    self.segmentor = ss.segmentor(1)
if request.method == "POST": para = get().decode('utf-8', 'ignore') return segmentor.segment(para, rtype =0) else: return "Only POST requests are accepted. No text found. Try Again...\n" @app.route('/segment_text_return_sentences', methods=['GET', 'POST']) def process_request2(): if request.method == "POST": para = get().decode('utf-8', 'ignore') return segmentor.segment(para, rtype = 1) # else: # return "Only POST requests are accepted. No text found. Try Again...\n" @app.route('/extract_entity', methods=['GET', 'POST']) def process_request4(): if request.method == "POST": sentence= get().decode('utf-8', 'ignore') return entity_extractor.extract(sentence) else: return "Only POST requests are accepted. No text found. Try Again...\n" if __name__ == '__main__': key_extractor = key_score.keyword_extractor() segmentor = segment_sentence.segmentor(1) entity_extractor = extractor.entity_Extractor() app.debug=True app.run(host='0.0.0.0', port =8888) #, use_reloader= False) # Without app.reloader it will run twice. and it will not debug
## Build a small corpus of NYT articles from MongoDB, segment one body text,
## then start reading the tags file.
import pymongo, nltk
from collections import defaultdict
import ast
import sys
sys.path.append('../version0.0/')
import segment_sentence as ss

p = pymongo.Connection().articles.collection_1
corpus_pointer = p.find()  # cursor object

## Query: collect id/date for articles carrying any of these classifier tags.
WANTED_TAGS = set(['ACQUIRED IMMUNE\nDEFICIENCY SYNDROME (AIDS)',
                   'ACQUIRED IMMUNE',
                   'ACORN STAKES (HORSE RACE)',
                   'ADDENDA'])
l = []
for i in p.find():
    try:
        if WANTED_TAGS.intersection(set(i['metadata']['indexing_service']['classifier'])):
            l.append({'id': i['_id'],
                      'year': i['metadata']['date']['year'],
                      'day_of_month': i['metadata']['date']['day_of_month'],
                      'month': i['metadata']['date']['month']})
    except (KeyError, TypeError):
        # Best-effort: some documents lack the classifier or date fields.
        pass

# BUGFIX: the original read l[0] before the query above populated l; the
# query now runs first.
# NOTE(review): entries appended to l hold only id/date keys, so
# l[0]['metadata']['body'] looks stale — probably should re-fetch the full
# document by id. Preserved as-is; confirm against the original notebook.
text = l[0]['metadata']['body']
segmentor = ss.segmentor(1)  # trained with a decision-tree classifier
# SECURITY: ast.literal_eval instead of eval() on the segmentor's string output.
sentences = ast.literal_eval(segmentor.segment(text, rtype=0))['sentences']
corpus = defaultdict(list)

## Getting tags: skip the two header lines of tags.txt.
# NOTE(review): f is never closed here; it may still be read past this chunk,
# so it is left open — wrap in `with` once the full usage is visible.
f = open('tags.txt', 'r')
f.readline()
f.readline()