def getArticles(articleList):
    """Build a {'title', 'sentences', 'year'} dict for each article tuple.

    For every (year, title) pair in articleList, extract the Stanford Open IE
    tags for the title, fetch the Wikipedia article for the tagged subject,
    and keep only the sentences that contain a date expression.

    Articles with no tags are skipped; articles that raise during processing
    are skipped as well (best-effort batch semantics).
    """
    singleSets = []
    for article in articleList:
        try:
            chunks = gc.getChunks(article[1])
            tags = tag.getTags(article[1], chunks)
            if tags == []:
                continue  # no Open IE tags -> nothing to extract for this article
            # The Stanford Open IE tags
            subject = tags['subject']
            relation = tags['relation']
            objects = tags['object']
            objects = objects.split()
            content = wp.getArticle(subject)
            rawSentences = sent.getSentences(content)
            sentences = []
            for sentence in rawSentences:
                # keep only sentences carrying at least one date expression
                # BUGFIX: was `!==`, which is a syntax error in Python
                if hd.hasDate(sentence) != []:
                    sentences.append(sentence)
            # NOTE(review): listOfYears is presumably a module-level list
            # defined elsewhere in the file -- confirm it exists.
            listOfYears.append(article[0])
            SS = {'title': article[1], 'sentences': sentences, 'year': article[0]}
            singleSets.append(SS)
        except Exception:
            # Best-effort: skip articles that fail rather than aborting the run.
            # (Was a bare `except: pass`, which also hid the `!==` class of bug.)
            continue
    return singleSets
def getArticle(article):
    """Return a {'title', 'sentences', 'year'} dict for one (year, title) tuple,
    or None if processing fails.

    Tags the title with Stanford Open IE, preferring the LAST tag set and
    falling back to the FIRST if extraction or the article fetch fails, then
    fetches the Wikipedia article for the tagged subject and keeps the
    (deduplicated) sentences that mention any tagged object word or any
    inflected form of the relation.
    """
    def _extract(tagEntry):
        # Pull subject / relation / objects out of one Open IE tag dict and
        # expand the lemmatized relation into its inflected forms via pattern.en.
        subject = tagEntry['subject']
        relation = tagEntry['relation']
        objects = tagEntry['object'].split(' ')
        lemma = nltk.stem.wordnet.WordNetLemmatizer().lemmatize(relation)
        relations = en.lexeme(lemma)
        return subject, relations, objects

    try:
        tags = tag.getTags(article[1])
        try:
            # Prefer the last tag set ...
            subject, relations, objects = _extract(tags[-1])
            content = wp.getArticle(subject)
        except Exception:
            # ... and fall back to the first one if that fails.
            subject, relations, objects = _extract(tags[0])
            content = wp.getArticle(subject)
        rawSentences = nltk.tokenize.sent_tokenize(content)
        sentences = []
        for sentence in rawSentences:
            for word in objects:
                if word in sentence:
                    sentences.append(sentence)
            for word in relations:
                if word in sentence:
                    sentences.append(sentence)
        sentences = list(set(sentences))  # dedupe; order is not significant
        return {'title': article[1], 'sentences': sentences, 'year': article[0]}
    except Exception:
        # Preserve the original best-effort contract: None on any failure.
        # (Was a bare `except: return`.)
        return None
import app.analytics.tag as tag
import app.parser.articleRetrieval.wikipediaParse as wp
import app.parser.sentences as sent
import app.analytics.sentenceFiltering.actionSentences as action
import app.analytics.functions.hasDate as hd
import app.analytics.functions.synonym as sn
import app.analytics.getFeatures as ft

# Driver: tag the first 10 articles, fetch each subject's Wikipedia page,
# and compute features from the matching sentences.
# NOTE(review): `importArticles` and `gc` are used below but never imported
# in this chunk -- presumably imported elsewhere in the file; confirm.
articles = importArticles.getData()
sentences = []
count = 0
for article in articles[0:10]:
    print(article)
    chunks = gc.getChunks(article[1])
    tags = tag.getTags(article[1], chunks)
    if tags == []:
        continue  # no Open IE tags for this title -> skip it
    # The Stanford Open IE tags
    subject = tags['subject']
    relation = tags['relation']
    objects = tags['object'].split()
    print(objects)
    print(relation)
    print(subject)
    # BUGFIX(readability): keep the loop variable intact instead of shadowing
    # `article` with the fetched page content (behavior is unchanged -- the
    # loop rebinds `article` each iteration -- but the shadowing was confusing).
    content = wp.getArticle(subject)
    sentences = sent.getSentences(content)
    features = ft.getFeatures(subject, objects, relation, sentences)