def getTimelineTweets(self, date, max_tweets, topic):
    '''
    :param date: python datetime object
    :param max_tweets: Maximum number of tweets to return
    :param topic: Topic to search for
    :return: List of tuples, tuple format (date, tweet)
    '''
    from_date = date - datetime.timedelta(days=1)
    from_date = from_date.strftime("%Y-%m-%d")
    to_date = date + datetime.timedelta(days=1)
    to_date = to_date.strftime("%Y-%m-%d")
    curr_tweets = 0
    tweets = []
    for tweet in tweepy.Cursor(self.api.search, q=topic + ' -filter:retweets',
                               since=from_date, until=to_date).items():
        tweet_date = tweet._json['created_at']
        tweet_text = tweet._json['text']
        # Keep only tweets containing a time expression: tag() adds <TIMEX2>
        # markup, so the tagged text is longer when one is found.
        if len(tag(tweet_text)) > len(tweet_text):
            tweets.append((tweet_date, tweet_text))
            curr_tweets += 1
        if curr_tweets >= max_tweets:
            break
    return tweets
def process_doc(single_doc, q_type, doc_num):
    # in order of type constants (0: who, 1: where, 2: when)
    # BASELINE ONLY!
    # nltk NER does not handle "when" questions, so we use timex.py to tag time expressions
    # https://github.com/nltk/nltk_contrib/blob/master/nltk_contrib/timex.py
    if q_type == WHEN_TYPE:
        sentences = nltk.tokenize.sent_tokenize(single_doc)
        surviving_sentences = []
        for sentence in sentences:
            sentence_after_tagging = timex.tag(sentence)
            if sentence_after_tagging.find('<TIMEX2>') != -1:
                surviving_sentences.append((doc_num, sentence))
        return surviving_sentences
    # http://nbviewer.jupyter.org/github/gmonce/nltk_parsing/blob/master/1.%20NLTK%20Syntax%20Trees.ipynb
    else:
        sentences = nltk.tokenize.sent_tokenize(single_doc)
        surviving_sentences = []
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            pos_tag = nltk.pos_tag(words)
            # this is an nltk tree
            # reference: http://www.nltk.org/howto/tree.html
            ner_tree = nltk.ne_chunk(pos_tag)
            # whether this sentence contains an NER tag matching the question type
            contains_tag = False
            for subtree in ner_tree.subtrees():
                if subtree.label() in NER_TAG[q_type]:
                    contains_tag = True
            if contains_tag:
                surviving_sentences.append((doc_num, sentence))
        #print surviving_sentences
        return surviving_sentences
def answer_processing(s_tuple, q_type, q_keywords):
    #print "DOING ANSWER_PROCESSING"
    sentences = s_tuple
    # http://nbviewer.jupyter.org/github/gmonce/nltk_parsing/blob/master/1.%20NLTK%20Syntax%20Trees.ipynb
    # in string
    answers = []
    # NEED TO ACCOUNT FOR CASES IN WHICH THERE ARE FEWER THAN 5 ANSWERS
    num_answers_needed = 5 - len(sentences)
    if num_answers_needed > 0:
        for i in range(0, num_answers_needed):
            sentences.append(('100', 'nil'))
    for i in range(0, len(sentences)):
        doc_num = sentences[i][0]
        sentence = sentences[i][1]
        if q_type == WHEN_TYPE:
            sentence_after_tagging = timex.tag(sentence)
            when_answers = re.findall('<TIMEX2>(.*?)</TIMEX2>', sentence_after_tagging)
            # in case no time expression is found, output 'nil'
            when_answer = when_answers[0] if len(when_answers) != 0 else 'nil'
            answers.append((doc_num, when_answer))
        else:
            words = nltk.word_tokenize(sentence)
            pos_tag = nltk.pos_tag(words)
            ner_tree = nltk.ne_chunk(pos_tag)
            #print ner_tree
            # the list of tuples ((word, pos), ner) to be considered for this sentence
            matching_tuples = []
            # print q_keywords
            global subtree
            tmp = []
            for subtree in ner_tree.subtrees():
                if subtree.label() in NER_TAG[q_type] and subtree.pos()[0][0][1] == 'NNP':
                    word = ' '.join(map(lambda x: x[0][0], subtree.pos()))
                    print word
                    print q_keywords
                    iskwin = map(lambda x: x in word, q_keywords)
                    if not any(iskwin):
                        # print "SUBTREE!", subtree
                        # matching_tuples = subtree.pos()
                        answer = ' '.join(map(lambda x: x[0][0], subtree.pos()))
                        if answer not in map(lambda x: x[1], answers):
                            tmp.append((doc_num, answer))
            k_sorted = sort_keywords(sentence, tmp, q_keywords)
            answers += k_sorted
            print answers
            print "SENTENCE : ", sentence, "ANSWER : ", tmp
            # t : ((word, pos), ner)
            # answer = ''
            # for t in matching_tuples:
            #     #print t
            #     if t[0][0] not in q_keywords:
            #         answer += t[0][0] + ' '
            # # remove any possible trailing whitespace
            # answer = answer.rstrip()
            # answers.append((doc_num, answer))
    print answers
    return answers
def timex_parse(content, base_time=gmt()):
    """Timex tagger using the timex module.

    Ripped from nltk_contrib, which uses regex.
    Note: the default base_time is evaluated once at import time.
    """
    tagged_text = tag(content)
    injected_base_text = ground(tagged_text, base_time)
    return injected_base_text
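The snippets here all follow the same nltk_contrib-style timex pattern: tag() wraps time expressions in <TIMEX2> tags and ground() resolves them against a base date. Below is a minimal sketch of that round trip, assuming the tag/ground/gmt helpers behave as they do in the surrounding examples; the sample sentence and printed output are illustrative only.

import re
from timex import tag, ground, gmt  # assumed module layout, as in the snippets above

sample = "The deal was announced last Tuesday and closes next month."
tagged = tag(sample)              # e.g. "... announced <TIMEX2>last Tuesday</TIMEX2> ..."
grounded = ground(tagged, gmt())  # ground() anchors relative expressions to the base date

# Most of the snippets recover the tagged expressions with the same regex:
print(re.findall(r'<TIMEX2>(.*?)</TIMEX2>', tagged))  # e.g. ['last Tuesday', 'next month']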
def get_date(result_entry, process_date):
    """
    Function to extract a date from a story.

    First checks for a date from the RSS feed itself. Then tries to pull a
    date from the first two sentences of a story. Finally falls back to the
    date that the story was added to the database. For dates pulled from the
    story, the function checks whether the difference is greater than one day
    from the date that the pipeline is parsing.

    Parameters
    ----------
    result_entry: Dictionary.
                  Record of a single result from the web scraper.

    process_date: datetime object.
                  Datetime object indicating which date the pipeline is
                  processing. Standard is date_running - 1 day.

    Returns
    -------
    date : String.
           Date string in the form YYMMDD.
    """
    date_obj = ''
    if result_entry['date']:
        try:
            date_obj = parser.parse(result_entry['date'])
        except TypeError:
            date_obj = ''
    else:
        date_obj = ''

    if not date_obj:
        tagged = timex.tag(result_entry['content'][:2])
        dates = re.findall(r'<TIMEX2>(.*?)</TIMEX2>', tagged)
        if dates:
            try:
                date_obj = parser.parse(dates[0])
                diff_check = _check_date(date_obj, process_date)
                if diff_check:
                    date_obj = ''
            except TypeError:
                date_obj = ''
        else:
            date_obj = ''

    if not date_obj:
        date_obj = result_entry['date_added']

    date = '{}{:02d}{:02d}'.format(str(date_obj.year)[2:], date_obj.month,
                                   date_obj.day)

    return date
def answer_processing(s_tuple, q_type, q_keywords):
    sentences = s_tuple
    # http://nbviewer.jupyter.org/github/gmonce/nltk_parsing/blob/master/1.%20NLTK%20Syntax%20Trees.ipynb
    # in string
    answers = []
    # NEED TO ACCOUNT FOR CASES IN WHICH THERE ARE FEWER THAN 5 ANSWERS
    num_answers_needed = 5 - len(sentences)
    if num_answers_needed > 0:
        for i in range(0, num_answers_needed):
            sentences.append(('100', 'nil'))
    for i in range(0, 5):
        doc_num = sentences[i][0]
        sentence = sentences[i][1]
        # nltk NER doesn't work with when_type, so we use timex.py
        if q_type == WHEN_TYPE:
            sentence_after_tagging = timex.tag(sentence)
            when_answers = re.findall('<TIMEX2>(.*?)</TIMEX2>', sentence_after_tagging)
            # in case no time expression is found, output an empty string
            when_answer = when_answers[0] if len(when_answers) != 0 else ''
            answers.append((doc_num, when_answer))
        else:
            words = nltk.word_tokenize(sentence)
            pos_tag = nltk.pos_tag(words)
            ner_tree = nltk.ne_chunk(pos_tag)
            #print ner_tree
            # the list of tuples ((word, pos), ner) to be considered for this sentence
            matching_tuples = []
            for subtree in ner_tree.subtrees():
                if subtree.label() in NER_TAG[q_type] and subtree.pos()[0][0][1] == 'NNP':
                    print subtree
                    iskwin = map(lambda x: x in subtree.pos()[0][0][0], q_keywords)
                    if not any(iskwin):
                        matching_tuples = subtree.pos()
            # t : ((word, pos), ner)
            answer = ''
            for t in matching_tuples:
                #print t
                if t[0][0] not in q_keywords:
                    answer += t[0][0] + ' '
            # remove any possible trailing whitespace
            answer = answer.rstrip()
            answers.append((doc_num, answer))
    print answers
    return answers
def performTagging(featureObjects):
    taggedLines = []
    for obj in featureObjects:
        taggedLine = ""
        try:
            taggedLine = timex.tag(
                obj.getLexicalFeatures().getSpellCorrection().lower())
            taggedLine = timex.ground(taggedLine, timex.gmt())
        except:
            taggedLine = ""
        if not Utilities.isEmpty(taggedLine):
            obj.getSyntacticFeatures().setTemporalTag(
                Utilities.firstMatching(TIMEX_TAG_REGEX, taggedLine))
        taggedLines.append(obj)
    return taggedLines
def getTime(newsid):
    """
    Get the latest date mentioned in the article, in Unix time
    (i.e. seconds since 1970/1/1).
    """
    flag = 0
    fp = open('database/' + newsid)
    soup = BeautifulSoup(fp)
    tagtimes = list()
    #content += soup.find('title').get_text()
    for i in soup.findAll('p'):
        text = i.get_text()
        # content += text.strip() + "\n"
        try:
            tagged = timex.ground(timex.tag(text), getBasetime(newsid))
        except ValueError:
            continue
        soup2 = BeautifulSoup(tagged)
        if soup2.timex2 != None:
            flag = 1
            for i in soup2.findAll('timex2'):
                try:
                    # print "tagged time: " + str(i)
                    timestr = i['val']
                except KeyError:
                    print "Error tagged time: " + str(i)
                    continue
                if timestr != 'UNKNOWN':
                    try:
                        tagtimes.append(int(dateutil.parser.parse(timestr).strftime('%s')))
                    except ValueError:
                        continue
                else:
                    print i
    if flag == 0:
        # print("OMG no tags!")
        randtime = random.randint(1370016000, 1416758400)
        print("Fail to get timetag from news " + str(newsid) + ", assign " + str(randtime))
        return int(randtime)
    else:
        if tagtimes != list():
            print("Time prediction for news " + str(newsid) + ": " + str(np.array(tagtimes).min()))
            return np.array(tagtimes).min()
        else:
            randtime = random.randint(1370016000, 1416758400)
            print("Fail to get timetag from news " + str(newsid) + ", assign " + str(randtime))
            return int(randtime)
def timexWrapper(text):
    """
    wrap timex
    @type text: list [word] (ordered by index)
    @param text: the text to be tagged with time expressions
    @rtype tuple (list [TimeExpression], list [word])
    @return list of time expressions extracted from text and list of unmatched words
    """
    text_str = " ".join([x.word for x in text])
    timeExpressions = []
    uncovered_tokens = [[x, False] for x in text]
    try:
        # timex's ground function isn't reliable
        ground_res = ground(tag(text_str), gmt())
    except:
        return ([], text)

    for s, val in ground_res[1]:
        textInd = text_str.find(s)
        curText = []
        numOfItems = len(s.split())
        startInd = len(text_str[:textInd].split())
        if textInd > 0:
            if text_str[textInd - 1] != " ":
                # deal with time expressions starting in the middle of words
                startInd -= 1
        for i in range(startInd, startInd + numOfItems):
            uncovered_tokens[i][1] = True
            curText.append(text[i])
        timeExpressions.append(TimeExpression(curText, val))

    iterList = list(enumerate([x for x in uncovered_tokens]))
    iterList.reverse()
    for i, (x, flag) in iterList:
        if flag:
            del uncovered_tokens[i]
        else:
            uncovered_tokens[i] = x

    return timeExpressions, uncovered_tokens
def main():
    data_path = "./origin_data/riedel/nyt-2005-2006.backup/"
    output_path = "./data/"
    with open(output_path + "processed.txt", "wb") as fout:
        outputs = []
        for item in os.listdir(data_path):
            with open(data_path + item, "rb") as fin:
                if item[-3:] != ".pb":
                    continue
                # pdb.set_trace()
                doc = Document_pb2.Document()
                doc.ParseFromString(fin.read())
                # whole_doc is for time extraction
                whole_doc = []
                valid_set = []
                for sentence in doc.sentences:
                    s = []
                    m = []
                    t = []
                    # extract the token words into one sentence
                    for token in sentence.tokens:
                        s.append(token.word)
                    # extract mentions
                    for mention in sentence.mentions:
                        # each mention has entity_name, mfrom, to
                        m.append(["_".join(s[int(mention.mfrom):int(mention.to + 1)]),
                                  mention.mfrom, mention.to])
                    # if there are fewer than 2 mentions, this sentence may not be in the
                    # train-test set; with more than 2 mentions, further iterations over
                    # mention pairs may be needed for processing.
                    if len(m) < 2:
                        valid_set.append([m, s])
                    whole_doc.append(s)
                # the tagging op runs once per document,
                # since we need the whole doc to set the base time
                timex_found, whole_doc = timex.tag(" ".join(whole_doc))
                if len(timex_found) > 0:
                    # set base-time to be the last time found
                    base_t = timex.retrieve_Date_time(timex_found)
                    pdb.set_trace()
def get_sentence_dates(url):
    # Extract sentences from the html
    response = urlopen(url)
    content = response.read()

    # Get only relevant text
    soup = BeautifulSoup(content)
    split = [elm.text.encode("utf-8") for elm in soup.findAll('p')]
    content = ' '.join(split)
    raw = nltk.clean_html(content)

    # Strip raw
    raw = re.sub(r'(\n)|\[.*?\] ?', "", raw)
    raw = re.sub(r'(\n)|\(.*?\) ?', "", raw)
    raw = re.sub(r"\r\n", ".", raw)
    #raw = re.sub(r";", ".", raw)
    #raw = re.sub(r"\"", ".", raw)

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sent_detector.tokenize(raw.strip())
    more_sentences = []
    for sentence in sentences:
        sentence = sentence.strip().split(".")
        more_sentences += sentence
    sentences = more_sentences

    # Clean sentences
    for index, sentence in enumerate(sentences):
        split_up = sentence.split()
        sentences[index] = ' '.join(split_up)

    sentence_date_maps = {}
    # Assign a date to each sentence
    for index, sentence in enumerate(sentences):
        timed_sentence = timex.tag(str(sentence))
        for result in re.finditer(".*<TIMEX2>(.*)</TIMEX2>.*", timed_sentence):
            for r in result.groups():
                if r not in sentence_date_maps:
                    sentence_date_maps[r] = []
                sentence_date_maps[r].append(sentence)

    sentence_dates = []
    for date, sentences in sentence_date_maps.items():
        try:
            year = int(date)
            if year > START and year < END:
                #for sentence in sentences:
                #    res = []
                #    print(sentence)
                #    if len(sentence) > 10 and len(sentence) < 500:
                #        res.append(sentence)
                #if res != []:
                #    sentence_dates.append((year, res))
                sentence_dates.append((year, sentences))
        except:
            continue
    sentence_dates = sorted(sentence_dates, key=lambda x: x[0])
    return json.dumps(sentence_dates)
for tree in chunked:
    # print results per sentence
    # print extract_entity_names(tree)
    entity_names.extend(extract_entity_names(tree))

# print unique entity names
base_date = datetime.date.today()
#now = datetime.date.today()
#basedate = datetime.Date(now.year, now.month, now.day)
tagged_text = []
for sentence in sentences:
    #newsent = dateparser.parse(sentence)
    #newsent = search_dates(sentence)
    newsent = tag(sentence)
    #dt.append(newsent)
    tagged_text.append(newsent)

#dates = tag(sentences)
dt = []
#for string in tagged_text:
#    dt = ground(tagged_text, base_date)
dt = ground(tagged_text, base_date)
print(dt)
#unique = set(entity_names)
#unique.append(dates)
#print(unique)