def get_names_from_text(data):
    """Given a text of data, attempt to find all the potential names.

    We define as potential names continuous groups of capitalized words,
    not separated by punctuation.

    Args:
        data: raw article text (may contain HTML tags and newlines).

    Returns:
        A list of names; each name is a list of its component words.
    """
    data = strip_tags_and_new_lines(data)
    # Transforms all the punctuation into dots so I can catch them as being
    # between capitalized names.
    data = re.sub("[!*#='|.,;\"\\(\\):\\?]", " . ", data)
    # Delete the 'read also' links.
    # Bug fix: the original compared `find(...) > 0`, which failed to strip
    # the link text when the marker sat at position 0; compare against -1.
    marker_pos = data.find('Citeşte şi')
    if marker_pos != -1:
        data = data[:marker_pos]
    # Hoist the separator patterns out of the loop and compile them once.
    # Raw strings keep the patterns identical while avoiding invalid-escape
    # warnings for \] and \[.
    punctuation_re = re.compile(r"[.,;\]\[]")
    lowercase_start_re = re.compile(r"^[0-9a-zşșî/(\-]")
    acronym_re = re.compile(r"[A-ZŢ]{2,}")
    words = data.split(" ")
    names = []
    name = []
    for word in words:
        # When a separator is met, flush the name collected so far.
        if (punctuation_re.search(word) or
                lowercase_start_re.search(word) or
                acronym_re.search(word) or
                word == "" or
                word in common_capitalized_words):
            if len(name) > 1:
                names.append(name)
            name = []
        else:
            name.append(word)
    # Bug fix: a name still in progress when the text ends was silently
    # dropped by the original; flush it as well.
    if len(name) > 1:
        names.append(name)
    return names
# NOTE(review): this is an exact duplicate of the get_names_from_text
# definition earlier in the file — the later definition shadows the earlier
# one. Consider deleting one copy.
def get_names_from_text(data):
    """Given a text of data, attempt to find all the potential names.

    We define as potential names continuous groups of capitalized words,
    not separated by punctuation.

    Args:
        data: raw article text (may contain HTML tags and newlines).

    Returns:
        A list of names; each name is a list of its component words.
    """
    data = strip_tags_and_new_lines(data)
    # Transforms all the punctuation into dots so I can catch them as being
    # between capitalized names.
    data = re.sub("[!*#='|.,;\"\\(\\):\\?]", " . ", data)
    # Delete the 'read also' links.
    # Bug fix: `find(...) > 0` skipped the strip when the marker was at
    # position 0; compare against -1 instead.
    marker_pos = data.find('Citeşte şi')
    if marker_pos != -1:
        data = data[:marker_pos]
    # Compile the separator patterns once, outside the word loop.
    punctuation_re = re.compile(r"[.,;\]\[]")
    lowercase_start_re = re.compile(r"^[0-9a-zşșî/(\-]")
    acronym_re = re.compile(r"[A-ZŢ]{2,}")
    words = data.split(" ")
    names = []
    name = []
    for word in words:
        # When a separator is met, flush the name collected so far.
        if (punctuation_re.search(word) or
                lowercase_start_re.search(word) or
                acronym_re.search(word) or
                word == "" or
                word in common_capitalized_words):
            if len(name) > 1:
                names.append(name)
            name = []
        else:
            name.append(word)
    # Bug fix: flush a trailing name that the original dropped.
    if len(name) > 1:
        names.append(name)
    return names
def get_qualifiers(name, data):
    """Given a name and a blob of text, find the qualifiers of that name.

    Let's start simple and find this type:
      + Sentence: "Monica Macovei, fost ministru al Justitiei,..."
      - Extract:  "fost ministru al Justitiei"

    Args:
        name: the person's name, as plain text.
        data: the text blob to search (may contain HTML tags and newlines).

    Returns:
        The qualifier strings found after the name, filtered through
        could_be_qualifier.
    """
    data = strip_tags_and_new_lines(data)
    # Bug fix: `name` comes from free text and may contain regex
    # metacharacters (e.g. parentheses, dots); escape it before embedding
    # it in the pattern, otherwise the findall can error out or match the
    # wrong thing.
    # NOTE(review): in the trailing class `[.|,]` the '|' is a literal
    # character, not alternation — presumably intended; left as-is.
    post_qualifiers = re.findall(re.escape(name) + ', ([^,.]+)[.|,]', data)
    # TODO(vivi): Add the
    #   - 'bula demnitarului' paranthesis here too.
    #   - Sentences like "Ion Iliescu este bla bla".
    post_qualifiers = filter(could_be_qualifier, post_qualifiers)
    return post_qualifiers
# Parse one feed file and accumulate per-word frequencies into `map`.
# NOTE(review): `map` shadows the builtin and is defined outside this chunk,
# as are SOURCE and fname — all are left untouched here.
tree = parse(SOURCE + '/' + fname)
for item in tree.findall('item'):
    link = item.findtext('news_link').encode('UTF-8')
    # news_title is at this point an UTF-8 encoded string, quoted.
    title = urllib.unquote(item.findtext('news_title').encode('UTF-8'))
    # Judging by the indices below, news_time fields are ordered
    # "HH MM DD MM YYYY" — TODO confirm against the feed format.
    tstr = item.findtext('news_time').encode('UTF-8').split(' ')
    d = datetime(year=int(tstr[4]), month=int(tstr[3]), day=int(tstr[2]),
                 hour=int(tstr[0]), minute=int(tstr[1]))
    # Normalize the article body before tokenizing.
    news_content = urllib.unquote(item.findtext('news_content'))
    news_content = strip_tags_and_new_lines(news_content)
    news_content = strip_punctuation(news_content)
    news_content = lower(news_content)
    news_content = strip_diacritics(news_content)
    words = news_content.split(" ")
    for word in words:
        if not word:
            continue
        # Idiom fix: dict.get with a default replaces the
        # `if not word in map` / manual-increment branching.
        map[word] = map.get(word, 0) + 1
for fname in files[-NUMBER_OF_DAYS_TO_PARSE : ]: print "--" print "-- ++ working on " + SOURCE + "/" + fname tree = parse(SOURCE + '/' + fname) for item in tree.findall('item'): link = item.findtext('news_link').encode('UTF-8') # news_title is at this point an UTF-8 encoded string, quoted. title = urllib.unquote(item.findtext('news_title').encode('UTF-8')) tstr = item.findtext('news_time').encode('UTF-8').split(' ') d = datetime(year=int(tstr[4]), month=int(tstr[3]), day=int(tstr[2]), hour=int(tstr[0]), minute=int(tstr[1])) news_content = urllib.unquote(item.findtext('news_content')); news_content = strip_tags_and_new_lines(news_content) news_content = strip_punctuation(news_content) news_content = lower(news_content) news_content = strip_diacritics(news_content) words = news_content.split(" ") for word in words: if not word: continue if not word in map: map[word] = 1 else: map[word] = map[word] + 1