def get_names_from_text(data):
  """ Given a text of data, attempts to find all the potential names.

  We define as potential names continuous groups of capitalized words, not
  separated by punctuation.

  Args:
    data: raw (possibly HTML) text to scan.

  Returns:
    A list of names, where each name is a list of its component words.
  """
  data = strip_tags_and_new_lines(data)
  # Transforms all the punctuation into dots so I can catch them as being
  # between capitalized names.
  data = re.sub("[!*#='|.,;\"\\(\\):\\?]", " . ", data)

  # Delete the 'read also' links (everything from the marker onward).
  # BUG FIX: the original tested `> 0`, which skipped the truncation when
  # the marker sat at position 0; str.find returns -1 only when absent.
  marker = data.find('Citeşte şi')
  if marker != -1:
    data = data[0:marker]

  words = data.split(" ")

  names = []
  name = []
  for word in words:
    # When a separator token is met, the pending name candidate ends.
    if re.search("[.,;\]\[]", word) or \
       re.search("^[0-9a-zşșî/(\\-]", word) or \
       re.search("[A-ZŢ]{2,}", word) or \
       word == "" or \
       word in common_capitalized_words:
      # Single capitalized words are too noisy to count as names.
      if len(name) > 1:
        names.append(name)
      name = []
    else:
      name.append(word)

  # BUG FIX: flush a name that runs to the very end of the text; the
  # original silently dropped it.
  if len(name) > 1:
    names.append(name)

  return names
# Exemple #2
# 0
def get_names_from_text(data):
    """ Given a text of data, attempts to find all the potential names.

    We define as potential names continuous groups of capitalized words, not
    separated by punctuation.

    Args:
      data: raw (possibly HTML) text to scan.

    Returns:
      A list of names, where each name is a list of its component words.
    """
    data = strip_tags_and_new_lines(data)
    # Transforms all the punctuation into dots so I can catch them as being
    # between capitalized names.
    data = re.sub("[!*#='|.,;\"\\(\\):\\?]", " . ", data)

    # Delete the 'read also' links (everything from the marker onward).
    # BUG FIX: the original tested `> 0`, which skipped the truncation when
    # the marker sat at position 0; str.find returns -1 only when absent.
    marker = data.find('Citeşte şi')
    if marker != -1:
        data = data[0:marker]

    words = data.split(" ")

    names = []
    name = []
    for word in words:
        # When a separator token is met, the pending name candidate ends.
        if re.search("[.,;\]\[]", word) or \
           re.search("^[0-9a-zşșî/(\\-]", word) or \
           re.search("[A-ZŢ]{2,}", word) or \
           word == "" or \
           word in common_capitalized_words:
            # Single capitalized words are too noisy to count as names.
            if len(name) > 1:
                names.append(name)
            name = []
        else:
            name.append(word)

    # BUG FIX: flush a name that runs to the very end of the text; the
    # original silently dropped it.
    if len(name) > 1:
        names.append(name)

    return names
def get_qualifiers(name, data):
    """ Given a name and a blob of text, find the qualifiers of that name in
    the text. Let's start simple and find this type:

      + Sentence: "Monica Macovei, fost ministru al Justitiei,..."
      - Extract: "fost ministru al Justitiei"

    Args:
      name: the plain-text name whose qualifiers we look for.
      data: raw (possibly HTML) text to scan.

    Returns:
      A list of qualifier strings accepted by could_be_qualifier.
    """
    data = strip_tags_and_new_lines(data)

    # BUG FIX: escape the name so any regex metacharacters in it match
    # literally, and close the qualifier on '.' or ',' only -- the original
    # class [.|,] also matched a literal '|' ('|' has no alternation meaning
    # inside a character class).
    post_qualifiers = re.findall(re.escape(name) + ', ([^,.]+)[.,]', data)
    # TODO(vivi): Add the
    # - 'bula demnitarului' parenthesis here too.
    # - Sentences like "Ion Iliescu este bla bla".
    # list() keeps the return type a list under Python 3's lazy filter too.
    post_qualifiers = list(filter(could_be_qualifier, post_qualifiers))

    return post_qualifiers
def get_qualifiers(name, data):
  """ Given a name and a blob of text, find the qualifiers of that name in
  the text. Let's start simple and find this type:

    + Sentence: "Monica Macovei, fost ministru al Justitiei,..."
    - Extract: "fost ministru al Justitiei"

  Args:
    name: the plain-text name whose qualifiers we look for.
    data: raw (possibly HTML) text to scan.

  Returns:
    A list of qualifier strings accepted by could_be_qualifier.
  """
  data = strip_tags_and_new_lines(data)

  # BUG FIX: escape the name so any regex metacharacters in it match
  # literally, and close the qualifier on '.' or ',' only -- the original
  # class [.|,] also matched a literal '|' ('|' has no alternation meaning
  # inside a character class).
  post_qualifiers = re.findall(re.escape(name) + ', ([^,.]+)[.,]', data)
  # TODO(vivi): Add the
  # - 'bula demnitarului' parenthesis here too.
  # - Sentences like "Ion Iliescu este bla bla".
  # list() keeps the return type a list under Python 3's lazy filter too.
  post_qualifiers = list(filter(could_be_qualifier, post_qualifiers))

  return post_qualifiers
# Exemple #5
# 0
    # NOTE(review): this fragment starts mid-loop -- the enclosing
    # `for fname in files[...]` header is outside this view. It parses one
    # news XML dump and accumulates per-word counts into `map` (defined
    # outside this fragment; the name shadows the builtin).
    tree = parse(SOURCE + '/' + fname)
    for item in tree.findall('item'):
        # Feed fields are stored URL-quoted / UTF-8 encoded.
        link = item.findtext('news_link').encode('UTF-8')
        # news_title is at this point an UTF-8 encoded string, quoted.
        title = urllib.unquote(item.findtext('news_title').encode('UTF-8'))

        # news_time looks space-separated as "HH MM DD MM YYYY" given the
        # index mapping below -- TODO confirm against the feed format.
        tstr = item.findtext('news_time').encode('UTF-8').split(' ')
        d = datetime(year=int(tstr[4]),
                     month=int(tstr[3]),
                     day=int(tstr[2]),
                     hour=int(tstr[0]),
                     minute=int(tstr[1]))

        # Normalize the article body (strip markup, punctuation, case and
        # diacritics) so the word counts below are uniform.
        news_content = urllib.unquote(item.findtext('news_content'))
        news_content = strip_tags_and_new_lines(news_content)
        news_content = strip_punctuation(news_content)
        news_content = lower(news_content)
        news_content = strip_diacritics(news_content)

        words = news_content.split(" ")

        # Tally word frequencies, skipping empty tokens from double spaces.
        for word in words:
            if not word:
                continue

            if not word in map:
                map[word] = 1
            else:
                map[word] = map[word] + 1
# Process only the most recent NUMBER_OF_DAYS_TO_PARSE daily dump files,
# tallying word frequencies into `map` (defined elsewhere; the name shadows
# the builtin). NOTE(review): Python 2 syntax (`print` statements,
# urllib.unquote) is used throughout this script.
for fname in files[-NUMBER_OF_DAYS_TO_PARSE : ]:
  print "--"
  print "-- ++ working on " + SOURCE + "/" + fname

  tree = parse(SOURCE + '/' + fname)
  for item in tree.findall('item'):
    # Feed fields are stored URL-quoted / UTF-8 encoded.
    link = item.findtext('news_link').encode('UTF-8')
    # news_title is at this point an UTF-8 encoded string, quoted.
    title = urllib.unquote(item.findtext('news_title').encode('UTF-8'))

    # news_time looks space-separated as "HH MM DD MM YYYY" given the index
    # mapping below -- TODO confirm against the feed format.
    tstr = item.findtext('news_time').encode('UTF-8').split(' ')
    d = datetime(year=int(tstr[4]), month=int(tstr[3]), day=int(tstr[2]),
                 hour=int(tstr[0]), minute=int(tstr[1]))

    # Normalize the article body (strip markup, punctuation, case and
    # diacritics) so the word counts below are uniform.
    news_content = urllib.unquote(item.findtext('news_content'));
    news_content = strip_tags_and_new_lines(news_content)
    news_content = strip_punctuation(news_content)
    news_content = lower(news_content)
    news_content = strip_diacritics(news_content)

    words = news_content.split(" ")

    # Tally word frequencies, skipping empty tokens from double spaces.
    for word in words:
      if not word:
        continue

      if not word in map:
        map[word] = 1
      else:
        map[word] = map[word] + 1