Example #1
0
def init_footprint_xml():
  """Return the start of an FPXML document: XML header, craigslist
  FeedInfo, and a single empty placeholder Organization.  The
  <VolunteerOpportunities> element is left open for the caller to
  fill in and close."""
  chunks = [
    '<?xml version="1.0" ?>',
    '<FootprintFeed schemaVersion="0.1">',
    "<FeedInfo>",
    xmlh.output_val("providerID", "105"),
    xmlh.output_val("providerName", "craigslist"),
    xmlh.output_val("feedID", "craigslist"),
    xmlh.output_val("createdDateTime", xmlh.current_ts()),
    xmlh.output_val("providerURL", "http://www.craigslist.org/"),
    "</FeedInfo>",
    # no "organization" in craigslist postings
    "<Organizations>",
    "<Organization>",
    "<organizationID>0</organizationID>",
    "<nationalEIN></nationalEIN>",
    "<name></name>",
    "<missionStatement></missionStatement>",
    "<description></description>",
    "<location>",
    xmlh.output_val("city", ""),
    xmlh.output_val("region", ""),
    xmlh.output_val("postalCode", ""),
    "</location>",
    "<organizationURL></organizationURL>",
    "<donateURL></donateURL>",
    "<logoURL></logoURL>",
    "<detailURL></detailURL>",
    "</Organization>",
    "</Organizations>",
    "<VolunteerOpportunities>",
  ]
  return "".join(chunks)
Example #2
0
def init_footprint_xml():
    """Emit the FPXML preamble: XML header, craigslist FeedInfo, and one
    empty placeholder Organization, leaving <VolunteerOpportunities>
    open for the caller."""
    feed_info = "<FeedInfo>" + "".join([
        xmlh.output_val("providerID", "105"),
        xmlh.output_val("providerName", "craigslist"),
        xmlh.output_val("feedID", "craigslist"),
        xmlh.output_val("createdDateTime", xmlh.current_ts()),
        xmlh.output_val("providerURL", "http://www.craigslist.org/"),
    ]) + "</FeedInfo>"
    # no "organization" in craigslist postings -- emit an empty placeholder
    location = ("<location>"
                + xmlh.output_val("city", "")
                + xmlh.output_val("region", "")
                + xmlh.output_val("postalCode", "")
                + "</location>")
    organization = (
        "<Organization>"
        "<organizationID>0</organizationID>"
        "<nationalEIN></nationalEIN>"
        "<name></name>"
        "<missionStatement></missionStatement>"
        "<description></description>"
        + location +
        "<organizationURL></organizationURL>"
        "<donateURL></donateURL>"
        "<logoURL></logoURL>"
        "<detailURL></detailURL>"
        "</Organization>")
    return ('<?xml version="1.0" ?>'
            '<FootprintFeed schemaVersion="0.1">'
            + feed_info
            + "<Organizations>" + organization + "</Organizations>"
            + "<VolunteerOpportunities>")
def parser(providerID, providerName, feedID, providerURL, feedDescription):
  """create an FPXML-compatible parser"""
  # assemble the replacement <FeedInfo> block once, up front
  fields = [
    ('providerID', providerID),
    ('providerName', providerName),
    ('feedID', feedID),
    ('createdDateTime', xmlh.current_ts()),
    ('providerURL', providerURL),
    ('description', feedDescription),
  ]
  feedinfo = ("<FeedInfo>"
              + "".join(xmlh.output_val(tag, val) for tag, val in fields)
              + "</FeedInfo>")
  feedinfo_re = re.compile(r'<FeedInfo>.+?</FeedInfo>', re.DOTALL)
  def parse_func(instr, maxrecs, progress):
    """closure-- generated parse func"""
    # run the fast parser, then splice in our provider-specific FeedInfo
    outstr, numorgs, numopps = parse_fast(instr, maxrecs, progress)
    return feedinfo_re.sub(feedinfo, outstr), numorgs, numopps
  return parse_func
def parser(providerID, providerName, feedID, providerURL, feedDescription):
    """create an FPXML-compatible parser"""
    pieces = ["<FeedInfo>"]
    pieces.append(xmlh.output_val('providerID', providerID))
    pieces.append(xmlh.output_val('providerName', providerName))
    pieces.append(xmlh.output_val('feedID', feedID))
    pieces.append(xmlh.output_val('createdDateTime', xmlh.current_ts()))
    pieces.append(xmlh.output_val('providerURL', providerURL))
    pieces.append(xmlh.output_val('description', feedDescription))
    pieces.append("</FeedInfo>")
    feedinfo = "".join(pieces)

    def parse_func(instr, maxrecs, progress):
        """closure -- generated parse func"""
        # delegate to the fast parser, then swap in this feed's FeedInfo
        result, numorgs, numopps = parse_fast(instr, maxrecs, progress)
        pattern = re.compile(r'<FeedInfo>.+?</FeedInfo>', re.DOTALL)
        return pattern.sub(feedinfo, result), numorgs, numopps

    return parse_func
def parse_fast(instr, maxrecs, progress):
  """fast parser but doesn't check correctness,
  i.e. must be pre-checked by caller.

  Args:
    instr: FPXML-like input string to normalize.
    maxrecs: maximum number of opportunities to emit (0 = no limit).
    progress: whether to print periodic progress (currently disabled).
  Returns:
    (fpxml_string, numorgs, numopps) tuple.
  """
  numorgs = numopps = 0
  outstr_list = ['<?xml version="1.0" ?>']
  outstr_list.append('<FootprintFeed schemaVersion="0.1">')

  # note: processes Organizations first, so ID lookups work
  for match in re.finditer(re.compile('<FeedInfo>.+?</FeedInfo>',
                                      re.DOTALL), instr):
    node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
    xmlh.set_default_value(node, node.firstChild, "feedID", "0")
    set_default_time_elem(node, node.firstChild, "createdDateTime")
    outstr_list.append(xmlh.prettyxml(node, True))

  outstr_list.append('<Organizations>')
  for match in re.finditer(re.compile('<Organization>.+?</Organization>',
                                      re.DOTALL), instr):
    node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
    numorgs += 1
    outstr_list.append(xmlh.prettyxml(node, True))
  outstr_list.append('</Organizations>')

  outstr_list.append('<VolunteerOpportunities>')
  for match in re.finditer(re.compile(
      '<VolunteerOpportunity>.+?</VolunteerOpportunity>', re.DOTALL), instr):
    opp = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)

    numopps += 1
    if (maxrecs > 0 and numopps > maxrecs):
      break

    # these set_default_* helpers only fill in values that are missing;
    # fields already present in the record are left untouched
    xmlh.set_default_value(opp, opp, "volunteersNeeded", -8888)
    xmlh.set_default_value(opp, opp, "paid", "No")
    xmlh.set_default_value(opp, opp, "sexRestrictedTo", "Neither")
    xmlh.set_default_value(opp, opp, "language", "English")
    set_default_time_elem(opp, opp, "lastUpdated")
    set_default_time_elem(opp, opp, "expires",
        xmlh.current_ts(DEFAULT_EXPIRATION))

    try:
      opplocs = opp.getElementsByTagName("location")
    except Exception:  # narrowed from bare "except:"
      opplocs = []

    for loc in opplocs:
      xmlh.set_default_value(opp, loc, "virtual", "No")
      xmlh.set_default_value(opp, loc, "country", "US")

    try:
      dttms = opp.getElementsByTagName("dateTimeDurations")
    except Exception:  # narrowed from bare "except:"
      dttms = []

    for dttm in dttms:
      # redundant xmlh.set_default_value(opp, dttm, "openEnded", "No")
      xmlh.set_default_value(opp, dttm, "iCalRecurrence", "")
      # bugfix: getElementsByTagName returns a NodeList (possibly empty),
      # never None, so the old "== None" test could never be true and
      # timeFlexible always defaulted to "No".  Test for emptiness instead.
      if (not dttm.getElementsByTagName("startTime") and
          not dttm.getElementsByTagName("endTime")):
        set_default_time_elem(opp, dttm, "timeFlexible", "Yes")
      else:
        set_default_time_elem(opp, dttm, "timeFlexible", "No")
      xmlh.set_default_value(opp, dttm, "openEnded", "No")

    try:
      time_elems = opp.getElementsByTagName("startTime")
      time_elems += opp.getElementsByTagName("endTime")
    except Exception:  # narrowed from bare "except:"
      time_elems = []

    # times without an explicit timezone default to Pacific
    for el in time_elems:
      xmlh.set_default_attr(opp, el, "olsonTZ", "America/Los_Angeles")

    outstr_list.append(xmlh.prettyxml(opp, True))

  outstr_list.append('</VolunteerOpportunities>')

  outstr_list.append('</FootprintFeed>')
  return "".join(outstr_list), numorgs, numopps
def set_default_time_elem(parent, entity, tagname, timest=None):
  """footprint macro: ensure a timestamp element exists under `entity`
  and carries an olsonTZ attribute.

  Bugfix: the old signature used `timest=xmlh.current_ts()`, which is
  evaluated once at import time -- every defaulted call stamped records
  with the module-load time.  Compute the default per call instead.
  """
  if timest is None:
    timest = xmlh.current_ts()
  cdt = xmlh.set_default_value(parent, entity, tagname, timest)
  xmlh.set_default_attr(parent, cdt, "olsonTZ", "America/Los_Angeles")
def parse(instr, maxrecs, progress):
    """return FPXML given craigslist data

    Args:
      instr: input understood by crawl_craigslist.parse_cache_file.
      maxrecs: stop after this many listings (0 = no limit).
      progress: if true, print periodic progress messages.
    Returns:
      (fpxml_string, numorgs, numopps) tuple.
    """
    # lazily load the metro-domain -> "lat,long" table on first use
    if CL_LATLONGS is None:  # identity test (was the non-idiomatic "== None")
        load_craigslist_latlongs()
    xmlh.print_progress("loading craigslist crawler output...")
    crawl_craigslist.parse_cache_file(instr, listings_only=True)
    xmlh.print_progress("loaded " + str(len(crawl_craigslist.pages)) + " craigslist pages.")

    # convert to footprint format
    outstr = '<?xml version="1.0" ?>'
    outstr += '<FootprintFeed schemaVersion="0.1">'
    outstr += "<FeedInfo>"
    outstr += xmlh.output_val("providerID", "105")
    outstr += xmlh.output_val("providerName", "craigslist")
    outstr += xmlh.output_val("feedID", "craigslist")
    outstr += xmlh.output_val("createdDateTime", xmlh.current_ts())
    outstr += xmlh.output_val("providerURL", "http://www.craigslist.org/")
    outstr += "</FeedInfo>"

    numorgs = numopps = 0

    # no "organization" in craigslist postings -- emit one empty placeholder
    outstr += "<Organizations>"
    outstr += "<Organization>"
    outstr += "<organizationID>0</organizationID>"
    outstr += "<nationalEIN></nationalEIN>"
    outstr += "<name></name>"
    outstr += "<missionStatement></missionStatement>"
    outstr += "<description></description>"
    outstr += "<location>"
    outstr += xmlh.output_val("city", "")
    outstr += xmlh.output_val("region", "")
    outstr += xmlh.output_val("postalCode", "")
    outstr += "</location>"
    outstr += "<organizationURL></organizationURL>"
    outstr += "<donateURL></donateURL>"
    outstr += "<logoURL></logoURL>"
    outstr += "<detailURL></detailURL>"
    outstr += "</Organization>"
    numorgs += 1
    outstr += "</Organizations>"

    # per-reason counts of rejected listings, reported every 250 records
    skipped_listings = {"body": 0, "title": 0, "not-ok": 0}
    outstr += "<VolunteerOpportunities>"
    for i, url in enumerate(crawl_craigslist.pages):
        page = crawl_craigslist.pages[url]

        # only keep listings whose author opted in to redistribution
        ok = extract(page, "it's OK to distribute this " + "charitable volunteerism opportunity")
        if ok == "":
            skipped_listings["not-ok"] += 1
            continue

        title = extract(page, "<title>(.+?)</title>")
        if title == "":
            skipped_listings["title"] += 1
            continue

        body = extract(page, '<div id="userbody">(.+?)<')
        if len(body) < 25:  # too short to be a real description
            skipped_listings["body"] += 1
            continue

        item_id = extract(url, "/vol/(.+?)[.]html$")
        locstr = extract(page, "Location: (.+?)<")
        datestr = extract(page, "Date: (.+?)<")
        ts = dateutil.parser.parse(datestr)
        datetimestr = ts.strftime("%Y-%m-%dT%H:%M:%S")
        datestr = ts.strftime("%Y-%m-%d")

        if maxrecs > 0 and i > maxrecs:
            break
        xmlh.print_rps_progress("opps", progress, i, maxrecs)
        if progress and i > 0 and i % 250 == 0:
            msg = "skipped " + str(skipped_listings["title"] + skipped_listings["body"])
            msg += " listings (" + str(skipped_listings["title"]) + " for no-title and "
            msg += str(skipped_listings["body"]) + " for short body and "
            msg += str(skipped_listings["not-ok"]) + " for no-redistrib)"
            xmlh.print_progress(msg)

        # craigslist is full of weird escapes-- strip them
        body = re.sub(r"&[a-z]+;", "", body)
        title = re.sub(r"&[a-z]+;", "", title)
        locstr = re.sub(r"&[a-z]+;", "", locstr)
        outstr += "<VolunteerOpportunity>"
        outstr += "<volunteerOpportunityID>%s</volunteerOpportunityID>" % (item_id)
        outstr += "<sponsoringOrganizationIDs><sponsoringOrganizationID>0</sponsoringOrganizationID></sponsoringOrganizationIDs>"
        outstr += "<volunteerHubOrganizationIDs><volunteerHubOrganizationID>0</volunteerHubOrganizationID></volunteerHubOrganizationIDs>"
        outstr += "<title>%s</title>" % (title)
        outstr += "<detailURL>%s</detailURL>" % (url)
        # avoid CDATA in body...
        esc_body = xml.sax.saxutils.escape(body)
        esc_body100 = xml.sax.saxutils.escape(body[0:100])
        outstr += "<description>%s</description>" % (esc_body)
        outstr += "<abstract>%s</abstract>" % (esc_body100 + "...")
        outstr += "<lastUpdated>%s</lastUpdated>" % (datetimestr)
        # TODO: expires
        # TODO: synthesize location from metro...
        outstr += "<locations><location>"
        outstr += "<name>%s</name>" % (xml.sax.saxutils.escape(locstr))
        # what about the few that do geocode?
        lat, lng = "", ""
        try:
            domain, unused = url.split("vol/")
            lat, lng = CL_LATLONGS[domain].split(",")
        except (ValueError, KeyError):
            # malformed URL or unknown metro -- leave lat/lng blank.
            # (narrowed from a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit)
            pass
        outstr += "<latitude>%s</latitude>" % (lat)
        outstr += "<longitude>%s</longitude>" % (lng)
        outstr += "</location></locations>"
        outstr += "<dateTimeDurations><dateTimeDuration>"
        outstr += "<openEnded>No</openEnded>"
        outstr += "<startDate>%s</startDate>" % (datestr)
        # TODO: endDate = startDate + N=14 days?
        # TODO: timezone???
        outstr += "</dateTimeDuration></dateTimeDurations>"
        # TODO: categories???
        outstr += "</VolunteerOpportunity>"
        numopps += 1
    outstr += "</VolunteerOpportunities>"
    outstr += "</FootprintFeed>"

    return outstr, numorgs, numopps
Example #8
0
def parse(instr, maxrec, progress):
    """return FPXML given 350.org data

    Args:
      instr: 350.org custom XML feed as a string.
      maxrec: stop after this many nodes (0 = no limit).
      progress: unused here; kept for parser-interface compatibility.
    Returns:
      (fpxml_string, numorgs, numopps) tuple.
    """
    feed = xmlh.parse_or_die(instr.encode('utf-8'))

    org_id = str(139)
    mission_statement = "350.org is an international campaign that's building a movement to unite the world around solutions to the climate crisis--the solutions that science and justice demand."
    org_desc = "On October 10 we'll be helping host a Global Work Party, with thousands of communities setting up solar panels or digging community gardens or laying out bike paths."

    start_date = '2010-10-01'
    today = datetime.now()
    last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")

    numorgs = 1
    numopps = 0
    xmlh.print_progress("loading 350.org custom XML...")

    # convert to footprint format
    outstr = '<?xml version="1.0" ?>'
    outstr += '<FootprintFeed schemaVersion="0.1">'
    outstr += '<FeedInfo>'
    outstr += xmlh.output_val('providerID', org_id)
    outstr += xmlh.output_val('providerName', "350org")
    outstr += xmlh.output_val('feedID', "350org")
    outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
    outstr += xmlh.output_val('providerURL', "http://www.350.org/")
    outstr += '</FeedInfo>'
    # 1 "organization" in 350.org postings
    outstr += '<Organizations><Organization>'
    outstr += xmlh.output_val('organizationID', org_id)
    outstr += '<nationalEIN></nationalEIN>'
    outstr += '<name>350.org</name>'
    outstr += xmlh.output_val('missionStatement', mission_statement)
    outstr += xmlh.output_val('description', org_desc)
    outstr += '<location>'
    outstr += xmlh.output_val("city", "")
    outstr += xmlh.output_val("region", "")
    outstr += xmlh.output_val("postalCode", "")
    outstr += '</location>'
    # TODO: make these variables
    outstr += '<organizationURL>http://www.350.org/</organizationURL>'
    outstr += '<donateURL>http://www.350.org/donate</donateURL>'
    outstr += '<logoURL>http://www.350.org/sites/all/themes/threefifty/logo.gif</logoURL>'
    outstr += '<detailURL>http://www.350.org/about</detailURL>'
    outstr += '</Organization></Organizations>'

    outstr += '\n<VolunteerOpportunities>\n'
    nodes = feed.getElementsByTagName('node')
    for i, node in enumerate(nodes):
        if maxrec > 0 and i > maxrec:
            break
        # CDATA-wrap free-text fields so embedded markup survives
        title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
        desc = '<![CDATA[' + xmlh.get_tag_val(node, "Body") + ']]>'
        url = xmlh.get_tag_val(node, "Link")
        lat = xmlh.get_tag_val(node, "Latitude")
        lng = xmlh.get_tag_val(node, "Longitude")

        # Start_Date is either "...T<time>" or "<date> (All day)"
        # -- TODO confirm against the live feed
        start_datetime = xmlh.get_tag_val(node, "Start_Date")
        start_time = None
        if not start_datetime:
            start_date = "2010-10-10"  # campaign day-of-action fallback
        else:
            start_datetime = start_datetime.replace(" (All day)", "T00:00:00")
            dt = start_datetime.split("T")
            start_date = dt[0][0:10]
            if len(dt) > 1:
                start_time = dt[1]

        end_datetime = xmlh.get_tag_val(node, "End_Date")
        end_time = None
        if not end_datetime:
            open_ended = True
        else:
            open_ended = False
            end_datetime = end_datetime.replace(" (All day)", "T23:00:00")
            dt = end_datetime.split("T")
            end_date = dt[0][0:10]
            if len(dt) > 1:
                end_time = dt[1]

        # (removed a redundant second fetch of "End_Date" here -- it
        # clobbered the normalized value and was never read again)
        locstr = "%s, %s %s" % (xmlh.get_tag_val(
            node, "City"), xmlh.get_tag_val(
                node, "Province"), xmlh.get_tag_val(node, "Country"))

        outstr += '<VolunteerOpportunity>'
        outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (
            str(i))
        outstr += '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (
            org_id)
        outstr += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (
            org_id)
        outstr += '<title>%s</title>' % (title)
        outstr += '<detailURL>%s</detailURL>' % (url)
        outstr += '<description>%s</description>' % (desc)
        outstr += '<abstract>%s</abstract>' % (desc)
        outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
        outstr += '<locations><location>'
        outstr += '<location_string>%s</location_string>' % (locstr)
        outstr += '<latitude>%s</latitude>' % (lat)
        outstr += '<longitude>%s</longitude>' % (lng)
        outstr += '</location></locations>'
        outstr += '<dateTimeDurations><dateTimeDuration>'
        outstr += '<startDate>%s</startDate>' % (start_date)
        if start_time:
            outstr += '<startTime>%s</startTime>' % (start_time)
        if open_ended:
            outstr += '<openEnded>Yes</openEnded>'
        else:
            outstr += '<openEnded>No</openEnded>'
            outstr += '<endDate>%s</endDate>' % (end_date)
            if end_time:
                outstr += '<endTime>%s</endTime>' % (end_time)
        outstr += '</dateTimeDuration></dateTimeDurations>'
        outstr += '</VolunteerOpportunity>\n'
        numopps += 1
    outstr += '</VolunteerOpportunities>'
    outstr += '</FootprintFeed>'

    return outstr, numorgs, numopps
 def parse(instr, maxrecs, progress):
   numorgs = numopps = 0
   instr = re.sub(r'<(/?db):', r'<\1_', instr)
   opps = re.findall(r'<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                     instr, re.DOTALL)
   volopps = ""
   for i, oppstr in enumerate(opps):
     #if progress and i > 0 and i % 250 == 0:
     #  print str(datetime.now())+": ", i, " opportunities processed."
     if (maxrecs > 0 and i > maxrecs):
       break
     xmlh.print_rps_progress("opps", progress, i, maxrecs)
 
     item = xmlh.simple_parser(oppstr, known_elnames, progress=False)
 
     orgid = register_org(item)
 
     # logoURL -- sigh, this is for the opportunity not the org
     volopps += '<VolunteerOpportunity>'
     volopps += xmlh.output_val('volunteerOpportunityID', str(i))
     volopps += xmlh.output_val('sponsoringOrganizationID', str(orgid))
     volopps += xmlh.output_node('volunteerHubOrganizationID', item, "LocalID")
     volopps += xmlh.output_node('title', item, "Title")
     volopps += xmlh.output_node('abstract', item, "Description")
     volopps += xmlh.output_node('description', item, "Description")
     volopps += xmlh.output_node('detailURL', item, "DetailURL")
     volopps += xmlh.output_val('volunteersNeeded', "-8888")
 
     try:
       oppdates = item.getElementsByTagName("OpportunityDate")
     except:
       oppdates = []
     
     if len(oppdates) > 1:
       print datetime.now(), \
           "parse_servenet.py: only 1 OpportunityDate supported."
       #return None
       oppdate = oppdates[0]
     elif len(oppdates) == 0:
       oppdate = None
     else:
       oppdate = oppdates[0]
     volopps += '<dateTimeDurations><dateTimeDuration>'
 
     if oppdate:
       volopps += xmlh.output_val('openEnded', 'No')
       volopps += xmlh.output_val('duration', 'P%s%s' % 
                                 (xmlh.get_tag_val(oppdate, "DurationQuantity"),
                                  xmlh.get_tag_val(oppdate, "DurationUnit")))
       volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
       volopps += xmlh.output_node('startDate', oppdate, "StartDate")
       volopps += xmlh.output_node('endDate', oppdate, "EndDate")
     else:
       volopps += xmlh.output_val('openEnded', 'Yes')
       volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
     volopps += '</dateTimeDuration></dateTimeDurations>'
 
     volopps += '<locations>'
     try:
       opplocs = item.getElementsByTagName("Location")
     except:
       opplocs = []
     for opploc in opplocs:
       volopps += '<location>'
       virtual_tag = opploc.getElementsByTagName("Virtual")
       if virtual_tag and xmlh.get_tag_val(opploc, "Virtual").lower() == "yes":
         volopps += xmlh.output_val('virtual', 'Yes')
       else:
         volopps += xmlh.output_node('region', opploc, "StateOrProvince")
         volopps += xmlh.output_node('country', opploc, "Country")
         volopps += xmlh.output_node('postalCode', opploc, "ZipOrPostalCode")
       volopps += '</location>'
     volopps += '</locations>'
     volopps += '<categoryTags/>'
     volopps += '</VolunteerOpportunity>'
     numopps += 1
     
   # convert to footprint format
   outstr = '<?xml version="1.0" ?>'
   outstr += '<FootprintFeed schemaVersion="0.1">'
   outstr += '<FeedInfo>'
   outstr += xmlh.output_val('providerID', providerID)
   outstr += xmlh.output_val('providerName', providerName)
   outstr += xmlh.output_val('feedID', feedID)
   outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
   outstr += xmlh.output_val('providerURL', providerURL)
   outstr += xmlh.output_val('description', feedDescription)
   # TODO: capture ts -- use now?!
   outstr += '</FeedInfo>'
 
   # hardcoded: Organization
   outstr += '<Organizations>'
   for key in ORGS:
     outstr += ORGS[key]
     numorgs += 1
   outstr += '</Organizations>'
   outstr += '<VolunteerOpportunities>'
   outstr += volopps
   outstr += '</VolunteerOpportunities>'
   outstr += '</FootprintFeed>'
 
   #outstr = re.sub(r'><([^/])', r'>\n<\1', outstr)
   return outstr, numorgs, numopps
def parse(instr, maxrec, progress):
  """return FPXML given 350.org data

  Args:
    instr: 350.org custom XML feed as a string.
    maxrec: stop after this many nodes (0 = no limit).
    progress: unused here; kept for parser-interface compatibility.
  Returns:
    (fpxml_string, numorgs, numopps) tuple.
  """
  feed = xmlh.parse_or_die(instr.encode('utf-8'))

  org_id = str(139)
  mission_statement = "350.org is an international campaign that's building a movement to unite the world around solutions to the climate crisis--the solutions that science and justice demand."
  org_desc = "On October 10 we'll be helping host a Global Work Party, with thousands of communities setting up solar panels or digging community gardens or laying out bike paths."

  start_date = '2010-10-01'
  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")

  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading 350.org custom XML...")

  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', org_id)
  outstr += xmlh.output_val('providerName', "350org")
  outstr += xmlh.output_val('feedID', "350org")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.350.org/")
  outstr += '</FeedInfo>'
  # 1 "organization" in 350.org postings
  outstr += '<Organizations><Organization>'
  outstr += xmlh.output_val('organizationID', org_id)
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name>350.org</name>'
  outstr += xmlh.output_val('missionStatement', mission_statement)
  outstr += xmlh.output_val('description', org_desc)
  outstr += '<location>'
  outstr += xmlh.output_val("city", "")
  outstr += xmlh.output_val("region", "")
  outstr += xmlh.output_val("postalCode", "")
  outstr += '</location>'
  # TODO: make these variables
  outstr += '<organizationURL>http://www.350.org/</organizationURL>'
  outstr += '<donateURL>http://www.350.org/donate</donateURL>'
  outstr += '<logoURL>http://www.350.org/sites/all/themes/threefifty/logo.gif</logoURL>'
  outstr += '<detailURL>http://www.350.org/about</detailURL>'
  outstr += '</Organization></Organizations>'

  outstr += '\n<VolunteerOpportunities>\n'
  nodes = feed.getElementsByTagName('node')
  for i, node in enumerate(nodes):
    if maxrec > 0 and i > maxrec:
      break
    # CDATA-wrap free-text fields so embedded markup survives
    title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
    desc = '<![CDATA[' + xmlh.get_tag_val(node, "Body") + ']]>'
    url = xmlh.get_tag_val(node, "Link")
    lat = xmlh.get_tag_val(node, "Latitude")
    lng = xmlh.get_tag_val(node, "Longitude")

    # Start_Date is either "...T<time>" or "<date> (All day)"
    # -- TODO confirm against the live feed
    start_datetime = xmlh.get_tag_val(node, "Start_Date")
    start_time = None
    if not start_datetime:
      start_date = "2010-10-10"  # campaign day-of-action fallback
    else:
      start_datetime = start_datetime.replace(" (All day)",  "T00:00:00")
      dt = start_datetime.split("T")
      start_date = dt[0][0:10]
      if len(dt) > 1:
        start_time = dt[1]

    end_datetime = xmlh.get_tag_val(node, "End_Date")
    end_time = None
    if not end_datetime:
      open_ended = True
    else:
      open_ended = False
      end_datetime = end_datetime.replace(" (All day)",  "T23:00:00")
      dt = end_datetime.split("T")
      end_date = dt[0][0:10]
      if len(dt) > 1:
        end_time = dt[1]

    # (removed a redundant second fetch of "End_Date" here -- it
    # clobbered the normalized value and was never read again)
    locstr = "%s, %s %s" % (xmlh.get_tag_val(node, "City"),
                            xmlh.get_tag_val(node, "Province"),
                            xmlh.get_tag_val(node, "Country"))

    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (str(i))
    outstr += '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (org_id)
    outstr += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (org_id)
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL>%s</detailURL>' % (url)
    outstr += '<description>%s</description>' % (desc)
    outstr += '<abstract>%s</abstract>' % (desc)
    outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
    outstr += '<locations><location>'
    outstr += '<location_string>%s</location_string>' % (locstr)
    outstr += '<latitude>%s</latitude>' % (lat)
    outstr += '<longitude>%s</longitude>' % (lng)
    outstr += '</location></locations>'
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<startDate>%s</startDate>' % (start_date)
    if start_time:
      outstr += '<startTime>%s</startTime>' % (start_time)
    if open_ended:
      outstr += '<openEnded>Yes</openEnded>'
    else:
      outstr += '<openEnded>No</openEnded>'
      outstr += '<endDate>%s</endDate>' % (end_date)
      if end_time:
        outstr += '<endTime>%s</endTime>' % (end_time)
    outstr += '</dateTimeDuration></dateTimeDurations>'
    outstr += '</VolunteerOpportunity>\n'
    numopps += 1
  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'

  return outstr, numorgs, numopps
def parse(instr, maxrec, progress):
  """return FPXML given sparked feed data

  Args:
    instr: sparked.com custom XML feed as a string.
    maxrec: stop after this many challenges (0 = no limit).
    progress: unused here; kept for parser-interface compatibility.
  Returns:
    (fpxml_string, numorgs, numopps) tuple.
  """
  feed = xmlh.parse_or_die(instr.encode('utf-8'))

  org_id = str(139)
  mission_statement = "Sparked makes it easy for people with busy lives to help nonprofits get valuable work done when it's convenient. We call it microvolunteering. Through the convenience of the Internet, and with the collaboration of others, micro-volunteers use their professional skills to help causes they care about."
  org_desc = "Sparked is the world's first Microvolunteering network"

  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
  start_date = last_updated

  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading sparked.com custom XML...")

  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', org_id)
  outstr += xmlh.output_val('providerName', "sparked")
  outstr += xmlh.output_val('feedID', "sparked")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.sparked.com/")
  outstr += '</FeedInfo>'
  # 1 "organization" in sparked.com postings
  outstr += '<Organizations><Organization>'
  outstr += xmlh.output_val('organizationID', org_id)
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name>sparked.com</name>'
  outstr += xmlh.output_val('missionStatement', mission_statement)
  outstr += xmlh.output_val('description', org_desc)
  outstr += '<location>'
  outstr += xmlh.output_val("city", "San Francisco")
  outstr += xmlh.output_val("region", "CA")
  outstr += xmlh.output_val("postalCode", "94105")
  outstr += '</location>'
  outstr += '<organizationURL>http://www.sparked.com/</organizationURL>'
  outstr += '<donateURL>http://www.sparked.com/</donateURL>'
  outstr += '<logoURL>http://www.sparked.com/imgver4/logo_sparked.gif</logoURL>'
  outstr += '<detailURL>http://www.sparked.com/</detailURL>'
  outstr += '</Organization></Organizations>'

  outstr += '\n<VolunteerOpportunities>\n'
  nodes = feed.getElementsByTagName('challenge')
  for i, node in enumerate(nodes):
    if maxrec > 0 and i > maxrec:
      break
    # CDATA-wrap free-text fields so embedded markup survives
    title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
    desc = '<![CDATA[' + xmlh.get_tag_val(node, "description") + ']]>'
    url = xmlh.get_tag_val(node, "url")

    start_date = last_updated
    open_ended = True
    # deadline is MM/DD/YY (e.g. "02/15/11"); convert to ISO YYYY-MM-DD
    mdy = xmlh.get_tag_val(node, "deadline")
    if mdy:
      try:
        end_date = str(2000 + int(mdy[6:])) + "-" + mdy[0:2] + "-" + mdy[3:5]
        open_ended = False
      except ValueError:
        # unparseable deadline -- treat the challenge as open-ended.
        # (narrowed from a bare "except:", which also masked
        # KeyboardInterrupt/SystemExit)
        pass
    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (str(i))
    outstr += '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (org_id)
    outstr += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (org_id)
    outstr += '<micro>Yes</micro>'
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL>%s</detailURL>' % (url)
    outstr += '<description>%s</description>' % (desc)
    outstr += '<abstract>%s</abstract>' % (desc)
    outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<startDate>%s</startDate>' % (start_date)
    if open_ended:
      outstr += '<openEnded>Yes</openEnded>'
    else:
      outstr += '<openEnded>No</openEnded>'
      outstr += '<endDate>%s</endDate>' % (end_date)
    outstr += '</dateTimeDuration></dateTimeDurations>'
    outstr += '<locations><location><virtual>Yes</virtual></location></locations>'
    outstr += '</VolunteerOpportunity>\n'
    numopps += 1
  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'

  return outstr, numorgs, numopps
    today = datetime.now()
    last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
    start_date = last_updated

    numorgs = 1
    numopps = 0
    xmlh.print_progress("loading idealist.xml custom XML...")

    # convert to footprint format
    outstr = '<?xml version="1.0" ?>'
    outstr += '<FootprintFeed schemaVersion="0.1">'
    outstr += '<FeedInfo>'
    outstr += xmlh.output_val('providerID', org_id)
    outstr += xmlh.output_val('providerName', "idealist")
    outstr += xmlh.output_val('feedID', "idealist")
    outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
    outstr += xmlh.output_val('providerURL', "http://www.idealist.org/")
    outstr += '</FeedInfo>'
    # 1 "organization" in idealist.org postings
    outstr += '<Organizations><Organization>'
    outstr += xmlh.output_val('organizationID', org_id)
    outstr += '<nationalEIN></nationalEIN>'
    outstr += '<name>idealist.org</name>'
    outstr += xmlh.output_val('missionStatement', mission_statement)
    outstr += xmlh.output_val('description', org_desc)
    outstr += '<location>'
    outstr += xmlh.output_val("city", "New York")
    outstr += xmlh.output_val("region", "NY")
    outstr += xmlh.output_val("postalCode", "10001")
    outstr += '</location>'
    outstr += '<organizationURL>http://www.idealist.org/</organizationURL>'
Example #13
0
def parse(instr, maxrecs, progress):
  """return FPXML given craigslist crawler output.

  instr: filename of the cached craigslist crawl (listings only).
  maxrecs: stop after this many listings (<=0 means no limit).
  progress: if true, print periodic rate and skip statistics.
  Returns (fpxml_string, numorgs, numopps).
  """
  # lazily load the metro->lat/lng lookup table on first use
  if CL_LATLONGS is None:
    load_craigslist_latlongs()
  xmlh.print_progress("loading craigslist crawler output...")
  crawl_craigslist.parse_cache_file(instr, listings_only=True)
  xmlh.print_progress("loaded "+str(len(crawl_craigslist.pages))+" craigslist pages.")

  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', "105")
  outstr += xmlh.output_val('providerName', "craigslist")
  outstr += xmlh.output_val('feedID', "craigslist")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.craigslist.org/")
  outstr += '</FeedInfo>'

  numorgs = numopps = 0

  # no "organization" in craigslist postings -- emit a single placeholder org
  outstr += '<Organizations>'
  outstr += '<Organization>'
  outstr += '<organizationID>0</organizationID>'
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name></name>'
  outstr += '<missionStatement></missionStatement>'
  outstr += '<description></description>'
  outstr += '<location>'
  outstr += xmlh.output_val("city", "")
  outstr += xmlh.output_val("region", "")
  outstr += xmlh.output_val("postalCode", "")
  outstr += '</location>'
  outstr += '<organizationURL></organizationURL>'
  outstr += '<donateURL></donateURL>'
  outstr += '<logoURL></logoURL>'
  outstr += '<detailURL></detailURL>'
  outstr += '</Organization>'
  numorgs += 1
  outstr += '</Organizations>'

  # counters of skipped listings, keyed by skip reason
  skipped_listings = {}
  skipped_listings["body"] = skipped_listings["title"] = \
      skipped_listings["not-ok"] = 0
  outstr += '<VolunteerOpportunities>'
  for i, url in enumerate(crawl_craigslist.pages):
    page = crawl_craigslist.pages[url]

    # only keep listings that explicitly permit redistribution
    ok = extract(page, "it's OK to distribute this "+
                 "charitable volunteerism opportunity")
    if ok == "":
      skipped_listings["not-ok"] += 1
      continue

    title = extract(page, "<title>(.+?)</title>")
    if title == "":
      skipped_listings["title"] += 1
      continue

    body = extract(page, '<div id="userbody">(.+?)<')
    if len(body) < 25:
      # too short to be a real description
      skipped_listings["body"] += 1
      continue

    item_id = extract(url, "/vol/(.+?)[.]html$")
    locstr = extract(page, "Location: (.+?)<")
    datestr = extract(page, "Date: (.+?)<")
    ts = dateutil.parser.parse(datestr)
    datetimestr = ts.strftime("%Y-%m-%dT%H:%M:%S")
    datestr = ts.strftime("%Y-%m-%d")

    if (maxrecs>0 and i>maxrecs):
      break
    xmlh.print_rps_progress("opps", progress, i, maxrecs)
    if progress and i > 0 and i % 250 == 0:
      msg = "skipped " + str(skipped_listings["title"]+skipped_listings["body"])
      msg += " listings ("+str(skipped_listings["title"]) + " for no-title and "
      msg += str(skipped_listings["body"]) + " for short body and "
      msg += str(skipped_listings["not-ok"]) + " for no-redistrib)"
      xmlh.print_progress(msg)

    # craigslist is full of weird escapes-- strip them
    body = re.sub(r'&[a-z]+;', '', body)
    title = re.sub(r'&[a-z]+;', '', title)
    locstr = re.sub(r'&[a-z]+;', '', locstr)
    outstr += '<VolunteerOpportunity>'
    outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (item_id)
    outstr += '<sponsoringOrganizationIDs><sponsoringOrganizationID>0</sponsoringOrganizationID></sponsoringOrganizationIDs>'
    outstr += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>0</volunteerHubOrganizationID></volunteerHubOrganizationIDs>'
    outstr += '<title>%s</title>' % (title)
    outstr += '<detailURL>%s</detailURL>' % (url)
    # escape rather than CDATA-wrap the body
    esc_body = xml.sax.saxutils.escape(body)
    esc_body100 = xml.sax.saxutils.escape(body[0:100])
    outstr += '<description>%s</description>' % (esc_body)
    outstr += '<abstract>%s</abstract>' % (esc_body100 + "...")
    outstr += '<lastUpdated>%s</lastUpdated>' % (datetimestr)
    # TODO: expires
    # TODO: synthesize location from metro...
    outstr += '<locations><location>'
    outstr += '<name>%s</name>' % (xml.sax.saxutils.escape(locstr))
    # most listings don't geocode; fall back to the metro's lat/lng
    lat, lng = "", ""
    try:
      domain, unused = url.split("vol/")
      lat, lng = CL_LATLONGS[domain].split(",")
    except (ValueError, KeyError):
      # unknown metro or unexpected URL shape -- leave lat/lng empty
      pass
    outstr += '<latitude>%s</latitude>' % (lat)
    outstr += '<longitude>%s</longitude>' % (lng)
    outstr += '</location></locations>'
    outstr += '<dateTimeDurations><dateTimeDuration>'
    outstr += '<openEnded>No</openEnded>'
    outstr += '<startDate>%s</startDate>' % (datestr)
    # TODO: endDate = startDate + N=14 days?
    # TODO: timezone???
    outstr += '</dateTimeDuration></dateTimeDurations>'
    # TODO: categories???
    outstr += '</VolunteerOpportunity>'
    numopps += 1
  outstr += '</VolunteerOpportunities>'
  outstr += '</FootprintFeed>'

  return outstr, numorgs, numopps
def parse(instr, maxrec, progress):
    """return FPXML given diy (do-it-yourself) TSV feed data.

    instr: TSV text; the first line is the column header.
    maxrec: stop after this many records (<=0 means no limit).
    progress: unused here; kept for parser-interface consistency.
    Returns (fpxml_string, numorgs, numopps).
    """
    org_id = "140"
    mission_statement = "Do it yourself volunteer opportunities."
    org_desc = "Do it yourself volunteer opportunities"

    today = datetime.now()
    last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")

    numorgs = 1
    numopps = 0
    xmlh.print_progress("loading diy custom TSV...")

    # convert to footprint format
    outstr = '<?xml version="1.0" ?>'
    outstr += '<FootprintFeed schemaVersion="0.1">'
    outstr += '<FeedInfo>'
    outstr += xmlh.output_val('providerID', org_id)
    outstr += xmlh.output_val('providerName', "diy")
    outstr += xmlh.output_val('feedID', "diy")
    outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
    outstr += xmlh.output_val('providerURL', "http://www.allforgood.org/")
    outstr += '</FeedInfo>'
    outstr += '<Organizations><Organization>'
    outstr += xmlh.output_val('organizationID', org_id)
    outstr += '<nationalEIN></nationalEIN>'
    outstr += '<name>allforgood.org</name>'
    outstr += xmlh.output_val('missionStatement', mission_statement)
    outstr += xmlh.output_val('description', org_desc)
    outstr += '<location>'
    outstr += xmlh.output_val("city", "San Francisco")
    outstr += xmlh.output_val("region", "CA")
    outstr += xmlh.output_val("postalCode", "94105")
    outstr += '</location>'
    outstr += '<organizationURL>http://www.allforgood.org/</organizationURL>'
    outstr += '<donateURL>http://www.allforgood.org/</donateURL>'
    outstr += '<logoURL>http://www.allforgood.org/</logoURL>'
    outstr += '<detailURL>http://www.allforgood.org/</detailURL>'
    outstr += '</Organization></Organizations>'
    outstr += '<VolunteerOpportunities>'

    lines = instr.split("\n")
    header = lines.pop(0).strip().split("\t")

    for i, line in enumerate(lines):
        row = line.strip().split("\t")
        if maxrec > 0 and i > maxrec:
            break

        # test the raw field values before CDATA-wrapping: the original
        # wrapped title first, so the emptiness check could never fire
        raw_title = get_field("title", row, header)
        url = get_field("url", row, header)
        if not raw_title or not url:
            continue
        title = '<![CDATA[' + raw_title + ']]>'

        sponsor = get_field("sponsoringOrganization", row, header)
        desc = ('<![CDATA[' + sponsor + ': ' +
                get_field("description", row, header) +
                ' Areas of interest: ' +
                get_field("subjectArea", row, header) + ' Tags: ' +
                get_field("keywords", row, header) + ']]>')

        start_date = last_updated
        outstr += '<VolunteerOpportunity>'
        outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (
            str(i))
        outstr += '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (
            org_id)
        outstr += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (
            org_id)
        outstr += '<self_directed>Yes</self_directed>'
        outstr += '<title>%s</title>' % (title)
        outstr += '<detailURL><![CDATA[%s]]></detailURL>' % (url)
        outstr += '<description>%s</description>' % (desc)
        outstr += '<abstract>%s</abstract>' % (desc)
        outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
        outstr += '<dateTimeDurations><dateTimeDuration>'
        outstr += '<startDate>%s</startDate>' % (start_date)
        outstr += '<openEnded>Yes</openEnded>'
        outstr += '</dateTimeDuration></dateTimeDurations>'
        outstr += '<locations><location><virtual>Yes</virtual></location></locations>'
        outstr += '</VolunteerOpportunity>'
        numopps += 1

    outstr += '</VolunteerOpportunities>'
    outstr += '</FootprintFeed>'

    return outstr, numorgs, numopps
Beispiel #15
0
def parse(instr, maxrec, progress):
    """return FPXML given diy (do-it-yourself) TSV feed data.

    instr: TSV text; the first line is the column header.
    maxrec: stop after this many records (<=0 means no limit).
    progress: unused here; kept for parser-interface consistency.
    Returns (fpxml_string, numorgs, numopps).
    """
    org_id = "140"
    mission_statement = "Do it yourself volunteer opportunities."
    org_desc = "Do it yourself volunteer opportunities"

    today = datetime.now()
    last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")

    numorgs = 1
    numopps = 0
    xmlh.print_progress("loading diy custom TSV...")

    # convert to footprint format
    outstr = '<?xml version="1.0" ?>'
    outstr += '<FootprintFeed schemaVersion="0.1">'
    outstr += "<FeedInfo>"
    outstr += xmlh.output_val("providerID", org_id)
    outstr += xmlh.output_val("providerName", "diy")
    outstr += xmlh.output_val("feedID", "diy")
    outstr += xmlh.output_val("createdDateTime", xmlh.current_ts())
    outstr += xmlh.output_val("providerURL", "http://www.allforgood.org/")
    outstr += "</FeedInfo>"
    outstr += "<Organizations><Organization>"
    outstr += xmlh.output_val("organizationID", org_id)
    outstr += "<nationalEIN></nationalEIN>"
    outstr += "<name>allforgood.org</name>"
    outstr += xmlh.output_val("missionStatement", mission_statement)
    outstr += xmlh.output_val("description", org_desc)
    outstr += "<location>"
    outstr += xmlh.output_val("city", "San Francisco")
    outstr += xmlh.output_val("region", "CA")
    outstr += xmlh.output_val("postalCode", "94105")
    outstr += "</location>"
    outstr += "<organizationURL>http://www.allforgood.org/</organizationURL>"
    outstr += "<donateURL>http://www.allforgood.org/</donateURL>"
    outstr += "<logoURL>http://www.allforgood.org/</logoURL>"
    outstr += "<detailURL>http://www.allforgood.org/</detailURL>"
    outstr += "</Organization></Organizations>"
    outstr += "<VolunteerOpportunities>"

    lines = instr.split("\n")
    header = lines.pop(0).strip().split("\t")

    for i, line in enumerate(lines):
        row = line.strip().split("\t")
        if maxrec > 0 and i > maxrec:
            break

        # test the raw field values before CDATA-wrapping: the original
        # wrapped title first, so "not title" could never be true
        raw_title = get_field("title", row, header)
        url = get_field("url", row, header)
        if not raw_title or not url:
            continue
        title = "<![CDATA[" + raw_title + "]]>"

        sponsor = get_field("sponsoringOrganization", row, header)
        desc = (
            "<![CDATA["
            + sponsor
            + ": "
            + get_field("description", row, header)
            + " Areas of interest: "
            + get_field("subjectArea", row, header)
            + " Tags: "
            + get_field("keywords", row, header)
            + "]]>"
        )

        start_date = last_updated
        outstr += "<VolunteerOpportunity>"
        outstr += "<volunteerOpportunityID>%s</volunteerOpportunityID>" % (str(i))
        outstr += (
            "<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>"
            % (org_id)
        )
        outstr += (
            "<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>"
            % (org_id)
        )
        outstr += "<self_directed>Yes</self_directed>"
        outstr += "<title>%s</title>" % (title)
        outstr += "<detailURL><![CDATA[%s]]></detailURL>" % (url)
        outstr += "<description>%s</description>" % (desc)
        outstr += "<abstract>%s</abstract>" % (desc)
        outstr += "<lastUpdated>%s</lastUpdated>" % (last_updated)
        outstr += "<dateTimeDurations><dateTimeDuration>"
        outstr += "<startDate>%s</startDate>" % (start_date)
        outstr += "<openEnded>Yes</openEnded>"
        outstr += "</dateTimeDuration></dateTimeDurations>"
        outstr += "<locations><location><virtual>Yes</virtual></location></locations>"
        outstr += "</VolunteerOpportunity>"
        numopps += 1

    outstr += "</VolunteerOpportunities>"
    outstr += "</FootprintFeed>"

    return outstr, numorgs, numopps
def parse(instr, maxrec, progress):
    """return FPXML given sparked.com feed data.

    instr: sparked XML feed text (parsed with xmlh.parse_or_die).
    maxrec: stop after this many <challenge> nodes (<=0 means no limit).
    progress: unused here; kept for parser-interface consistency.
    Returns (fpxml_string, numorgs, numopps).
    """
    feed = xmlh.parse_or_die(instr.encode('utf-8'))

    org_id = str(139)
    mission_statement = "Sparked makes it easy for people with busy lives to help nonprofits get valuable work done when it's convenient. We call it microvolunteering. Through the convenience of the Internet, and with the collaboration of others, micro-volunteers use their professional skills to help causes they care about."
    org_desc = "Sparked is the world's first Microvolunteering network"

    today = datetime.now()
    last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
    start_date = last_updated

    numorgs = 1
    numopps = 0
    xmlh.print_progress("loading sparked.com custom XML...")

    # convert to footprint format
    outstr = '<?xml version="1.0" ?>'
    outstr += '<FootprintFeed schemaVersion="0.1">'
    outstr += '<FeedInfo>'
    outstr += xmlh.output_val('providerID', org_id)
    outstr += xmlh.output_val('providerName', "sparked")
    outstr += xmlh.output_val('feedID', "sparked")
    outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
    outstr += xmlh.output_val('providerURL', "http://www.sparked.com/")
    outstr += '</FeedInfo>'
    # 1 "organization" in sparked.com postings
    outstr += '<Organizations><Organization>'
    outstr += xmlh.output_val('organizationID', org_id)
    outstr += '<nationalEIN></nationalEIN>'
    outstr += '<name>sparked.com</name>'
    outstr += xmlh.output_val('missionStatement', mission_statement)
    outstr += xmlh.output_val('description', org_desc)
    outstr += '<location>'
    outstr += xmlh.output_val("city", "San Francisco")
    outstr += xmlh.output_val("region", "CA")
    outstr += xmlh.output_val("postalCode", "94105")
    outstr += '</location>'
    outstr += '<organizationURL>http://www.sparked.com/</organizationURL>'
    outstr += '<donateURL>http://www.sparked.com/</donateURL>'
    outstr += '<logoURL>http://www.sparked.com/imgver4/logo_sparked.gif</logoURL>'
    outstr += '<detailURL>http://www.sparked.com/</detailURL>'
    outstr += '</Organization></Organizations>'

    outstr += '\n<VolunteerOpportunities>\n'
    nodes = feed.getElementsByTagName('challenge')
    for i, node in enumerate(nodes):
        if maxrec > 0 and i > maxrec:
            break
        title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
        desc = '<![CDATA[' + xmlh.get_tag_val(node, "description") + ']]>'
        url = xmlh.get_tag_val(node, "url")

        start_date = last_updated
        open_ended = True
        # deadline format is MM/DD/YY, e.g. "02/15/11"
        mdy = xmlh.get_tag_val(node, "deadline")
        if mdy:
            try:
                # 2-digit year -> 20xx; reformat to ISO YYYY-MM-DD
                end_date = str(2000 +
                               int(mdy[6:])) + "-" + mdy[0:2] + "-" + mdy[3:5]
                open_ended = False
            except ValueError:
                # malformed deadline -- treat the opportunity as open-ended
                pass
        outstr += '<VolunteerOpportunity>'
        outstr += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (
            str(i))
        outstr += '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (
            org_id)
        outstr += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (
            org_id)
        outstr += '<micro>Yes</micro>'
        outstr += '<title>%s</title>' % (title)
        outstr += '<detailURL>%s</detailURL>' % (url)
        outstr += '<description>%s</description>' % (desc)
        outstr += '<abstract>%s</abstract>' % (desc)
        outstr += '<lastUpdated>%s</lastUpdated>' % (last_updated)
        outstr += '<dateTimeDurations><dateTimeDuration>'
        outstr += '<startDate>%s</startDate>' % (start_date)
        if open_ended:
            outstr += '<openEnded>Yes</openEnded>'
        else:
            outstr += '<openEnded>No</openEnded>'
            outstr += '<endDate>%s</endDate>' % (end_date)
        outstr += '</dateTimeDuration></dateTimeDurations>'
        outstr += '<locations><location><virtual>Yes</virtual></location></locations>'
        outstr += '</VolunteerOpportunity>\n'
        numopps += 1
    outstr += '</VolunteerOpportunities>'
    outstr += '</FootprintFeed>'

    return outstr, numorgs, numopps
Beispiel #17
0
    def parse(instr, maxrecs, progress):
        """Return FPXML given servenet-style opportunity XML.

        instr: raw feed text containing <VolunteerOpportunity> elements
            with db:-namespaced children.
        maxrecs: stop after this many opportunities (<=0 means no limit).
        progress: if true, print periodic rate statistics.
        Returns (fpxml_string, numorgs, numopps).

        NOTE(review): uses Python-2 print-statement syntax and free
        variables (known_elnames, providerID, ORGS, ...) defined elsewhere.
        """
        numorgs = numopps = 0
        # rewrite namespaced tags (<db:foo>) to underscore form (<db_foo>)
        # so the simple parser can handle them
        instr = re.sub(r'<(/?db):', r'<\1_', instr)
        opps = re.findall(r'<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                          instr, re.DOTALL)
        volopps = ""
        for i, oppstr in enumerate(opps):
            #if progress and i > 0 and i % 250 == 0:
            #  print str(datetime.now())+": ", i, " opportunities processed."
            if (maxrecs > 0 and i > maxrecs):
                break
            xmlh.print_rps_progress("opps", progress, i, maxrecs)

            item = xmlh.simple_parser(oppstr, known_elnames, progress=False)

            # presumably registers/dedupes the org into ORGS and returns its
            # id -- TODO confirm against register_org's definition
            orgid = register_org(item)

            # logoURL -- sigh, this is for the opportunity not the org
            volopps += '<VolunteerOpportunity>'
            volopps += xmlh.output_val('volunteerOpportunityID', str(i))
            volopps += xmlh.output_val('sponsoringOrganizationID', str(orgid))
            volopps += xmlh.output_node('volunteerHubOrganizationID', item,
                                        "LocalID")
            volopps += xmlh.output_node('title', item, "Title")
            volopps += xmlh.output_node('abstract', item, "Description")
            volopps += xmlh.output_node('description', item, "Description")
            volopps += xmlh.output_node('detailURL', item, "DetailURL")
            # -8888 appears to be the feed's "unknown count" sentinel
            volopps += xmlh.output_val('volunteersNeeded', "-8888")

            try:
                oppdates = item.getElementsByTagName("OpportunityDate")
            except:
                oppdates = []

            # only the first OpportunityDate is used; extras are warned about
            if len(oppdates) > 1:
                print datetime.now(), \
                    "parse_servenet.py: only 1 OpportunityDate supported."
                #return None
                oppdate = oppdates[0]
            elif len(oppdates) == 0:
                oppdate = None
            else:
                oppdate = oppdates[0]
            volopps += '<dateTimeDurations><dateTimeDuration>'

            if oppdate:
                volopps += xmlh.output_val('openEnded', 'No')
                # ISO-8601-style duration, e.g. P3M from quantity+unit
                volopps += xmlh.output_val(
                    'duration',
                    'P%s%s' % (xmlh.get_tag_val(oppdate, "DurationQuantity"),
                               xmlh.get_tag_val(oppdate, "DurationUnit")))
                volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
                volopps += xmlh.output_node('startDate', oppdate, "StartDate")
                volopps += xmlh.output_node('endDate', oppdate, "EndDate")
            else:
                # no date info -> open-ended opportunity
                volopps += xmlh.output_val('openEnded', 'Yes')
                volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
            volopps += '</dateTimeDuration></dateTimeDurations>'

            volopps += '<locations>'
            try:
                opplocs = item.getElementsByTagName("Location")
            except:
                opplocs = []
            for opploc in opplocs:
                volopps += '<location>'
                virtual_tag = opploc.getElementsByTagName("Virtual")
                if virtual_tag and xmlh.get_tag_val(
                        opploc, "Virtual").lower() == "yes":
                    volopps += xmlh.output_val('virtual', 'Yes')
                else:
                    volopps += xmlh.output_node('region', opploc,
                                                "StateOrProvince")
                    volopps += xmlh.output_node('country', opploc, "Country")
                    volopps += xmlh.output_node('postalCode', opploc,
                                                "ZipOrPostalCode")
                volopps += '</location>'
            volopps += '</locations>'
            volopps += '<categoryTags/>'
            volopps += '</VolunteerOpportunity>'
            numopps += 1

        # convert to footprint format
        outstr = '<?xml version="1.0" ?>'
        outstr += '<FootprintFeed schemaVersion="0.1">'
        outstr += '<FeedInfo>'
        outstr += xmlh.output_val('providerID', providerID)
        outstr += xmlh.output_val('providerName', providerName)
        outstr += xmlh.output_val('feedID', feedID)
        outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
        outstr += xmlh.output_val('providerURL', providerURL)
        outstr += xmlh.output_val('description', feedDescription)
        # TODO: capture ts -- use now?!
        outstr += '</FeedInfo>'

        # hardcoded: Organization
        outstr += '<Organizations>'
        for key in ORGS:
            outstr += ORGS[key]
            numorgs += 1
        outstr += '</Organizations>'
        outstr += '<VolunteerOpportunities>'
        outstr += volopps
        outstr += '</VolunteerOpportunities>'
        outstr += '</FootprintFeed>'

        #outstr = re.sub(r'><([^/])', r'>\n<\1', outstr)
        return outstr, numorgs, numopps
  today = datetime.now()
  last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
  start_date = last_updated

  numorgs = 1
  numopps = 0
  xmlh.print_progress("loading idealist.xml custom XML...")

  # convert to footprint format
  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  outstr += xmlh.output_val('providerID', org_id)
  outstr += xmlh.output_val('providerName', "idealist")
  outstr += xmlh.output_val('feedID', "idealist")
  outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
  outstr += xmlh.output_val('providerURL', "http://www.idealist.org/")
  outstr += '</FeedInfo>'
  # 1 "organization" in idealist.org postings
  outstr += '<Organizations><Organization>'
  outstr += xmlh.output_val('organizationID', org_id)
  outstr += '<nationalEIN></nationalEIN>'
  outstr += '<name>idealist.org</name>'
  outstr += xmlh.output_val('missionStatement', mission_statement)
  outstr += xmlh.output_val('description', org_desc)
  outstr += '<location>'
  outstr += xmlh.output_val("city", "New York")
  outstr += xmlh.output_val("region", "NY")
  outstr += xmlh.output_val("postalCode", "10001")
  outstr += '</location>'
  outstr += '<organizationURL>http://www.idealist.org/</organizationURL>'
def set_default_time_elem(parent, entity, tagname, timest=None):
    """footprint macro: default *tagname* under *entity* to a timestamp and
    attach the default olsonTZ attribute.

    timest defaults to the current timestamp at call time.  The original
    default of ``timest=xmlh.current_ts()`` was evaluated once at import
    time, so every later call silently reused a stale timestamp.
    """
    if timest is None:
        timest = xmlh.current_ts()
    cdt = xmlh.set_default_value(parent, entity, tagname, timest)
    xmlh.set_default_attr(parent, cdt, "olsonTZ", "America/Los_Angeles")
Beispiel #20
0
def parse(instr, maxrecs, progress):
    """return FPXML given usaservice data

    instr: feed text, one RSS-like <item> record per line, with
        db:-namespaced fields.
    maxrecs: stop after this many records (<=0 means no limit).
    progress: if true, print periodic rate statistics.
    Returns (fpxml_string, numorgs, numopps), or None on a malformed
    record (unlike the other parsers in this file, which never do).

    NOTE(review): uses Python-2 print-statement syntax.
    """
    # TODO: progress
    # element names the simple parser should recognize in each record
    known_elnames = [
        'channel',
        'db:abstract',
        'db:address',
        'db:attendee_count',
        'db:categories',
        'db:city',
        'db:country',
        'db:county',
        'db:dateTime',
        'db:event',
        'db:eventType',
        'db:guest_total',
        'db:host',
        'db:latitude',
        'db:length',
        'db:longitude',
        'db:rsvp',
        'db:scheduledTime',
        'db:state',
        'db:street',
        'db:title',
        'db:venue_name',
        'db:zipcode',
        'description',
        'docs',
        'guid',
        'item',
        'language',
        'link',
        'pubDate',
        'rss',
        'title',
    ]

    # convert to footprint format
    s = '<?xml version="1.0" ?>'
    s += '<FootprintFeed schemaVersion="0.1">'
    s += '<FeedInfo>'
    # TODO: assign provider IDs?
    s += '<providerID>101</providerID>'
    s += '<providerName>usaservice.org</providerName>'
    s += '<feedID>1</feedID>'
    s += '<createdDateTime>%s</createdDateTime>' % xmlh.current_ts()
    s += '<providerURL>http://www.usaservice.org/</providerURL>'
    s += '<description>Syndicated events</description>'
    # TODO: capture ts -- use now?!
    s += '</FeedInfo>'

    numorgs = numopps = 0
    # hardcoded: Organization -- single empty placeholder org, id 0
    s += '<Organizations>'
    s += '<Organization>'
    s += '<organizationID>0</organizationID>'
    s += '<nationalEIN></nationalEIN>'
    s += '<name></name>'
    s += '<missionStatement></missionStatement>'
    s += '<description></description>'
    s += '<location><city></city><region></region><postalCode></postalCode></location>'
    s += '<organizationURL></organizationURL>'
    s += '<donateURL></donateURL>'
    s += '<logoURL></logoURL>'
    s += '<detailURL></detailURL>'
    s += '</Organization>'
    numorgs += 1
    s += '</Organizations>'

    s += '<VolunteerOpportunities>'

    # rewrite namespaced tags (<db:foo>) to underscore form (<db_foo>)
    instr = re.sub(r'<(/?db):', r'<\1_', instr)
    for i, line in enumerate(instr.splitlines()):
        if (maxrecs > 0 and i > maxrecs):
            break
        xmlh.print_rps_progress("opps", progress, i, maxrecs)
        item = xmlh.simple_parser(line, known_elnames, progress=False)

        # unmapped: db_rsvp  (seems to be same as link, but with #rsvp at end of url?)
        # unmapped: db_host  (no equivalent?)
        # unmapped: db_county  (seems to be empty)
        # unmapped: attendee_count
        # unmapped: guest_total
        # unmapped: db_title   (dup of title, above)
        s += '<VolunteerOpportunity>'
        s += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (
            xmlh.get_tag_val(item, "guid"))
        # hardcoded: sponsoringOrganizationID
        s += '<sponsoringOrganizationIDs><sponsoringOrganizationID>0</sponsoringOrganizationID></sponsoringOrganizationIDs>'
        # hardcoded: volunteerHubOrganizationID
        s += '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>0</volunteerHubOrganizationID></volunteerHubOrganizationIDs>'
        s += '<title>%s</title>' % (xmlh.get_tag_val(item, "title"))
        s += '<abstract>%s</abstract>' % (xmlh.get_tag_val(item, "abstract"))
        # -8888 appears to be the feed's "unknown count" sentinel
        s += '<volunteersNeeded>-8888</volunteersNeeded>'

        # exactly one scheduled time per record is supported; otherwise
        # the whole parse is aborted (returns None, not a tuple)
        dbscheduledTimes = item.getElementsByTagName("db_scheduledTime")
        if (dbscheduledTimes.length != 1):
            print datetime.now(
            ), "parse_usaservice: only 1 db_scheduledTime supported."
            return None
        dbscheduledTime = dbscheduledTimes[0]
        s += '<dateTimeDurations><dateTimeDuration>'
        # length of "" or "-1" means no fixed duration -> open-ended
        length = xmlh.get_tag_val(dbscheduledTime, "db_length")
        if length == "" or length == "-1":
            s += '<openEnded>Yes</openEnded>'
        else:
            s += '<openEnded>No</openEnded>'
        # db_dateTime is "<date> <time>" separated by a single space
        date, time = xmlh.get_tag_val(dbscheduledTime,
                                      "db_dateTime").split(" ")
        s += '<startDate>%s</startDate>' % (date)
        # TODO: timezone???
        s += '<startTime>%s</startTime>' % (time)
        s += '</dateTimeDuration></dateTimeDurations>'

        # exactly one address per record, same abort behavior as above
        dbaddresses = item.getElementsByTagName("db_address")
        if (dbaddresses.length != 1):
            print datetime.now(
            ), "parse_usaservice: only 1 db_address supported."
            return None
        dbaddress = dbaddresses[0]
        s += '<locations><location>'
        s += '<name>%s</name>' % (xmlh.get_tag_val(item, "db_venue_name"))
        s += '<streetAddress1>%s</streetAddress1>' % (xmlh.get_tag_val(
            dbaddress, "db_street"))
        s += '<city>%s</city>' % (xmlh.get_tag_val(dbaddress, "db_city"))
        s += '<region>%s</region>' % (xmlh.get_tag_val(dbaddress, "db_state"))
        s += '<country>%s</country>' % (xmlh.get_tag_val(
            dbaddress, "db_country"))
        s += '<postalCode>%s</postalCode>' % (xmlh.get_tag_val(
            dbaddress, "db_zipcode"))
        s += '<latitude>%s</latitude>' % (xmlh.get_tag_val(
            item, "db_latitude"))
        s += '<longitude>%s</longitude>' % (xmlh.get_tag_val(
            item, "db_longitude"))
        s += '</location></locations>'

        # NOTE(review): "type" shadows the builtin; harmless locally
        type = xmlh.get_tag_val(item, "db_eventType")
        s += '<categoryTags><categoryTag>%s</categoryTag></categoryTags>' % (
            type)

        s += '<contactName>%s</contactName>' % xmlh.get_tag_val(
            item, "db_host")
        s += '<detailURL>%s</detailURL>' % (xmlh.get_tag_val(item, "link"))
        s += '<description>%s</description>' % (xmlh.get_tag_val(
            item, "description"))
        # normalize RFC-822-style pubDate ("15 Jan 2009 ...") to ISO format
        pubdate = xmlh.get_tag_val(item, "pubDate")
        if re.search("[0-9][0-9] [A-Z][a-z][a-z] [0-9][0-9][0-9][0-9]",
                     pubdate):
            # TODO: parse() is ignoring timzone...
            ts = dateutil.parser.parse(pubdate)
            pubdate = ts.strftime("%Y-%m-%dT%H:%M:%S")
        s += '<lastUpdated>%s</lastUpdated>' % (pubdate)
        s += '</VolunteerOpportunity>'
        numopps += 1

    s += '</VolunteerOpportunities>'
    s += '</FootprintFeed>'
    #s = re.sub(r'><([^/])', r'>\n<\1', s)
    return s, numorgs, numopps
def parse_fast(instr, maxrecs, progress):
    """Fast FPXML normalizer that doesn't check correctness,
  i.e. input must be pre-checked by caller.

    Re-emits the feed with default values filled in for commonly-missing
    fields (feedID, timestamps, volunteersNeeded, location country, ...).

    Args:
      instr: string containing the FPXML feed.
      maxrecs: stop after this many VolunteerOpportunity records
        (0 or negative means unlimited).
      progress: unused here (progress printing is commented out);
        kept for parser-interface compatibility.

    Returns:
      (outstr, numorgs, numopps): the normalized FPXML string and the
      number of Organization and VolunteerOpportunity records emitted.
    """
    numorgs = numopps = 0
    outstr_list = ['<?xml version="1.0" ?>']
    outstr_list.append('<FootprintFeed schemaVersion="0.1">')

    # note: processes Organizations first, so ID lookups work
    for match in re.finditer(re.compile('<FeedInfo>.+?</FeedInfo>', re.DOTALL),
                             instr):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        xmlh.set_default_value(node, node.firstChild, "feedID", "0")
        set_default_time_elem(node, node.firstChild, "createdDateTime")
        outstr_list.append(xmlh.prettyxml(node, True))

    outstr_list.append('<Organizations>')
    for match in re.finditer(
            re.compile('<Organization>.+?</Organization>', re.DOTALL), instr):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        numorgs += 1
        outstr_list.append(xmlh.prettyxml(node, True))
    outstr_list.append('</Organizations>')

    outstr_list.append('<VolunteerOpportunities>')
    for match in re.finditer(
            re.compile('<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                       re.DOTALL), instr):
        opp = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)

        numopps += 1
        if (maxrecs > 0 and numopps > maxrecs):
            break
        #if progress and numopps % 250 == 0:
        #  print datetime.now(), ": ", numopps, " records generated."

        # these set_default_* functions don't do anything if the field
        # doesn't already exist
        xmlh.set_default_value(opp, opp, "volunteersNeeded", -8888)
        xmlh.set_default_value(opp, opp, "paid", "No")
        xmlh.set_default_value(opp, opp, "sexRestrictedTo", "Neither")
        xmlh.set_default_value(opp, opp, "language", "English")
        set_default_time_elem(opp, opp, "lastUpdated")
        set_default_time_elem(opp, opp, "expires",
                              xmlh.current_ts(DEFAULT_EXPIRATION))

        # best-effort: a malformed record shouldn't abort the whole feed
        try:
            opplocs = opp.getElementsByTagName("location")
        except Exception:
            opplocs = []

        for loc in opplocs:
            xmlh.set_default_value(opp, loc, "virtual", "No")
            xmlh.set_default_value(opp, loc, "country", "US")

        try:
            dttms = opp.getElementsByTagName("dateTimeDurations")
        except Exception:
            dttms = []

        for dttm in dttms:
            # redundant xmlh.set_default_value(opp, dttm, "openEnded", "No")
            xmlh.set_default_value(opp, dttm, "iCalRecurrence", "")
            # BUGFIX: getElementsByTagName returns a (possibly empty)
            # NodeList, never None -- the old "== None" comparison was
            # always False, so timeFlexible always defaulted to "No".
            # Test emptiness instead: flexible iff no start AND no end time.
            if (not dttm.getElementsByTagName("startTime")
                    and not dttm.getElementsByTagName("endTime")):
                set_default_time_elem(opp, dttm, "timeFlexible", "Yes")
            else:
                set_default_time_elem(opp, dttm, "timeFlexible", "No")
            xmlh.set_default_value(opp, dttm, "openEnded", "No")

        try:
            time_elems = opp.getElementsByTagName("startTime")
            time_elems += opp.getElementsByTagName("endTime")
        except Exception:
            time_elems = []

        for el in time_elems:
            # presumably the feed's house default timezone -- TODO confirm
            xmlh.set_default_attr(opp, el, "olsonTZ", "America/Los_Angeles")

        str_opp = xmlh.prettyxml(opp, True)

        outstr_list.append(str_opp)

    outstr_list.append('</VolunteerOpportunities>')

    outstr_list.append('</FootprintFeed>')
    return "".join(outstr_list), numorgs, numopps
def parse(instr, maxrecs = 0, progress = False):
  """parser main."""
  data = {}
  updated = {}
  maxrow, maxcol = parse_gspreadsheet(instr, data, updated, progress)
  if DEBUG and progress:
    print str(datetime.now())+": maxrow="+str(maxrow)+" maxcol="+str(maxcol)

  # find header row: look for "opportunity title" (case insensitive)
  header_row, header_startcol = find_header_row(data, 'opportunity\s*title')

  header_colidx = {}
  header_names = {}
  header_col = header_startcol
  while True:
    header_str = cellval(data, header_row, header_col)
    if not header_str:
      break
    field_name = None
    header_str = header_str.lower()
    if header_str.find("title") >= 0:
      field_name = "OpportunityTitle"
    elif (header_str.find("organization") >= 0 and
          header_str.find("sponsor") >= 0):
      field_name = "SponsoringOrganization"
    elif header_str.find("description") >= 0:
      field_name = "Description"
    elif header_str.find("skills") >= 0:
      field_name = "Skills"
    elif header_str.find("location") >= 0 and header_str.find("name") >= 0:
      field_name = "LocationName"
    elif header_str.find("street") >= 0:
      field_name = "LocationStreet"
    elif header_str.find("city") >= 0:
      field_name = "LocationCity"
    elif header_str.find("state") >= 0 or header_str.find("province") >= 0:
      field_name = "LocationProvince"
    elif header_str.find("zip") >= 0 or header_str.find("postal") >= 0:
      field_name = "LocationPostalCode"
    elif header_str.find("country") >= 0:
      field_name = "LocationCountry"
    elif header_str.find("start") >= 0 and header_str.find("date") >= 0:
      field_name = "StartDate"
    elif header_str.find("start") >= 0 and header_str.find("time") >= 0:
      field_name = "StartTime"
    elif header_str.find("end") >= 0 and header_str.find("date") >= 0:
      field_name = "EndDate"
    elif header_str.find("end") >= 0 and header_str.find("time") >= 0:
      field_name = "EndTime"
    elif header_str.find("contact") >= 0 and header_str.find("name") >= 0:
      field_name = "ContactName"
    elif header_str.find("email") >= 0 or header_str.find("e-mail") >= 0:
      field_name = "ContactEmail"
    elif header_str.find("phone") >= 0:
      field_name = "ContactPhone"
    elif header_str.find("website") >= 0 or header_str.find("url") >= 0:
      field_name = "URL"
    elif header_str.find("often") >= 0:
      field_name = "Frequency"
    elif header_str.find("days") >= 0 and header_str.find("week") >= 0:
      field_name = "DaysOfWeek"
    elif header_str.find("paid") >= 0:
      field_name = "Paid"
    elif header_str.find("self_directed") >= 0:
      field_name = "SelfDirected"
    elif header_str.find("commitment") >= 0 or header_str.find("hours") >= 0:
      field_name = "CommitmentHours"
    elif header_str.find("age") >= 0 and header_str.find("min") >= 0:
      field_name = "MinimumAge"
    elif header_str.find("kid") >= 0:
      field_name = "KidFriendly"
    elif header_str.find("senior") >= 0 and header_str.find("only") >= 0:
      field_name = "SeniorsOnly"
    elif header_str.find("sex") >= 0 or header_str.find("gender") >= 0:
      field_name = "SexRestrictedTo"
    elif header_str.find("volunteer appeal") >= 0:
      field_name = None
    elif header_str.find("volunteerOptIn") >= 0:
      field_name = None
    elif header_str.find("booksOptIn") >= 0:
      field_name = None
    else:
      parser_error("couldn't map header '"+header_str+"' to a field name.")
    if field_name != None:
      header_colidx[field_name] = header_col
      header_names[header_col] = field_name
      #print header_str, "=>", field_name
    header_col += 1

  if len(header_names) < 10:
    parser_error("too few fields found: "+str(len(header_names)))

  # check to see if there's a header-description row
  header_desc = cellval(data, header_row+1, header_startcol)
  if not header_desc:
    parser_error("empty spreadsheet? blank row not allowed below header row")
    return '', 0, 0
    #data_startrow = 3
  else:
    header_desc = header_desc.lower()
    data_startrow = header_row + 1
    if header_desc.find("up to") >= 0:
      data_startrow += 1


  # find the data
  global CURRENT_ROW
  CURRENT_ROW = row = data_startrow
  blankrows = 0
  volopps = '<VolunteerOpportunities>'
  numorgs = numopps = 0
  while True:
    blankrow = True
    #rowstr = "row="+str(row)+"\n"
    record = {}
    record['LastUpdated'] = '2000-01-01'
    for field_name in header_colidx:
      col = header_colidx[field_name]
      val = cellval(data, row, col)
      if val:
        blankrow = False
      else:
        val = ""
      #rowstr += "  "+field_name+"="+val+"\n"
      record[field_name] = val
      key = 'R'+str(row)+'C'+str(col)
      if (key in updated and
          updated[key] > record['LastUpdated']):
        record['LastUpdated'] = updated[key]

    if blankrow:
      blankrows += 1
      if blankrows > MAX_BLANKROWS:
        break
    else:
      numopps += 1
      blankrows = 0
      record['oppid'] = str(numopps)
      volopps += record_to_fpxml(record)
    row += 1
    CURRENT_ROW = row

  CURRENT_ROW = None
  if DEBUG and progress:
    print str(datetime.now())+": ", numopps, "opportunities found."
  volopps += '</VolunteerOpportunities>'

  outstr = '<?xml version="1.0" ?>'
  outstr += '<FootprintFeed schemaVersion="0.1">'
  outstr += '<FeedInfo>'
  # providerID replaced by caller
  outstr += '<providerID></providerID>'
  # providerName replaced by caller
  outstr += '<providerName></providerName>'
  outstr += '<feedID>1</feedID>'
  outstr += '<createdDateTime>%s</createdDateTime>' % xmlh.current_ts()
  # providerURL replaced by caller
  outstr += '<providerURL></providerURL>'
  outstr += '<description></description>'
  outstr += '</FeedInfo>'
  outstr += "<Organizations>"
  for orgname in KNOWN_ORGS:
    outstr += "<Organization>"
    outstr += xmlh.output_val("organizationID", KNOWN_ORGS[orgname])
    outstr += xmlh.output_val("name", orgname, cdata=True)
    outstr += "</Organization>"
  outstr += "</Organizations>"
  outstr += volopps
  outstr += '</FootprintFeed>'

  #outstr = re.sub(r'><', '>\n<', outstr)
  #print outstr

  return outstr, numorgs, numopps