def init_footprint_xml():
    """Return the opening portion of the craigslist FPXML document.

    The returned string holds the XML declaration, the opening
    <FootprintFeed> tag, the <FeedInfo> section, a single placeholder
    organization with ID 0, and the opening <VolunteerOpportunities> tag.
    The caller is expected to append the opportunities and closing tags.
    """
    feed_info = [
        "<FeedInfo>",
        xmlh.output_val("providerID", "105"),
        xmlh.output_val("providerName", "craigslist"),
        xmlh.output_val("feedID", "craigslist"),
        xmlh.output_val("createdDateTime", xmlh.current_ts()),
        xmlh.output_val("providerURL", "http://www.craigslist.org/"),
        "</FeedInfo>",
    ]
    # no "organization" in craigslist postings, so emit one empty placeholder
    placeholder_org = [
        "<Organizations>",
        "<Organization>",
        "<organizationID>0</organizationID>",
        "<nationalEIN></nationalEIN>",
        "<name></name>",
        "<missionStatement></missionStatement>",
        "<description></description>",
        "<location>",
        xmlh.output_val("city", ""),
        xmlh.output_val("region", ""),
        xmlh.output_val("postalCode", ""),
        "</location>",
        "<organizationURL></organizationURL>",
        "<donateURL></donateURL>",
        "<logoURL></logoURL>",
        "<detailURL></detailURL>",
        "</Organization>",
        "</Organizations>",
    ]
    pieces = ['<?xml version="1.0" ?>', '<FootprintFeed schemaVersion="0.1">']
    pieces.extend(feed_info)
    pieces.extend(placeholder_org)
    pieces.append("<VolunteerOpportunities>")
    return "".join(pieces)
def parser(providerID, providerName, feedID, providerURL, feedDescription):
    """create an FPXML-compatible parser

    Builds a <FeedInfo> section from the arguments once, then returns a
    parse function that runs parse_fast() and substitutes that section for
    every <FeedInfo>...</FeedInfo> span in the result.

    Args:
      providerID, providerName, feedID, providerURL, feedDescription:
        values written into the generated <FeedInfo> section through
        xmlh.output_val().

    Returns:
      parse_func(instr, maxrecs, progress), which returns
      (fpxml_string, numorgs, numopps).
    """
    # createdDateTime is stamped when the parser is created, not per parse
    feedinfo = "".join([
        "<FeedInfo>",
        xmlh.output_val('providerID', providerID),
        xmlh.output_val('providerName', providerName),
        xmlh.output_val('feedID', feedID),
        xmlh.output_val('createdDateTime', xmlh.current_ts()),
        xmlh.output_val('providerURL', providerURL),
        xmlh.output_val('description', feedDescription),
        "</FeedInfo>",
    ])
    feedinfo_re = re.compile(r'<FeedInfo>.+?</FeedInfo>', re.DOTALL)

    def parse_func(instr, maxrecs, progress):
        """closure-- generated parse func"""
        outstr, numorgs, numopps = parse_fast(instr, maxrecs, progress)
        # Use a callable replacement: a plain string would be treated as a
        # template, so a backslash in the provider data would be read as an
        # escape or group reference instead of literal text.
        replaced = feedinfo_re.sub(lambda _match: feedinfo, outstr)
        return replaced, numorgs, numopps

    return parse_func
def parse_fast(instr, maxrecs, progress):
    """fast parser but doesn't check correctness,
    i.e. must be pre-checked by caller.

    Extracts <FeedInfo>, <Organization> and <VolunteerOpportunity> spans
    from instr with regular expressions, fills in default values on each,
    and re-serializes them into one FPXML document.

    Args:
      instr: FPXML text to normalize.
      maxrecs: maximum number of opportunities to emit; values <= 0 mean
        no limit.
      progress: unused here (the progress output is commented out).

    Returns:
      (fpxml_string, numorgs, numopps), where the counts are the number of
      organizations and opportunities actually written to the output.
    """
    feedinfo_re = re.compile('<FeedInfo>.+?</FeedInfo>', re.DOTALL)
    org_re = re.compile('<Organization>.+?</Organization>', re.DOTALL)
    opp_re = re.compile('<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                        re.DOTALL)

    numorgs = numopps = 0
    outstr_list = ['<?xml version="1.0" ?>',
                   '<FootprintFeed schemaVersion="0.1">']

    # note: processes Organizations first, so ID lookups work
    for match in feedinfo_re.finditer(instr):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        xmlh.set_default_value(node, node.firstChild, "feedID", "0")
        set_default_time_elem(node, node.firstChild, "createdDateTime")
        outstr_list.append(xmlh.prettyxml(node, True))

    outstr_list.append('<Organizations>')
    for match in org_re.finditer(instr):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        numorgs += 1
        outstr_list.append(xmlh.prettyxml(node, True))
    outstr_list.append('</Organizations>')

    outstr_list.append('<VolunteerOpportunities>')
    for match in opp_re.finditer(instr):
        # Test the limit before parsing and counting, so numopps equals the
        # number of records emitted (it used to end up at maxrecs + 1).
        if maxrecs > 0 and numopps >= maxrecs:
            break
        opp = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        numopps += 1
        #if progress and numopps % 250 == 0:
        #  print datetime.now(), ": ", numopps, " records generated."

        # NOTE(review): the original comment here says the set_default_*
        # helpers "dont do anything if the field doesnt already exists" --
        # confirm the exact semantics against xmlh.
        xmlh.set_default_value(opp, opp, "volunteersNeeded", -8888)
        xmlh.set_default_value(opp, opp, "paid", "No")
        xmlh.set_default_value(opp, opp, "sexRestrictedTo", "Neither")
        xmlh.set_default_value(opp, opp, "language", "English")
        set_default_time_elem(opp, opp, "lastUpdated")
        set_default_time_elem(opp, opp, "expires",
                              xmlh.current_ts(DEFAULT_EXPIRATION))

        try:
            opplocs = opp.getElementsByTagName("location")
        except Exception:
            opplocs = []
        for loc in opplocs:
            xmlh.set_default_value(opp, loc, "virtual", "No")
            xmlh.set_default_value(opp, loc, "country", "US")

        try:
            dttms = opp.getElementsByTagName("dateTimeDurations")
        except Exception:
            dttms = []
        for dttm in dttms:
            xmlh.set_default_value(opp, dttm, "openEnded", "No")
            xmlh.set_default_value(opp, dttm, "iCalRecurrence", "")
            # getElementsByTagName returns a (possibly empty) list, never
            # None, so test for emptiness; the former "== None" comparison
            # could not succeed and timeFlexible always defaulted to "No".
            has_times = (dttm.getElementsByTagName("startTime") or
                         dttm.getElementsByTagName("endTime"))
            # NOTE(review): set_default_time_elem also sets an olsonTZ
            # attribute on timeFlexible; kept as in the original -- confirm
            # that this is intended.
            if has_times:
                set_default_time_elem(opp, dttm, "timeFlexible", "No")
            else:
                set_default_time_elem(opp, dttm, "timeFlexible", "Yes")

        try:
            time_elems = opp.getElementsByTagName("startTime")
            time_elems += opp.getElementsByTagName("endTime")
        except Exception:
            time_elems = []
        for el in time_elems:
            xmlh.set_default_attr(opp, el, "olsonTZ", "America/Los_Angeles")

        outstr_list.append(xmlh.prettyxml(opp, True))

    outstr_list.append('</VolunteerOpportunities>')
    outstr_list.append('</FootprintFeed>')
    return "".join(outstr_list), numorgs, numopps
def set_default_time_elem(parent, entity, tagname, timest=None):
    """footprint macro.

    Applies xmlh.set_default_value() for tagname with the given timestamp,
    then applies a default olsonTZ attribute of "America/Los_Angeles" to
    the value that call returns.

    Args:
      parent: passed through as the first argument of both xmlh helpers.
      entity: node handed to xmlh.set_default_value().
      tagname: name of the time element to default.
      timest: default value to use; when omitted (None), xmlh.current_ts()
        is evaluated at call time.
    """
    # A default of xmlh.current_ts() in the signature is evaluated once,
    # when the function is defined, so every later call would reuse that
    # stale timestamp. Resolve it per call instead.
    if timest is None:
        timest = xmlh.current_ts()
    cdt = xmlh.set_default_value(parent, entity, tagname, timest)
    xmlh.set_default_attr(parent, cdt, "olsonTZ", "America/Los_Angeles")
def parse(instr, maxrecs, progress):
    """return FPXML given craigslist data

    Args:
      instr: crawler cache data, handed to
        crawl_craigslist.parse_cache_file().
      maxrecs: maximum number of opportunities to emit; values <= 0 mean
        no limit.
      progress: when true, skip statistics are printed every 250 pages.

    Returns:
      (fpxml_string, numorgs, numopps).
    """
    if CL_LATLONGS is None:
        load_craigslist_latlongs()
    xmlh.print_progress("loading craigslist crawler output...")
    crawl_craigslist.parse_cache_file(instr, listings_only=True)
    xmlh.print_progress("loaded " + str(len(crawl_craigslist.pages)) +
                        " craigslist pages.")

    # convert to footprint format; fragments are collected in a list and
    # joined once rather than growing a string with +=
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        "<FeedInfo>",
        xmlh.output_val("providerID", "105"),
        xmlh.output_val("providerName", "craigslist"),
        xmlh.output_val("feedID", "craigslist"),
        xmlh.output_val("createdDateTime", xmlh.current_ts()),
        xmlh.output_val("providerURL", "http://www.craigslist.org/"),
        "</FeedInfo>",
    ]
    numorgs = numopps = 0

    # no "organization" in craigslist postings
    out.extend([
        "<Organizations>",
        "<Organization>",
        "<organizationID>0</organizationID>",
        "<nationalEIN></nationalEIN>",
        "<name></name>",
        "<missionStatement></missionStatement>",
        "<description></description>",
        "<location>",
        xmlh.output_val("city", ""),
        xmlh.output_val("region", ""),
        xmlh.output_val("postalCode", ""),
        "</location>",
        "<organizationURL></organizationURL>",
        "<donateURL></donateURL>",
        "<logoURL></logoURL>",
        "<detailURL></detailURL>",
        "</Organization>",
    ])
    numorgs += 1
    out.append("</Organizations>")

    skipped_listings = {"body": 0, "title": 0, "not-ok": 0}
    entity_re = re.compile(r"&[a-z]+;")
    out.append("<VolunteerOpportunities>")
    for i, url in enumerate(crawl_craigslist.pages):
        # Limit on records actually emitted; the former "i > maxrecs" test
        # on the page index allowed maxrecs + 1 records through.
        if maxrecs > 0 and numopps >= maxrecs:
            break

        page = crawl_craigslist.pages[url]
        ok = extract(page, "it's OK to distribute this " +
                     "charitable volunteerism opportunity")
        if ok == "":
            skipped_listings["not-ok"] += 1
            continue
        title = extract(page, "<title>(.+?)</title>")
        if title == "":
            skipped_listings["title"] += 1
            continue
        body = extract(page, '<div id="userbody">(.+?)<')
        if len(body) < 25:
            skipped_listings["body"] += 1
            continue

        item_id = extract(url, "/vol/(.+?)[.]html$")
        locstr = extract(page, "Location: (.+?)<")
        datestr = extract(page, "Date: (.+?)<")
        ts = dateutil.parser.parse(datestr)
        datetimestr = ts.strftime("%Y-%m-%dT%H:%M:%S")
        datestr = ts.strftime("%Y-%m-%d")

        xmlh.print_rps_progress("opps", progress, i, maxrecs)
        if progress and i > 0 and i % 250 == 0:
            # the total now covers all three categories listed in the
            # message (it used to leave out the no-redistrib count)
            total_skipped = (skipped_listings["title"] +
                             skipped_listings["body"] +
                             skipped_listings["not-ok"])
            msg = "skipped " + str(total_skipped)
            msg += " listings (" + str(skipped_listings["title"])
            msg += " for no-title and "
            msg += str(skipped_listings["body"]) + " for short body and "
            msg += str(skipped_listings["not-ok"]) + " for no-redistrib)"
            xmlh.print_progress(msg)

        # craigslist is full of weird escapes-- strip them
        body = entity_re.sub("", body)
        title = entity_re.sub("", title)
        locstr = entity_re.sub("", locstr)

        out.append("<VolunteerOpportunity>")
        out.append("<volunteerOpportunityID>%s</volunteerOpportunityID>" %
                   (item_id))
        out.append("<sponsoringOrganizationIDs><sponsoringOrganizationID>0"
                   "</sponsoringOrganizationID></sponsoringOrganizationIDs>")
        out.append("<volunteerHubOrganizationIDs>"
                   "<volunteerHubOrganizationID>0"
                   "</volunteerHubOrganizationID>"
                   "</volunteerHubOrganizationIDs>")
        out.append("<title>%s</title>" % (title))
        out.append("<detailURL>%s</detailURL>" % (url))
        # avoid CDATA in body...
        esc_body = xml.sax.saxutils.escape(body)
        esc_body100 = xml.sax.saxutils.escape(body[0:100])
        out.append("<description>%s</description>" % (esc_body))
        out.append("<abstract>%s</abstract>" % (esc_body100 + "..."))
        out.append("<lastUpdated>%s</lastUpdated>" % (datetimestr))
        # TODO: expires
        # TODO: synthesize location from metro...
        out.append("<locations><location>")
        out.append("<name>%s</name>" % (xml.sax.saxutils.escape(locstr)))
        # what about the few that do geocode?
        lat, lng = "", ""
        try:
            domain, unused = url.split("vol/")
            lat, lng = CL_LATLONGS[domain].split(",")
        except Exception:
            # best effort: leave lat/lng empty when the lookup fails
            # (still ignores lookup errors, but no longer swallows
            # KeyboardInterrupt/SystemExit like a bare except)
            pass
        out.append("<latitude>%s</latitude>" % (lat))
        out.append("<longitude>%s</longitude>" % (lng))
        out.append("</location></locations>")
        out.append("<dateTimeDurations><dateTimeDuration>")
        out.append("<openEnded>No</openEnded>")
        out.append("<startDate>%s</startDate>" % (datestr))
        # TODO: endDate = startDate + N=14 days?
        # TODO: timezone???
        out.append("</dateTimeDuration></dateTimeDurations>")
        # TODO: categories???
        out.append("</VolunteerOpportunity>")
        numopps += 1

    out.append("</VolunteerOpportunities>")
    out.append("</FootprintFeed>")
    return "".join(out), numorgs, numopps
def parse(instr, maxrec, progress):
    """return FPXML given 350.org data

    Args:
      instr: 350.org custom XML; it is UTF-8 encoded and handed to
        xmlh.parse_or_die().
      maxrec: maximum number of opportunities to emit; values <= 0 mean
        no limit.
      progress: accepted for interface compatibility; not used here.

    Returns:
      (fpxml_string, numorgs, numopps); numorgs is always 1.
    """
    from xml.sax.saxutils import escape as xml_escape

    feed = xmlh.parse_or_die(instr.encode('utf-8'))

    org_id = str(139)
    mission_statement = ("350.org is an international campaign that's "
                         "building a movement to unite the world around "
                         "solutions to the climate crisis--the solutions "
                         "that science and justice demand.")
    org_desc = ("On October 10 we'll be helping host a Global Work Party, "
                "with thousands of communities setting up solar panels or "
                "digging community gardens or laying out bike paths.")

    last_updated = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

    numorgs = 1
    numopps = 0

    xmlh.print_progress("loading 350.org custom XML...")

    # convert to footprint format; fragments are joined once at the end
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        xmlh.output_val('providerID', org_id),
        xmlh.output_val('providerName', "350org"),
        xmlh.output_val('feedID', "350org"),
        xmlh.output_val('createdDateTime', xmlh.current_ts()),
        xmlh.output_val('providerURL', "http://www.350.org/"),
        '</FeedInfo>',
        # 1 "organization" in 350.org postings
        '<Organizations><Organization>',
        xmlh.output_val('organizationID', org_id),
        '<nationalEIN></nationalEIN>',
        '<name>350.org</name>',
        xmlh.output_val('missionStatement', mission_statement),
        xmlh.output_val('description', org_desc),
        '<location>',
        xmlh.output_val("city", ""),
        xmlh.output_val("region", ""),
        xmlh.output_val("postalCode", ""),
        '</location>',
        # TODO: make these variables
        '<organizationURL>http://www.350.org/</organizationURL>',
        '<donateURL>http://www.350.org/donate</donateURL>',
        '<logoURL>http://www.350.org/sites/all/themes/threefifty/logo.gif'
        '</logoURL>',
        '<detailURL>http://www.350.org/about</detailURL>',
        '</Organization></Organizations>',
        '\n<VolunteerOpportunities>\n',
    ]

    for i, node in enumerate(feed.getElementsByTagName('node')):
        # ">=": the former "i > maxrec" emitted maxrec + 1 records
        if maxrec > 0 and i >= maxrec:
            break

        title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
        desc = '<![CDATA[' + xmlh.get_tag_val(node, "Body") + ']]>'
        url = xmlh.get_tag_val(node, "Link")
        lat = xmlh.get_tag_val(node, "Latitude")
        lng = xmlh.get_tag_val(node, "Longitude")

        # start: fall back to the event date when the feed gives none
        start_date = "2010-10-10"
        start_time = None
        start_datetime = xmlh.get_tag_val(node, "Start_Date")
        if start_datetime:
            start_datetime = start_datetime.replace(" (All day)", "T00:00:00")
            dt = start_datetime.split("T")
            start_date = dt[0][0:10]
            if len(dt) > 1:
                start_time = dt[1]

        # end: a missing End_Date means the opportunity is open-ended
        end_date = None
        end_time = None
        end_datetime = xmlh.get_tag_val(node, "End_Date")
        open_ended = not end_datetime
        if not open_ended:
            end_datetime = end_datetime.replace(" (All day)", "T23:00:00")
            dt = end_datetime.split("T")
            end_date = dt[0][0:10]
            if len(dt) > 1:
                end_time = dt[1]

        locstr = "%s, %s %s" % (xmlh.get_tag_val(node, "City"),
                                xmlh.get_tag_val(node, "Province"),
                                xmlh.get_tag_val(node, "Country"))

        out.append('<VolunteerOpportunity>')
        out.append('<volunteerOpportunityID>%s</volunteerOpportunityID>' %
                   (str(i)))
        out.append('<sponsoringOrganizationIDs><sponsoringOrganizationID>%s'
                   '</sponsoringOrganizationID></sponsoringOrganizationIDs>' %
                   (org_id))
        out.append('<volunteerHubOrganizationIDs>'
                   '<volunteerHubOrganizationID>%s'
                   '</volunteerHubOrganizationID>'
                   '</volunteerHubOrganizationIDs>' % (org_id))
        out.append('<title>%s</title>' % (title))
        # url and locstr are interpolated as element text, so XML-escape
        # them; a bare "&" would make the document malformed.
        # NOTE(review): assumes xmlh.get_tag_val returns unescaped text
        # (title/desc are CDATA-wrapped, which suggests so) -- confirm.
        out.append('<detailURL>%s</detailURL>' % (xml_escape(url or "")))
        out.append('<description>%s</description>' % (desc))
        out.append('<abstract>%s</abstract>' % (desc))
        out.append('<lastUpdated>%s</lastUpdated>' % (last_updated))
        out.append('<locations><location>')
        out.append('<location_string>%s</location_string>' %
                   (xml_escape(locstr)))
        out.append('<latitude>%s</latitude>' % (lat))
        out.append('<longitude>%s</longitude>' % (lng))
        out.append('</location></locations>')
        out.append('<dateTimeDurations><dateTimeDuration>')
        out.append('<startDate>%s</startDate>' % (start_date))
        if start_time:
            out.append('<startTime>%s</startTime>' % (start_time))
        if open_ended:
            out.append('<openEnded>Yes</openEnded>')
        else:
            out.append('<openEnded>No</openEnded>')
            out.append('<endDate>%s</endDate>' % (end_date))
            if end_time:
                out.append('<endTime>%s</endTime>' % (end_time))
        out.append('</dateTimeDuration></dateTimeDurations>')
        out.append('</VolunteerOpportunity>\n')
        numopps += 1

    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')
    return "".join(out), numorgs, numopps
def parse(instr, maxrecs, progress):
    """return FPXML given a feed of <VolunteerOpportunity> records

    Each opportunity is parsed with xmlh.simple_parser(), its organization
    is registered through register_org(), and the result is wrapped with
    the module-level feed info and the organizations collected in ORGS.

    Args:
      instr: source XML text; "db:" tag prefixes are rewritten to "db_"
        before the opportunities are extracted.
      maxrecs: maximum number of opportunities to emit; values <= 0 mean
        no limit.
      progress: passed to xmlh.print_rps_progress().

    Returns:
      (fpxml_string, numorgs, numopps).
    """
    numorgs = numopps = 0
    instr = re.sub(r'<(/?db):', r'<\1_', instr)
    opps = re.findall(r'<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                      instr, re.DOTALL)

    # fragments are collected in lists and joined once at the end
    volopps = []
    for i, oppstr in enumerate(opps):
        # ">=": the former "i > maxrecs" emitted maxrecs + 1 records
        if maxrecs > 0 and i >= maxrecs:
            break
        xmlh.print_rps_progress("opps", progress, i, maxrecs)

        item = xmlh.simple_parser(oppstr, known_elnames, progress=False)
        orgid = register_org(item)

        # logoURL -- sigh, this is for the opportunity not the org
        volopps.append('<VolunteerOpportunity>')
        volopps.append(xmlh.output_val('volunteerOpportunityID', str(i)))
        volopps.append(xmlh.output_val('sponsoringOrganizationID',
                                       str(orgid)))
        volopps.append(xmlh.output_node('volunteerHubOrganizationID', item,
                                        "LocalID"))
        volopps.append(xmlh.output_node('title', item, "Title"))
        volopps.append(xmlh.output_node('abstract', item, "Description"))
        volopps.append(xmlh.output_node('description', item, "Description"))
        volopps.append(xmlh.output_node('detailURL', item, "DetailURL"))
        volopps.append(xmlh.output_val('volunteersNeeded', "-8888"))

        try:
            oppdates = item.getElementsByTagName("OpportunityDate")
        except Exception:
            oppdates = []
        if len(oppdates) > 1:
            # single pre-formatted argument: prints the same text under the
            # Python 2 print statement and the Python 3 print function
            print("%s %s" % (
                datetime.now(),
                "parse_servenet.py: only 1 OpportunityDate supported."))
        # only the first OpportunityDate is used
        if len(oppdates) == 0:
            oppdate = None
        else:
            oppdate = oppdates[0]

        volopps.append('<dateTimeDurations><dateTimeDuration>')
        if oppdate:
            volopps.append(xmlh.output_val('openEnded', 'No'))
            volopps.append(xmlh.output_val(
                'duration',
                'P%s%s' % (xmlh.get_tag_val(oppdate, "DurationQuantity"),
                           xmlh.get_tag_val(oppdate, "DurationUnit"))))
            volopps.append(xmlh.output_val('commitmentHoursPerWeek', '0'))
            volopps.append(xmlh.output_node('startDate', oppdate,
                                            "StartDate"))
            volopps.append(xmlh.output_node('endDate', oppdate, "EndDate"))
        else:
            volopps.append(xmlh.output_val('openEnded', 'Yes'))
            volopps.append(xmlh.output_val('commitmentHoursPerWeek', '0'))
        volopps.append('</dateTimeDuration></dateTimeDurations>')

        volopps.append('<locations>')
        try:
            opplocs = item.getElementsByTagName("Location")
        except Exception:
            opplocs = []
        for opploc in opplocs:
            volopps.append('<location>')
            virtual_tag = opploc.getElementsByTagName("Virtual")
            if (virtual_tag and
                    xmlh.get_tag_val(opploc, "Virtual").lower() == "yes"):
                volopps.append(xmlh.output_val('virtual', 'Yes'))
            else:
                volopps.append(xmlh.output_node('region', opploc,
                                                "StateOrProvince"))
                volopps.append(xmlh.output_node('country', opploc,
                                                "Country"))
                volopps.append(xmlh.output_node('postalCode', opploc,
                                                "ZipOrPostalCode"))
            volopps.append('</location>')
        volopps.append('</locations>')
        volopps.append('<categoryTags/>')
        volopps.append('</VolunteerOpportunity>')
        numopps += 1

    # convert to footprint format
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        xmlh.output_val('providerID', providerID),
        xmlh.output_val('providerName', providerName),
        xmlh.output_val('feedID', feedID),
        xmlh.output_val('createdDateTime', xmlh.current_ts()),
        xmlh.output_val('providerURL', providerURL),
        xmlh.output_val('description', feedDescription),
        # TODO: capture ts -- use now?!
        '</FeedInfo>',
        # hardcoded: Organization
        '<Organizations>',
    ]
    for key in ORGS:
        out.append(ORGS[key])
        numorgs += 1
    out.append('</Organizations>')
    out.append('<VolunteerOpportunities>')
    out.extend(volopps)
    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')

    return "".join(out), numorgs, numopps
def parse(instr, maxrec, progress):
    """return FPXML given 350.org data

    Args:
      instr: 350.org custom XML; it is UTF-8 encoded and handed to
        xmlh.parse_or_die().
      maxrec: maximum number of opportunities to emit; values <= 0 mean
        no limit.
      progress: accepted for interface compatibility; not used here.

    Returns:
      (fpxml_string, numorgs, numopps); numorgs is always 1.
    """
    from xml.sax.saxutils import escape as xml_escape

    feed = xmlh.parse_or_die(instr.encode('utf-8'))

    org_id = str(139)
    mission_statement = ("350.org is an international campaign that's "
                         "building a movement to unite the world around "
                         "solutions to the climate crisis--the solutions "
                         "that science and justice demand.")
    org_desc = ("On October 10 we'll be helping host a Global Work Party, "
                "with thousands of communities setting up solar panels or "
                "digging community gardens or laying out bike paths.")

    last_updated = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

    numorgs = 1
    numopps = 0

    xmlh.print_progress("loading 350.org custom XML...")

    # convert to footprint format; fragments are joined once at the end
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        xmlh.output_val('providerID', org_id),
        xmlh.output_val('providerName', "350org"),
        xmlh.output_val('feedID', "350org"),
        xmlh.output_val('createdDateTime', xmlh.current_ts()),
        xmlh.output_val('providerURL', "http://www.350.org/"),
        '</FeedInfo>',
        # 1 "organization" in 350.org postings
        '<Organizations><Organization>',
        xmlh.output_val('organizationID', org_id),
        '<nationalEIN></nationalEIN>',
        '<name>350.org</name>',
        xmlh.output_val('missionStatement', mission_statement),
        xmlh.output_val('description', org_desc),
        '<location>',
        xmlh.output_val("city", ""),
        xmlh.output_val("region", ""),
        xmlh.output_val("postalCode", ""),
        '</location>',
        # TODO: make these variables
        '<organizationURL>http://www.350.org/</organizationURL>',
        '<donateURL>http://www.350.org/donate</donateURL>',
        '<logoURL>http://www.350.org/sites/all/themes/threefifty/logo.gif'
        '</logoURL>',
        '<detailURL>http://www.350.org/about</detailURL>',
        '</Organization></Organizations>',
        '\n<VolunteerOpportunities>\n',
    ]

    for i, node in enumerate(feed.getElementsByTagName('node')):
        # ">=": the former "i > maxrec" emitted maxrec + 1 records
        if maxrec > 0 and i >= maxrec:
            break

        title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
        desc = '<![CDATA[' + xmlh.get_tag_val(node, "Body") + ']]>'
        url = xmlh.get_tag_val(node, "Link")
        lat = xmlh.get_tag_val(node, "Latitude")
        lng = xmlh.get_tag_val(node, "Longitude")

        # start: fall back to the event date when the feed gives none
        start_date = "2010-10-10"
        start_time = None
        start_datetime = xmlh.get_tag_val(node, "Start_Date")
        if start_datetime:
            start_datetime = start_datetime.replace(" (All day)", "T00:00:00")
            dt = start_datetime.split("T")
            start_date = dt[0][0:10]
            if len(dt) > 1:
                start_time = dt[1]

        # end: a missing End_Date means the opportunity is open-ended
        end_date = None
        end_time = None
        end_datetime = xmlh.get_tag_val(node, "End_Date")
        open_ended = not end_datetime
        if not open_ended:
            end_datetime = end_datetime.replace(" (All day)", "T23:00:00")
            dt = end_datetime.split("T")
            end_date = dt[0][0:10]
            if len(dt) > 1:
                end_time = dt[1]

        locstr = "%s, %s %s" % (xmlh.get_tag_val(node, "City"),
                                xmlh.get_tag_val(node, "Province"),
                                xmlh.get_tag_val(node, "Country"))

        out.append('<VolunteerOpportunity>')
        out.append('<volunteerOpportunityID>%s</volunteerOpportunityID>' %
                   (str(i)))
        out.append('<sponsoringOrganizationIDs><sponsoringOrganizationID>%s'
                   '</sponsoringOrganizationID></sponsoringOrganizationIDs>' %
                   (org_id))
        out.append('<volunteerHubOrganizationIDs>'
                   '<volunteerHubOrganizationID>%s'
                   '</volunteerHubOrganizationID>'
                   '</volunteerHubOrganizationIDs>' % (org_id))
        out.append('<title>%s</title>' % (title))
        # url and locstr are interpolated as element text, so XML-escape
        # them; a bare "&" would make the document malformed.
        # NOTE(review): assumes xmlh.get_tag_val returns unescaped text
        # (title/desc are CDATA-wrapped, which suggests so) -- confirm.
        out.append('<detailURL>%s</detailURL>' % (xml_escape(url or "")))
        out.append('<description>%s</description>' % (desc))
        out.append('<abstract>%s</abstract>' % (desc))
        out.append('<lastUpdated>%s</lastUpdated>' % (last_updated))
        out.append('<locations><location>')
        out.append('<location_string>%s</location_string>' %
                   (xml_escape(locstr)))
        out.append('<latitude>%s</latitude>' % (lat))
        out.append('<longitude>%s</longitude>' % (lng))
        out.append('</location></locations>')
        out.append('<dateTimeDurations><dateTimeDuration>')
        out.append('<startDate>%s</startDate>' % (start_date))
        if start_time:
            out.append('<startTime>%s</startTime>' % (start_time))
        if open_ended:
            out.append('<openEnded>Yes</openEnded>')
        else:
            out.append('<openEnded>No</openEnded>')
            out.append('<endDate>%s</endDate>' % (end_date))
            if end_time:
                out.append('<endTime>%s</endTime>' % (end_time))
        out.append('</dateTimeDuration></dateTimeDurations>')
        out.append('</VolunteerOpportunity>\n')
        numopps += 1

    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')
    return "".join(out), numorgs, numopps
def parse(instr, maxrec, progress):
    """return FPXML given sparked feed data

    Args:
      instr: sparked.com custom XML; it is UTF-8 encoded and handed to
        xmlh.parse_or_die().
      maxrec: maximum number of opportunities to emit; values <= 0 mean
        no limit.
      progress: accepted for interface compatibility; not used here.

    Returns:
      (fpxml_string, numorgs, numopps); numorgs is always 1.
    """
    from xml.sax.saxutils import escape as xml_escape

    feed = xmlh.parse_or_die(instr.encode('utf-8'))

    org_id = str(139)
    mission_statement = ("Sparked makes it easy for people with busy lives "
                         "to help nonprofits get valuable work done when "
                         "it's convenient. We call it microvolunteering. "
                         "Through the convenience of the Internet, and with "
                         "the collaboration of others, micro-volunteers use "
                         "their professional skills to help causes they "
                         "care about.")
    org_desc = "Sparked is the world's first Microvolunteering network"

    last_updated = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    # every challenge starts "now": the feed timestamp doubles as startDate
    start_date = last_updated

    numorgs = 1
    numopps = 0

    xmlh.print_progress("loading sparked.com custom XML...")

    # convert to footprint format; fragments are joined once at the end
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        xmlh.output_val('providerID', org_id),
        xmlh.output_val('providerName', "sparked"),
        xmlh.output_val('feedID', "sparked"),
        xmlh.output_val('createdDateTime', xmlh.current_ts()),
        xmlh.output_val('providerURL', "http://www.sparked.com/"),
        '</FeedInfo>',
        # 1 "organization" in sparked.com postings
        '<Organizations><Organization>',
        xmlh.output_val('organizationID', org_id),
        '<nationalEIN></nationalEIN>',
        '<name>sparked.com</name>',
        xmlh.output_val('missionStatement', mission_statement),
        xmlh.output_val('description', org_desc),
        '<location>',
        xmlh.output_val("city", "San Francisco"),
        xmlh.output_val("region", "CA"),
        xmlh.output_val("postalCode", "94105"),
        '</location>',
        '<organizationURL>http://www.sparked.com/</organizationURL>',
        '<donateURL>http://www.sparked.com/</donateURL>',
        '<logoURL>http://www.sparked.com/imgver4/logo_sparked.gif</logoURL>',
        '<detailURL>http://www.sparked.com/</detailURL>',
        '</Organization></Organizations>',
        '\n<VolunteerOpportunities>\n',
    ]

    for i, node in enumerate(feed.getElementsByTagName('challenge')):
        # ">=": the former "i > maxrec" emitted maxrec + 1 records
        if maxrec > 0 and i >= maxrec:
            break

        title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
        desc = '<![CDATA[' + xmlh.get_tag_val(node, "description") + ']]>'
        url = xmlh.get_tag_val(node, "url")

        open_ended = True
        end_date = None
        # deadline layout, by character position:
        #01234567
        #02/15/11
        mdy = xmlh.get_tag_val(node, "deadline")
        if mdy:
            try:
                end_date = (str(2000 + int(mdy[6:])) + "-" + mdy[0:2] +
                            "-" + mdy[3:5])
                open_ended = False
            except ValueError:
                # unparseable year: treat the challenge as open-ended
                pass

        out.append('<VolunteerOpportunity>')
        out.append('<volunteerOpportunityID>%s</volunteerOpportunityID>' %
                   (str(i)))
        out.append('<sponsoringOrganizationIDs><sponsoringOrganizationID>%s'
                   '</sponsoringOrganizationID></sponsoringOrganizationIDs>' %
                   (org_id))
        out.append('<volunteerHubOrganizationIDs>'
                   '<volunteerHubOrganizationID>%s'
                   '</volunteerHubOrganizationID>'
                   '</volunteerHubOrganizationIDs>' % (org_id))
        out.append('<micro>Yes</micro>')
        out.append('<title>%s</title>' % (title))
        # url is interpolated as element text, so XML-escape it; a bare
        # "&" would make the document malformed.
        # NOTE(review): assumes xmlh.get_tag_val returns unescaped text
        # (title/desc are CDATA-wrapped, which suggests so) -- confirm.
        out.append('<detailURL>%s</detailURL>' % (xml_escape(url or "")))
        out.append('<description>%s</description>' % (desc))
        out.append('<abstract>%s</abstract>' % (desc))
        out.append('<lastUpdated>%s</lastUpdated>' % (last_updated))
        out.append('<dateTimeDurations><dateTimeDuration>')
        out.append('<startDate>%s</startDate>' % (start_date))
        if open_ended:
            out.append('<openEnded>Yes</openEnded>')
        else:
            out.append('<openEnded>No</openEnded>')
            out.append('<endDate>%s</endDate>' % (end_date))
        out.append('</dateTimeDuration></dateTimeDurations>')
        out.append('<locations><location><virtual>Yes</virtual>'
                   '</location></locations>')
        out.append('</VolunteerOpportunity>\n')
        numopps += 1

    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')
    return "".join(out), numorgs, numopps
# Timestamp shared by the feed: also used as the start date.
today = datetime.now()
last_updated = today.strftime("%Y-%m-%dT%H:%M:%S")
start_date = last_updated

numorgs = 1
numopps = 0

xmlh.print_progress("loading idealist.xml custom XML...")

# convert to footprint format: feed info first, then the single
# "organization" used for idealist.org postings
_header_parts = [
    '<?xml version="1.0" ?>',
    '<FootprintFeed schemaVersion="0.1">',
    '<FeedInfo>',
    xmlh.output_val('providerID', org_id),
    xmlh.output_val('providerName', "idealist"),
    xmlh.output_val('feedID', "idealist"),
    xmlh.output_val('createdDateTime', xmlh.current_ts()),
    xmlh.output_val('providerURL', "http://www.idealist.org/"),
    '</FeedInfo>',
    '<Organizations><Organization>',
    xmlh.output_val('organizationID', org_id),
    '<nationalEIN></nationalEIN>',
    '<name>idealist.org</name>',
    xmlh.output_val('missionStatement', mission_statement),
    xmlh.output_val('description', org_desc),
    '<location>',
    xmlh.output_val("city", "New York"),
    xmlh.output_val("region", "NY"),
    xmlh.output_val("postalCode", "10001"),
    '</location>',
    '<organizationURL>http://www.idealist.org/</organizationURL>',
]
outstr = "".join(_header_parts)
def parse(instr, maxrecs, progress):
    """return FPXML given craigslist data

    Args:
      instr: crawler cache data, handed to
        crawl_craigslist.parse_cache_file().
      maxrecs: maximum number of opportunities to emit; values <= 0 mean
        no limit.
      progress: when true, skip statistics are printed every 250 pages.

    Returns:
      (fpxml_string, numorgs, numopps).
    """
    if CL_LATLONGS is None:
        load_craigslist_latlongs()
    xmlh.print_progress("loading craigslist crawler output...")
    crawl_craigslist.parse_cache_file(instr, listings_only=True)
    xmlh.print_progress("loaded " + str(len(crawl_craigslist.pages)) +
                        " craigslist pages.")

    # convert to footprint format; fragments are collected in a list and
    # joined once rather than growing a string with +=
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        xmlh.output_val('providerID', "105"),
        xmlh.output_val('providerName', "craigslist"),
        xmlh.output_val('feedID', "craigslist"),
        xmlh.output_val('createdDateTime', xmlh.current_ts()),
        xmlh.output_val('providerURL', "http://www.craigslist.org/"),
        '</FeedInfo>',
    ]
    numorgs = numopps = 0

    # no "organization" in craigslist postings
    out.extend([
        '<Organizations>',
        '<Organization>',
        '<organizationID>0</organizationID>',
        '<nationalEIN></nationalEIN>',
        '<name></name>',
        '<missionStatement></missionStatement>',
        '<description></description>',
        '<location>',
        xmlh.output_val("city", ""),
        xmlh.output_val("region", ""),
        xmlh.output_val("postalCode", ""),
        '</location>',
        '<organizationURL></organizationURL>',
        '<donateURL></donateURL>',
        '<logoURL></logoURL>',
        '<detailURL></detailURL>',
        '</Organization>',
    ])
    numorgs += 1
    out.append('</Organizations>')

    skipped_listings = {"body": 0, "title": 0, "not-ok": 0}
    entity_re = re.compile(r'&[a-z]+;')
    out.append('<VolunteerOpportunities>')
    for i, url in enumerate(crawl_craigslist.pages):
        # Limit on records actually emitted; the former "i > maxrecs" test
        # on the page index allowed maxrecs + 1 records through.
        if maxrecs > 0 and numopps >= maxrecs:
            break

        page = crawl_craigslist.pages[url]
        ok = extract(page, "it's OK to distribute this " +
                     "charitable volunteerism opportunity")
        if ok == "":
            skipped_listings["not-ok"] += 1
            continue
        title = extract(page, "<title>(.+?)</title>")
        if title == "":
            skipped_listings["title"] += 1
            continue
        body = extract(page, '<div id="userbody">(.+?)<')
        if len(body) < 25:
            skipped_listings["body"] += 1
            continue

        item_id = extract(url, "/vol/(.+?)[.]html$")
        locstr = extract(page, "Location: (.+?)<")
        datestr = extract(page, "Date: (.+?)<")
        ts = dateutil.parser.parse(datestr)
        datetimestr = ts.strftime("%Y-%m-%dT%H:%M:%S")
        datestr = ts.strftime("%Y-%m-%d")

        xmlh.print_rps_progress("opps", progress, i, maxrecs)
        if progress and i > 0 and i % 250 == 0:
            # the total now covers all three categories listed in the
            # message (it used to leave out the no-redistrib count)
            total_skipped = (skipped_listings["title"] +
                             skipped_listings["body"] +
                             skipped_listings["not-ok"])
            msg = "skipped " + str(total_skipped)
            msg += " listings (" + str(skipped_listings["title"])
            msg += " for no-title and "
            msg += str(skipped_listings["body"]) + " for short body and "
            msg += str(skipped_listings["not-ok"]) + " for no-redistrib)"
            xmlh.print_progress(msg)

        # craigslist is full of weird escapes-- strip them
        body = entity_re.sub('', body)
        title = entity_re.sub('', title)
        locstr = entity_re.sub('', locstr)

        out.append('<VolunteerOpportunity>')
        out.append('<volunteerOpportunityID>%s</volunteerOpportunityID>' %
                   (item_id))
        out.append('<sponsoringOrganizationIDs><sponsoringOrganizationID>0'
                   '</sponsoringOrganizationID></sponsoringOrganizationIDs>')
        out.append('<volunteerHubOrganizationIDs>'
                   '<volunteerHubOrganizationID>0'
                   '</volunteerHubOrganizationID>'
                   '</volunteerHubOrganizationIDs>')
        out.append('<title>%s</title>' % (title))
        out.append('<detailURL>%s</detailURL>' % (url))
        # avoid CDATA in body...
        esc_body = xml.sax.saxutils.escape(body)
        esc_body100 = xml.sax.saxutils.escape(body[0:100])
        out.append('<description>%s</description>' % (esc_body))
        out.append('<abstract>%s</abstract>' % (esc_body100 + "..."))
        out.append('<lastUpdated>%s</lastUpdated>' % (datetimestr))
        # TODO: expires
        # TODO: synthesize location from metro...
        out.append('<locations><location>')
        out.append('<name>%s</name>' % (xml.sax.saxutils.escape(locstr)))
        # what about the few that do geocode?
        lat, lng = "", ""
        try:
            domain, unused = url.split("vol/")
            lat, lng = CL_LATLONGS[domain].split(",")
        except Exception:
            # best effort: leave lat/lng empty when the lookup fails
            # (still ignores lookup errors, but no longer swallows
            # KeyboardInterrupt/SystemExit like a bare except)
            pass
        out.append('<latitude>%s</latitude>' % (lat))
        out.append('<longitude>%s</longitude>' % (lng))
        out.append('</location></locations>')
        out.append('<dateTimeDurations><dateTimeDuration>')
        out.append('<openEnded>No</openEnded>')
        out.append('<startDate>%s</startDate>' % (datestr))
        # TODO: endDate = startDate + N=14 days?
        # TODO: timezone???
        out.append('</dateTimeDuration></dateTimeDurations>')
        # TODO: categories???
        out.append('</VolunteerOpportunity>')
        numopps += 1

    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')
    return "".join(out), numorgs, numopps
def parse(instr, maxrec, progress):
    """Return FPXML for the "diy" tab-separated feed.

    The first line of instr is treated as a tab-separated header row and every
    following line as one record; columns are read by name via get_field().

    Args:
      instr: the whole TSV feed as a single string.
      maxrec: maximum number of data lines to consider; <= 0 means no limit.
      progress: not used by this parser.

    Returns:
      (fpxml_string, numorgs, numopps). numorgs is always 1 because a single
      hard-coded allforgood.org Organization is emitted.
    """
    org_id = "140"
    mission_statement = "Do it yourself volunteer opportunities."
    org_desc = "Do it yourself volunteer opportunities"
    last_updated = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    # every record starts "now"; the feed carries no dates of its own
    start_date = last_updated
    numorgs = 1
    numopps = 0

    xmlh.print_progress("loading diy custom TSV...")

    # convert to footprint format; fragments are joined once at the end
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        xmlh.output_val('providerID', org_id),
        xmlh.output_val('providerName', "diy"),
        xmlh.output_val('feedID', "diy"),
        xmlh.output_val('createdDateTime', xmlh.current_ts()),
        xmlh.output_val('providerURL', "http://www.allforgood.org/"),
        '</FeedInfo>',
        '<Organizations><Organization>',
        xmlh.output_val('organizationID', org_id),
        '<nationalEIN></nationalEIN>',
        '<name>allforgood.org</name>',
        xmlh.output_val('missionStatement', mission_statement),
        xmlh.output_val('description', org_desc),
        '<location>',
        xmlh.output_val("city", "San Francisco"),
        xmlh.output_val("region", "CA"),
        xmlh.output_val("postalCode", "94105"),
        '</location>',
        '<organizationURL>http://www.allforgood.org/</organizationURL>',
        '<donateURL>http://www.allforgood.org/</donateURL>',
        '<logoURL>http://www.allforgood.org/</logoURL>',
        '<detailURL>http://www.allforgood.org/</detailURL>',
        '</Organization></Organizations>',
        '<VolunteerOpportunities>',
    ]

    lines = instr.split("\n")
    header = lines.pop(0).strip().split("\t")

    for i, line in enumerate(lines):
        # i is zero-based, so ">=" caps the loop at exactly maxrec lines
        if maxrec > 0 and i >= maxrec:
            break
        row = line.strip().split("\t")

        # Test the raw values: once wrapped in CDATA the title is never empty,
        # which made the original emptiness check a no-op.
        raw_title = get_field("title", row, header)
        url = get_field("url", row, header)
        if not raw_title or not url:
            continue

        title = '<![CDATA[' + raw_title + ']]>'
        sponsor = get_field("sponsoringOrganization", row, header)
        desc = ('<![CDATA[' + sponsor + ': '
                + get_field("description", row, header)
                + ' Areas of interest: '
                + get_field("subjectArea", row, header)
                + ' Tags: '
                + get_field("keywords", row, header)
                + ']]>')

        out.extend([
            '<VolunteerOpportunity>',
            '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (str(i)),
            '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (org_id),
            '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (org_id),
            '<self_directed>Yes</self_directed>',
            '<title>%s</title>' % (title),
            '<detailURL><![CDATA[%s]]></detailURL>' % (url),
            '<description>%s</description>' % (desc),
            '<abstract>%s</abstract>' % (desc),
            '<lastUpdated>%s</lastUpdated>' % (last_updated),
            '<dateTimeDurations><dateTimeDuration>',
            '<startDate>%s</startDate>' % (start_date),
            '<openEnded>Yes</openEnded>',
            '</dateTimeDuration></dateTimeDurations>',
            '<locations><location><virtual>Yes</virtual></location></locations>',
            '</VolunteerOpportunity>',
        ])
        numopps += 1

    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')

    return "".join(out), numorgs, numopps
def parse(instr, maxrec, progress):
    """Return FPXML for the "diy" tab-separated feed.

    The first line of instr is treated as a tab-separated header row and every
    following line as one record; columns are read by name via get_field().

    Args:
      instr: the whole TSV feed as a single string.
      maxrec: maximum number of data lines to consider; <= 0 means no limit.
      progress: not used by this parser.

    Returns:
      (fpxml_string, numorgs, numopps). numorgs is always 1 because a single
      hard-coded allforgood.org Organization is emitted.
    """
    org_id = "140"
    mission_statement = "Do it yourself volunteer opportunities."
    org_desc = "Do it yourself volunteer opportunities"
    last_updated = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    # every record starts "now"; the feed carries no dates of its own
    start_date = last_updated
    numorgs = 1
    numopps = 0

    xmlh.print_progress("loading diy custom TSV...")

    # convert to footprint format; fragments are joined once at the end
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        "<FeedInfo>",
        xmlh.output_val("providerID", org_id),
        xmlh.output_val("providerName", "diy"),
        xmlh.output_val("feedID", "diy"),
        xmlh.output_val("createdDateTime", xmlh.current_ts()),
        xmlh.output_val("providerURL", "http://www.allforgood.org/"),
        "</FeedInfo>",
        "<Organizations><Organization>",
        xmlh.output_val("organizationID", org_id),
        "<nationalEIN></nationalEIN>",
        "<name>allforgood.org</name>",
        xmlh.output_val("missionStatement", mission_statement),
        xmlh.output_val("description", org_desc),
        "<location>",
        xmlh.output_val("city", "San Francisco"),
        xmlh.output_val("region", "CA"),
        xmlh.output_val("postalCode", "94105"),
        "</location>",
        "<organizationURL>http://www.allforgood.org/</organizationURL>",
        "<donateURL>http://www.allforgood.org/</donateURL>",
        "<logoURL>http://www.allforgood.org/</logoURL>",
        "<detailURL>http://www.allforgood.org/</detailURL>",
        "</Organization></Organizations>",
        "<VolunteerOpportunities>",
    ]

    lines = instr.split("\n")
    header = lines.pop(0).strip().split("\t")

    for i, line in enumerate(lines):
        # i is zero-based, so ">=" caps the loop at exactly maxrec lines
        if maxrec > 0 and i >= maxrec:
            break
        row = line.strip().split("\t")

        # Test the raw values: once wrapped in CDATA the title is never empty,
        # which made the original emptiness check a no-op.
        raw_title = get_field("title", row, header)
        url = get_field("url", row, header)
        if not raw_title or not url:
            continue

        title = "<![CDATA[" + raw_title + "]]>"
        sponsor = get_field("sponsoringOrganization", row, header)
        desc = (
            "<![CDATA["
            + sponsor
            + ": "
            + get_field("description", row, header)
            + " Areas of interest: "
            + get_field("subjectArea", row, header)
            + " Tags: "
            + get_field("keywords", row, header)
            + "]]>"
        )

        out.extend([
            "<VolunteerOpportunity>",
            "<volunteerOpportunityID>%s</volunteerOpportunityID>" % (str(i)),
            "<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>" % (org_id),
            "<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>" % (org_id),
            "<self_directed>Yes</self_directed>",
            "<title>%s</title>" % (title),
            "<detailURL><![CDATA[%s]]></detailURL>" % (url),
            "<description>%s</description>" % (desc),
            "<abstract>%s</abstract>" % (desc),
            "<lastUpdated>%s</lastUpdated>" % (last_updated),
            "<dateTimeDurations><dateTimeDuration>",
            "<startDate>%s</startDate>" % (start_date),
            "<openEnded>Yes</openEnded>",
            "</dateTimeDuration></dateTimeDurations>",
            "<locations><location><virtual>Yes</virtual></location></locations>",
            "</VolunteerOpportunity>",
        ])
        numopps += 1

    out.append("</VolunteerOpportunities>")
    out.append("</FootprintFeed>")

    return "".join(out), numorgs, numopps
def parse(instr, maxrec, progress):
    """Return FPXML given sparked feed data.

    The input is parsed with xmlh.parse_or_die() and every <challenge> element
    becomes one VolunteerOpportunity attached to a single hard-coded
    sparked.com Organization.

    Args:
      instr: the sparked.com XML feed as a string (encoded to UTF-8 before
        parsing).
      maxrec: maximum number of challenges to convert; <= 0 means no limit.
      progress: not used by this parser.

    Returns:
      (fpxml_string, numorgs, numopps). numorgs is always 1.
    """
    feed = xmlh.parse_or_die(instr.encode('utf-8'))

    org_id = str(139)
    mission_statement = "Sparked makes it easy for people with busy lives to help nonprofits get valuable work done when it's convenient. We call it microvolunteering. Through the convenience of the Internet, and with the collaboration of others, micro-volunteers use their professional skills to help causes they care about."
    org_desc = "Sparked is the world's first Microvolunteering network"

    last_updated = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    # every challenge is reported as starting "now"
    start_date = last_updated

    numorgs = 1
    numopps = 0

    xmlh.print_progress("loading sparked.com custom XML...")

    # convert to footprint format; fragments are joined once at the end
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        xmlh.output_val('providerID', org_id),
        xmlh.output_val('providerName', "sparked"),
        xmlh.output_val('feedID', "sparked"),
        xmlh.output_val('createdDateTime', xmlh.current_ts()),
        xmlh.output_val('providerURL', "http://www.sparked.com/"),
        '</FeedInfo>',
        # 1 "organization" in sparked.com postings
        '<Organizations><Organization>',
        xmlh.output_val('organizationID', org_id),
        '<nationalEIN></nationalEIN>',
        '<name>sparked.com</name>',
        xmlh.output_val('missionStatement', mission_statement),
        xmlh.output_val('description', org_desc),
        '<location>',
        xmlh.output_val("city", "San Francisco"),
        xmlh.output_val("region", "CA"),
        xmlh.output_val("postalCode", "94105"),
        '</location>',
        '<organizationURL>http://www.sparked.com/</organizationURL>',
        '<donateURL>http://www.sparked.com/</donateURL>',
        '<logoURL>http://www.sparked.com/imgver4/logo_sparked.gif</logoURL>',
        '<detailURL>http://www.sparked.com/</detailURL>',
        '</Organization></Organizations>',
        '\n<VolunteerOpportunities>\n',
    ]

    for i, node in enumerate(feed.getElementsByTagName('challenge')):
        # i is zero-based, so ">=" caps the loop at exactly maxrec challenges
        if maxrec > 0 and i >= maxrec:
            break

        title = '<![CDATA[' + xmlh.get_tag_val(node, "title") + ']]>'
        desc = '<![CDATA[' + xmlh.get_tag_val(node, "description") + ']]>'
        url = xmlh.get_tag_val(node, "url")

        # deadline layout, by character offset:
        #   01234567
        #   02/15/11
        end_date = None
        mdy = xmlh.get_tag_val(node, "deadline")
        if mdy:
            try:
                end_date = (str(2000 + int(mdy[6:])) + "-"
                            + mdy[0:2] + "-" + mdy[3:5])
            except (TypeError, ValueError):
                # non-numeric or missing year: treat the challenge as open-ended
                end_date = None
        open_ended = end_date is None

        out.extend([
            '<VolunteerOpportunity>',
            '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (str(i)),
            '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (org_id),
            '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>%s</volunteerHubOrganizationID></volunteerHubOrganizationIDs>' % (org_id),
            '<micro>Yes</micro>',
            '<title>%s</title>' % (title),
            '<detailURL>%s</detailURL>' % (url),
            '<description>%s</description>' % (desc),
            '<abstract>%s</abstract>' % (desc),
            '<lastUpdated>%s</lastUpdated>' % (last_updated),
            '<dateTimeDurations><dateTimeDuration>',
            '<startDate>%s</startDate>' % (start_date),
        ])
        if open_ended:
            out.append('<openEnded>Yes</openEnded>')
        else:
            out.append('<openEnded>No</openEnded>')
            out.append('<endDate>%s</endDate>' % (end_date))
        out.extend([
            '</dateTimeDuration></dateTimeDurations>',
            '<locations><location><virtual>Yes</virtual></location></locations>',
            '</VolunteerOpportunity>\n',
        ])
        numopps += 1

    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')

    return "".join(out), numorgs, numopps
def parse(instr, maxrecs, progress):
    """Return FPXML built from the <VolunteerOpportunity> records in instr.

    Each record is cut out with a regex, parsed with xmlh.simple_parser() and
    passed to register_org(); the id it returns is written as the record's
    sponsoringOrganizationID. The Organizations section is then emitted from
    the module-level ORGS mapping.

    Feed-level values (providerID, providerName, feedID, providerURL,
    feedDescription) and known_elnames are read from the enclosing scope.

    Args:
      instr: the raw feed as a string.
      maxrecs: maximum number of opportunities to convert; <= 0 means no limit.
      progress: forwarded to xmlh.print_rps_progress().

    Returns:
      (fpxml_string, numorgs, numopps).
    """
    numorgs = numopps = 0
    # "db:" namespace prefixes are rewritten to "db_" before parsing
    instr = re.sub(r'<(/?db):', r'<\1_', instr)
    opps = re.findall(r'<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                      instr, re.DOTALL)

    volopps = []
    for i, oppstr in enumerate(opps):
        # i is zero-based, so ">=" caps the loop at exactly maxrecs records
        if maxrecs > 0 and i >= maxrecs:
            break
        xmlh.print_rps_progress("opps", progress, i, maxrecs)

        item = xmlh.simple_parser(oppstr, known_elnames, progress=False)
        orgid = register_org(item)

        # logoURL -- sigh, this is for the opportunity not the org
        volopps.append('<VolunteerOpportunity>')
        volopps.append(xmlh.output_val('volunteerOpportunityID', str(i)))
        volopps.append(xmlh.output_val('sponsoringOrganizationID', str(orgid)))
        volopps.append(
            xmlh.output_node('volunteerHubOrganizationID', item, "LocalID"))
        volopps.append(xmlh.output_node('title', item, "Title"))
        volopps.append(xmlh.output_node('abstract', item, "Description"))
        volopps.append(xmlh.output_node('description', item, "Description"))
        volopps.append(xmlh.output_node('detailURL', item, "DetailURL"))
        volopps.append(xmlh.output_val('volunteersNeeded', "-8888"))

        # best effort: a record we cannot query is treated as having no dates
        try:
            oppdates = item.getElementsByTagName("OpportunityDate")
        except Exception:
            oppdates = []

        if len(oppdates) > 1:
            # single-argument print() behaves the same on Python 2 and 3
            print("%s %s" % (
                datetime.now(),
                "parse_servenet.py: only 1 OpportunityDate supported."))
        # only the first OpportunityDate is used
        oppdate = oppdates[0] if len(oppdates) > 0 else None

        volopps.append('<dateTimeDurations><dateTimeDuration>')
        if oppdate:
            volopps.append(xmlh.output_val('openEnded', 'No'))
            volopps.append(xmlh.output_val(
                'duration',
                'P%s%s' % (xmlh.get_tag_val(oppdate, "DurationQuantity"),
                           xmlh.get_tag_val(oppdate, "DurationUnit"))))
            volopps.append(xmlh.output_val('commitmentHoursPerWeek', '0'))
            volopps.append(xmlh.output_node('startDate', oppdate, "StartDate"))
            volopps.append(xmlh.output_node('endDate', oppdate, "EndDate"))
        else:
            volopps.append(xmlh.output_val('openEnded', 'Yes'))
            volopps.append(xmlh.output_val('commitmentHoursPerWeek', '0'))
        volopps.append('</dateTimeDuration></dateTimeDurations>')

        volopps.append('<locations>')
        try:
            opplocs = item.getElementsByTagName("Location")
        except Exception:
            opplocs = []
        for opploc in opplocs:
            volopps.append('<location>')
            virtual_tag = opploc.getElementsByTagName("Virtual")
            if virtual_tag and xmlh.get_tag_val(
                    opploc, "Virtual").lower() == "yes":
                volopps.append(xmlh.output_val('virtual', 'Yes'))
            else:
                volopps.append(
                    xmlh.output_node('region', opploc, "StateOrProvince"))
                volopps.append(xmlh.output_node('country', opploc, "Country"))
                volopps.append(
                    xmlh.output_node('postalCode', opploc, "ZipOrPostalCode"))
            volopps.append('</location>')
        volopps.append('</locations>')
        volopps.append('<categoryTags/>')
        volopps.append('</VolunteerOpportunity>')
        numopps += 1

    # convert to footprint format
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        xmlh.output_val('providerID', providerID),
        xmlh.output_val('providerName', providerName),
        xmlh.output_val('feedID', feedID),
        xmlh.output_val('createdDateTime', xmlh.current_ts()),
        xmlh.output_val('providerURL', providerURL),
        xmlh.output_val('description', feedDescription),
        # TODO: capture ts -- use now?!
        '</FeedInfo>',
        # hardcoded: Organization
        '<Organizations>',
    ]
    # ORGS is read only after the loop above, once register_org() has been
    # called for every converted record
    for key in ORGS:
        out.append(ORGS[key])
        numorgs += 1
    out.append('</Organizations>')
    out.append('<VolunteerOpportunities>')
    out.extend(volopps)
    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')

    return "".join(out), numorgs, numopps
def parse(instr, maxrecs, progress):
    """Return FPXML given usaservice data.

    Every line of instr is parsed on its own with xmlh.simple_parser() and
    becomes one VolunteerOpportunity attached to a single placeholder
    Organization with id 0.

    Args:
      instr: the feed as a string, one record per line.
      maxrecs: maximum number of lines to convert; <= 0 means no limit.
      progress: forwarded to xmlh.print_rps_progress().

    Returns:
      (fpxml_string, numorgs, numopps), or None as soon as a record does not
      have exactly one db_scheduledTime or exactly one db_address.
    """
    # TODO: progress
    known_elnames = [
        'channel', 'db:abstract', 'db:address', 'db:attendee_count',
        'db:categories', 'db:city', 'db:country', 'db:county', 'db:dateTime',
        'db:event', 'db:eventType', 'db:guest_total', 'db:host',
        'db:latitude', 'db:length', 'db:longitude', 'db:rsvp',
        'db:scheduledTime', 'db:state', 'db:street', 'db:title',
        'db:venue_name', 'db:zipcode', 'description', 'docs', 'guid', 'item',
        'language', 'link', 'pubDate', 'rss', 'title',
    ]
    # loop-invariant: detects pubDate values shaped like "DD Mon YYYY"
    pubdate_re = re.compile(
        r"[0-9][0-9] [A-Z][a-z][a-z] [0-9][0-9][0-9][0-9]")

    numorgs = numopps = 0

    # convert to footprint format; fragments are joined once at the end
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        # TODO: assign provider IDs?
        '<providerID>101</providerID>',
        '<providerName>usaservice.org</providerName>',
        '<feedID>1</feedID>',
        '<createdDateTime>%s</createdDateTime>' % xmlh.current_ts(),
        '<providerURL>http://www.usaservice.org/</providerURL>',
        '<description>Syndicated events</description>',
        # TODO: capture ts -- use now?!
        '</FeedInfo>',
        # hardcoded: Organization
        '<Organizations>',
        '<Organization>',
        '<organizationID>0</organizationID>',
        '<nationalEIN></nationalEIN>',
        '<name></name>',
        '<missionStatement></missionStatement>',
        '<description></description>',
        '<location><city></city><region></region><postalCode></postalCode></location>',
        '<organizationURL></organizationURL>',
        '<donateURL></donateURL>',
        '<logoURL></logoURL>',
        '<detailURL></detailURL>',
        '</Organization>',
    ]
    numorgs += 1
    out.append('</Organizations>')
    out.append('<VolunteerOpportunities>')

    # "db:" namespace prefixes are rewritten to "db_" before parsing
    instr = re.sub(r'<(/?db):', r'<\1_', instr)
    for i, line in enumerate(instr.splitlines()):
        # i is zero-based, so ">=" caps the loop at exactly maxrecs lines
        if maxrecs > 0 and i >= maxrecs:
            break
        xmlh.print_rps_progress("opps", progress, i, maxrecs)
        item = xmlh.simple_parser(line, known_elnames, progress=False)

        # unmapped: db_rsvp (seems to be same as link, but with #rsvp at end
        #           of url?)
        # unmapped: db_host (no equivalent?)
        # unmapped: db_county (seems to be empty)
        # unmapped: attendee_count
        # unmapped: guest_total
        # unmapped: db_title (dup of title, above)
        out.append('<VolunteerOpportunity>')
        out.append('<volunteerOpportunityID>%s</volunteerOpportunityID>' % (
            xmlh.get_tag_val(item, "guid")))
        # hardcoded: sponsoringOrganizationID
        out.append('<sponsoringOrganizationIDs><sponsoringOrganizationID>0</sponsoringOrganizationID></sponsoringOrganizationIDs>')
        # hardcoded: volunteerHubOrganizationID
        out.append('<volunteerHubOrganizationIDs><volunteerHubOrganizationID>0</volunteerHubOrganizationID></volunteerHubOrganizationIDs>')
        out.append('<title>%s</title>' % (xmlh.get_tag_val(item, "title")))
        out.append('<abstract>%s</abstract>' % (
            xmlh.get_tag_val(item, "abstract")))
        out.append('<volunteersNeeded>-8888</volunteersNeeded>')

        scheduled_times = item.getElementsByTagName("db_scheduledTime")
        if scheduled_times.length != 1:
            # single-argument print() behaves the same on Python 2 and 3
            print("%s %s" % (
                datetime.now(),
                "parse_usaservice: only 1 db_scheduledTime supported."))
            return None
        scheduled_time = scheduled_times[0]

        out.append('<dateTimeDurations><dateTimeDuration>')
        length = xmlh.get_tag_val(scheduled_time, "db_length")
        if length == "" or length == "-1":
            out.append('<openEnded>Yes</openEnded>')
        else:
            out.append('<openEnded>No</openEnded>')
        # db_dateTime must be exactly "<date> <time>"; anything else raises
        # ValueError from the unpacking below
        start_day, start_clock = xmlh.get_tag_val(
            scheduled_time, "db_dateTime").split(" ")
        out.append('<startDate>%s</startDate>' % (start_day))
        # TODO: timezone???
        out.append('<startTime>%s</startTime>' % (start_clock))
        out.append('</dateTimeDuration></dateTimeDurations>')

        addresses = item.getElementsByTagName("db_address")
        if addresses.length != 1:
            print("%s %s" % (
                datetime.now(),
                "parse_usaservice: only 1 db_address supported."))
            return None
        address = addresses[0]

        out.append('<locations><location>')
        out.append('<name>%s</name>' % (
            xmlh.get_tag_val(item, "db_venue_name")))
        out.append('<streetAddress1>%s</streetAddress1>' % (
            xmlh.get_tag_val(address, "db_street")))
        out.append('<city>%s</city>' % (
            xmlh.get_tag_val(address, "db_city")))
        out.append('<region>%s</region>' % (
            xmlh.get_tag_val(address, "db_state")))
        out.append('<country>%s</country>' % (
            xmlh.get_tag_val(address, "db_country")))
        out.append('<postalCode>%s</postalCode>' % (
            xmlh.get_tag_val(address, "db_zipcode")))
        out.append('<latitude>%s</latitude>' % (
            xmlh.get_tag_val(item, "db_latitude")))
        out.append('<longitude>%s</longitude>' % (
            xmlh.get_tag_val(item, "db_longitude")))
        out.append('</location></locations>')

        event_type = xmlh.get_tag_val(item, "db_eventType")
        out.append(
            '<categoryTags><categoryTag>%s</categoryTag></categoryTags>' % (
                event_type))

        out.append('<contactName>%s</contactName>' % xmlh.get_tag_val(
            item, "db_host"))
        out.append('<detailURL>%s</detailURL>' % (
            xmlh.get_tag_val(item, "link")))
        out.append('<description>%s</description>' % (
            xmlh.get_tag_val(item, "description")))

        pubdate = xmlh.get_tag_val(item, "pubDate")
        if pubdate_re.search(pubdate):
            # TODO: parse() is ignoring timezone...
            ts = dateutil.parser.parse(pubdate)
            pubdate = ts.strftime("%Y-%m-%dT%H:%M:%S")
        out.append('<lastUpdated>%s</lastUpdated>' % (pubdate))
        out.append('</VolunteerOpportunity>')
        numopps += 1

    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')

    return "".join(out), numorgs, numopps
def parse_fast(instr, maxrecs, progress):
    """Fast FPXML parser that does not check correctness.

    The input must be pre-checked by the caller. FeedInfo, Organization and
    VolunteerOpportunity fragments are cut out of instr with regexes, parsed
    one at a time, given default values and pretty-printed back out.

    Args:
      instr: the FPXML document as a string.
      maxrecs: maximum number of opportunities to emit; <= 0 means no limit.
      progress: not used by this function.

    Returns:
      (fpxml_string, numorgs, numopps), where numopps is the number of
      opportunities actually written to the output.
    """
    feedinfo_re = re.compile('<FeedInfo>.+?</FeedInfo>', re.DOTALL)
    org_re = re.compile('<Organization>.+?</Organization>', re.DOTALL)
    opp_re = re.compile('<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                        re.DOTALL)

    numorgs = numopps = 0
    outstr_list = ['<?xml version="1.0" ?>',
                   '<FootprintFeed schemaVersion="0.1">']

    # note: processes Organizations first, so ID lookups work
    for match in feedinfo_re.finditer(instr):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        xmlh.set_default_value(node, node.firstChild, "feedID", "0")
        set_default_time_elem(node, node.firstChild, "createdDateTime")
        outstr_list.append(xmlh.prettyxml(node, True))

    outstr_list.append('<Organizations>')
    for match in org_re.finditer(instr):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        numorgs += 1
        outstr_list.append(xmlh.prettyxml(node, True))
    outstr_list.append('</Organizations>')

    outstr_list.append('<VolunteerOpportunities>')
    for match in opp_re.finditer(instr):
        # Check the limit before counting, so the returned numopps matches
        # the number of records emitted and the surplus record is not parsed.
        if maxrecs > 0 and numopps >= maxrecs:
            break
        opp = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        numopps += 1

        # these set_default_* functions dont do anything if the field
        # doesnt already exists
        xmlh.set_default_value(opp, opp, "volunteersNeeded", -8888)
        xmlh.set_default_value(opp, opp, "paid", "No")
        xmlh.set_default_value(opp, opp, "sexRestrictedTo", "Neither")
        xmlh.set_default_value(opp, opp, "language", "English")
        set_default_time_elem(opp, opp, "lastUpdated")
        set_default_time_elem(opp, opp, "expires",
                              xmlh.current_ts(DEFAULT_EXPIRATION))

        # best effort: a record we cannot query is treated as having none
        try:
            opplocs = opp.getElementsByTagName("location")
        except Exception:
            opplocs = []
        for loc in opplocs:
            xmlh.set_default_value(opp, loc, "virtual", "No")
            xmlh.set_default_value(opp, loc, "country", "US")

        try:
            dttms = opp.getElementsByTagName("dateTimeDurations")
        except Exception:
            dttms = []
        for dttm in dttms:
            xmlh.set_default_value(opp, dttm, "openEnded", "No")
            xmlh.set_default_value(opp, dttm, "iCalRecurrence", "")
            # getElementsByTagName() yields an empty list, not None, when
            # nothing matches, so test for emptiness.
            # NOTE(review): set_default_time_elem is applied to a Yes/No field
            # here; confirm xmlh.set_default_value was not intended.
            if (not dttm.getElementsByTagName("startTime")
                    and not dttm.getElementsByTagName("endTime")):
                set_default_time_elem(opp, dttm, "timeFlexible", "Yes")
            else:
                set_default_time_elem(opp, dttm, "timeFlexible", "No")

        try:
            time_elems = (list(opp.getElementsByTagName("startTime"))
                          + list(opp.getElementsByTagName("endTime")))
        except Exception:
            time_elems = []
        for el in time_elems:
            xmlh.set_default_attr(opp, el, "olsonTZ", "America/Los_Angeles")

        outstr_list.append(xmlh.prettyxml(opp, True))

    outstr_list.append('</VolunteerOpportunities>')
    outstr_list.append('</FootprintFeed>')
    return "".join(outstr_list), numorgs, numopps
# Ordered header-matching rules: (combiner, substrings, field name). The first
# rule whose substrings match the lowercased header wins, so order matters
# (e.g. "title" is tested before anything else). A field name of None marks a
# recognised column that is deliberately ignored. All substrings are lowercase
# because the header is lowercased before matching.
_HEADER_FIELD_RULES = (
    (any, ("title",), "OpportunityTitle"),
    (all, ("organization", "sponsor"), "SponsoringOrganization"),
    (any, ("description",), "Description"),
    (any, ("skills",), "Skills"),
    (all, ("location", "name"), "LocationName"),
    (any, ("street",), "LocationStreet"),
    (any, ("city",), "LocationCity"),
    (any, ("state", "province"), "LocationProvince"),
    (any, ("zip", "postal"), "LocationPostalCode"),
    (any, ("country",), "LocationCountry"),
    (all, ("start", "date"), "StartDate"),
    (all, ("start", "time"), "StartTime"),
    (all, ("end", "date"), "EndDate"),
    (all, ("end", "time"), "EndTime"),
    (all, ("contact", "name"), "ContactName"),
    (any, ("email", "e-mail"), "ContactEmail"),
    (any, ("phone",), "ContactPhone"),
    (any, ("website", "url"), "URL"),
    (any, ("often",), "Frequency"),
    (all, ("days", "week"), "DaysOfWeek"),
    (any, ("paid",), "Paid"),
    (any, ("self_directed",), "SelfDirected"),
    (any, ("commitment", "hours"), "CommitmentHours"),
    (all, ("age", "min"), "MinimumAge"),
    (any, ("kid",), "KidFriendly"),
    (all, ("senior", "only"), "SeniorsOnly"),
    (any, ("sex", "gender"), "SexRestrictedTo"),
    (any, ("volunteer appeal",), None),
    (any, ("volunteeroptin",), None),
    (any, ("booksoptin",), None),
)


def _header_to_field(header_str):
    """Map a lowercased header cell to (recognised, field_name)."""
    for combine, needles, field_name in _HEADER_FIELD_RULES:
        if combine(needle in header_str for needle in needles):
            return True, field_name
    return False, None


def parse(instr, maxrecs = 0, progress = False):
    """Parser main: convert spreadsheet data into FPXML.

    Locates the header row (the one containing "opportunity title"), maps
    each header cell to a field name, then converts every following data row
    with record_to_fpxml() until more than MAX_BLANKROWS consecutive blank
    rows are seen. The module-level CURRENT_ROW is kept pointing at the row
    being processed and reset to None when the scan ends.

    Args:
      instr: the raw spreadsheet data handed to parse_gspreadsheet().
      maxrecs: accepted but not used by this parser.
      progress: when true (and DEBUG is set) diagnostics are printed; also
        forwarded to parse_gspreadsheet().

    Returns:
      (fpxml_string, numorgs, numopps), or ('', 0, 0) when the row below the
      header is blank.
    """
    global CURRENT_ROW

    data = {}
    updated = {}
    maxrow, maxcol = parse_gspreadsheet(instr, data, updated, progress)
    if DEBUG and progress:
        # single-argument print() behaves the same on Python 2 and 3
        print(str(datetime.now()) + ": maxrow=" + str(maxrow)
              + " maxcol=" + str(maxcol))

    # find header row: look for "opportunity title" (case insensitive)
    header_row, header_startcol = find_header_row(data,
                                                  r'opportunity\s*title')

    header_colidx = {}
    header_names = {}
    header_col = header_startcol
    while True:
        header_str = cellval(data, header_row, header_col)
        if not header_str:
            break
        header_str = header_str.lower()
        recognised, field_name = _header_to_field(header_str)
        if not recognised:
            parser_error("couldn't map header '" + header_str
                         + "' to a field name.")
        if field_name is not None:
            header_colidx[field_name] = header_col
            header_names[header_col] = field_name
        header_col += 1

    if len(header_names) < 10:
        parser_error("too few fields found: " + str(len(header_names)))

    # check to see if there's a header-description row
    header_desc = cellval(data, header_row + 1, header_startcol)
    if not header_desc:
        parser_error(
            "empty spreadsheet? blank row not allowed below header row")
        return '', 0, 0
    header_desc = header_desc.lower()
    data_startrow = header_row + 1
    if header_desc.find("up to") >= 0:
        # the row under the header is a description row, not data
        data_startrow += 1

    # find the data
    CURRENT_ROW = row = data_startrow
    blankrows = 0
    volopps = ['<VolunteerOpportunities>']
    numorgs = numopps = 0
    while True:
        blankrow = True
        record = {}
        record['LastUpdated'] = '2000-01-01'
        for field_name in header_colidx:
            col = header_colidx[field_name]
            val = cellval(data, row, col)
            if val:
                blankrow = False
            else:
                val = ""
            record[field_name] = val
            # the record's LastUpdated is the newest update among its cells
            key = 'R' + str(row) + 'C' + str(col)
            if key in updated and updated[key] > record['LastUpdated']:
                record['LastUpdated'] = updated[key]
        if blankrow:
            blankrows += 1
            if blankrows > MAX_BLANKROWS:
                break
        else:
            numopps += 1
            blankrows = 0
            record['oppid'] = str(numopps)
            volopps.append(record_to_fpxml(record))
        row += 1
        CURRENT_ROW = row
    CURRENT_ROW = None

    if DEBUG and progress:
        print("%s:  %s opportunities found." % (datetime.now(), numopps))
    volopps.append('</VolunteerOpportunities>')

    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        # providerID replaced by caller
        '<providerID></providerID>',
        # providerName replaced by caller
        '<providerName></providerName>',
        '<feedID>1</feedID>',
        '<createdDateTime>%s</createdDateTime>' % xmlh.current_ts(),
        # providerURL replaced by caller
        '<providerURL></providerURL>',
        '<description></description>',
        '</FeedInfo>',
        "<Organizations>",
    ]
    # KNOWN_ORGS is read only after every row has gone through
    # record_to_fpxml()
    for orgname in KNOWN_ORGS:
        out.append("<Organization>")
        out.append(xmlh.output_val("organizationID", KNOWN_ORGS[orgname]))
        out.append(xmlh.output_val("name", orgname, cdata=True))
        out.append("</Organization>")
        # report the number of Organization elements actually emitted
        numorgs += 1
    out.append("</Organizations>")

    out.extend(volopps)
    out.append('</FootprintFeed>')

    return "".join(out), numorgs, numopps